{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6, "eval_steps": 500, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0002, "grad_norm": 0.5344884991645813, "kl": 0.0005174986290512607, "learning_rate": 0.0, "loss": 0.0001, "num_tokens": 6480.0, "reward": 0.84515380859375, "reward_std": 0.014680828899145126, "rewards//mean": 0.84515380859375, "rewards//std": 0.027360284700989723, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0004, "grad_norm": 0.5381270051002502, "kl": 0.0006077195721445605, "learning_rate": 1.0000000000000001e-07, "loss": 0.0001, "num_tokens": 12960.0, "reward": 0.815673828125, "reward_std": 0.016982976347208023, "rewards//mean": 0.815673828125, "rewards//std": 0.02536199241876602, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0006, "grad_norm": 0.5261220335960388, "kl": 0.000551698001800105, "learning_rate": 2.0000000000000002e-07, "loss": 0.0001, "num_tokens": 19520.0, "reward": 0.81927490234375, "reward_std": 0.010811060667037964, "rewards//mean": 0.81927490234375, "rewards//std": 0.025441773235797882, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0008, "grad_norm": 0.5289216637611389, "kl": 0.0005863762053195387, "learning_rate": 3.0000000000000004e-07, "loss": 0.0001, "num_tokens": 26128.0, "reward": 0.82012939453125, "reward_std": 0.02173474058508873, "rewards//mean": 0.82012939453125, "rewards//std": 0.04268558695912361, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.001, "grad_norm": 0.5722470283508301, "kl": 0.000628954017884098, "learning_rate": 4.0000000000000003e-07, "loss": 0.0001, "num_tokens": 32704.0, "reward": 0.7650146484375, "reward_std": 0.015487657859921455, "rewards//mean": 0.7650146484375, "rewards//std": 0.023481056094169617, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0012, "grad_norm": 0.5234053730964661, "kl": 0.0005751600110670552, "learning_rate": 5.000000000000001e-07, "loss": 0.0001, "num_tokens": 39264.0, "reward": 0.8218994140625, "reward_std": 0.014946818351745605, "rewards//mean": 0.8218994140625, "rewards//std": 0.025350946933031082, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0014, "grad_norm": 0.5195151567459106, "kl": 0.0005390266160247847, "learning_rate": 6.000000000000001e-07, "loss": 0.0001, "num_tokens": 45856.0, "reward": 0.81597900390625, "reward_std": 0.015014132484793663, "rewards//mean": 0.81597900390625, "rewards//std": 0.020927488803863525, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0016, "grad_norm": 0.4819026589393616, "kl": 0.0005648515070788562, "learning_rate": 7.000000000000001e-07, "loss": 0.0001, "num_tokens": 52376.0, "reward": 0.802490234375, "reward_std": 0.009616490453481674, "rewards//mean": 0.802490234375, "rewards//std": 0.021797746419906616, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0018, "grad_norm": 0.49622002243995667, "kl": 0.0005116216343594715, "learning_rate": 8.000000000000001e-07, "loss": 0.0001, "num_tokens": 58928.0, "reward": 0.838623046875, "reward_std": 0.010705415159463882, "rewards//mean": 0.838623046875, "rewards//std": 0.01565403863787651, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.002, "grad_norm": 0.5537323355674744, "kl": 0.0006005062314216048, "learning_rate": 9.000000000000001e-07, "loss": 0.0001, "num_tokens": 65448.0, "reward": 0.8486328125, "reward_std": 0.014971021562814713, "rewards//mean": 0.8486328125, "rewards//std": 0.024861659854650497, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0022, "grad_norm": 0.4782222807407379, "kl": 0.0005192094249650836, "learning_rate": 1.0000000000000002e-06, "loss": 0.0001, "num_tokens": 72144.0, "reward": 0.8231201171875, "reward_std": 0.014505397528409958, "rewards//mean": 0.8231201171875, "rewards//std": 0.023432008922100067, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0024, "grad_norm": 0.5115165710449219, "kl": 0.000583281711442396, "learning_rate": 1.1e-06, "loss": 0.0001, "num_tokens": 78672.0, "reward": 0.7879638671875, "reward_std": 0.014534728601574898, "rewards//mean": 0.7879638671875, "rewards//std": 0.030848022550344467, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0026, "grad_norm": 0.5358304977416992, "kl": 0.0005532236391445622, "learning_rate": 1.2000000000000002e-06, "loss": 0.0001, "num_tokens": 85160.0, "reward": 0.8404541015625, "reward_std": 0.01879037171602249, "rewards//mean": 0.8404541015625, "rewards//std": 0.021690186113119125, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0028, "grad_norm": 0.5308247804641724, "kl": 0.0005572287191171199, "learning_rate": 1.3e-06, "loss": 0.0001, "num_tokens": 91664.0, "reward": 0.81329345703125, "reward_std": 0.011631859466433525, "rewards//mean": 0.81329345703125, "rewards//std": 0.022916875779628754, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.003, "grad_norm": 0.5254385471343994, "kl": 0.0005984247691230848, "learning_rate": 1.4000000000000001e-06, "loss": 0.0001, "num_tokens": 98256.0, "reward": 0.82110595703125, "reward_std": 0.015688244253396988, "rewards//mean": 0.82110595703125, "rewards//std": 0.04342671483755112, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0032, "grad_norm": 0.5159713625907898, "kl": 0.0006229286955203861, "learning_rate": 1.5e-06, "loss": 0.0001, "num_tokens": 104744.0, "reward": 0.78802490234375, "reward_std": 0.015156297013163567, "rewards//mean": 0.78802490234375, "rewards//std": 0.024192556738853455, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0034, "grad_norm": 0.5559095144271851, "kl": 0.0005670633545378223, "learning_rate": 1.6000000000000001e-06, "loss": 0.0001, "num_tokens": 111272.0, "reward": 0.8494873046875, "reward_std": 0.013256147503852844, "rewards//mean": 0.8494873046875, "rewards//std": 0.02326083369553089, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0036, "grad_norm": 0.5672537684440613, "kl": 0.0005939321563346311, "learning_rate": 1.7000000000000002e-06, "loss": 0.0001, "num_tokens": 117816.0, "reward": 0.854248046875, "reward_std": 0.018103815615177155, "rewards//mean": 0.854248046875, "rewards//std": 0.024517083540558815, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0038, "grad_norm": 0.4956085681915283, "kl": 0.0005725373921450227, "learning_rate": 1.8000000000000001e-06, "loss": 0.0001, "num_tokens": 124328.0, "reward": 0.82659912109375, "reward_std": 0.015440911054611206, "rewards//mean": 0.82659912109375, "rewards//std": 0.02957451343536377, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.004, "grad_norm": 0.5425844788551331, "kl": 0.00057974286028184, "learning_rate": 1.9000000000000002e-06, "loss": 0.0001, "num_tokens": 130768.0, "reward": 0.84521484375, "reward_std": 0.016270218417048454, "rewards//mean": 0.84521484375, "rewards//std": 0.03373591601848602, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0042, "grad_norm": 0.5052315592765808, "kl": 0.0005288290412863716, "learning_rate": 2.0000000000000003e-06, "loss": 0.0001, "num_tokens": 137264.0, "reward": 0.83184814453125, "reward_std": 0.013612732291221619, "rewards//mean": 0.83184814453125, "rewards//std": 0.028505386784672737, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0044, "grad_norm": 0.6783661842346191, "kl": 0.0005678313900716603, "learning_rate": 2.1000000000000002e-06, "loss": 0.0001, "num_tokens": 143744.0, "reward": 0.82196044921875, "reward_std": 0.01590130478143692, "rewards//mean": 0.82196044921875, "rewards//std": 0.02282818965613842, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0046, "grad_norm": 0.5427769422531128, "kl": 0.0006147136591607705, "learning_rate": 2.2e-06, "loss": 0.0001, "num_tokens": 150320.0, "reward": 0.83453369140625, "reward_std": 0.013559719547629356, "rewards//mean": 0.83453369140625, "rewards//std": 0.020104041323065758, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0048, "grad_norm": 0.5730366706848145, "kl": 0.0006162305653560907, "learning_rate": 2.3000000000000004e-06, "loss": 0.0001, "num_tokens": 156832.0, "reward": 0.79132080078125, "reward_std": 0.016593754291534424, "rewards//mean": 0.79132080078125, "rewards//std": 0.0344906747341156, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.005, "grad_norm": 0.5671719312667847, "kl": 0.0006081125320633873, "learning_rate": 2.4000000000000003e-06, "loss": 0.0001, "num_tokens": 163296.0, "reward": 0.83221435546875, "reward_std": 0.015442395582795143, "rewards//mean": 0.83221435546875, "rewards//std": 0.032700031995773315, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0052, "grad_norm": 0.5694089531898499, "kl": 0.000564752466743812, "learning_rate": 2.5e-06, "loss": 0.0001, "num_tokens": 169848.0, "reward": 0.8280029296875, "reward_std": 0.013243570923805237, "rewards//mean": 0.8280029296875, "rewards//std": 0.025084422901272774, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0054, "grad_norm": 0.5058358311653137, "kl": 0.0005670115642715245, "learning_rate": 2.6e-06, "loss": 0.0001, "num_tokens": 176272.0, "reward": 0.84521484375, "reward_std": 0.013745347037911415, "rewards//mean": 0.84521484375, "rewards//std": 0.030698729678988457, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0056, "grad_norm": 0.5333991050720215, "kl": 0.0006260298541747034, "learning_rate": 2.7000000000000004e-06, "loss": 0.0001, "num_tokens": 182896.0, "reward": 0.84588623046875, "reward_std": 0.02072429470717907, "rewards//mean": 0.84588623046875, "rewards//std": 0.028569040820002556, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0058, "grad_norm": 0.5217524170875549, "kl": 0.0006107169174356386, "learning_rate": 2.8000000000000003e-06, "loss": 0.0001, "num_tokens": 189472.0, "reward": 0.85723876953125, "reward_std": 0.012601524591445923, "rewards//mean": 0.85723876953125, "rewards//std": 0.025985149666666985, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.006, "grad_norm": 0.5388394594192505, "kl": 0.0006163410871522501, "learning_rate": 2.9e-06, "loss": 0.0001, "num_tokens": 196008.0, "reward": 0.8251953125, "reward_std": 0.01776113733649254, "rewards//mean": 0.8251953125, "rewards//std": 0.04279082641005516, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0062, "grad_norm": 0.5640242099761963, "kl": 0.0006344770517898723, "learning_rate": 3e-06, "loss": 0.0001, "num_tokens": 202440.0, "reward": 0.8143310546875, "reward_std": 0.011227348819375038, "rewards//mean": 0.8143310546875, "rewards//std": 0.032421380281448364, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0064, "grad_norm": 0.530994713306427, "kl": 0.0006286523857852444, "learning_rate": 3.1000000000000004e-06, "loss": 0.0001, "num_tokens": 208936.0, "reward": 0.86181640625, "reward_std": 0.01134884636849165, "rewards//mean": 0.86181640625, "rewards//std": 0.02666206657886505, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0066, "grad_norm": 0.5347341895103455, "kl": 0.0006648319977102801, "learning_rate": 3.2000000000000003e-06, "loss": 0.0001, "num_tokens": 215520.0, "reward": 0.80657958984375, "reward_std": 0.014637555927038193, "rewards//mean": 0.80657958984375, "rewards//std": 0.025391744449734688, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0068, "grad_norm": 0.5251972675323486, "kl": 0.0005861666286364198, "learning_rate": 3.3000000000000006e-06, "loss": 0.0001, "num_tokens": 222032.0, "reward": 0.8531494140625, "reward_std": 0.018488118425011635, "rewards//mean": 0.8531494140625, "rewards//std": 0.035555075854063034, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.007, "grad_norm": 0.5468869209289551, "kl": 0.0006114646239439026, "learning_rate": 3.4000000000000005e-06, "loss": 0.0001, "num_tokens": 228656.0, "reward": 0.81988525390625, "reward_std": 0.014370636083185673, "rewards//mean": 0.81988525390625, "rewards//std": 0.03541446104645729, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0072, "grad_norm": 0.5589239597320557, "kl": 0.0006090118695283309, "learning_rate": 3.5e-06, "loss": 0.0001, "num_tokens": 235144.0, "reward": 0.83056640625, "reward_std": 0.01737598143517971, "rewards//mean": 0.83056640625, "rewards//std": 0.044120125472545624, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0074, "grad_norm": 0.5322961211204529, "kl": 0.0006877124687889591, "learning_rate": 3.6000000000000003e-06, "loss": 0.0001, "num_tokens": 241728.0, "reward": 0.8135986328125, "reward_std": 0.01702769845724106, "rewards//mean": 0.8135986328125, "rewards//std": 0.032633595168590546, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0076, "grad_norm": 0.6947181224822998, "kl": 0.0007269596826517954, "learning_rate": 3.7e-06, "loss": 0.0001, "num_tokens": 248288.0, "reward": 0.8291015625, "reward_std": 0.017653653398156166, "rewards//mean": 0.8291015625, "rewards//std": 0.025770539417862892, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0078, "grad_norm": 0.5198093056678772, "kl": 0.000663173821521923, "learning_rate": 3.8000000000000005e-06, "loss": 0.0001, "num_tokens": 254888.0, "reward": 0.87103271484375, "reward_std": 0.013572394847869873, "rewards//mean": 0.87103271484375, "rewards//std": 0.025012129917740822, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.008, "grad_norm": 0.5395210385322571, "kl": 0.0005934182700002566, "learning_rate": 3.900000000000001e-06, "loss": 0.0001, "num_tokens": 261408.0, "reward": 0.79693603515625, "reward_std": 0.009941511787474155, "rewards//mean": 0.79693603515625, "rewards//std": 0.027546655386686325, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0082, "grad_norm": 0.5234987735748291, "kl": 0.000650742367724888, "learning_rate": 4.000000000000001e-06, "loss": 0.0001, "num_tokens": 267944.0, "reward": 0.8211669921875, "reward_std": 0.01747434213757515, "rewards//mean": 0.8211669921875, "rewards//std": 0.02766927145421505, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0084, "grad_norm": 0.5588589310646057, "kl": 0.0006299799279076979, "learning_rate": 4.1e-06, "loss": 0.0001, "num_tokens": 274448.0, "reward": 0.8140869140625, "reward_std": 0.013739854097366333, "rewards//mean": 0.8140869140625, "rewards//std": 0.025465337559580803, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0086, "grad_norm": 0.5930436849594116, "kl": 0.0006357767269946635, "learning_rate": 4.2000000000000004e-06, "loss": 0.0001, "num_tokens": 280936.0, "reward": 0.81170654296875, "reward_std": 0.011782532557845116, "rewards//mean": 0.81170654296875, "rewards//std": 0.0325273722410202, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0088, "grad_norm": 0.5557746291160583, "kl": 0.0007158647204050794, "learning_rate": 4.3e-06, "loss": 0.0001, "num_tokens": 287528.0, "reward": 0.85687255859375, "reward_std": 0.014486387372016907, "rewards//mean": 0.85687255859375, "rewards//std": 0.027903733775019646, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.009, "grad_norm": 0.6580216884613037, "kl": 0.0006931709067430347, "learning_rate": 4.4e-06, "loss": 0.0001, "num_tokens": 294064.0, "reward": 0.86663818359375, "reward_std": 0.014136701822280884, "rewards//mean": 0.86663818359375, "rewards//std": 0.023703305050730705, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0092, "grad_norm": 0.5328194499015808, "kl": 0.0006727315339958295, "learning_rate": 4.5e-06, "loss": 0.0001, "num_tokens": 300624.0, "reward": 0.8414306640625, "reward_std": 0.01816842518746853, "rewards//mean": 0.8414306640625, "rewards//std": 0.03057396598160267, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0094, "grad_norm": 0.5084638595581055, "kl": 0.0007027584506431594, "learning_rate": 4.600000000000001e-06, "loss": 0.0001, "num_tokens": 307144.0, "reward": 0.8560791015625, "reward_std": 0.010555820539593697, "rewards//mean": 0.8560791015625, "rewards//std": 0.026080874726176262, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0096, "grad_norm": 0.5603559017181396, "kl": 0.000706536418874748, "learning_rate": 4.7e-06, "loss": 0.0001, "num_tokens": 313696.0, "reward": 0.82489013671875, "reward_std": 0.01863221451640129, "rewards//mean": 0.82489013671875, "rewards//std": 0.030989699065685272, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0098, "grad_norm": 0.5202544331550598, "kl": 0.0006658075581071898, "learning_rate": 4.800000000000001e-06, "loss": 0.0001, "num_tokens": 320144.0, "reward": 0.864501953125, "reward_std": 0.017458315938711166, "rewards//mean": 0.864501953125, "rewards//std": 0.02970735915005207, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.01, "grad_norm": 0.5430282354354858, "kl": 0.0007079422211972997, "learning_rate": 4.9000000000000005e-06, "loss": 0.0001, "num_tokens": 326632.0, "reward": 0.8289794921875, "reward_std": 0.015223849564790726, "rewards//mean": 0.8289794921875, "rewards//std": 0.028258241713047028, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0102, "grad_norm": 0.5527967214584351, "kl": 0.00068058175384067, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 333176.0, "reward": 0.85009765625, "reward_std": 0.014322711154818535, "rewards//mean": 0.85009765625, "rewards//std": 0.03962577506899834, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0104, "grad_norm": 0.49573835730552673, "kl": 0.0007503277156502008, "learning_rate": 4.9999994965001495e-06, "loss": 0.0001, "num_tokens": 339680.0, "reward": 0.82861328125, "reward_std": 0.012963730841875076, "rewards//mean": 0.82861328125, "rewards//std": 0.017378928139805794, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0106, "grad_norm": 0.6121876239776611, "kl": 0.000837808009237051, "learning_rate": 4.999997986000801e-06, "loss": 0.0001, "num_tokens": 346120.0, "reward": 0.8382568359375, "reward_std": 0.011654852889478207, "rewards//mean": 0.8382568359375, "rewards//std": 0.02849084511399269, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0108, "grad_norm": 0.5221219062805176, "kl": 0.0007825934153515846, "learning_rate": 4.999995468502563e-06, "loss": 0.0001, "num_tokens": 352528.0, "reward": 0.82977294921875, "reward_std": 0.01969888061285019, "rewards//mean": 0.82977294921875, "rewards//std": 0.035405483096838, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.011, "grad_norm": 0.5531036257743835, "kl": 0.0007164527050917968, "learning_rate": 4.9999919440064484e-06, "loss": 0.0001, "num_tokens": 359080.0, "reward": 0.816162109375, "reward_std": 0.011350625194609165, "rewards//mean": 0.816162109375, "rewards//std": 0.03247293457388878, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0112, "grad_norm": 0.5199887752532959, "kl": 0.0007260760758072138, "learning_rate": 4.999987412513878e-06, "loss": 0.0001, "num_tokens": 365600.0, "reward": 0.82843017578125, "reward_std": 0.019472327083349228, "rewards//mean": 0.82843017578125, "rewards//std": 0.03167574107646942, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0114, "grad_norm": 0.5673383474349976, "kl": 0.0008793429587967694, "learning_rate": 4.999981874026677e-06, "loss": 0.0001, "num_tokens": 372120.0, "reward": 0.82391357421875, "reward_std": 0.012286683544516563, "rewards//mean": 0.82391357421875, "rewards//std": 0.024570664390921593, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0116, "grad_norm": 0.5173213481903076, "kl": 0.0006918876169947907, "learning_rate": 4.9999753285470756e-06, "loss": 0.0001, "num_tokens": 378632.0, "reward": 0.83428955078125, "reward_std": 0.015794314444065094, "rewards//mean": 0.83428955078125, "rewards//std": 0.022453775629401207, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0118, "grad_norm": 0.570139467716217, "kl": 0.0007772536046104506, "learning_rate": 4.9999677760777114e-06, "loss": 0.0001, "num_tokens": 385168.0, "reward": 0.85089111328125, "reward_std": 0.015498969703912735, "rewards//mean": 0.85089111328125, "rewards//std": 0.028978342190384865, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.012, "grad_norm": 0.5766065716743469, "kl": 0.000741052339435555, "learning_rate": 4.999959216621626e-06, "loss": 0.0001, "num_tokens": 391712.0, "reward": 0.83740234375, "reward_std": 0.013245906680822372, "rewards//mean": 0.83740234375, "rewards//std": 0.02006947062909603, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0122, "grad_norm": 0.538689136505127, "kl": 0.0007515538309235126, "learning_rate": 4.999949650182267e-06, "loss": 0.0001, "num_tokens": 398168.0, "reward": 0.81243896484375, "reward_std": 0.0122340964153409, "rewards//mean": 0.81243896484375, "rewards//std": 0.031715378165245056, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0124, "grad_norm": 0.537291944026947, "kl": 0.0008048277377383783, "learning_rate": 4.999939076763487e-06, "loss": 0.0001, "num_tokens": 404648.0, "reward": 0.84130859375, "reward_std": 0.01757432520389557, "rewards//mean": 0.84130859375, "rewards//std": 0.028463469818234444, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0126, "grad_norm": 0.5939363837242126, "kl": 0.0007771121745463461, "learning_rate": 4.999927496369547e-06, "loss": 0.0001, "num_tokens": 411160.0, "reward": 0.83380126953125, "reward_std": 0.01802542805671692, "rewards//mean": 0.83380126953125, "rewards//std": 0.02362206019461155, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0128, "grad_norm": 0.5537134408950806, "kl": 0.0008526271412847564, "learning_rate": 4.99991490900511e-06, "loss": 0.0001, "num_tokens": 417712.0, "reward": 0.81298828125, "reward_std": 0.009979894384741783, "rewards//mean": 0.81298828125, "rewards//std": 0.02068750187754631, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.013, "grad_norm": 0.5816908478736877, "kl": 0.0007632242341060191, "learning_rate": 4.999901314675246e-06, "loss": 0.0001, "num_tokens": 424232.0, "reward": 0.8087158203125, "reward_std": 0.012353556230664253, "rewards//mean": 0.8087158203125, "rewards//std": 0.01686212420463562, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0132, "grad_norm": 0.5981035232543945, "kl": 0.0008245876524597406, "learning_rate": 4.999886713385432e-06, "loss": 0.0001, "num_tokens": 430712.0, "reward": 0.81884765625, "reward_std": 0.02527492679655552, "rewards//mean": 0.81884765625, "rewards//std": 0.05187131464481354, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0134, "grad_norm": 0.549401044845581, "kl": 0.000775572014390491, "learning_rate": 4.999871105141549e-06, "loss": 0.0001, "num_tokens": 437296.0, "reward": 0.8375244140625, "reward_std": 0.014890830963850021, "rewards//mean": 0.8375244140625, "rewards//std": 0.028397180140018463, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0136, "grad_norm": 0.541130542755127, "kl": 0.0007868314132792875, "learning_rate": 4.9998544899498845e-06, "loss": 0.0001, "num_tokens": 443816.0, "reward": 0.8046875, "reward_std": 0.012027734890580177, "rewards//mean": 0.8046875, "rewards//std": 0.021776903420686722, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0138, "grad_norm": 0.6027575135231018, "kl": 0.0009221135260304436, "learning_rate": 4.999836867817129e-06, "loss": 0.0001, "num_tokens": 450312.0, "reward": 0.816162109375, "reward_std": 0.016808371990919113, "rewards//mean": 0.816162109375, "rewards//std": 0.024358505383133888, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.014, "grad_norm": 0.5539107322692871, "kl": 0.0007876981690060347, "learning_rate": 4.9998182387503825e-06, "loss": 0.0001, "num_tokens": 456840.0, "reward": 0.77813720703125, "reward_std": 0.015146953985095024, "rewards//mean": 0.77813720703125, "rewards//std": 0.024361535906791687, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0142, "grad_norm": 0.5014827847480774, "kl": 0.0008181602897820994, "learning_rate": 4.999798602757149e-06, "loss": 0.0001, "num_tokens": 463288.0, "reward": 0.82305908203125, "reward_std": 0.019664010033011436, "rewards//mean": 0.82305908203125, "rewards//std": 0.035274408757686615, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0144, "grad_norm": 0.5452041029930115, "kl": 0.0008986780885607004, "learning_rate": 4.9997779598453365e-06, "loss": 0.0001, "num_tokens": 469680.0, "reward": 0.83221435546875, "reward_std": 0.016070978716015816, "rewards//mean": 0.83221435546875, "rewards//std": 0.038161925971508026, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0146, "grad_norm": 0.5421676635742188, "kl": 0.0008441337413387373, "learning_rate": 4.999756310023261e-06, "loss": 0.0001, "num_tokens": 476208.0, "reward": 0.7965087890625, "reward_std": 0.010674255900084972, "rewards//mean": 0.7965087890625, "rewards//std": 0.021038319915533066, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0148, "grad_norm": 0.49503278732299805, "kl": 0.0008629800722701475, "learning_rate": 4.999733653299643e-06, "loss": 0.0001, "num_tokens": 482752.0, "reward": 0.8499755859375, "reward_std": 0.017572959885001183, "rewards//mean": 0.8499755859375, "rewards//std": 0.02942756749689579, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.015, "grad_norm": 0.5961833000183105, "kl": 0.0009816785313887522, "learning_rate": 4.9997099896836076e-06, "loss": 0.0001, "num_tokens": 489336.0, "reward": 0.816650390625, "reward_std": 0.014112787321209908, "rewards//mean": 0.816650390625, "rewards//std": 0.021797746419906616, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0152, "grad_norm": 0.5034902095794678, "kl": 0.001062475799699314, "learning_rate": 4.999685319184688e-06, "loss": 0.0001, "num_tokens": 495976.0, "reward": 0.85406494140625, "reward_std": 0.01799612119793892, "rewards//mean": 0.85406494140625, "rewards//std": 0.026914028450846672, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0154, "grad_norm": 0.5780143141746521, "kl": 0.0010181776160607114, "learning_rate": 4.999659641812821e-06, "loss": 0.0001, "num_tokens": 502496.0, "reward": 0.79010009765625, "reward_std": 0.009412910789251328, "rewards//mean": 0.79010009765625, "rewards//std": 0.03928040713071823, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0156, "grad_norm": 0.536464273929596, "kl": 0.0010662138811312616, "learning_rate": 4.9996329575783486e-06, "loss": 0.0001, "num_tokens": 508992.0, "reward": 0.8341064453125, "reward_std": 0.015705348923802376, "rewards//mean": 0.8341064453125, "rewards//std": 0.025295954197645187, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0158, "grad_norm": 0.5415508151054382, "kl": 0.0009792676428332925, "learning_rate": 4.99960526649202e-06, "loss": 0.0001, "num_tokens": 515512.0, "reward": 0.8585205078125, "reward_std": 0.01360266376286745, "rewards//mean": 0.8585205078125, "rewards//std": 0.02985856868326664, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.016, "grad_norm": 0.5641584396362305, "kl": 0.0010558558569755405, "learning_rate": 4.999576568564989e-06, "loss": 0.0001, "num_tokens": 522056.0, "reward": 0.8646240234375, "reward_std": 0.012548411265015602, "rewards//mean": 0.8646240234375, "rewards//std": 0.03710509091615677, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0162, "grad_norm": 0.4885479807853699, "kl": 0.0010145479027414694, "learning_rate": 4.999546863808815e-06, "loss": 0.0001, "num_tokens": 528560.0, "reward": 0.83837890625, "reward_std": 0.013274097815155983, "rewards//mean": 0.83837890625, "rewards//std": 0.021242039278149605, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0164, "grad_norm": 0.5320591330528259, "kl": 0.0011373023444321007, "learning_rate": 4.999516152235463e-06, "loss": 0.0001, "num_tokens": 535144.0, "reward": 0.8441162109375, "reward_std": 0.01578804850578308, "rewards//mean": 0.8441162109375, "rewards//std": 0.02666405588388443, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0166, "grad_norm": 0.5232922434806824, "kl": 0.0012273474567336962, "learning_rate": 4.999484433857305e-06, "loss": 0.0001, "num_tokens": 541600.0, "reward": 0.8199462890625, "reward_std": 0.012842075899243355, "rewards//mean": 0.8199462890625, "rewards//std": 0.03780028596520424, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0168, "grad_norm": 0.6083173155784607, "kl": 0.0010711704671848565, "learning_rate": 4.999451708687114e-06, "loss": 0.0001, "num_tokens": 548224.0, "reward": 0.84564208984375, "reward_std": 0.015799537301063538, "rewards//mean": 0.84564208984375, "rewards//std": 0.01983264647424221, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.017, "grad_norm": 0.6593310236930847, "kl": 0.0012267531710676849, "learning_rate": 4.999417976738075e-06, "loss": 0.0001, "num_tokens": 554800.0, "reward": 0.86431884765625, "reward_std": 0.009114849381148815, "rewards//mean": 0.86431884765625, "rewards//std": 0.0262292567640543, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0172, "grad_norm": 0.6140724420547485, "kl": 0.0011840108927572146, "learning_rate": 4.999383238023773e-06, "loss": 0.0001, "num_tokens": 561328.0, "reward": 0.83831787109375, "reward_std": 0.01039905659854412, "rewards//mean": 0.83831787109375, "rewards//std": 0.024514535441994667, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0174, "grad_norm": 0.5758453607559204, "kl": 0.0012705906701739877, "learning_rate": 4.999347492558203e-06, "loss": 0.0001, "num_tokens": 567976.0, "reward": 0.85235595703125, "reward_std": 0.012438876554369926, "rewards//mean": 0.85235595703125, "rewards//std": 0.03644623979926109, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0176, "grad_norm": 0.5718148350715637, "kl": 0.0012508028012234718, "learning_rate": 4.999310740355761e-06, "loss": 0.0001, "num_tokens": 574488.0, "reward": 0.841064453125, "reward_std": 0.013601240701973438, "rewards//mean": 0.841064453125, "rewards//std": 0.039395131170749664, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0178, "grad_norm": 0.6675286889076233, "kl": 0.0014662631147075444, "learning_rate": 4.9992729814312514e-06, "loss": 0.0001, "num_tokens": 580992.0, "reward": 0.82659912109375, "reward_std": 0.016967086121439934, "rewards//mean": 0.82659912109375, "rewards//std": 0.03539735823869705, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.018, "grad_norm": 0.5523439645767212, "kl": 0.0013510131393559277, "learning_rate": 4.999234215799884e-06, "loss": 0.0001, "num_tokens": 587448.0, "reward": 0.81683349609375, "reward_std": 0.011323179118335247, "rewards//mean": 0.81683349609375, "rewards//std": 0.024706443771719933, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0182, "grad_norm": 0.5677604079246521, "kl": 0.0014052088954485953, "learning_rate": 4.999194443477273e-06, "loss": 0.0001, "num_tokens": 593944.0, "reward": 0.81878662109375, "reward_std": 0.011167924851179123, "rewards//mean": 0.81878662109375, "rewards//std": 0.024750519543886185, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0184, "grad_norm": 0.579730212688446, "kl": 0.001485287953983061, "learning_rate": 4.99915366447944e-06, "loss": 0.0001, "num_tokens": 600520.0, "reward": 0.83245849609375, "reward_std": 0.01508173905313015, "rewards//mean": 0.83245849609375, "rewards//std": 0.028220918029546738, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0186, "grad_norm": 0.5412017107009888, "kl": 0.001436470469343476, "learning_rate": 4.999111878822809e-06, "loss": 0.0001, "num_tokens": 606960.0, "reward": 0.8790283203125, "reward_std": 0.014004024676978588, "rewards//mean": 0.8790283203125, "rewards//std": 0.03480638191103935, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0188, "grad_norm": 0.5232703685760498, "kl": 0.0014006335695739836, "learning_rate": 4.999069086524212e-06, "loss": 0.0001, "num_tokens": 613488.0, "reward": 0.85296630859375, "reward_std": 0.010906463488936424, "rewards//mean": 0.85296630859375, "rewards//std": 0.028076795861124992, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.019, "grad_norm": 0.4935658276081085, "kl": 0.0014153113588690758, "learning_rate": 4.999025287600886e-06, "loss": 0.0001, "num_tokens": 620008.0, "reward": 0.8734130859375, "reward_std": 0.009302475489675999, "rewards//mean": 0.8734130859375, "rewards//std": 0.024985257536172867, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0192, "grad_norm": 0.6382635831832886, "kl": 0.001757641599397175, "learning_rate": 4.998980482070473e-06, "loss": 0.0002, "num_tokens": 626448.0, "reward": 0.84674072265625, "reward_std": 0.014600001275539398, "rewards//mean": 0.84674072265625, "rewards//std": 0.02444651536643505, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0194, "grad_norm": 0.5889910459518433, "kl": 0.0018631023849593475, "learning_rate": 4.9989346699510215e-06, "loss": 0.0002, "num_tokens": 632888.0, "reward": 0.846435546875, "reward_std": 0.012425427325069904, "rewards//mean": 0.846435546875, "rewards//std": 0.02704427018761635, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0196, "grad_norm": 0.5496983528137207, "kl": 0.0017451888270443305, "learning_rate": 4.9988878512609825e-06, "loss": 0.0002, "num_tokens": 639376.0, "reward": 0.8331298828125, "reward_std": 0.012676231563091278, "rewards//mean": 0.8331298828125, "rewards//std": 0.031408242881298065, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0198, "grad_norm": 0.5105761289596558, "kl": 0.0015143363270908594, "learning_rate": 4.998840026019217e-06, "loss": 0.0002, "num_tokens": 645936.0, "reward": 0.8331298828125, "reward_std": 0.010076148435473442, "rewards//mean": 0.8331298828125, "rewards//std": 0.019506044685840607, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.02, "grad_norm": 0.5801912546157837, "kl": 0.0020606104080798104, "learning_rate": 4.998791194244988e-06, "loss": 0.0002, "num_tokens": 652504.0, "reward": 0.8634033203125, "reward_std": 0.013777434825897217, "rewards//mean": 0.8634033203125, "rewards//std": 0.03736365959048271, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0202, "grad_norm": 0.6097487211227417, "kl": 0.0017242246540263295, "learning_rate": 4.998741355957963e-06, "loss": 0.0002, "num_tokens": 659024.0, "reward": 0.845703125, "reward_std": 0.012556832283735275, "rewards//mean": 0.845703125, "rewards//std": 0.02684764750301838, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0204, "grad_norm": 0.5697188377380371, "kl": 0.0018468490161467344, "learning_rate": 4.99869051117822e-06, "loss": 0.0002, "num_tokens": 665568.0, "reward": 0.85235595703125, "reward_std": 0.01241546031087637, "rewards//mean": 0.85235595703125, "rewards//std": 0.029282856732606888, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0206, "grad_norm": 0.5179392695426941, "kl": 0.0018879670387832448, "learning_rate": 4.998638659926238e-06, "loss": 0.0002, "num_tokens": 671992.0, "reward": 0.84124755859375, "reward_std": 0.01206475030630827, "rewards//mean": 0.84124755859375, "rewards//std": 0.023223303258419037, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0208, "grad_norm": 0.5548343062400818, "kl": 0.0020817798358621076, "learning_rate": 4.998585802222902e-06, "loss": 0.0002, "num_tokens": 678640.0, "reward": 0.79302978515625, "reward_std": 0.011504091322422028, "rewards//mean": 0.79302978515625, "rewards//std": 0.024054504930973053, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.021, "grad_norm": 0.5501526594161987, "kl": 0.0019515428430167958, "learning_rate": 4.9985319380895035e-06, "loss": 0.0002, "num_tokens": 685200.0, "reward": 0.85882568359375, "reward_std": 0.012049192562699318, "rewards//mean": 0.85882568359375, "rewards//std": 0.029772449284791946, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0212, "grad_norm": 0.5171511769294739, "kl": 0.0022368501813616604, "learning_rate": 4.99847706754774e-06, "loss": 0.0002, "num_tokens": 691664.0, "reward": 0.83056640625, "reward_std": 0.015448026359081268, "rewards//mean": 0.83056640625, "rewards//std": 0.018577925860881805, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0214, "grad_norm": 0.4974002242088318, "kl": 0.001978852436877787, "learning_rate": 4.998421190619712e-06, "loss": 0.0002, "num_tokens": 698152.0, "reward": 0.8232421875, "reward_std": 0.009565056301653385, "rewards//mean": 0.8232421875, "rewards//std": 0.022380255162715912, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0216, "grad_norm": 0.5393596291542053, "kl": 0.0022539695783052593, "learning_rate": 4.998364307327927e-06, "loss": 0.0002, "num_tokens": 704704.0, "reward": 0.806396484375, "reward_std": 0.011443907395005226, "rewards//mean": 0.806396484375, "rewards//std": 0.022850144654512405, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0218, "grad_norm": 0.5512969493865967, "kl": 0.0018980851164087653, "learning_rate": 4.998306417695298e-06, "loss": 0.0002, "num_tokens": 711256.0, "reward": 0.79241943359375, "reward_std": 0.012490647844970226, "rewards//mean": 0.79241943359375, "rewards//std": 0.0348195917904377, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.022, "grad_norm": 0.5473505258560181, "kl": 0.002349850197788328, "learning_rate": 4.998247521745142e-06, "loss": 0.0002, "num_tokens": 717784.0, "reward": 0.83465576171875, "reward_std": 0.008640991523861885, "rewards//mean": 0.83465576171875, "rewards//std": 0.017218681052327156, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0222, "grad_norm": 0.6422269344329834, "kl": 0.0024210367555497214, "learning_rate": 4.998187619501185e-06, "loss": 0.0002, "num_tokens": 724336.0, "reward": 0.82794189453125, "reward_std": 0.011137901805341244, "rewards//mean": 0.82794189453125, "rewards//std": 0.03503153845667839, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0224, "grad_norm": 0.5594554543495178, "kl": 0.002010446216445416, "learning_rate": 4.998126710987552e-06, "loss": 0.0002, "num_tokens": 730888.0, "reward": 0.86468505859375, "reward_std": 0.015423553064465523, "rewards//mean": 0.86468505859375, "rewards//std": 0.03212417662143707, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0226, "grad_norm": 0.5730808973312378, "kl": 0.0022841215832158923, "learning_rate": 4.998064796228779e-06, "loss": 0.0002, "num_tokens": 737352.0, "reward": 0.86993408203125, "reward_std": 0.014314034953713417, "rewards//mean": 0.86993408203125, "rewards//std": 0.025540942326188087, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0228, "grad_norm": 0.5489108562469482, "kl": 0.0020574367081280798, "learning_rate": 4.998001875249804e-06, "loss": 0.0002, "num_tokens": 743840.0, "reward": 0.84295654296875, "reward_std": 0.013090233318507671, "rewards//mean": 0.84295654296875, "rewards//std": 0.025018785148859024, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.023, "grad_norm": 0.5044224262237549, "kl": 0.0021600704931188375, "learning_rate": 4.997937948075973e-06, "loss": 0.0002, "num_tokens": 750432.0, "reward": 0.8189697265625, "reward_std": 0.010550140403211117, "rewards//mean": 0.8189697265625, "rewards//std": 0.01952776312828064, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0232, "grad_norm": 0.5633189678192139, "kl": 0.002223424904514104, "learning_rate": 4.997873014733036e-06, "loss": 0.0002, "num_tokens": 756888.0, "reward": 0.85516357421875, "reward_std": 0.011582260951399803, "rewards//mean": 0.85516357421875, "rewards//std": 0.04037541151046753, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0234, "grad_norm": 0.5373753309249878, "kl": 0.0026258028228767216, "learning_rate": 4.997807075247147e-06, "loss": 0.0003, "num_tokens": 763392.0, "reward": 0.81365966796875, "reward_std": 0.011771637946367264, "rewards//mean": 0.81365966796875, "rewards//std": 0.021346449851989746, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0236, "grad_norm": 0.5237939953804016, "kl": 0.0023264801857294515, "learning_rate": 4.9977401296448655e-06, "loss": 0.0002, "num_tokens": 769928.0, "reward": 0.8460693359375, "reward_std": 0.008590873330831528, "rewards//mean": 0.8460693359375, "rewards//std": 0.017849450930953026, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0238, "grad_norm": 0.5332798361778259, "kl": 0.002536685249651782, "learning_rate": 4.99767217795316e-06, "loss": 0.0003, "num_tokens": 776408.0, "reward": 0.85760498046875, "reward_std": 0.01757902279496193, "rewards//mean": 0.85760498046875, "rewards//std": 0.028330666944384575, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.024, "grad_norm": 0.5661578178405762, "kl": 0.002972067624796182, "learning_rate": 4.997603220199399e-06, "loss": 0.0003, "num_tokens": 782976.0, "reward": 0.84490966796875, "reward_std": 0.011383035220205784, "rewards//mean": 0.84490966796875, "rewards//std": 0.01630287617444992, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0242, "grad_norm": 0.5735255479812622, "kl": 0.0025082347565330565, "learning_rate": 4.99753325641136e-06, "loss": 0.0003, "num_tokens": 789624.0, "reward": 0.8165283203125, "reward_std": 0.012074325233697891, "rewards//mean": 0.8165283203125, "rewards//std": 0.023745177313685417, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0244, "grad_norm": 0.5703085660934448, "kl": 0.0030434352956945077, "learning_rate": 4.997462286617224e-06, "loss": 0.0003, "num_tokens": 796152.0, "reward": 0.84613037109375, "reward_std": 0.013441948220133781, "rewards//mean": 0.84613037109375, "rewards//std": 0.030390407890081406, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0246, "grad_norm": 0.5246081352233887, "kl": 0.00253114165388979, "learning_rate": 4.997390310845578e-06, "loss": 0.0003, "num_tokens": 802712.0, "reward": 0.85833740234375, "reward_std": 0.012656005099415779, "rewards//mean": 0.85833740234375, "rewards//std": 0.02223086729645729, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0248, "grad_norm": 0.5539637804031372, "kl": 0.003131673962343484, "learning_rate": 4.997317329125413e-06, "loss": 0.0003, "num_tokens": 809208.0, "reward": 0.85931396484375, "reward_std": 0.015795918181538582, "rewards//mean": 0.85931396484375, "rewards//std": 0.029177729040384293, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.025, "grad_norm": 0.6289098858833313, "kl": 0.002920530881965533, "learning_rate": 4.997243341486126e-06, "loss": 0.0003, "num_tokens": 815736.0, "reward": 0.87042236328125, "reward_std": 0.010492103174328804, "rewards//mean": 0.87042236328125, "rewards//std": 0.02300521731376648, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0252, "grad_norm": 0.604350745677948, "kl": 0.0029226918559288606, "learning_rate": 4.997168347957521e-06, "loss": 0.0003, "num_tokens": 822232.0, "reward": 0.8529052734375, "reward_std": 0.014255398884415627, "rewards//mean": 0.8529052734375, "rewards//std": 0.04210163280367851, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0254, "grad_norm": 0.5126016736030579, "kl": 0.0027541381714399904, "learning_rate": 4.997092348569802e-06, "loss": 0.0003, "num_tokens": 828736.0, "reward": 0.86907958984375, "reward_std": 0.009864596650004387, "rewards//mean": 0.86907958984375, "rewards//std": 0.020902881398797035, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0256, "grad_norm": 0.5625630617141724, "kl": 0.0037071430997457355, "learning_rate": 4.9970153433535855e-06, "loss": 0.0004, "num_tokens": 835208.0, "reward": 0.83758544921875, "reward_std": 0.012011556886136532, "rewards//mean": 0.83758544921875, "rewards//std": 0.026298996061086655, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0258, "grad_norm": 0.5449907779693604, "kl": 0.003065660857828334, "learning_rate": 4.996937332339887e-06, "loss": 0.0003, "num_tokens": 841672.0, "reward": 0.8533935546875, "reward_std": 0.015462837181985378, "rewards//mean": 0.8533935546875, "rewards//std": 0.021865351125597954, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.026, "grad_norm": 0.532476007938385, "kl": 0.0030444328440353274, "learning_rate": 4.996858315560129e-06, "loss": 0.0003, "num_tokens": 848168.0, "reward": 0.83038330078125, "reward_std": 0.011759311892092228, "rewards//mean": 0.83038330078125, "rewards//std": 0.026123428717255592, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0262, "grad_norm": 0.5832347273826599, "kl": 0.0031748518522363156, "learning_rate": 4.9967782930461405e-06, "loss": 0.0003, "num_tokens": 854768.0, "reward": 0.82232666015625, "reward_std": 0.012866346165537834, "rewards//mean": 0.82232666015625, "rewards//std": 0.022574130445718765, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0264, "grad_norm": 0.5763130784034729, "kl": 0.002997577394125983, "learning_rate": 4.9966972648301535e-06, "loss": 0.0003, "num_tokens": 861288.0, "reward": 0.852294921875, "reward_std": 0.008953496813774109, "rewards//mean": 0.852294921875, "rewards//std": 0.019210852682590485, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0266, "grad_norm": 0.5979419946670532, "kl": 0.0031551278370898217, "learning_rate": 4.996615230944808e-06, "loss": 0.0003, "num_tokens": 867752.0, "reward": 0.8076171875, "reward_std": 0.014883074909448624, "rewards//mean": 0.8076171875, "rewards//std": 0.02626393362879753, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0268, "grad_norm": 0.5739002227783203, "kl": 0.003316704591270536, "learning_rate": 4.996532191423145e-06, "loss": 0.0003, "num_tokens": 874240.0, "reward": 0.85858154296875, "reward_std": 0.013418575748801231, "rewards//mean": 0.85858154296875, "rewards//std": 0.025583580136299133, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.027, "grad_norm": 0.5882933735847473, "kl": 0.003152552613755688, "learning_rate": 4.996448146298615e-06, "loss": 0.0003, "num_tokens": 880704.0, "reward": 0.84490966796875, "reward_std": 0.013387042097747326, "rewards//mean": 0.84490966796875, "rewards//std": 0.023289043456315994, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0272, "grad_norm": 0.636451005935669, "kl": 0.0033634064893703908, "learning_rate": 4.996363095605069e-06, "loss": 0.0003, "num_tokens": 887200.0, "reward": 0.8017578125, "reward_std": 0.008450044319033623, "rewards//mean": 0.8017578125, "rewards//std": 0.01984495110809803, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0274, "grad_norm": 0.6109657287597656, "kl": 0.0032750566606409848, "learning_rate": 4.996277039376767e-06, "loss": 0.0003, "num_tokens": 893696.0, "reward": 0.87811279296875, "reward_std": 0.01004757173359394, "rewards//mean": 0.87811279296875, "rewards//std": 0.0377134345471859, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0276, "grad_norm": 0.5534079074859619, "kl": 0.003138502681395039, "learning_rate": 4.9961899776483725e-06, "loss": 0.0003, "num_tokens": 900264.0, "reward": 0.82147216796875, "reward_std": 0.013819834217429161, "rewards//mean": 0.82147216796875, "rewards//std": 0.030263124033808708, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0278, "grad_norm": 0.5728569626808167, "kl": 0.0036148553481325507, "learning_rate": 4.996101910454953e-06, "loss": 0.0004, "num_tokens": 906792.0, "reward": 0.8438720703125, "reward_std": 0.014269351959228516, "rewards//mean": 0.8438720703125, "rewards//std": 0.025920186191797256, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.028, "grad_norm": 0.591044008731842, "kl": 0.003646017052233219, "learning_rate": 4.996012837831983e-06, "loss": 0.0004, "num_tokens": 913376.0, "reward": 0.85736083984375, "reward_std": 0.010473604314029217, "rewards//mean": 0.85736083984375, "rewards//std": 0.013709330931305885, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0282, "grad_norm": 0.5964400172233582, "kl": 0.0035551346372812986, "learning_rate": 4.9959227598153395e-06, "loss": 0.0004, "num_tokens": 919960.0, "reward": 0.7999267578125, "reward_std": 0.009113901294767857, "rewards//mean": 0.7999267578125, "rewards//std": 0.015694117173552513, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0284, "grad_norm": 0.6457265019416809, "kl": 0.004052484320709482, "learning_rate": 4.995831676441307e-06, "loss": 0.0004, "num_tokens": 926600.0, "reward": 0.83953857421875, "reward_std": 0.017153846099972725, "rewards//mean": 0.83953857421875, "rewards//std": 0.028196770697832108, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0286, "grad_norm": 0.5374157428741455, "kl": 0.004182952019618824, "learning_rate": 4.995739587746574e-06, "loss": 0.0004, "num_tokens": 933120.0, "reward": 0.85748291015625, "reward_std": 0.012008757330477238, "rewards//mean": 0.85748291015625, "rewards//std": 0.019123880192637444, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0288, "grad_norm": 0.5459393262863159, "kl": 0.004459843650693074, "learning_rate": 4.995646493768234e-06, "loss": 0.0004, "num_tokens": 939712.0, "reward": 0.8717041015625, "reward_std": 0.012871598824858665, "rewards//mean": 0.8717041015625, "rewards//std": 0.02747381664812565, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.029, "grad_norm": 0.6245104074478149, "kl": 0.0036559294094331563, "learning_rate": 4.995552394543784e-06, "loss": 0.0004, "num_tokens": 946152.0, "reward": 0.84735107421875, "reward_std": 0.010359259322285652, "rewards//mean": 0.84735107421875, "rewards//std": 0.025145526975393295, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0292, "grad_norm": 0.49289852380752563, "kl": 0.004074933909578249, "learning_rate": 4.995457290111129e-06, "loss": 0.0004, "num_tokens": 952760.0, "reward": 0.829833984375, "reward_std": 0.01499673631042242, "rewards//mean": 0.829833984375, "rewards//std": 0.02499646320939064, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0294, "grad_norm": 0.5312600135803223, "kl": 0.004150361695792526, "learning_rate": 4.995361180508575e-06, "loss": 0.0004, "num_tokens": 959336.0, "reward": 0.8187255859375, "reward_std": 0.014176027849316597, "rewards//mean": 0.8187255859375, "rewards//std": 0.025973858311772346, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0296, "grad_norm": 0.5217401385307312, "kl": 0.0042341178632341325, "learning_rate": 4.995264065774837e-06, "loss": 0.0004, "num_tokens": 965904.0, "reward": 0.8221435546875, "reward_std": 0.014102162793278694, "rewards//mean": 0.8221435546875, "rewards//std": 0.029886946082115173, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0298, "grad_norm": 0.5702940225601196, "kl": 0.004433232854353264, "learning_rate": 4.99516594594903e-06, "loss": 0.0004, "num_tokens": 972480.0, "reward": 0.83148193359375, "reward_std": 0.009684868156909943, "rewards//mean": 0.83148193359375, "rewards//std": 0.030368482694029808, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.03, "grad_norm": 0.5658935308456421, "kl": 0.004318653082009405, "learning_rate": 4.9950668210706795e-06, "loss": 0.0004, "num_tokens": 979032.0, "reward": 0.81292724609375, "reward_std": 0.018283039331436157, "rewards//mean": 0.81292724609375, "rewards//std": 0.03353874757885933, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0302, "grad_norm": 0.6088890433311462, "kl": 0.003863652906147763, "learning_rate": 4.994966691179712e-06, "loss": 0.0004, "num_tokens": 985520.0, "reward": 0.8228759765625, "reward_std": 0.012609913945198059, "rewards//mean": 0.8228759765625, "rewards//std": 0.02047235146164894, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0304, "grad_norm": 0.5766634345054626, "kl": 0.0043726901058107615, "learning_rate": 4.9948655563164585e-06, "loss": 0.0004, "num_tokens": 992048.0, "reward": 0.83502197265625, "reward_std": 0.012382203713059425, "rewards//mean": 0.83502197265625, "rewards//std": 0.023486455902457237, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0306, "grad_norm": 0.6495277881622314, "kl": 0.004681104823248461, "learning_rate": 4.994763416521658e-06, "loss": 0.0005, "num_tokens": 998624.0, "reward": 0.8575439453125, "reward_std": 0.01804346963763237, "rewards//mean": 0.8575439453125, "rewards//std": 0.03466693311929703, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0308, "grad_norm": 0.5628495812416077, "kl": 0.005010787746869028, "learning_rate": 4.994660271836452e-06, "loss": 0.0005, "num_tokens": 1005144.0, "reward": 0.78619384765625, "reward_std": 0.014029724523425102, "rewards//mean": 0.78619384765625, "rewards//std": 0.027605941519141197, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.031, "grad_norm": 0.5648722052574158, "kl": 0.005263536557322368, "learning_rate": 4.994556122302387e-06, "loss": 0.0005, "num_tokens": 1011776.0, "reward": 0.81500244140625, "reward_std": 0.00935292523354292, "rewards//mean": 0.81500244140625, "rewards//std": 0.0195171982049942, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0312, "grad_norm": 0.5549405813217163, "kl": 0.004852844664128497, "learning_rate": 4.994450967961413e-06, "loss": 0.0005, "num_tokens": 1018328.0, "reward": 0.82122802734375, "reward_std": 0.01424551010131836, "rewards//mean": 0.82122802734375, "rewards//std": 0.017228348180651665, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0314, "grad_norm": 0.6064804196357727, "kl": 0.0048009168822318316, "learning_rate": 4.994344808855888e-06, "loss": 0.0005, "num_tokens": 1024736.0, "reward": 0.851318359375, "reward_std": 0.014092957600951195, "rewards//mean": 0.851318359375, "rewards//std": 0.027951501309871674, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0316, "grad_norm": 0.5940186381340027, "kl": 0.005103438044898212, "learning_rate": 4.994237645028573e-06, "loss": 0.0005, "num_tokens": 1031280.0, "reward": 0.83880615234375, "reward_std": 0.014840067364275455, "rewards//mean": 0.83880615234375, "rewards//std": 0.03041131980717182, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0318, "grad_norm": 0.6087404489517212, "kl": 0.005231126502621919, "learning_rate": 4.994129476522632e-06, "loss": 0.0005, "num_tokens": 1037776.0, "reward": 0.84613037109375, "reward_std": 0.017233435064554214, "rewards//mean": 0.84613037109375, "rewards//std": 0.03191426768898964, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.032, "grad_norm": 0.5607631802558899, "kl": 0.0044527553545776755, "learning_rate": 4.994020303381636e-06, "loss": 0.0004, "num_tokens": 1044264.0, "reward": 0.8514404296875, "reward_std": 0.01844841055572033, "rewards//mean": 0.8514404296875, "rewards//std": 0.030922522768378258, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0322, "grad_norm": 0.5657513737678528, "kl": 0.005017978633986786, "learning_rate": 4.993910125649561e-06, "loss": 0.0005, "num_tokens": 1050792.0, "reward": 0.856201171875, "reward_std": 0.012632312253117561, "rewards//mean": 0.856201171875, "rewards//std": 0.02604980394244194, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0324, "grad_norm": 0.5447162389755249, "kl": 0.004984280851203948, "learning_rate": 4.993798943370785e-06, "loss": 0.0005, "num_tokens": 1057344.0, "reward": 0.82562255859375, "reward_std": 0.014443015679717064, "rewards//mean": 0.82562255859375, "rewards//std": 0.024904167279601097, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0326, "grad_norm": 0.5358566045761108, "kl": 0.005205701396334916, "learning_rate": 4.993686756590093e-06, "loss": 0.0005, "num_tokens": 1063784.0, "reward": 0.85723876953125, "reward_std": 0.01433575339615345, "rewards//mean": 0.85723876953125, "rewards//std": 0.02944575995206833, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0328, "grad_norm": 0.52525395154953, "kl": 0.005318704032106325, "learning_rate": 4.993573565352674e-06, "loss": 0.0005, "num_tokens": 1070232.0, "reward": 0.836181640625, "reward_std": 0.009636202827095985, "rewards//mean": 0.836181640625, "rewards//std": 0.02487991936504841, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.033, "grad_norm": 0.6540741920471191, "kl": 0.005362637952202931, "learning_rate": 4.993459369704121e-06, "loss": 0.0005, "num_tokens": 1076808.0, "reward": 0.84423828125, "reward_std": 0.015483209863305092, "rewards//mean": 0.84423828125, "rewards//std": 0.03575059771537781, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0332, "grad_norm": 0.5869854092597961, "kl": 0.005901694006752223, "learning_rate": 4.9933441696904315e-06, "loss": 0.0006, "num_tokens": 1083304.0, "reward": 0.84246826171875, "reward_std": 0.015546703711152077, "rewards//mean": 0.84246826171875, "rewards//std": 0.02686617709696293, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0334, "grad_norm": 0.575697660446167, "kl": 0.0057476553483866155, "learning_rate": 4.993227965358008e-06, "loss": 0.0006, "num_tokens": 1089800.0, "reward": 0.86322021484375, "reward_std": 0.01910999044775963, "rewards//mean": 0.86322021484375, "rewards//std": 0.03419933468103409, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0336, "grad_norm": 0.6858865022659302, "kl": 0.005201482155825943, "learning_rate": 4.99311075675366e-06, "loss": 0.0005, "num_tokens": 1096344.0, "reward": 0.86181640625, "reward_std": 0.013126470148563385, "rewards//mean": 0.86181640625, "rewards//std": 0.02323511429131031, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0338, "grad_norm": 0.5925734043121338, "kl": 0.004758396476972848, "learning_rate": 4.992992543924597e-06, "loss": 0.0005, "num_tokens": 1102928.0, "reward": 0.8076171875, "reward_std": 0.017161305993795395, "rewards//mean": 0.8076171875, "rewards//std": 0.02628237009048462, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.034, "grad_norm": 0.5349700450897217, "kl": 0.005817261466290802, "learning_rate": 4.992873326918434e-06, "loss": 0.0006, "num_tokens": 1109536.0, "reward": 0.85833740234375, "reward_std": 0.010428598150610924, "rewards//mean": 0.85833740234375, "rewards//std": 0.03860246390104294, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0342, "grad_norm": 0.6442819237709045, "kl": 0.006201874581165612, "learning_rate": 4.992753105783194e-06, "loss": 0.0006, "num_tokens": 1116056.0, "reward": 0.83734130859375, "reward_std": 0.012758218683302402, "rewards//mean": 0.83734130859375, "rewards//std": 0.02916008234024048, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0344, "grad_norm": 0.5286543369293213, "kl": 0.006252227758523077, "learning_rate": 4.992631880567301e-06, "loss": 0.0006, "num_tokens": 1122600.0, "reward": 0.85650634765625, "reward_std": 0.01419912651181221, "rewards//mean": 0.85650634765625, "rewards//std": 0.034500766545534134, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0346, "grad_norm": 0.6253844499588013, "kl": 0.00583338420256041, "learning_rate": 4.992509651319585e-06, "loss": 0.0006, "num_tokens": 1129104.0, "reward": 0.814208984375, "reward_std": 0.015387440100312233, "rewards//mean": 0.814208984375, "rewards//std": 0.026428259909152985, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0348, "grad_norm": 0.5459834337234497, "kl": 0.006313475081697106, "learning_rate": 4.992386418089279e-06, "loss": 0.0006, "num_tokens": 1135800.0, "reward": 0.83197021484375, "reward_std": 0.011926204897463322, "rewards//mean": 0.83197021484375, "rewards//std": 0.02585022896528244, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.035, "grad_norm": 0.5601797699928284, "kl": 0.005816609016619623, "learning_rate": 4.992262180926022e-06, "loss": 0.0006, "num_tokens": 1142352.0, "reward": 0.84552001953125, "reward_std": 0.011603476479649544, "rewards//mean": 0.84552001953125, "rewards//std": 0.02518041804432869, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0352, "grad_norm": 0.49983859062194824, "kl": 0.006052123644622043, "learning_rate": 4.992136939879857e-06, "loss": 0.0006, "num_tokens": 1148984.0, "reward": 0.8468017578125, "reward_std": 0.015041794627904892, "rewards//mean": 0.8468017578125, "rewards//std": 0.033024612814188004, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0354, "grad_norm": 0.5256810188293457, "kl": 0.006643216998782009, "learning_rate": 4.992010695001229e-06, "loss": 0.0007, "num_tokens": 1155608.0, "reward": 0.8421630859375, "reward_std": 0.011597627773880959, "rewards//mean": 0.8421630859375, "rewards//std": 0.02555079385638237, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0356, "grad_norm": 0.6244553327560425, "kl": 0.007153651211410761, "learning_rate": 4.9918834463409925e-06, "loss": 0.0007, "num_tokens": 1162168.0, "reward": 0.85235595703125, "reward_std": 0.012248958460986614, "rewards//mean": 0.85235595703125, "rewards//std": 0.029620032757520676, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0358, "grad_norm": 0.5499500036239624, "kl": 0.0060980761190876365, "learning_rate": 4.991755193950401e-06, "loss": 0.0006, "num_tokens": 1168704.0, "reward": 0.83935546875, "reward_std": 0.011832846328616142, "rewards//mean": 0.83935546875, "rewards//std": 0.033120047301054, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.036, "grad_norm": 0.5794421434402466, "kl": 0.0063534324581269175, "learning_rate": 4.991625937881117e-06, "loss": 0.0006, "num_tokens": 1175256.0, "reward": 0.85357666015625, "reward_std": 0.016541291028261185, "rewards//mean": 0.85357666015625, "rewards//std": 0.039579860866069794, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0362, "grad_norm": 0.6126803159713745, "kl": 0.006563348462805152, "learning_rate": 4.991495678185202e-06, "loss": 0.0007, "num_tokens": 1181776.0, "reward": 0.8543701171875, "reward_std": 0.010336179286241531, "rewards//mean": 0.8543701171875, "rewards//std": 0.025021584704518318, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0364, "grad_norm": 0.5989588499069214, "kl": 0.00655812764307484, "learning_rate": 4.991364414915126e-06, "loss": 0.0007, "num_tokens": 1188352.0, "reward": 0.85723876953125, "reward_std": 0.015126184560358524, "rewards//mean": 0.85723876953125, "rewards//std": 0.03535842150449753, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0366, "grad_norm": 0.5920560956001282, "kl": 0.007293358852621168, "learning_rate": 4.9912321481237616e-06, "loss": 0.0007, "num_tokens": 1194944.0, "reward": 0.826416015625, "reward_std": 0.009397894144058228, "rewards//mean": 0.826416015625, "rewards//std": 0.019122395664453506, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0368, "grad_norm": 0.609233558177948, "kl": 0.0069352322607301176, "learning_rate": 4.991098877864386e-06, "loss": 0.0007, "num_tokens": 1201344.0, "reward": 0.8267822265625, "reward_std": 0.013857333920896053, "rewards//mean": 0.8267822265625, "rewards//std": 0.018535098060965538, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.037, "grad_norm": 0.6113260984420776, "kl": 0.007776582497172058, "learning_rate": 4.99096460419068e-06, "loss": 0.0008, "num_tokens": 1207896.0, "reward": 0.81707763671875, "reward_std": 0.014834532514214516, "rewards//mean": 0.81707763671875, "rewards//std": 0.035852447152137756, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0372, "grad_norm": 0.537739098072052, "kl": 0.00674009119393304, "learning_rate": 4.990829327156729e-06, "loss": 0.0007, "num_tokens": 1214408.0, "reward": 0.8321533203125, "reward_std": 0.009646743535995483, "rewards//mean": 0.8321533203125, "rewards//std": 0.022872328758239746, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0374, "grad_norm": 0.6183601021766663, "kl": 0.008413400035351515, "learning_rate": 4.990693046817023e-06, "loss": 0.0008, "num_tokens": 1220976.0, "reward": 0.81475830078125, "reward_std": 0.0088888481259346, "rewards//mean": 0.81475830078125, "rewards//std": 0.01764591969549656, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0376, "grad_norm": 0.579095184803009, "kl": 0.0072022924432531, "learning_rate": 4.990555763226456e-06, "loss": 0.0007, "num_tokens": 1227456.0, "reward": 0.8326416015625, "reward_std": 0.014683970250189304, "rewards//mean": 0.8326416015625, "rewards//std": 0.023900222033262253, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0378, "grad_norm": 0.5864057540893555, "kl": 0.008137135358992964, "learning_rate": 4.990417476440326e-06, "loss": 0.0008, "num_tokens": 1233904.0, "reward": 0.85552978515625, "reward_std": 0.012473128736019135, "rewards//mean": 0.85552978515625, "rewards//std": 0.03617484122514725, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.038, "grad_norm": 0.6523225903511047, "kl": 0.008292198239360005, "learning_rate": 4.9902781865143326e-06, "loss": 0.0008, "num_tokens": 1240416.0, "reward": 0.80804443359375, "reward_std": 0.014624349772930145, "rewards//mean": 0.80804443359375, "rewards//std": 0.026068320497870445, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0382, "grad_norm": 0.5711731314659119, "kl": 0.007915714057162404, "learning_rate": 4.990137893504585e-06, "loss": 0.0008, "num_tokens": 1246944.0, "reward": 0.8541259765625, "reward_std": 0.01398186944425106, "rewards//mean": 0.8541259765625, "rewards//std": 0.02810138463973999, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0384, "grad_norm": 0.575506329536438, "kl": 0.007600511948112398, "learning_rate": 4.989996597467591e-06, "loss": 0.0008, "num_tokens": 1253440.0, "reward": 0.8515625, "reward_std": 0.015058934688568115, "rewards//mean": 0.8515625, "rewards//std": 0.029297908768057823, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0386, "grad_norm": 0.6336308717727661, "kl": 0.007927103200927377, "learning_rate": 4.989854298460265e-06, "loss": 0.0008, "num_tokens": 1259976.0, "reward": 0.81378173828125, "reward_std": 0.010543026961386204, "rewards//mean": 0.81378173828125, "rewards//std": 0.01570315845310688, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0388, "grad_norm": 0.5761043429374695, "kl": 0.008717037679161876, "learning_rate": 4.989710996539926e-06, "loss": 0.0009, "num_tokens": 1266520.0, "reward": 0.889404296875, "reward_std": 0.013920702040195465, "rewards//mean": 0.889404296875, "rewards//std": 0.019436465576291084, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.039, "grad_norm": 0.5974484086036682, "kl": 0.008872057602275163, "learning_rate": 4.989566691764296e-06, "loss": 0.0009, "num_tokens": 1273072.0, "reward": 0.8302001953125, "reward_std": 0.016194570809602737, "rewards//mean": 0.8302001953125, "rewards//std": 0.03371773660182953, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0392, "grad_norm": 0.6319258809089661, "kl": 0.009591609938070178, "learning_rate": 4.9894213841914994e-06, "loss": 0.001, "num_tokens": 1279576.0, "reward": 0.80279541015625, "reward_std": 0.008850542828440666, "rewards//mean": 0.80279541015625, "rewards//std": 0.0178802739828825, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0394, "grad_norm": 0.585383951663971, "kl": 0.009328377433121204, "learning_rate": 4.989275073880067e-06, "loss": 0.0009, "num_tokens": 1286072.0, "reward": 0.8275146484375, "reward_std": 0.01107935793697834, "rewards//mean": 0.8275146484375, "rewards//std": 0.025997160002589226, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0396, "grad_norm": 0.5403591990470886, "kl": 0.00814232334960252, "learning_rate": 4.989127760888932e-06, "loss": 0.0008, "num_tokens": 1292584.0, "reward": 0.851318359375, "reward_std": 0.0181453637778759, "rewards//mean": 0.851318359375, "rewards//std": 0.02438831701874733, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0398, "grad_norm": 0.61063551902771, "kl": 0.008625458052847534, "learning_rate": 4.988979445277433e-06, "loss": 0.0009, "num_tokens": 1299088.0, "reward": 0.8294677734375, "reward_std": 0.010040882974863052, "rewards//mean": 0.8294677734375, "rewards//std": 0.01912674866616726, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.04, "grad_norm": 0.6044790744781494, "kl": 0.008569077064748853, "learning_rate": 4.988830127105312e-06, "loss": 0.0009, "num_tokens": 1305592.0, "reward": 0.84759521484375, "reward_std": 0.012233874760568142, "rewards//mean": 0.84759521484375, "rewards//std": 0.026906151324510574, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0402, "grad_norm": 0.6054474115371704, "kl": 0.009775987244211137, "learning_rate": 4.988679806432712e-06, "loss": 0.001, "num_tokens": 1312104.0, "reward": 0.8240966796875, "reward_std": 0.013713570311665535, "rewards//mean": 0.8240966796875, "rewards//std": 0.0317419171333313, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0404, "grad_norm": 0.5946995615959167, "kl": 0.009700121358036995, "learning_rate": 4.988528483320184e-06, "loss": 0.001, "num_tokens": 1318600.0, "reward": 0.842041015625, "reward_std": 0.01567421853542328, "rewards//mean": 0.842041015625, "rewards//std": 0.047219280153512955, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0406, "grad_norm": 0.5984341502189636, "kl": 0.009171057841740549, "learning_rate": 4.9883761578286805e-06, "loss": 0.0009, "num_tokens": 1325088.0, "reward": 0.8309326171875, "reward_std": 0.017868168652057648, "rewards//mean": 0.8309326171875, "rewards//std": 0.039114974439144135, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0408, "grad_norm": 0.5118701457977295, "kl": 0.009044189820997417, "learning_rate": 4.988222830019559e-06, "loss": 0.0009, "num_tokens": 1331568.0, "reward": 0.83575439453125, "reward_std": 0.010470295324921608, "rewards//mean": 0.83575439453125, "rewards//std": 0.01437237299978733, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.041, "grad_norm": 0.6392531991004944, "kl": 0.010742165730334818, "learning_rate": 4.988068499954578e-06, "loss": 0.0011, "num_tokens": 1338136.0, "reward": 0.85009765625, "reward_std": 0.012784970924258232, "rewards//mean": 0.85009765625, "rewards//std": 0.03285572677850723, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0412, "grad_norm": 0.5961043238639832, "kl": 0.011020470352377743, "learning_rate": 4.987913167695904e-06, "loss": 0.0011, "num_tokens": 1344584.0, "reward": 0.84033203125, "reward_std": 0.011852873489260674, "rewards//mean": 0.84033203125, "rewards//std": 0.02290969155728817, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0414, "grad_norm": 0.5800420045852661, "kl": 0.009589398512616754, "learning_rate": 4.987756833306103e-06, "loss": 0.001, "num_tokens": 1351184.0, "reward": 0.849853515625, "reward_std": 0.012520735152065754, "rewards//mean": 0.849853515625, "rewards//std": 0.03702443093061447, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0416, "grad_norm": 0.5232331156730652, "kl": 0.01012984523549676, "learning_rate": 4.987599496848147e-06, "loss": 0.001, "num_tokens": 1357760.0, "reward": 0.84149169921875, "reward_std": 0.016268065199255943, "rewards//mean": 0.84149169921875, "rewards//std": 0.024248182773590088, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0418, "grad_norm": 0.613732099533081, "kl": 0.010579521884210408, "learning_rate": 4.987441158385411e-06, "loss": 0.0011, "num_tokens": 1364392.0, "reward": 0.854736328125, "reward_std": 0.016416363418102264, "rewards//mean": 0.854736328125, "rewards//std": 0.03490995615720749, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.042, "grad_norm": 0.5997706651687622, "kl": 0.010741343721747398, "learning_rate": 4.987281817981674e-06, "loss": 0.0011, "num_tokens": 1370944.0, "reward": 0.86395263671875, "reward_std": 0.013692582026124, "rewards//mean": 0.86395263671875, "rewards//std": 0.030517160892486572, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0422, "grad_norm": 0.5470620393753052, "kl": 0.013648364343680441, "learning_rate": 4.987121475701118e-06, "loss": 0.0014, "num_tokens": 1377408.0, "reward": 0.84954833984375, "reward_std": 0.011153290048241615, "rewards//mean": 0.84954833984375, "rewards//std": 0.019954398274421692, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0424, "grad_norm": 0.7061495184898376, "kl": 0.011234368314035237, "learning_rate": 4.986960131608329e-06, "loss": 0.0011, "num_tokens": 1383864.0, "reward": 0.82733154296875, "reward_std": 0.011791674420237541, "rewards//mean": 0.82733154296875, "rewards//std": 0.022948559373617172, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0426, "grad_norm": 0.5791126489639282, "kl": 0.011082913028076291, "learning_rate": 4.986797785768296e-06, "loss": 0.0011, "num_tokens": 1390304.0, "reward": 0.851318359375, "reward_std": 0.018234960734844208, "rewards//mean": 0.851318359375, "rewards//std": 0.04792189970612526, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0428, "grad_norm": 0.6446351408958435, "kl": 0.010612470097839832, "learning_rate": 4.986634438246413e-06, "loss": 0.0011, "num_tokens": 1396776.0, "reward": 0.836181640625, "reward_std": 0.00901946984231472, "rewards//mean": 0.836181640625, "rewards//std": 0.017145667225122452, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.043, "grad_norm": 0.6024380922317505, "kl": 0.010903890011832118, "learning_rate": 4.986470089108476e-06, "loss": 0.0011, "num_tokens": 1403280.0, "reward": 0.82220458984375, "reward_std": 0.010884717106819153, "rewards//mean": 0.82220458984375, "rewards//std": 0.019161837175488472, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0432, "grad_norm": 0.5589469075202942, "kl": 0.011347507941536605, "learning_rate": 4.986304738420684e-06, "loss": 0.0011, "num_tokens": 1409752.0, "reward": 0.84674072265625, "reward_std": 0.01086362637579441, "rewards//mean": 0.84674072265625, "rewards//std": 0.016987673938274384, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0434, "grad_norm": 0.6330822706222534, "kl": 0.011820332612842321, "learning_rate": 4.986138386249641e-06, "loss": 0.0012, "num_tokens": 1416320.0, "reward": 0.84808349609375, "reward_std": 0.013128334656357765, "rewards//mean": 0.84808349609375, "rewards//std": 0.026347875595092773, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0436, "grad_norm": 0.6010637283325195, "kl": 0.00960783491609618, "learning_rate": 4.985971032662352e-06, "loss": 0.001, "num_tokens": 1422800.0, "reward": 0.83551025390625, "reward_std": 0.014542114920914173, "rewards//mean": 0.83551025390625, "rewards//std": 0.030082007870078087, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0438, "grad_norm": 0.5061883926391602, "kl": 0.010777416347991675, "learning_rate": 4.98580267772623e-06, "loss": 0.0011, "num_tokens": 1429392.0, "reward": 0.8599853515625, "reward_std": 0.013834359124302864, "rewards//mean": 0.8599853515625, "rewards//std": 0.025753792375326157, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.044, "grad_norm": 0.58171147108078, "kl": 0.011898035183548927, "learning_rate": 4.985633321509086e-06, "loss": 0.0012, "num_tokens": 1435816.0, "reward": 0.8372802734375, "reward_std": 0.01282973401248455, "rewards//mean": 0.8372802734375, "rewards//std": 0.026541143655776978, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0442, "grad_norm": 0.6470307111740112, "kl": 0.011951541411690414, "learning_rate": 4.985462964079137e-06, "loss": 0.0012, "num_tokens": 1442424.0, "reward": 0.8245849609375, "reward_std": 0.014490412548184395, "rewards//mean": 0.8245849609375, "rewards//std": 0.026085518300533295, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0444, "grad_norm": 0.6614448428153992, "kl": 0.011769724893383682, "learning_rate": 4.985291605505004e-06, "loss": 0.0012, "num_tokens": 1449072.0, "reward": 0.850341796875, "reward_std": 0.013671314343810081, "rewards//mean": 0.850341796875, "rewards//std": 0.020503751933574677, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0446, "grad_norm": 0.6548160910606384, "kl": 0.011074570356868207, "learning_rate": 4.9851192458557084e-06, "loss": 0.0011, "num_tokens": 1455600.0, "reward": 0.82940673828125, "reward_std": 0.012141291052103043, "rewards//mean": 0.82940673828125, "rewards//std": 0.03643876314163208, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0448, "grad_norm": 0.6014297008514404, "kl": 0.010654145269654691, "learning_rate": 4.984945885200679e-06, "loss": 0.0011, "num_tokens": 1462256.0, "reward": 0.85821533203125, "reward_std": 0.008870145305991173, "rewards//mean": 0.85821533203125, "rewards//std": 0.016872331500053406, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.045, "grad_norm": 0.5984156131744385, "kl": 0.011732026003301144, "learning_rate": 4.984771523609744e-06, "loss": 0.0012, "num_tokens": 1468736.0, "reward": 0.854248046875, "reward_std": 0.01087274868041277, "rewards//mean": 0.854248046875, "rewards//std": 0.03552206605672836, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0452, "grad_norm": 0.6330734491348267, "kl": 0.012012874241918325, "learning_rate": 4.9845961611531356e-06, "loss": 0.0012, "num_tokens": 1475256.0, "reward": 0.8272705078125, "reward_std": 0.012549136765301228, "rewards//mean": 0.8272705078125, "rewards//std": 0.027049588039517403, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0454, "grad_norm": 0.6217445135116577, "kl": 0.010952474898658693, "learning_rate": 4.984419797901491e-06, "loss": 0.0011, "num_tokens": 1481752.0, "reward": 0.80938720703125, "reward_std": 0.01106027141213417, "rewards//mean": 0.80938720703125, "rewards//std": 0.015995418652892113, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0456, "grad_norm": 0.5331505537033081, "kl": 0.011694666813127697, "learning_rate": 4.984242433925849e-06, "loss": 0.0012, "num_tokens": 1488312.0, "reward": 0.842041015625, "reward_std": 0.015316024422645569, "rewards//mean": 0.842041015625, "rewards//std": 0.055478423833847046, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0458, "grad_norm": 0.5423309803009033, "kl": 0.011228133691474795, "learning_rate": 4.984064069297652e-06, "loss": 0.0011, "num_tokens": 1494848.0, "reward": 0.8193359375, "reward_std": 0.01109884213656187, "rewards//mean": 0.8193359375, "rewards//std": 0.0388573482632637, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.046, "grad_norm": 0.6137152910232544, "kl": 0.013376098999287933, "learning_rate": 4.983884704088745e-06, "loss": 0.0013, "num_tokens": 1501336.0, "reward": 0.80975341796875, "reward_std": 0.01384538970887661, "rewards//mean": 0.80975341796875, "rewards//std": 0.032224390655756, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0462, "grad_norm": 0.578719973564148, "kl": 0.012490722816437483, "learning_rate": 4.983704338371375e-06, "loss": 0.0012, "num_tokens": 1507872.0, "reward": 0.8492431640625, "reward_std": 0.013666566461324692, "rewards//mean": 0.8492431640625, "rewards//std": 0.03480464220046997, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0464, "grad_norm": 0.5728334784507751, "kl": 0.013004794949665666, "learning_rate": 4.983522972218196e-06, "loss": 0.0013, "num_tokens": 1514504.0, "reward": 0.85906982421875, "reward_std": 0.015117382630705833, "rewards//mean": 0.85906982421875, "rewards//std": 0.02659662440419197, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0466, "grad_norm": 0.5708733201026917, "kl": 0.010296467458829284, "learning_rate": 4.983340605702261e-06, "loss": 0.001, "num_tokens": 1521152.0, "reward": 0.79461669921875, "reward_std": 0.011611716821789742, "rewards//mean": 0.79461669921875, "rewards//std": 0.027777044102549553, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0468, "grad_norm": 0.5398491621017456, "kl": 0.010386907437350601, "learning_rate": 4.983157238897026e-06, "loss": 0.001, "num_tokens": 1527624.0, "reward": 0.82073974609375, "reward_std": 0.009576220065355301, "rewards//mean": 0.82073974609375, "rewards//std": 0.01762274280190468, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.047, "grad_norm": 0.6520707011222839, "kl": 0.011667132377624512, "learning_rate": 4.982972871876353e-06, "loss": 0.0012, "num_tokens": 1534072.0, "reward": 0.877685546875, "reward_std": 0.014000840485095978, "rewards//mean": 0.877685546875, "rewards//std": 0.0195234976708889, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0472, "grad_norm": 0.5892413854598999, "kl": 0.010782511322759092, "learning_rate": 4.982787504714503e-06, "loss": 0.0011, "num_tokens": 1540600.0, "reward": 0.8544921875, "reward_std": 0.015170978382229805, "rewards//mean": 0.8544921875, "rewards//std": 0.03643839806318283, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0474, "grad_norm": 0.6138961315155029, "kl": 0.01105735485907644, "learning_rate": 4.982601137486144e-06, "loss": 0.0011, "num_tokens": 1547024.0, "reward": 0.8653564453125, "reward_std": 0.015798892825841904, "rewards//mean": 0.8653564453125, "rewards//std": 0.0336889922618866, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0476, "grad_norm": 0.5267355442047119, "kl": 0.011293756659142673, "learning_rate": 4.9824137702663424e-06, "loss": 0.0011, "num_tokens": 1553528.0, "reward": 0.856689453125, "reward_std": 0.013259888626635075, "rewards//mean": 0.856689453125, "rewards//std": 0.02487991936504841, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0478, "grad_norm": 0.5977954864501953, "kl": 0.012308461940847337, "learning_rate": 4.982225403130572e-06, "loss": 0.0012, "num_tokens": 1560032.0, "reward": 0.8436279296875, "reward_std": 0.01567743718624115, "rewards//mean": 0.8436279296875, "rewards//std": 0.028501469641923904, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.048, "grad_norm": 0.5983859300613403, "kl": 0.011517034086864442, "learning_rate": 4.982036036154706e-06, "loss": 0.0012, "num_tokens": 1566608.0, "reward": 0.81439208984375, "reward_std": 0.015477526932954788, "rewards//mean": 0.81439208984375, "rewards//std": 0.034139085561037064, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0482, "grad_norm": 0.6366744637489319, "kl": 0.010967534384690225, "learning_rate": 4.981845669415022e-06, "loss": 0.0011, "num_tokens": 1573104.0, "reward": 0.79840087890625, "reward_std": 0.013367927633225918, "rewards//mean": 0.79840087890625, "rewards//std": 0.030470000579953194, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0484, "grad_norm": 0.5575968027114868, "kl": 0.010500523203518242, "learning_rate": 4.981654302988198e-06, "loss": 0.0011, "num_tokens": 1579664.0, "reward": 0.80682373046875, "reward_std": 0.010329650714993477, "rewards//mean": 0.80682373046875, "rewards//std": 0.016217226162552834, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0486, "grad_norm": 0.5449930429458618, "kl": 0.013868011650629342, "learning_rate": 4.9814619369513184e-06, "loss": 0.0014, "num_tokens": 1586088.0, "reward": 0.8521728515625, "reward_std": 0.015307075344026089, "rewards//mean": 0.8521728515625, "rewards//std": 0.03274288401007652, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0488, "grad_norm": 0.6359149813652039, "kl": 0.011965149664320052, "learning_rate": 4.981268571381867e-06, "loss": 0.0012, "num_tokens": 1592592.0, "reward": 0.85205078125, "reward_std": 0.009656844660639763, "rewards//mean": 0.85205078125, "rewards//std": 0.02580341137945652, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.049, "grad_norm": 0.6533293724060059, "kl": 0.011329148430377245, "learning_rate": 4.981074206357732e-06, "loss": 0.0011, "num_tokens": 1599216.0, "reward": 0.80035400390625, "reward_std": 0.013833885081112385, "rewards//mean": 0.80035400390625, "rewards//std": 0.026191137731075287, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0492, "grad_norm": 0.5970718264579773, "kl": 0.01282750372774899, "learning_rate": 4.980878841957203e-06, "loss": 0.0013, "num_tokens": 1605792.0, "reward": 0.85223388671875, "reward_std": 0.01925882138311863, "rewards//mean": 0.85223388671875, "rewards//std": 0.029467342421412468, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0494, "grad_norm": 0.5749726295471191, "kl": 0.015270262490957975, "learning_rate": 4.980682478258973e-06, "loss": 0.0015, "num_tokens": 1612288.0, "reward": 0.86114501953125, "reward_std": 0.01426261942833662, "rewards//mean": 0.86114501953125, "rewards//std": 0.030289122834801674, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0496, "grad_norm": 0.6399112939834595, "kl": 0.012950829579494894, "learning_rate": 4.980485115342138e-06, "loss": 0.0013, "num_tokens": 1618808.0, "reward": 0.84722900390625, "reward_std": 0.010733772069215775, "rewards//mean": 0.84722900390625, "rewards//std": 0.015186639502644539, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0498, "grad_norm": 0.5356336236000061, "kl": 0.015245730872265995, "learning_rate": 4.980286753286196e-06, "loss": 0.0015, "num_tokens": 1625264.0, "reward": 0.79901123046875, "reward_std": 0.008075348101556301, "rewards//mean": 0.79901123046875, "rewards//std": 0.018752193078398705, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.05, "grad_norm": 0.6941496729850769, "kl": 0.013328676926903427, "learning_rate": 4.980087392171045e-06, "loss": 0.0013, "num_tokens": 1631784.0, "reward": 0.83270263671875, "reward_std": 0.015555602498352528, "rewards//mean": 0.83270263671875, "rewards//std": 0.04173377901315689, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0502, "grad_norm": 0.5972203612327576, "kl": 0.01535372855141759, "learning_rate": 4.9798870320769884e-06, "loss": 0.0015, "num_tokens": 1638272.0, "reward": 0.86041259765625, "reward_std": 0.01315420214086771, "rewards//mean": 0.86041259765625, "rewards//std": 0.031080909073352814, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0504, "grad_norm": 0.5607619881629944, "kl": 0.012482833117246628, "learning_rate": 4.979685673084733e-06, "loss": 0.0012, "num_tokens": 1644856.0, "reward": 0.8389892578125, "reward_std": 0.0162490364164114, "rewards//mean": 0.8389892578125, "rewards//std": 0.027370035648345947, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0506, "grad_norm": 0.6199642419815063, "kl": 0.013209497090429068, "learning_rate": 4.979483315275385e-06, "loss": 0.0013, "num_tokens": 1651312.0, "reward": 0.85308837890625, "reward_std": 0.015295417048037052, "rewards//mean": 0.85308837890625, "rewards//std": 0.024375824257731438, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0508, "grad_norm": 0.6028778553009033, "kl": 0.015642864862456918, "learning_rate": 4.979279958730454e-06, "loss": 0.0016, "num_tokens": 1657896.0, "reward": 0.8355712890625, "reward_std": 0.013927731662988663, "rewards//mean": 0.8355712890625, "rewards//std": 0.03703648969531059, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.051, "grad_norm": 0.6312702298164368, "kl": 0.014434279641136527, "learning_rate": 4.979075603531852e-06, "loss": 0.0014, "num_tokens": 1664464.0, "reward": 0.81414794921875, "reward_std": 0.011265669949352741, "rewards//mean": 0.81414794921875, "rewards//std": 0.01821327768266201, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0512, "grad_norm": 0.5943956971168518, "kl": 0.01590067707002163, "learning_rate": 4.978870249761893e-06, "loss": 0.0016, "num_tokens": 1670912.0, "reward": 0.88873291015625, "reward_std": 0.011189533397555351, "rewards//mean": 0.88873291015625, "rewards//std": 0.02351994626224041, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0514, "grad_norm": 0.6058782935142517, "kl": 0.015274307108484209, "learning_rate": 4.978663897503294e-06, "loss": 0.0015, "num_tokens": 1677448.0, "reward": 0.87750244140625, "reward_std": 0.013855671510100365, "rewards//mean": 0.87750244140625, "rewards//std": 0.029026882722973824, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0516, "grad_norm": 0.6686677932739258, "kl": 0.015571855707094073, "learning_rate": 4.978456546839175e-06, "loss": 0.0016, "num_tokens": 1683936.0, "reward": 0.83636474609375, "reward_std": 0.014321016147732735, "rewards//mean": 0.83636474609375, "rewards//std": 0.03732571005821228, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0518, "grad_norm": 0.5977317094802856, "kl": 0.015480601461604238, "learning_rate": 4.978248197853053e-06, "loss": 0.0015, "num_tokens": 1690536.0, "reward": 0.87762451171875, "reward_std": 0.014677740633487701, "rewards//mean": 0.87762451171875, "rewards//std": 0.029107604175806046, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.052, "grad_norm": 0.5967756509780884, "kl": 0.013380717020481825, "learning_rate": 4.978038850628855e-06, "loss": 0.0013, "num_tokens": 1697112.0, "reward": 0.8309326171875, "reward_std": 0.015575399622321129, "rewards//mean": 0.8309326171875, "rewards//std": 0.030181292444467545, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0522, "grad_norm": 0.6308935284614563, "kl": 0.017400558106601238, "learning_rate": 4.977828505250903e-06, "loss": 0.0017, "num_tokens": 1703664.0, "reward": 0.8321533203125, "reward_std": 0.00893273763358593, "rewards//mean": 0.8321533203125, "rewards//std": 0.016703374683856964, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0524, "grad_norm": 0.585950493812561, "kl": 0.01643908827099949, "learning_rate": 4.977617161803927e-06, "loss": 0.0016, "num_tokens": 1710240.0, "reward": 0.82623291015625, "reward_std": 0.012724083848297596, "rewards//mean": 0.82623291015625, "rewards//std": 0.02370458096265793, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0526, "grad_norm": 0.6090091466903687, "kl": 0.017514563747681677, "learning_rate": 4.977404820373053e-06, "loss": 0.0018, "num_tokens": 1716792.0, "reward": 0.86749267578125, "reward_std": 0.013316700235009193, "rewards//mean": 0.86749267578125, "rewards//std": 0.02335784211754799, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0528, "grad_norm": 0.653965413570404, "kl": 0.014788356143981218, "learning_rate": 4.977191481043814e-06, "loss": 0.0015, "num_tokens": 1723320.0, "reward": 0.850830078125, "reward_std": 0.016372665762901306, "rewards//mean": 0.850830078125, "rewards//std": 0.05733340233564377, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.053, "grad_norm": 0.6236534714698792, "kl": 0.020398969296365976, "learning_rate": 4.976977143902143e-06, "loss": 0.002, "num_tokens": 1729736.0, "reward": 0.8720703125, "reward_std": 0.017627835273742676, "rewards//mean": 0.8720703125, "rewards//std": 0.02900712378323078, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0532, "grad_norm": 0.6153638958930969, "kl": 0.022930864011868834, "learning_rate": 4.976761809034375e-06, "loss": 0.0023, "num_tokens": 1736176.0, "reward": 0.8203125, "reward_std": 0.010127585381269455, "rewards//mean": 0.8203125, "rewards//std": 0.023313162848353386, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0534, "grad_norm": 0.6875045895576477, "kl": 0.02023582032416016, "learning_rate": 4.976545476527246e-06, "loss": 0.002, "num_tokens": 1742640.0, "reward": 0.84423828125, "reward_std": 0.01115519367158413, "rewards//mean": 0.84423828125, "rewards//std": 0.03886669874191284, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0536, "grad_norm": 0.5966421365737915, "kl": 0.018444908084347844, "learning_rate": 4.976328146467895e-06, "loss": 0.0018, "num_tokens": 1749224.0, "reward": 0.8155517578125, "reward_std": 0.011006256565451622, "rewards//mean": 0.8155517578125, "rewards//std": 0.02147979475557804, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0538, "grad_norm": 0.6186309456825256, "kl": 0.019571573473513126, "learning_rate": 4.976109818943863e-06, "loss": 0.002, "num_tokens": 1755704.0, "reward": 0.8206787109375, "reward_std": 0.008112533017992973, "rewards//mean": 0.8206787109375, "rewards//std": 0.021176017820835114, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.054, "grad_norm": 0.611027181148529, "kl": 0.016425883513875306, "learning_rate": 4.975890494043092e-06, "loss": 0.0016, "num_tokens": 1762216.0, "reward": 0.811767578125, "reward_std": 0.011278307065367699, "rewards//mean": 0.811767578125, "rewards//std": 0.017328334972262383, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0542, "grad_norm": 0.5591591596603394, "kl": 0.015827651717700064, "learning_rate": 4.975670171853926e-06, "loss": 0.0016, "num_tokens": 1768744.0, "reward": 0.85272216796875, "reward_std": 0.011784248054027557, "rewards//mean": 0.85272216796875, "rewards//std": 0.01991415023803711, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0544, "grad_norm": 0.5942398905754089, "kl": 0.016672777361236513, "learning_rate": 4.975448852465111e-06, "loss": 0.0017, "num_tokens": 1775304.0, "reward": 0.845703125, "reward_std": 0.010333438403904438, "rewards//mean": 0.845703125, "rewards//std": 0.02365351840853691, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0546, "grad_norm": 0.5608808994293213, "kl": 0.021653135307133198, "learning_rate": 4.975226535965795e-06, "loss": 0.0022, "num_tokens": 1781808.0, "reward": 0.84881591796875, "reward_std": 0.018758047372102737, "rewards//mean": 0.84881591796875, "rewards//std": 0.03514370694756508, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0548, "grad_norm": 0.6057482361793518, "kl": 0.019426355720497668, "learning_rate": 4.975003222445525e-06, "loss": 0.0019, "num_tokens": 1788288.0, "reward": 0.773193359375, "reward_std": 0.010500762611627579, "rewards//mean": 0.773193359375, "rewards//std": 0.020420897752046585, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.055, "grad_norm": 0.7404553890228271, "kl": 0.017519006971269846, "learning_rate": 4.974778911994254e-06, "loss": 0.0018, "num_tokens": 1794840.0, "reward": 0.85693359375, "reward_std": 0.011788511648774147, "rewards//mean": 0.85693359375, "rewards//std": 0.029548974707722664, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0552, "grad_norm": 0.586951732635498, "kl": 0.017076579038985074, "learning_rate": 4.974553604702332e-06, "loss": 0.0017, "num_tokens": 1801368.0, "reward": 0.84478759765625, "reward_std": 0.011476319283246994, "rewards//mean": 0.84478759765625, "rewards//std": 0.022121649235486984, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0554, "grad_norm": 0.6556093692779541, "kl": 0.016207139007747173, "learning_rate": 4.974327300660515e-06, "loss": 0.0016, "num_tokens": 1807880.0, "reward": 0.81658935546875, "reward_std": 0.010874106548726559, "rewards//mean": 0.81658935546875, "rewards//std": 0.02728992886841297, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0556, "grad_norm": 0.6637612581253052, "kl": 0.02063511044252664, "learning_rate": 4.974099999959957e-06, "loss": 0.0021, "num_tokens": 1814448.0, "reward": 0.85821533203125, "reward_std": 0.008319644257426262, "rewards//mean": 0.85821533203125, "rewards//std": 0.01655350811779499, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0558, "grad_norm": 0.5973981618881226, "kl": 0.020477684447541833, "learning_rate": 4.973871702692215e-06, "loss": 0.002, "num_tokens": 1820936.0, "reward": 0.849609375, "reward_std": 0.010854586958885193, "rewards//mean": 0.849609375, "rewards//std": 0.03117240034043789, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.056, "grad_norm": 0.5967226624488831, "kl": 0.023090009344741702, "learning_rate": 4.973642408949247e-06, "loss": 0.0023, "num_tokens": 1827440.0, "reward": 0.804931640625, "reward_std": 0.009756384417414665, "rewards//mean": 0.804931640625, "rewards//std": 0.015545355156064034, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0562, "grad_norm": 0.6917018890380859, "kl": 0.021913145435974002, "learning_rate": 4.9734121188234115e-06, "loss": 0.0022, "num_tokens": 1833984.0, "reward": 0.84295654296875, "reward_std": 0.01435273140668869, "rewards//mean": 0.84295654296875, "rewards//std": 0.024912068620324135, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0564, "grad_norm": 0.6409370303153992, "kl": 0.020682931412011385, "learning_rate": 4.973180832407471e-06, "loss": 0.0021, "num_tokens": 1840440.0, "reward": 0.7938232421875, "reward_std": 0.008664321154356003, "rewards//mean": 0.7938232421875, "rewards//std": 0.015801778063178062, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0566, "grad_norm": 0.5642397999763489, "kl": 0.016882274649105966, "learning_rate": 4.972948549794587e-06, "loss": 0.0017, "num_tokens": 1846928.0, "reward": 0.8372802734375, "reward_std": 0.00897817499935627, "rewards//mean": 0.8372802734375, "rewards//std": 0.020448677241802216, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0568, "grad_norm": 0.5508286952972412, "kl": 0.018819021992385387, "learning_rate": 4.972715271078323e-06, "loss": 0.0019, "num_tokens": 1853400.0, "reward": 0.8465576171875, "reward_std": 0.012325981631875038, "rewards//mean": 0.8465576171875, "rewards//std": 0.026073908433318138, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.057, "grad_norm": 0.5768721103668213, "kl": 0.020360214170068502, "learning_rate": 4.972480996352644e-06, "loss": 0.002, "num_tokens": 1859832.0, "reward": 0.83929443359375, "reward_std": 0.011263377964496613, "rewards//mean": 0.83929443359375, "rewards//std": 0.026592640206217766, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0572, "grad_norm": 0.5812131762504578, "kl": 0.016725558903999627, "learning_rate": 4.9722457257119144e-06, "loss": 0.0017, "num_tokens": 1866384.0, "reward": 0.822265625, "reward_std": 0.008006171323359013, "rewards//mean": 0.822265625, "rewards//std": 0.01790745183825493, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0574, "grad_norm": 0.6562964916229248, "kl": 0.022804646170698106, "learning_rate": 4.972009459250903e-06, "loss": 0.0023, "num_tokens": 1872928.0, "reward": 0.82635498046875, "reward_std": 0.011693461798131466, "rewards//mean": 0.82635498046875, "rewards//std": 0.03444236144423485, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0576, "grad_norm": 0.7030390501022339, "kl": 0.021410147426649928, "learning_rate": 4.971772197064776e-06, "loss": 0.0021, "num_tokens": 1879496.0, "reward": 0.8388671875, "reward_std": 0.012064478360116482, "rewards//mean": 0.8388671875, "rewards//std": 0.020611261948943138, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0578, "grad_norm": 0.6291245222091675, "kl": 0.017788728117011487, "learning_rate": 4.971533939249105e-06, "loss": 0.0018, "num_tokens": 1886096.0, "reward": 0.847412109375, "reward_std": 0.00863015465438366, "rewards//mean": 0.847412109375, "rewards//std": 0.022807708010077477, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.058, "grad_norm": 0.5937075614929199, "kl": 0.019959610304795206, "learning_rate": 4.9712946858998576e-06, "loss": 0.002, "num_tokens": 1892696.0, "reward": 0.86376953125, "reward_std": 0.00976363942027092, "rewards//mean": 0.86376953125, "rewards//std": 0.020237719640135765, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0582, "grad_norm": 0.7244158983230591, "kl": 0.02064157126005739, "learning_rate": 4.971054437113406e-06, "loss": 0.0021, "num_tokens": 1899240.0, "reward": 0.85516357421875, "reward_std": 0.012879934161901474, "rewards//mean": 0.85516357421875, "rewards//std": 0.036751486361026764, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0584, "grad_norm": 0.6756367087364197, "kl": 0.02059971122071147, "learning_rate": 4.9708131929865235e-06, "loss": 0.0021, "num_tokens": 1905696.0, "reward": 0.8631591796875, "reward_std": 0.019185233861207962, "rewards//mean": 0.8631591796875, "rewards//std": 0.03356654942035675, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0586, "grad_norm": 0.6094867587089539, "kl": 0.02083066711202264, "learning_rate": 4.970570953616383e-06, "loss": 0.0021, "num_tokens": 1912248.0, "reward": 0.87664794921875, "reward_std": 0.012814180925488472, "rewards//mean": 0.87664794921875, "rewards//std": 0.023896027356386185, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0588, "grad_norm": 0.673071563243866, "kl": 0.017373336595483124, "learning_rate": 4.970327719100556e-06, "loss": 0.0017, "num_tokens": 1918816.0, "reward": 0.8592529296875, "reward_std": 0.011386911384761333, "rewards//mean": 0.8592529296875, "rewards//std": 0.029031902551651, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.059, "grad_norm": 0.6086812615394592, "kl": 0.019332718802616, "learning_rate": 4.970083489537021e-06, "loss": 0.0019, "num_tokens": 1925240.0, "reward": 0.86419677734375, "reward_std": 0.017809508368372917, "rewards//mean": 0.86419677734375, "rewards//std": 0.02093472145497799, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0592, "grad_norm": 0.6028139591217041, "kl": 0.015487979399040341, "learning_rate": 4.96983826502415e-06, "loss": 0.0015, "num_tokens": 1931848.0, "reward": 0.84808349609375, "reward_std": 0.01572677493095398, "rewards//mean": 0.84808349609375, "rewards//std": 0.03593721240758896, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0594, "grad_norm": 0.6186932921409607, "kl": 0.014006961951963603, "learning_rate": 4.969592045660723e-06, "loss": 0.0014, "num_tokens": 1938352.0, "reward": 0.87347412109375, "reward_std": 0.016976596787571907, "rewards//mean": 0.87347412109375, "rewards//std": 0.03520524874329567, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0596, "grad_norm": 0.5595616698265076, "kl": 0.019682875834405422, "learning_rate": 4.969344831545914e-06, "loss": 0.002, "num_tokens": 1944976.0, "reward": 0.85174560546875, "reward_std": 0.013488125056028366, "rewards//mean": 0.85174560546875, "rewards//std": 0.023379866033792496, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0598, "grad_norm": 0.6204686760902405, "kl": 0.020846938248723745, "learning_rate": 4.969096622779303e-06, "loss": 0.0021, "num_tokens": 1951480.0, "reward": 0.85546875, "reward_std": 0.017438407987356186, "rewards//mean": 0.85546875, "rewards//std": 0.05409525707364082, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.06, "grad_norm": 0.6783480644226074, "kl": 0.021353234420530498, "learning_rate": 4.968847419460867e-06, "loss": 0.0021, "num_tokens": 1957992.0, "reward": 0.80029296875, "reward_std": 0.009782904759049416, "rewards//mean": 0.80029296875, "rewards//std": 0.024602141231298447, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0602, "grad_norm": 0.6517021656036377, "kl": 0.02009395882487297, "learning_rate": 4.968597221690986e-06, "loss": 0.002, "num_tokens": 1964584.0, "reward": 0.842041015625, "reward_std": 0.011664390563964844, "rewards//mean": 0.842041015625, "rewards//std": 0.023218169808387756, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0604, "grad_norm": 0.6198382377624512, "kl": 0.019521123496815562, "learning_rate": 4.96834602957044e-06, "loss": 0.002, "num_tokens": 1971112.0, "reward": 0.8695068359375, "reward_std": 0.02505212277173996, "rewards//mean": 0.8695068359375, "rewards//std": 0.037069171667099, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0606, "grad_norm": 0.5796791315078735, "kl": 0.017055419622920454, "learning_rate": 4.968093843200407e-06, "loss": 0.0017, "num_tokens": 1977768.0, "reward": 0.84686279296875, "reward_std": 0.011925576254725456, "rewards//mean": 0.84686279296875, "rewards//std": 0.04024247080087662, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0608, "grad_norm": 0.6452294588088989, "kl": 0.02542811818420887, "learning_rate": 4.96784066268247e-06, "loss": 0.0025, "num_tokens": 1984296.0, "reward": 0.8172607421875, "reward_std": 0.0118525680154562, "rewards//mean": 0.8172607421875, "rewards//std": 0.019493624567985535, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.061, "grad_norm": 0.5470280647277832, "kl": 0.017710189567878842, "learning_rate": 4.967586488118609e-06, "loss": 0.0018, "num_tokens": 1990848.0, "reward": 0.8353271484375, "reward_std": 0.012310231104493141, "rewards//mean": 0.8353271484375, "rewards//std": 0.019290665164589882, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0612, "grad_norm": 0.5871063470840454, "kl": 0.018388585303910077, "learning_rate": 4.967331319611206e-06, "loss": 0.0018, "num_tokens": 1997344.0, "reward": 0.829345703125, "reward_std": 0.009855730459094048, "rewards//mean": 0.829345703125, "rewards//std": 0.0323907844722271, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0614, "grad_norm": 0.5350654721260071, "kl": 0.020239924429915845, "learning_rate": 4.9670751572630425e-06, "loss": 0.002, "num_tokens": 2003848.0, "reward": 0.82427978515625, "reward_std": 0.011770764365792274, "rewards//mean": 0.82427978515625, "rewards//std": 0.027529064565896988, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0616, "grad_norm": 0.597829282283783, "kl": 0.022617788519710302, "learning_rate": 4.9668180011773e-06, "loss": 0.0023, "num_tokens": 2010360.0, "reward": 0.80657958984375, "reward_std": 0.013200776651501656, "rewards//mean": 0.80657958984375, "rewards//std": 0.03527655452489853, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0618, "grad_norm": 0.6784936785697937, "kl": 0.02636838029138744, "learning_rate": 4.966559851457562e-06, "loss": 0.0026, "num_tokens": 2016808.0, "reward": 0.742431640625, "reward_std": 0.013185866177082062, "rewards//mean": 0.742431640625, "rewards//std": 0.02122350223362446, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.062, "grad_norm": 0.6267093420028687, "kl": 0.02070951892528683, "learning_rate": 4.966300708207811e-06, "loss": 0.0021, "num_tokens": 2023344.0, "reward": 0.82403564453125, "reward_std": 0.01098247803747654, "rewards//mean": 0.82403564453125, "rewards//std": 0.030690282583236694, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0622, "grad_norm": 0.570456862449646, "kl": 0.022752946242690086, "learning_rate": 4.96604057153243e-06, "loss": 0.0023, "num_tokens": 2029784.0, "reward": 0.858154296875, "reward_std": 0.012154900468885899, "rewards//mean": 0.858154296875, "rewards//std": 0.042659733444452286, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0624, "grad_norm": 0.608584463596344, "kl": 0.02454413822852075, "learning_rate": 4.965779441536202e-06, "loss": 0.0025, "num_tokens": 2036360.0, "reward": 0.825439453125, "reward_std": 0.013766555115580559, "rewards//mean": 0.825439453125, "rewards//std": 0.04080842435359955, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0626, "grad_norm": 0.5525429844856262, "kl": 0.019763967022299767, "learning_rate": 4.965517318324308e-06, "loss": 0.002, "num_tokens": 2042848.0, "reward": 0.707763671875, "reward_std": 0.012647854164242744, "rewards//mean": 0.707763671875, "rewards//std": 0.031870659440755844, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0628, "grad_norm": 0.6055047512054443, "kl": 0.021872437093406916, "learning_rate": 4.965254202002334e-06, "loss": 0.0022, "num_tokens": 2049312.0, "reward": 0.880126953125, "reward_std": 0.01076760608702898, "rewards//mean": 0.880126953125, "rewards//std": 0.024188879877328873, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.063, "grad_norm": 0.6491442322731018, "kl": 0.018391093239188194, "learning_rate": 4.964990092676263e-06, "loss": 0.0018, "num_tokens": 2055776.0, "reward": 0.83758544921875, "reward_std": 0.014810843393206596, "rewards//mean": 0.83758544921875, "rewards//std": 0.028338147327303886, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0632, "grad_norm": 0.5824354887008667, "kl": 0.024069178150966763, "learning_rate": 4.964724990452476e-06, "loss": 0.0024, "num_tokens": 2062288.0, "reward": 0.8619384765625, "reward_std": 0.011154308915138245, "rewards//mean": 0.8619384765625, "rewards//std": 0.025384364649653435, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0634, "grad_norm": 0.6070606708526611, "kl": 0.02114152815192938, "learning_rate": 4.9644588954377595e-06, "loss": 0.0021, "num_tokens": 2068864.0, "reward": 0.806640625, "reward_std": 0.006708196830004454, "rewards//mean": 0.806640625, "rewards//std": 0.01836155168712139, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0636, "grad_norm": 0.6098837852478027, "kl": 0.019263736554421484, "learning_rate": 4.964191807739293e-06, "loss": 0.0019, "num_tokens": 2075320.0, "reward": 0.85296630859375, "reward_std": 0.012227406725287437, "rewards//mean": 0.85296630859375, "rewards//std": 0.020224157720804214, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0638, "grad_norm": 0.6126257181167603, "kl": 0.02082336728926748, "learning_rate": 4.963923727464661e-06, "loss": 0.0021, "num_tokens": 2081848.0, "reward": 0.84228515625, "reward_std": 0.015138156712055206, "rewards//mean": 0.84228515625, "rewards//std": 0.02339095063507557, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.064, "grad_norm": 0.6214805245399475, "kl": 0.018937196349725127, "learning_rate": 4.963654654721848e-06, "loss": 0.0019, "num_tokens": 2088320.0, "reward": 0.86175537109375, "reward_std": 0.011250907555222511, "rewards//mean": 0.86175537109375, "rewards//std": 0.025321299210190773, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0642, "grad_norm": 0.6301703453063965, "kl": 0.021705118124373257, "learning_rate": 4.963384589619233e-06, "loss": 0.0022, "num_tokens": 2094792.0, "reward": 0.861328125, "reward_std": 0.015380768105387688, "rewards//mean": 0.861328125, "rewards//std": 0.022184601053595543, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0644, "grad_norm": 0.5786391496658325, "kl": 0.022070253267884254, "learning_rate": 4.9631135322656e-06, "loss": 0.0022, "num_tokens": 2101216.0, "reward": 0.80401611328125, "reward_std": 0.01128392294049263, "rewards//mean": 0.80401611328125, "rewards//std": 0.01851334050297737, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0646, "grad_norm": 0.6821287274360657, "kl": 0.021000708919018507, "learning_rate": 4.962841482770131e-06, "loss": 0.0021, "num_tokens": 2107696.0, "reward": 0.8448486328125, "reward_std": 0.014750917442142963, "rewards//mean": 0.8448486328125, "rewards//std": 0.02973664551973343, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0648, "grad_norm": 0.6906484961509705, "kl": 0.020665901945903897, "learning_rate": 4.962568441242408e-06, "loss": 0.0021, "num_tokens": 2114184.0, "reward": 0.85546875, "reward_std": 0.014176122844219208, "rewards//mean": 0.85546875, "rewards//std": 0.0196856502443552, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.065, "grad_norm": 0.6005605459213257, "kl": 0.019298722269013524, "learning_rate": 4.962294407792411e-06, "loss": 0.0019, "num_tokens": 2120744.0, "reward": 0.81097412109375, "reward_std": 0.008400456979870796, "rewards//mean": 0.81097412109375, "rewards//std": 0.01867048256099224, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0652, "grad_norm": 0.6110001802444458, "kl": 0.021721608005464077, "learning_rate": 4.962019382530521e-06, "loss": 0.0022, "num_tokens": 2127184.0, "reward": 0.854248046875, "reward_std": 0.009410550817847252, "rewards//mean": 0.854248046875, "rewards//std": 0.041572827845811844, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0654, "grad_norm": 0.6329718828201294, "kl": 0.02375357341952622, "learning_rate": 4.961743365567517e-06, "loss": 0.0024, "num_tokens": 2133728.0, "reward": 0.8387451171875, "reward_std": 0.015492910519242287, "rewards//mean": 0.8387451171875, "rewards//std": 0.03223032131791115, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0656, "grad_norm": 0.5807815194129944, "kl": 0.02455346193164587, "learning_rate": 4.961466357014581e-06, "loss": 0.0025, "num_tokens": 2140152.0, "reward": 0.86456298828125, "reward_std": 0.013275086879730225, "rewards//mean": 0.86456298828125, "rewards//std": 0.021598830819129944, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0658, "grad_norm": 0.6281384825706482, "kl": 0.020192489260807633, "learning_rate": 4.961188356983291e-06, "loss": 0.002, "num_tokens": 2146656.0, "reward": 0.86334228515625, "reward_std": 0.010247575119137764, "rewards//mean": 0.86334228515625, "rewards//std": 0.017792008817195892, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.066, "grad_norm": 0.6021639108657837, "kl": 0.019850074197165668, "learning_rate": 4.960909365585624e-06, "loss": 0.002, "num_tokens": 2153152.0, "reward": 0.83782958984375, "reward_std": 0.010318451561033726, "rewards//mean": 0.83782958984375, "rewards//std": 0.019722430035471916, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0662, "grad_norm": 0.6681183576583862, "kl": 0.01899259549099952, "learning_rate": 4.960629382933959e-06, "loss": 0.0019, "num_tokens": 2159784.0, "reward": 0.830322265625, "reward_std": 0.011032642796635628, "rewards//mean": 0.830322265625, "rewards//std": 0.02663821168243885, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0664, "grad_norm": 0.6400523781776428, "kl": 0.01971001864876598, "learning_rate": 4.960348409141074e-06, "loss": 0.002, "num_tokens": 2166248.0, "reward": 0.82122802734375, "reward_std": 0.011383827775716782, "rewards//mean": 0.82122802734375, "rewards//std": 0.017576295882463455, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0666, "grad_norm": 0.7005535960197449, "kl": 0.021976487361826003, "learning_rate": 4.960066444320143e-06, "loss": 0.0022, "num_tokens": 2172712.0, "reward": 0.84197998046875, "reward_std": 0.017401963472366333, "rewards//mean": 0.84197998046875, "rewards//std": 0.023714158684015274, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0668, "grad_norm": 0.5731592178344727, "kl": 0.018848864710889757, "learning_rate": 4.959783488584743e-06, "loss": 0.0019, "num_tokens": 2179336.0, "reward": 0.83154296875, "reward_std": 0.011851711198687553, "rewards//mean": 0.83154296875, "rewards//std": 0.02426510863006115, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.067, "grad_norm": 0.6444560289382935, "kl": 0.022729619406163692, "learning_rate": 4.9594995420488475e-06, "loss": 0.0023, "num_tokens": 2185896.0, "reward": 0.85693359375, "reward_std": 0.009836044162511826, "rewards//mean": 0.85693359375, "rewards//std": 0.019704097881913185, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0672, "grad_norm": 0.823472261428833, "kl": 0.024720686255022883, "learning_rate": 4.959214604826831e-06, "loss": 0.0025, "num_tokens": 2192408.0, "reward": 0.78509521484375, "reward_std": 0.011101828888058662, "rewards//mean": 0.78509521484375, "rewards//std": 0.020558221265673637, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0674, "grad_norm": 0.616445004940033, "kl": 0.020393103593960404, "learning_rate": 4.958928677033465e-06, "loss": 0.002, "num_tokens": 2199016.0, "reward": 0.84246826171875, "reward_std": 0.014257056638598442, "rewards//mean": 0.84246826171875, "rewards//std": 0.028340283781290054, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0676, "grad_norm": 0.6037515997886658, "kl": 0.020665846299380064, "learning_rate": 4.9586417587839225e-06, "loss": 0.0021, "num_tokens": 2205544.0, "reward": 0.80145263671875, "reward_std": 0.013119444251060486, "rewards//mean": 0.80145263671875, "rewards//std": 0.020688142627477646, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0678, "grad_norm": 0.7130603194236755, "kl": 0.022849680623039603, "learning_rate": 4.958353850193773e-06, "loss": 0.0023, "num_tokens": 2212120.0, "reward": 0.8408203125, "reward_std": 0.016479603946208954, "rewards//mean": 0.8408203125, "rewards//std": 0.02777874656021595, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.068, "grad_norm": 0.6459859013557434, "kl": 0.018527620006352663, "learning_rate": 4.958064951378988e-06, "loss": 0.0019, "num_tokens": 2218704.0, "reward": 0.84002685546875, "reward_std": 0.015253793448209763, "rewards//mean": 0.84002685546875, "rewards//std": 0.020368104800581932, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0682, "grad_norm": 0.6155900955200195, "kl": 0.019558688392862678, "learning_rate": 4.957775062455933e-06, "loss": 0.002, "num_tokens": 2225408.0, "reward": 0.83038330078125, "reward_std": 0.011874992400407791, "rewards//mean": 0.83038330078125, "rewards//std": 0.025683971121907234, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0684, "grad_norm": 0.6561264395713806, "kl": 0.022024541860446334, "learning_rate": 4.957484183541378e-06, "loss": 0.0022, "num_tokens": 2231888.0, "reward": 0.80804443359375, "reward_std": 0.013313915580511093, "rewards//mean": 0.80804443359375, "rewards//std": 0.02254057675600052, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0686, "grad_norm": 0.5750051736831665, "kl": 0.020300572272390127, "learning_rate": 4.957192314752487e-06, "loss": 0.002, "num_tokens": 2238424.0, "reward": 0.8651123046875, "reward_std": 0.009746493771672249, "rewards//mean": 0.8651123046875, "rewards//std": 0.02353000082075596, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0688, "grad_norm": 0.6531442999839783, "kl": 0.023013052181340754, "learning_rate": 4.9568994562068265e-06, "loss": 0.0023, "num_tokens": 2244936.0, "reward": 0.8524169921875, "reward_std": 0.012329033575952053, "rewards//mean": 0.8524169921875, "rewards//std": 0.025233639404177666, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.069, "grad_norm": 0.6909974813461304, "kl": 0.019845292437821627, "learning_rate": 4.9566056080223576e-06, "loss": 0.002, "num_tokens": 2251424.0, "reward": 0.82861328125, "reward_std": 0.014280682429671288, "rewards//mean": 0.82861328125, "rewards//std": 0.023245535790920258, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0692, "grad_norm": 0.6558386087417603, "kl": 0.01972190651576966, "learning_rate": 4.9563107703174444e-06, "loss": 0.002, "num_tokens": 2258032.0, "reward": 0.8741455078125, "reward_std": 0.014798332937061787, "rewards//mean": 0.8741455078125, "rewards//std": 0.03160810470581055, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0694, "grad_norm": 0.6157135963439941, "kl": 0.024010731372982264, "learning_rate": 4.956014943210845e-06, "loss": 0.0024, "num_tokens": 2264480.0, "reward": 0.81573486328125, "reward_std": 0.00979662872850895, "rewards//mean": 0.81573486328125, "rewards//std": 0.02577575109899044, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0696, "grad_norm": 0.6779954433441162, "kl": 0.023366793291643262, "learning_rate": 4.9557181268217225e-06, "loss": 0.0023, "num_tokens": 2271016.0, "reward": 0.837890625, "reward_std": 0.016689497977495193, "rewards//mean": 0.837890625, "rewards//std": 0.01790745183825493, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0698, "grad_norm": 0.6324844360351562, "kl": 0.01984335097949952, "learning_rate": 4.9554203212696304e-06, "loss": 0.002, "num_tokens": 2277560.0, "reward": 0.83612060546875, "reward_std": 0.016164684668183327, "rewards//mean": 0.83612060546875, "rewards//std": 0.025262044742703438, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.07, "grad_norm": 0.6446642875671387, "kl": 0.01843974506482482, "learning_rate": 4.955121526674528e-06, "loss": 0.0018, "num_tokens": 2284056.0, "reward": 0.85296630859375, "reward_std": 0.01069987565279007, "rewards//mean": 0.85296630859375, "rewards//std": 0.01646088808774948, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0702, "grad_norm": 0.6699240803718567, "kl": 0.020488249836489558, "learning_rate": 4.9548217431567665e-06, "loss": 0.002, "num_tokens": 2290576.0, "reward": 0.818603515625, "reward_std": 0.015040895901620388, "rewards//mean": 0.818603515625, "rewards//std": 0.024278830736875534, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0704, "grad_norm": 0.6167379021644592, "kl": 0.020811604452319443, "learning_rate": 4.9545209708371025e-06, "loss": 0.0021, "num_tokens": 2297096.0, "reward": 0.828125, "reward_std": 0.012585700489580631, "rewards//mean": 0.828125, "rewards//std": 0.03919249400496483, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0706, "grad_norm": 0.6443126797676086, "kl": 0.021438468480482697, "learning_rate": 4.9542192098366835e-06, "loss": 0.0021, "num_tokens": 2303560.0, "reward": 0.82696533203125, "reward_std": 0.008636537939310074, "rewards//mean": 0.82696533203125, "rewards//std": 0.01631772518157959, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0708, "grad_norm": 0.7037899494171143, "kl": 0.02008171903435141, "learning_rate": 4.95391646027706e-06, "loss": 0.002, "num_tokens": 2310104.0, "reward": 0.826416015625, "reward_std": 0.010460684075951576, "rewards//mean": 0.826416015625, "rewards//std": 0.01779721863567829, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.071, "grad_norm": 0.6334845423698425, "kl": 0.022552308510057628, "learning_rate": 4.953612722280181e-06, "loss": 0.0023, "num_tokens": 2316616.0, "reward": 0.81109619140625, "reward_std": 0.013959594070911407, "rewards//mean": 0.81109619140625, "rewards//std": 0.022578153759241104, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0712, "grad_norm": 0.6384918093681335, "kl": 0.021024183952249587, "learning_rate": 4.953307995968391e-06, "loss": 0.0021, "num_tokens": 2323224.0, "reward": 0.8253173828125, "reward_std": 0.011187070980668068, "rewards//mean": 0.8253173828125, "rewards//std": 0.02631431631743908, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0714, "grad_norm": 0.6690114736557007, "kl": 0.021767844446003437, "learning_rate": 4.953002281464432e-06, "loss": 0.0022, "num_tokens": 2329728.0, "reward": 0.82196044921875, "reward_std": 0.010448087006807327, "rewards//mean": 0.82196044921875, "rewards//std": 0.016960028558969498, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0716, "grad_norm": 0.6867706775665283, "kl": 0.0235063168220222, "learning_rate": 4.952695578891449e-06, "loss": 0.0024, "num_tokens": 2336144.0, "reward": 0.861328125, "reward_std": 0.012647712603211403, "rewards//mean": 0.861328125, "rewards//std": 0.024220149964094162, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0718, "grad_norm": 0.6281720995903015, "kl": 0.01960526150651276, "learning_rate": 4.9523878883729794e-06, "loss": 0.002, "num_tokens": 2342616.0, "reward": 0.84552001953125, "reward_std": 0.019969893619418144, "rewards//mean": 0.84552001953125, "rewards//std": 0.031257204711437225, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.072, "grad_norm": 0.5789004564285278, "kl": 0.021749505307525396, "learning_rate": 4.952079210032962e-06, "loss": 0.0022, "num_tokens": 2349200.0, "reward": 0.84375, "reward_std": 0.012675793841481209, "rewards//mean": 0.84375, "rewards//std": 0.021281909197568893, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0722, "grad_norm": 0.591282069683075, "kl": 0.025245020631700754, "learning_rate": 4.951769543995731e-06, "loss": 0.0025, "num_tokens": 2355608.0, "reward": 0.8421630859375, "reward_std": 0.0136952493339777, "rewards//mean": 0.8421630859375, "rewards//std": 0.019546357914805412, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0724, "grad_norm": 0.5938491821289062, "kl": 0.019946447922848165, "learning_rate": 4.951458890386021e-06, "loss": 0.002, "num_tokens": 2362096.0, "reward": 0.84320068359375, "reward_std": 0.013788199983537197, "rewards//mean": 0.84320068359375, "rewards//std": 0.026068320497870445, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0726, "grad_norm": 0.626299262046814, "kl": 0.022490440169349313, "learning_rate": 4.951147249328964e-06, "loss": 0.0022, "num_tokens": 2368736.0, "reward": 0.85345458984375, "reward_std": 0.013921165838837624, "rewards//mean": 0.85345458984375, "rewards//std": 0.027107935398817062, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0728, "grad_norm": 0.6578425765037537, "kl": 0.022625273559242487, "learning_rate": 4.950834620950089e-06, "loss": 0.0023, "num_tokens": 2375176.0, "reward": 0.8509521484375, "reward_std": 0.014235584065318108, "rewards//mean": 0.8509521484375, "rewards//std": 0.03371234983205795, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.073, "grad_norm": 0.6105694770812988, "kl": 0.023182813776656985, "learning_rate": 4.9505210053753204e-06, "loss": 0.0023, "num_tokens": 2381712.0, "reward": 0.85064697265625, "reward_std": 0.010071905329823494, "rewards//mean": 0.85064697265625, "rewards//std": 0.03252411261200905, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0732, "grad_norm": 0.6019152402877808, "kl": 0.023740790085867047, "learning_rate": 4.950206402730984e-06, "loss": 0.0024, "num_tokens": 2388352.0, "reward": 0.8419189453125, "reward_std": 0.01870671659708023, "rewards//mean": 0.8419189453125, "rewards//std": 0.031086571514606476, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0734, "grad_norm": 0.6212697625160217, "kl": 0.023675142554566264, "learning_rate": 4.949890813143802e-06, "loss": 0.0024, "num_tokens": 2395024.0, "reward": 0.8397216796875, "reward_std": 0.015749718993902206, "rewards//mean": 0.8397216796875, "rewards//std": 0.037660665810108185, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0736, "grad_norm": 0.598435640335083, "kl": 0.025552398525178432, "learning_rate": 4.949574236740893e-06, "loss": 0.0026, "num_tokens": 2401528.0, "reward": 0.87725830078125, "reward_std": 0.01766745001077652, "rewards//mean": 0.87725830078125, "rewards//std": 0.03177117556333542, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0738, "grad_norm": 0.5873764157295227, "kl": 0.024055095855146646, "learning_rate": 4.949256673649774e-06, "loss": 0.0024, "num_tokens": 2408024.0, "reward": 0.8677978515625, "reward_std": 0.008664744906127453, "rewards//mean": 0.8677978515625, "rewards//std": 0.02137523889541626, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.074, "grad_norm": 0.5763117671012878, "kl": 0.023328717099502683, "learning_rate": 4.94893812399836e-06, "loss": 0.0023, "num_tokens": 2414536.0, "reward": 0.86468505859375, "reward_std": 0.012787047773599625, "rewards//mean": 0.86468505859375, "rewards//std": 0.03016442246735096, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0742, "grad_norm": 0.5695638060569763, "kl": 0.026884031016379595, "learning_rate": 4.948618587914963e-06, "loss": 0.0027, "num_tokens": 2420984.0, "reward": 0.843017578125, "reward_std": 0.013181064277887344, "rewards//mean": 0.843017578125, "rewards//std": 0.020994020625948906, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0744, "grad_norm": 0.7202668786048889, "kl": 0.02846267749555409, "learning_rate": 4.948298065528292e-06, "loss": 0.0028, "num_tokens": 2427496.0, "reward": 0.83990478515625, "reward_std": 0.012518524192273617, "rewards//mean": 0.83990478515625, "rewards//std": 0.017900580540299416, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0746, "grad_norm": 0.5586526393890381, "kl": 0.022693981183692813, "learning_rate": 4.947976556967452e-06, "loss": 0.0023, "num_tokens": 2434000.0, "reward": 0.83734130859375, "reward_std": 0.011747884564101696, "rewards//mean": 0.83734130859375, "rewards//std": 0.03377801179885864, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0748, "grad_norm": 0.6451295614242554, "kl": 0.02354696928523481, "learning_rate": 4.947654062361949e-06, "loss": 0.0024, "num_tokens": 2440632.0, "reward": 0.84344482421875, "reward_std": 0.01550312340259552, "rewards//mean": 0.84344482421875, "rewards//std": 0.027972545474767685, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.075, "grad_norm": 0.693091869354248, "kl": 0.030426556011661887, "learning_rate": 4.9473305818416805e-06, "loss": 0.003, "num_tokens": 2447120.0, "reward": 0.85211181640625, "reward_std": 0.013688227161765099, "rewards//mean": 0.85211181640625, "rewards//std": 0.026409853249788284, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0752, "grad_norm": 0.5874215960502625, "kl": 0.024489378090947866, "learning_rate": 4.947006115536947e-06, "loss": 0.0024, "num_tokens": 2453656.0, "reward": 0.81732177734375, "reward_std": 0.011487660929560661, "rewards//mean": 0.81732177734375, "rewards//std": 0.02587130106985569, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0754, "grad_norm": 0.6001226305961609, "kl": 0.026121094590052962, "learning_rate": 4.946680663578443e-06, "loss": 0.0026, "num_tokens": 2460248.0, "reward": 0.86529541015625, "reward_std": 0.011276507750153542, "rewards//mean": 0.86529541015625, "rewards//std": 0.02111041732132435, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0756, "grad_norm": 0.5803017020225525, "kl": 0.023147993255406618, "learning_rate": 4.946354226097261e-06, "loss": 0.0023, "num_tokens": 2466752.0, "reward": 0.86126708984375, "reward_std": 0.01178767066448927, "rewards//mean": 0.86126708984375, "rewards//std": 0.022498900070786476, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0758, "grad_norm": 0.6185136437416077, "kl": 0.023552838247269392, "learning_rate": 4.946026803224888e-06, "loss": 0.0024, "num_tokens": 2473256.0, "reward": 0.8831787109375, "reward_std": 0.012100385501980782, "rewards//mean": 0.8831787109375, "rewards//std": 0.020312009379267693, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.076, "grad_norm": 0.5861929059028625, "kl": 0.023264746530912817, "learning_rate": 4.945698395093212e-06, "loss": 0.0023, "num_tokens": 2479752.0, "reward": 0.86248779296875, "reward_std": 0.011219041422009468, "rewards//mean": 0.86248779296875, "rewards//std": 0.02088186889886856, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0762, "grad_norm": 0.619530439376831, "kl": 0.026569161796942353, "learning_rate": 4.9453690018345144e-06, "loss": 0.0027, "num_tokens": 2486256.0, "reward": 0.84765625, "reward_std": 0.012358471751213074, "rewards//mean": 0.84765625, "rewards//std": 0.02428007684648037, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0764, "grad_norm": 0.5811346769332886, "kl": 0.02320175990462303, "learning_rate": 4.9450386235814755e-06, "loss": 0.0023, "num_tokens": 2492904.0, "reward": 0.84619140625, "reward_std": 0.010218563489615917, "rewards//mean": 0.84619140625, "rewards//std": 0.02535841055214405, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0766, "grad_norm": 0.6582991480827332, "kl": 0.0316473871935159, "learning_rate": 4.944707260467172e-06, "loss": 0.0032, "num_tokens": 2499432.0, "reward": 0.83831787109375, "reward_std": 0.018780037760734558, "rewards//mean": 0.83831787109375, "rewards//std": 0.022376112639904022, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0768, "grad_norm": 0.634960412979126, "kl": 0.02587113855406642, "learning_rate": 4.944374912625076e-06, "loss": 0.0026, "num_tokens": 2505944.0, "reward": 0.85772705078125, "reward_std": 0.012217249721288681, "rewards//mean": 0.85772705078125, "rewards//std": 0.029163716360926628, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.077, "grad_norm": 0.6517542004585266, "kl": 0.026097522117197514, "learning_rate": 4.944041580189057e-06, "loss": 0.0026, "num_tokens": 2512432.0, "reward": 0.820556640625, "reward_std": 0.012241235002875328, "rewards//mean": 0.820556640625, "rewards//std": 0.024977076798677444, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0772, "grad_norm": 0.7040196061134338, "kl": 0.024413017323240638, "learning_rate": 4.943707263293382e-06, "loss": 0.0024, "num_tokens": 2518928.0, "reward": 0.8297119140625, "reward_std": 0.00819058995693922, "rewards//mean": 0.8297119140625, "rewards//std": 0.022083040326833725, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0774, "grad_norm": 0.6075165867805481, "kl": 0.028073451248928905, "learning_rate": 4.943371962072714e-06, "loss": 0.0028, "num_tokens": 2525416.0, "reward": 0.85955810546875, "reward_std": 0.015763863921165466, "rewards//mean": 0.85955810546875, "rewards//std": 0.02799093909561634, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0776, "grad_norm": 0.6392875909805298, "kl": 0.026283514220267534, "learning_rate": 4.9430356766621114e-06, "loss": 0.0026, "num_tokens": 2531888.0, "reward": 0.8511962890625, "reward_std": 0.011728147976100445, "rewards//mean": 0.8511962890625, "rewards//std": 0.017740566283464432, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0778, "grad_norm": 0.6550029516220093, "kl": 0.028789987321943045, "learning_rate": 4.942698407197031e-06, "loss": 0.0029, "num_tokens": 2538400.0, "reward": 0.818115234375, "reward_std": 0.011933239176869392, "rewards//mean": 0.818115234375, "rewards//std": 0.03468024730682373, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.078, "grad_norm": 0.6924291253089905, "kl": 0.029282074654474854, "learning_rate": 4.942360153813324e-06, "loss": 0.0029, "num_tokens": 2544920.0, "reward": 0.82562255859375, "reward_std": 0.010113751515746117, "rewards//mean": 0.82562255859375, "rewards//std": 0.031042898073792458, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0782, "grad_norm": 0.6393311619758606, "kl": 0.027023433009162545, "learning_rate": 4.9420209166472386e-06, "loss": 0.0027, "num_tokens": 2551416.0, "reward": 0.88250732421875, "reward_std": 0.013438566587865353, "rewards//mean": 0.88250732421875, "rewards//std": 0.032676879316568375, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0784, "grad_norm": 0.6631886959075928, "kl": 0.028520054882392287, "learning_rate": 4.9416806958354206e-06, "loss": 0.0029, "num_tokens": 2557888.0, "reward": 0.780517578125, "reward_std": 0.011763013899326324, "rewards//mean": 0.780517578125, "rewards//std": 0.030392462387681007, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0786, "grad_norm": 0.5338965058326721, "kl": 0.02854826208204031, "learning_rate": 4.9413394915149094e-06, "loss": 0.0029, "num_tokens": 2564472.0, "reward": 0.83892822265625, "reward_std": 0.01127648912370205, "rewards//mean": 0.83892822265625, "rewards//std": 0.025852570310235023, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0788, "grad_norm": 0.6340290904045105, "kl": 0.024304196937009692, "learning_rate": 4.940997303823144e-06, "loss": 0.0024, "num_tokens": 2570952.0, "reward": 0.78216552734375, "reward_std": 0.012750035151839256, "rewards//mean": 0.78216552734375, "rewards//std": 0.026763997972011566, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.079, "grad_norm": 0.6295396685600281, "kl": 0.02592435828410089, "learning_rate": 4.940654132897957e-06, "loss": 0.0026, "num_tokens": 2577448.0, "reward": 0.852294921875, "reward_std": 0.012507164850831032, "rewards//mean": 0.852294921875, "rewards//std": 0.027267245575785637, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0792, "grad_norm": 0.5991517305374146, "kl": 0.03304093307815492, "learning_rate": 4.940309978877576e-06, "loss": 0.0033, "num_tokens": 2584056.0, "reward": 0.822021484375, "reward_std": 0.008339623920619488, "rewards//mean": 0.822021484375, "rewards//std": 0.02647404372692108, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0794, "grad_norm": 0.5874148607254028, "kl": 0.027525671990588307, "learning_rate": 4.939964841900627e-06, "loss": 0.0028, "num_tokens": 2590496.0, "reward": 0.872314453125, "reward_std": 0.014325520023703575, "rewards//mean": 0.872314453125, "rewards//std": 0.03774357587099075, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0796, "grad_norm": 0.6355072855949402, "kl": 0.0354489772580564, "learning_rate": 4.9396187221061324e-06, "loss": 0.0035, "num_tokens": 2597056.0, "reward": 0.82666015625, "reward_std": 0.01194741204380989, "rewards//mean": 0.82666015625, "rewards//std": 0.015893952921032906, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0798, "grad_norm": 0.6167977452278137, "kl": 0.028267529793083668, "learning_rate": 4.939271619633508e-06, "loss": 0.0028, "num_tokens": 2603568.0, "reward": 0.82171630859375, "reward_std": 0.013992267660796642, "rewards//mean": 0.82171630859375, "rewards//std": 0.03480915725231171, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.08, "grad_norm": 0.6546652317047119, "kl": 0.026222907239571214, "learning_rate": 4.938923534622567e-06, "loss": 0.0026, "num_tokens": 2610160.0, "reward": 0.84625244140625, "reward_std": 0.011356605216860771, "rewards//mean": 0.84625244140625, "rewards//std": 0.029670584946870804, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0802, "grad_norm": 0.618540346622467, "kl": 0.02860379498451948, "learning_rate": 4.938574467213519e-06, "loss": 0.0029, "num_tokens": 2616672.0, "reward": 0.83953857421875, "reward_std": 0.011920612305402756, "rewards//mean": 0.83953857421875, "rewards//std": 0.01723625510931015, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0804, "grad_norm": 0.6380559206008911, "kl": 0.032505206996575, "learning_rate": 4.938224417546965e-06, "loss": 0.0033, "num_tokens": 2623240.0, "reward": 0.8426513671875, "reward_std": 0.012489142827689648, "rewards//mean": 0.8426513671875, "rewards//std": 0.02962239645421505, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0806, "grad_norm": 0.6414657235145569, "kl": 0.026258823927491903, "learning_rate": 4.937873385763909e-06, "loss": 0.0026, "num_tokens": 2629728.0, "reward": 0.82769775390625, "reward_std": 0.011077295988798141, "rewards//mean": 0.82769775390625, "rewards//std": 0.027260513976216316, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0808, "grad_norm": 0.672893762588501, "kl": 0.03017253940925002, "learning_rate": 4.9375213720057435e-06, "loss": 0.003, "num_tokens": 2636176.0, "reward": 0.82470703125, "reward_std": 0.007876119576394558, "rewards//mean": 0.82470703125, "rewards//std": 0.012955565936863422, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.081, "grad_norm": 0.6322163939476013, "kl": 0.030439491849392653, "learning_rate": 4.937168376414261e-06, "loss": 0.003, "num_tokens": 2642640.0, "reward": 0.84759521484375, "reward_std": 0.008020121604204178, "rewards//mean": 0.84759521484375, "rewards//std": 0.02140098437666893, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0812, "grad_norm": 0.6374755501747131, "kl": 0.030731099424883723, "learning_rate": 4.9368143991316485e-06, "loss": 0.0031, "num_tokens": 2649184.0, "reward": 0.86627197265625, "reward_std": 0.014455043710768223, "rewards//mean": 0.86627197265625, "rewards//std": 0.038125813007354736, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0814, "grad_norm": 0.712587833404541, "kl": 0.026140809408389032, "learning_rate": 4.936459440300487e-06, "loss": 0.0026, "num_tokens": 2655808.0, "reward": 0.85528564453125, "reward_std": 0.010381786152720451, "rewards//mean": 0.85528564453125, "rewards//std": 0.027791209518909454, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0816, "grad_norm": 0.6231846213340759, "kl": 0.02688520005904138, "learning_rate": 4.936103500063755e-06, "loss": 0.0027, "num_tokens": 2662344.0, "reward": 0.851806640625, "reward_std": 0.017744846642017365, "rewards//mean": 0.851806640625, "rewards//std": 0.04603494331240654, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0818, "grad_norm": 0.6654735803604126, "kl": 0.030552038457244635, "learning_rate": 4.935746578564825e-06, "loss": 0.0031, "num_tokens": 2668896.0, "reward": 0.85076904296875, "reward_std": 0.016120346263051033, "rewards//mean": 0.85076904296875, "rewards//std": 0.02107812464237213, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.082, "grad_norm": 0.6213189363479614, "kl": 0.029119898565113544, "learning_rate": 4.935388675947463e-06, "loss": 0.0029, "num_tokens": 2675448.0, "reward": 0.86163330078125, "reward_std": 0.014135929755866528, "rewards//mean": 0.86163330078125, "rewards//std": 0.029747523367404938, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0822, "grad_norm": 0.6235265135765076, "kl": 0.03541518189013004, "learning_rate": 4.935029792355834e-06, "loss": 0.0035, "num_tokens": 2681992.0, "reward": 0.84326171875, "reward_std": 0.01086876168847084, "rewards//mean": 0.84326171875, "rewards//std": 0.024592293426394463, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0824, "grad_norm": 0.6536543965339661, "kl": 0.030560293700546026, "learning_rate": 4.934669927934496e-06, "loss": 0.0031, "num_tokens": 2688488.0, "reward": 0.866455078125, "reward_std": 0.014581255614757538, "rewards//mean": 0.866455078125, "rewards//std": 0.02263716049492359, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0826, "grad_norm": 0.6777145862579346, "kl": 0.032293920405209064, "learning_rate": 4.9343090828284025e-06, "loss": 0.0032, "num_tokens": 2695024.0, "reward": 0.82843017578125, "reward_std": 0.009871330112218857, "rewards//mean": 0.82843017578125, "rewards//std": 0.016012445092201233, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0828, "grad_norm": 0.5964574813842773, "kl": 0.03125216974876821, "learning_rate": 4.933947257182901e-06, "loss": 0.0031, "num_tokens": 2701456.0, "reward": 0.821533203125, "reward_std": 0.007897812873125076, "rewards//mean": 0.821533203125, "rewards//std": 0.023611243814229965, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.083, "grad_norm": 0.6268133521080017, "kl": 0.03192015551030636, "learning_rate": 4.933584451143736e-06, "loss": 0.0032, "num_tokens": 2708000.0, "reward": 0.82464599609375, "reward_std": 0.01356738805770874, "rewards//mean": 0.82464599609375, "rewards//std": 0.029166312888264656, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0832, "grad_norm": 0.6093042492866516, "kl": 0.028882501646876335, "learning_rate": 4.933220664857045e-06, "loss": 0.0029, "num_tokens": 2714520.0, "reward": 0.86712646484375, "reward_std": 0.013428254052996635, "rewards//mean": 0.86712646484375, "rewards//std": 0.023730112239718437, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0834, "grad_norm": 0.6019577980041504, "kl": 0.031143209664151073, "learning_rate": 4.93285589846936e-06, "loss": 0.0031, "num_tokens": 2721088.0, "reward": 0.85479736328125, "reward_std": 0.01068512536585331, "rewards//mean": 0.85479736328125, "rewards//std": 0.019912630319595337, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0836, "grad_norm": 0.6004226207733154, "kl": 0.031462118960916996, "learning_rate": 4.932490152127611e-06, "loss": 0.0031, "num_tokens": 2727568.0, "reward": 0.8572998046875, "reward_std": 0.008544829674065113, "rewards//mean": 0.8572998046875, "rewards//std": 0.022390736266970634, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0838, "grad_norm": 0.6271358132362366, "kl": 0.0408686890732497, "learning_rate": 4.93212342597912e-06, "loss": 0.0041, "num_tokens": 2734056.0, "reward": 0.83489990234375, "reward_std": 0.011716969311237335, "rewards//mean": 0.83489990234375, "rewards//std": 0.023686693981289864, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.084, "grad_norm": 0.6660474538803101, "kl": 0.035757099045440555, "learning_rate": 4.931755720171603e-06, "loss": 0.0036, "num_tokens": 2740504.0, "reward": 0.87750244140625, "reward_std": 0.013047153130173683, "rewards//mean": 0.87750244140625, "rewards//std": 0.03766965866088867, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0842, "grad_norm": 0.6816895008087158, "kl": 0.03769821021705866, "learning_rate": 4.931387034853173e-06, "loss": 0.0038, "num_tokens": 2747024.0, "reward": 0.843505859375, "reward_std": 0.01912887953221798, "rewards//mean": 0.843505859375, "rewards//std": 0.035172607749700546, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0844, "grad_norm": 0.6255840063095093, "kl": 0.03659572545439005, "learning_rate": 4.9310173701723365e-06, "loss": 0.0037, "num_tokens": 2753560.0, "reward": 0.8350830078125, "reward_std": 0.012399287894368172, "rewards//mean": 0.8350830078125, "rewards//std": 0.018668556585907936, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0846, "grad_norm": 0.6702026724815369, "kl": 0.038369332905858755, "learning_rate": 4.930646726277994e-06, "loss": 0.0038, "num_tokens": 2759984.0, "reward": 0.85687255859375, "reward_std": 0.017981387674808502, "rewards//mean": 0.85687255859375, "rewards//std": 0.04027368128299713, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0848, "grad_norm": 0.6983957886695862, "kl": 0.03270720690488815, "learning_rate": 4.930275103319441e-06, "loss": 0.0033, "num_tokens": 2766472.0, "reward": 0.8499755859375, "reward_std": 0.01107704732567072, "rewards//mean": 0.8499755859375, "rewards//std": 0.024177299812436104, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.085, "grad_norm": 0.6446329951286316, "kl": 0.034597059013321996, "learning_rate": 4.9299025014463665e-06, "loss": 0.0035, "num_tokens": 2772920.0, "reward": 0.84576416015625, "reward_std": 0.014222029596567154, "rewards//mean": 0.84576416015625, "rewards//std": 0.024685604497790337, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0852, "grad_norm": 0.6605253219604492, "kl": 0.02986510982736945, "learning_rate": 4.9295289208088545e-06, "loss": 0.003, "num_tokens": 2779360.0, "reward": 0.81646728515625, "reward_std": 0.010702652856707573, "rewards//mean": 0.81646728515625, "rewards//std": 0.027094529941678047, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0854, "grad_norm": 0.6196510791778564, "kl": 0.031927338568493724, "learning_rate": 4.929154361557384e-06, "loss": 0.0032, "num_tokens": 2785904.0, "reward": 0.83935546875, "reward_std": 0.01270909421145916, "rewards//mean": 0.83935546875, "rewards//std": 0.04159976541996002, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0856, "grad_norm": 0.6354705691337585, "kl": 0.032114935806021094, "learning_rate": 4.928778823842828e-06, "loss": 0.0032, "num_tokens": 2792512.0, "reward": 0.8443603515625, "reward_std": 0.015137514099478722, "rewards//mean": 0.8443603515625, "rewards//std": 0.030932310968637466, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0858, "grad_norm": 0.618382453918457, "kl": 0.0317255153786391, "learning_rate": 4.928402307816452e-06, "loss": 0.0032, "num_tokens": 2799152.0, "reward": 0.86468505859375, "reward_std": 0.01585196889936924, "rewards//mean": 0.86468505859375, "rewards//std": 0.031058497726917267, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.086, "grad_norm": 0.6240487694740295, "kl": 0.034124003956094384, "learning_rate": 4.928024813629917e-06, "loss": 0.0034, "num_tokens": 2805640.0, "reward": 0.83660888671875, "reward_std": 0.015030371025204659, "rewards//mean": 0.83660888671875, "rewards//std": 0.03876720368862152, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0862, "grad_norm": 0.6973584890365601, "kl": 0.04224087065085769, "learning_rate": 4.927646341435276e-06, "loss": 0.0042, "num_tokens": 2812120.0, "reward": 0.7916259765625, "reward_std": 0.009834514930844307, "rewards//mean": 0.7916259765625, "rewards//std": 0.03112744726240635, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0864, "grad_norm": 0.6077401638031006, "kl": 0.03255357057787478, "learning_rate": 4.92726689138498e-06, "loss": 0.0033, "num_tokens": 2818576.0, "reward": 0.840087890625, "reward_std": 0.01285285409539938, "rewards//mean": 0.840087890625, "rewards//std": 0.02187538892030716, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0866, "grad_norm": 0.6180047988891602, "kl": 0.032373772002756596, "learning_rate": 4.92688646363187e-06, "loss": 0.0032, "num_tokens": 2825176.0, "reward": 0.8251953125, "reward_std": 0.01345391571521759, "rewards//mean": 0.8251953125, "rewards//std": 0.03164280578494072, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0868, "grad_norm": 0.6412854194641113, "kl": 0.0392610477283597, "learning_rate": 4.926505058329184e-06, "loss": 0.0039, "num_tokens": 2831768.0, "reward": 0.8336181640625, "reward_std": 0.013466738164424896, "rewards//mean": 0.8336181640625, "rewards//std": 0.018826814368367195, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.087, "grad_norm": 0.7546082139015198, "kl": 0.03666896419599652, "learning_rate": 4.9261226756305495e-06, "loss": 0.0037, "num_tokens": 2838224.0, "reward": 0.85064697265625, "reward_std": 0.011050796136260033, "rewards//mean": 0.85064697265625, "rewards//std": 0.024377064779400826, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0872, "grad_norm": 0.6662473678588867, "kl": 0.02848008507862687, "learning_rate": 4.925739315689991e-06, "loss": 0.0028, "num_tokens": 2844688.0, "reward": 0.8526611328125, "reward_std": 0.014631968922913074, "rewards//mean": 0.8526611328125, "rewards//std": 0.018128827214241028, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0874, "grad_norm": 0.7872958779335022, "kl": 0.03451111959293485, "learning_rate": 4.925354978661928e-06, "loss": 0.0035, "num_tokens": 2851200.0, "reward": 0.8326416015625, "reward_std": 0.01695983111858368, "rewards//mean": 0.8326416015625, "rewards//std": 0.03947708383202553, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0876, "grad_norm": 0.6215630173683167, "kl": 0.0340423216111958, "learning_rate": 4.924969664701168e-06, "loss": 0.0034, "num_tokens": 2857664.0, "reward": 0.82806396484375, "reward_std": 0.009415829554200172, "rewards//mean": 0.82806396484375, "rewards//std": 0.02834402211010456, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0878, "grad_norm": 0.6829410791397095, "kl": 0.03426765673793852, "learning_rate": 4.924583373962918e-06, "loss": 0.0034, "num_tokens": 2864208.0, "reward": 0.83428955078125, "reward_std": 0.008964203298091888, "rewards//mean": 0.83428955078125, "rewards//std": 0.021393202245235443, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.088, "grad_norm": 0.6143079996109009, "kl": 0.03419288503937423, "learning_rate": 4.924196106602774e-06, "loss": 0.0034, "num_tokens": 2870664.0, "reward": 0.85797119140625, "reward_std": 0.011148895137012005, "rewards//mean": 0.85797119140625, "rewards//std": 0.037456877529621124, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0882, "grad_norm": 0.6812248229980469, "kl": 0.034670352237299085, "learning_rate": 4.9238078627767285e-06, "loss": 0.0035, "num_tokens": 2877192.0, "reward": 0.8590087890625, "reward_std": 0.01278429850935936, "rewards//mean": 0.8590087890625, "rewards//std": 0.018051842227578163, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0884, "grad_norm": 0.6505360007286072, "kl": 0.03824305930174887, "learning_rate": 4.923418642641166e-06, "loss": 0.0038, "num_tokens": 2883736.0, "reward": 0.87548828125, "reward_std": 0.01399047952145338, "rewards//mean": 0.87548828125, "rewards//std": 0.02312016673386097, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0886, "grad_norm": 0.6440812349319458, "kl": 0.03093017195351422, "learning_rate": 4.923028446352864e-06, "loss": 0.0031, "num_tokens": 2890376.0, "reward": 0.8350830078125, "reward_std": 0.012048540636897087, "rewards//mean": 0.8350830078125, "rewards//std": 0.021468516439199448, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0888, "grad_norm": 0.63178551197052, "kl": 0.031103559769690037, "learning_rate": 4.922637274068993e-06, "loss": 0.0031, "num_tokens": 2896840.0, "reward": 0.84912109375, "reward_std": 0.013408930972218513, "rewards//mean": 0.84912109375, "rewards//std": 0.024602141231298447, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.089, "grad_norm": 0.5702874660491943, "kl": 0.035526285879313946, "learning_rate": 4.9222451259471185e-06, "loss": 0.0036, "num_tokens": 2903384.0, "reward": 0.8553466796875, "reward_std": 0.010418427176773548, "rewards//mean": 0.8553466796875, "rewards//std": 0.018688006326556206, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0892, "grad_norm": 0.6411604285240173, "kl": 0.035621481481939554, "learning_rate": 4.921852002145196e-06, "loss": 0.0036, "num_tokens": 2910000.0, "reward": 0.8338623046875, "reward_std": 0.013247603550553322, "rewards//mean": 0.8338623046875, "rewards//std": 0.03268921375274658, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0894, "grad_norm": 0.6727077960968018, "kl": 0.03247371851466596, "learning_rate": 4.921457902821578e-06, "loss": 0.0032, "num_tokens": 2916512.0, "reward": 0.81097412109375, "reward_std": 0.01571122370660305, "rewards//mean": 0.81097412109375, "rewards//std": 0.03608853369951248, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0896, "grad_norm": 0.7633801698684692, "kl": 0.03594936756417155, "learning_rate": 4.921062828135006e-06, "loss": 0.0036, "num_tokens": 2923072.0, "reward": 0.8360595703125, "reward_std": 0.016459252685308456, "rewards//mean": 0.8360595703125, "rewards//std": 0.032998934388160706, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0898, "grad_norm": 0.6687192916870117, "kl": 0.03433335409499705, "learning_rate": 4.920666778244616e-06, "loss": 0.0034, "num_tokens": 2929680.0, "reward": 0.78240966796875, "reward_std": 0.009573189541697502, "rewards//mean": 0.78240966796875, "rewards//std": 0.021728798747062683, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.09, "grad_norm": 0.6165691614151001, "kl": 0.03592061810195446, "learning_rate": 4.920269753309937e-06, "loss": 0.0036, "num_tokens": 2936152.0, "reward": 0.81842041015625, "reward_std": 0.014015309512615204, "rewards//mean": 0.81842041015625, "rewards//std": 0.026593778282403946, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0902, "grad_norm": 0.6066876649856567, "kl": 0.036984790582209826, "learning_rate": 4.919871753490892e-06, "loss": 0.0037, "num_tokens": 2942736.0, "reward": 0.84619140625, "reward_std": 0.008205235004425049, "rewards//mean": 0.84619140625, "rewards//std": 0.03186020627617836, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0904, "grad_norm": 0.6752570867538452, "kl": 0.03590616839937866, "learning_rate": 4.919472778947793e-06, "loss": 0.0036, "num_tokens": 2949344.0, "reward": 0.85333251953125, "reward_std": 0.013651330024003983, "rewards//mean": 0.85333251953125, "rewards//std": 0.019186310470104218, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0906, "grad_norm": 0.6786683201789856, "kl": 0.03594211395829916, "learning_rate": 4.919072829841347e-06, "loss": 0.0036, "num_tokens": 2955960.0, "reward": 0.82818603515625, "reward_std": 0.007302135229110718, "rewards//mean": 0.82818603515625, "rewards//std": 0.01555496733635664, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0908, "grad_norm": 0.6827971935272217, "kl": 0.030247324146330357, "learning_rate": 4.918671906332656e-06, "loss": 0.003, "num_tokens": 2962352.0, "reward": 0.84320068359375, "reward_std": 0.008547368459403515, "rewards//mean": 0.84320068359375, "rewards//std": 0.020061831921339035, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.091, "grad_norm": 0.7300602197647095, "kl": 0.03687928104773164, "learning_rate": 4.91827000858321e-06, "loss": 0.0037, "num_tokens": 2968912.0, "reward": 0.81256103515625, "reward_std": 0.0169554203748703, "rewards//mean": 0.81256103515625, "rewards//std": 0.04146559163928032, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0912, "grad_norm": 0.609417200088501, "kl": 0.028801521519199014, "learning_rate": 4.917867136754894e-06, "loss": 0.0029, "num_tokens": 2975400.0, "reward": 0.842529296875, "reward_std": 0.012160791084170341, "rewards//mean": 0.842529296875, "rewards//std": 0.025352440774440765, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0914, "grad_norm": 0.648038387298584, "kl": 0.03594379290007055, "learning_rate": 4.917463291009984e-06, "loss": 0.0036, "num_tokens": 2981952.0, "reward": 0.82843017578125, "reward_std": 0.010313676670193672, "rewards//mean": 0.82843017578125, "rewards//std": 0.03293528035283089, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0916, "grad_norm": 0.6578435897827148, "kl": 0.03743579494766891, "learning_rate": 4.917058471511149e-06, "loss": 0.0037, "num_tokens": 2988512.0, "reward": 0.851318359375, "reward_std": 0.013669105246663094, "rewards//mean": 0.851318359375, "rewards//std": 0.02833874709904194, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0918, "grad_norm": 0.6159623861312866, "kl": 0.03537259087897837, "learning_rate": 4.916652678421451e-06, "loss": 0.0035, "num_tokens": 2995008.0, "reward": 0.8701171875, "reward_std": 0.009295953437685966, "rewards//mean": 0.8701171875, "rewards//std": 0.033160243183374405, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.092, "grad_norm": 0.6083399653434753, "kl": 0.03422114229761064, "learning_rate": 4.916245911904344e-06, "loss": 0.0034, "num_tokens": 3001648.0, "reward": 0.8624267578125, "reward_std": 0.011924168094992638, "rewards//mean": 0.8624267578125, "rewards//std": 0.0302133746445179, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0922, "grad_norm": 0.6568530797958374, "kl": 0.03064131084829569, "learning_rate": 4.9158381721236715e-06, "loss": 0.0031, "num_tokens": 3008168.0, "reward": 0.85479736328125, "reward_std": 0.01007093396037817, "rewards//mean": 0.85479736328125, "rewards//std": 0.025291988626122475, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0924, "grad_norm": 0.6399670839309692, "kl": 0.030397456837818027, "learning_rate": 4.915429459243673e-06, "loss": 0.003, "num_tokens": 3014872.0, "reward": 0.82806396484375, "reward_std": 0.012727971188724041, "rewards//mean": 0.82806396484375, "rewards//std": 0.02784399501979351, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0926, "grad_norm": 0.6363940834999084, "kl": 0.032815776066854596, "learning_rate": 4.9150197734289764e-06, "loss": 0.0033, "num_tokens": 3021392.0, "reward": 0.85394287109375, "reward_std": 0.011889531277120113, "rewards//mean": 0.85394287109375, "rewards//std": 0.03702928498387337, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0928, "grad_norm": 0.6081061959266663, "kl": 0.03635578881949186, "learning_rate": 4.9146091148446055e-06, "loss": 0.0036, "num_tokens": 3027848.0, "reward": 0.84417724609375, "reward_std": 0.012306313961744308, "rewards//mean": 0.84417724609375, "rewards//std": 0.02853669971227646, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.093, "grad_norm": 0.6227095127105713, "kl": 0.0410658591426909, "learning_rate": 4.91419748365597e-06, "loss": 0.0041, "num_tokens": 3034400.0, "reward": 0.83636474609375, "reward_std": 0.017500348389148712, "rewards//mean": 0.83636474609375, "rewards//std": 0.04229619726538658, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0932, "grad_norm": 0.693902313709259, "kl": 0.03862925851717591, "learning_rate": 4.9137848800288775e-06, "loss": 0.0039, "num_tokens": 3040912.0, "reward": 0.8209228515625, "reward_std": 0.013860415667295456, "rewards//mean": 0.8209228515625, "rewards//std": 0.024061819538474083, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0934, "grad_norm": 0.7567556500434875, "kl": 0.036113617941737175, "learning_rate": 4.9133713041295235e-06, "loss": 0.0036, "num_tokens": 3047472.0, "reward": 0.85614013671875, "reward_std": 0.00819423608481884, "rewards//mean": 0.85614013671875, "rewards//std": 0.016769740730524063, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0936, "grad_norm": 0.661746084690094, "kl": 0.0400458665098995, "learning_rate": 4.912956756124498e-06, "loss": 0.004, "num_tokens": 3053976.0, "reward": 0.79193115234375, "reward_std": 0.012701844796538353, "rewards//mean": 0.79193115234375, "rewards//std": 0.020963624119758606, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0938, "grad_norm": 0.6104310750961304, "kl": 0.03691074647940695, "learning_rate": 4.912541236180779e-06, "loss": 0.0037, "num_tokens": 3060544.0, "reward": 0.84771728515625, "reward_std": 0.010005248710513115, "rewards//mean": 0.84771728515625, "rewards//std": 0.032499901950359344, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.094, "grad_norm": 0.6204822063446045, "kl": 0.029529433464631438, "learning_rate": 4.9121247444657384e-06, "loss": 0.003, "num_tokens": 3067056.0, "reward": 0.8560791015625, "reward_std": 0.015111008659005165, "rewards//mean": 0.8560791015625, "rewards//std": 0.036882489919662476, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0942, "grad_norm": 0.6523507237434387, "kl": 0.037716952385380864, "learning_rate": 4.91170728114714e-06, "loss": 0.0038, "num_tokens": 3073576.0, "reward": 0.83990478515625, "reward_std": 0.019746724516153336, "rewards//mean": 0.83990478515625, "rewards//std": 0.02970423921942711, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0944, "grad_norm": 0.5663513541221619, "kl": 0.03594962228089571, "learning_rate": 4.911288846393136e-06, "loss": 0.0036, "num_tokens": 3080128.0, "reward": 0.850341796875, "reward_std": 0.01110504474490881, "rewards//mean": 0.850341796875, "rewards//std": 0.028526155278086662, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0946, "grad_norm": 0.6797471046447754, "kl": 0.03497613500803709, "learning_rate": 4.910869440372274e-06, "loss": 0.0035, "num_tokens": 3086584.0, "reward": 0.8629150390625, "reward_std": 0.014508411288261414, "rewards//mean": 0.8629150390625, "rewards//std": 0.02641766145825386, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0948, "grad_norm": 0.6318498849868774, "kl": 0.039045897545292974, "learning_rate": 4.910449063253489e-06, "loss": 0.0039, "num_tokens": 3093040.0, "reward": 0.7581787109375, "reward_std": 0.011910462751984596, "rewards//mean": 0.7581787109375, "rewards//std": 0.026150431483983994, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.095, "grad_norm": 0.6022250652313232, "kl": 0.03229112015105784, "learning_rate": 4.9100277152061105e-06, "loss": 0.0032, "num_tokens": 3099536.0, "reward": 0.8621826171875, "reward_std": 0.016534287482500076, "rewards//mean": 0.8621826171875, "rewards//std": 0.02098933421075344, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0952, "grad_norm": 0.5952948927879333, "kl": 0.026717626955360174, "learning_rate": 4.9096053963998555e-06, "loss": 0.0027, "num_tokens": 3106128.0, "reward": 0.84246826171875, "reward_std": 0.009889265522360802, "rewards//mean": 0.84246826171875, "rewards//std": 0.02417064644396305, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0954, "grad_norm": 0.7570531964302063, "kl": 0.03380241570994258, "learning_rate": 4.909182107004835e-06, "loss": 0.0034, "num_tokens": 3112680.0, "reward": 0.811279296875, "reward_std": 0.008888293989002705, "rewards//mean": 0.811279296875, "rewards//std": 0.02740016020834446, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0956, "grad_norm": 0.6230834126472473, "kl": 0.04082751553505659, "learning_rate": 4.908757847191551e-06, "loss": 0.0041, "num_tokens": 3119192.0, "reward": 0.86517333984375, "reward_std": 0.012800133787095547, "rewards//mean": 0.86517333984375, "rewards//std": 0.030899198725819588, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0958, "grad_norm": 0.5696311593055725, "kl": 0.0354614038951695, "learning_rate": 4.908332617130893e-06, "loss": 0.0035, "num_tokens": 3125672.0, "reward": 0.79400634765625, "reward_std": 0.00883110798895359, "rewards//mean": 0.79400634765625, "rewards//std": 0.0182057972997427, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.096, "grad_norm": 0.594983696937561, "kl": 0.03111358336172998, "learning_rate": 4.907906416994146e-06, "loss": 0.0031, "num_tokens": 3132080.0, "reward": 0.879638671875, "reward_std": 0.015366640873253345, "rewards//mean": 0.879638671875, "rewards//std": 0.028038017451763153, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0962, "grad_norm": 0.6047464609146118, "kl": 0.030363738536834717, "learning_rate": 4.907479246952981e-06, "loss": 0.003, "num_tokens": 3138576.0, "reward": 0.85302734375, "reward_std": 0.01101978775113821, "rewards//mean": 0.85302734375, "rewards//std": 0.020297471433877945, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0964, "grad_norm": 0.6047098636627197, "kl": 0.03049930650740862, "learning_rate": 4.907051107179464e-06, "loss": 0.003, "num_tokens": 3145080.0, "reward": 0.83258056640625, "reward_std": 0.011874470859766006, "rewards//mean": 0.83258056640625, "rewards//std": 0.03233833983540535, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0966, "grad_norm": 0.6885430216789246, "kl": 0.033972281496971846, "learning_rate": 4.9066219978460485e-06, "loss": 0.0034, "num_tokens": 3151552.0, "reward": 0.86956787109375, "reward_std": 0.01559597160667181, "rewards//mean": 0.86956787109375, "rewards//std": 0.033491771668195724, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0968, "grad_norm": 0.6130304932594299, "kl": 0.03379618399776518, "learning_rate": 4.90619191912558e-06, "loss": 0.0034, "num_tokens": 3158000.0, "reward": 0.8402099609375, "reward_std": 0.013679608702659607, "rewards//mean": 0.8402099609375, "rewards//std": 0.019246671348810196, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.097, "grad_norm": 0.6161892414093018, "kl": 0.03453512443229556, "learning_rate": 4.905760871191295e-06, "loss": 0.0035, "num_tokens": 3164560.0, "reward": 0.86322021484375, "reward_std": 0.008620163425803185, "rewards//mean": 0.86322021484375, "rewards//std": 0.020652256906032562, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0972, "grad_norm": 0.6238561868667603, "kl": 0.03610549960285425, "learning_rate": 4.9053288542168185e-06, "loss": 0.0036, "num_tokens": 3171048.0, "reward": 0.8316650390625, "reward_std": 0.012315354309976101, "rewards//mean": 0.8316650390625, "rewards//std": 0.0226247850805521, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0974, "grad_norm": 0.6723244786262512, "kl": 0.0330163084436208, "learning_rate": 4.904895868376167e-06, "loss": 0.0033, "num_tokens": 3177536.0, "reward": 0.81072998046875, "reward_std": 0.011163340881466866, "rewards//mean": 0.81072998046875, "rewards//std": 0.01871340349316597, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0976, "grad_norm": 0.6025989055633545, "kl": 0.027839731890708208, "learning_rate": 4.904461913843747e-06, "loss": 0.0028, "num_tokens": 3183992.0, "reward": 0.84307861328125, "reward_std": 0.00990958884358406, "rewards//mean": 0.84307861328125, "rewards//std": 0.020511779934167862, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0978, "grad_norm": 0.6140331029891968, "kl": 0.035078632878139615, "learning_rate": 4.904026990794356e-06, "loss": 0.0035, "num_tokens": 3190488.0, "reward": 0.830810546875, "reward_std": 0.012093999423086643, "rewards//mean": 0.830810546875, "rewards//std": 0.02350844070315361, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.098, "grad_norm": 0.590908408164978, "kl": 0.03199684526771307, "learning_rate": 4.903591099403181e-06, "loss": 0.0032, "num_tokens": 3197032.0, "reward": 0.8131103515625, "reward_std": 0.008650442585349083, "rewards//mean": 0.8131103515625, "rewards//std": 0.014419297687709332, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0982, "grad_norm": 0.604034423828125, "kl": 0.029838560381904244, "learning_rate": 4.903154239845798e-06, "loss": 0.003, "num_tokens": 3203568.0, "reward": 0.85394287109375, "reward_std": 0.01665610447525978, "rewards//mean": 0.85394287109375, "rewards//std": 0.02005881257355213, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0984, "grad_norm": 0.5806700587272644, "kl": 0.03282779362052679, "learning_rate": 4.902716412298174e-06, "loss": 0.0033, "num_tokens": 3210000.0, "reward": 0.85321044921875, "reward_std": 0.016802972182631493, "rewards//mean": 0.85321044921875, "rewards//std": 0.04070698097348213, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0986, "grad_norm": 0.6078839898109436, "kl": 0.03290758584626019, "learning_rate": 4.902277616936667e-06, "loss": 0.0033, "num_tokens": 3216488.0, "reward": 0.82843017578125, "reward_std": 0.009662855416536331, "rewards//mean": 0.82843017578125, "rewards//std": 0.022695180028676987, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0988, "grad_norm": 0.7613587379455566, "kl": 0.03131350572220981, "learning_rate": 4.901837853938024e-06, "loss": 0.0031, "num_tokens": 3222912.0, "reward": 0.78887939453125, "reward_std": 0.012067398056387901, "rewards//mean": 0.78887939453125, "rewards//std": 0.030761227011680603, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.099, "grad_norm": 0.6558058857917786, "kl": 0.03489048779010773, "learning_rate": 4.90139712347938e-06, "loss": 0.0035, "num_tokens": 3229480.0, "reward": 0.84124755859375, "reward_std": 0.01762406900525093, "rewards//mean": 0.84124755859375, "rewards//std": 0.035579923540353775, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0992, "grad_norm": 0.6649083495140076, "kl": 0.036183219868689775, "learning_rate": 4.900955425738262e-06, "loss": 0.0036, "num_tokens": 3235984.0, "reward": 0.8514404296875, "reward_std": 0.014244592748582363, "rewards//mean": 0.8514404296875, "rewards//std": 0.036238234490156174, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0994, "grad_norm": 0.7367611527442932, "kl": 0.03572784992866218, "learning_rate": 4.900512760892585e-06, "loss": 0.0036, "num_tokens": 3242488.0, "reward": 0.855712890625, "reward_std": 0.011523640714585781, "rewards//mean": 0.855712890625, "rewards//std": 0.01867382600903511, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0996, "grad_norm": 0.6988459229469299, "kl": 0.03196686413139105, "learning_rate": 4.900069129120656e-06, "loss": 0.0032, "num_tokens": 3249040.0, "reward": 0.8355712890625, "reward_std": 0.017917927354574203, "rewards//mean": 0.8355712890625, "rewards//std": 0.0311624426394701, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.0998, "grad_norm": 0.5815377235412598, "kl": 0.031875348184257746, "learning_rate": 4.899624530601168e-06, "loss": 0.0032, "num_tokens": 3255536.0, "reward": 0.84539794921875, "reward_std": 0.012960272841155529, "rewards//mean": 0.84539794921875, "rewards//std": 0.03601589426398277, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1, "grad_norm": 0.6033855080604553, "kl": 0.03386277239769697, "learning_rate": 4.899178965513206e-06, "loss": 0.0034, "num_tokens": 3262064.0, "reward": 0.82635498046875, "reward_std": 0.011881604790687561, "rewards//mean": 0.82635498046875, "rewards//std": 0.02400852181017399, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1002, "grad_norm": 0.6250687837600708, "kl": 0.03108464856632054, "learning_rate": 4.8987324340362445e-06, "loss": 0.0031, "num_tokens": 3268632.0, "reward": 0.8099365234375, "reward_std": 0.009064503014087677, "rewards//mean": 0.8099365234375, "rewards//std": 0.022906716912984848, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1004, "grad_norm": 0.6700149774551392, "kl": 0.03416571137495339, "learning_rate": 4.898284936350144e-06, "loss": 0.0034, "num_tokens": 3275104.0, "reward": 0.83502197265625, "reward_std": 0.012307664379477501, "rewards//mean": 0.83502197265625, "rewards//std": 0.026518534868955612, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1006, "grad_norm": 0.6595599055290222, "kl": 0.03373102657496929, "learning_rate": 4.897836472635159e-06, "loss": 0.0034, "num_tokens": 3281712.0, "reward": 0.865478515625, "reward_std": 0.010280190035700798, "rewards//mean": 0.865478515625, "rewards//std": 0.03414534777402878, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1008, "grad_norm": 0.6997821927070618, "kl": 0.02997818193398416, "learning_rate": 4.89738704307193e-06, "loss": 0.003, "num_tokens": 3288280.0, "reward": 0.82611083984375, "reward_std": 0.0106052216142416, "rewards//mean": 0.82611083984375, "rewards//std": 0.0170197244733572, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.101, "grad_norm": 0.5888683795928955, "kl": 0.03280960535630584, "learning_rate": 4.896936647841485e-06, "loss": 0.0033, "num_tokens": 3294792.0, "reward": 0.8609619140625, "reward_std": 0.013485745526850224, "rewards//mean": 0.8609619140625, "rewards//std": 0.02458459883928299, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1012, "grad_norm": 0.6109232902526855, "kl": 0.03339107520878315, "learning_rate": 4.896485287125247e-06, "loss": 0.0033, "num_tokens": 3301264.0, "reward": 0.84881591796875, "reward_std": 0.011974513530731201, "rewards//mean": 0.84881591796875, "rewards//std": 0.038548704236745834, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1014, "grad_norm": 0.5779264569282532, "kl": 0.027007358381524682, "learning_rate": 4.896032961105021e-06, "loss": 0.0027, "num_tokens": 3307824.0, "reward": 0.8441162109375, "reward_std": 0.010420424863696098, "rewards//mean": 0.8441162109375, "rewards//std": 0.028803668916225433, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1016, "grad_norm": 0.6584574580192566, "kl": 0.03429147810675204, "learning_rate": 4.8955796699630045e-06, "loss": 0.0034, "num_tokens": 3314424.0, "reward": 0.7806396484375, "reward_std": 0.009217744693160057, "rewards//mean": 0.7806396484375, "rewards//std": 0.015859151259064674, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1018, "grad_norm": 0.6022382974624634, "kl": 0.03081224230118096, "learning_rate": 4.895125413881783e-06, "loss": 0.0031, "num_tokens": 3320928.0, "reward": 0.8909912109375, "reward_std": 0.009043923579156399, "rewards//mean": 0.8909912109375, "rewards//std": 0.0231329295784235, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.102, "grad_norm": 0.672275960445404, "kl": 0.03528377669863403, "learning_rate": 4.894670193044332e-06, "loss": 0.0035, "num_tokens": 3327376.0, "reward": 0.86248779296875, "reward_std": 0.011328982189297676, "rewards//mean": 0.86248779296875, "rewards//std": 0.035140261054039, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1022, "grad_norm": 0.8039358258247375, "kl": 0.036853160709142685, "learning_rate": 4.894214007634014e-06, "loss": 0.0037, "num_tokens": 3333928.0, "reward": 0.79327392578125, "reward_std": 0.01666302978992462, "rewards//mean": 0.79327392578125, "rewards//std": 0.030523112043738365, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1024, "grad_norm": 0.6324937343597412, "kl": 0.030147293815389276, "learning_rate": 4.893756857834579e-06, "loss": 0.003, "num_tokens": 3340520.0, "reward": 0.800048828125, "reward_std": 0.010150490328669548, "rewards//mean": 0.800048828125, "rewards//std": 0.026810409501194954, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1026, "grad_norm": 0.6281384229660034, "kl": 0.0276860895100981, "learning_rate": 4.893298743830168e-06, "loss": 0.0028, "num_tokens": 3346992.0, "reward": 0.8341064453125, "reward_std": 0.009020951576530933, "rewards//mean": 0.8341064453125, "rewards//std": 0.02563832886517048, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1028, "grad_norm": 0.6529840230941772, "kl": 0.027598173590376973, "learning_rate": 4.89283966580531e-06, "loss": 0.0028, "num_tokens": 3353528.0, "reward": 0.84344482421875, "reward_std": 0.012123117223381996, "rewards//mean": 0.84344482421875, "rewards//std": 0.03667231649160385, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.103, "grad_norm": 0.6482062339782715, "kl": 0.03003290994092822, "learning_rate": 4.8923796239449206e-06, "loss": 0.003, "num_tokens": 3360064.0, "reward": 0.83209228515625, "reward_std": 0.010229753330349922, "rewards//mean": 0.83209228515625, "rewards//std": 0.019155515357851982, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1032, "grad_norm": 0.6167376041412354, "kl": 0.03031931398436427, "learning_rate": 4.891918618434305e-06, "loss": 0.003, "num_tokens": 3366520.0, "reward": 0.834228515625, "reward_std": 0.010733341798186302, "rewards//mean": 0.834228515625, "rewards//std": 0.014762187376618385, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1034, "grad_norm": 0.6656414270401001, "kl": 0.03264021617360413, "learning_rate": 4.891456649459156e-06, "loss": 0.0033, "num_tokens": 3373016.0, "reward": 0.84588623046875, "reward_std": 0.012665395624935627, "rewards//mean": 0.84588623046875, "rewards//std": 0.02881384827196598, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1036, "grad_norm": 0.6099600791931152, "kl": 0.030192186124622822, "learning_rate": 4.890993717205553e-06, "loss": 0.003, "num_tokens": 3379576.0, "reward": 0.8043212890625, "reward_std": 0.010902250185608864, "rewards//mean": 0.8043212890625, "rewards//std": 0.027416452765464783, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1038, "grad_norm": 0.599140465259552, "kl": 0.027934797341004014, "learning_rate": 4.8905298218599685e-06, "loss": 0.0028, "num_tokens": 3386096.0, "reward": 0.83251953125, "reward_std": 0.01088377833366394, "rewards//mean": 0.83251953125, "rewards//std": 0.020978152751922607, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.104, "grad_norm": 0.5696952939033508, "kl": 0.03250581002794206, "learning_rate": 4.8900649636092565e-06, "loss": 0.0033, "num_tokens": 3392512.0, "reward": 0.8446044921875, "reward_std": 0.01649361103773117, "rewards//mean": 0.8446044921875, "rewards//std": 0.04427339881658554, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1042, "grad_norm": 0.6459939479827881, "kl": 0.03033285029232502, "learning_rate": 4.889599142640663e-06, "loss": 0.003, "num_tokens": 3399056.0, "reward": 0.83172607421875, "reward_std": 0.011939212679862976, "rewards//mean": 0.83172607421875, "rewards//std": 0.020801976323127747, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1044, "grad_norm": 0.5709954500198364, "kl": 0.0319932468701154, "learning_rate": 4.889132359141822e-06, "loss": 0.0032, "num_tokens": 3405552.0, "reward": 0.858642578125, "reward_std": 0.017098452895879745, "rewards//mean": 0.858642578125, "rewards//std": 0.02903946116566658, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1046, "grad_norm": 0.8689175844192505, "kl": 0.028929896419867873, "learning_rate": 4.888664613300751e-06, "loss": 0.0029, "num_tokens": 3411992.0, "reward": 0.85455322265625, "reward_std": 0.020008470863103867, "rewards//mean": 0.85455322265625, "rewards//std": 0.02635362185537815, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1048, "grad_norm": 0.6963226795196533, "kl": 0.031829854240641, "learning_rate": 4.888195905305859e-06, "loss": 0.0032, "num_tokens": 3418448.0, "reward": 0.788818359375, "reward_std": 0.014885769225656986, "rewards//mean": 0.788818359375, "rewards//std": 0.0243386123329401, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.105, "grad_norm": 0.6378881931304932, "kl": 0.028337573632597923, "learning_rate": 4.887726235345943e-06, "loss": 0.0028, "num_tokens": 3424928.0, "reward": 0.81781005859375, "reward_std": 0.010992229916155338, "rewards//mean": 0.81781005859375, "rewards//std": 0.016839103773236275, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1052, "grad_norm": 0.6507261395454407, "kl": 0.033656949177384377, "learning_rate": 4.8872556036101845e-06, "loss": 0.0034, "num_tokens": 3431400.0, "reward": 0.85791015625, "reward_std": 0.011110810562968254, "rewards//mean": 0.85791015625, "rewards//std": 0.03369999676942825, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1054, "grad_norm": 0.6233684420585632, "kl": 0.02477980637922883, "learning_rate": 4.886784010288155e-06, "loss": 0.0025, "num_tokens": 3438000.0, "reward": 0.82098388671875, "reward_std": 0.016229748725891113, "rewards//mean": 0.82098388671875, "rewards//std": 0.021916091442108154, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1056, "grad_norm": 0.7725349068641663, "kl": 0.030107759404927492, "learning_rate": 4.886311455569811e-06, "loss": 0.003, "num_tokens": 3444520.0, "reward": 0.8575439453125, "reward_std": 0.013236935250461102, "rewards//mean": 0.8575439453125, "rewards//std": 0.024675559252500534, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1058, "grad_norm": 0.6618022322654724, "kl": 0.030748266261070967, "learning_rate": 4.885837939645499e-06, "loss": 0.0031, "num_tokens": 3451088.0, "reward": 0.84832763671875, "reward_std": 0.012683728709816933, "rewards//mean": 0.84832763671875, "rewards//std": 0.023923883214592934, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.106, "grad_norm": 0.6690743565559387, "kl": 0.030612023547291756, "learning_rate": 4.885363462705949e-06, "loss": 0.0031, "num_tokens": 3457648.0, "reward": 0.855224609375, "reward_std": 0.015518763102591038, "rewards//mean": 0.855224609375, "rewards//std": 0.03478484973311424, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1062, "grad_norm": 0.6613344550132751, "kl": 0.03167608566582203, "learning_rate": 4.884888024942282e-06, "loss": 0.0032, "num_tokens": 3464160.0, "reward": 0.8558349609375, "reward_std": 0.018330905586481094, "rewards//mean": 0.8558349609375, "rewards//std": 0.035486891865730286, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1064, "grad_norm": 0.7026782035827637, "kl": 0.030375287402421236, "learning_rate": 4.884411626546004e-06, "loss": 0.003, "num_tokens": 3470768.0, "reward": 0.822998046875, "reward_std": 0.012220524251461029, "rewards//mean": 0.822998046875, "rewards//std": 0.024654999375343323, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1066, "grad_norm": 0.6098203063011169, "kl": 0.0286163913551718, "learning_rate": 4.883934267709007e-06, "loss": 0.0029, "num_tokens": 3477336.0, "reward": 0.84466552734375, "reward_std": 0.012859920039772987, "rewards//mean": 0.84466552734375, "rewards//std": 0.0268813855946064, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1068, "grad_norm": 0.6562339663505554, "kl": 0.0278691683197394, "learning_rate": 4.883455948623574e-06, "loss": 0.0028, "num_tokens": 3483840.0, "reward": 0.85479736328125, "reward_std": 0.010976341553032398, "rewards//mean": 0.85479736328125, "rewards//std": 0.022364608943462372, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.107, "grad_norm": 0.6684576272964478, "kl": 0.03230980271473527, "learning_rate": 4.882976669482368e-06, "loss": 0.0032, "num_tokens": 3490368.0, "reward": 0.8468017578125, "reward_std": 0.010632255114614964, "rewards//mean": 0.8468017578125, "rewards//std": 0.017778070643544197, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1072, "grad_norm": 0.6153666377067566, "kl": 0.03179552615620196, "learning_rate": 4.882496430478445e-06, "loss": 0.0032, "num_tokens": 3496880.0, "reward": 0.857177734375, "reward_std": 0.01358007825911045, "rewards//mean": 0.857177734375, "rewards//std": 0.02453683316707611, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1074, "grad_norm": 0.6012992262840271, "kl": 0.03250846825540066, "learning_rate": 4.882015231805245e-06, "loss": 0.0033, "num_tokens": 3503432.0, "reward": 0.87005615234375, "reward_std": 0.010263869538903236, "rewards//mean": 0.87005615234375, "rewards//std": 0.01699480228126049, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1076, "grad_norm": 0.6483962535858154, "kl": 0.03316758247092366, "learning_rate": 4.881533073656594e-06, "loss": 0.0033, "num_tokens": 3509960.0, "reward": 0.8270263671875, "reward_std": 0.009003392420709133, "rewards//mean": 0.8270263671875, "rewards//std": 0.01653578132390976, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1078, "grad_norm": 0.6129351258277893, "kl": 0.03461948991753161, "learning_rate": 4.8810499562267066e-06, "loss": 0.0035, "num_tokens": 3516416.0, "reward": 0.84686279296875, "reward_std": 0.008496642112731934, "rewards//mean": 0.84686279296875, "rewards//std": 0.02763170190155506, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.108, "grad_norm": 0.5928858518600464, "kl": 0.03543858043849468, "learning_rate": 4.88056587971018e-06, "loss": 0.0035, "num_tokens": 3522880.0, "reward": 0.80194091796875, "reward_std": 0.011357907205820084, "rewards//mean": 0.80194091796875, "rewards//std": 0.020311543717980385, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1082, "grad_norm": 0.7632496356964111, "kl": 0.034994550282135606, "learning_rate": 4.880080844302004e-06, "loss": 0.0035, "num_tokens": 3529352.0, "reward": 0.78594970703125, "reward_std": 0.009396329522132874, "rewards//mean": 0.78594970703125, "rewards//std": 0.017719540745019913, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1084, "grad_norm": 0.6258948445320129, "kl": 0.027121696388348937, "learning_rate": 4.879594850197548e-06, "loss": 0.0027, "num_tokens": 3535864.0, "reward": 0.82269287109375, "reward_std": 0.010259194299578667, "rewards//mean": 0.82269287109375, "rewards//std": 0.03360007703304291, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1086, "grad_norm": 0.6535525321960449, "kl": 0.029613361693918705, "learning_rate": 4.87910789759257e-06, "loss": 0.003, "num_tokens": 3542448.0, "reward": 0.84735107421875, "reward_std": 0.012180662713944912, "rewards//mean": 0.84735107421875, "rewards//std": 0.029580144211649895, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1088, "grad_norm": 0.654188871383667, "kl": 0.02947809756733477, "learning_rate": 4.878619986683215e-06, "loss": 0.0029, "num_tokens": 3548968.0, "reward": 0.83056640625, "reward_std": 0.013154493644833565, "rewards//mean": 0.83056640625, "rewards//std": 0.036301881074905396, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.109, "grad_norm": 0.6415969729423523, "kl": 0.029625217197462916, "learning_rate": 4.8781311176660144e-06, "loss": 0.003, "num_tokens": 3555472.0, "reward": 0.82647705078125, "reward_std": 0.010771572589874268, "rewards//mean": 0.82647705078125, "rewards//std": 0.024693572893738747, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1092, "grad_norm": 0.5805480480194092, "kl": 0.029833517502993345, "learning_rate": 4.8776412907378845e-06, "loss": 0.003, "num_tokens": 3561944.0, "reward": 0.83563232421875, "reward_std": 0.014117766171693802, "rewards//mean": 0.83563232421875, "rewards//std": 0.029275618493556976, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1094, "grad_norm": 0.6056393384933472, "kl": 0.029833688866347075, "learning_rate": 4.877150506096127e-06, "loss": 0.003, "num_tokens": 3568408.0, "reward": 0.84912109375, "reward_std": 0.013056284748017788, "rewards//mean": 0.84912109375, "rewards//std": 0.022200971841812134, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1096, "grad_norm": 0.7053980827331543, "kl": 0.03147451486438513, "learning_rate": 4.8766587639384285e-06, "loss": 0.0031, "num_tokens": 3574896.0, "reward": 0.85784912109375, "reward_std": 0.014083274640142918, "rewards//mean": 0.85784912109375, "rewards//std": 0.025321299210190773, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1098, "grad_norm": 0.5739861130714417, "kl": 0.03014273289591074, "learning_rate": 4.876166064462866e-06, "loss": 0.003, "num_tokens": 3581352.0, "reward": 0.82379150390625, "reward_std": 0.012559041380882263, "rewards//mean": 0.82379150390625, "rewards//std": 0.029798876494169235, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.11, "grad_norm": 0.5770258903503418, "kl": 0.030987678095698357, "learning_rate": 4.8756724078678955e-06, "loss": 0.0031, "num_tokens": 3587776.0, "reward": 0.8641357421875, "reward_std": 0.015060827136039734, "rewards//mean": 0.8641357421875, "rewards//std": 0.02080388553440571, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1102, "grad_norm": 0.6917813420295715, "kl": 0.031157562509179115, "learning_rate": 4.875177794352364e-06, "loss": 0.0031, "num_tokens": 3594472.0, "reward": 0.82940673828125, "reward_std": 0.010959278792142868, "rewards//mean": 0.82940673828125, "rewards//std": 0.017772428691387177, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1104, "grad_norm": 0.6400755643844604, "kl": 0.03357220510952175, "learning_rate": 4.8746822241155006e-06, "loss": 0.0034, "num_tokens": 3601016.0, "reward": 0.80755615234375, "reward_std": 0.010078158229589462, "rewards//mean": 0.80755615234375, "rewards//std": 0.021544797345995903, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1106, "grad_norm": 0.5885239839553833, "kl": 0.031021602218970656, "learning_rate": 4.874185697356921e-06, "loss": 0.0031, "num_tokens": 3607544.0, "reward": 0.84344482421875, "reward_std": 0.014324812218546867, "rewards//mean": 0.84344482421875, "rewards//std": 0.03417675569653511, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1108, "grad_norm": 0.5927717089653015, "kl": 0.03164471639320254, "learning_rate": 4.873688214276628e-06, "loss": 0.0032, "num_tokens": 3614016.0, "reward": 0.84332275390625, "reward_std": 0.008280541747808456, "rewards//mean": 0.84332275390625, "rewards//std": 0.016515973955392838, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.111, "grad_norm": 0.6338167786598206, "kl": 0.035400526598095894, "learning_rate": 4.873189775075005e-06, "loss": 0.0035, "num_tokens": 3620536.0, "reward": 0.86553955078125, "reward_std": 0.011882786639034748, "rewards//mean": 0.86553955078125, "rewards//std": 0.022378141060471535, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1112, "grad_norm": 0.6624349355697632, "kl": 0.03116373484954238, "learning_rate": 4.872690379952824e-06, "loss": 0.0031, "num_tokens": 3627152.0, "reward": 0.8284912109375, "reward_std": 0.013359297066926956, "rewards//mean": 0.8284912109375, "rewards//std": 0.028820481151342392, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1114, "grad_norm": 0.587210476398468, "kl": 0.03523128619417548, "learning_rate": 4.8721900291112415e-06, "loss": 0.0035, "num_tokens": 3633672.0, "reward": 0.86175537109375, "reward_std": 0.013639332726597786, "rewards//mean": 0.86175537109375, "rewards//std": 0.03241875395178795, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1116, "grad_norm": 0.6118485331535339, "kl": 0.03052513301372528, "learning_rate": 4.871688722751799e-06, "loss": 0.0031, "num_tokens": 3640240.0, "reward": 0.7965087890625, "reward_std": 0.012003393843770027, "rewards//mean": 0.7965087890625, "rewards//std": 0.018145518377423286, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1118, "grad_norm": 0.6017597317695618, "kl": 0.031704419292509556, "learning_rate": 4.8711864610764235e-06, "loss": 0.0032, "num_tokens": 3646784.0, "reward": 0.86309814453125, "reward_std": 0.013073796406388283, "rewards//mean": 0.86309814453125, "rewards//std": 0.028868434950709343, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.112, "grad_norm": 0.6723352074623108, "kl": 0.036269206553697586, "learning_rate": 4.870683244287425e-06, "loss": 0.0036, "num_tokens": 3653280.0, "reward": 0.85546875, "reward_std": 0.01556839793920517, "rewards//mean": 0.85546875, "rewards//std": 0.03975088149309158, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1122, "grad_norm": 0.6495388746261597, "kl": 0.03623637277632952, "learning_rate": 4.870179072587499e-06, "loss": 0.0036, "num_tokens": 3659856.0, "reward": 0.8348388671875, "reward_std": 0.010795267298817635, "rewards//mean": 0.8348388671875, "rewards//std": 0.021253081038594246, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1124, "grad_norm": 0.6097123622894287, "kl": 0.03436880186200142, "learning_rate": 4.869673946179726e-06, "loss": 0.0034, "num_tokens": 3666440.0, "reward": 0.84228515625, "reward_std": 0.012537036091089249, "rewards//mean": 0.84228515625, "rewards//std": 0.027555515989661217, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1126, "grad_norm": 0.6926887631416321, "kl": 0.02803546912036836, "learning_rate": 4.8691678652675715e-06, "loss": 0.0028, "num_tokens": 3673000.0, "reward": 0.86138916015625, "reward_std": 0.016193002462387085, "rewards//mean": 0.86138916015625, "rewards//std": 0.03052063286304474, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1128, "grad_norm": 0.592189371585846, "kl": 0.03593447944149375, "learning_rate": 4.8686608300548836e-06, "loss": 0.0036, "num_tokens": 3679384.0, "reward": 0.853515625, "reward_std": 0.012158144265413284, "rewards//mean": 0.853515625, "rewards//std": 0.017947981134057045, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.113, "grad_norm": 0.6793131232261658, "kl": 0.03625408164225519, "learning_rate": 4.868152840745896e-06, "loss": 0.0036, "num_tokens": 3685888.0, "reward": 0.8232421875, "reward_std": 0.011547568254172802, "rewards//mean": 0.8232421875, "rewards//std": 0.020063435658812523, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1132, "grad_norm": 0.629365861415863, "kl": 0.029668693197891116, "learning_rate": 4.8676438975452276e-06, "loss": 0.003, "num_tokens": 3692424.0, "reward": 0.8212890625, "reward_std": 0.010893348604440689, "rewards//mean": 0.8212890625, "rewards//std": 0.027330461889505386, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1134, "grad_norm": 0.6305131316184998, "kl": 0.03516363073140383, "learning_rate": 4.86713400065788e-06, "loss": 0.0035, "num_tokens": 3698872.0, "reward": 0.8587646484375, "reward_std": 0.011101635172963142, "rewards//mean": 0.8587646484375, "rewards//std": 0.029472799971699715, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1136, "grad_norm": 0.5981056690216064, "kl": 0.03191525675356388, "learning_rate": 4.866623150289241e-06, "loss": 0.0032, "num_tokens": 3705320.0, "reward": 0.88812255859375, "reward_std": 0.010125132277607918, "rewards//mean": 0.88812255859375, "rewards//std": 0.02194715104997158, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1138, "grad_norm": 0.5857009887695312, "kl": 0.03170208935625851, "learning_rate": 4.86611134664508e-06, "loss": 0.0032, "num_tokens": 3711736.0, "reward": 0.854248046875, "reward_std": 0.012967083603143692, "rewards//mean": 0.854248046875, "rewards//std": 0.017837999388575554, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.114, "grad_norm": 0.6245691180229187, "kl": 0.03494541137479246, "learning_rate": 4.865598589931552e-06, "loss": 0.0035, "num_tokens": 3718328.0, "reward": 0.79693603515625, "reward_std": 0.011958854272961617, "rewards//mean": 0.79693603515625, "rewards//std": 0.022363930940628052, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1142, "grad_norm": 0.6825640797615051, "kl": 0.03346801269799471, "learning_rate": 4.865084880355193e-06, "loss": 0.0033, "num_tokens": 3724896.0, "reward": 0.8687744140625, "reward_std": 0.01131730992347002, "rewards//mean": 0.8687744140625, "rewards//std": 0.015449140220880508, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1144, "grad_norm": 0.6870326399803162, "kl": 0.03804417513310909, "learning_rate": 4.864570218122928e-06, "loss": 0.0038, "num_tokens": 3731392.0, "reward": 0.84197998046875, "reward_std": 0.012165633030235767, "rewards//mean": 0.84197998046875, "rewards//std": 0.020326443016529083, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1146, "grad_norm": 0.6390773057937622, "kl": 0.034224626841023564, "learning_rate": 4.864054603442063e-06, "loss": 0.0034, "num_tokens": 3737888.0, "reward": 0.84515380859375, "reward_std": 0.01106143370270729, "rewards//mean": 0.84515380859375, "rewards//std": 0.02885112538933754, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1148, "grad_norm": 0.6096854209899902, "kl": 0.03863173257559538, "learning_rate": 4.863538036520285e-06, "loss": 0.0039, "num_tokens": 3744464.0, "reward": 0.8778076171875, "reward_std": 0.012170696631073952, "rewards//mean": 0.8778076171875, "rewards//std": 0.021704141050577164, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.115, "grad_norm": 0.6443453431129456, "kl": 0.041011053370311856, "learning_rate": 4.863020517565669e-06, "loss": 0.0041, "num_tokens": 3750944.0, "reward": 0.81353759765625, "reward_std": 0.009858833625912666, "rewards//mean": 0.81353759765625, "rewards//std": 0.024500945582985878, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1152, "grad_norm": 0.6102349758148193, "kl": 0.033064100658521056, "learning_rate": 4.862502046786671e-06, "loss": 0.0033, "num_tokens": 3757576.0, "reward": 0.82891845703125, "reward_std": 0.010357474908232689, "rewards//mean": 0.82891845703125, "rewards//std": 0.016131121665239334, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1154, "grad_norm": 0.6194673180580139, "kl": 0.030226377304643393, "learning_rate": 4.861982624392132e-06, "loss": 0.003, "num_tokens": 3764128.0, "reward": 0.86114501953125, "reward_std": 0.01061889249831438, "rewards//mean": 0.86114501953125, "rewards//std": 0.02764703705906868, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1156, "grad_norm": 0.6460333466529846, "kl": 0.03713248623535037, "learning_rate": 4.861462250591273e-06, "loss": 0.0037, "num_tokens": 3770640.0, "reward": 0.841064453125, "reward_std": 0.013898089528083801, "rewards//mean": 0.841064453125, "rewards//std": 0.033529773354530334, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1158, "grad_norm": 0.6622205376625061, "kl": 0.03561116009950638, "learning_rate": 4.860940925593703e-06, "loss": 0.0036, "num_tokens": 3777336.0, "reward": 0.834716796875, "reward_std": 0.01101248525083065, "rewards//mean": 0.834716796875, "rewards//std": 0.030344609171152115, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.116, "grad_norm": 0.6115918159484863, "kl": 0.041194694116711617, "learning_rate": 4.86041864960941e-06, "loss": 0.0041, "num_tokens": 3783792.0, "reward": 0.85552978515625, "reward_std": 0.011757418513298035, "rewards//mean": 0.85552978515625, "rewards//std": 0.01923123002052307, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1162, "grad_norm": 0.5668520927429199, "kl": 0.03501286217942834, "learning_rate": 4.859895422848767e-06, "loss": 0.0035, "num_tokens": 3790240.0, "reward": 0.81658935546875, "reward_std": 0.010440420359373093, "rewards//mean": 0.81658935546875, "rewards//std": 0.028868434950709343, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1164, "grad_norm": 0.6220555901527405, "kl": 0.041811997536569834, "learning_rate": 4.859371245522531e-06, "loss": 0.0042, "num_tokens": 3796760.0, "reward": 0.826904296875, "reward_std": 0.013789907097816467, "rewards//mean": 0.826904296875, "rewards//std": 0.025323763489723206, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1166, "grad_norm": 0.6384966969490051, "kl": 0.035850258776918054, "learning_rate": 4.8588461178418375e-06, "loss": 0.0036, "num_tokens": 3803336.0, "reward": 0.85491943359375, "reward_std": 0.016050763428211212, "rewards//mean": 0.85491943359375, "rewards//std": 0.03363789618015289, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1168, "grad_norm": 0.681158185005188, "kl": 0.04152256716042757, "learning_rate": 4.858320040018212e-06, "loss": 0.0042, "num_tokens": 3809832.0, "reward": 0.821044921875, "reward_std": 0.014268225058913231, "rewards//mean": 0.821044921875, "rewards//std": 0.023734018206596375, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.117, "grad_norm": 0.6424878835678101, "kl": 0.035631838254630566, "learning_rate": 4.857793012263555e-06, "loss": 0.0036, "num_tokens": 3816272.0, "reward": 0.88055419921875, "reward_std": 0.010589011013507843, "rewards//mean": 0.88055419921875, "rewards//std": 0.02607703022658825, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1172, "grad_norm": 0.6001079082489014, "kl": 0.03453458775766194, "learning_rate": 4.857265034790155e-06, "loss": 0.0035, "num_tokens": 3822744.0, "reward": 0.84454345703125, "reward_std": 0.01057126373052597, "rewards//mean": 0.84454345703125, "rewards//std": 0.02217564359307289, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1174, "grad_norm": 0.6222141981124878, "kl": 0.041371190221980214, "learning_rate": 4.85673610781068e-06, "loss": 0.0041, "num_tokens": 3829272.0, "reward": 0.8468017578125, "reward_std": 0.014979726634919643, "rewards//mean": 0.8468017578125, "rewards//std": 0.018115462735295296, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1176, "grad_norm": 0.6488871574401855, "kl": 0.037690630881115794, "learning_rate": 4.856206231538184e-06, "loss": 0.0038, "num_tokens": 3835808.0, "reward": 0.87548828125, "reward_std": 0.013796349987387657, "rewards//mean": 0.87548828125, "rewards//std": 0.024155063554644585, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1178, "grad_norm": 0.6412578225135803, "kl": 0.0396907776594162, "learning_rate": 4.855675406186099e-06, "loss": 0.004, "num_tokens": 3842312.0, "reward": 0.82733154296875, "reward_std": 0.014147238805890083, "rewards//mean": 0.82733154296875, "rewards//std": 0.022736497223377228, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.118, "grad_norm": 0.6577656865119934, "kl": 0.03925878135487437, "learning_rate": 4.855143631968242e-06, "loss": 0.0039, "num_tokens": 3848880.0, "reward": 0.85369873046875, "reward_std": 0.012300359085202217, "rewards//mean": 0.85369873046875, "rewards//std": 0.0363980270922184, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1182, "grad_norm": 0.6258107423782349, "kl": 0.04022790654562414, "learning_rate": 4.854610909098813e-06, "loss": 0.004, "num_tokens": 3855376.0, "reward": 0.83575439453125, "reward_std": 0.012947885319590569, "rewards//mean": 0.83575439453125, "rewards//std": 0.033850088715553284, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1184, "grad_norm": 0.6062402129173279, "kl": 0.04229812603443861, "learning_rate": 4.854077237792389e-06, "loss": 0.0042, "num_tokens": 3861856.0, "reward": 0.854736328125, "reward_std": 0.007470754906535149, "rewards//mean": 0.854736328125, "rewards//std": 0.018227506428956985, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1186, "grad_norm": 0.5723037719726562, "kl": 0.03844059142284095, "learning_rate": 4.853542618263937e-06, "loss": 0.0038, "num_tokens": 3868264.0, "reward": 0.8463134765625, "reward_std": 0.00796434748917818, "rewards//mean": 0.8463134765625, "rewards//std": 0.023263435810804367, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1188, "grad_norm": 0.6231993436813354, "kl": 0.03992640180513263, "learning_rate": 4.8530070507288e-06, "loss": 0.004, "num_tokens": 3874784.0, "reward": 0.8321533203125, "reward_std": 0.014231465756893158, "rewards//mean": 0.8321533203125, "rewards//std": 0.03230912983417511, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.119, "grad_norm": 0.6749900579452515, "kl": 0.0327031088527292, "learning_rate": 4.852470535402703e-06, "loss": 0.0033, "num_tokens": 3881304.0, "reward": 0.84979248046875, "reward_std": 0.010251807048916817, "rewards//mean": 0.84979248046875, "rewards//std": 0.02176499553024769, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1192, "grad_norm": 0.6172786951065063, "kl": 0.03722618380561471, "learning_rate": 4.851933072501756e-06, "loss": 0.0037, "num_tokens": 3887856.0, "reward": 0.83587646484375, "reward_std": 0.00847033690661192, "rewards//mean": 0.83587646484375, "rewards//std": 0.021150536835193634, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1194, "grad_norm": 0.6346173286437988, "kl": 0.042119944002479315, "learning_rate": 4.851394662242449e-06, "loss": 0.0042, "num_tokens": 3894440.0, "reward": 0.8529052734375, "reward_std": 0.014024918898940086, "rewards//mean": 0.8529052734375, "rewards//std": 0.0261920765042305, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1196, "grad_norm": 0.6278045773506165, "kl": 0.03757873619906604, "learning_rate": 4.850855304841653e-06, "loss": 0.0038, "num_tokens": 3900944.0, "reward": 0.786376953125, "reward_std": 0.010384895838797092, "rewards//mean": 0.786376953125, "rewards//std": 0.0285516157746315, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1198, "grad_norm": 0.6595480442047119, "kl": 0.04688437725417316, "learning_rate": 4.8503150005166225e-06, "loss": 0.0047, "num_tokens": 3907464.0, "reward": 0.802734375, "reward_std": 0.012809094972908497, "rewards//mean": 0.802734375, "rewards//std": 0.026502618566155434, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.12, "grad_norm": 0.5957270860671997, "kl": 0.03809724980965257, "learning_rate": 4.849773749484989e-06, "loss": 0.0038, "num_tokens": 3914072.0, "reward": 0.82818603515625, "reward_std": 0.012442629784345627, "rewards//mean": 0.82818603515625, "rewards//std": 0.023576516658067703, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1202, "grad_norm": 0.7704198360443115, "kl": 0.03094646194949746, "learning_rate": 4.849231551964771e-06, "loss": 0.0031, "num_tokens": 3920608.0, "reward": 0.7781982421875, "reward_std": 0.012034199200570583, "rewards//mean": 0.7781982421875, "rewards//std": 0.02012934535741806, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1204, "grad_norm": 0.6139811277389526, "kl": 0.040575713850557804, "learning_rate": 4.848688408174366e-06, "loss": 0.0041, "num_tokens": 3927152.0, "reward": 0.82891845703125, "reward_std": 0.012762252241373062, "rewards//mean": 0.82891845703125, "rewards//std": 0.0212725717574358, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1206, "grad_norm": 0.6253176331520081, "kl": 0.03625836200080812, "learning_rate": 4.84814431833255e-06, "loss": 0.0036, "num_tokens": 3933664.0, "reward": 0.828369140625, "reward_std": 0.00994873046875, "rewards//mean": 0.828369140625, "rewards//std": 0.03292476385831833, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1208, "grad_norm": 0.6379534006118774, "kl": 0.03596004471182823, "learning_rate": 4.847599282658483e-06, "loss": 0.0036, "num_tokens": 3940112.0, "reward": 0.83465576171875, "reward_std": 0.012594557367265224, "rewards//mean": 0.83465576171875, "rewards//std": 0.021179860457777977, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.121, "grad_norm": 0.6428269147872925, "kl": 0.04226218955591321, "learning_rate": 4.847053301371706e-06, "loss": 0.0042, "num_tokens": 3946704.0, "reward": 0.83428955078125, "reward_std": 0.010459795594215393, "rewards//mean": 0.83428955078125, "rewards//std": 0.022356484085321426, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1212, "grad_norm": 0.550739586353302, "kl": 0.03827002551406622, "learning_rate": 4.84650637469214e-06, "loss": 0.0038, "num_tokens": 3953160.0, "reward": 0.87030029296875, "reward_std": 0.011451773345470428, "rewards//mean": 0.87030029296875, "rewards//std": 0.0321001335978508, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1214, "grad_norm": 0.6382452249526978, "kl": 0.0332846415694803, "learning_rate": 4.845958502840087e-06, "loss": 0.0033, "num_tokens": 3959576.0, "reward": 0.86163330078125, "reward_std": 0.01746852695941925, "rewards//mean": 0.86163330078125, "rewards//std": 0.02861880511045456, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1216, "grad_norm": 0.7068121433258057, "kl": 0.0464365491643548, "learning_rate": 4.8454096860362284e-06, "loss": 0.0046, "num_tokens": 3966064.0, "reward": 0.79376220703125, "reward_std": 0.010808397084474564, "rewards//mean": 0.79376220703125, "rewards//std": 0.023053860291838646, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1218, "grad_norm": 0.6986582279205322, "kl": 0.040672212140634656, "learning_rate": 4.8448599245016306e-06, "loss": 0.0041, "num_tokens": 3972592.0, "reward": 0.8546142578125, "reward_std": 0.01843278855085373, "rewards//mean": 0.8546142578125, "rewards//std": 0.027085380628705025, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.122, "grad_norm": 0.7402421832084656, "kl": 0.04260013531893492, "learning_rate": 4.844309218457735e-06, "loss": 0.0043, "num_tokens": 3979104.0, "reward": 0.8359375, "reward_std": 0.011350443586707115, "rewards//mean": 0.8359375, "rewards//std": 0.027294989675283432, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1222, "grad_norm": 0.5928390622138977, "kl": 0.04137154296040535, "learning_rate": 4.843757568126366e-06, "loss": 0.0041, "num_tokens": 3985616.0, "reward": 0.8240966796875, "reward_std": 0.01190880499780178, "rewards//mean": 0.8240966796875, "rewards//std": 0.02292521297931671, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1224, "grad_norm": 0.681674599647522, "kl": 0.039917706744745374, "learning_rate": 4.84320497372973e-06, "loss": 0.004, "num_tokens": 3992328.0, "reward": 0.86822509765625, "reward_std": 0.014179814606904984, "rewards//mean": 0.86822509765625, "rewards//std": 0.02591339498758316, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1226, "grad_norm": 0.6957495808601379, "kl": 0.0385527154430747, "learning_rate": 4.8426514354904096e-06, "loss": 0.0039, "num_tokens": 3998832.0, "reward": 0.88836669921875, "reward_std": 0.014827560633420944, "rewards//mean": 0.88836669921875, "rewards//std": 0.03412356227636337, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1228, "grad_norm": 0.6795756220817566, "kl": 0.04442503722384572, "learning_rate": 4.842096953631371e-06, "loss": 0.0044, "num_tokens": 4005336.0, "reward": 0.8365478515625, "reward_std": 0.01342087984085083, "rewards//mean": 0.8365478515625, "rewards//std": 0.026831572875380516, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.123, "grad_norm": 0.6525948643684387, "kl": 0.04063747264444828, "learning_rate": 4.841541528375961e-06, "loss": 0.0041, "num_tokens": 4011760.0, "reward": 0.87091064453125, "reward_std": 0.016083110123872757, "rewards//mean": 0.87091064453125, "rewards//std": 0.02499214932322502, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1232, "grad_norm": 0.721349835395813, "kl": 0.036919957026839256, "learning_rate": 4.840985159947902e-06, "loss": 0.0037, "num_tokens": 4018352.0, "reward": 0.797607421875, "reward_std": 0.010739820078015327, "rewards//mean": 0.797607421875, "rewards//std": 0.021109074354171753, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1234, "grad_norm": 0.614031970500946, "kl": 0.041909544728696346, "learning_rate": 4.8404278485713005e-06, "loss": 0.0042, "num_tokens": 4024936.0, "reward": 0.86962890625, "reward_std": 0.015312773175537586, "rewards//mean": 0.86962890625, "rewards//std": 0.04199381172657013, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1236, "grad_norm": 0.7404701113700867, "kl": 0.037076502572745085, "learning_rate": 4.839869594470642e-06, "loss": 0.0037, "num_tokens": 4031464.0, "reward": 0.85089111328125, "reward_std": 0.013311058282852173, "rewards//mean": 0.85089111328125, "rewards//std": 0.02026229538023472, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1238, "grad_norm": 0.6280961036682129, "kl": 0.04641252523288131, "learning_rate": 4.839310397870791e-06, "loss": 0.0046, "num_tokens": 4038104.0, "reward": 0.82000732421875, "reward_std": 0.010196110233664513, "rewards//mean": 0.82000732421875, "rewards//std": 0.02073199860751629, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.124, "grad_norm": 0.6301640272140503, "kl": 0.03853551088832319, "learning_rate": 4.838750258996992e-06, "loss": 0.0039, "num_tokens": 4044672.0, "reward": 0.827392578125, "reward_std": 0.010251728817820549, "rewards//mean": 0.827392578125, "rewards//std": 0.032954175025224686, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1242, "grad_norm": 0.7439788579940796, "kl": 0.03770592529326677, "learning_rate": 4.838189178074867e-06, "loss": 0.0038, "num_tokens": 4051200.0, "reward": 0.833740234375, "reward_std": 0.01518142782151699, "rewards//mean": 0.833740234375, "rewards//std": 0.019160354509949684, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1244, "grad_norm": 0.6230072379112244, "kl": 0.043836275581270456, "learning_rate": 4.837627155330421e-06, "loss": 0.0044, "num_tokens": 4057696.0, "reward": 0.869873046875, "reward_std": 0.010131296701729298, "rewards//mean": 0.869873046875, "rewards//std": 0.026873571798205376, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1246, "grad_norm": 0.7052754759788513, "kl": 0.040967176435515285, "learning_rate": 4.837064190990036e-06, "loss": 0.0041, "num_tokens": 4064232.0, "reward": 0.79345703125, "reward_std": 0.00889979861676693, "rewards//mean": 0.79345703125, "rewards//std": 0.028590822592377663, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1248, "grad_norm": 0.6665441393852234, "kl": 0.04415145283564925, "learning_rate": 4.836500285280476e-06, "loss": 0.0044, "num_tokens": 4070696.0, "reward": 0.86328125, "reward_std": 0.01058972254395485, "rewards//mean": 0.86328125, "rewards//std": 0.02436969056725502, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.125, "grad_norm": 0.6481565237045288, "kl": 0.032439467730000615, "learning_rate": 4.83593543842888e-06, "loss": 0.0032, "num_tokens": 4077192.0, "reward": 0.88177490234375, "reward_std": 0.015364980325102806, "rewards//mean": 0.88177490234375, "rewards//std": 0.027341466397047043, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1252, "grad_norm": 0.6698442697525024, "kl": 0.04040778544731438, "learning_rate": 4.835369650662767e-06, "loss": 0.004, "num_tokens": 4083736.0, "reward": 0.8175048828125, "reward_std": 0.013933517970144749, "rewards//mean": 0.8175048828125, "rewards//std": 0.032722536474466324, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1254, "grad_norm": 0.6354655623435974, "kl": 0.03824352379888296, "learning_rate": 4.83480292221004e-06, "loss": 0.0038, "num_tokens": 4090264.0, "reward": 0.8193359375, "reward_std": 0.011118542402982712, "rewards//mean": 0.8193359375, "rewards//std": 0.0255344957113266, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1256, "grad_norm": 0.5788552165031433, "kl": 0.036963594146072865, "learning_rate": 4.834235253298973e-06, "loss": 0.0037, "num_tokens": 4096880.0, "reward": 0.8267822265625, "reward_std": 0.012452752329409122, "rewards//mean": 0.8267822265625, "rewards//std": 0.023012209683656693, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1258, "grad_norm": 0.6248599290847778, "kl": 0.04226681496948004, "learning_rate": 4.833666644158227e-06, "loss": 0.0042, "num_tokens": 4103368.0, "reward": 0.84954833984375, "reward_std": 0.008312016725540161, "rewards//mean": 0.84954833984375, "rewards//std": 0.021179145202040672, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.126, "grad_norm": 0.6156426072120667, "kl": 0.03705371776595712, "learning_rate": 4.833097095016835e-06, "loss": 0.0037, "num_tokens": 4109936.0, "reward": 0.83428955078125, "reward_std": 0.012196335941553116, "rewards//mean": 0.83428955078125, "rewards//std": 0.018437961116433144, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1262, "grad_norm": 0.6548982858657837, "kl": 0.038468639831990004, "learning_rate": 4.832526606104213e-06, "loss": 0.0038, "num_tokens": 4116472.0, "reward": 0.84368896484375, "reward_std": 0.013091469183564186, "rewards//mean": 0.84368896484375, "rewards//std": 0.029972108080983162, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1264, "grad_norm": 0.7338609099388123, "kl": 0.038367170840501785, "learning_rate": 4.831955177650153e-06, "loss": 0.0038, "num_tokens": 4122992.0, "reward": 0.820556640625, "reward_std": 0.01122078113257885, "rewards//mean": 0.820556640625, "rewards//std": 0.025542795658111572, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1266, "grad_norm": 0.7307354211807251, "kl": 0.04504147078841925, "learning_rate": 4.831382809884826e-06, "loss": 0.0045, "num_tokens": 4129504.0, "reward": 0.84442138671875, "reward_std": 0.01341638807207346, "rewards//mean": 0.84442138671875, "rewards//std": 0.03062807209789753, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1268, "grad_norm": 0.6589616537094116, "kl": 0.0420961850322783, "learning_rate": 4.830809503038781e-06, "loss": 0.0042, "num_tokens": 4136008.0, "reward": 0.84881591796875, "reward_std": 0.014670501463115215, "rewards//mean": 0.84881591796875, "rewards//std": 0.0334705226123333, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.127, "grad_norm": 0.5887529850006104, "kl": 0.036529581528156996, "learning_rate": 4.830235257342948e-06, "loss": 0.0037, "num_tokens": 4142480.0, "reward": 0.811279296875, "reward_std": 0.008088908158242702, "rewards//mean": 0.811279296875, "rewards//std": 0.022128576412796974, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1272, "grad_norm": 0.6737771034240723, "kl": 0.039779233280569315, "learning_rate": 4.829660073028631e-06, "loss": 0.004, "num_tokens": 4149000.0, "reward": 0.858642578125, "reward_std": 0.015279535204172134, "rewards//mean": 0.858642578125, "rewards//std": 0.03358030319213867, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1274, "grad_norm": 0.6876733303070068, "kl": 0.03895897837355733, "learning_rate": 4.829083950327516e-06, "loss": 0.0039, "num_tokens": 4155560.0, "reward": 0.82916259765625, "reward_std": 0.01598961651325226, "rewards//mean": 0.82916259765625, "rewards//std": 0.033176615834236145, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1276, "grad_norm": 0.6112537384033203, "kl": 0.03786856750957668, "learning_rate": 4.828506889471664e-06, "loss": 0.0038, "num_tokens": 4162040.0, "reward": 0.83795166015625, "reward_std": 0.00955367460846901, "rewards//mean": 0.83795166015625, "rewards//std": 0.0371297150850296, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1278, "grad_norm": 0.6563738584518433, "kl": 0.039538444718346, "learning_rate": 4.827928890693515e-06, "loss": 0.004, "num_tokens": 4168552.0, "reward": 0.8182373046875, "reward_std": 0.013406902551651001, "rewards//mean": 0.8182373046875, "rewards//std": 0.019978374242782593, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.128, "grad_norm": 0.6196538805961609, "kl": 0.04083075327798724, "learning_rate": 4.8273499542258885e-06, "loss": 0.0041, "num_tokens": 4175080.0, "reward": 0.82525634765625, "reward_std": 0.01369745098054409, "rewards//mean": 0.82525634765625, "rewards//std": 0.027859212830662727, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1282, "grad_norm": 0.6548612713813782, "kl": 0.04186346894130111, "learning_rate": 4.826770080301978e-06, "loss": 0.0042, "num_tokens": 4181592.0, "reward": 0.8125, "reward_std": 0.013666237704455853, "rewards//mean": 0.8125, "rewards//std": 0.02883964590728283, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1284, "grad_norm": 0.6164513826370239, "kl": 0.03509138128720224, "learning_rate": 4.826189269155357e-06, "loss": 0.0035, "num_tokens": 4188112.0, "reward": 0.81109619140625, "reward_std": 0.011790347285568714, "rewards//mean": 0.81109619140625, "rewards//std": 0.027990398928523064, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1286, "grad_norm": 0.5807366967201233, "kl": 0.03635508916340768, "learning_rate": 4.825607521019978e-06, "loss": 0.0036, "num_tokens": 4194632.0, "reward": 0.857666015625, "reward_std": 0.010109765455126762, "rewards//mean": 0.857666015625, "rewards//std": 0.021864313632249832, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1288, "grad_norm": 0.6619797945022583, "kl": 0.036573923425748944, "learning_rate": 4.825024836130166e-06, "loss": 0.0037, "num_tokens": 4201152.0, "reward": 0.88934326171875, "reward_std": 0.011922219768166542, "rewards//mean": 0.88934326171875, "rewards//std": 0.023908693343400955, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.129, "grad_norm": 0.7236711382865906, "kl": 0.031774683156982064, "learning_rate": 4.824441214720629e-06, "loss": 0.0032, "num_tokens": 4207632.0, "reward": 0.88836669921875, "reward_std": 0.010845713317394257, "rewards//mean": 0.88836669921875, "rewards//std": 0.02806331403553486, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1292, "grad_norm": 0.617940366268158, "kl": 0.035609227837994695, "learning_rate": 4.823856657026448e-06, "loss": 0.0036, "num_tokens": 4214288.0, "reward": 0.84954833984375, "reward_std": 0.011059116572141647, "rewards//mean": 0.84954833984375, "rewards//std": 0.019783737137913704, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1294, "grad_norm": 0.6680132150650024, "kl": 0.03969927388243377, "learning_rate": 4.823271163283084e-06, "loss": 0.004, "num_tokens": 4220816.0, "reward": 0.82989501953125, "reward_std": 0.011213782243430614, "rewards//mean": 0.82989501953125, "rewards//std": 0.02739180251955986, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1296, "grad_norm": 0.657684326171875, "kl": 0.03403781563974917, "learning_rate": 4.822684733726373e-06, "loss": 0.0034, "num_tokens": 4227408.0, "reward": 0.79302978515625, "reward_std": 0.01171116717159748, "rewards//mean": 0.79302978515625, "rewards//std": 0.02404443360865116, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1298, "grad_norm": 0.6134021282196045, "kl": 0.03312623081728816, "learning_rate": 4.822097368592529e-06, "loss": 0.0033, "num_tokens": 4233896.0, "reward": 0.87103271484375, "reward_std": 0.015468908473849297, "rewards//mean": 0.87103271484375, "rewards//std": 0.031067268922924995, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.13, "grad_norm": 0.6155107617378235, "kl": 0.03560430929064751, "learning_rate": 4.821509068118143e-06, "loss": 0.0036, "num_tokens": 4240480.0, "reward": 0.8189697265625, "reward_std": 0.010782364755868912, "rewards//mean": 0.8189697265625, "rewards//std": 0.027780653908848763, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1302, "grad_norm": 0.6770623922348022, "kl": 0.034555776976048946, "learning_rate": 4.8209198325401815e-06, "loss": 0.0035, "num_tokens": 4246936.0, "reward": 0.80694580078125, "reward_std": 0.00983287114650011, "rewards//mean": 0.80694580078125, "rewards//std": 0.015606461092829704, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1304, "grad_norm": 0.674925684928894, "kl": 0.03344253194518387, "learning_rate": 4.82032966209599e-06, "loss": 0.0033, "num_tokens": 4253448.0, "reward": 0.85491943359375, "reward_std": 0.011441394686698914, "rewards//mean": 0.85491943359375, "rewards//std": 0.0302005335688591, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1306, "grad_norm": 0.623838484287262, "kl": 0.04128697048872709, "learning_rate": 4.819738557023287e-06, "loss": 0.0041, "num_tokens": 4260000.0, "reward": 0.873779296875, "reward_std": 0.012213251553475857, "rewards//mean": 0.873779296875, "rewards//std": 0.02301911450922489, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1308, "grad_norm": 0.6719110608100891, "kl": 0.03923142282292247, "learning_rate": 4.819146517560171e-06, "loss": 0.0039, "num_tokens": 4266544.0, "reward": 0.84075927734375, "reward_std": 0.011564802378416061, "rewards//mean": 0.84075927734375, "rewards//std": 0.021539175882935524, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.131, "grad_norm": 0.6305099129676819, "kl": 0.03507873183116317, "learning_rate": 4.818553543945115e-06, "loss": 0.0035, "num_tokens": 4273048.0, "reward": 0.85064697265625, "reward_std": 0.01198691874742508, "rewards//mean": 0.85064697265625, "rewards//std": 0.02057294361293316, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1312, "grad_norm": 0.6202968955039978, "kl": 0.032768177799880505, "learning_rate": 4.817959636416969e-06, "loss": 0.0033, "num_tokens": 4279552.0, "reward": 0.857177734375, "reward_std": 0.017117980867624283, "rewards//mean": 0.857177734375, "rewards//std": 0.03547430410981178, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1314, "grad_norm": 0.6903461217880249, "kl": 0.04518736433237791, "learning_rate": 4.8173647952149584e-06, "loss": 0.0045, "num_tokens": 4286096.0, "reward": 0.775634765625, "reward_std": 0.009762182831764221, "rewards//mean": 0.775634765625, "rewards//std": 0.023197297006845474, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1316, "grad_norm": 0.6239075064659119, "kl": 0.03696441138163209, "learning_rate": 4.816769020578685e-06, "loss": 0.0037, "num_tokens": 4292704.0, "reward": 0.84454345703125, "reward_std": 0.010429391637444496, "rewards//mean": 0.84454345703125, "rewards//std": 0.03256458044052124, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1318, "grad_norm": 0.6943949460983276, "kl": 0.03952537663280964, "learning_rate": 4.816172312748128e-06, "loss": 0.004, "num_tokens": 4299208.0, "reward": 0.85125732421875, "reward_std": 0.00947652943432331, "rewards//mean": 0.85125732421875, "rewards//std": 0.01891135238111019, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.132, "grad_norm": 0.6111933588981628, "kl": 0.03215678269043565, "learning_rate": 4.81557467196364e-06, "loss": 0.0032, "num_tokens": 4305728.0, "reward": 0.841064453125, "reward_std": 0.010960794985294342, "rewards//mean": 0.841064453125, "rewards//std": 0.014530673623085022, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1322, "grad_norm": 0.6737866401672363, "kl": 0.036603975808247924, "learning_rate": 4.814976098465951e-06, "loss": 0.0037, "num_tokens": 4312296.0, "reward": 0.81103515625, "reward_std": 0.008650883100926876, "rewards//mean": 0.81103515625, "rewards//std": 0.015570652671158314, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1324, "grad_norm": 0.6496907472610474, "kl": 0.040806922828778625, "learning_rate": 4.814376592496167e-06, "loss": 0.0041, "num_tokens": 4318760.0, "reward": 0.8494873046875, "reward_std": 0.01330764964222908, "rewards//mean": 0.8494873046875, "rewards//std": 0.026180516928434372, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1326, "grad_norm": 0.6427296996116638, "kl": 0.034578942228108644, "learning_rate": 4.813776154295767e-06, "loss": 0.0035, "num_tokens": 4325256.0, "reward": 0.850341796875, "reward_std": 0.011855723336338997, "rewards//mean": 0.850341796875, "rewards//std": 0.028415564447641373, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1328, "grad_norm": 0.6374642252922058, "kl": 0.03211224218830466, "learning_rate": 4.81317478410661e-06, "loss": 0.0032, "num_tokens": 4331768.0, "reward": 0.80657958984375, "reward_std": 0.0074308766052126884, "rewards//mean": 0.80657958984375, "rewards//std": 0.012738611549139023, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.133, "grad_norm": 0.7321417927742004, "kl": 0.04336266568861902, "learning_rate": 4.812572482170926e-06, "loss": 0.0043, "num_tokens": 4338224.0, "reward": 0.84429931640625, "reward_std": 0.01558034960180521, "rewards//mean": 0.84429931640625, "rewards//std": 0.02651967667043209, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1332, "grad_norm": 0.6395056843757629, "kl": 0.03777423477731645, "learning_rate": 4.811969248731323e-06, "loss": 0.0038, "num_tokens": 4344768.0, "reward": 0.81304931640625, "reward_std": 0.01339129637926817, "rewards//mean": 0.81304931640625, "rewards//std": 0.02242206782102585, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1334, "grad_norm": 0.6424086689949036, "kl": 0.036166464909911156, "learning_rate": 4.811365084030784e-06, "loss": 0.0036, "num_tokens": 4351168.0, "reward": 0.84857177734375, "reward_std": 0.011064697988331318, "rewards//mean": 0.84857177734375, "rewards//std": 0.02043117955327034, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1336, "grad_norm": 0.654913604259491, "kl": 0.03382642939686775, "learning_rate": 4.8107599883126634e-06, "loss": 0.0034, "num_tokens": 4357736.0, "reward": 0.81500244140625, "reward_std": 0.0177701935172081, "rewards//mean": 0.81500244140625, "rewards//std": 0.03539564833045006, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1338, "grad_norm": 0.6352917551994324, "kl": 0.0319740588311106, "learning_rate": 4.810153961820697e-06, "loss": 0.0032, "num_tokens": 4364320.0, "reward": 0.824951171875, "reward_std": 0.012290971353650093, "rewards//mean": 0.824951171875, "rewards//std": 0.023291075602173805, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.134, "grad_norm": 0.6589383482933044, "kl": 0.032449120189994574, "learning_rate": 4.809547004798991e-06, "loss": 0.0032, "num_tokens": 4370872.0, "reward": 0.8184814453125, "reward_std": 0.013435855507850647, "rewards//mean": 0.8184814453125, "rewards//std": 0.018970992416143417, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1342, "grad_norm": 0.6411390900611877, "kl": 0.03467783285304904, "learning_rate": 4.808939117492028e-06, "loss": 0.0035, "num_tokens": 4377256.0, "reward": 0.8438720703125, "reward_std": 0.009257117286324501, "rewards//mean": 0.8438720703125, "rewards//std": 0.031696103513240814, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1344, "grad_norm": 0.666558027267456, "kl": 0.037281798431649804, "learning_rate": 4.808330300144664e-06, "loss": 0.0037, "num_tokens": 4383744.0, "reward": 0.82806396484375, "reward_std": 0.011676324531435966, "rewards//mean": 0.82806396484375, "rewards//std": 0.01860387995839119, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1346, "grad_norm": 0.7036078572273254, "kl": 0.041853292379528284, "learning_rate": 4.807720553002132e-06, "loss": 0.0042, "num_tokens": 4390200.0, "reward": 0.83892822265625, "reward_std": 0.009568301029503345, "rewards//mean": 0.83892822265625, "rewards//std": 0.03169437125325203, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1348, "grad_norm": 0.6020171046257019, "kl": 0.03041183133609593, "learning_rate": 4.807109876310037e-06, "loss": 0.003, "num_tokens": 4396848.0, "reward": 0.853759765625, "reward_std": 0.011211195029318333, "rewards//mean": 0.853759765625, "rewards//std": 0.02409859374165535, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.135, "grad_norm": 0.6017598509788513, "kl": 0.03304919390939176, "learning_rate": 4.806498270314359e-06, "loss": 0.0033, "num_tokens": 4403320.0, "reward": 0.83056640625, "reward_std": 0.013944298960268497, "rewards//mean": 0.83056640625, "rewards//std": 0.03543758764863014, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1352, "grad_norm": 0.6396259665489197, "kl": 0.035610402934253216, "learning_rate": 4.805885735261454e-06, "loss": 0.0036, "num_tokens": 4409832.0, "reward": 0.8157958984375, "reward_std": 0.014298718422651291, "rewards//mean": 0.8157958984375, "rewards//std": 0.025393905118107796, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1354, "grad_norm": 0.6395999789237976, "kl": 0.034702015575021505, "learning_rate": 4.805272271398051e-06, "loss": 0.0035, "num_tokens": 4416296.0, "reward": 0.8299560546875, "reward_std": 0.010690449737012386, "rewards//mean": 0.8299560546875, "rewards//std": 0.019512252882122993, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1356, "grad_norm": 0.6502525210380554, "kl": 0.03755046101287007, "learning_rate": 4.804657878971252e-06, "loss": 0.0038, "num_tokens": 4422784.0, "reward": 0.84521484375, "reward_std": 0.010795504786074162, "rewards//mean": 0.84521484375, "rewards//std": 0.022973036393523216, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1358, "grad_norm": 0.6069939136505127, "kl": 0.03229914209805429, "learning_rate": 4.804042558228535e-06, "loss": 0.0032, "num_tokens": 4429288.0, "reward": 0.86700439453125, "reward_std": 0.013926582410931587, "rewards//mean": 0.86700439453125, "rewards//std": 0.027930304408073425, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.136, "grad_norm": 1.218776822090149, "kl": 0.037880075396969914, "learning_rate": 4.803426309417752e-06, "loss": 0.0038, "num_tokens": 4435824.0, "reward": 0.86199951171875, "reward_std": 0.010651204735040665, "rewards//mean": 0.86199951171875, "rewards//std": 0.022628381848335266, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1362, "grad_norm": 0.6532304883003235, "kl": 0.035220161313191056, "learning_rate": 4.802809132787125e-06, "loss": 0.0035, "num_tokens": 4442240.0, "reward": 0.8717041015625, "reward_std": 0.012849440798163414, "rewards//mean": 0.8717041015625, "rewards//std": 0.02241506241261959, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1364, "grad_norm": 0.7135369181632996, "kl": 0.03639285918325186, "learning_rate": 4.802191028585257e-06, "loss": 0.0036, "num_tokens": 4448760.0, "reward": 0.83935546875, "reward_std": 0.009929399937391281, "rewards//mean": 0.83935546875, "rewards//std": 0.025614995509386063, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1366, "grad_norm": 0.7048020362854004, "kl": 0.04157875198870897, "learning_rate": 4.801571997061117e-06, "loss": 0.0042, "num_tokens": 4455248.0, "reward": 0.77557373046875, "reward_std": 0.01009085588157177, "rewards//mean": 0.77557373046875, "rewards//std": 0.016904599964618683, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1368, "grad_norm": 0.6199202537536621, "kl": 0.035192727111279964, "learning_rate": 4.800952038464051e-06, "loss": 0.0035, "num_tokens": 4461736.0, "reward": 0.80517578125, "reward_std": 0.00936543196439743, "rewards//mean": 0.80517578125, "rewards//std": 0.0161209125071764, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.137, "grad_norm": 0.5773229002952576, "kl": 0.030090483836829662, "learning_rate": 4.800331153043781e-06, "loss": 0.003, "num_tokens": 4468272.0, "reward": 0.850341796875, "reward_std": 0.009957228787243366, "rewards//mean": 0.850341796875, "rewards//std": 0.024318700656294823, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1372, "grad_norm": 0.5711527466773987, "kl": 0.034664512844756246, "learning_rate": 4.799709341050397e-06, "loss": 0.0035, "num_tokens": 4474696.0, "reward": 0.8583984375, "reward_std": 0.012437745928764343, "rewards//mean": 0.8583984375, "rewards//std": 0.02202022820711136, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1374, "grad_norm": 0.6654622554779053, "kl": 0.03352417494170368, "learning_rate": 4.799086602734364e-06, "loss": 0.0034, "num_tokens": 4481176.0, "reward": 0.869140625, "reward_std": 0.013702813535928726, "rewards//mean": 0.869140625, "rewards//std": 0.031673409044742584, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1376, "grad_norm": 0.7128596305847168, "kl": 0.038529411889612675, "learning_rate": 4.798462938346524e-06, "loss": 0.0039, "num_tokens": 4487768.0, "reward": 0.8201904296875, "reward_std": 0.015007439069449902, "rewards//mean": 0.8201904296875, "rewards//std": 0.021631482988595963, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1378, "grad_norm": 0.6258558034896851, "kl": 0.038319848012179136, "learning_rate": 4.7978383481380865e-06, "loss": 0.0038, "num_tokens": 4494224.0, "reward": 0.833984375, "reward_std": 0.018096666783094406, "rewards//mean": 0.833984375, "rewards//std": 0.0328446701169014, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.138, "grad_norm": 0.7074935436248779, "kl": 0.033116700826212764, "learning_rate": 4.797212832360637e-06, "loss": 0.0033, "num_tokens": 4500672.0, "reward": 0.86041259765625, "reward_std": 0.014863163232803345, "rewards//mean": 0.86041259765625, "rewards//std": 0.03233366087079048, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1382, "grad_norm": 0.6275817155838013, "kl": 0.036013098899275064, "learning_rate": 4.796586391266135e-06, "loss": 0.0036, "num_tokens": 4507152.0, "reward": 0.794921875, "reward_std": 0.009963938035070896, "rewards//mean": 0.794921875, "rewards//std": 0.03113352507352829, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1384, "grad_norm": 0.6234140992164612, "kl": 0.03075044380966574, "learning_rate": 4.795959025106907e-06, "loss": 0.0031, "num_tokens": 4513600.0, "reward": 0.80718994140625, "reward_std": 0.012309936806559563, "rewards//mean": 0.80718994140625, "rewards//std": 0.03157665953040123, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1386, "grad_norm": 0.6930115222930908, "kl": 0.03649276262149215, "learning_rate": 4.7953307341356595e-06, "loss": 0.0036, "num_tokens": 4520136.0, "reward": 0.83502197265625, "reward_std": 0.011502781882882118, "rewards//mean": 0.83502197265625, "rewards//std": 0.02733703702688217, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1388, "grad_norm": 0.7128375172615051, "kl": 0.03017679904587567, "learning_rate": 4.794701518605467e-06, "loss": 0.003, "num_tokens": 4526608.0, "reward": 0.83087158203125, "reward_std": 0.014898988418281078, "rewards//mean": 0.83087158203125, "rewards//std": 0.0271587036550045, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.139, "grad_norm": 0.6921178698539734, "kl": 0.03340840828604996, "learning_rate": 4.794071378769776e-06, "loss": 0.0033, "num_tokens": 4533112.0, "reward": 0.83184814453125, "reward_std": 0.009263399988412857, "rewards//mean": 0.83184814453125, "rewards//std": 0.02486584521830082, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1392, "grad_norm": 0.6168423295021057, "kl": 0.03176119248382747, "learning_rate": 4.7934403148824085e-06, "loss": 0.0032, "num_tokens": 4539592.0, "reward": 0.83331298828125, "reward_std": 0.011103109456598759, "rewards//mean": 0.83331298828125, "rewards//std": 0.028772840276360512, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1394, "grad_norm": 0.6904219388961792, "kl": 0.03471069340594113, "learning_rate": 4.792808327197556e-06, "loss": 0.0035, "num_tokens": 4546088.0, "reward": 0.855712890625, "reward_std": 0.013138792477548122, "rewards//mean": 0.855712890625, "rewards//std": 0.027080070227384567, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1396, "grad_norm": 0.6291631460189819, "kl": 0.035946730989962816, "learning_rate": 4.792175415969786e-06, "loss": 0.0036, "num_tokens": 4552576.0, "reward": 0.8463134765625, "reward_std": 0.01710490882396698, "rewards//mean": 0.8463134765625, "rewards//std": 0.03874010592699051, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1398, "grad_norm": 0.7094471454620361, "kl": 0.03311455622315407, "learning_rate": 4.79154158145403e-06, "loss": 0.0033, "num_tokens": 4559080.0, "reward": 0.83734130859375, "reward_std": 0.012111399322748184, "rewards//mean": 0.83734130859375, "rewards//std": 0.02404317446053028, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.14, "grad_norm": 0.5819594860076904, "kl": 0.030330684036016464, "learning_rate": 4.790906823905599e-06, "loss": 0.003, "num_tokens": 4565544.0, "reward": 0.8709716796875, "reward_std": 0.007809075061231852, "rewards//mean": 0.8709716796875, "rewards//std": 0.017934059724211693, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1402, "grad_norm": 0.6491738557815552, "kl": 0.03869724064134061, "learning_rate": 4.790271143580174e-06, "loss": 0.0039, "num_tokens": 4572000.0, "reward": 0.88092041015625, "reward_std": 0.008929949253797531, "rewards//mean": 0.88092041015625, "rewards//std": 0.018714213743805885, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1404, "grad_norm": 0.6010414958000183, "kl": 0.030946203973144293, "learning_rate": 4.789634540733807e-06, "loss": 0.0031, "num_tokens": 4578432.0, "reward": 0.8525390625, "reward_std": 0.01049017533659935, "rewards//mean": 0.8525390625, "rewards//std": 0.024150049313902855, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1406, "grad_norm": 0.6383006572723389, "kl": 0.03664790280163288, "learning_rate": 4.78899701562292e-06, "loss": 0.0037, "num_tokens": 4584928.0, "reward": 0.88543701171875, "reward_std": 0.0163101963698864, "rewards//mean": 0.88543701171875, "rewards//std": 0.04444585740566254, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1408, "grad_norm": 0.6771392226219177, "kl": 0.027444692328572273, "learning_rate": 4.788358568504308e-06, "loss": 0.0027, "num_tokens": 4591416.0, "reward": 0.81182861328125, "reward_std": 0.011187758296728134, "rewards//mean": 0.81182861328125, "rewards//std": 0.029450898990035057, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.141, "grad_norm": 0.6804143190383911, "kl": 0.030982508091256022, "learning_rate": 4.78771919963514e-06, "loss": 0.0031, "num_tokens": 4597880.0, "reward": 0.86578369140625, "reward_std": 0.016307076439261436, "rewards//mean": 0.86578369140625, "rewards//std": 0.03342481330037117, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1412, "grad_norm": 0.6565612554550171, "kl": 0.032242163084447384, "learning_rate": 4.787078909272951e-06, "loss": 0.0032, "num_tokens": 4604512.0, "reward": 0.85791015625, "reward_std": 0.01364915445446968, "rewards//mean": 0.85791015625, "rewards//std": 0.033476460725069046, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1414, "grad_norm": 0.6641219854354858, "kl": 0.032242062501609325, "learning_rate": 4.786437697675651e-06, "loss": 0.0032, "num_tokens": 4611120.0, "reward": 0.82781982421875, "reward_std": 0.009526567533612251, "rewards//mean": 0.82781982421875, "rewards//std": 0.02961287833750248, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1416, "grad_norm": 0.6178065538406372, "kl": 0.033733992371708155, "learning_rate": 4.78579556510152e-06, "loss": 0.0034, "num_tokens": 4617624.0, "reward": 0.8480224609375, "reward_std": 0.009033482521772385, "rewards//mean": 0.8480224609375, "rewards//std": 0.030951879918575287, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1418, "grad_norm": 0.6321604251861572, "kl": 0.034345758613198996, "learning_rate": 4.785152511809208e-06, "loss": 0.0034, "num_tokens": 4624160.0, "reward": 0.823486328125, "reward_std": 0.010395782068371773, "rewards//mean": 0.823486328125, "rewards//std": 0.02851766347885132, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.142, "grad_norm": 0.6739100813865662, "kl": 0.038603739347308874, "learning_rate": 4.784508538057738e-06, "loss": 0.0039, "num_tokens": 4630568.0, "reward": 0.8701171875, "reward_std": 0.012175248935818672, "rewards//mean": 0.8701171875, "rewards//std": 0.026666609570384026, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1422, "grad_norm": 0.6938429474830627, "kl": 0.0384972074534744, "learning_rate": 4.783863644106502e-06, "loss": 0.0038, "num_tokens": 4637120.0, "reward": 0.8455810546875, "reward_std": 0.020297762006521225, "rewards//mean": 0.8455810546875, "rewards//std": 0.03314540535211563, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1424, "grad_norm": 0.6242883205413818, "kl": 0.034390477230772376, "learning_rate": 4.783217830215264e-06, "loss": 0.0034, "num_tokens": 4643592.0, "reward": 0.82000732421875, "reward_std": 0.009873378090560436, "rewards//mean": 0.82000732421875, "rewards//std": 0.017734911292791367, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1426, "grad_norm": 0.635278046131134, "kl": 0.03100576065480709, "learning_rate": 4.782571096644157e-06, "loss": 0.0031, "num_tokens": 4650128.0, "reward": 0.85980224609375, "reward_std": 0.012011419981718063, "rewards//mean": 0.85980224609375, "rewards//std": 0.027970923110842705, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1428, "grad_norm": 0.6182490587234497, "kl": 0.03299332037568092, "learning_rate": 4.7819234436536845e-06, "loss": 0.0033, "num_tokens": 4656776.0, "reward": 0.84429931640625, "reward_std": 0.016069753095507622, "rewards//mean": 0.84429931640625, "rewards//std": 0.026299571618437767, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.143, "grad_norm": 0.7233075499534607, "kl": 0.03955763205885887, "learning_rate": 4.781274871504722e-06, "loss": 0.004, "num_tokens": 4663352.0, "reward": 0.86322021484375, "reward_std": 0.009645191952586174, "rewards//mean": 0.86322021484375, "rewards//std": 0.01891375333070755, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1432, "grad_norm": 0.6025941967964172, "kl": 0.03357991902157664, "learning_rate": 4.780625380458513e-06, "loss": 0.0034, "num_tokens": 4669752.0, "reward": 0.83551025390625, "reward_std": 0.009832639247179031, "rewards//mean": 0.83551025390625, "rewards//std": 0.022564740851521492, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1434, "grad_norm": 0.6996767520904541, "kl": 0.03547016205266118, "learning_rate": 4.7799749707766754e-06, "loss": 0.0035, "num_tokens": 4676280.0, "reward": 0.8021240234375, "reward_std": 0.013020787388086319, "rewards//mean": 0.8021240234375, "rewards//std": 0.03197190538048744, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1436, "grad_norm": 0.7296597361564636, "kl": 0.039803872583433986, "learning_rate": 4.779323642721191e-06, "loss": 0.004, "num_tokens": 4682800.0, "reward": 0.8380126953125, "reward_std": 0.014720786362886429, "rewards//mean": 0.8380126953125, "rewards//std": 0.03200786933302879, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1438, "grad_norm": 0.5901254415512085, "kl": 0.04078292986378074, "learning_rate": 4.778671396554417e-06, "loss": 0.0041, "num_tokens": 4689280.0, "reward": 0.83428955078125, "reward_std": 0.012603724375367165, "rewards//mean": 0.83428955078125, "rewards//std": 0.04113903269171715, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.144, "grad_norm": 0.6492975950241089, "kl": 0.04299718514084816, "learning_rate": 4.778018232539075e-06, "loss": 0.0043, "num_tokens": 4695808.0, "reward": 0.875244140625, "reward_std": 0.01032957248389721, "rewards//mean": 0.875244140625, "rewards//std": 0.022583600133657455, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1442, "grad_norm": 0.6899735927581787, "kl": 0.037773449905216694, "learning_rate": 4.777364150938263e-06, "loss": 0.0038, "num_tokens": 4702496.0, "reward": 0.81561279296875, "reward_std": 0.01541346125304699, "rewards//mean": 0.81561279296875, "rewards//std": 0.03828706964850426, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1444, "grad_norm": 0.6682742238044739, "kl": 0.03817844996228814, "learning_rate": 4.776709152015443e-06, "loss": 0.0038, "num_tokens": 4709128.0, "reward": 0.833251953125, "reward_std": 0.013756323605775833, "rewards//mean": 0.833251953125, "rewards//std": 0.027231693267822266, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1446, "grad_norm": 0.6066592931747437, "kl": 0.03351732715964317, "learning_rate": 4.776053236034449e-06, "loss": 0.0034, "num_tokens": 4715600.0, "reward": 0.83123779296875, "reward_std": 0.01098795235157013, "rewards//mean": 0.83123779296875, "rewards//std": 0.022230185568332672, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1448, "grad_norm": 0.5982468128204346, "kl": 0.031167512759566307, "learning_rate": 4.775396403259483e-06, "loss": 0.0031, "num_tokens": 4722040.0, "reward": 0.86956787109375, "reward_std": 0.012180990539491177, "rewards//mean": 0.86956787109375, "rewards//std": 0.022354451939463615, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.145, "grad_norm": 0.743056058883667, "kl": 0.041748383082449436, "learning_rate": 4.774738653955119e-06, "loss": 0.0042, "num_tokens": 4728568.0, "reward": 0.86041259765625, "reward_std": 0.020265260711312294, "rewards//mean": 0.86041259765625, "rewards//std": 0.036869507282972336, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1452, "grad_norm": 0.5705842971801758, "kl": 0.025937842903658748, "learning_rate": 4.7740799883862966e-06, "loss": 0.0026, "num_tokens": 4735136.0, "reward": 0.814453125, "reward_std": 0.008926140144467354, "rewards//mean": 0.814453125, "rewards//std": 0.018082424998283386, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1454, "grad_norm": 0.6330570578575134, "kl": 0.03641601069830358, "learning_rate": 4.773420406818327e-06, "loss": 0.0036, "num_tokens": 4741624.0, "reward": 0.85540771484375, "reward_std": 0.009088218212127686, "rewards//mean": 0.85540771484375, "rewards//std": 0.0307695921510458, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1456, "grad_norm": 0.7037932276725769, "kl": 0.03219994530081749, "learning_rate": 4.772759909516889e-06, "loss": 0.0032, "num_tokens": 4748176.0, "reward": 0.86865234375, "reward_std": 0.01569826528429985, "rewards//mean": 0.86865234375, "rewards//std": 0.026277761906385422, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1458, "grad_norm": 0.6403439044952393, "kl": 0.03100749058648944, "learning_rate": 4.772098496748031e-06, "loss": 0.0031, "num_tokens": 4754688.0, "reward": 0.82391357421875, "reward_std": 0.013501830399036407, "rewards//mean": 0.82391357421875, "rewards//std": 0.03286166116595268, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.146, "grad_norm": 0.7018864750862122, "kl": 0.03662884212099016, "learning_rate": 4.7714361687781705e-06, "loss": 0.0037, "num_tokens": 4761136.0, "reward": 0.8272705078125, "reward_std": 0.011517598293721676, "rewards//mean": 0.8272705078125, "rewards//std": 0.025793731212615967, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1462, "grad_norm": 0.7192935347557068, "kl": 0.03436392149887979, "learning_rate": 4.770772925874093e-06, "loss": 0.0034, "num_tokens": 4767784.0, "reward": 0.814208984375, "reward_std": 0.012513557448983192, "rewards//mean": 0.814208984375, "rewards//std": 0.031202493235468864, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1464, "grad_norm": 0.6513822078704834, "kl": 0.03598013357259333, "learning_rate": 4.770108768302953e-06, "loss": 0.0036, "num_tokens": 4774360.0, "reward": 0.8048095703125, "reward_std": 0.011015706695616245, "rewards//mean": 0.8048095703125, "rewards//std": 0.020267244428396225, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1466, "grad_norm": 0.6309319138526917, "kl": 0.03459942061454058, "learning_rate": 4.769443696332272e-06, "loss": 0.0035, "num_tokens": 4780872.0, "reward": 0.85797119140625, "reward_std": 0.013075457885861397, "rewards//mean": 0.85797119140625, "rewards//std": 0.03282524645328522, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1468, "grad_norm": 0.6636326313018799, "kl": 0.04208630672656, "learning_rate": 4.768777710229941e-06, "loss": 0.0042, "num_tokens": 4787488.0, "reward": 0.84857177734375, "reward_std": 0.011825092136859894, "rewards//mean": 0.84857177734375, "rewards//std": 0.03186964988708496, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.147, "grad_norm": 0.6215096712112427, "kl": 0.03159097209572792, "learning_rate": 4.768110810264221e-06, "loss": 0.0032, "num_tokens": 4794088.0, "reward": 0.86846923828125, "reward_std": 0.0126973120495677, "rewards//mean": 0.86846923828125, "rewards//std": 0.03158193454146385, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1472, "grad_norm": 0.8456329703330994, "kl": 0.03009815188124776, "learning_rate": 4.767442996703737e-06, "loss": 0.003, "num_tokens": 4800736.0, "reward": 0.8399658203125, "reward_std": 0.011654919944703579, "rewards//mean": 0.8399658203125, "rewards//std": 0.01840067096054554, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1474, "grad_norm": 0.6510376334190369, "kl": 0.03821468539535999, "learning_rate": 4.7667742698174855e-06, "loss": 0.0038, "num_tokens": 4807216.0, "reward": 0.84649658203125, "reward_std": 0.01183587871491909, "rewards//mean": 0.84649658203125, "rewards//std": 0.022038694471120834, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1476, "grad_norm": 0.7117304801940918, "kl": 0.03541584825143218, "learning_rate": 4.766104629874829e-06, "loss": 0.0035, "num_tokens": 4813776.0, "reward": 0.836669921875, "reward_std": 0.014451714232563972, "rewards//mean": 0.836669921875, "rewards//std": 0.03537174314260483, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1478, "grad_norm": 0.6581489443778992, "kl": 0.03779011429287493, "learning_rate": 4.765434077145499e-06, "loss": 0.0038, "num_tokens": 4820216.0, "reward": 0.79486083984375, "reward_std": 0.009967784397304058, "rewards//mean": 0.79486083984375, "rewards//std": 0.02294658124446869, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.148, "grad_norm": 0.6047603487968445, "kl": 0.03846903960220516, "learning_rate": 4.764762611899593e-06, "loss": 0.0038, "num_tokens": 4826760.0, "reward": 0.8321533203125, "reward_std": 0.014414850622415543, "rewards//mean": 0.8321533203125, "rewards//std": 0.033378418534994125, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1482, "grad_norm": 0.636673092842102, "kl": 0.03530361200682819, "learning_rate": 4.764090234407578e-06, "loss": 0.0035, "num_tokens": 4833336.0, "reward": 0.80633544921875, "reward_std": 0.011949518695473671, "rewards//mean": 0.80633544921875, "rewards//std": 0.033745281398296356, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1484, "grad_norm": 0.6843360066413879, "kl": 0.03777982760220766, "learning_rate": 4.763416944940287e-06, "loss": 0.0038, "num_tokens": 4839888.0, "reward": 0.8756103515625, "reward_std": 0.010489646345376968, "rewards//mean": 0.8756103515625, "rewards//std": 0.052274663001298904, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1486, "grad_norm": 0.6168850064277649, "kl": 0.03202090901322663, "learning_rate": 4.762742743768921e-06, "loss": 0.0032, "num_tokens": 4846600.0, "reward": 0.852294921875, "reward_std": 0.015699390321969986, "rewards//mean": 0.852294921875, "rewards//std": 0.024178866297006607, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1488, "grad_norm": 0.6348932981491089, "kl": 0.03048681584186852, "learning_rate": 4.762067631165049e-06, "loss": 0.003, "num_tokens": 4853128.0, "reward": 0.8388671875, "reward_std": 0.012227789498865604, "rewards//mean": 0.8388671875, "rewards//std": 0.026106689125299454, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.149, "grad_norm": 0.7314891815185547, "kl": 0.03466706443578005, "learning_rate": 4.761391607400606e-06, "loss": 0.0035, "num_tokens": 4859608.0, "reward": 0.79595947265625, "reward_std": 0.01198391243815422, "rewards//mean": 0.79595947265625, "rewards//std": 0.018891330808401108, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1492, "grad_norm": 0.633370041847229, "kl": 0.03609757055528462, "learning_rate": 4.7607146727478935e-06, "loss": 0.0036, "num_tokens": 4866080.0, "reward": 0.84503173828125, "reward_std": 0.008432768285274506, "rewards//mean": 0.84503173828125, "rewards//std": 0.011395055800676346, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1494, "grad_norm": 0.7546378374099731, "kl": 0.03983141761273146, "learning_rate": 4.760036827479582e-06, "loss": 0.004, "num_tokens": 4872552.0, "reward": 0.87335205078125, "reward_std": 0.010378586128354073, "rewards//mean": 0.87335205078125, "rewards//std": 0.023331904783844948, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1496, "grad_norm": 0.7977150678634644, "kl": 0.03384686843492091, "learning_rate": 4.759358071868705e-06, "loss": 0.0034, "num_tokens": 4879048.0, "reward": 0.848388671875, "reward_std": 0.012764126062393188, "rewards//mean": 0.848388671875, "rewards//std": 0.032263416796922684, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1498, "grad_norm": 0.7911922335624695, "kl": 0.03975024959072471, "learning_rate": 4.758678406188668e-06, "loss": 0.004, "num_tokens": 4885656.0, "reward": 0.87164306640625, "reward_std": 0.01325987372547388, "rewards//mean": 0.87164306640625, "rewards//std": 0.02184206061065197, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.15, "grad_norm": 0.5640290975570679, "kl": 0.03782695718109608, "learning_rate": 4.757997830713239e-06, "loss": 0.0038, "num_tokens": 4892056.0, "reward": 0.850830078125, "reward_std": 0.011497659608721733, "rewards//mean": 0.850830078125, "rewards//std": 0.019806774333119392, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1502, "grad_norm": 0.6675816774368286, "kl": 0.038821807596832514, "learning_rate": 4.757316345716554e-06, "loss": 0.0039, "num_tokens": 4898448.0, "reward": 0.8446044921875, "reward_std": 0.015330525115132332, "rewards//mean": 0.8446044921875, "rewards//std": 0.037256550043821335, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1504, "grad_norm": 0.7459909319877625, "kl": 0.03718794463202357, "learning_rate": 4.756633951473114e-06, "loss": 0.0037, "num_tokens": 4905000.0, "reward": 0.8050537109375, "reward_std": 0.011927587911486626, "rewards//mean": 0.8050537109375, "rewards//std": 0.021973086521029472, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1506, "grad_norm": 0.6190351247787476, "kl": 0.03919585421681404, "learning_rate": 4.755950648257789e-06, "loss": 0.0039, "num_tokens": 4911648.0, "reward": 0.8504638671875, "reward_std": 0.013286629691720009, "rewards//mean": 0.8504638671875, "rewards//std": 0.02287762239575386, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1508, "grad_norm": 0.6951397657394409, "kl": 0.03513274388387799, "learning_rate": 4.755266436345812e-06, "loss": 0.0035, "num_tokens": 4918208.0, "reward": 0.76495361328125, "reward_std": 0.007555033545941114, "rewards//mean": 0.76495361328125, "rewards//std": 0.017532316967844963, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.151, "grad_norm": 0.6622002124786377, "kl": 0.03432131186127663, "learning_rate": 4.754581316012785e-06, "loss": 0.0034, "num_tokens": 4924768.0, "reward": 0.7640380859375, "reward_std": 0.01022288203239441, "rewards//mean": 0.7640380859375, "rewards//std": 0.03617635741829872, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1512, "grad_norm": 0.6431364417076111, "kl": 0.04085060302168131, "learning_rate": 4.753895287534673e-06, "loss": 0.0041, "num_tokens": 4931248.0, "reward": 0.839111328125, "reward_std": 0.012506959959864616, "rewards//mean": 0.839111328125, "rewards//std": 0.023703385144472122, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1514, "grad_norm": 0.6833599209785461, "kl": 0.038734698202461004, "learning_rate": 4.753208351187809e-06, "loss": 0.0039, "num_tokens": 4937744.0, "reward": 0.82275390625, "reward_std": 0.018028873950242996, "rewards//mean": 0.82275390625, "rewards//std": 0.034114301204681396, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1516, "grad_norm": 0.731910228729248, "kl": 0.044021994806826115, "learning_rate": 4.75252050724889e-06, "loss": 0.0044, "num_tokens": 4944224.0, "reward": 0.84613037109375, "reward_std": 0.010319128632545471, "rewards//mean": 0.84613037109375, "rewards//std": 0.023381808772683144, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1518, "grad_norm": 0.7127944827079773, "kl": 0.030607926659286022, "learning_rate": 4.751831755994981e-06, "loss": 0.0031, "num_tokens": 4950800.0, "reward": 0.84515380859375, "reward_std": 0.01648303121328354, "rewards//mean": 0.84515380859375, "rewards//std": 0.03060285560786724, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.152, "grad_norm": 0.5836415886878967, "kl": 0.037573390174657106, "learning_rate": 4.75114209770351e-06, "loss": 0.0038, "num_tokens": 4957328.0, "reward": 0.8306884765625, "reward_std": 0.01186058297753334, "rewards//mean": 0.8306884765625, "rewards//std": 0.023043762892484665, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1522, "grad_norm": 0.727539598941803, "kl": 0.038794671185314655, "learning_rate": 4.75045153265227e-06, "loss": 0.0039, "num_tokens": 4963840.0, "reward": 0.8233642578125, "reward_std": 0.010286824777722359, "rewards//mean": 0.8233642578125, "rewards//std": 0.02826681174337864, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1524, "grad_norm": 0.6977169513702393, "kl": 0.04045017482712865, "learning_rate": 4.749760061119423e-06, "loss": 0.004, "num_tokens": 4970328.0, "reward": 0.83831787109375, "reward_std": 0.011999544687569141, "rewards//mean": 0.83831787109375, "rewards//std": 0.016227491199970245, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1526, "grad_norm": 0.7427241206169128, "kl": 0.03671771287918091, "learning_rate": 4.749067683383491e-06, "loss": 0.0037, "num_tokens": 4976840.0, "reward": 0.79437255859375, "reward_std": 0.012813002802431583, "rewards//mean": 0.79437255859375, "rewards//std": 0.020884044468402863, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1528, "grad_norm": 0.6940443515777588, "kl": 0.035541698802262545, "learning_rate": 4.748374399723366e-06, "loss": 0.0036, "num_tokens": 4983216.0, "reward": 0.7872314453125, "reward_std": 0.011262375861406326, "rewards//mean": 0.7872314453125, "rewards//std": 0.02338545024394989, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.153, "grad_norm": 0.6882827877998352, "kl": 0.041853323113173246, "learning_rate": 4.747680210418302e-06, "loss": 0.0042, "num_tokens": 4989792.0, "reward": 0.83917236328125, "reward_std": 0.014110451564192772, "rewards//mean": 0.83917236328125, "rewards//std": 0.03160110116004944, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1532, "grad_norm": 0.6096701622009277, "kl": 0.037586374673992395, "learning_rate": 4.746985115747918e-06, "loss": 0.0038, "num_tokens": 4996456.0, "reward": 0.8651123046875, "reward_std": 0.010422743856906891, "rewards//mean": 0.8651123046875, "rewards//std": 0.02676604874432087, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1534, "grad_norm": 0.6226118206977844, "kl": 0.03840547287836671, "learning_rate": 4.746289115992198e-06, "loss": 0.0038, "num_tokens": 5002968.0, "reward": 0.8330078125, "reward_std": 0.011964459903538227, "rewards//mean": 0.8330078125, "rewards//std": 0.02288324572145939, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1536, "grad_norm": 0.5931572318077087, "kl": 0.03918908000923693, "learning_rate": 4.74559221143149e-06, "loss": 0.0039, "num_tokens": 5009488.0, "reward": 0.886474609375, "reward_std": 0.013542715460062027, "rewards//mean": 0.886474609375, "rewards//std": 0.021775512024760246, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1538, "grad_norm": 0.6926736831665039, "kl": 0.0409997534006834, "learning_rate": 4.744894402346508e-06, "loss": 0.0041, "num_tokens": 5016080.0, "reward": 0.86126708984375, "reward_std": 0.01152787171304226, "rewards//mean": 0.86126708984375, "rewards//std": 0.017427530139684677, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.154, "grad_norm": 0.6112828254699707, "kl": 0.03526647645048797, "learning_rate": 4.744195689018331e-06, "loss": 0.0035, "num_tokens": 5022568.0, "reward": 0.8729248046875, "reward_std": 0.013185831718146801, "rewards//mean": 0.8729248046875, "rewards//std": 0.03000018745660782, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1542, "grad_norm": 0.7000649571418762, "kl": 0.03744476521387696, "learning_rate": 4.743496071728396e-06, "loss": 0.0037, "num_tokens": 5029120.0, "reward": 0.828369140625, "reward_std": 0.009715208783745766, "rewards//mean": 0.828369140625, "rewards//std": 0.021663999184966087, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1544, "grad_norm": 0.7032372951507568, "kl": 0.04153305198997259, "learning_rate": 4.742795550758514e-06, "loss": 0.0042, "num_tokens": 5035608.0, "reward": 0.83880615234375, "reward_std": 0.014286772347986698, "rewards//mean": 0.83880615234375, "rewards//std": 0.02569752372801304, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1546, "grad_norm": 0.6795111298561096, "kl": 0.04108638036996126, "learning_rate": 4.742094126390851e-06, "loss": 0.0041, "num_tokens": 5042016.0, "reward": 0.81842041015625, "reward_std": 0.010245722718536854, "rewards//mean": 0.81842041015625, "rewards//std": 0.018109088763594627, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1548, "grad_norm": 0.6258853077888489, "kl": 0.03198192222043872, "learning_rate": 4.7413917989079415e-06, "loss": 0.0032, "num_tokens": 5048688.0, "reward": 0.85015869140625, "reward_std": 0.014012346044182777, "rewards//mean": 0.85015869140625, "rewards//std": 0.02717375010251999, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.155, "grad_norm": 0.6283664703369141, "kl": 0.0385423768311739, "learning_rate": 4.740688568592685e-06, "loss": 0.0039, "num_tokens": 5055208.0, "reward": 0.83642578125, "reward_std": 0.007748948875814676, "rewards//mean": 0.83642578125, "rewards//std": 0.016955677419900894, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1552, "grad_norm": 0.7539633512496948, "kl": 0.044441851787269115, "learning_rate": 4.73998443572834e-06, "loss": 0.0044, "num_tokens": 5061736.0, "reward": 0.84857177734375, "reward_std": 0.013371797278523445, "rewards//mean": 0.84857177734375, "rewards//std": 0.02089998498558998, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1554, "grad_norm": 0.6789582967758179, "kl": 0.0363217368721962, "learning_rate": 4.7392794005985324e-06, "loss": 0.0036, "num_tokens": 5068240.0, "reward": 0.85369873046875, "reward_std": 0.01439887098968029, "rewards//mean": 0.85369873046875, "rewards//std": 0.03139637038111687, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1556, "grad_norm": 0.6625312566757202, "kl": 0.0394647023640573, "learning_rate": 4.7385734634872504e-06, "loss": 0.0039, "num_tokens": 5074752.0, "reward": 0.8798828125, "reward_std": 0.009248141199350357, "rewards//mean": 0.8798828125, "rewards//std": 0.019722526893019676, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1558, "grad_norm": 0.6864038109779358, "kl": 0.03931661881506443, "learning_rate": 4.7378666246788444e-06, "loss": 0.0039, "num_tokens": 5081280.0, "reward": 0.82672119140625, "reward_std": 0.010209892876446247, "rewards//mean": 0.82672119140625, "rewards//std": 0.021703006699681282, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.156, "grad_norm": 0.7165336608886719, "kl": 0.045301258796826005, "learning_rate": 4.73715888445803e-06, "loss": 0.0045, "num_tokens": 5087808.0, "reward": 0.78179931640625, "reward_std": 0.008207255974411964, "rewards//mean": 0.78179931640625, "rewards//std": 0.011643425561487675, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1562, "grad_norm": 0.6524578332901001, "kl": 0.032820957247167826, "learning_rate": 4.736450243109885e-06, "loss": 0.0033, "num_tokens": 5094328.0, "reward": 0.8231201171875, "reward_std": 0.011289829388260841, "rewards//mean": 0.8231201171875, "rewards//std": 0.02681577205657959, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1564, "grad_norm": 0.6669304370880127, "kl": 0.0449559367261827, "learning_rate": 4.735740700919848e-06, "loss": 0.0045, "num_tokens": 5100752.0, "reward": 0.8516845703125, "reward_std": 0.011253753677010536, "rewards//mean": 0.8516845703125, "rewards//std": 0.024655919522047043, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1566, "grad_norm": 0.6886339783668518, "kl": 0.045894182519987226, "learning_rate": 4.7350302581737255e-06, "loss": 0.0046, "num_tokens": 5107296.0, "reward": 0.84368896484375, "reward_std": 0.010614918544888496, "rewards//mean": 0.84368896484375, "rewards//std": 0.026406412944197655, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1568, "grad_norm": 0.6456897854804993, "kl": 0.04228213196620345, "learning_rate": 4.734318915157682e-06, "loss": 0.0042, "num_tokens": 5113960.0, "reward": 0.8406982421875, "reward_std": 0.014344006776809692, "rewards//mean": 0.8406982421875, "rewards//std": 0.03267809748649597, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.157, "grad_norm": 0.6308059692382812, "kl": 0.03701934986747801, "learning_rate": 4.7336066721582464e-06, "loss": 0.0037, "num_tokens": 5120456.0, "reward": 0.822265625, "reward_std": 0.013070710003376007, "rewards//mean": 0.822265625, "rewards//std": 0.02879762277007103, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1572, "grad_norm": 0.6404659152030945, "kl": 0.040786292403936386, "learning_rate": 4.73289352946231e-06, "loss": 0.0041, "num_tokens": 5126968.0, "reward": 0.85162353515625, "reward_std": 0.011420845054090023, "rewards//mean": 0.85162353515625, "rewards//std": 0.02786138653755188, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1574, "grad_norm": 0.6793121099472046, "kl": 0.046549074817448854, "learning_rate": 4.732179487357127e-06, "loss": 0.0047, "num_tokens": 5133368.0, "reward": 0.85272216796875, "reward_std": 0.014255034737288952, "rewards//mean": 0.85272216796875, "rewards//std": 0.018813444301486015, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1576, "grad_norm": 0.6696339249610901, "kl": 0.04117098404094577, "learning_rate": 4.731464546130315e-06, "loss": 0.0041, "num_tokens": 5139944.0, "reward": 0.78778076171875, "reward_std": 0.011665645986795425, "rewards//mean": 0.78778076171875, "rewards//std": 0.02138470858335495, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1578, "grad_norm": 0.6691408753395081, "kl": 0.036057604011148214, "learning_rate": 4.730748706069849e-06, "loss": 0.0036, "num_tokens": 5146472.0, "reward": 0.80499267578125, "reward_std": 0.011604068800807, "rewards//mean": 0.80499267578125, "rewards//std": 0.030689295381307602, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.158, "grad_norm": 0.6431038975715637, "kl": 0.03318591718561947, "learning_rate": 4.730031967464071e-06, "loss": 0.0033, "num_tokens": 5153088.0, "reward": 0.8427734375, "reward_std": 0.011495914310216904, "rewards//mean": 0.8427734375, "rewards//std": 0.014894897118210793, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1582, "grad_norm": 0.6348263621330261, "kl": 0.037593565415591, "learning_rate": 4.729314330601684e-06, "loss": 0.0038, "num_tokens": 5159672.0, "reward": 0.81695556640625, "reward_std": 0.01404502335935831, "rewards//mean": 0.81695556640625, "rewards//std": 0.01725819706916809, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1584, "grad_norm": 0.6973318457603455, "kl": 0.041428486816585064, "learning_rate": 4.72859579577175e-06, "loss": 0.0041, "num_tokens": 5166144.0, "reward": 0.7879638671875, "reward_std": 0.012339320033788681, "rewards//mean": 0.7879638671875, "rewards//std": 0.017778070643544197, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1586, "grad_norm": 0.6690841913223267, "kl": 0.043853968381881714, "learning_rate": 4.7278763632636974e-06, "loss": 0.0044, "num_tokens": 5172720.0, "reward": 0.8472900390625, "reward_std": 0.014657915569841862, "rewards//mean": 0.8472900390625, "rewards//std": 0.02887086011469364, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1588, "grad_norm": 0.6164650321006775, "kl": 0.04399008536711335, "learning_rate": 4.727156033367312e-06, "loss": 0.0044, "num_tokens": 5179160.0, "reward": 0.85150146484375, "reward_std": 0.012938274070620537, "rewards//mean": 0.85150146484375, "rewards//std": 0.01753145270049572, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.159, "grad_norm": 0.6124863624572754, "kl": 0.03883708012290299, "learning_rate": 4.7264348063727415e-06, "loss": 0.0039, "num_tokens": 5185632.0, "reward": 0.84783935546875, "reward_std": 0.008572231978178024, "rewards//mean": 0.84783935546875, "rewards//std": 0.01818166859447956, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1592, "grad_norm": 0.6611150503158569, "kl": 0.036757583264261484, "learning_rate": 4.725712682570498e-06, "loss": 0.0037, "num_tokens": 5192240.0, "reward": 0.7779541015625, "reward_std": 0.01289368700236082, "rewards//mean": 0.7779541015625, "rewards//std": 0.02997797727584839, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1594, "grad_norm": 0.5918968319892883, "kl": 0.04113738890737295, "learning_rate": 4.724989662251452e-06, "loss": 0.0041, "num_tokens": 5198688.0, "reward": 0.80126953125, "reward_std": 0.012578248977661133, "rewards//mean": 0.80126953125, "rewards//std": 0.03841540217399597, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1596, "grad_norm": 0.5858060717582703, "kl": 0.03528264374472201, "learning_rate": 4.724265745706837e-06, "loss": 0.0035, "num_tokens": 5205104.0, "reward": 0.85760498046875, "reward_std": 0.013114457949995995, "rewards//mean": 0.85760498046875, "rewards//std": 0.03279987350106239, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1598, "grad_norm": 0.6399039030075073, "kl": 0.03707644832320511, "learning_rate": 4.723540933228245e-06, "loss": 0.0037, "num_tokens": 5211616.0, "reward": 0.85491943359375, "reward_std": 0.011643383651971817, "rewards//mean": 0.85491943359375, "rewards//std": 0.02243286743760109, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.16, "grad_norm": 0.7004696726799011, "kl": 0.038236090214923024, "learning_rate": 4.7228152251076295e-06, "loss": 0.0038, "num_tokens": 5218080.0, "reward": 0.85076904296875, "reward_std": 0.01267247460782528, "rewards//mean": 0.85076904296875, "rewards//std": 0.018986446782946587, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1602, "grad_norm": 0.6958453059196472, "kl": 0.035821242490783334, "learning_rate": 4.7220886216373095e-06, "loss": 0.0036, "num_tokens": 5224624.0, "reward": 0.8458251953125, "reward_std": 0.012326447293162346, "rewards//mean": 0.8458251953125, "rewards//std": 0.0214402936398983, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1604, "grad_norm": 0.6648064851760864, "kl": 0.03554333420470357, "learning_rate": 4.7213611231099575e-06, "loss": 0.0036, "num_tokens": 5231176.0, "reward": 0.8375244140625, "reward_std": 0.015521214343607426, "rewards//mean": 0.8375244140625, "rewards//std": 0.0276188924908638, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1606, "grad_norm": 0.694705605506897, "kl": 0.04491404443979263, "learning_rate": 4.7206327298186105e-06, "loss": 0.0045, "num_tokens": 5237688.0, "reward": 0.85906982421875, "reward_std": 0.011747853830456734, "rewards//mean": 0.85906982421875, "rewards//std": 0.017038391903042793, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1608, "grad_norm": 0.700727641582489, "kl": 0.03822877514176071, "learning_rate": 4.7199034420566656e-06, "loss": 0.0038, "num_tokens": 5244160.0, "reward": 0.86480712890625, "reward_std": 0.012476685456931591, "rewards//mean": 0.86480712890625, "rewards//std": 0.028035791590809822, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.161, "grad_norm": 0.6538044214248657, "kl": 0.036812544567510486, "learning_rate": 4.7191732601178795e-06, "loss": 0.0037, "num_tokens": 5250784.0, "reward": 0.81585693359375, "reward_std": 0.013583207502961159, "rewards//mean": 0.81585693359375, "rewards//std": 0.023019032552838326, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1612, "grad_norm": 0.6818084716796875, "kl": 0.04158745566383004, "learning_rate": 4.71844218429637e-06, "loss": 0.0042, "num_tokens": 5257312.0, "reward": 0.86090087890625, "reward_std": 0.012027601711452007, "rewards//mean": 0.86090087890625, "rewards//std": 0.0386737696826458, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1614, "grad_norm": 0.6377874612808228, "kl": 0.03743350366130471, "learning_rate": 4.717710214886614e-06, "loss": 0.0037, "num_tokens": 5263848.0, "reward": 0.84771728515625, "reward_std": 0.01110083982348442, "rewards//mean": 0.84771728515625, "rewards//std": 0.03908414766192436, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1616, "grad_norm": 0.6489036679267883, "kl": 0.04202882153913379, "learning_rate": 4.716977352183449e-06, "loss": 0.0042, "num_tokens": 5270336.0, "reward": 0.87298583984375, "reward_std": 0.011964542791247368, "rewards//mean": 0.87298583984375, "rewards//std": 0.04070214927196503, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1618, "grad_norm": 0.6986774206161499, "kl": 0.04818084090948105, "learning_rate": 4.716243596482071e-06, "loss": 0.0048, "num_tokens": 5276920.0, "reward": 0.855712890625, "reward_std": 0.014097800478339195, "rewards//mean": 0.855712890625, "rewards//std": 0.04160777106881142, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.162, "grad_norm": 0.7664393782615662, "kl": 0.03928353264927864, "learning_rate": 4.715508948078037e-06, "loss": 0.0039, "num_tokens": 5283392.0, "reward": 0.865234375, "reward_std": 0.016030853614211082, "rewards//mean": 0.865234375, "rewards//std": 0.042117632925510406, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1622, "grad_norm": 0.6129316091537476, "kl": 0.04093162016943097, "learning_rate": 4.714773407267264e-06, "loss": 0.0041, "num_tokens": 5289944.0, "reward": 0.8447265625, "reward_std": 0.012979458086192608, "rewards//mean": 0.8447265625, "rewards//std": 0.024140017107129097, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1624, "grad_norm": 0.6467927694320679, "kl": 0.04427446564659476, "learning_rate": 4.714036974346028e-06, "loss": 0.0044, "num_tokens": 5296464.0, "reward": 0.82745361328125, "reward_std": 0.014607482589781284, "rewards//mean": 0.82745361328125, "rewards//std": 0.034782618284225464, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1626, "grad_norm": 0.7141029238700867, "kl": 0.03883550688624382, "learning_rate": 4.7132996496109625e-06, "loss": 0.0039, "num_tokens": 5303040.0, "reward": 0.80206298828125, "reward_std": 0.010535812005400658, "rewards//mean": 0.80206298828125, "rewards//std": 0.017134075984358788, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1628, "grad_norm": 0.6874210834503174, "kl": 0.03825618769042194, "learning_rate": 4.712561433359064e-06, "loss": 0.0038, "num_tokens": 5309568.0, "reward": 0.84271240234375, "reward_std": 0.011939950287342072, "rewards//mean": 0.84271240234375, "rewards//std": 0.024391964077949524, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.163, "grad_norm": 0.6792731285095215, "kl": 0.047371129505336285, "learning_rate": 4.7118223258876845e-06, "loss": 0.0047, "num_tokens": 5316040.0, "reward": 0.79132080078125, "reward_std": 0.011203013360500336, "rewards//mean": 0.79132080078125, "rewards//std": 0.01953502930700779, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1632, "grad_norm": 0.606471598148346, "kl": 0.03948652045801282, "learning_rate": 4.711082327494536e-06, "loss": 0.0039, "num_tokens": 5322584.0, "reward": 0.84771728515625, "reward_std": 0.014606360346078873, "rewards//mean": 0.84771728515625, "rewards//std": 0.02167089842259884, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1634, "grad_norm": 0.6929451823234558, "kl": 0.04543322464451194, "learning_rate": 4.710341438477691e-06, "loss": 0.0045, "num_tokens": 5329112.0, "reward": 0.82305908203125, "reward_std": 0.010059753432869911, "rewards//mean": 0.82305908203125, "rewards//std": 0.024692347273230553, "step": 817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1636, "grad_norm": 0.6723214387893677, "kl": 0.04372272826731205, "learning_rate": 4.709599659135579e-06, "loss": 0.0044, "num_tokens": 5335608.0, "reward": 0.8341064453125, "reward_std": 0.013317547738552094, "rewards//mean": 0.8341064453125, "rewards//std": 0.024159763008356094, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1638, "grad_norm": 0.6974331140518188, "kl": 0.05139124137349427, "learning_rate": 4.708856989766988e-06, "loss": 0.0051, "num_tokens": 5342152.0, "reward": 0.82366943359375, "reward_std": 0.011363249272108078, "rewards//mean": 0.82366943359375, "rewards//std": 0.02652880735695362, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.164, "grad_norm": 0.7144467234611511, "kl": 0.04704140080139041, "learning_rate": 4.708113430671066e-06, "loss": 0.0047, "num_tokens": 5348688.0, "reward": 0.82305908203125, "reward_std": 0.011271877214312553, "rewards//mean": 0.82305908203125, "rewards//std": 0.01991415023803711, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1642, "grad_norm": 0.6751676797866821, "kl": 0.034764718497172, "learning_rate": 4.707368982147318e-06, "loss": 0.0035, "num_tokens": 5355232.0, "reward": 0.8125, "reward_std": 0.010452823713421822, "rewards//mean": 0.8125, "rewards//std": 0.022423502057790756, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1644, "grad_norm": 0.7800047397613525, "kl": 0.04225182137452066, "learning_rate": 4.706623644495608e-06, "loss": 0.0042, "num_tokens": 5361824.0, "reward": 0.86041259765625, "reward_std": 0.00938594713807106, "rewards//mean": 0.86041259765625, "rewards//std": 0.027074409648776054, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1646, "grad_norm": 0.6850138902664185, "kl": 0.042775253765285015, "learning_rate": 4.705877418016157e-06, "loss": 0.0043, "num_tokens": 5368328.0, "reward": 0.83941650390625, "reward_std": 0.011559306643903255, "rewards//mean": 0.83941650390625, "rewards//std": 0.021588314324617386, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1648, "grad_norm": 0.6577291488647461, "kl": 0.04705011798068881, "learning_rate": 4.705130303009547e-06, "loss": 0.0047, "num_tokens": 5374976.0, "reward": 0.843994140625, "reward_std": 0.015368317253887653, "rewards//mean": 0.843994140625, "rewards//std": 0.03274773806333542, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.165, "grad_norm": 0.6514129638671875, "kl": 0.034172447165474296, "learning_rate": 4.7043822997767145e-06, "loss": 0.0034, "num_tokens": 5381512.0, "reward": 0.85137939453125, "reward_std": 0.015748068690299988, "rewards//mean": 0.85137939453125, "rewards//std": 0.0229300819337368, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1652, "grad_norm": 0.7008969783782959, "kl": 0.0387201386038214, "learning_rate": 4.703633408618955e-06, "loss": 0.0039, "num_tokens": 5388152.0, "reward": 0.82940673828125, "reward_std": 0.010123923420906067, "rewards//mean": 0.82940673828125, "rewards//std": 0.024310529232025146, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1654, "grad_norm": 0.7166876792907715, "kl": 0.04297814145684242, "learning_rate": 4.702883629837922e-06, "loss": 0.0043, "num_tokens": 5394632.0, "reward": 0.8524169921875, "reward_std": 0.015698876231908798, "rewards//mean": 0.8524169921875, "rewards//std": 0.031592775136232376, "step": 827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1656, "grad_norm": 0.7098883390426636, "kl": 0.04341789707541466, "learning_rate": 4.7021329637356274e-06, "loss": 0.0043, "num_tokens": 5401120.0, "reward": 0.8165283203125, "reward_std": 0.014469243586063385, "rewards//mean": 0.8165283203125, "rewards//std": 0.030440986156463623, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1658, "grad_norm": 0.677291750907898, "kl": 0.04045027890242636, "learning_rate": 4.701381410614437e-06, "loss": 0.004, "num_tokens": 5407688.0, "reward": 0.84991455078125, "reward_std": 0.01516575738787651, "rewards//mean": 0.84991455078125, "rewards//std": 0.031054111197590828, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.166, "grad_norm": 0.611182689666748, "kl": 0.03975608246400952, "learning_rate": 4.700628970777078e-06, "loss": 0.004, "num_tokens": 5414168.0, "reward": 0.87420654296875, "reward_std": 0.011042594909667969, "rewards//mean": 0.87420654296875, "rewards//std": 0.02186768874526024, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1662, "grad_norm": 0.6226016283035278, "kl": 0.03868942544795573, "learning_rate": 4.699875644526633e-06, "loss": 0.0039, "num_tokens": 5420696.0, "reward": 0.849853515625, "reward_std": 0.009386981837451458, "rewards//mean": 0.849853515625, "rewards//std": 0.018160944804549217, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1664, "grad_norm": 0.7501590847969055, "kl": 0.04054065514355898, "learning_rate": 4.699121432166542e-06, "loss": 0.0041, "num_tokens": 5427272.0, "reward": 0.86236572265625, "reward_std": 0.013934629037976265, "rewards//mean": 0.86236572265625, "rewards//std": 0.03134860470890999, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1666, "grad_norm": 0.6518362760543823, "kl": 0.03787135658785701, "learning_rate": 4.6983663340006e-06, "loss": 0.0038, "num_tokens": 5433880.0, "reward": 0.8211669921875, "reward_std": 0.010044453665614128, "rewards//mean": 0.8211669921875, "rewards//std": 0.024274777621030807, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1668, "grad_norm": 0.7006896734237671, "kl": 0.03864523209631443, "learning_rate": 4.697610350332962e-06, "loss": 0.0039, "num_tokens": 5440504.0, "reward": 0.87115478515625, "reward_std": 0.013994935899972916, "rewards//mean": 0.87115478515625, "rewards//std": 0.020862286910414696, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.167, "grad_norm": 0.7370618581771851, "kl": 0.043036503717303276, "learning_rate": 4.696853481468137e-06, "loss": 0.0043, "num_tokens": 5447056.0, "reward": 0.811279296875, "reward_std": 0.010254578664898872, "rewards//mean": 0.811279296875, "rewards//std": 0.016512421891093254, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1672, "grad_norm": 0.6868679523468018, "kl": 0.0437261825427413, "learning_rate": 4.6960957277109945e-06, "loss": 0.0044, "num_tokens": 5453480.0, "reward": 0.8250732421875, "reward_std": 0.01129057165235281, "rewards//mean": 0.8250732421875, "rewards//std": 0.018155528232455254, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1674, "grad_norm": 0.6761255860328674, "kl": 0.039087172131985426, "learning_rate": 4.695337089366754e-06, "loss": 0.0039, "num_tokens": 5460008.0, "reward": 0.85760498046875, "reward_std": 0.012311861850321293, "rewards//mean": 0.85760498046875, "rewards//std": 0.02614312246441841, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1676, "grad_norm": 0.6296420097351074, "kl": 0.04748300788924098, "learning_rate": 4.694577566740996e-06, "loss": 0.0047, "num_tokens": 5466456.0, "reward": 0.87274169921875, "reward_std": 0.014107296243309975, "rewards//mean": 0.87274169921875, "rewards//std": 0.03133362904191017, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1678, "grad_norm": 0.717560887336731, "kl": 0.04282214818522334, "learning_rate": 4.693817160139657e-06, "loss": 0.0043, "num_tokens": 5473096.0, "reward": 0.83001708984375, "reward_std": 0.012546509504318237, "rewards//mean": 0.83001708984375, "rewards//std": 0.025458427146077156, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.168, "grad_norm": 0.7495692372322083, "kl": 0.04018871276639402, "learning_rate": 4.693055869869029e-06, "loss": 0.004, "num_tokens": 5479584.0, "reward": 0.86773681640625, "reward_std": 0.015347221866250038, "rewards//mean": 0.86773681640625, "rewards//std": 0.03341394290328026, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1682, "grad_norm": 0.6433113813400269, "kl": 0.04005603678524494, "learning_rate": 4.692293696235758e-06, "loss": 0.004, "num_tokens": 5486064.0, "reward": 0.8238525390625, "reward_std": 0.012554389424622059, "rewards//mean": 0.8238525390625, "rewards//std": 0.022996416315436363, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1684, "grad_norm": 0.7169666886329651, "kl": 0.03929765592329204, "learning_rate": 4.6915306395468485e-06, "loss": 0.0039, "num_tokens": 5492592.0, "reward": 0.83978271484375, "reward_std": 0.0161487627774477, "rewards//mean": 0.83978271484375, "rewards//std": 0.026185357943177223, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1686, "grad_norm": 0.7075477838516235, "kl": 0.042687549255788326, "learning_rate": 4.690766700109659e-06, "loss": 0.0043, "num_tokens": 5499096.0, "reward": 0.85882568359375, "reward_std": 0.015180788934230804, "rewards//mean": 0.85882568359375, "rewards//std": 0.03275507688522339, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1688, "grad_norm": 0.6575949192047119, "kl": 0.04145486559718847, "learning_rate": 4.690001878231906e-06, "loss": 0.0041, "num_tokens": 5505616.0, "reward": 0.8748779296875, "reward_std": 0.014031785540282726, "rewards//mean": 0.8748779296875, "rewards//std": 0.019375229254364967, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.169, "grad_norm": 0.6833431124687195, "kl": 0.03976182406768203, "learning_rate": 4.689236174221658e-06, "loss": 0.004, "num_tokens": 5512120.0, "reward": 0.7947998046875, "reward_std": 0.01373874768614769, "rewards//mean": 0.7947998046875, "rewards//std": 0.020948907360434532, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1692, "grad_norm": 0.7554038166999817, "kl": 0.03467200044542551, "learning_rate": 4.688469588387339e-06, "loss": 0.0035, "num_tokens": 5518696.0, "reward": 0.87481689453125, "reward_std": 0.01231304369866848, "rewards//mean": 0.87481689453125, "rewards//std": 0.024154357612133026, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1694, "grad_norm": 0.7133338451385498, "kl": 0.044337095227092505, "learning_rate": 4.687702121037734e-06, "loss": 0.0044, "num_tokens": 5525248.0, "reward": 0.76873779296875, "reward_std": 0.010633913800120354, "rewards//mean": 0.76873779296875, "rewards//std": 0.03006640449166298, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1696, "grad_norm": 0.570580005645752, "kl": 0.039059948176145554, "learning_rate": 4.6869337724819745e-06, "loss": 0.0039, "num_tokens": 5531752.0, "reward": 0.8349609375, "reward_std": 0.008149969391524792, "rewards//mean": 0.8349609375, "rewards//std": 0.019212428480386734, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1698, "grad_norm": 0.7679130434989929, "kl": 0.045566384214907885, "learning_rate": 4.686164543029554e-06, "loss": 0.0046, "num_tokens": 5538288.0, "reward": 0.7852783203125, "reward_std": 0.011804823763668537, "rewards//mean": 0.7852783203125, "rewards//std": 0.020575610920786858, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.17, "grad_norm": 0.6468374729156494, "kl": 0.04972790856845677, "learning_rate": 4.685394432990316e-06, "loss": 0.005, "num_tokens": 5544752.0, "reward": 0.85662841796875, "reward_std": 0.010980328544974327, "rewards//mean": 0.85662841796875, "rewards//std": 0.03794073686003685, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1702, "grad_norm": 0.8263505101203918, "kl": 0.04581076675094664, "learning_rate": 4.684623442674463e-06, "loss": 0.0046, "num_tokens": 5551272.0, "reward": 0.8409423828125, "reward_std": 0.012282561510801315, "rewards//mean": 0.8409423828125, "rewards//std": 0.022003378719091415, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1704, "grad_norm": 0.6327822804450989, "kl": 0.04290678119286895, "learning_rate": 4.683851572392548e-06, "loss": 0.0043, "num_tokens": 5557872.0, "reward": 0.84039306640625, "reward_std": 0.010098406113684177, "rewards//mean": 0.84039306640625, "rewards//std": 0.01718788407742977, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1706, "grad_norm": 0.5984649062156677, "kl": 0.041823903331533074, "learning_rate": 4.68307882245548e-06, "loss": 0.0042, "num_tokens": 5564400.0, "reward": 0.8529052734375, "reward_std": 0.008947965689003468, "rewards//mean": 0.8529052734375, "rewards//std": 0.029349274933338165, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1708, "grad_norm": 0.6533376574516296, "kl": 0.04166531004011631, "learning_rate": 4.682305193174524e-06, "loss": 0.0042, "num_tokens": 5570936.0, "reward": 0.8236083984375, "reward_std": 0.00973258726298809, "rewards//mean": 0.8236083984375, "rewards//std": 0.011671505868434906, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.171, "grad_norm": 0.6973181366920471, "kl": 0.038289697375148535, "learning_rate": 4.681530684861298e-06, "loss": 0.0038, "num_tokens": 5577480.0, "reward": 0.79681396484375, "reward_std": 0.007634049281477928, "rewards//mean": 0.79681396484375, "rewards//std": 0.018681829795241356, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1712, "grad_norm": 0.6991540789604187, "kl": 0.04020833037793636, "learning_rate": 4.680755297827772e-06, "loss": 0.004, "num_tokens": 5583984.0, "reward": 0.82098388671875, "reward_std": 0.012616423889994621, "rewards//mean": 0.82098388671875, "rewards//std": 0.022867942228913307, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1714, "grad_norm": 0.6678091883659363, "kl": 0.03987215249799192, "learning_rate": 4.6799790323862735e-06, "loss": 0.004, "num_tokens": 5590536.0, "reward": 0.82659912109375, "reward_std": 0.014262435957789421, "rewards//mean": 0.82659912109375, "rewards//std": 0.033164750784635544, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1716, "grad_norm": 0.6583226919174194, "kl": 0.04954609926789999, "learning_rate": 4.679201888849481e-06, "loss": 0.005, "num_tokens": 5596984.0, "reward": 0.85125732421875, "reward_std": 0.018312999978661537, "rewards//mean": 0.85125732421875, "rewards//std": 0.034297019243240356, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1718, "grad_norm": 0.6026672720909119, "kl": 0.035488207591697574, "learning_rate": 4.678423867530428e-06, "loss": 0.0035, "num_tokens": 5603520.0, "reward": 0.85992431640625, "reward_std": 0.009927749633789062, "rewards//mean": 0.85992431640625, "rewards//std": 0.02918913960456848, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.172, "grad_norm": 0.779086709022522, "kl": 0.04589052964001894, "learning_rate": 4.677644968742503e-06, "loss": 0.0046, "num_tokens": 5610072.0, "reward": 0.84796142578125, "reward_std": 0.013327594846487045, "rewards//mean": 0.84796142578125, "rewards//std": 0.03296468406915665, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1722, "grad_norm": 0.7797392010688782, "kl": 0.03917522868141532, "learning_rate": 4.676865192799443e-06, "loss": 0.0039, "num_tokens": 5616520.0, "reward": 0.8319091796875, "reward_std": 0.015295634046196938, "rewards//mean": 0.8319091796875, "rewards//std": 0.031780049204826355, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1724, "grad_norm": 0.7025132179260254, "kl": 0.04026870639063418, "learning_rate": 4.676084540015345e-06, "loss": 0.004, "num_tokens": 5623024.0, "reward": 0.8492431640625, "reward_std": 0.011275040917098522, "rewards//mean": 0.8492431640625, "rewards//std": 0.015386302955448627, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1726, "grad_norm": 0.7985981106758118, "kl": 0.04054851154796779, "learning_rate": 4.675303010704654e-06, "loss": 0.0041, "num_tokens": 5629568.0, "reward": 0.8460693359375, "reward_std": 0.011221226304769516, "rewards//mean": 0.8460693359375, "rewards//std": 0.023945782333612442, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1728, "grad_norm": 0.7008482813835144, "kl": 0.04197323229163885, "learning_rate": 4.674520605182171e-06, "loss": 0.0042, "num_tokens": 5636168.0, "reward": 0.8560791015625, "reward_std": 0.01303877867758274, "rewards//mean": 0.8560791015625, "rewards//std": 0.03236343339085579, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.173, "grad_norm": 0.660435676574707, "kl": 0.038663000566884875, "learning_rate": 4.673737323763048e-06, "loss": 0.0039, "num_tokens": 5642648.0, "reward": 0.86883544921875, "reward_std": 0.009736208245158195, "rewards//mean": 0.86883544921875, "rewards//std": 0.020384449511766434, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1732, "grad_norm": 0.6624974608421326, "kl": 0.043648229679092765, "learning_rate": 4.672953166762791e-06, "loss": 0.0044, "num_tokens": 5649224.0, "reward": 0.8408203125, "reward_std": 0.013061913661658764, "rewards//mean": 0.8408203125, "rewards//std": 0.02628237009048462, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1734, "grad_norm": 0.6637141704559326, "kl": 0.04361239541321993, "learning_rate": 4.672168134497258e-06, "loss": 0.0044, "num_tokens": 5655744.0, "reward": 0.8326416015625, "reward_std": 0.01376126054674387, "rewards//mean": 0.8326416015625, "rewards//std": 0.02711889147758484, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1736, "grad_norm": 0.7084535956382751, "kl": 0.04428035207092762, "learning_rate": 4.671382227282661e-06, "loss": 0.0044, "num_tokens": 5662272.0, "reward": 0.88690185546875, "reward_std": 0.018743984401226044, "rewards//mean": 0.88690185546875, "rewards//std": 0.03211710602045059, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1738, "grad_norm": 0.6958263516426086, "kl": 0.038506853859871626, "learning_rate": 4.670595445435561e-06, "loss": 0.0039, "num_tokens": 5668808.0, "reward": 0.853515625, "reward_std": 0.014965204522013664, "rewards//mean": 0.853515625, "rewards//std": 0.03174978867173195, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.174, "grad_norm": 0.6819318532943726, "kl": 0.042381568579003215, "learning_rate": 4.669807789272877e-06, "loss": 0.0042, "num_tokens": 5675336.0, "reward": 0.839111328125, "reward_std": 0.014586934819817543, "rewards//mean": 0.839111328125, "rewards//std": 0.04526562616229057, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1742, "grad_norm": 0.618557333946228, "kl": 0.04537222860381007, "learning_rate": 4.669019259111873e-06, "loss": 0.0045, "num_tokens": 5681856.0, "reward": 0.83349609375, "reward_std": 0.011118524707853794, "rewards//mean": 0.83349609375, "rewards//std": 0.023046717047691345, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1744, "grad_norm": 0.6631488800048828, "kl": 0.04104706086218357, "learning_rate": 4.668229855270172e-06, "loss": 0.0041, "num_tokens": 5688536.0, "reward": 0.8538818359375, "reward_std": 0.013846810907125473, "rewards//mean": 0.8538818359375, "rewards//std": 0.02200612984597683, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1746, "grad_norm": 0.5976673364639282, "kl": 0.04125231131911278, "learning_rate": 4.667439578065745e-06, "loss": 0.0041, "num_tokens": 5695080.0, "reward": 0.83306884765625, "reward_std": 0.01117327157407999, "rewards//mean": 0.83306884765625, "rewards//std": 0.02727050706744194, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1748, "grad_norm": 0.6268535256385803, "kl": 0.037555265706032515, "learning_rate": 4.666648427816914e-06, "loss": 0.0038, "num_tokens": 5701576.0, "reward": 0.85052490234375, "reward_std": 0.00950030330568552, "rewards//mean": 0.85052490234375, "rewards//std": 0.018535403534770012, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.175, "grad_norm": 0.677286684513092, "kl": 0.03843085980042815, "learning_rate": 4.665856404842356e-06, "loss": 0.0038, "num_tokens": 5708056.0, "reward": 0.86199951171875, "reward_std": 0.017461739480495453, "rewards//mean": 0.86199951171875, "rewards//std": 0.03396883234381676, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1752, "grad_norm": 0.5880584120750427, "kl": 0.03914790088310838, "learning_rate": 4.665063509461098e-06, "loss": 0.0039, "num_tokens": 5714608.0, "reward": 0.84222412109375, "reward_std": 0.009593464434146881, "rewards//mean": 0.84222412109375, "rewards//std": 0.024935755878686905, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1754, "grad_norm": 0.6908532977104187, "kl": 0.04660891415551305, "learning_rate": 4.664269741992516e-06, "loss": 0.0047, "num_tokens": 5721080.0, "reward": 0.8328857421875, "reward_std": 0.01860264502465725, "rewards//mean": 0.8328857421875, "rewards//std": 0.030949924141168594, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1756, "grad_norm": 0.656211256980896, "kl": 0.04456622153520584, "learning_rate": 4.663475102756341e-06, "loss": 0.0045, "num_tokens": 5727464.0, "reward": 0.86834716796875, "reward_std": 0.011676906608045101, "rewards//mean": 0.86834716796875, "rewards//std": 0.024166887626051903, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1758, "grad_norm": 0.6266794800758362, "kl": 0.043249581241980195, "learning_rate": 4.662679592072653e-06, "loss": 0.0043, "num_tokens": 5733976.0, "reward": 0.8218994140625, "reward_std": 0.009951181709766388, "rewards//mean": 0.8218994140625, "rewards//std": 0.017493095248937607, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.176, "grad_norm": 0.6799640655517578, "kl": 0.03653007443062961, "learning_rate": 4.661883210261884e-06, "loss": 0.0037, "num_tokens": 5740520.0, "reward": 0.8427734375, "reward_std": 0.011458927765488625, "rewards//mean": 0.8427734375, "rewards//std": 0.02008756436407566, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1762, "grad_norm": 0.6607202887535095, "kl": 0.047988517209887505, "learning_rate": 4.661085957644817e-06, "loss": 0.0048, "num_tokens": 5747056.0, "reward": 0.84716796875, "reward_std": 0.017563801258802414, "rewards//mean": 0.84716796875, "rewards//std": 0.03399338945746422, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1764, "grad_norm": 0.761650562286377, "kl": 0.044383881613612175, "learning_rate": 4.660287834542585e-06, "loss": 0.0044, "num_tokens": 5753488.0, "reward": 0.838623046875, "reward_std": 0.011798453517258167, "rewards//mean": 0.838623046875, "rewards//std": 0.021719828248023987, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1766, "grad_norm": 0.6888118982315063, "kl": 0.04098816681653261, "learning_rate": 4.659488841276671e-06, "loss": 0.0041, "num_tokens": 5760008.0, "reward": 0.8052978515625, "reward_std": 0.00954475812613964, "rewards//mean": 0.8052978515625, "rewards//std": 0.02581954002380371, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1768, "grad_norm": 0.7232738137245178, "kl": 0.03795227757655084, "learning_rate": 4.65868897816891e-06, "loss": 0.0038, "num_tokens": 5766448.0, "reward": 0.82147216796875, "reward_std": 0.00959029421210289, "rewards//mean": 0.82147216796875, "rewards//std": 0.021806685253977776, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.177, "grad_norm": 0.6652290225028992, "kl": 0.042733343318104744, "learning_rate": 4.6578882455414865e-06, "loss": 0.0043, "num_tokens": 5773016.0, "reward": 0.80224609375, "reward_std": 0.011170993559062481, "rewards//mean": 0.80224609375, "rewards//std": 0.01766917295753956, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1772, "grad_norm": 0.6574732661247253, "kl": 0.0360676443669945, "learning_rate": 4.657086643716937e-06, "loss": 0.0036, "num_tokens": 5779536.0, "reward": 0.82135009765625, "reward_std": 0.011647794395685196, "rewards//mean": 0.82135009765625, "rewards//std": 0.027937347069382668, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1774, "grad_norm": 0.6396461725234985, "kl": 0.04124319599941373, "learning_rate": 4.656284173018144e-06, "loss": 0.0041, "num_tokens": 5785952.0, "reward": 0.86334228515625, "reward_std": 0.012334790080785751, "rewards//mean": 0.86334228515625, "rewards//std": 0.025793951004743576, "step": 887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1776, "grad_norm": 0.6651362180709839, "kl": 0.041016052244231105, "learning_rate": 4.655480833768344e-06, "loss": 0.0041, "num_tokens": 5792528.0, "reward": 0.86566162109375, "reward_std": 0.010340459644794464, "rewards//mean": 0.86566162109375, "rewards//std": 0.03301836550235748, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1778, "grad_norm": 0.7577189803123474, "kl": 0.04309235932305455, "learning_rate": 4.654676626291123e-06, "loss": 0.0043, "num_tokens": 5799048.0, "reward": 0.85028076171875, "reward_std": 0.015380732715129852, "rewards//mean": 0.85028076171875, "rewards//std": 0.022989420220255852, "step": 889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.178, "grad_norm": 0.5894119143486023, "kl": 0.04171917075291276, "learning_rate": 4.653871550910414e-06, "loss": 0.0042, "num_tokens": 5805568.0, "reward": 0.84588623046875, "reward_std": 0.012175725772976875, "rewards//mean": 0.84588623046875, "rewards//std": 0.027795566245913506, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1782, "grad_norm": 0.646531343460083, "kl": 0.043966994155198336, "learning_rate": 4.653065607950502e-06, "loss": 0.0044, "num_tokens": 5812064.0, "reward": 0.8343505859375, "reward_std": 0.012879867106676102, "rewards//mean": 0.8343505859375, "rewards//std": 0.022774165496230125, "step": 891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1784, "grad_norm": 0.6579074859619141, "kl": 0.04005965869873762, "learning_rate": 4.65225879773602e-06, "loss": 0.004, "num_tokens": 5818600.0, "reward": 0.85028076171875, "reward_std": 0.01558544673025608, "rewards//mean": 0.85028076171875, "rewards//std": 0.030758274719119072, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1786, "grad_norm": 0.77456134557724, "kl": 0.03999159322120249, "learning_rate": 4.651451120591952e-06, "loss": 0.004, "num_tokens": 5825048.0, "reward": 0.83135986328125, "reward_std": 0.010922490619122982, "rewards//mean": 0.83135986328125, "rewards//std": 0.01836886815726757, "step": 893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1788, "grad_norm": 0.6243206858634949, "kl": 0.037127970019355416, "learning_rate": 4.650642576843631e-06, "loss": 0.0037, "num_tokens": 5831704.0, "reward": 0.86248779296875, "reward_std": 0.011517677456140518, "rewards//mean": 0.86248779296875, "rewards//std": 0.028773892670869827, "step": 894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.179, "grad_norm": 0.6566209197044373, "kl": 0.042215952184051275, "learning_rate": 4.649833166816736e-06, "loss": 0.0042, "num_tokens": 5838304.0, "reward": 0.86236572265625, "reward_std": 0.011435369029641151, "rewards//mean": 0.86236572265625, "rewards//std": 0.023661118000745773, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1792, "grad_norm": 0.7270216941833496, "kl": 0.04611260769888759, "learning_rate": 4.649022890837298e-06, "loss": 0.0046, "num_tokens": 5844856.0, "reward": 0.84814453125, "reward_std": 0.012920394539833069, "rewards//mean": 0.84814453125, "rewards//std": 0.020297471433877945, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1794, "grad_norm": 0.6820149421691895, "kl": 0.03484724368900061, "learning_rate": 4.648211749231698e-06, "loss": 0.0035, "num_tokens": 5851392.0, "reward": 0.83905029296875, "reward_std": 0.012019114568829536, "rewards//mean": 0.83905029296875, "rewards//std": 0.02499941736459732, "step": 897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1796, "grad_norm": 0.668484091758728, "kl": 0.04221570072695613, "learning_rate": 4.6473997423266615e-06, "loss": 0.0042, "num_tokens": 5857856.0, "reward": 0.86260986328125, "reward_std": 0.015107907354831696, "rewards//mean": 0.86260986328125, "rewards//std": 0.019370052963495255, "step": 898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1798, "grad_norm": 0.6678747534751892, "kl": 0.04167328402400017, "learning_rate": 4.646586870449266e-06, "loss": 0.0042, "num_tokens": 5864392.0, "reward": 0.8489990234375, "reward_std": 0.010178307071328163, "rewards//mean": 0.8489990234375, "rewards//std": 0.03089313581585884, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.18, "grad_norm": 0.6929616332054138, "kl": 0.039298949064686894, "learning_rate": 4.645773133926936e-06, "loss": 0.0039, "num_tokens": 5870864.0, "reward": 0.803955078125, "reward_std": 0.011242110282182693, "rewards//mean": 0.803955078125, "rewards//std": 0.018918653950095177, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1802, "grad_norm": 0.6821906566619873, "kl": 0.03505802736617625, "learning_rate": 4.644958533087443e-06, "loss": 0.0035, "num_tokens": 5877504.0, "reward": 0.86138916015625, "reward_std": 0.011615011841058731, "rewards//mean": 0.86138916015625, "rewards//std": 0.023958658799529076, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1804, "grad_norm": 0.675546407699585, "kl": 0.04049504571594298, "learning_rate": 4.64414306825891e-06, "loss": 0.004, "num_tokens": 5884008.0, "reward": 0.86346435546875, "reward_std": 0.017759256064891815, "rewards//mean": 0.86346435546875, "rewards//std": 0.03580428287386894, "step": 902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1806, "grad_norm": 0.6656494140625, "kl": 0.04367375490255654, "learning_rate": 4.643326739769805e-06, "loss": 0.0044, "num_tokens": 5890592.0, "reward": 0.853515625, "reward_std": 0.011980659328401089, "rewards//mean": 0.853515625, "rewards//std": 0.025094378739595413, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1808, "grad_norm": 0.6163875460624695, "kl": 0.04099143436178565, "learning_rate": 4.642509547948947e-06, "loss": 0.0041, "num_tokens": 5897120.0, "reward": 0.8583984375, "reward_std": 0.015584884211421013, "rewards//mean": 0.8583984375, "rewards//std": 0.031024422496557236, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.181, "grad_norm": 0.6468968391418457, "kl": 0.04323425958864391, "learning_rate": 4.6416914931254984e-06, "loss": 0.0043, "num_tokens": 5903480.0, "reward": 0.8427734375, "reward_std": 0.010204439982771873, "rewards//mean": 0.8427734375, "rewards//std": 0.022053200751543045, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1812, "grad_norm": 0.6285822987556458, "kl": 0.042249388294294477, "learning_rate": 4.640872575628973e-06, "loss": 0.0042, "num_tokens": 5909928.0, "reward": 0.8438720703125, "reward_std": 0.014041738584637642, "rewards//mean": 0.8438720703125, "rewards//std": 0.026087839156389236, "step": 906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1814, "grad_norm": 0.6541216373443604, "kl": 0.03615096234716475, "learning_rate": 4.6400527957892295e-06, "loss": 0.0036, "num_tokens": 5916424.0, "reward": 0.83563232421875, "reward_std": 0.011880407109856606, "rewards//mean": 0.83563232421875, "rewards//std": 0.022275084629654884, "step": 907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1816, "grad_norm": 0.6855373382568359, "kl": 0.041469790041446686, "learning_rate": 4.639232153936476e-06, "loss": 0.0041, "num_tokens": 5922968.0, "reward": 0.79296875, "reward_std": 0.008580282330513, "rewards//mean": 0.79296875, "rewards//std": 0.01980830356478691, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1818, "grad_norm": 0.6281968355178833, "kl": 0.039629788370803, "learning_rate": 4.638410650401267e-06, "loss": 0.004, "num_tokens": 5929408.0, "reward": 0.8785400390625, "reward_std": 0.01470201276242733, "rewards//mean": 0.8785400390625, "rewards//std": 0.031278807669878006, "step": 909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.182, "grad_norm": 0.7264119386672974, "kl": 0.046727875247597694, "learning_rate": 4.637588285514504e-06, "loss": 0.0047, "num_tokens": 5935848.0, "reward": 0.81976318359375, "reward_std": 0.00966187845915556, "rewards//mean": 0.81976318359375, "rewards//std": 0.020947733893990517, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1822, "grad_norm": 0.6796708703041077, "kl": 0.04055432416498661, "learning_rate": 4.636765059607434e-06, "loss": 0.0041, "num_tokens": 5942368.0, "reward": 0.85064697265625, "reward_std": 0.014742225408554077, "rewards//mean": 0.85064697265625, "rewards//std": 0.0238548144698143, "step": 911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1824, "grad_norm": 0.5940519571304321, "kl": 0.04338231682777405, "learning_rate": 4.6359409730116546e-06, "loss": 0.0043, "num_tokens": 5948928.0, "reward": 0.83978271484375, "reward_std": 0.010560210794210434, "rewards//mean": 0.83978271484375, "rewards//std": 0.01866886019706726, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1826, "grad_norm": 0.715964674949646, "kl": 0.053954784758388996, "learning_rate": 4.635116026059107e-06, "loss": 0.0054, "num_tokens": 5955528.0, "reward": 0.82318115234375, "reward_std": 0.014161437749862671, "rewards//mean": 0.82318115234375, "rewards//std": 0.022154470905661583, "step": 913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1828, "grad_norm": 0.8227600455284119, "kl": 0.04878250975161791, "learning_rate": 4.634290219082078e-06, "loss": 0.0049, "num_tokens": 5962024.0, "reward": 0.84210205078125, "reward_std": 0.011272008530795574, "rewards//mean": 0.84210205078125, "rewards//std": 0.02768971212208271, "step": 914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.183, "grad_norm": 0.6412074565887451, "kl": 0.036306142108514905, "learning_rate": 4.633463552413205e-06, "loss": 0.0036, "num_tokens": 5968568.0, "reward": 0.8790283203125, "reward_std": 0.008153083734214306, "rewards//mean": 0.8790283203125, "rewards//std": 0.024890564382076263, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1832, "grad_norm": 0.6921270489692688, "kl": 0.04746776120737195, "learning_rate": 4.632636026385468e-06, "loss": 0.0047, "num_tokens": 5975152.0, "reward": 0.86175537109375, "reward_std": 0.013499973341822624, "rewards//mean": 0.86175537109375, "rewards//std": 0.02922024019062519, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1834, "grad_norm": 0.6777154207229614, "kl": 0.043070978252217174, "learning_rate": 4.631807641332195e-06, "loss": 0.0043, "num_tokens": 5981696.0, "reward": 0.85870361328125, "reward_std": 0.01207971666008234, "rewards//mean": 0.85870361328125, "rewards//std": 0.030101627111434937, "step": 917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1836, "grad_norm": 0.7225809097290039, "kl": 0.03947909642010927, "learning_rate": 4.630978397587058e-06, "loss": 0.0039, "num_tokens": 5988504.0, "reward": 0.85260009765625, "reward_std": 0.015898864716291428, "rewards//mean": 0.85260009765625, "rewards//std": 0.039611972868442535, "step": 918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1838, "grad_norm": 0.6816273331642151, "kl": 0.043278159108012915, "learning_rate": 4.630148295484078e-06, "loss": 0.0043, "num_tokens": 5995032.0, "reward": 0.842041015625, "reward_std": 0.010010555386543274, "rewards//mean": 0.842041015625, "rewards//std": 0.023937247693538666, "step": 919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.184, "grad_norm": 0.7362410426139832, "kl": 0.046471588080748916, "learning_rate": 4.62931733535762e-06, "loss": 0.0046, "num_tokens": 6001544.0, "reward": 0.87420654296875, "reward_std": 0.014177069067955017, "rewards//mean": 0.87420654296875, "rewards//std": 0.028630439192056656, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1842, "grad_norm": 0.6675663590431213, "kl": 0.03981461701914668, "learning_rate": 4.628485517542393e-06, "loss": 0.004, "num_tokens": 6008048.0, "reward": 0.8458251953125, "reward_std": 0.00875449925661087, "rewards//mean": 0.8458251953125, "rewards//std": 0.021909615024924278, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1844, "grad_norm": 0.781726062297821, "kl": 0.04557593772187829, "learning_rate": 4.627652842373454e-06, "loss": 0.0046, "num_tokens": 6014448.0, "reward": 0.8353271484375, "reward_std": 0.015135039575397968, "rewards//mean": 0.8353271484375, "rewards//std": 0.030355332419276237, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1846, "grad_norm": 1.0834941864013672, "kl": 0.04436720535159111, "learning_rate": 4.626819310186204e-06, "loss": 0.0044, "num_tokens": 6020960.0, "reward": 0.89306640625, "reward_std": 0.011011643335223198, "rewards//mean": 0.89306640625, "rewards//std": 0.01766917295753956, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1848, "grad_norm": 0.738615870475769, "kl": 0.04177575511857867, "learning_rate": 4.625984921316392e-06, "loss": 0.0042, "num_tokens": 6027448.0, "reward": 0.87786865234375, "reward_std": 0.012972611002624035, "rewards//mean": 0.87786865234375, "rewards//std": 0.017829405143857002, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.185, "grad_norm": 0.6873884201049805, "kl": 0.04864580603316426, "learning_rate": 4.625149676100107e-06, "loss": 0.0049, "num_tokens": 6034000.0, "reward": 0.847900390625, "reward_std": 0.012141989544034004, "rewards//mean": 0.847900390625, "rewards//std": 0.023866314440965652, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1852, "grad_norm": 0.7753527760505676, "kl": 0.042073047487065196, "learning_rate": 4.624313574873787e-06, "loss": 0.0042, "num_tokens": 6040528.0, "reward": 0.85040283203125, "reward_std": 0.013579750433564186, "rewards//mean": 0.85040283203125, "rewards//std": 0.03724491596221924, "step": 926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1854, "grad_norm": 0.6963757276535034, "kl": 0.05082692578434944, "learning_rate": 4.623476617974212e-06, "loss": 0.0051, "num_tokens": 6047024.0, "reward": 0.7879638671875, "reward_std": 0.01067714486271143, "rewards//mean": 0.7879638671875, "rewards//std": 0.02702271193265915, "step": 927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1856, "grad_norm": 0.6761671900749207, "kl": 0.0422394210472703, "learning_rate": 4.62263880573851e-06, "loss": 0.0042, "num_tokens": 6053544.0, "reward": 0.82110595703125, "reward_std": 0.01217557117342949, "rewards//mean": 0.82110595703125, "rewards//std": 0.016807610169053078, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1858, "grad_norm": 0.7448234558105469, "kl": 0.0404583215713501, "learning_rate": 4.6218001385041504e-06, "loss": 0.004, "num_tokens": 6060168.0, "reward": 0.88616943359375, "reward_std": 0.008901288732886314, "rewards//mean": 0.88616943359375, "rewards//std": 0.014124834910035133, "step": 929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.186, "grad_norm": 0.6166906356811523, "kl": 0.04047831706702709, "learning_rate": 4.6209606166089495e-06, "loss": 0.004, "num_tokens": 6066720.0, "reward": 0.82843017578125, "reward_std": 0.012854673899710178, "rewards//mean": 0.82843017578125, "rewards//std": 0.026895461603999138, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1862, "grad_norm": 0.8489100933074951, "kl": 0.0454192163888365, "learning_rate": 4.620120240391065e-06, "loss": 0.0045, "num_tokens": 6073216.0, "reward": 0.79522705078125, "reward_std": 0.008993346244096756, "rewards//mean": 0.79522705078125, "rewards//std": 0.01763218827545643, "step": 931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1864, "grad_norm": 0.65506911277771, "kl": 0.039396191481500864, "learning_rate": 4.619279010189002e-06, "loss": 0.0039, "num_tokens": 6079744.0, "reward": 0.82257080078125, "reward_std": 0.008998468518257141, "rewards//mean": 0.82257080078125, "rewards//std": 0.018332570791244507, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1866, "grad_norm": 0.6887573599815369, "kl": 0.04090560902841389, "learning_rate": 4.618436926341607e-06, "loss": 0.0041, "num_tokens": 6086280.0, "reward": 0.871337890625, "reward_std": 0.014091781340539455, "rewards//mean": 0.871337890625, "rewards//std": 0.019610146060585976, "step": 933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1868, "grad_norm": 0.6609535813331604, "kl": 0.03739321604371071, "learning_rate": 4.617593989188071e-06, "loss": 0.0037, "num_tokens": 6092832.0, "reward": 0.8531494140625, "reward_std": 0.013474280014634132, "rewards//mean": 0.8531494140625, "rewards//std": 0.030062692239880562, "step": 934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.187, "grad_norm": 0.7688198089599609, "kl": 0.0410981688182801, "learning_rate": 4.616750199067929e-06, "loss": 0.0041, "num_tokens": 6099320.0, "reward": 0.85919189453125, "reward_std": 0.013847172260284424, "rewards//mean": 0.85919189453125, "rewards//std": 0.025952504947781563, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1872, "grad_norm": 0.643115758895874, "kl": 0.04064711183309555, "learning_rate": 4.615905556321061e-06, "loss": 0.0041, "num_tokens": 6105904.0, "reward": 0.8336181640625, "reward_std": 0.012946332804858685, "rewards//mean": 0.8336181640625, "rewards//std": 0.01982930861413479, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1874, "grad_norm": 0.7157496809959412, "kl": 0.041027721017599106, "learning_rate": 4.615060061287688e-06, "loss": 0.0041, "num_tokens": 6112448.0, "reward": 0.8719482421875, "reward_std": 0.017627859488129616, "rewards//mean": 0.8719482421875, "rewards//std": 0.030068732798099518, "step": 937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1876, "grad_norm": 0.7336728572845459, "kl": 0.04614663729444146, "learning_rate": 4.614213714308374e-06, "loss": 0.0046, "num_tokens": 6118968.0, "reward": 0.82769775390625, "reward_std": 0.00945833045989275, "rewards//mean": 0.82769775390625, "rewards//std": 0.01365844439715147, "step": 938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1878, "grad_norm": 0.6875671148300171, "kl": 0.06045741937123239, "learning_rate": 4.6133665157240306e-06, "loss": 0.006, "num_tokens": 6125440.0, "reward": 0.875244140625, "reward_std": 0.01146995835006237, "rewards//mean": 0.875244140625, "rewards//std": 0.03147299960255623, "step": 939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.188, "grad_norm": 0.6396803855895996, "kl": 0.03798764920793474, "learning_rate": 4.612518465875906e-06, "loss": 0.0038, "num_tokens": 6131888.0, "reward": 0.84844970703125, "reward_std": 0.013945532031357288, "rewards//mean": 0.84844970703125, "rewards//std": 0.03200189769268036, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1882, "grad_norm": 0.683394193649292, "kl": 0.04144331160932779, "learning_rate": 4.611669565105597e-06, "loss": 0.0041, "num_tokens": 6138344.0, "reward": 0.85504150390625, "reward_std": 0.011722896248102188, "rewards//mean": 0.85504150390625, "rewards//std": 0.025172600522637367, "step": 941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1884, "grad_norm": 0.6219683885574341, "kl": 0.042291250778362155, "learning_rate": 4.610819813755038e-06, "loss": 0.0042, "num_tokens": 6144864.0, "reward": 0.865966796875, "reward_std": 0.016607439145445824, "rewards//mean": 0.865966796875, "rewards//std": 0.031426794826984406, "step": 942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1886, "grad_norm": 0.6586542725563049, "kl": 0.03660945198498666, "learning_rate": 4.609969212166512e-06, "loss": 0.0037, "num_tokens": 6151488.0, "reward": 0.8427734375, "reward_std": 0.010870680212974548, "rewards//mean": 0.8427734375, "rewards//std": 0.01233302429318428, "step": 943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1888, "grad_norm": 0.6816508769989014, "kl": 0.03906205715611577, "learning_rate": 4.609117760682639e-06, "loss": 0.0039, "num_tokens": 6158024.0, "reward": 0.82928466796875, "reward_std": 0.010280165821313858, "rewards//mean": 0.82928466796875, "rewards//std": 0.02844797447323799, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.189, "grad_norm": 0.6851620674133301, "kl": 0.04188694804906845, "learning_rate": 4.608265459646384e-06, "loss": 0.0042, "num_tokens": 6164552.0, "reward": 0.8612060546875, "reward_std": 0.012371834367513657, "rewards//mean": 0.8612060546875, "rewards//std": 0.02456735260784626, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1892, "grad_norm": 0.7054226994514465, "kl": 0.03868069825693965, "learning_rate": 4.607412309401054e-06, "loss": 0.0039, "num_tokens": 6171072.0, "reward": 0.843505859375, "reward_std": 0.013521338813006878, "rewards//mean": 0.843505859375, "rewards//std": 0.029321666806936264, "step": 946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1894, "grad_norm": 0.6950118541717529, "kl": 0.0383919773157686, "learning_rate": 4.606558310290298e-06, "loss": 0.0038, "num_tokens": 6177488.0, "reward": 0.79071044921875, "reward_std": 0.00887228175997734, "rewards//mean": 0.79071044921875, "rewards//std": 0.017130540683865547, "step": 947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1896, "grad_norm": 0.712725818157196, "kl": 0.05234731128439307, "learning_rate": 4.605703462658107e-06, "loss": 0.0052, "num_tokens": 6183992.0, "reward": 0.85357666015625, "reward_std": 0.0174628384411335, "rewards//mean": 0.85357666015625, "rewards//std": 0.03561139106750488, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1898, "grad_norm": 0.7520847916603088, "kl": 0.04160321340896189, "learning_rate": 4.604847766848812e-06, "loss": 0.0042, "num_tokens": 6190416.0, "reward": 0.87713623046875, "reward_std": 0.015189571306109428, "rewards//mean": 0.87713623046875, "rewards//std": 0.02517079748213291, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.19, "grad_norm": 0.6863551735877991, "kl": 0.04947540210559964, "learning_rate": 4.60399122320709e-06, "loss": 0.0049, "num_tokens": 6196872.0, "reward": 0.8363037109375, "reward_std": 0.015436182729899883, "rewards//mean": 0.8363037109375, "rewards//std": 0.020560890436172485, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1902, "grad_norm": 0.6834877729415894, "kl": 0.045546500477939844, "learning_rate": 4.603133832077953e-06, "loss": 0.0046, "num_tokens": 6203424.0, "reward": 0.86474609375, "reward_std": 0.012943100184202194, "rewards//mean": 0.86474609375, "rewards//std": 0.04526362195611, "step": 951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1904, "grad_norm": 0.7253804802894592, "kl": 0.04165040701627731, "learning_rate": 4.602275593806761e-06, "loss": 0.0042, "num_tokens": 6209920.0, "reward": 0.85626220703125, "reward_std": 0.015964912250638008, "rewards//mean": 0.85626220703125, "rewards//std": 0.02952994965016842, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1906, "grad_norm": 0.6976743936538696, "kl": 0.04115398577414453, "learning_rate": 4.601416508739211e-06, "loss": 0.0041, "num_tokens": 6216496.0, "reward": 0.82269287109375, "reward_std": 0.017609048634767532, "rewards//mean": 0.82269287109375, "rewards//std": 0.026324884966015816, "step": 953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1908, "grad_norm": 0.6280128359794617, "kl": 0.04923348594456911, "learning_rate": 4.600556577221342e-06, "loss": 0.0049, "num_tokens": 6223056.0, "reward": 0.83355712890625, "reward_std": 0.012344447895884514, "rewards//mean": 0.83355712890625, "rewards//std": 0.02510998211801052, "step": 954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.191, "grad_norm": 0.7094703316688538, "kl": 0.04144132649526, "learning_rate": 4.599695799599537e-06, "loss": 0.0041, "num_tokens": 6229536.0, "reward": 0.86810302734375, "reward_std": 0.01247455459088087, "rewards//mean": 0.86810302734375, "rewards//std": 0.022549975663423538, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1912, "grad_norm": 0.7454099655151367, "kl": 0.04297591699287295, "learning_rate": 4.5988341762205125e-06, "loss": 0.0043, "num_tokens": 6236112.0, "reward": 0.8541259765625, "reward_std": 0.01617106795310974, "rewards//mean": 0.8541259765625, "rewards//std": 0.029840312898159027, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1914, "grad_norm": 0.7461182475090027, "kl": 0.04712600726634264, "learning_rate": 4.5979717074313336e-06, "loss": 0.0047, "num_tokens": 6242616.0, "reward": 0.8643798828125, "reward_std": 0.009678803384304047, "rewards//mean": 0.8643798828125, "rewards//std": 0.0312691293656826, "step": 957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1916, "grad_norm": 0.6835668683052063, "kl": 0.04492692369967699, "learning_rate": 4.5971083935794026e-06, "loss": 0.0045, "num_tokens": 6249200.0, "reward": 0.84637451171875, "reward_std": 0.011677243746817112, "rewards//mean": 0.84637451171875, "rewards//std": 0.03571368753910065, "step": 958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1918, "grad_norm": 0.7025179862976074, "kl": 0.038464770652353764, "learning_rate": 4.5962442350124605e-06, "loss": 0.0038, "num_tokens": 6255744.0, "reward": 0.818115234375, "reward_std": 0.00972269382327795, "rewards//mean": 0.818115234375, "rewards//std": 0.029014429077506065, "step": 959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.192, "grad_norm": 0.7778719663619995, "kl": 0.04615939827635884, "learning_rate": 4.595379232078592e-06, "loss": 0.0046, "num_tokens": 6262224.0, "reward": 0.84735107421875, "reward_std": 0.017559725791215897, "rewards//mean": 0.84735107421875, "rewards//std": 0.021091047674417496, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1922, "grad_norm": 0.6818391680717468, "kl": 0.05492235207930207, "learning_rate": 4.5945133851262185e-06, "loss": 0.0055, "num_tokens": 6268784.0, "reward": 0.83404541015625, "reward_std": 0.011023140512406826, "rewards//mean": 0.83404541015625, "rewards//std": 0.021553227677941322, "step": 961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1924, "grad_norm": 0.7694291472434998, "kl": 0.05033859750255942, "learning_rate": 4.593646694504105e-06, "loss": 0.005, "num_tokens": 6275304.0, "reward": 0.8218994140625, "reward_std": 0.011773250997066498, "rewards//mean": 0.8218994140625, "rewards//std": 0.020430902019143105, "step": 962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1926, "grad_norm": 0.692175567150116, "kl": 0.048755659721791744, "learning_rate": 4.5927791605613525e-06, "loss": 0.0049, "num_tokens": 6281896.0, "reward": 0.83563232421875, "reward_std": 0.013875285163521767, "rewards//mean": 0.83563232421875, "rewards//std": 0.02050882764160633, "step": 963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1928, "grad_norm": 0.7087896466255188, "kl": 0.04856082936748862, "learning_rate": 4.591910783647405e-06, "loss": 0.0049, "num_tokens": 6288408.0, "reward": 0.85595703125, "reward_std": 0.012770500034093857, "rewards//mean": 0.85595703125, "rewards//std": 0.021378425881266594, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.193, "grad_norm": 0.6834204196929932, "kl": 0.05175302270799875, "learning_rate": 4.591041564112043e-06, "loss": 0.0052, "num_tokens": 6294968.0, "reward": 0.85260009765625, "reward_std": 0.017285553738474846, "rewards//mean": 0.85260009765625, "rewards//std": 0.0240419153124094, "step": 965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1932, "grad_norm": 0.69572913646698, "kl": 0.044029935728758574, "learning_rate": 4.59017150230539e-06, "loss": 0.0044, "num_tokens": 6301496.0, "reward": 0.8465576171875, "reward_std": 0.013601857237517834, "rewards//mean": 0.8465576171875, "rewards//std": 0.03215508535504341, "step": 966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1934, "grad_norm": 0.8224660754203796, "kl": 0.04022330488078296, "learning_rate": 4.589300598577906e-06, "loss": 0.004, "num_tokens": 6308064.0, "reward": 0.836181640625, "reward_std": 0.008823839016258717, "rewards//mean": 0.836181640625, "rewards//std": 0.02338447794318199, "step": 967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1936, "grad_norm": 0.7732920050621033, "kl": 0.04558022366836667, "learning_rate": 4.58842885328039e-06, "loss": 0.0046, "num_tokens": 6314568.0, "reward": 0.83984375, "reward_std": 0.013668116182088852, "rewards//mean": 0.83984375, "rewards//std": 0.03754492849111557, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1938, "grad_norm": 0.6807088851928711, "kl": 0.05070904782041907, "learning_rate": 4.587556266763982e-06, "loss": 0.0051, "num_tokens": 6321104.0, "reward": 0.77545166015625, "reward_std": 0.009375620633363724, "rewards//mean": 0.77545166015625, "rewards//std": 0.01899680867791176, "step": 969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.194, "grad_norm": 0.6597617268562317, "kl": 0.04255011701025069, "learning_rate": 4.586682839380159e-06, "loss": 0.0043, "num_tokens": 6327720.0, "reward": 0.8553466796875, "reward_std": 0.014259079471230507, "rewards//mean": 0.8553466796875, "rewards//std": 0.0322866328060627, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1942, "grad_norm": 0.6452547907829285, "kl": 0.04155507404357195, "learning_rate": 4.585808571480739e-06, "loss": 0.0042, "num_tokens": 6334232.0, "reward": 0.83111572265625, "reward_std": 0.009173991158604622, "rewards//mean": 0.83111572265625, "rewards//std": 0.018265563994646072, "step": 971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1944, "grad_norm": 0.6843393445014954, "kl": 0.04133532801643014, "learning_rate": 4.584933463417874e-06, "loss": 0.0041, "num_tokens": 6340720.0, "reward": 0.83685302734375, "reward_std": 0.012771625071763992, "rewards//mean": 0.83685302734375, "rewards//std": 0.020036159083247185, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1946, "grad_norm": 0.6603941917419434, "kl": 0.052238917676731944, "learning_rate": 4.584057515544061e-06, "loss": 0.0052, "num_tokens": 6347392.0, "reward": 0.8248291015625, "reward_std": 0.012165896594524384, "rewards//mean": 0.8248291015625, "rewards//std": 0.030538296326994896, "step": 973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1948, "grad_norm": 0.6929309368133545, "kl": 0.04384617321193218, "learning_rate": 4.583180728212128e-06, "loss": 0.0044, "num_tokens": 6353848.0, "reward": 0.88787841796875, "reward_std": 0.0120087293908, "rewards//mean": 0.88787841796875, "rewards//std": 0.01843632012605667, "step": 974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.195, "grad_norm": 0.6984637379646301, "kl": 0.03931608865968883, "learning_rate": 4.582303101775249e-06, "loss": 0.0039, "num_tokens": 6360504.0, "reward": 0.82904052734375, "reward_std": 0.011513952165842056, "rewards//mean": 0.82904052734375, "rewards//std": 0.033347733318805695, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1952, "grad_norm": 0.6983664631843567, "kl": 0.047918176744133234, "learning_rate": 4.5814246365869285e-06, "loss": 0.0048, "num_tokens": 6367160.0, "reward": 0.85308837890625, "reward_std": 0.012053456157445908, "rewards//mean": 0.85308837890625, "rewards//std": 0.030966732650995255, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1954, "grad_norm": 0.6621443033218384, "kl": 0.045792046934366226, "learning_rate": 4.580545333001014e-06, "loss": 0.0046, "num_tokens": 6373640.0, "reward": 0.84912109375, "reward_std": 0.012664642184972763, "rewards//mean": 0.84912109375, "rewards//std": 0.025794023647904396, "step": 977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1956, "grad_norm": 0.7002736926078796, "kl": 0.040672547183930874, "learning_rate": 4.579665191371687e-06, "loss": 0.0041, "num_tokens": 6380208.0, "reward": 0.860107421875, "reward_std": 0.017162654548883438, "rewards//mean": 0.860107421875, "rewards//std": 0.03360193222761154, "step": 978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1958, "grad_norm": 0.6573413610458374, "kl": 0.043526260647922754, "learning_rate": 4.578784212053471e-06, "loss": 0.0044, "num_tokens": 6386752.0, "reward": 0.85009765625, "reward_std": 0.013537352904677391, "rewards//mean": 0.85009765625, "rewards//std": 0.015786921605467796, "step": 979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.196, "grad_norm": 0.6943175792694092, "kl": 0.05249407817609608, "learning_rate": 4.577902395401222e-06, "loss": 0.0052, "num_tokens": 6393320.0, "reward": 0.83551025390625, "reward_std": 0.011513197794556618, "rewards//mean": 0.83551025390625, "rewards//std": 0.032235193997621536, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1962, "grad_norm": 0.6779718399047852, "kl": 0.04732549702748656, "learning_rate": 4.577019741770137e-06, "loss": 0.0047, "num_tokens": 6399824.0, "reward": 0.83935546875, "reward_std": 0.012707693502306938, "rewards//mean": 0.83935546875, "rewards//std": 0.026397312059998512, "step": 981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1964, "grad_norm": 0.7734324336051941, "kl": 0.05470086168497801, "learning_rate": 4.576136251515748e-06, "loss": 0.0055, "num_tokens": 6406368.0, "reward": 0.7996826171875, "reward_std": 0.010812653228640556, "rewards//mean": 0.7996826171875, "rewards//std": 0.022728921845555305, "step": 982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1966, "grad_norm": 0.7584691643714905, "kl": 0.049642365891486406, "learning_rate": 4.575251924993926e-06, "loss": 0.005, "num_tokens": 6412840.0, "reward": 0.87274169921875, "reward_std": 0.016802702099084854, "rewards//mean": 0.87274169921875, "rewards//std": 0.020981669425964355, "step": 983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1968, "grad_norm": 0.6663657426834106, "kl": 0.05738833174109459, "learning_rate": 4.574366762560876e-06, "loss": 0.0057, "num_tokens": 6419368.0, "reward": 0.8297119140625, "reward_std": 0.014144438318908215, "rewards//mean": 0.8297119140625, "rewards//std": 0.02907150238752365, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.197, "grad_norm": 0.6651089191436768, "kl": 0.0494233095087111, "learning_rate": 4.573480764573143e-06, "loss": 0.0049, "num_tokens": 6425872.0, "reward": 0.85003662109375, "reward_std": 0.011118832975625992, "rewards//mean": 0.85003662109375, "rewards//std": 0.022995345294475555, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1972, "grad_norm": 0.6475024223327637, "kl": 0.04345037881284952, "learning_rate": 4.572593931387604e-06, "loss": 0.0043, "num_tokens": 6432344.0, "reward": 0.819091796875, "reward_std": 0.013030166737735271, "rewards//mean": 0.819091796875, "rewards//std": 0.03384611755609512, "step": 986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1974, "grad_norm": 0.6717162728309631, "kl": 0.04292596550658345, "learning_rate": 4.571706263361479e-06, "loss": 0.0043, "num_tokens": 6438768.0, "reward": 0.8155517578125, "reward_std": 0.012478616088628769, "rewards//mean": 0.8155517578125, "rewards//std": 0.017322655767202377, "step": 987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1976, "grad_norm": 0.7700872421264648, "kl": 0.050918920896947384, "learning_rate": 4.570817760852319e-06, "loss": 0.0051, "num_tokens": 6445376.0, "reward": 0.80120849609375, "reward_std": 0.01503811962902546, "rewards//mean": 0.80120849609375, "rewards//std": 0.025027859956026077, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1978, "grad_norm": 0.695588231086731, "kl": 0.04219944030046463, "learning_rate": 4.569928424218012e-06, "loss": 0.0042, "num_tokens": 6451944.0, "reward": 0.8466796875, "reward_std": 0.01621868647634983, "rewards//mean": 0.8466796875, "rewards//std": 0.03559103608131409, "step": 989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.198, "grad_norm": 0.7971217036247253, "kl": 0.04319071024656296, "learning_rate": 4.569038253816783e-06, "loss": 0.0043, "num_tokens": 6458592.0, "reward": 0.85443115234375, "reward_std": 0.016578156501054764, "rewards//mean": 0.85443115234375, "rewards//std": 0.03290539234876633, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1982, "grad_norm": 0.7145017981529236, "kl": 0.047632794827222824, "learning_rate": 4.5681472500071935e-06, "loss": 0.0048, "num_tokens": 6465136.0, "reward": 0.7896728515625, "reward_std": 0.012131381779909134, "rewards//mean": 0.7896728515625, "rewards//std": 0.021147403866052628, "step": 991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1984, "grad_norm": 0.6252520084381104, "kl": 0.04791990341618657, "learning_rate": 4.567255413148139e-06, "loss": 0.0048, "num_tokens": 6471728.0, "reward": 0.8143310546875, "reward_std": 0.01138047780841589, "rewards//mean": 0.8143310546875, "rewards//std": 0.018662068992853165, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1986, "grad_norm": 0.6275078058242798, "kl": 0.047070488799363375, "learning_rate": 4.566362743598851e-06, "loss": 0.0047, "num_tokens": 6478208.0, "reward": 0.8868408203125, "reward_std": 0.013486402109265327, "rewards//mean": 0.8868408203125, "rewards//std": 0.023591680452227592, "step": 993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1988, "grad_norm": 0.7225343585014343, "kl": 0.05764315905980766, "learning_rate": 4.565469241718896e-06, "loss": 0.0058, "num_tokens": 6484760.0, "reward": 0.810546875, "reward_std": 0.01867908425629139, "rewards//mean": 0.810546875, "rewards//std": 0.03193993121385574, "step": 994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.199, "grad_norm": 0.6515203714370728, "kl": 0.042084392393007874, "learning_rate": 4.564574907868179e-06, "loss": 0.0042, "num_tokens": 6491320.0, "reward": 0.79107666015625, "reward_std": 0.00986007135361433, "rewards//mean": 0.79107666015625, "rewards//std": 0.020763371139764786, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1992, "grad_norm": 0.6625439524650574, "kl": 0.044497475028038025, "learning_rate": 4.563679742406935e-06, "loss": 0.0044, "num_tokens": 6497864.0, "reward": 0.84674072265625, "reward_std": 0.016481919214129448, "rewards//mean": 0.84674072265625, "rewards//std": 0.03052658401429653, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1994, "grad_norm": 0.7319129109382629, "kl": 0.04618475493043661, "learning_rate": 4.562783745695738e-06, "loss": 0.0046, "num_tokens": 6504464.0, "reward": 0.81353759765625, "reward_std": 0.015955062583088875, "rewards//mean": 0.81353759765625, "rewards//std": 0.028766000643372536, "step": 997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1996, "grad_norm": 0.6688764095306396, "kl": 0.040976963471621275, "learning_rate": 4.561886918095495e-06, "loss": 0.0041, "num_tokens": 6511160.0, "reward": 0.852294921875, "reward_std": 0.013901523314416409, "rewards//mean": 0.852294921875, "rewards//std": 0.026882583275437355, "step": 998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.1998, "grad_norm": 0.791424572467804, "kl": 0.07057768478989601, "learning_rate": 4.560989259967447e-06, "loss": 0.0071, "num_tokens": 6517640.0, "reward": 0.85760498046875, "reward_std": 0.017160706222057343, "rewards//mean": 0.85760498046875, "rewards//std": 0.02633695863187313, "step": 999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2, "grad_norm": 0.6725102066993713, "kl": 0.04978520795702934, "learning_rate": 4.560090771673174e-06, "loss": 0.005, "num_tokens": 6524192.0, "reward": 0.83251953125, "reward_std": 0.009535292163491249, "rewards//mean": 0.83251953125, "rewards//std": 0.033707186579704285, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2002, "grad_norm": 0.7156857252120972, "kl": 0.05547969276085496, "learning_rate": 4.559191453574582e-06, "loss": 0.0055, "num_tokens": 6530648.0, "reward": 0.8668212890625, "reward_std": 0.011887451633810997, "rewards//mean": 0.8668212890625, "rewards//std": 0.023834262043237686, "step": 1001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2004, "grad_norm": 0.6377259492874146, "kl": 0.043812160613015294, "learning_rate": 4.55829130603392e-06, "loss": 0.0044, "num_tokens": 6537216.0, "reward": 0.86077880859375, "reward_std": 0.00933208130300045, "rewards//mean": 0.86077880859375, "rewards//std": 0.02148992381989956, "step": 1002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2006, "grad_norm": 0.7118643522262573, "kl": 0.05888244602829218, "learning_rate": 4.557390329413765e-06, "loss": 0.0059, "num_tokens": 6543872.0, "reward": 0.856201171875, "reward_std": 0.01756351627409458, "rewards//mean": 0.856201171875, "rewards//std": 0.034833554178476334, "step": 1003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2008, "grad_norm": 0.7139714360237122, "kl": 0.052430893294513226, "learning_rate": 4.556488524077033e-06, "loss": 0.0052, "num_tokens": 6550400.0, "reward": 0.85186767578125, "reward_std": 0.011976618319749832, "rewards//mean": 0.85186767578125, "rewards//std": 0.03985618054866791, "step": 1004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.201, "grad_norm": 0.6759874224662781, "kl": 0.051580125698819757, "learning_rate": 4.555585890386969e-06, "loss": 0.0052, "num_tokens": 6556832.0, "reward": 0.833984375, "reward_std": 0.00952422060072422, "rewards//mean": 0.833984375, "rewards//std": 0.01940060593187809, "step": 1005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2012, "grad_norm": 0.796593427658081, "kl": 0.055535249412059784, "learning_rate": 4.554682428707153e-06, "loss": 0.0056, "num_tokens": 6563368.0, "reward": 0.84625244140625, "reward_std": 0.010847987607121468, "rewards//mean": 0.84625244140625, "rewards//std": 0.02817152813076973, "step": 1006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2014, "grad_norm": 0.6898808479309082, "kl": 0.04403488012030721, "learning_rate": 4.553778139401501e-06, "loss": 0.0044, "num_tokens": 6569888.0, "reward": 0.81536865234375, "reward_std": 0.010745392180979252, "rewards//mean": 0.81536865234375, "rewards//std": 0.024883493781089783, "step": 1007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2016, "grad_norm": 0.6899251937866211, "kl": 0.043834451120346785, "learning_rate": 4.55287302283426e-06, "loss": 0.0044, "num_tokens": 6576424.0, "reward": 0.84014892578125, "reward_std": 0.0147162564098835, "rewards//mean": 0.84014892578125, "rewards//std": 0.03132203221321106, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2018, "grad_norm": 0.7430453896522522, "kl": 0.04782510409131646, "learning_rate": 4.551967079370011e-06, "loss": 0.0048, "num_tokens": 6583016.0, "reward": 0.881591796875, "reward_std": 0.01338224671781063, "rewards//mean": 0.881591796875, "rewards//std": 0.023600982502102852, "step": 1009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.202, "grad_norm": 0.667633593082428, "kl": 0.04944427451118827, "learning_rate": 4.551060309373668e-06, "loss": 0.0049, "num_tokens": 6589576.0, "reward": 0.82391357421875, "reward_std": 0.012446447275578976, "rewards//mean": 0.82391357421875, "rewards//std": 0.02659662440419197, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2022, "grad_norm": 0.8124962449073792, "kl": 0.047788978554308414, "learning_rate": 4.550152713210478e-06, "loss": 0.0048, "num_tokens": 6596184.0, "reward": 0.840087890625, "reward_std": 0.013710832223296165, "rewards//mean": 0.840087890625, "rewards//std": 0.022701265290379524, "step": 1011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2024, "grad_norm": 0.6640636324882507, "kl": 0.05186305474489927, "learning_rate": 4.54924429124602e-06, "loss": 0.0052, "num_tokens": 6602688.0, "reward": 0.83074951171875, "reward_std": 0.011372091248631477, "rewards//mean": 0.83074951171875, "rewards//std": 0.024210695177316666, "step": 1012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2026, "grad_norm": 0.6470499038696289, "kl": 0.05514884181320667, "learning_rate": 4.5483350438462066e-06, "loss": 0.0055, "num_tokens": 6609200.0, "reward": 0.86224365234375, "reward_std": 0.015330832451581955, "rewards//mean": 0.86224365234375, "rewards//std": 0.02585725486278534, "step": 1013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2028, "grad_norm": 0.7708079218864441, "kl": 0.05913720210082829, "learning_rate": 4.547424971377282e-06, "loss": 0.0059, "num_tokens": 6615848.0, "reward": 0.86962890625, "reward_std": 0.01416373997926712, "rewards//mean": 0.86962890625, "rewards//std": 0.025943826884031296, "step": 1014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.203, "grad_norm": 0.7233020663261414, "kl": 0.04181193048134446, "learning_rate": 4.546514074205824e-06, "loss": 0.0042, "num_tokens": 6622392.0, "reward": 0.8388671875, "reward_std": 0.015545856207609177, "rewards//mean": 0.8388671875, "rewards//std": 0.025723503902554512, "step": 1015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2032, "grad_norm": 0.7091516852378845, "kl": 0.044875356601551175, "learning_rate": 4.545602352698742e-06, "loss": 0.0045, "num_tokens": 6628952.0, "reward": 0.84423828125, "reward_std": 0.017908543348312378, "rewards//mean": 0.84423828125, "rewards//std": 0.03759327903389931, "step": 1016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2034, "grad_norm": 0.8109896779060364, "kl": 0.05211695982143283, "learning_rate": 4.544689807223277e-06, "loss": 0.0052, "num_tokens": 6635496.0, "reward": 0.86138916015625, "reward_std": 0.010065915063023567, "rewards//mean": 0.86138916015625, "rewards//std": 0.02320895716547966, "step": 1017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2036, "grad_norm": 0.7074781656265259, "kl": 0.04632331570610404, "learning_rate": 4.543776438147002e-06, "loss": 0.0046, "num_tokens": 6641912.0, "reward": 0.80072021484375, "reward_std": 0.01151367835700512, "rewards//mean": 0.80072021484375, "rewards//std": 0.023203739896416664, "step": 1018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2038, "grad_norm": 0.6169856190681458, "kl": 0.05148485489189625, "learning_rate": 4.542862245837821e-06, "loss": 0.0051, "num_tokens": 6648400.0, "reward": 0.84613037109375, "reward_std": 0.01268923282623291, "rewards//mean": 0.84613037109375, "rewards//std": 0.030477948486804962, "step": 1019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.204, "grad_norm": 0.7163510918617249, "kl": 0.05498185846954584, "learning_rate": 4.541947230663973e-06, "loss": 0.0055, "num_tokens": 6654952.0, "reward": 0.85394287109375, "reward_std": 0.014796373434364796, "rewards//mean": 0.85394287109375, "rewards//std": 0.03642255812883377, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2042, "grad_norm": 0.6965911388397217, "kl": 0.053093669936060905, "learning_rate": 4.541031392994025e-06, "loss": 0.0053, "num_tokens": 6661416.0, "reward": 0.85162353515625, "reward_std": 0.015556513331830502, "rewards//mean": 0.85162353515625, "rewards//std": 0.023791274055838585, "step": 1021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2044, "grad_norm": 0.7249484658241272, "kl": 0.05137994000688195, "learning_rate": 4.540114733196875e-06, "loss": 0.0051, "num_tokens": 6668024.0, "reward": 0.84442138671875, "reward_std": 0.011441156268119812, "rewards//mean": 0.84442138671875, "rewards//std": 0.023923883214592934, "step": 1022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2046, "grad_norm": 0.7473930716514587, "kl": 0.04864614084362984, "learning_rate": 4.5391972516417545e-06, "loss": 0.0049, "num_tokens": 6674592.0, "reward": 0.82647705078125, "reward_std": 0.011105773970484734, "rewards//mean": 0.82647705078125, "rewards//std": 0.01851661130785942, "step": 1023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2048, "grad_norm": 0.7338449954986572, "kl": 0.05664684623479843, "learning_rate": 4.538278948698226e-06, "loss": 0.0057, "num_tokens": 6681128.0, "reward": 0.827880859375, "reward_std": 0.017563579604029655, "rewards//mean": 0.827880859375, "rewards//std": 0.04145614430308342, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.205, "grad_norm": 0.654859185218811, "kl": 0.05037943972274661, "learning_rate": 4.537359824736179e-06, "loss": 0.005, "num_tokens": 6687576.0, "reward": 0.85479736328125, "reward_std": 0.011732356622815132, "rewards//mean": 0.85479736328125, "rewards//std": 0.026951685547828674, "step": 1025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2052, "grad_norm": 0.6613022685050964, "kl": 0.054212228395044804, "learning_rate": 4.53643988012584e-06, "loss": 0.0054, "num_tokens": 6694136.0, "reward": 0.8226318359375, "reward_std": 0.010827379301190376, "rewards//mean": 0.8226318359375, "rewards//std": 0.020312009379267693, "step": 1026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2054, "grad_norm": 0.6739047765731812, "kl": 0.043983591720461845, "learning_rate": 4.53551911523776e-06, "loss": 0.0044, "num_tokens": 6700600.0, "reward": 0.861328125, "reward_std": 0.018970249220728874, "rewards//mean": 0.861328125, "rewards//std": 0.03623175993561745, "step": 1027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2056, "grad_norm": 0.6742681860923767, "kl": 0.05573314940556884, "learning_rate": 4.534597530442824e-06, "loss": 0.0056, "num_tokens": 6707096.0, "reward": 0.873291015625, "reward_std": 0.013601814396679401, "rewards//mean": 0.873291015625, "rewards//std": 0.022270411252975464, "step": 1028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2058, "grad_norm": 0.6766579747200012, "kl": 0.045060742646455765, "learning_rate": 4.5336751261122455e-06, "loss": 0.0045, "num_tokens": 6713672.0, "reward": 0.82659912109375, "reward_std": 0.012738298624753952, "rewards//mean": 0.82659912109375, "rewards//std": 0.025254253298044205, "step": 1029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.206, "grad_norm": 0.6950955986976624, "kl": 0.051106690894812346, "learning_rate": 4.5327519026175694e-06, "loss": 0.0051, "num_tokens": 6720128.0, "reward": 0.8707275390625, "reward_std": 0.011838306672871113, "rewards//mean": 0.8707275390625, "rewards//std": 0.02739877812564373, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2062, "grad_norm": 0.7250239253044128, "kl": 0.04865707317367196, "learning_rate": 4.53182786033067e-06, "loss": 0.0049, "num_tokens": 6726696.0, "reward": 0.78778076171875, "reward_std": 0.013004752807319164, "rewards//mean": 0.78778076171875, "rewards//std": 0.030899688601493835, "step": 1031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2064, "grad_norm": 0.6481121182441711, "kl": 0.04569572675973177, "learning_rate": 4.530902999623752e-06, "loss": 0.0046, "num_tokens": 6733320.0, "reward": 0.82281494140625, "reward_std": 0.0106744896620512, "rewards//mean": 0.82281494140625, "rewards//std": 0.016694650053977966, "step": 1032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2066, "grad_norm": 0.7017217874526978, "kl": 0.05128394206985831, "learning_rate": 4.529977320869349e-06, "loss": 0.0051, "num_tokens": 6739800.0, "reward": 0.84490966796875, "reward_std": 0.010644054040312767, "rewards//mean": 0.84490966796875, "rewards//std": 0.013490051962435246, "step": 1033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2068, "grad_norm": 0.6792444586753845, "kl": 0.04046568903140724, "learning_rate": 4.529050824440323e-06, "loss": 0.004, "num_tokens": 6746384.0, "reward": 0.8082275390625, "reward_std": 0.012271732091903687, "rewards//mean": 0.8082275390625, "rewards//std": 0.018942244350910187, "step": 1034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.207, "grad_norm": 0.6792720556259155, "kl": 0.05103640258312225, "learning_rate": 4.528123510709868e-06, "loss": 0.0051, "num_tokens": 6752968.0, "reward": 0.84246826171875, "reward_std": 0.012242881581187248, "rewards//mean": 0.84246826171875, "rewards//std": 0.021030671894550323, "step": 1035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2072, "grad_norm": 0.7758316993713379, "kl": 0.052819902542978525, "learning_rate": 4.527195380051505e-06, "loss": 0.0053, "num_tokens": 6759560.0, "reward": 0.77215576171875, "reward_std": 0.014756198972463608, "rewards//mean": 0.77215576171875, "rewards//std": 0.029074301943182945, "step": 1036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2074, "grad_norm": 0.6563489437103271, "kl": 0.04161089728586376, "learning_rate": 4.526266432839086e-06, "loss": 0.0042, "num_tokens": 6766128.0, "reward": 0.85589599609375, "reward_std": 0.013862330466508865, "rewards//mean": 0.85589599609375, "rewards//std": 0.02493089996278286, "step": 1037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2076, "grad_norm": 0.6082646250724792, "kl": 0.054137222934514284, "learning_rate": 4.525336669446789e-06, "loss": 0.0054, "num_tokens": 6772712.0, "reward": 0.8592529296875, "reward_std": 0.010362466797232628, "rewards//mean": 0.8592529296875, "rewards//std": 0.020983563736081123, "step": 1038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2078, "grad_norm": 0.7403309345245361, "kl": 0.052175013814121485, "learning_rate": 4.524406090249125e-06, "loss": 0.0052, "num_tokens": 6779248.0, "reward": 0.85736083984375, "reward_std": 0.01716860942542553, "rewards//mean": 0.85736083984375, "rewards//std": 0.02671191282570362, "step": 1039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.208, "grad_norm": 0.7403693795204163, "kl": 0.04759194469079375, "learning_rate": 4.5234746956209295e-06, "loss": 0.0048, "num_tokens": 6785784.0, "reward": 0.8343505859375, "reward_std": 0.0121842622756958, "rewards//mean": 0.8343505859375, "rewards//std": 0.017479244619607925, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2082, "grad_norm": 0.6689099669456482, "kl": 0.04842198523692787, "learning_rate": 4.522542485937369e-06, "loss": 0.0048, "num_tokens": 6792384.0, "reward": 0.85675048828125, "reward_std": 0.009583639912307262, "rewards//mean": 0.85675048828125, "rewards//std": 0.019035812467336655, "step": 1041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2084, "grad_norm": 0.6974130868911743, "kl": 0.049008972477167845, "learning_rate": 4.521609461573937e-06, "loss": 0.0049, "num_tokens": 6798912.0, "reward": 0.84893798828125, "reward_std": 0.010589668527245522, "rewards//mean": 0.84893798828125, "rewards//std": 0.02157639153301716, "step": 1042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2086, "grad_norm": 0.7175626754760742, "kl": 0.05459157284349203, "learning_rate": 4.520675622906455e-06, "loss": 0.0055, "num_tokens": 6805512.0, "reward": 0.83209228515625, "reward_std": 0.009936067275702953, "rewards//mean": 0.83209228515625, "rewards//std": 0.020165691152215004, "step": 1043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2088, "grad_norm": 0.7344629764556885, "kl": 0.05480337282642722, "learning_rate": 4.519740970311074e-06, "loss": 0.0055, "num_tokens": 6811976.0, "reward": 0.80059814453125, "reward_std": 0.01489953137934208, "rewards//mean": 0.80059814453125, "rewards//std": 0.030988723039627075, "step": 1044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.209, "grad_norm": 0.6893591284751892, "kl": 0.056181959342211485, "learning_rate": 4.518805504164272e-06, "loss": 0.0056, "num_tokens": 6818504.0, "reward": 0.87652587890625, "reward_std": 0.013555187731981277, "rewards//mean": 0.87652587890625, "rewards//std": 0.022953176870942116, "step": 1045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2092, "grad_norm": 0.7041027545928955, "kl": 0.04849055455997586, "learning_rate": 4.517869224842853e-06, "loss": 0.0048, "num_tokens": 6825080.0, "reward": 0.80621337890625, "reward_std": 0.010871358215808868, "rewards//mean": 0.80621337890625, "rewards//std": 0.03782525658607483, "step": 1046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2094, "grad_norm": 0.6842744946479797, "kl": 0.04927981551736593, "learning_rate": 4.516932132723953e-06, "loss": 0.0049, "num_tokens": 6831600.0, "reward": 0.845703125, "reward_std": 0.014329057186841965, "rewards//mean": 0.845703125, "rewards//std": 0.0255344957113266, "step": 1047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2096, "grad_norm": 0.6956482529640198, "kl": 0.04865870298817754, "learning_rate": 4.515994228185031e-06, "loss": 0.0049, "num_tokens": 6838096.0, "reward": 0.85894775390625, "reward_std": 0.010142069309949875, "rewards//mean": 0.85894775390625, "rewards//std": 0.028289495036005974, "step": 1048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2098, "grad_norm": 0.7180635929107666, "kl": 0.05673117144033313, "learning_rate": 4.5150555116038755e-06, "loss": 0.0057, "num_tokens": 6844696.0, "reward": 0.8359375, "reward_std": 0.01695888489484787, "rewards//mean": 0.8359375, "rewards//std": 0.026300795376300812, "step": 1049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.21, "grad_norm": 0.631371021270752, "kl": 0.05663994327187538, "learning_rate": 4.5141159833586e-06, "loss": 0.0057, "num_tokens": 6851232.0, "reward": 0.83294677734375, "reward_std": 0.011457376182079315, "rewards//mean": 0.83294677734375, "rewards//std": 0.024016717448830605, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2102, "grad_norm": 0.743635892868042, "kl": 0.04824688425287604, "learning_rate": 4.513175643827647e-06, "loss": 0.0048, "num_tokens": 6857768.0, "reward": 0.81158447265625, "reward_std": 0.014033078216016293, "rewards//mean": 0.81158447265625, "rewards//std": 0.027837470173835754, "step": 1051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2104, "grad_norm": 0.6559764742851257, "kl": 0.05190576473250985, "learning_rate": 4.512234493389785e-06, "loss": 0.0052, "num_tokens": 6864304.0, "reward": 0.80523681640625, "reward_std": 0.01037571206688881, "rewards//mean": 0.80523681640625, "rewards//std": 0.015960363671183586, "step": 1052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2106, "grad_norm": 0.7995378971099854, "kl": 0.05641445331275463, "learning_rate": 4.511292532424111e-06, "loss": 0.0056, "num_tokens": 6870800.0, "reward": 0.85882568359375, "reward_std": 0.014005579985678196, "rewards//mean": 0.85882568359375, "rewards//std": 0.032173145562410355, "step": 1053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2108, "grad_norm": 0.7648438215255737, "kl": 0.055406813975423574, "learning_rate": 4.510349761310046e-06, "loss": 0.0055, "num_tokens": 6877312.0, "reward": 0.83905029296875, "reward_std": 0.011487704701721668, "rewards//mean": 0.83905029296875, "rewards//std": 0.024960633367300034, "step": 1054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.211, "grad_norm": 0.6964836120605469, "kl": 0.05148274824023247, "learning_rate": 4.509406180427336e-06, "loss": 0.0051, "num_tokens": 6883832.0, "reward": 0.8211669921875, "reward_std": 0.01340387761592865, "rewards//mean": 0.8211669921875, "rewards//std": 0.0224824957549572, "step": 1055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2112, "grad_norm": 0.7067990899085999, "kl": 0.05086009297519922, "learning_rate": 4.508461790156057e-06, "loss": 0.0051, "num_tokens": 6890368.0, "reward": 0.8411865234375, "reward_std": 0.010597625747323036, "rewards//mean": 0.8411865234375, "rewards//std": 0.028221789747476578, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2114, "grad_norm": 0.7596552968025208, "kl": 0.05806903960183263, "learning_rate": 4.5075165908766095e-06, "loss": 0.0058, "num_tokens": 6896800.0, "reward": 0.868408203125, "reward_std": 0.020702209323644638, "rewards//mean": 0.868408203125, "rewards//std": 0.0394749753177166, "step": 1057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2116, "grad_norm": 0.7750549912452698, "kl": 0.05868978565558791, "learning_rate": 4.506570582969719e-06, "loss": 0.0059, "num_tokens": 6903288.0, "reward": 0.84307861328125, "reward_std": 0.01405446045100689, "rewards//mean": 0.84307861328125, "rewards//std": 0.03590644896030426, "step": 1058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2118, "grad_norm": 0.6291803121566772, "kl": 0.04893304267898202, "learning_rate": 4.505623766816438e-06, "loss": 0.0049, "num_tokens": 6909728.0, "reward": 0.81134033203125, "reward_std": 0.010957226157188416, "rewards//mean": 0.81134033203125, "rewards//std": 0.025052646175026894, "step": 1059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.212, "grad_norm": 0.722735583782196, "kl": 0.05405984679237008, "learning_rate": 4.504676142798143e-06, "loss": 0.0054, "num_tokens": 6916232.0, "reward": 0.81109619140625, "reward_std": 0.008852384053170681, "rewards//mean": 0.81109619140625, "rewards//std": 0.024243811145424843, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2122, "grad_norm": 0.6918269991874695, "kl": 0.04810579586774111, "learning_rate": 4.503727711296539e-06, "loss": 0.0048, "num_tokens": 6922768.0, "reward": 0.8367919921875, "reward_std": 0.009785614907741547, "rewards//mean": 0.8367919921875, "rewards//std": 0.023390628397464752, "step": 1061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2124, "grad_norm": 0.7062320709228516, "kl": 0.05486150225624442, "learning_rate": 4.502778472693651e-06, "loss": 0.0055, "num_tokens": 6929224.0, "reward": 0.86639404296875, "reward_std": 0.013502485118806362, "rewards//mean": 0.86639404296875, "rewards//std": 0.022565411403775215, "step": 1062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2126, "grad_norm": 0.8047633171081543, "kl": 0.05379184568300843, "learning_rate": 4.501828427371834e-06, "loss": 0.0054, "num_tokens": 6935752.0, "reward": 0.7978515625, "reward_std": 0.010705290362238884, "rewards//mean": 0.7978515625, "rewards//std": 0.023520028218626976, "step": 1063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2128, "grad_norm": 0.711575984954834, "kl": 0.05587818846106529, "learning_rate": 4.500877575713766e-06, "loss": 0.0056, "num_tokens": 6942280.0, "reward": 0.83258056640625, "reward_std": 0.01386846974492073, "rewards//mean": 0.83258056640625, "rewards//std": 0.023144949227571487, "step": 1064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.213, "grad_norm": 0.7528927326202393, "kl": 0.05176067887805402, "learning_rate": 4.4999259181024504e-06, "loss": 0.0052, "num_tokens": 6948880.0, "reward": 0.83349609375, "reward_std": 0.0130771454423666, "rewards//mean": 0.83349609375, "rewards//std": 0.029810113832354546, "step": 1065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2132, "grad_norm": 0.8098199963569641, "kl": 0.05495151784271002, "learning_rate": 4.498973454921213e-06, "loss": 0.0055, "num_tokens": 6955448.0, "reward": 0.8436279296875, "reward_std": 0.015587521716952324, "rewards//mean": 0.8436279296875, "rewards//std": 0.026171263307332993, "step": 1066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2134, "grad_norm": 0.7014596462249756, "kl": 0.05284912372007966, "learning_rate": 4.498020186553707e-06, "loss": 0.0053, "num_tokens": 6962112.0, "reward": 0.767822265625, "reward_std": 0.009187604300677776, "rewards//mean": 0.767822265625, "rewards//std": 0.018386267125606537, "step": 1067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2136, "grad_norm": 0.6799576282501221, "kl": 0.04939666297286749, "learning_rate": 4.49706611338391e-06, "loss": 0.0049, "num_tokens": 6968664.0, "reward": 0.86041259765625, "reward_std": 0.014115551486611366, "rewards//mean": 0.86041259765625, "rewards//std": 0.03244582563638687, "step": 1068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2138, "grad_norm": 0.8710656762123108, "kl": 0.061485808342695236, "learning_rate": 4.49611123579612e-06, "loss": 0.0061, "num_tokens": 6975128.0, "reward": 0.86297607421875, "reward_std": 0.013511918485164642, "rewards//mean": 0.86297607421875, "rewards//std": 0.0251840241253376, "step": 1069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.214, "grad_norm": 0.7368143200874329, "kl": 0.04965536668896675, "learning_rate": 4.495155554174963e-06, "loss": 0.005, "num_tokens": 6981720.0, "reward": 0.8397216796875, "reward_std": 0.014077004045248032, "rewards//mean": 0.8397216796875, "rewards//std": 0.029552815482020378, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2142, "grad_norm": 0.7696762084960938, "kl": 0.050469483248889446, "learning_rate": 4.494199068905389e-06, "loss": 0.005, "num_tokens": 6988312.0, "reward": 0.81024169921875, "reward_std": 0.009506959468126297, "rewards//mean": 0.81024169921875, "rewards//std": 0.02266848459839821, "step": 1071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2144, "grad_norm": 0.7175611257553101, "kl": 0.05993840447627008, "learning_rate": 4.493241780372667e-06, "loss": 0.006, "num_tokens": 6994840.0, "reward": 0.80352783203125, "reward_std": 0.011830540373921394, "rewards//mean": 0.80352783203125, "rewards//std": 0.015835629776120186, "step": 1072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2146, "grad_norm": 0.7151725888252258, "kl": 0.05650515854358673, "learning_rate": 4.492283688962395e-06, "loss": 0.0057, "num_tokens": 7001384.0, "reward": 0.77960205078125, "reward_std": 0.009965864010155201, "rewards//mean": 0.77960205078125, "rewards//std": 0.021517377346754074, "step": 1073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2148, "grad_norm": 0.720371663570404, "kl": 0.05455192318186164, "learning_rate": 4.491324795060491e-06, "loss": 0.0055, "num_tokens": 7007816.0, "reward": 0.8603515625, "reward_std": 0.01756669394671917, "rewards//mean": 0.8603515625, "rewards//std": 0.04132814705371857, "step": 1074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.215, "grad_norm": 0.7541438937187195, "kl": 0.06161863449960947, "learning_rate": 4.490365099053198e-06, "loss": 0.0062, "num_tokens": 7014408.0, "reward": 0.8507080078125, "reward_std": 0.012220902368426323, "rewards//mean": 0.8507080078125, "rewards//std": 0.031730469316244125, "step": 1075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2152, "grad_norm": 0.6846283674240112, "kl": 0.05471772514283657, "learning_rate": 4.489404601327081e-06, "loss": 0.0055, "num_tokens": 7021088.0, "reward": 0.845703125, "reward_std": 0.012277290225028992, "rewards//mean": 0.845703125, "rewards//std": 0.01861048862338066, "step": 1076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2154, "grad_norm": 0.8211379051208496, "kl": 0.059677815064787865, "learning_rate": 4.488443302269028e-06, "loss": 0.006, "num_tokens": 7027640.0, "reward": 0.85443115234375, "reward_std": 0.018647147342562675, "rewards//mean": 0.85443115234375, "rewards//std": 0.04309113323688507, "step": 1077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2156, "grad_norm": 0.70787113904953, "kl": 0.05222921958193183, "learning_rate": 4.487481202266251e-06, "loss": 0.0052, "num_tokens": 7034208.0, "reward": 0.84844970703125, "reward_std": 0.012464895844459534, "rewards//mean": 0.84844970703125, "rewards//std": 0.023106331005692482, "step": 1078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2158, "grad_norm": 0.9118306040763855, "kl": 0.05560669722035527, "learning_rate": 4.4865183017062835e-06, "loss": 0.0056, "num_tokens": 7040680.0, "reward": 0.83746337890625, "reward_std": 0.01008305698633194, "rewards//mean": 0.83746337890625, "rewards//std": 0.01618265174329281, "step": 1079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.216, "grad_norm": 0.7151490449905396, "kl": 0.05385538097470999, "learning_rate": 4.485554600976981e-06, "loss": 0.0054, "num_tokens": 7047144.0, "reward": 0.84735107421875, "reward_std": 0.007640528492629528, "rewards//mean": 0.84735107421875, "rewards//std": 0.026832349598407745, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2162, "grad_norm": 0.7240871787071228, "kl": 0.05846234317868948, "learning_rate": 4.484590100466524e-06, "loss": 0.0058, "num_tokens": 7053608.0, "reward": 0.84149169921875, "reward_std": 0.011705256998538971, "rewards//mean": 0.84149169921875, "rewards//std": 0.029444729909300804, "step": 1081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2164, "grad_norm": 0.7130594849586487, "kl": 0.06413034279830754, "learning_rate": 4.483624800563411e-06, "loss": 0.0064, "num_tokens": 7060056.0, "reward": 0.87249755859375, "reward_std": 0.012848039157688618, "rewards//mean": 0.87249755859375, "rewards//std": 0.026487691327929497, "step": 1082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2166, "grad_norm": 0.7812443375587463, "kl": 0.05608916888013482, "learning_rate": 4.482658701656465e-06, "loss": 0.0056, "num_tokens": 7066576.0, "reward": 0.8524169921875, "reward_std": 0.010697348043322563, "rewards//mean": 0.8524169921875, "rewards//std": 0.018525294959545135, "step": 1083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2168, "grad_norm": 0.6899544596672058, "kl": 0.0515352631919086, "learning_rate": 4.4816918041348325e-06, "loss": 0.0052, "num_tokens": 7073136.0, "reward": 0.8521728515625, "reward_std": 0.011487320065498352, "rewards//mean": 0.8521728515625, "rewards//std": 0.021989615634083748, "step": 1084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.217, "grad_norm": 0.6955999135971069, "kl": 0.05347476666793227, "learning_rate": 4.4807241083879774e-06, "loss": 0.0053, "num_tokens": 7079688.0, "reward": 0.85345458984375, "reward_std": 0.010488041676580906, "rewards//mean": 0.85345458984375, "rewards//std": 0.031527724117040634, "step": 1085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2172, "grad_norm": 0.8102149367332458, "kl": 0.05124241579324007, "learning_rate": 4.4797556148056884e-06, "loss": 0.0051, "num_tokens": 7086248.0, "reward": 0.85125732421875, "reward_std": 0.01146087795495987, "rewards//mean": 0.85125732421875, "rewards//std": 0.016650160774588585, "step": 1086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2174, "grad_norm": 0.7164828181266785, "kl": 0.06326874671503901, "learning_rate": 4.478786323778074e-06, "loss": 0.0063, "num_tokens": 7092672.0, "reward": 0.86993408203125, "reward_std": 0.008689985610544682, "rewards//mean": 0.86993408203125, "rewards//std": 0.021118303760886192, "step": 1087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2176, "grad_norm": 0.7143504619598389, "kl": 0.05925315152853727, "learning_rate": 4.477816235695566e-06, "loss": 0.0059, "num_tokens": 7099240.0, "reward": 0.83685302734375, "reward_std": 0.014498643577098846, "rewards//mean": 0.83685302734375, "rewards//std": 0.03646990656852722, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2178, "grad_norm": 0.7515742182731628, "kl": 0.05690917233005166, "learning_rate": 4.476845350948914e-06, "loss": 0.0057, "num_tokens": 7105704.0, "reward": 0.849365234375, "reward_std": 0.01064487174153328, "rewards//mean": 0.849365234375, "rewards//std": 0.022303014993667603, "step": 1089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.218, "grad_norm": 0.728315532207489, "kl": 0.054148969473317266, "learning_rate": 4.475873669929192e-06, "loss": 0.0054, "num_tokens": 7112184.0, "reward": 0.83758544921875, "reward_std": 0.009316690266132355, "rewards//mean": 0.83758544921875, "rewards//std": 0.01821327768266201, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2182, "grad_norm": 0.6564595103263855, "kl": 0.04327485337853432, "learning_rate": 4.474901193027791e-06, "loss": 0.0043, "num_tokens": 7118664.0, "reward": 0.8489990234375, "reward_std": 0.01144429948180914, "rewards//mean": 0.8489990234375, "rewards//std": 0.03124394454061985, "step": 1091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2184, "grad_norm": 0.8155738711357117, "kl": 0.06336658587679267, "learning_rate": 4.473927920636426e-06, "loss": 0.0063, "num_tokens": 7125136.0, "reward": 0.83843994140625, "reward_std": 0.012236444279551506, "rewards//mean": 0.83843994140625, "rewards//std": 0.026221174746751785, "step": 1092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2186, "grad_norm": 0.7210853099822998, "kl": 0.05314556369557977, "learning_rate": 4.472953853147131e-06, "loss": 0.0053, "num_tokens": 7131696.0, "reward": 0.861572265625, "reward_std": 0.01641913689672947, "rewards//mean": 0.861572265625, "rewards//std": 0.024447832256555557, "step": 1093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2188, "grad_norm": 0.6829798221588135, "kl": 0.05199786368757486, "learning_rate": 4.471978990952259e-06, "loss": 0.0052, "num_tokens": 7138296.0, "reward": 0.86865234375, "reward_std": 0.00928974524140358, "rewards//mean": 0.86865234375, "rewards//std": 0.018275611102581024, "step": 1094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.219, "grad_norm": 0.7742270231246948, "kl": 0.06741668144240975, "learning_rate": 4.471003334444486e-06, "loss": 0.0067, "num_tokens": 7144704.0, "reward": 0.7857666015625, "reward_std": 0.01397575531154871, "rewards//mean": 0.7857666015625, "rewards//std": 0.03282415121793747, "step": 1095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2192, "grad_norm": 0.879043459892273, "kl": 0.06132833752781153, "learning_rate": 4.470026884016805e-06, "loss": 0.0061, "num_tokens": 7151272.0, "reward": 0.76904296875, "reward_std": 0.009890593588352203, "rewards//mean": 0.76904296875, "rewards//std": 0.01766917295753956, "step": 1096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2194, "grad_norm": 0.8010754585266113, "kl": 0.06009903363883495, "learning_rate": 4.469049640062532e-06, "loss": 0.006, "num_tokens": 7157888.0, "reward": 0.86663818359375, "reward_std": 0.008416292257606983, "rewards//mean": 0.86663818359375, "rewards//std": 0.03420907258987427, "step": 1097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2196, "grad_norm": 0.7043450474739075, "kl": 0.07603266416117549, "learning_rate": 4.468071602975298e-06, "loss": 0.0076, "num_tokens": 7164320.0, "reward": 0.80230712890625, "reward_std": 0.014044564217329025, "rewards//mean": 0.80230712890625, "rewards//std": 0.02760045789182186, "step": 1098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2198, "grad_norm": 0.7999110221862793, "kl": 0.05718750460073352, "learning_rate": 4.467092773149058e-06, "loss": 0.0057, "num_tokens": 7170784.0, "reward": 0.84808349609375, "reward_std": 0.013420334085822105, "rewards//mean": 0.84808349609375, "rewards//std": 0.039032213389873505, "step": 1099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.22, "grad_norm": 0.8376137614250183, "kl": 0.060066268779337406, "learning_rate": 4.466113150978085e-06, "loss": 0.006, "num_tokens": 7177336.0, "reward": 0.84912109375, "reward_std": 0.013609793037176132, "rewards//mean": 0.84912109375, "rewards//std": 0.023903073742985725, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2202, "grad_norm": 0.7788048982620239, "kl": 0.05014662817120552, "learning_rate": 4.4651327368569695e-06, "loss": 0.005, "num_tokens": 7183872.0, "reward": 0.88482666015625, "reward_std": 0.01064593531191349, "rewards//mean": 0.88482666015625, "rewards//std": 0.025920404121279716, "step": 1101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2204, "grad_norm": 0.7592735886573792, "kl": 0.05675828875973821, "learning_rate": 4.464151531180622e-06, "loss": 0.0057, "num_tokens": 7190424.0, "reward": 0.851318359375, "reward_std": 0.013784235343337059, "rewards//mean": 0.851318359375, "rewards//std": 0.038781698793172836, "step": 1102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2206, "grad_norm": 0.8835424780845642, "kl": 0.052044869400560856, "learning_rate": 4.463169534344273e-06, "loss": 0.0052, "num_tokens": 7196944.0, "reward": 0.82891845703125, "reward_std": 0.011992018669843674, "rewards//mean": 0.82891845703125, "rewards//std": 0.03385545313358307, "step": 1103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2208, "grad_norm": 0.7888392806053162, "kl": 0.048590237740427256, "learning_rate": 4.462186746743471e-06, "loss": 0.0049, "num_tokens": 7203472.0, "reward": 0.79150390625, "reward_std": 0.01324221771210432, "rewards//mean": 0.79150390625, "rewards//std": 0.03512178733944893, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.221, "grad_norm": 0.764504611492157, "kl": 0.07198935514315963, "learning_rate": 4.461203168774081e-06, "loss": 0.0072, "num_tokens": 7209960.0, "reward": 0.87127685546875, "reward_std": 0.01221170648932457, "rewards//mean": 0.87127685546875, "rewards//std": 0.03144647553563118, "step": 1105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2212, "grad_norm": 0.7696700096130371, "kl": 0.055837410502135754, "learning_rate": 4.46021880083229e-06, "loss": 0.0056, "num_tokens": 7216504.0, "reward": 0.81390380859375, "reward_std": 0.009542936459183693, "rewards//mean": 0.81390380859375, "rewards//std": 0.026119371876120567, "step": 1106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2214, "grad_norm": 0.6581017374992371, "kl": 0.058999020140618086, "learning_rate": 4.4592336433146e-06, "loss": 0.0059, "num_tokens": 7223000.0, "reward": 0.855224609375, "reward_std": 0.012505415827035904, "rewards//mean": 0.855224609375, "rewards//std": 0.015246453694999218, "step": 1107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2216, "grad_norm": 0.7815372347831726, "kl": 0.06378600466996431, "learning_rate": 4.458247696617833e-06, "loss": 0.0064, "num_tokens": 7229408.0, "reward": 0.81158447265625, "reward_std": 0.011170436628162861, "rewards//mean": 0.81158447265625, "rewards//std": 0.017451833933591843, "step": 1108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2218, "grad_norm": 0.7033050060272217, "kl": 0.04359610565006733, "learning_rate": 4.4572609611391275e-06, "loss": 0.0044, "num_tokens": 7235920.0, "reward": 0.81097412109375, "reward_std": 0.009881779551506042, "rewards//mean": 0.81097412109375, "rewards//std": 0.03233646973967552, "step": 1109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.222, "grad_norm": 0.697451651096344, "kl": 0.060411959420889616, "learning_rate": 4.456273437275941e-06, "loss": 0.006, "num_tokens": 7242472.0, "reward": 0.82696533203125, "reward_std": 0.009723921306431293, "rewards//mean": 0.82696533203125, "rewards//std": 0.026618242263793945, "step": 1110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2222, "grad_norm": 0.8232316374778748, "kl": 0.06183359259739518, "learning_rate": 4.455285125426049e-06, "loss": 0.0062, "num_tokens": 7248976.0, "reward": 0.816650390625, "reward_std": 0.016397636383771896, "rewards//mean": 0.816650390625, "rewards//std": 0.02792549319565296, "step": 1111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2224, "grad_norm": 0.7985503077507019, "kl": 0.06392278848215938, "learning_rate": 4.4542960259875415e-06, "loss": 0.0064, "num_tokens": 7255536.0, "reward": 0.8507080078125, "reward_std": 0.01287984661757946, "rewards//mean": 0.8507080078125, "rewards//std": 0.026506900787353516, "step": 1112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2226, "grad_norm": 0.7494643926620483, "kl": 0.051736967638134956, "learning_rate": 4.453306139358828e-06, "loss": 0.0052, "num_tokens": 7262032.0, "reward": 0.8048095703125, "reward_std": 0.013149973936378956, "rewards//mean": 0.8048095703125, "rewards//std": 0.04071390628814697, "step": 1113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2228, "grad_norm": 0.7103723287582397, "kl": 0.05173104349523783, "learning_rate": 4.4523154659386355e-06, "loss": 0.0052, "num_tokens": 7268520.0, "reward": 0.81585693359375, "reward_std": 0.011336097493767738, "rewards//mean": 0.81585693359375, "rewards//std": 0.025131676346063614, "step": 1114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.223, "grad_norm": 0.664604127407074, "kl": 0.057766727171838284, "learning_rate": 4.451324006126006e-06, "loss": 0.0058, "num_tokens": 7275064.0, "reward": 0.84283447265625, "reward_std": 0.00736248679459095, "rewards//mean": 0.84283447265625, "rewards//std": 0.019890571013092995, "step": 1115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2232, "grad_norm": 0.6953597664833069, "kl": 0.04468205850571394, "learning_rate": 4.4503317603203025e-06, "loss": 0.0045, "num_tokens": 7281560.0, "reward": 0.8668212890625, "reward_std": 0.016364458948373795, "rewards//mean": 0.8668212890625, "rewards//std": 0.021862581372261047, "step": 1116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2234, "grad_norm": 0.6802504062652588, "kl": 0.04388501262292266, "learning_rate": 4.449338728921197e-06, "loss": 0.0044, "num_tokens": 7288064.0, "reward": 0.8482666015625, "reward_std": 0.007848985493183136, "rewards//mean": 0.8482666015625, "rewards//std": 0.02135256491601467, "step": 1117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2236, "grad_norm": 0.7413816452026367, "kl": 0.05946168629452586, "learning_rate": 4.448344912328686e-06, "loss": 0.0059, "num_tokens": 7294504.0, "reward": 0.8037109375, "reward_std": 0.012719184160232544, "rewards//mean": 0.8037109375, "rewards//std": 0.019313016906380653, "step": 1118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2238, "grad_norm": 0.7356482148170471, "kl": 0.06002384517341852, "learning_rate": 4.447350310943077e-06, "loss": 0.006, "num_tokens": 7301056.0, "reward": 0.81292724609375, "reward_std": 0.01024347823113203, "rewards//mean": 0.81292724609375, "rewards//std": 0.021543391048908234, "step": 1119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.224, "grad_norm": 0.6294835805892944, "kl": 0.047821365762501955, "learning_rate": 4.4463549251649954e-06, "loss": 0.0048, "num_tokens": 7307528.0, "reward": 0.84912109375, "reward_std": 0.01112578809261322, "rewards//mean": 0.84912109375, "rewards//std": 0.02609276957809925, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2242, "grad_norm": 0.7654352784156799, "kl": 0.04468666622415185, "learning_rate": 4.445358755395382e-06, "loss": 0.0045, "num_tokens": 7314024.0, "reward": 0.80914306640625, "reward_std": 0.008740440011024475, "rewards//mean": 0.80914306640625, "rewards//std": 0.014778456650674343, "step": 1121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2244, "grad_norm": 0.8374254107475281, "kl": 0.050389107782393694, "learning_rate": 4.444361802035495e-06, "loss": 0.005, "num_tokens": 7320544.0, "reward": 0.82965087890625, "reward_std": 0.011834653094410896, "rewards//mean": 0.82965087890625, "rewards//std": 0.027371348813176155, "step": 1122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2246, "grad_norm": 0.7569440007209778, "kl": 0.048264033161103725, "learning_rate": 4.443364065486907e-06, "loss": 0.0048, "num_tokens": 7327008.0, "reward": 0.8603515625, "reward_std": 0.012495627626776695, "rewards//mean": 0.8603515625, "rewards//std": 0.028142563998699188, "step": 1123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2248, "grad_norm": 0.7266397476196289, "kl": 0.05534402886405587, "learning_rate": 4.442365546151506e-06, "loss": 0.0055, "num_tokens": 7333632.0, "reward": 0.874267578125, "reward_std": 0.012159336358308792, "rewards//mean": 0.874267578125, "rewards//std": 0.02561854012310505, "step": 1124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.225, "grad_norm": 0.7702950239181519, "kl": 0.05120529746636748, "learning_rate": 4.441366244431494e-06, "loss": 0.0051, "num_tokens": 7340160.0, "reward": 0.84295654296875, "reward_std": 0.013117571361362934, "rewards//mean": 0.84295654296875, "rewards//std": 0.03302982449531555, "step": 1125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2252, "grad_norm": 0.7216554880142212, "kl": 0.06197409052401781, "learning_rate": 4.440366160729393e-06, "loss": 0.0062, "num_tokens": 7346616.0, "reward": 0.79620361328125, "reward_std": 0.011800525709986687, "rewards//mean": 0.79620361328125, "rewards//std": 0.027573570609092712, "step": 1126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2254, "grad_norm": 0.7519876956939697, "kl": 0.06195631390437484, "learning_rate": 4.439365295448032e-06, "loss": 0.0062, "num_tokens": 7353192.0, "reward": 0.868408203125, "reward_std": 0.011770559474825859, "rewards//mean": 0.868408203125, "rewards//std": 0.02921408787369728, "step": 1127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2256, "grad_norm": 0.755064845085144, "kl": 0.051075505558401346, "learning_rate": 4.438363648990564e-06, "loss": 0.0051, "num_tokens": 7359648.0, "reward": 0.85284423828125, "reward_std": 0.011770223267376423, "rewards//mean": 0.85284423828125, "rewards//std": 0.03519449755549431, "step": 1128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2258, "grad_norm": 0.7416606545448303, "kl": 0.05549000529572368, "learning_rate": 4.437361221760449e-06, "loss": 0.0055, "num_tokens": 7366168.0, "reward": 0.8243408203125, "reward_std": 0.014618674293160439, "rewards//mean": 0.8243408203125, "rewards//std": 0.043747834861278534, "step": 1129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.226, "grad_norm": 0.7960250377655029, "kl": 0.0530172074213624, "learning_rate": 4.436358014161466e-06, "loss": 0.0053, "num_tokens": 7372776.0, "reward": 0.806640625, "reward_std": 0.014216450974345207, "rewards//mean": 0.806640625, "rewards//std": 0.035297200083732605, "step": 1130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2262, "grad_norm": 0.6872504353523254, "kl": 0.04862110363319516, "learning_rate": 4.435354026597707e-06, "loss": 0.0049, "num_tokens": 7379296.0, "reward": 0.84197998046875, "reward_std": 0.008724553510546684, "rewards//mean": 0.84197998046875, "rewards//std": 0.01962316781282425, "step": 1131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2264, "grad_norm": 0.8029571175575256, "kl": 0.04854334658011794, "learning_rate": 4.434349259473576e-06, "loss": 0.0049, "num_tokens": 7385768.0, "reward": 0.8134765625, "reward_std": 0.01589266024529934, "rewards//mean": 0.8134765625, "rewards//std": 0.0293144378811121, "step": 1132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2266, "grad_norm": 0.6830440163612366, "kl": 0.04517551138997078, "learning_rate": 4.433343713193796e-06, "loss": 0.0045, "num_tokens": 7392272.0, "reward": 0.82867431640625, "reward_std": 0.01055665872991085, "rewards//mean": 0.82867431640625, "rewards//std": 0.02844957262277603, "step": 1133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2268, "grad_norm": 0.7341215014457703, "kl": 0.050112880766391754, "learning_rate": 4.432337388163399e-06, "loss": 0.005, "num_tokens": 7398808.0, "reward": 0.817626953125, "reward_std": 0.01283479668200016, "rewards//mean": 0.817626953125, "rewards//std": 0.02972366102039814, "step": 1134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.227, "grad_norm": 0.8620299100875854, "kl": 0.056606184458360076, "learning_rate": 4.431330284787733e-06, "loss": 0.0057, "num_tokens": 7405376.0, "reward": 0.85198974609375, "reward_std": 0.014177478849887848, "rewards//mean": 0.85198974609375, "rewards//std": 0.035325437784194946, "step": 1135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2272, "grad_norm": 0.7248114943504333, "kl": 0.054031000938266516, "learning_rate": 4.430322403472459e-06, "loss": 0.0054, "num_tokens": 7411880.0, "reward": 0.8480224609375, "reward_std": 0.012687819078564644, "rewards//mean": 0.8480224609375, "rewards//std": 0.028719456866383553, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2274, "grad_norm": 0.6626878380775452, "kl": 0.04423896688967943, "learning_rate": 4.429313744623553e-06, "loss": 0.0044, "num_tokens": 7418360.0, "reward": 0.8560791015625, "reward_std": 0.01173429749906063, "rewards//mean": 0.8560791015625, "rewards//std": 0.018768833950161934, "step": 1137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2276, "grad_norm": 0.7388173937797546, "kl": 0.04783879336901009, "learning_rate": 4.4283043086473e-06, "loss": 0.0048, "num_tokens": 7424960.0, "reward": 0.81427001953125, "reward_std": 0.010703140869736671, "rewards//mean": 0.81427001953125, "rewards//std": 0.030233096331357956, "step": 1138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2278, "grad_norm": 0.7611457109451294, "kl": 0.05443268595263362, "learning_rate": 4.427294095950303e-06, "loss": 0.0054, "num_tokens": 7431496.0, "reward": 0.848876953125, "reward_std": 0.014159945771098137, "rewards//mean": 0.848876953125, "rewards//std": 0.029576625674962997, "step": 1139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.228, "grad_norm": 0.735953152179718, "kl": 0.0506066617090255, "learning_rate": 4.426283106939474e-06, "loss": 0.0051, "num_tokens": 7437968.0, "reward": 0.84857177734375, "reward_std": 0.011140226386487484, "rewards//mean": 0.84857177734375, "rewards//std": 0.030191509053111076, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2282, "grad_norm": 0.7134144902229309, "kl": 0.051812713500112295, "learning_rate": 4.425271342022039e-06, "loss": 0.0052, "num_tokens": 7444464.0, "reward": 0.87091064453125, "reward_std": 0.015304855071008205, "rewards//mean": 0.87091064453125, "rewards//std": 0.03494847193360329, "step": 1141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2284, "grad_norm": 0.6951360106468201, "kl": 0.04901850130409002, "learning_rate": 4.42425880160554e-06, "loss": 0.0049, "num_tokens": 7450920.0, "reward": 0.83599853515625, "reward_std": 0.01106259785592556, "rewards//mean": 0.83599853515625, "rewards//std": 0.029753122478723526, "step": 1142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2286, "grad_norm": 0.8088107109069824, "kl": 0.05430246377363801, "learning_rate": 4.423245486097823e-06, "loss": 0.0054, "num_tokens": 7457432.0, "reward": 0.8489990234375, "reward_std": 0.015645597130060196, "rewards//mean": 0.8489990234375, "rewards//std": 0.028316037729382515, "step": 1143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2288, "grad_norm": 0.6849681735038757, "kl": 0.05447221780195832, "learning_rate": 4.4222313959070565e-06, "loss": 0.0054, "num_tokens": 7463928.0, "reward": 0.87420654296875, "reward_std": 0.009698194451630116, "rewards//mean": 0.87420654296875, "rewards//std": 0.017936907708644867, "step": 1144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.229, "grad_norm": 0.7622599005699158, "kl": 0.05767669342458248, "learning_rate": 4.421216531441713e-06, "loss": 0.0058, "num_tokens": 7470368.0, "reward": 0.87432861328125, "reward_std": 0.015318895690143108, "rewards//mean": 0.87432861328125, "rewards//std": 0.02723773755133152, "step": 1145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2292, "grad_norm": 0.8112951517105103, "kl": 0.054642336908727884, "learning_rate": 4.42020089311058e-06, "loss": 0.0055, "num_tokens": 7476840.0, "reward": 0.80120849609375, "reward_std": 0.010481067933142185, "rewards//mean": 0.80120849609375, "rewards//std": 0.025066539645195007, "step": 1146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2294, "grad_norm": 0.7559449076652527, "kl": 0.050276882480829954, "learning_rate": 4.419184481322757e-06, "loss": 0.005, "num_tokens": 7483456.0, "reward": 0.84344482421875, "reward_std": 0.017322290688753128, "rewards//mean": 0.84344482421875, "rewards//std": 0.02175803855061531, "step": 1147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2296, "grad_norm": 0.799741804599762, "kl": 0.06788119161501527, "learning_rate": 4.418167296487655e-06, "loss": 0.0068, "num_tokens": 7490000.0, "reward": 0.86407470703125, "reward_std": 0.01789180003106594, "rewards//mean": 0.86407470703125, "rewards//std": 0.027082795277237892, "step": 1148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2298, "grad_norm": 0.7408566474914551, "kl": 0.05523125175386667, "learning_rate": 4.417149339014994e-06, "loss": 0.0055, "num_tokens": 7496472.0, "reward": 0.82373046875, "reward_std": 0.01489008218050003, "rewards//mean": 0.82373046875, "rewards//std": 0.041078321635723114, "step": 1149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.23, "grad_norm": 0.789924144744873, "kl": 0.05154033284634352, "learning_rate": 4.41613060931481e-06, "loss": 0.0052, "num_tokens": 7502912.0, "reward": 0.8475341796875, "reward_std": 0.014700725674629211, "rewards//mean": 0.8475341796875, "rewards//std": 0.02918168343603611, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2302, "grad_norm": 0.7815218567848206, "kl": 0.06677363021299243, "learning_rate": 4.415111107797445e-06, "loss": 0.0067, "num_tokens": 7509376.0, "reward": 0.85076904296875, "reward_std": 0.009429564699530602, "rewards//mean": 0.85076904296875, "rewards//std": 0.018973685801029205, "step": 1151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2304, "grad_norm": 0.7740068435668945, "kl": 0.06355136074125767, "learning_rate": 4.4140908348735555e-06, "loss": 0.0064, "num_tokens": 7515936.0, "reward": 0.8701171875, "reward_std": 0.019320735707879066, "rewards//mean": 0.8701171875, "rewards//std": 0.021943099796772003, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2306, "grad_norm": 0.7540057301521301, "kl": 0.0565732060931623, "learning_rate": 4.413069790954106e-06, "loss": 0.0057, "num_tokens": 7522424.0, "reward": 0.842041015625, "reward_std": 0.011478090658783913, "rewards//mean": 0.842041015625, "rewards//std": 0.022818323224782944, "step": 1153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2308, "grad_norm": 0.7604495882987976, "kl": 0.05395061383023858, "learning_rate": 4.412047976450373e-06, "loss": 0.0054, "num_tokens": 7528920.0, "reward": 0.85467529296875, "reward_std": 0.01335496362298727, "rewards//mean": 0.85467529296875, "rewards//std": 0.02128395438194275, "step": 1154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.231, "grad_norm": 0.8167135715484619, "kl": 0.06251343665644526, "learning_rate": 4.411025391773945e-06, "loss": 0.0063, "num_tokens": 7535440.0, "reward": 0.860595703125, "reward_std": 0.011862451210618019, "rewards//mean": 0.860595703125, "rewards//std": 0.028762908652424812, "step": 1155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2312, "grad_norm": 0.7621413469314575, "kl": 0.06091212620958686, "learning_rate": 4.4100020373367166e-06, "loss": 0.0061, "num_tokens": 7541928.0, "reward": 0.823486328125, "reward_std": 0.011545015498995781, "rewards//mean": 0.823486328125, "rewards//std": 0.020073994994163513, "step": 1156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2314, "grad_norm": 0.7512155175209045, "kl": 0.050995375495404005, "learning_rate": 4.408977913550897e-06, "loss": 0.0051, "num_tokens": 7548408.0, "reward": 0.8341064453125, "reward_std": 0.008369403891265392, "rewards//mean": 0.8341064453125, "rewards//std": 0.025276795029640198, "step": 1157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2316, "grad_norm": 0.6955687403678894, "kl": 0.0514345052652061, "learning_rate": 4.407953020829001e-06, "loss": 0.0051, "num_tokens": 7554904.0, "reward": 0.8214111328125, "reward_std": 0.0067789992317557335, "rewards//mean": 0.8214111328125, "rewards//std": 0.020020760595798492, "step": 1158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2318, "grad_norm": 1.0827809572219849, "kl": 0.06982277473434806, "learning_rate": 4.406927359583857e-06, "loss": 0.007, "num_tokens": 7561408.0, "reward": 0.8072509765625, "reward_std": 0.009238315746188164, "rewards//mean": 0.8072509765625, "rewards//std": 0.02810138463973999, "step": 1159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.232, "grad_norm": 0.7518680095672607, "kl": 0.056215353310108185, "learning_rate": 4.4059009302286e-06, "loss": 0.0056, "num_tokens": 7567872.0, "reward": 0.88201904296875, "reward_std": 0.010519944131374359, "rewards//mean": 0.88201904296875, "rewards//std": 0.02036513201892376, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2322, "grad_norm": 0.7316574454307556, "kl": 0.05090909218415618, "learning_rate": 4.404873733176678e-06, "loss": 0.0051, "num_tokens": 7574360.0, "reward": 0.85406494140625, "reward_std": 0.013384755700826645, "rewards//mean": 0.85406494140625, "rewards//std": 0.03589169308543205, "step": 1161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2324, "grad_norm": 0.848837673664093, "kl": 0.05481278616935015, "learning_rate": 4.403845768841842e-06, "loss": 0.0055, "num_tokens": 7580808.0, "reward": 0.80706787109375, "reward_std": 0.010311786085367203, "rewards//mean": 0.80706787109375, "rewards//std": 0.01984027959406376, "step": 1162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2326, "grad_norm": 0.8469411134719849, "kl": 0.052352816332131624, "learning_rate": 4.402817037638159e-06, "loss": 0.0052, "num_tokens": 7587328.0, "reward": 0.841552734375, "reward_std": 0.010697674006223679, "rewards//mean": 0.841552734375, "rewards//std": 0.025247134268283844, "step": 1163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2328, "grad_norm": 0.7755252718925476, "kl": 0.05234520696103573, "learning_rate": 4.40178753998e-06, "loss": 0.0052, "num_tokens": 7593808.0, "reward": 0.88531494140625, "reward_std": 0.011211846023797989, "rewards//mean": 0.88531494140625, "rewards//std": 0.028884684666991234, "step": 1164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.233, "grad_norm": 0.7884958982467651, "kl": 0.0614306153729558, "learning_rate": 4.400757276282048e-06, "loss": 0.0061, "num_tokens": 7600440.0, "reward": 0.85302734375, "reward_std": 0.012455660849809647, "rewards//mean": 0.85302734375, "rewards//std": 0.022707931697368622, "step": 1165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2332, "grad_norm": 0.8270741105079651, "kl": 0.05990053666755557, "learning_rate": 4.399726246959293e-06, "loss": 0.006, "num_tokens": 7606880.0, "reward": 0.7957763671875, "reward_std": 0.01058391947299242, "rewards//mean": 0.7957763671875, "rewards//std": 0.019481195136904716, "step": 1166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2334, "grad_norm": 0.9876028895378113, "kl": 0.06321314396336675, "learning_rate": 4.398694452427032e-06, "loss": 0.0063, "num_tokens": 7613424.0, "reward": 0.830810546875, "reward_std": 0.008742851205170155, "rewards//mean": 0.830810546875, "rewards//std": 0.020313872024416924, "step": 1167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2336, "grad_norm": 0.840717613697052, "kl": 0.05298218899406493, "learning_rate": 4.397661893100873e-06, "loss": 0.0053, "num_tokens": 7619888.0, "reward": 0.84637451171875, "reward_std": 0.01293135154992342, "rewards//mean": 0.84637451171875, "rewards//std": 0.03244442865252495, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2338, "grad_norm": 0.7370341420173645, "kl": 0.06683492055162787, "learning_rate": 4.39662856939673e-06, "loss": 0.0067, "num_tokens": 7626336.0, "reward": 0.84527587890625, "reward_std": 0.012473786249756813, "rewards//mean": 0.84527587890625, "rewards//std": 0.021254774183034897, "step": 1169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.234, "grad_norm": 0.6993956565856934, "kl": 0.055620127357542515, "learning_rate": 4.3955944817308265e-06, "loss": 0.0056, "num_tokens": 7632800.0, "reward": 0.83782958984375, "reward_std": 0.011314465664327145, "rewards//mean": 0.83782958984375, "rewards//std": 0.015374861657619476, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2342, "grad_norm": 0.7468785047531128, "kl": 0.059578930493444204, "learning_rate": 4.3945596305196925e-06, "loss": 0.006, "num_tokens": 7639368.0, "reward": 0.8533935546875, "reward_std": 0.016725096851587296, "rewards//mean": 0.8533935546875, "rewards//std": 0.022052858024835587, "step": 1171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2344, "grad_norm": 0.7919687628746033, "kl": 0.059568730648607016, "learning_rate": 4.393524016180166e-06, "loss": 0.006, "num_tokens": 7645984.0, "reward": 0.7333984375, "reward_std": 0.008255419321358204, "rewards//mean": 0.7333984375, "rewards//std": 0.013832584023475647, "step": 1172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2346, "grad_norm": 0.7425962686538696, "kl": 0.06552393082529306, "learning_rate": 4.3924876391293915e-06, "loss": 0.0066, "num_tokens": 7652496.0, "reward": 0.84033203125, "reward_std": 0.01373964175581932, "rewards//mean": 0.84033203125, "rewards//std": 0.035190682858228683, "step": 1173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2348, "grad_norm": 0.7406244277954102, "kl": 0.06242781179025769, "learning_rate": 4.391450499784823e-06, "loss": 0.0062, "num_tokens": 7659008.0, "reward": 0.8787841796875, "reward_std": 0.014782899990677834, "rewards//mean": 0.8787841796875, "rewards//std": 0.024498242884874344, "step": 1174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.235, "grad_norm": 0.7897083759307861, "kl": 0.05972008081153035, "learning_rate": 4.3904125985642185e-06, "loss": 0.006, "num_tokens": 7665552.0, "reward": 0.850341796875, "reward_std": 0.01687709614634514, "rewards//mean": 0.850341796875, "rewards//std": 0.049188897013664246, "step": 1175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2352, "grad_norm": 0.8242207169532776, "kl": 0.08471109298989177, "learning_rate": 4.3893739358856465e-06, "loss": 0.0085, "num_tokens": 7672024.0, "reward": 0.811279296875, "reward_std": 0.010585353709757328, "rewards//mean": 0.811279296875, "rewards//std": 0.021017082035541534, "step": 1176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2354, "grad_norm": 0.7442731857299805, "kl": 0.053473112639039755, "learning_rate": 4.388334512167478e-06, "loss": 0.0053, "num_tokens": 7678576.0, "reward": 0.79205322265625, "reward_std": 0.010488292202353477, "rewards//mean": 0.79205322265625, "rewards//std": 0.02807140350341797, "step": 1177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2356, "grad_norm": 0.7720754146575928, "kl": 0.06579515989869833, "learning_rate": 4.387294327828394e-06, "loss": 0.0066, "num_tokens": 7685128.0, "reward": 0.78564453125, "reward_std": 0.01288970373570919, "rewards//mean": 0.78564453125, "rewards//std": 0.03435484319925308, "step": 1178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2358, "grad_norm": 0.8420997858047485, "kl": 0.051751340739429, "learning_rate": 4.386253383287381e-06, "loss": 0.0052, "num_tokens": 7691696.0, "reward": 0.84149169921875, "reward_std": 0.0111770611256361, "rewards//mean": 0.84149169921875, "rewards//std": 0.02909720130264759, "step": 1179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.236, "grad_norm": 0.7483237385749817, "kl": 0.055227629374712706, "learning_rate": 4.385211678963731e-06, "loss": 0.0055, "num_tokens": 7698312.0, "reward": 0.76568603515625, "reward_std": 0.008920064195990562, "rewards//mean": 0.76568603515625, "rewards//std": 0.024817705154418945, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2362, "grad_norm": 0.785773515701294, "kl": 0.06607152428478003, "learning_rate": 4.384169215277042e-06, "loss": 0.0066, "num_tokens": 7704816.0, "reward": 0.8785400390625, "reward_std": 0.011582277715206146, "rewards//mean": 0.8785400390625, "rewards//std": 0.03483942151069641, "step": 1181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2364, "grad_norm": 0.9415640234947205, "kl": 0.05959198507480323, "learning_rate": 4.383125992647218e-06, "loss": 0.006, "num_tokens": 7711384.0, "reward": 0.80645751953125, "reward_std": 0.010686928406357765, "rewards//mean": 0.80645751953125, "rewards//std": 0.0207976084202528, "step": 1182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2366, "grad_norm": 0.7203471064567566, "kl": 0.056330994702875614, "learning_rate": 4.382082011494469e-06, "loss": 0.0056, "num_tokens": 7717896.0, "reward": 0.83038330078125, "reward_std": 0.01168217882514, "rewards//mean": 0.83038330078125, "rewards//std": 0.025198446586728096, "step": 1183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2368, "grad_norm": 0.8689064383506775, "kl": 0.057104616425931454, "learning_rate": 4.381037272239311e-06, "loss": 0.0057, "num_tokens": 7724344.0, "reward": 0.8607177734375, "reward_std": 0.015245886519551277, "rewards//mean": 0.8607177734375, "rewards//std": 0.022208811715245247, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.237, "grad_norm": 0.737154483795166, "kl": 0.05586253013461828, "learning_rate": 4.379991775302566e-06, "loss": 0.0056, "num_tokens": 7730808.0, "reward": 0.84423828125, "reward_std": 0.010938527062535286, "rewards//mean": 0.84423828125, "rewards//std": 0.021367093548178673, "step": 1185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2372, "grad_norm": 0.8509859442710876, "kl": 0.05808749608695507, "learning_rate": 4.3789455211053565e-06, "loss": 0.0058, "num_tokens": 7737352.0, "reward": 0.843505859375, "reward_std": 0.012333495542407036, "rewards//mean": 0.843505859375, "rewards//std": 0.015853900462388992, "step": 1186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2374, "grad_norm": 0.7508853077888489, "kl": 0.06454203929752111, "learning_rate": 4.377898510069117e-06, "loss": 0.0065, "num_tokens": 7743872.0, "reward": 0.85137939453125, "reward_std": 0.011925472877919674, "rewards//mean": 0.85137939453125, "rewards//std": 0.016724545508623123, "step": 1187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2376, "grad_norm": 0.7764558792114258, "kl": 0.05904494505375624, "learning_rate": 4.376850742615583e-06, "loss": 0.0059, "num_tokens": 7750344.0, "reward": 0.81390380859375, "reward_std": 0.010760542005300522, "rewards//mean": 0.81390380859375, "rewards//std": 0.025905216112732887, "step": 1188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2378, "grad_norm": 0.7114004492759705, "kl": 0.05623784591443837, "learning_rate": 4.375802219166794e-06, "loss": 0.0056, "num_tokens": 7756904.0, "reward": 0.85028076171875, "reward_std": 0.014666072092950344, "rewards//mean": 0.85028076171875, "rewards//std": 0.025094905868172646, "step": 1189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.238, "grad_norm": 0.8546610474586487, "kl": 0.056913277599960566, "learning_rate": 4.374752940145094e-06, "loss": 0.0057, "num_tokens": 7763408.0, "reward": 0.89605712890625, "reward_std": 0.01257692463696003, "rewards//mean": 0.89605712890625, "rewards//std": 0.023350711911916733, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2382, "grad_norm": 0.8152788877487183, "kl": 0.0510336640290916, "learning_rate": 4.373702905973136e-06, "loss": 0.0051, "num_tokens": 7769856.0, "reward": 0.84869384765625, "reward_std": 0.01274905912578106, "rewards//mean": 0.84869384765625, "rewards//std": 0.016056815162301064, "step": 1191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2384, "grad_norm": 0.7936467528343201, "kl": 0.05479299928992987, "learning_rate": 4.37265211707387e-06, "loss": 0.0055, "num_tokens": 7776416.0, "reward": 0.85797119140625, "reward_std": 0.014550534076988697, "rewards//mean": 0.85797119140625, "rewards//std": 0.03857814520597458, "step": 1192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2386, "grad_norm": 0.8437953591346741, "kl": 0.061718545854091644, "learning_rate": 4.371600573870556e-06, "loss": 0.0062, "num_tokens": 7782960.0, "reward": 0.87030029296875, "reward_std": 0.019070550799369812, "rewards//mean": 0.87030029296875, "rewards//std": 0.03532586619257927, "step": 1193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2388, "grad_norm": 0.7733673453330994, "kl": 0.04780123010277748, "learning_rate": 4.370548276786753e-06, "loss": 0.0048, "num_tokens": 7789472.0, "reward": 0.83734130859375, "reward_std": 0.011778298765420914, "rewards//mean": 0.83734130859375, "rewards//std": 0.03000340424478054, "step": 1194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.239, "grad_norm": 0.8576741218566895, "kl": 0.06534230709075928, "learning_rate": 4.36949522624633e-06, "loss": 0.0065, "num_tokens": 7796000.0, "reward": 0.8323974609375, "reward_std": 0.010347463190555573, "rewards//mean": 0.8323974609375, "rewards//std": 0.019496729597449303, "step": 1195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2392, "grad_norm": 0.8778085708618164, "kl": 0.06259611807763577, "learning_rate": 4.368441422673453e-06, "loss": 0.0063, "num_tokens": 7802584.0, "reward": 0.87139892578125, "reward_std": 0.011626170948147774, "rewards//mean": 0.87139892578125, "rewards//std": 0.019191043451428413, "step": 1196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2394, "grad_norm": 0.8402171730995178, "kl": 0.05447361618280411, "learning_rate": 4.367386866492593e-06, "loss": 0.0054, "num_tokens": 7809096.0, "reward": 0.8602294921875, "reward_std": 0.011925055645406246, "rewards//mean": 0.8602294921875, "rewards//std": 0.020850403234362602, "step": 1197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2396, "grad_norm": 0.7777771353721619, "kl": 0.05061132041737437, "learning_rate": 4.366331558128528e-06, "loss": 0.0051, "num_tokens": 7815584.0, "reward": 0.87762451171875, "reward_std": 0.014197268523275852, "rewards//mean": 0.87762451171875, "rewards//std": 0.018740886822342873, "step": 1198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2398, "grad_norm": 0.7637667655944824, "kl": 0.06869003036990762, "learning_rate": 4.3652754980063335e-06, "loss": 0.0069, "num_tokens": 7822080.0, "reward": 0.8323974609375, "reward_std": 0.012152224779129028, "rewards//mean": 0.8323974609375, "rewards//std": 0.04720821604132652, "step": 1199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.24, "grad_norm": 0.862048327922821, "kl": 0.058464520145207644, "learning_rate": 4.364218686551392e-06, "loss": 0.0058, "num_tokens": 7828496.0, "reward": 0.85626220703125, "reward_std": 0.014938581734895706, "rewards//mean": 0.85626220703125, "rewards//std": 0.031041434034705162, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2402, "grad_norm": 0.8148413896560669, "kl": 0.06335383886471391, "learning_rate": 4.363161124189387e-06, "loss": 0.0063, "num_tokens": 7834976.0, "reward": 0.84063720703125, "reward_std": 0.010651223361492157, "rewards//mean": 0.84063720703125, "rewards//std": 0.014897818677127361, "step": 1201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2404, "grad_norm": 0.9026826024055481, "kl": 0.06057633925229311, "learning_rate": 4.362102811346304e-06, "loss": 0.0061, "num_tokens": 7841512.0, "reward": 0.84698486328125, "reward_std": 0.011474347673356533, "rewards//mean": 0.84698486328125, "rewards//std": 0.024175656959414482, "step": 1202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2406, "grad_norm": 0.7985884547233582, "kl": 0.06102538714185357, "learning_rate": 4.36104374844843e-06, "loss": 0.0061, "num_tokens": 7847928.0, "reward": 0.87127685546875, "reward_std": 0.012629255652427673, "rewards//mean": 0.87127685546875, "rewards//std": 0.01986239291727543, "step": 1203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2408, "grad_norm": 0.8051925897598267, "kl": 0.06813991814851761, "learning_rate": 4.3599839359223575e-06, "loss": 0.0068, "num_tokens": 7854392.0, "reward": 0.82244873046875, "reward_std": 0.009924883022904396, "rewards//mean": 0.82244873046875, "rewards//std": 0.025381607934832573, "step": 1204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.241, "grad_norm": 0.820210337638855, "kl": 0.06907196482643485, "learning_rate": 4.358923374194978e-06, "loss": 0.0069, "num_tokens": 7860864.0, "reward": 0.86883544921875, "reward_std": 0.011671808548271656, "rewards//mean": 0.86883544921875, "rewards//std": 0.022356484085321426, "step": 1205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2412, "grad_norm": 0.7782877087593079, "kl": 0.05946532590314746, "learning_rate": 4.357862063693486e-06, "loss": 0.0059, "num_tokens": 7867424.0, "reward": 0.836181640625, "reward_std": 0.013085326179862022, "rewards//mean": 0.836181640625, "rewards//std": 0.019818998873233795, "step": 1206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2414, "grad_norm": 0.8465147018432617, "kl": 0.06372471107169986, "learning_rate": 4.356800004845376e-06, "loss": 0.0064, "num_tokens": 7873928.0, "reward": 0.8348388671875, "reward_std": 0.009354200214147568, "rewards//mean": 0.8348388671875, "rewards//std": 0.018168862909078598, "step": 1207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2416, "grad_norm": 0.9025537371635437, "kl": 0.06273051863536239, "learning_rate": 4.355737198078447e-06, "loss": 0.0063, "num_tokens": 7880336.0, "reward": 0.86676025390625, "reward_std": 0.013327643275260925, "rewards//mean": 0.86676025390625, "rewards//std": 0.033444736152887344, "step": 1208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2418, "grad_norm": 0.8107250928878784, "kl": 0.07085918728262186, "learning_rate": 4.354673643820796e-06, "loss": 0.0071, "num_tokens": 7886832.0, "reward": 0.87872314453125, "reward_std": 0.009981873445212841, "rewards//mean": 0.87872314453125, "rewards//std": 0.025262044742703438, "step": 1209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.242, "grad_norm": 0.8578004240989685, "kl": 0.06494465563446283, "learning_rate": 4.353609342500824e-06, "loss": 0.0065, "num_tokens": 7893240.0, "reward": 0.857177734375, "reward_std": 0.012346778064966202, "rewards//mean": 0.857177734375, "rewards//std": 0.021051626652479172, "step": 1210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2422, "grad_norm": 0.767950713634491, "kl": 0.06942639080807567, "learning_rate": 4.352544294547229e-06, "loss": 0.0069, "num_tokens": 7899680.0, "reward": 0.838134765625, "reward_std": 0.013853689655661583, "rewards//mean": 0.838134765625, "rewards//std": 0.027151526883244514, "step": 1211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2424, "grad_norm": 0.719933807849884, "kl": 0.07354043051600456, "learning_rate": 4.351478500389014e-06, "loss": 0.0074, "num_tokens": 7906296.0, "reward": 0.84014892578125, "reward_std": 0.016509942710399628, "rewards//mean": 0.84014892578125, "rewards//std": 0.029134634882211685, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2426, "grad_norm": 0.8338012099266052, "kl": 0.06961872428655624, "learning_rate": 4.350411960455482e-06, "loss": 0.007, "num_tokens": 7912816.0, "reward": 0.84564208984375, "reward_std": 0.013635460287332535, "rewards//mean": 0.84564208984375, "rewards//std": 0.021598130464553833, "step": 1213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2428, "grad_norm": 0.7571225762367249, "kl": 0.07182550011202693, "learning_rate": 4.349344675176232e-06, "loss": 0.0072, "num_tokens": 7919344.0, "reward": 0.7916259765625, "reward_std": 0.008822690695524216, "rewards//mean": 0.7916259765625, "rewards//std": 0.014636032283306122, "step": 1214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.243, "grad_norm": 0.7849579453468323, "kl": 0.07299706153571606, "learning_rate": 4.348276644981169e-06, "loss": 0.0073, "num_tokens": 7925872.0, "reward": 0.82940673828125, "reward_std": 0.013377547264099121, "rewards//mean": 0.82940673828125, "rewards//std": 0.03323360159993172, "step": 1215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2432, "grad_norm": 0.8266616463661194, "kl": 0.05137030174955726, "learning_rate": 4.347207870300495e-06, "loss": 0.0051, "num_tokens": 7932272.0, "reward": 0.856201171875, "reward_std": 0.009793533943593502, "rewards//mean": 0.856201171875, "rewards//std": 0.025627993047237396, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2434, "grad_norm": 0.8135539889335632, "kl": 0.07008045259863138, "learning_rate": 4.346138351564711e-06, "loss": 0.007, "num_tokens": 7938784.0, "reward": 0.8636474609375, "reward_std": 0.012057607993483543, "rewards//mean": 0.8636474609375, "rewards//std": 0.034130048006772995, "step": 1217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2436, "grad_norm": 0.8504191040992737, "kl": 0.06503559742122889, "learning_rate": 4.3450680892046185e-06, "loss": 0.0065, "num_tokens": 7945312.0, "reward": 0.8055419921875, "reward_std": 0.013056033290922642, "rewards//mean": 0.8055419921875, "rewards//std": 0.029515912756323814, "step": 1218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2438, "grad_norm": 0.8832058906555176, "kl": 0.06584553513675928, "learning_rate": 4.343997083651321e-06, "loss": 0.0066, "num_tokens": 7951880.0, "reward": 0.84246826171875, "reward_std": 0.012090344913303852, "rewards//mean": 0.84246826171875, "rewards//std": 0.02109965868294239, "step": 1219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.244, "grad_norm": 0.8915728330612183, "kl": 0.07569032581523061, "learning_rate": 4.342925335336219e-06, "loss": 0.0076, "num_tokens": 7958376.0, "reward": 0.77099609375, "reward_std": 0.013944875448942184, "rewards//mean": 0.77099609375, "rewards//std": 0.03147780895233154, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2442, "grad_norm": 0.8388910293579102, "kl": 0.07088658446446061, "learning_rate": 4.341852844691012e-06, "loss": 0.0071, "num_tokens": 7965008.0, "reward": 0.83282470703125, "reward_std": 0.014923842623829842, "rewards//mean": 0.83282470703125, "rewards//std": 0.02580040507018566, "step": 1221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2444, "grad_norm": 0.9286354780197144, "kl": 0.07783356308937073, "learning_rate": 4.340779612147701e-06, "loss": 0.0078, "num_tokens": 7971504.0, "reward": 0.82489013671875, "reward_std": 0.010019993409514427, "rewards//mean": 0.82489013671875, "rewards//std": 0.02485731989145279, "step": 1222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2446, "grad_norm": 0.8361189365386963, "kl": 0.06825034646317363, "learning_rate": 4.33970563813858e-06, "loss": 0.0068, "num_tokens": 7978152.0, "reward": 0.8323974609375, "reward_std": 0.010328681208193302, "rewards//mean": 0.8323974609375, "rewards//std": 0.026399319991469383, "step": 1223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2448, "grad_norm": 0.8250643014907837, "kl": 0.08895124029368162, "learning_rate": 4.33863092309625e-06, "loss": 0.0089, "num_tokens": 7984640.0, "reward": 0.86651611328125, "reward_std": 0.015993226319551468, "rewards//mean": 0.86651611328125, "rewards//std": 0.025463776662945747, "step": 1224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.245, "grad_norm": 0.8378040194511414, "kl": 0.06887993216514587, "learning_rate": 4.337555467453603e-06, "loss": 0.0069, "num_tokens": 7991152.0, "reward": 0.78192138671875, "reward_std": 0.008592363446950912, "rewards//mean": 0.78192138671875, "rewards//std": 0.024876799434423447, "step": 1225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2452, "grad_norm": 0.849920392036438, "kl": 0.07920261705294251, "learning_rate": 4.336479271643833e-06, "loss": 0.0079, "num_tokens": 7997584.0, "reward": 0.79815673828125, "reward_std": 0.011128269135951996, "rewards//mean": 0.79815673828125, "rewards//std": 0.022968340665102005, "step": 1226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2454, "grad_norm": 0.8305087089538574, "kl": 0.08177454583346844, "learning_rate": 4.335402336100433e-06, "loss": 0.0082, "num_tokens": 8004088.0, "reward": 0.86181640625, "reward_std": 0.016794472932815552, "rewards//mean": 0.86181640625, "rewards//std": 0.03704160079360008, "step": 1227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2456, "grad_norm": 0.8848510384559631, "kl": 0.07005961518734694, "learning_rate": 4.334324661257191e-06, "loss": 0.007, "num_tokens": 8010616.0, "reward": 0.88055419921875, "reward_std": 0.01237991638481617, "rewards//mean": 0.88055419921875, "rewards//std": 0.020807797089219093, "step": 1228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2458, "grad_norm": 0.7988156080245972, "kl": 0.07215326651930809, "learning_rate": 4.3332462475481955e-06, "loss": 0.0072, "num_tokens": 8017112.0, "reward": 0.85040283203125, "reward_std": 0.01336895301938057, "rewards//mean": 0.85040283203125, "rewards//std": 0.033730026334524155, "step": 1229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.246, "grad_norm": 0.7687072157859802, "kl": 0.07663374952971935, "learning_rate": 4.33216709540783e-06, "loss": 0.0077, "num_tokens": 8023648.0, "reward": 0.82989501953125, "reward_std": 0.01696275919675827, "rewards//mean": 0.82989501953125, "rewards//std": 0.02718767337501049, "step": 1230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2462, "grad_norm": 0.8864173293113708, "kl": 0.09046565694734454, "learning_rate": 4.331087205270778e-06, "loss": 0.009, "num_tokens": 8030128.0, "reward": 0.80426025390625, "reward_std": 0.01462810579687357, "rewards//mean": 0.80426025390625, "rewards//std": 0.02746410295367241, "step": 1231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2464, "grad_norm": 0.9079626798629761, "kl": 0.09371237549930811, "learning_rate": 4.330006577572018e-06, "loss": 0.0094, "num_tokens": 8036544.0, "reward": 0.87310791015625, "reward_std": 0.013784512877464294, "rewards//mean": 0.87310791015625, "rewards//std": 0.03718186169862747, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2466, "grad_norm": 0.9096731543540955, "kl": 0.0938173271715641, "learning_rate": 4.328925212746828e-06, "loss": 0.0094, "num_tokens": 8043048.0, "reward": 0.8834228515625, "reward_std": 0.016453778371214867, "rewards//mean": 0.8834228515625, "rewards//std": 0.025374822318553925, "step": 1233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2468, "grad_norm": 0.9072927832603455, "kl": 0.08642908884212375, "learning_rate": 4.3278431112307806e-06, "loss": 0.0086, "num_tokens": 8049584.0, "reward": 0.83258056640625, "reward_std": 0.008319806307554245, "rewards//mean": 0.83258056640625, "rewards//std": 0.01386523898690939, "step": 1234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.247, "grad_norm": 0.9830998778343201, "kl": 0.07700034463778138, "learning_rate": 4.326760273459747e-06, "loss": 0.0077, "num_tokens": 8056032.0, "reward": 0.85284423828125, "reward_std": 0.013240346685051918, "rewards//mean": 0.85284423828125, "rewards//std": 0.036285560578107834, "step": 1235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2472, "grad_norm": 0.8765468001365662, "kl": 0.08756387373432517, "learning_rate": 4.325676699869894e-06, "loss": 0.0088, "num_tokens": 8062544.0, "reward": 0.82275390625, "reward_std": 0.01175556518137455, "rewards//mean": 0.82275390625, "rewards//std": 0.030157459899783134, "step": 1236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2474, "grad_norm": 1.0720548629760742, "kl": 0.09055411396548152, "learning_rate": 4.324592390897684e-06, "loss": 0.0091, "num_tokens": 8069072.0, "reward": 0.83331298828125, "reward_std": 0.011638753116130829, "rewards//mean": 0.83331298828125, "rewards//std": 0.025191236287355423, "step": 1237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2476, "grad_norm": 0.958775520324707, "kl": 0.09030033741146326, "learning_rate": 4.323507346979877e-06, "loss": 0.009, "num_tokens": 8075584.0, "reward": 0.870361328125, "reward_std": 0.01301239151507616, "rewards//mean": 0.870361328125, "rewards//std": 0.042323432862758636, "step": 1238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2478, "grad_norm": 0.9561014175415039, "kl": 0.07736630132421851, "learning_rate": 4.322421568553529e-06, "loss": 0.0077, "num_tokens": 8082144.0, "reward": 0.845947265625, "reward_std": 0.011625757440924644, "rewards//mean": 0.845947265625, "rewards//std": 0.022892504930496216, "step": 1239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.248, "grad_norm": 0.8701299428939819, "kl": 0.08184200571849942, "learning_rate": 4.321335056055991e-06, "loss": 0.0082, "num_tokens": 8088760.0, "reward": 0.81396484375, "reward_std": 0.009591728448867798, "rewards//mean": 0.81396484375, "rewards//std": 0.02545374259352684, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2482, "grad_norm": 0.9002383351325989, "kl": 0.08176716603338718, "learning_rate": 4.320247809924911e-06, "loss": 0.0082, "num_tokens": 8095336.0, "reward": 0.84423828125, "reward_std": 0.013347811996936798, "rewards//mean": 0.84423828125, "rewards//std": 0.02211352251470089, "step": 1241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2484, "grad_norm": 0.7977737784385681, "kl": 0.06103194132447243, "learning_rate": 4.31915983059823e-06, "loss": 0.0061, "num_tokens": 8101808.0, "reward": 0.809814453125, "reward_std": 0.010607263073325157, "rewards//mean": 0.809814453125, "rewards//std": 0.022497639060020447, "step": 1242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2486, "grad_norm": 0.8180047273635864, "kl": 0.08289417950436473, "learning_rate": 4.318071118514187e-06, "loss": 0.0083, "num_tokens": 8108360.0, "reward": 0.84661865234375, "reward_std": 0.009752935729920864, "rewards//mean": 0.84661865234375, "rewards//std": 0.018678588792681694, "step": 1243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2488, "grad_norm": 0.8576231598854065, "kl": 0.07269895169883966, "learning_rate": 4.316981674111314e-06, "loss": 0.0073, "num_tokens": 8114912.0, "reward": 0.8658447265625, "reward_std": 0.010676906444132328, "rewards//mean": 0.8658447265625, "rewards//std": 0.013923662714660168, "step": 1244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.249, "grad_norm": 0.8851017355918884, "kl": 0.09387305937707424, "learning_rate": 4.315891497828442e-06, "loss": 0.0094, "num_tokens": 8121424.0, "reward": 0.84307861328125, "reward_std": 0.01564178057014942, "rewards//mean": 0.84307861328125, "rewards//std": 0.028751788660883904, "step": 1245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2492, "grad_norm": 0.8387963175773621, "kl": 0.07727562403306365, "learning_rate": 4.314800590104691e-06, "loss": 0.0077, "num_tokens": 8128000.0, "reward": 0.86541748046875, "reward_std": 0.011410118080675602, "rewards//mean": 0.86541748046875, "rewards//std": 0.03175687789916992, "step": 1246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2494, "grad_norm": 0.8352131843566895, "kl": 0.0777947036549449, "learning_rate": 4.313708951379478e-06, "loss": 0.0078, "num_tokens": 8134608.0, "reward": 0.8218994140625, "reward_std": 0.0116087906062603, "rewards//mean": 0.8218994140625, "rewards//std": 0.03252393752336502, "step": 1247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2496, "grad_norm": 1.072700023651123, "kl": 0.06667021987959743, "learning_rate": 4.312616582092517e-06, "loss": 0.0067, "num_tokens": 8141080.0, "reward": 0.81396484375, "reward_std": 0.010689671151340008, "rewards//mean": 0.81396484375, "rewards//std": 0.014967892318964005, "step": 1248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2498, "grad_norm": 0.7833284735679626, "kl": 0.07724492065608501, "learning_rate": 4.311523482683815e-06, "loss": 0.0077, "num_tokens": 8147544.0, "reward": 0.83477783203125, "reward_std": 0.01325620710849762, "rewards//mean": 0.83477783203125, "rewards//std": 0.015099671669304371, "step": 1249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.25, "grad_norm": 0.9056288003921509, "kl": 0.09507902525365353, "learning_rate": 4.3104296535936695e-06, "loss": 0.0095, "num_tokens": 8154032.0, "reward": 0.79290771484375, "reward_std": 0.013404129073023796, "rewards//mean": 0.79290771484375, "rewards//std": 0.030895279720425606, "step": 1250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2502, "grad_norm": 0.863763689994812, "kl": 0.07805246859788895, "learning_rate": 4.309335095262675e-06, "loss": 0.0078, "num_tokens": 8160720.0, "reward": 0.85675048828125, "reward_std": 0.010213732719421387, "rewards//mean": 0.85675048828125, "rewards//std": 0.012796707451343536, "step": 1251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2504, "grad_norm": 0.7710720896720886, "kl": 0.06887036189436913, "learning_rate": 4.308239808131722e-06, "loss": 0.0069, "num_tokens": 8167248.0, "reward": 0.81561279296875, "reward_std": 0.01143108680844307, "rewards//mean": 0.81561279296875, "rewards//std": 0.02004144713282585, "step": 1252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2506, "grad_norm": 0.8013695478439331, "kl": 0.07486086850985885, "learning_rate": 4.30714379264199e-06, "loss": 0.0075, "num_tokens": 8173656.0, "reward": 0.8421630859375, "reward_std": 0.013716299086809158, "rewards//mean": 0.8421630859375, "rewards//std": 0.026554828509688377, "step": 1253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2508, "grad_norm": 0.8946172595024109, "kl": 0.11334248445928097, "learning_rate": 4.306047049234955e-06, "loss": 0.0113, "num_tokens": 8180160.0, "reward": 0.80670166015625, "reward_std": 0.013924263417720795, "rewards//mean": 0.80670166015625, "rewards//std": 0.03099311888217926, "step": 1254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.251, "grad_norm": 0.9053825736045837, "kl": 0.07695498783141375, "learning_rate": 4.3049495783523845e-06, "loss": 0.0077, "num_tokens": 8186776.0, "reward": 0.83465576171875, "reward_std": 0.011949039995670319, "rewards//mean": 0.83465576171875, "rewards//std": 0.01620415225625038, "step": 1255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2512, "grad_norm": 0.752427339553833, "kl": 0.07894555944949389, "learning_rate": 4.3038513804363395e-06, "loss": 0.0079, "num_tokens": 8193456.0, "reward": 0.8856201171875, "reward_std": 0.016780532896518707, "rewards//mean": 0.8856201171875, "rewards//std": 0.023576276376843452, "step": 1256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2514, "grad_norm": 0.6809706687927246, "kl": 0.07994749629870057, "learning_rate": 4.302752455929174e-06, "loss": 0.008, "num_tokens": 8199888.0, "reward": 0.84124755859375, "reward_std": 0.011203891597688198, "rewards//mean": 0.84124755859375, "rewards//std": 0.02224312163889408, "step": 1257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2516, "grad_norm": 0.77884441614151, "kl": 0.07531741121783853, "learning_rate": 4.301652805273535e-06, "loss": 0.0075, "num_tokens": 8206488.0, "reward": 0.8438720703125, "reward_std": 0.013284945860505104, "rewards//mean": 0.8438720703125, "rewards//std": 0.02548672817647457, "step": 1258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2518, "grad_norm": 0.8080945014953613, "kl": 0.07648079702630639, "learning_rate": 4.300552428912361e-06, "loss": 0.0076, "num_tokens": 8212984.0, "reward": 0.86578369140625, "reward_std": 0.02121882140636444, "rewards//mean": 0.86578369140625, "rewards//std": 0.040405768901109695, "step": 1259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.252, "grad_norm": 0.8831493258476257, "kl": 0.06573500856757164, "learning_rate": 4.299451327288884e-06, "loss": 0.0066, "num_tokens": 8219472.0, "reward": 0.82342529296875, "reward_std": 0.01960720121860504, "rewards//mean": 0.82342529296875, "rewards//std": 0.03449330851435661, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2522, "grad_norm": 0.76123046875, "kl": 0.06781908171251416, "learning_rate": 4.2983495008466285e-06, "loss": 0.0068, "num_tokens": 8226016.0, "reward": 0.8275146484375, "reward_std": 0.009658968076109886, "rewards//mean": 0.8275146484375, "rewards//std": 0.023981157690286636, "step": 1261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2524, "grad_norm": 0.9495390057563782, "kl": 0.07224578410387039, "learning_rate": 4.2972469500294085e-06, "loss": 0.0072, "num_tokens": 8232480.0, "reward": 0.85699462890625, "reward_std": 0.013417374342679977, "rewards//mean": 0.85699462890625, "rewards//std": 0.02733593061566353, "step": 1262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2526, "grad_norm": 0.7615146040916443, "kl": 0.08687197510153055, "learning_rate": 4.296143675281332e-06, "loss": 0.0087, "num_tokens": 8238936.0, "reward": 0.8626708984375, "reward_std": 0.013505864888429642, "rewards//mean": 0.8626708984375, "rewards//std": 0.03300810605287552, "step": 1263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2528, "grad_norm": 0.6506736278533936, "kl": 0.055786733981221914, "learning_rate": 4.295039677046797e-06, "loss": 0.0056, "num_tokens": 8245456.0, "reward": 0.84747314453125, "reward_std": 0.007791526615619659, "rewards//mean": 0.84747314453125, "rewards//std": 0.013592896983027458, "step": 1264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.253, "grad_norm": 0.8280842304229736, "kl": 0.0640347502194345, "learning_rate": 4.293934955770496e-06, "loss": 0.0064, "num_tokens": 8251992.0, "reward": 0.86798095703125, "reward_std": 0.010897631756961346, "rewards//mean": 0.86798095703125, "rewards//std": 0.02635764330625534, "step": 1265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2532, "grad_norm": 0.9240650534629822, "kl": 0.0818904796615243, "learning_rate": 4.292829511897409e-06, "loss": 0.0082, "num_tokens": 8258544.0, "reward": 0.81475830078125, "reward_std": 0.010935626924037933, "rewards//mean": 0.81475830078125, "rewards//std": 0.022753801196813583, "step": 1266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2534, "grad_norm": 0.7753582000732422, "kl": 0.07747271517291665, "learning_rate": 4.291723345872809e-06, "loss": 0.0077, "num_tokens": 8265160.0, "reward": 0.844970703125, "reward_std": 0.01344046276062727, "rewards//mean": 0.844970703125, "rewards//std": 0.04684334248304367, "step": 1267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2536, "grad_norm": 0.9073060750961304, "kl": 0.08940480556339025, "learning_rate": 4.2906164581422594e-06, "loss": 0.0089, "num_tokens": 8271712.0, "reward": 0.75860595703125, "reward_std": 0.01416555605828762, "rewards//mean": 0.75860595703125, "rewards//std": 0.029669053852558136, "step": 1268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2538, "grad_norm": 0.9661238193511963, "kl": 0.06021655164659023, "learning_rate": 4.289508849151614e-06, "loss": 0.006, "num_tokens": 8278248.0, "reward": 0.8511962890625, "reward_std": 0.011291861534118652, "rewards//mean": 0.8511962890625, "rewards//std": 0.023393217474222183, "step": 1269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.254, "grad_norm": 0.7455636858940125, "kl": 0.11090100277215242, "learning_rate": 4.28840051934702e-06, "loss": 0.0111, "num_tokens": 8284784.0, "reward": 0.87567138671875, "reward_std": 0.010319629684090614, "rewards//mean": 0.87567138671875, "rewards//std": 0.031059959903359413, "step": 1270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2542, "grad_norm": 0.9833388328552246, "kl": 0.08067311625927687, "learning_rate": 4.287291469174909e-06, "loss": 0.0081, "num_tokens": 8291360.0, "reward": 0.82440185546875, "reward_std": 0.012323485687375069, "rewards//mean": 0.82440185546875, "rewards//std": 0.026341555640101433, "step": 1271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2544, "grad_norm": 1.1063218116760254, "kl": 0.08659009356051683, "learning_rate": 4.286181699082008e-06, "loss": 0.0087, "num_tokens": 8297856.0, "reward": 0.87896728515625, "reward_std": 0.015453742817044258, "rewards//mean": 0.87896728515625, "rewards//std": 0.026018332690000534, "step": 1272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2546, "grad_norm": 0.8011521100997925, "kl": 0.08021100144833326, "learning_rate": 4.285071209515334e-06, "loss": 0.008, "num_tokens": 8304432.0, "reward": 0.8594970703125, "reward_std": 0.012957783415913582, "rewards//mean": 0.8594970703125, "rewards//std": 0.01588585413992405, "step": 1273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2548, "grad_norm": 0.8594513535499573, "kl": 0.060587269719690084, "learning_rate": 4.283960000922188e-06, "loss": 0.0061, "num_tokens": 8310920.0, "reward": 0.85028076171875, "reward_std": 0.011846556328237057, "rewards//mean": 0.85028076171875, "rewards//std": 0.03172731027007103, "step": 1274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.255, "grad_norm": 0.8862637877464294, "kl": 0.0842428170144558, "learning_rate": 4.282848073750169e-06, "loss": 0.0084, "num_tokens": 8317440.0, "reward": 0.84429931640625, "reward_std": 0.011008869856595993, "rewards//mean": 0.84429931640625, "rewards//std": 0.03567849099636078, "step": 1275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2552, "grad_norm": 0.9152967929840088, "kl": 0.07231645053252578, "learning_rate": 4.281735428447158e-06, "loss": 0.0072, "num_tokens": 8323952.0, "reward": 0.822509765625, "reward_std": 0.009229231625795364, "rewards//mean": 0.822509765625, "rewards//std": 0.012330569326877594, "step": 1276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2554, "grad_norm": 0.8065710067749023, "kl": 0.08104236610233784, "learning_rate": 4.280622065461329e-06, "loss": 0.0081, "num_tokens": 8330496.0, "reward": 0.8529052734375, "reward_std": 0.009538231417536736, "rewards//mean": 0.8529052734375, "rewards//std": 0.014044899493455887, "step": 1277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2556, "grad_norm": 1.1578333377838135, "kl": 0.08660793583840132, "learning_rate": 4.279507985241146e-06, "loss": 0.0087, "num_tokens": 8337040.0, "reward": 0.84405517578125, "reward_std": 0.011715585365891457, "rewards//mean": 0.84405517578125, "rewards//std": 0.04154619574546814, "step": 1278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2558, "grad_norm": 0.8904696106910706, "kl": 0.08173890598118305, "learning_rate": 4.278393188235359e-06, "loss": 0.0082, "num_tokens": 8343656.0, "reward": 0.85748291015625, "reward_std": 0.013532406650483608, "rewards//mean": 0.85748291015625, "rewards//std": 0.026162223890423775, "step": 1279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.256, "grad_norm": 0.8989867568016052, "kl": 0.08564620045945048, "learning_rate": 4.277277674893008e-06, "loss": 0.0086, "num_tokens": 8350128.0, "reward": 0.8385009765625, "reward_std": 0.012558192014694214, "rewards//mean": 0.8385009765625, "rewards//std": 0.018311606720089912, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2562, "grad_norm": 0.7826845645904541, "kl": 0.08263193350285292, "learning_rate": 4.276161445663423e-06, "loss": 0.0083, "num_tokens": 8356656.0, "reward": 0.812744140625, "reward_std": 0.010619234293699265, "rewards//mean": 0.812744140625, "rewards//std": 0.025909962132573128, "step": 1281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2564, "grad_norm": 0.9877669811248779, "kl": 0.10489043407142162, "learning_rate": 4.275044500996219e-06, "loss": 0.0105, "num_tokens": 8363184.0, "reward": 0.82464599609375, "reward_std": 0.009953678585588932, "rewards//mean": 0.82464599609375, "rewards//std": 0.02290564402937889, "step": 1282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2566, "grad_norm": 0.9093990325927734, "kl": 0.07232193928211927, "learning_rate": 4.273926841341303e-06, "loss": 0.0072, "num_tokens": 8369600.0, "reward": 0.87188720703125, "reward_std": 0.010946843773126602, "rewards//mean": 0.87188720703125, "rewards//std": 0.025422129780054092, "step": 1283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2568, "grad_norm": 0.8505975008010864, "kl": 0.072668447624892, "learning_rate": 4.272808467148866e-06, "loss": 0.0073, "num_tokens": 8376080.0, "reward": 0.86004638671875, "reward_std": 0.012600808404386044, "rewards//mean": 0.86004638671875, "rewards//std": 0.03286350145936012, "step": 1284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.257, "grad_norm": 0.9348027110099792, "kl": 0.0767467450350523, "learning_rate": 4.271689378869392e-06, "loss": 0.0077, "num_tokens": 8382592.0, "reward": 0.7977294921875, "reward_std": 0.010815320536494255, "rewards//mean": 0.7977294921875, "rewards//std": 0.021491067484021187, "step": 1285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2572, "grad_norm": 0.8370448350906372, "kl": 0.08453593449667096, "learning_rate": 4.270569576953648e-06, "loss": 0.0085, "num_tokens": 8389056.0, "reward": 0.88006591796875, "reward_std": 0.012837251648306847, "rewards//mean": 0.88006591796875, "rewards//std": 0.026745326817035675, "step": 1286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2574, "grad_norm": 0.7774531245231628, "kl": 0.06783967791125178, "learning_rate": 4.26944906185269e-06, "loss": 0.0068, "num_tokens": 8395544.0, "reward": 0.869873046875, "reward_std": 0.013874299824237823, "rewards//mean": 0.869873046875, "rewards//std": 0.028619399294257164, "step": 1287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2576, "grad_norm": 0.7840937972068787, "kl": 0.07538590021431446, "learning_rate": 4.268327834017862e-06, "loss": 0.0075, "num_tokens": 8402080.0, "reward": 0.8773193359375, "reward_std": 0.014451378956437111, "rewards//mean": 0.8773193359375, "rewards//std": 0.032141901552677155, "step": 1288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2578, "grad_norm": 0.8003010153770447, "kl": 0.07674728520214558, "learning_rate": 4.267205893900793e-06, "loss": 0.0077, "num_tokens": 8408600.0, "reward": 0.85064697265625, "reward_std": 0.009131163358688354, "rewards//mean": 0.85064697265625, "rewards//std": 0.026763997972011566, "step": 1289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.258, "grad_norm": 1.027834177017212, "kl": 0.07778862118721008, "learning_rate": 4.266083241953402e-06, "loss": 0.0078, "num_tokens": 8415112.0, "reward": 0.8453369140625, "reward_std": 0.010225171223282814, "rewards//mean": 0.8453369140625, "rewards//std": 0.019546357914805412, "step": 1290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2582, "grad_norm": 0.8484137654304504, "kl": 0.08180333813652396, "learning_rate": 4.264959878627891e-06, "loss": 0.0082, "num_tokens": 8421576.0, "reward": 0.80291748046875, "reward_std": 0.009717999957501888, "rewards//mean": 0.80291748046875, "rewards//std": 0.02228187955915928, "step": 1291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2584, "grad_norm": 0.9477463364601135, "kl": 0.06657165149226785, "learning_rate": 4.263835804376754e-06, "loss": 0.0067, "num_tokens": 8428072.0, "reward": 0.81976318359375, "reward_std": 0.011712562292814255, "rewards//mean": 0.81976318359375, "rewards//std": 0.02164154127240181, "step": 1292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2586, "grad_norm": 0.8215687870979309, "kl": 0.06623630784451962, "learning_rate": 4.262711019652764e-06, "loss": 0.0066, "num_tokens": 8434664.0, "reward": 0.8367919921875, "reward_std": 0.009944912977516651, "rewards//mean": 0.8367919921875, "rewards//std": 0.032529521733522415, "step": 1293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2588, "grad_norm": 0.7833945155143738, "kl": 0.06941379839554429, "learning_rate": 4.261585524908987e-06, "loss": 0.0069, "num_tokens": 8441128.0, "reward": 0.82916259765625, "reward_std": 0.010338153690099716, "rewards//mean": 0.82916259765625, "rewards//std": 0.018058864399790764, "step": 1294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.259, "grad_norm": 0.8388248682022095, "kl": 0.065641266759485, "learning_rate": 4.260459320598771e-06, "loss": 0.0066, "num_tokens": 8447656.0, "reward": 0.826171875, "reward_std": 0.010786205530166626, "rewards//mean": 0.826171875, "rewards//std": 0.029651254415512085, "step": 1295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2592, "grad_norm": 0.8419365882873535, "kl": 0.08845777157694101, "learning_rate": 4.259332407175751e-06, "loss": 0.0088, "num_tokens": 8454224.0, "reward": 0.7901611328125, "reward_std": 0.011718017980456352, "rewards//mean": 0.7901611328125, "rewards//std": 0.029359586536884308, "step": 1296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2594, "grad_norm": 0.9212959408760071, "kl": 0.06948975706472993, "learning_rate": 4.258204785093849e-06, "loss": 0.0069, "num_tokens": 8460848.0, "reward": 0.828857421875, "reward_std": 0.012238556519150734, "rewards//mean": 0.828857421875, "rewards//std": 0.018530605360865593, "step": 1297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2596, "grad_norm": 0.8497466444969177, "kl": 0.07639342220500112, "learning_rate": 4.257076454807269e-06, "loss": 0.0076, "num_tokens": 8467432.0, "reward": 0.86114501953125, "reward_std": 0.016841381788253784, "rewards//mean": 0.86114501953125, "rewards//std": 0.029437532648444176, "step": 1298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2598, "grad_norm": 0.9674074053764343, "kl": 0.07910048076882958, "learning_rate": 4.255947416770503e-06, "loss": 0.0079, "num_tokens": 8473976.0, "reward": 0.85552978515625, "reward_std": 0.011918941512703896, "rewards//mean": 0.85552978515625, "rewards//std": 0.01879815012216568, "step": 1299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.26, "grad_norm": 0.8345233798027039, "kl": 0.06950137857347727, "learning_rate": 4.2548176714383274e-06, "loss": 0.007, "num_tokens": 8480432.0, "reward": 0.8660888671875, "reward_std": 0.01642564684152603, "rewards//mean": 0.8660888671875, "rewards//std": 0.0195927694439888, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2602, "grad_norm": 0.8037116527557373, "kl": 0.08393787406384945, "learning_rate": 4.253687219265803e-06, "loss": 0.0084, "num_tokens": 8486920.0, "reward": 0.81072998046875, "reward_std": 0.013953003101050854, "rewards//mean": 0.81072998046875, "rewards//std": 0.03582794964313507, "step": 1301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2604, "grad_norm": 0.838883101940155, "kl": 0.0881900885142386, "learning_rate": 4.252556060708277e-06, "loss": 0.0088, "num_tokens": 8493496.0, "reward": 0.84417724609375, "reward_std": 0.011804094538092613, "rewards//mean": 0.84417724609375, "rewards//std": 0.021271860226988792, "step": 1302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2606, "grad_norm": 0.9238391518592834, "kl": 0.08966582547873259, "learning_rate": 4.2514241962213794e-06, "loss": 0.009, "num_tokens": 8499992.0, "reward": 0.865234375, "reward_std": 0.016116295009851456, "rewards//mean": 0.865234375, "rewards//std": 0.03029767982661724, "step": 1303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2608, "grad_norm": 0.7656685709953308, "kl": 0.06920235091820359, "learning_rate": 4.2502916262610264e-06, "loss": 0.0069, "num_tokens": 8506464.0, "reward": 0.8458251953125, "reward_std": 0.010772697627544403, "rewards//mean": 0.8458251953125, "rewards//std": 0.018028346821665764, "step": 1304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.261, "grad_norm": 0.9480398893356323, "kl": 0.0736740822903812, "learning_rate": 4.249158351283414e-06, "loss": 0.0074, "num_tokens": 8512968.0, "reward": 0.80401611328125, "reward_std": 0.012157089076936245, "rewards//mean": 0.80401611328125, "rewards//std": 0.015320613980293274, "step": 1305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2612, "grad_norm": 0.8648391366004944, "kl": 0.0824127933010459, "learning_rate": 4.248024371745027e-06, "loss": 0.0082, "num_tokens": 8519600.0, "reward": 0.8931884765625, "reward_std": 0.013073273003101349, "rewards//mean": 0.8931884765625, "rewards//std": 0.023356951773166656, "step": 1306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2614, "grad_norm": 1.0960179567337036, "kl": 0.08391740079969168, "learning_rate": 4.246889688102632e-06, "loss": 0.0084, "num_tokens": 8526112.0, "reward": 0.82421875, "reward_std": 0.012468471191823483, "rewards//mean": 0.82421875, "rewards//std": 0.04508937895298004, "step": 1307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2616, "grad_norm": 0.8242467045783997, "kl": 0.08617008849978447, "learning_rate": 4.24575430081328e-06, "loss": 0.0086, "num_tokens": 8532664.0, "reward": 0.8463134765625, "reward_std": 0.013643577694892883, "rewards//mean": 0.8463134765625, "rewards//std": 0.03023541159927845, "step": 1308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2618, "grad_norm": 0.9851990342140198, "kl": 0.08464868599548936, "learning_rate": 4.244618210334305e-06, "loss": 0.0085, "num_tokens": 8539368.0, "reward": 0.8502197265625, "reward_std": 0.010623812675476074, "rewards//mean": 0.8502197265625, "rewards//std": 0.01898694410920143, "step": 1309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.262, "grad_norm": 0.9217378497123718, "kl": 0.05844205757603049, "learning_rate": 4.243481417123323e-06, "loss": 0.0058, "num_tokens": 8545848.0, "reward": 0.82867431640625, "reward_std": 0.009874123148620129, "rewards//mean": 0.82867431640625, "rewards//std": 0.03344292566180229, "step": 1310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2622, "grad_norm": 0.8081804513931274, "kl": 0.07049195561558008, "learning_rate": 4.242343921638235e-06, "loss": 0.007, "num_tokens": 8552328.0, "reward": 0.850830078125, "reward_std": 0.008803065866231918, "rewards//mean": 0.850830078125, "rewards//std": 0.02310313656926155, "step": 1311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2624, "grad_norm": 0.8056511878967285, "kl": 0.06523380661383271, "learning_rate": 4.241205724337223e-06, "loss": 0.0065, "num_tokens": 8558864.0, "reward": 0.86517333984375, "reward_std": 0.012138761579990387, "rewards//mean": 0.86517333984375, "rewards//std": 0.0306631438434124, "step": 1312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2626, "grad_norm": 0.8356900215148926, "kl": 0.09078505868092179, "learning_rate": 4.2400668256787534e-06, "loss": 0.0091, "num_tokens": 8565408.0, "reward": 0.85308837890625, "reward_std": 0.00954306498169899, "rewards//mean": 0.85308837890625, "rewards//std": 0.023173710331320763, "step": 1313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2628, "grad_norm": 0.958789587020874, "kl": 0.0849305260926485, "learning_rate": 4.238927226121574e-06, "loss": 0.0085, "num_tokens": 8571976.0, "reward": 0.84100341796875, "reward_std": 0.0120496591553092, "rewards//mean": 0.84100341796875, "rewards//std": 0.02725662663578987, "step": 1314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.263, "grad_norm": 0.9570683836936951, "kl": 0.10529785230755806, "learning_rate": 4.237786926124718e-06, "loss": 0.0105, "num_tokens": 8578472.0, "reward": 0.77301025390625, "reward_std": 0.010668868198990822, "rewards//mean": 0.77301025390625, "rewards//std": 0.026884764432907104, "step": 1315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2632, "grad_norm": 0.8049548268318176, "kl": 0.0899099800735712, "learning_rate": 4.236645926147493e-06, "loss": 0.009, "num_tokens": 8584928.0, "reward": 0.8349609375, "reward_std": 0.012075395323336124, "rewards//mean": 0.8349609375, "rewards//std": 0.021053064614534378, "step": 1316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2634, "grad_norm": 0.9283333420753479, "kl": 0.0890887388959527, "learning_rate": 4.235504226649499e-06, "loss": 0.0089, "num_tokens": 8591416.0, "reward": 0.86444091796875, "reward_std": 0.015067913569509983, "rewards//mean": 0.86444091796875, "rewards//std": 0.032672248780727386, "step": 1317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2636, "grad_norm": 0.839745044708252, "kl": 0.08572990959510207, "learning_rate": 4.234361828090609e-06, "loss": 0.0086, "num_tokens": 8597832.0, "reward": 0.8453369140625, "reward_std": 0.012744767591357231, "rewards//mean": 0.8453369140625, "rewards//std": 0.021538913249969482, "step": 1318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2638, "grad_norm": 1.0210202932357788, "kl": 0.07561337621882558, "learning_rate": 4.233218730930983e-06, "loss": 0.0076, "num_tokens": 8604368.0, "reward": 0.8658447265625, "reward_std": 0.009697857312858105, "rewards//mean": 0.8658447265625, "rewards//std": 0.019564935937523842, "step": 1319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.264, "grad_norm": 0.9889983534812927, "kl": 0.10409204149618745, "learning_rate": 4.232074935631059e-06, "loss": 0.0104, "num_tokens": 8610992.0, "reward": 0.83740234375, "reward_std": 0.015492606908082962, "rewards//mean": 0.83740234375, "rewards//std": 0.04504369571805, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2642, "grad_norm": 0.8774916529655457, "kl": 0.09316294826567173, "learning_rate": 4.230930442651558e-06, "loss": 0.0093, "num_tokens": 8617544.0, "reward": 0.86981201171875, "reward_std": 0.01392995472997427, "rewards//mean": 0.86981201171875, "rewards//std": 0.01779966428875923, "step": 1321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2644, "grad_norm": 0.9598444700241089, "kl": 0.11282587517052889, "learning_rate": 4.229785252453481e-06, "loss": 0.0113, "num_tokens": 8624080.0, "reward": 0.82342529296875, "reward_std": 0.01147119514644146, "rewards//mean": 0.82342529296875, "rewards//std": 0.026786047965288162, "step": 1322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2646, "grad_norm": 0.8868212699890137, "kl": 0.07838137401267886, "learning_rate": 4.228639365498111e-06, "loss": 0.0078, "num_tokens": 8630568.0, "reward": 0.84906005859375, "reward_std": 0.013994920067489147, "rewards//mean": 0.84906005859375, "rewards//std": 0.036922842264175415, "step": 1323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2648, "grad_norm": 0.9224172234535217, "kl": 0.09408709779381752, "learning_rate": 4.227492782247013e-06, "loss": 0.0094, "num_tokens": 8637096.0, "reward": 0.82379150390625, "reward_std": 0.010377112776041031, "rewards//mean": 0.82379150390625, "rewards//std": 0.02254326269030571, "step": 1324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.265, "grad_norm": 0.9817544221878052, "kl": 0.07457373896613717, "learning_rate": 4.226345503162027e-06, "loss": 0.0075, "num_tokens": 8643600.0, "reward": 0.85052490234375, "reward_std": 0.01577845588326454, "rewards//mean": 0.85052490234375, "rewards//std": 0.022843437269330025, "step": 1325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2652, "grad_norm": 0.91078782081604, "kl": 0.07854951499029994, "learning_rate": 4.2251975287052804e-06, "loss": 0.0079, "num_tokens": 8650128.0, "reward": 0.84014892578125, "reward_std": 0.01147688739001751, "rewards//mean": 0.84014892578125, "rewards//std": 0.02813657745718956, "step": 1326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2654, "grad_norm": 0.8413807153701782, "kl": 0.08949362486600876, "learning_rate": 4.224048859339175e-06, "loss": 0.0089, "num_tokens": 8656712.0, "reward": 0.8001708984375, "reward_std": 0.010851677507162094, "rewards//mean": 0.8001708984375, "rewards//std": 0.02035370096564293, "step": 1327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2656, "grad_norm": 0.9359486699104309, "kl": 0.08307844027876854, "learning_rate": 4.222899495526396e-06, "loss": 0.0083, "num_tokens": 8663216.0, "reward": 0.84075927734375, "reward_std": 0.008999675512313843, "rewards//mean": 0.84075927734375, "rewards//std": 0.0173264779150486, "step": 1328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2658, "grad_norm": 0.9486803412437439, "kl": 0.07434258004650474, "learning_rate": 4.221749437729905e-06, "loss": 0.0074, "num_tokens": 8669640.0, "reward": 0.8570556640625, "reward_std": 0.012211926281452179, "rewards//mean": 0.8570556640625, "rewards//std": 0.020853307098150253, "step": 1329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.266, "grad_norm": 0.9731333255767822, "kl": 0.07990019116550684, "learning_rate": 4.220598686412946e-06, "loss": 0.008, "num_tokens": 8676112.0, "reward": 0.83306884765625, "reward_std": 0.009959236718714237, "rewards//mean": 0.83306884765625, "rewards//std": 0.035449493676424026, "step": 1330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2662, "grad_norm": 0.978728175163269, "kl": 0.10341206006705761, "learning_rate": 4.219447242039043e-06, "loss": 0.0103, "num_tokens": 8682624.0, "reward": 0.84326171875, "reward_std": 0.011410992592573166, "rewards//mean": 0.84326171875, "rewards//std": 0.017968211323022842, "step": 1331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2664, "grad_norm": 0.970518946647644, "kl": 0.10507941851392388, "learning_rate": 4.2182951050719955e-06, "loss": 0.0105, "num_tokens": 8689128.0, "reward": 0.85498046875, "reward_std": 0.017168883234262466, "rewards//mean": 0.85498046875, "rewards//std": 0.02457258850336075, "step": 1332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2666, "grad_norm": 0.9613063335418701, "kl": 0.09090633131563663, "learning_rate": 4.217142275975886e-06, "loss": 0.0091, "num_tokens": 8695696.0, "reward": 0.813232421875, "reward_std": 0.010450057685375214, "rewards//mean": 0.813232421875, "rewards//std": 0.021337317302823067, "step": 1333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2668, "grad_norm": 0.8495792746543884, "kl": 0.07541084289550781, "learning_rate": 4.215988755215073e-06, "loss": 0.0075, "num_tokens": 8702120.0, "reward": 0.80255126953125, "reward_std": 0.010713635012507439, "rewards//mean": 0.80255126953125, "rewards//std": 0.025295577943325043, "step": 1334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.267, "grad_norm": 0.9322383403778076, "kl": 0.09921678714454174, "learning_rate": 4.214834543254195e-06, "loss": 0.0099, "num_tokens": 8708720.0, "reward": 0.81085205078125, "reward_std": 0.013290788978338242, "rewards//mean": 0.81085205078125, "rewards//std": 0.028466062620282173, "step": 1335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2672, "grad_norm": 0.8545933961868286, "kl": 0.08800690434873104, "learning_rate": 4.2136796405581674e-06, "loss": 0.0088, "num_tokens": 8715224.0, "reward": 0.8372802734375, "reward_std": 0.011329732835292816, "rewards//mean": 0.8372802734375, "rewards//std": 0.03302277997136116, "step": 1336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2674, "grad_norm": 0.9750823378562927, "kl": 0.10804656567052007, "learning_rate": 4.212524047592185e-06, "loss": 0.0108, "num_tokens": 8721736.0, "reward": 0.8507080078125, "reward_std": 0.014218117110431194, "rewards//mean": 0.8507080078125, "rewards//std": 0.03062739223241806, "step": 1337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2676, "grad_norm": 0.8245490789413452, "kl": 0.08041206235066056, "learning_rate": 4.211367764821722e-06, "loss": 0.008, "num_tokens": 8728256.0, "reward": 0.864013671875, "reward_std": 0.009838636964559555, "rewards//mean": 0.864013671875, "rewards//std": 0.03560379147529602, "step": 1338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2678, "grad_norm": 0.9784286022186279, "kl": 0.12092388048768044, "learning_rate": 4.210210792712528e-06, "loss": 0.0121, "num_tokens": 8734712.0, "reward": 0.85650634765625, "reward_std": 0.010938748717308044, "rewards//mean": 0.85650634765625, "rewards//std": 0.018678588792681694, "step": 1339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.268, "grad_norm": 0.9574248194694519, "kl": 0.12767875008285046, "learning_rate": 4.209053131730631e-06, "loss": 0.0128, "num_tokens": 8741152.0, "reward": 0.846435546875, "reward_std": 0.010094012133777142, "rewards//mean": 0.846435546875, "rewards//std": 0.034798771142959595, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2682, "grad_norm": 0.896277904510498, "kl": 0.11947044963017106, "learning_rate": 4.207894782342337e-06, "loss": 0.0119, "num_tokens": 8747576.0, "reward": 0.847900390625, "reward_std": 0.01741372048854828, "rewards//mean": 0.847900390625, "rewards//std": 0.02875448763370514, "step": 1341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2684, "grad_norm": 0.9103860259056091, "kl": 0.14337039180099964, "learning_rate": 4.206735745014228e-06, "loss": 0.0143, "num_tokens": 8754128.0, "reward": 0.85333251953125, "reward_std": 0.015794824808835983, "rewards//mean": 0.85333251953125, "rewards//std": 0.03230883553624153, "step": 1342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2686, "grad_norm": 1.0340559482574463, "kl": 0.12088215304538608, "learning_rate": 4.205576020213166e-06, "loss": 0.0121, "num_tokens": 8760600.0, "reward": 0.86090087890625, "reward_std": 0.011910364963114262, "rewards//mean": 0.86090087890625, "rewards//std": 0.021994004026055336, "step": 1343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2688, "grad_norm": 0.9145066142082214, "kl": 0.09802971687167883, "learning_rate": 4.204415608406287e-06, "loss": 0.0098, "num_tokens": 8767168.0, "reward": 0.8563232421875, "reward_std": 0.01336738746613264, "rewards//mean": 0.8563232421875, "rewards//std": 0.02539628930389881, "step": 1344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.269, "grad_norm": 0.9168880581855774, "kl": 0.10528220981359482, "learning_rate": 4.203254510061005e-06, "loss": 0.0105, "num_tokens": 8773688.0, "reward": 0.85577392578125, "reward_std": 0.008123239502310753, "rewards//mean": 0.85577392578125, "rewards//std": 0.011185899376869202, "step": 1345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2692, "grad_norm": 1.157829999923706, "kl": 0.08841514214873314, "learning_rate": 4.2020927256450085e-06, "loss": 0.0088, "num_tokens": 8780272.0, "reward": 0.86553955078125, "reward_std": 0.006943578831851482, "rewards//mean": 0.86553955078125, "rewards//std": 0.014243297278881073, "step": 1346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2694, "grad_norm": 0.9286763668060303, "kl": 0.12383406329900026, "learning_rate": 4.200930255626267e-06, "loss": 0.0124, "num_tokens": 8786728.0, "reward": 0.8634033203125, "reward_std": 0.01223432645201683, "rewards//mean": 0.8634033203125, "rewards//std": 0.02796097658574581, "step": 1347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2696, "grad_norm": 0.8585650324821472, "kl": 0.08361157029867172, "learning_rate": 4.199767100473022e-06, "loss": 0.0084, "num_tokens": 8793184.0, "reward": 0.86346435546875, "reward_std": 0.009187709540128708, "rewards//mean": 0.86346435546875, "rewards//std": 0.017627036198973656, "step": 1348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2698, "grad_norm": 0.8274407386779785, "kl": 0.08368509309366345, "learning_rate": 4.198603260653792e-06, "loss": 0.0084, "num_tokens": 8799704.0, "reward": 0.8763427734375, "reward_std": 0.011150863952934742, "rewards//mean": 0.8763427734375, "rewards//std": 0.021857041865587234, "step": 1349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.27, "grad_norm": 0.8967868685722351, "kl": 0.10353417880833149, "learning_rate": 4.197438736637372e-06, "loss": 0.0104, "num_tokens": 8806272.0, "reward": 0.82196044921875, "reward_std": 0.010382997803390026, "rewards//mean": 0.82196044921875, "rewards//std": 0.027150901034474373, "step": 1350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2702, "grad_norm": 0.9681665897369385, "kl": 0.09204654349014163, "learning_rate": 4.196273528892831e-06, "loss": 0.0092, "num_tokens": 8812808.0, "reward": 0.80908203125, "reward_std": 0.010345985181629658, "rewards//mean": 0.80908203125, "rewards//std": 0.018222521990537643, "step": 1351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2704, "grad_norm": 0.8730531334877014, "kl": 0.12335515394806862, "learning_rate": 4.195107637889515e-06, "loss": 0.0123, "num_tokens": 8819320.0, "reward": 0.82958984375, "reward_std": 0.009287282824516296, "rewards//mean": 0.82958984375, "rewards//std": 0.018901044502854347, "step": 1352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2706, "grad_norm": 0.8973050713539124, "kl": 0.08792508114129305, "learning_rate": 4.193941064097047e-06, "loss": 0.0088, "num_tokens": 8825888.0, "reward": 0.8619384765625, "reward_std": 0.013040721416473389, "rewards//mean": 0.8619384765625, "rewards//std": 0.03121292218565941, "step": 1353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2708, "grad_norm": 1.1608765125274658, "kl": 0.10655087837949395, "learning_rate": 4.19277380798532e-06, "loss": 0.0107, "num_tokens": 8832448.0, "reward": 0.8150634765625, "reward_std": 0.013973737135529518, "rewards//mean": 0.8150634765625, "rewards//std": 0.02187919430434704, "step": 1354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.271, "grad_norm": 0.9968780279159546, "kl": 0.08224233845248818, "learning_rate": 4.191605870024506e-06, "loss": 0.0082, "num_tokens": 8838952.0, "reward": 0.80548095703125, "reward_std": 0.008485966362059116, "rewards//mean": 0.80548095703125, "rewards//std": 0.01648753508925438, "step": 1355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2712, "grad_norm": 0.9012913107872009, "kl": 0.07915652729570866, "learning_rate": 4.190437250685049e-06, "loss": 0.0079, "num_tokens": 8845456.0, "reward": 0.868896484375, "reward_std": 0.01108338963240385, "rewards//mean": 0.868896484375, "rewards//std": 0.021585598587989807, "step": 1356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2714, "grad_norm": 0.9560938477516174, "kl": 0.11390934698283672, "learning_rate": 4.18926795043767e-06, "loss": 0.0114, "num_tokens": 8852000.0, "reward": 0.81829833984375, "reward_std": 0.012500876560807228, "rewards//mean": 0.81829833984375, "rewards//std": 0.033211272209882736, "step": 1357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2716, "grad_norm": 0.8838367462158203, "kl": 0.10300877969712019, "learning_rate": 4.188097969753363e-06, "loss": 0.0103, "num_tokens": 8858616.0, "reward": 0.8248291015625, "reward_std": 0.009685773402452469, "rewards//mean": 0.8248291015625, "rewards//std": 0.023747727274894714, "step": 1358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2718, "grad_norm": 0.9101223945617676, "kl": 0.1033704737201333, "learning_rate": 4.186927309103395e-06, "loss": 0.0103, "num_tokens": 8865200.0, "reward": 0.83758544921875, "reward_std": 0.013100400567054749, "rewards//mean": 0.83758544921875, "rewards//std": 0.022138750180602074, "step": 1359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.272, "grad_norm": 0.878481924533844, "kl": 0.08396721165627241, "learning_rate": 4.185755968959308e-06, "loss": 0.0084, "num_tokens": 8871776.0, "reward": 0.82940673828125, "reward_std": 0.011456351727247238, "rewards//mean": 0.82940673828125, "rewards//std": 0.01999153383076191, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2722, "grad_norm": 0.9551863074302673, "kl": 0.09904797561466694, "learning_rate": 4.18458394979292e-06, "loss": 0.0099, "num_tokens": 8878352.0, "reward": 0.85101318359375, "reward_std": 0.01483334694057703, "rewards//mean": 0.85101318359375, "rewards//std": 0.03526883199810982, "step": 1361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2724, "grad_norm": 0.8716391921043396, "kl": 0.09620317677035928, "learning_rate": 4.183411252076318e-06, "loss": 0.0096, "num_tokens": 8884864.0, "reward": 0.8665771484375, "reward_std": 0.013590037822723389, "rewards//mean": 0.8665771484375, "rewards//std": 0.035278111696243286, "step": 1362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2726, "grad_norm": 0.9840959310531616, "kl": 0.10599198564887047, "learning_rate": 4.182237876281865e-06, "loss": 0.0106, "num_tokens": 8891376.0, "reward": 0.83428955078125, "reward_std": 0.010283821262419224, "rewards//mean": 0.83428955078125, "rewards//std": 0.023394105955958366, "step": 1363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2728, "grad_norm": 1.0388611555099487, "kl": 0.11024851724505424, "learning_rate": 4.181063822882196e-06, "loss": 0.011, "num_tokens": 8897832.0, "reward": 0.80487060546875, "reward_std": 0.015708625316619873, "rewards//mean": 0.80487060546875, "rewards//std": 0.0269415732473135, "step": 1364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.273, "grad_norm": 1.0221225023269653, "kl": 0.0983053995296359, "learning_rate": 4.17988909235022e-06, "loss": 0.0098, "num_tokens": 8904360.0, "reward": 0.8193359375, "reward_std": 0.009582128375768661, "rewards//mean": 0.8193359375, "rewards//std": 0.017771683633327484, "step": 1365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2732, "grad_norm": 1.0053266286849976, "kl": 0.09315591724589467, "learning_rate": 4.178713685159119e-06, "loss": 0.0093, "num_tokens": 8910872.0, "reward": 0.860595703125, "reward_std": 0.013069585897028446, "rewards//mean": 0.860595703125, "rewards//std": 0.021775512024760246, "step": 1366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2734, "grad_norm": 0.9461567401885986, "kl": 0.09796981606632471, "learning_rate": 4.1775376017823465e-06, "loss": 0.0098, "num_tokens": 8917424.0, "reward": 0.825927734375, "reward_std": 0.007579619996249676, "rewards//mean": 0.825927734375, "rewards//std": 0.01252545416355133, "step": 1367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2736, "grad_norm": 0.9378394484519958, "kl": 0.08479075785726309, "learning_rate": 4.176360842693629e-06, "loss": 0.0085, "num_tokens": 8923936.0, "reward": 0.83416748046875, "reward_std": 0.010234002023935318, "rewards//mean": 0.83416748046875, "rewards//std": 0.02062218263745308, "step": 1368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2738, "grad_norm": 0.8771849870681763, "kl": 0.12517736200243235, "learning_rate": 4.175183408366964e-06, "loss": 0.0125, "num_tokens": 8930432.0, "reward": 0.857666015625, "reward_std": 0.014525767415761948, "rewards//mean": 0.857666015625, "rewards//std": 0.027267245575785637, "step": 1369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.274, "grad_norm": 0.8395724296569824, "kl": 0.12157221511006355, "learning_rate": 4.174005299276622e-06, "loss": 0.0122, "num_tokens": 8936992.0, "reward": 0.8094482421875, "reward_std": 0.013624751940369606, "rewards//mean": 0.8094482421875, "rewards//std": 0.03410697728395462, "step": 1370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2742, "grad_norm": 0.9754379391670227, "kl": 0.11679825372993946, "learning_rate": 4.172826515897146e-06, "loss": 0.0117, "num_tokens": 8943656.0, "reward": 0.81878662109375, "reward_std": 0.010937703773379326, "rewards//mean": 0.81878662109375, "rewards//std": 0.0163908489048481, "step": 1371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2744, "grad_norm": 0.9930682182312012, "kl": 0.11405424401164055, "learning_rate": 4.17164705870335e-06, "loss": 0.0114, "num_tokens": 8950240.0, "reward": 0.849853515625, "reward_std": 0.014802366495132446, "rewards//mean": 0.849853515625, "rewards//std": 0.036450859159231186, "step": 1372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2746, "grad_norm": 1.098365068435669, "kl": 0.09995874483138323, "learning_rate": 4.1704669281703184e-06, "loss": 0.01, "num_tokens": 8956720.0, "reward": 0.808837890625, "reward_std": 0.0123313432559371, "rewards//mean": 0.808837890625, "rewards//std": 0.03378882259130478, "step": 1373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2748, "grad_norm": 1.1442967653274536, "kl": 0.09744041878730059, "learning_rate": 4.169286124773406e-06, "loss": 0.0097, "num_tokens": 8963304.0, "reward": 0.8607177734375, "reward_std": 0.011526426300406456, "rewards//mean": 0.8607177734375, "rewards//std": 0.025546055287122726, "step": 1374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.275, "grad_norm": 0.8478241562843323, "kl": 0.11292814649641514, "learning_rate": 4.168104648988245e-06, "loss": 0.0113, "num_tokens": 8969824.0, "reward": 0.8585205078125, "reward_std": 0.010427812114357948, "rewards//mean": 0.8585205078125, "rewards//std": 0.015678677707910538, "step": 1375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2752, "grad_norm": 0.9310275316238403, "kl": 0.07350325444713235, "learning_rate": 4.16692250129073e-06, "loss": 0.0074, "num_tokens": 8976312.0, "reward": 0.84393310546875, "reward_std": 0.010553129948675632, "rewards//mean": 0.84393310546875, "rewards//std": 0.020674968138337135, "step": 1376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2754, "grad_norm": 0.9833964109420776, "kl": 0.10556693654507399, "learning_rate": 4.16573968215703e-06, "loss": 0.0106, "num_tokens": 8982768.0, "reward": 0.82000732421875, "reward_std": 0.009758921340107918, "rewards//mean": 0.82000732421875, "rewards//std": 0.014420741237699986, "step": 1377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2756, "grad_norm": 0.9521714448928833, "kl": 0.09163808077573776, "learning_rate": 4.164556192063586e-06, "loss": 0.0092, "num_tokens": 8989256.0, "reward": 0.8468017578125, "reward_std": 0.017473407089710236, "rewards//mean": 0.8468017578125, "rewards//std": 0.02600880339741707, "step": 1378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2758, "grad_norm": 0.8649796843528748, "kl": 0.09599553002044559, "learning_rate": 4.163372031487106e-06, "loss": 0.0096, "num_tokens": 8995824.0, "reward": 0.80633544921875, "reward_std": 0.009877233766019344, "rewards//mean": 0.80633544921875, "rewards//std": 0.022367315366864204, "step": 1379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.276, "grad_norm": 0.9136069416999817, "kl": 0.09177302569150925, "learning_rate": 4.162187200904572e-06, "loss": 0.0092, "num_tokens": 9002288.0, "reward": 0.857177734375, "reward_std": 0.01124110072851181, "rewards//mean": 0.857177734375, "rewards//std": 0.016005944460630417, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2762, "grad_norm": 1.0987542867660522, "kl": 0.08739394834265113, "learning_rate": 4.161001700793231e-06, "loss": 0.0087, "num_tokens": 9008800.0, "reward": 0.8759765625, "reward_std": 0.015020214021205902, "rewards//mean": 0.8759765625, "rewards//std": 0.024498552083969116, "step": 1381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2764, "grad_norm": 0.9147922396659851, "kl": 0.08722687792032957, "learning_rate": 4.159815531630604e-06, "loss": 0.0087, "num_tokens": 9015280.0, "reward": 0.83941650390625, "reward_std": 0.014225109480321407, "rewards//mean": 0.83941650390625, "rewards//std": 0.021999508142471313, "step": 1382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2766, "grad_norm": 0.851744532585144, "kl": 0.10264648869633675, "learning_rate": 4.15862869389448e-06, "loss": 0.0103, "num_tokens": 9021728.0, "reward": 0.8446044921875, "reward_std": 0.015609742142260075, "rewards//mean": 0.8446044921875, "rewards//std": 0.034389857202768326, "step": 1383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2768, "grad_norm": 0.842970609664917, "kl": 0.09397553093731403, "learning_rate": 4.157441188062916e-06, "loss": 0.0094, "num_tokens": 9028288.0, "reward": 0.864501953125, "reward_std": 0.013896476477384567, "rewards//mean": 0.864501953125, "rewards//std": 0.027951501309871674, "step": 1384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.277, "grad_norm": 0.8801262378692627, "kl": 0.10796691849827766, "learning_rate": 4.156253014614239e-06, "loss": 0.0108, "num_tokens": 9034856.0, "reward": 0.77783203125, "reward_std": 0.01300850510597229, "rewards//mean": 0.77783203125, "rewards//std": 0.016653001308441162, "step": 1385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2772, "grad_norm": 0.8686948418617249, "kl": 0.07531116111204028, "learning_rate": 4.155064174027047e-06, "loss": 0.0075, "num_tokens": 9041368.0, "reward": 0.834716796875, "reward_std": 0.012319343164563179, "rewards//mean": 0.834716796875, "rewards//std": 0.029788779094815254, "step": 1386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2774, "grad_norm": 0.841205358505249, "kl": 0.08711703307926655, "learning_rate": 4.153874666780202e-06, "loss": 0.0087, "num_tokens": 9047848.0, "reward": 0.79290771484375, "reward_std": 0.012671373784542084, "rewards//mean": 0.79290771484375, "rewards//std": 0.019481487572193146, "step": 1387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2776, "grad_norm": 0.8953306078910828, "kl": 0.08481625886633992, "learning_rate": 4.152684493352841e-06, "loss": 0.0085, "num_tokens": 9054296.0, "reward": 0.85693359375, "reward_std": 0.014514542184770107, "rewards//mean": 0.85693359375, "rewards//std": 0.02696916088461876, "step": 1388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2778, "grad_norm": 1.1076816320419312, "kl": 0.09191433386877179, "learning_rate": 4.151493654224362e-06, "loss": 0.0092, "num_tokens": 9060792.0, "reward": 0.8536376953125, "reward_std": 0.011561451479792595, "rewards//mean": 0.8536376953125, "rewards//std": 0.021732021123170853, "step": 1389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.278, "grad_norm": 0.9067957401275635, "kl": 0.0913756350055337, "learning_rate": 4.150302149874438e-06, "loss": 0.0091, "num_tokens": 9067440.0, "reward": 0.876708984375, "reward_std": 0.010508124716579914, "rewards//mean": 0.876708984375, "rewards//std": 0.018841683864593506, "step": 1390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2782, "grad_norm": 0.9613187909126282, "kl": 0.09954186715185642, "learning_rate": 4.149109980783004e-06, "loss": 0.01, "num_tokens": 9073984.0, "reward": 0.83740234375, "reward_std": 0.011531364172697067, "rewards//mean": 0.83740234375, "rewards//std": 0.018926655873656273, "step": 1391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2784, "grad_norm": 0.9580591917037964, "kl": 0.11753913015127182, "learning_rate": 4.1479171474302675e-06, "loss": 0.0118, "num_tokens": 9080504.0, "reward": 0.77239990234375, "reward_std": 0.012164803221821785, "rewards//mean": 0.77239990234375, "rewards//std": 0.0234503336250782, "step": 1392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2786, "grad_norm": 0.9476571679115295, "kl": 0.07861991506069899, "learning_rate": 4.146723650296701e-06, "loss": 0.0079, "num_tokens": 9087032.0, "reward": 0.86273193359375, "reward_std": 0.015774179250001907, "rewards//mean": 0.86273193359375, "rewards//std": 0.03323952108621597, "step": 1393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2788, "grad_norm": 0.8460774421691895, "kl": 0.10191156854853034, "learning_rate": 4.145529489863046e-06, "loss": 0.0102, "num_tokens": 9093480.0, "reward": 0.83990478515625, "reward_std": 0.013780036009848118, "rewards//mean": 0.83990478515625, "rewards//std": 0.02098960429430008, "step": 1394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.279, "grad_norm": 0.8317822813987732, "kl": 0.10017718467861414, "learning_rate": 4.144334666610308e-06, "loss": 0.01, "num_tokens": 9099936.0, "reward": 0.79150390625, "reward_std": 0.01070000883191824, "rewards//mean": 0.79150390625, "rewards//std": 0.016090836375951767, "step": 1395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2792, "grad_norm": 1.1106147766113281, "kl": 0.10664213728159666, "learning_rate": 4.143139181019764e-06, "loss": 0.0107, "num_tokens": 9106496.0, "reward": 0.86663818359375, "reward_std": 0.014831550419330597, "rewards//mean": 0.86663818359375, "rewards//std": 0.01730024814605713, "step": 1396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2794, "grad_norm": 0.9266260266304016, "kl": 0.07910352060571313, "learning_rate": 4.141943033572954e-06, "loss": 0.0079, "num_tokens": 9112984.0, "reward": 0.856689453125, "reward_std": 0.013333147391676903, "rewards//mean": 0.856689453125, "rewards//std": 0.033717066049575806, "step": 1397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2796, "grad_norm": 0.961739718914032, "kl": 0.10227519180625677, "learning_rate": 4.140746224751686e-06, "loss": 0.0102, "num_tokens": 9119504.0, "reward": 0.84197998046875, "reward_std": 0.015240367501974106, "rewards//mean": 0.84197998046875, "rewards//std": 0.02835630252957344, "step": 1398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2798, "grad_norm": 0.992081344127655, "kl": 0.10214615613222122, "learning_rate": 4.139548755038035e-06, "loss": 0.0102, "num_tokens": 9126064.0, "reward": 0.8333740234375, "reward_std": 0.016911549493670464, "rewards//mean": 0.8333740234375, "rewards//std": 0.023439761251211166, "step": 1399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.28, "grad_norm": 1.0658843517303467, "kl": 0.08046005479991436, "learning_rate": 4.138350624914342e-06, "loss": 0.008, "num_tokens": 9132552.0, "reward": 0.7735595703125, "reward_std": 0.016805499792099, "rewards//mean": 0.7735595703125, "rewards//std": 0.027268078178167343, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2802, "grad_norm": 0.9907328486442566, "kl": 0.11037387792021036, "learning_rate": 4.137151834863213e-06, "loss": 0.011, "num_tokens": 9139064.0, "reward": 0.8690185546875, "reward_std": 0.011154585517942905, "rewards//mean": 0.8690185546875, "rewards//std": 0.01723153330385685, "step": 1401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2804, "grad_norm": 1.035707950592041, "kl": 0.11078378278762102, "learning_rate": 4.135952385367521e-06, "loss": 0.0111, "num_tokens": 9145576.0, "reward": 0.84564208984375, "reward_std": 0.014471041969954967, "rewards//mean": 0.84564208984375, "rewards//std": 0.02592974714934826, "step": 1402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2806, "grad_norm": 1.1544976234436035, "kl": 0.1000408474355936, "learning_rate": 4.134752276910403e-06, "loss": 0.01, "num_tokens": 9152088.0, "reward": 0.83514404296875, "reward_std": 0.011170648038387299, "rewards//mean": 0.83514404296875, "rewards//std": 0.030837900936603546, "step": 1403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2808, "grad_norm": 1.0290011167526245, "kl": 0.08803244587033987, "learning_rate": 4.133551509975264e-06, "loss": 0.0088, "num_tokens": 9158568.0, "reward": 0.85235595703125, "reward_std": 0.010861358605325222, "rewards//mean": 0.85235595703125, "rewards//std": 0.022033199667930603, "step": 1404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.281, "grad_norm": 1.0953137874603271, "kl": 0.10138277523219585, "learning_rate": 4.132350085045772e-06, "loss": 0.0101, "num_tokens": 9165016.0, "reward": 0.8192138671875, "reward_std": 0.00947296991944313, "rewards//mean": 0.8192138671875, "rewards//std": 0.03602036088705063, "step": 1405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2812, "grad_norm": 0.9435038566589355, "kl": 0.10534818191081285, "learning_rate": 4.131148002605861e-06, "loss": 0.0105, "num_tokens": 9171480.0, "reward": 0.84246826171875, "reward_std": 0.01163963321596384, "rewards//mean": 0.84246826171875, "rewards//std": 0.019351288676261902, "step": 1406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2814, "grad_norm": 1.1009148359298706, "kl": 0.0917098093777895, "learning_rate": 4.1299452631397295e-06, "loss": 0.0092, "num_tokens": 9178064.0, "reward": 0.7855224609375, "reward_std": 0.009099576622247696, "rewards//mean": 0.7855224609375, "rewards//std": 0.03390220180153847, "step": 1407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2816, "grad_norm": 0.9433514475822449, "kl": 0.13720806408673525, "learning_rate": 4.128741867131841e-06, "loss": 0.0137, "num_tokens": 9184624.0, "reward": 0.8614501953125, "reward_std": 0.013486592099070549, "rewards//mean": 0.8614501953125, "rewards//std": 0.027813328430056572, "step": 1408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2818, "grad_norm": 0.8864006996154785, "kl": 0.10798555333167315, "learning_rate": 4.127537815066924e-06, "loss": 0.0108, "num_tokens": 9191088.0, "reward": 0.85986328125, "reward_std": 0.009593471884727478, "rewards//mean": 0.85986328125, "rewards//std": 0.01830209791660309, "step": 1409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.282, "grad_norm": 0.9609375596046448, "kl": 0.10670737363398075, "learning_rate": 4.126333107429968e-06, "loss": 0.0107, "num_tokens": 9197648.0, "reward": 0.822265625, "reward_std": 0.010928682051599026, "rewards//mean": 0.822265625, "rewards//std": 0.02979792281985283, "step": 1410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2822, "grad_norm": 0.9541574120521545, "kl": 0.0901440056040883, "learning_rate": 4.125127744706232e-06, "loss": 0.009, "num_tokens": 9204272.0, "reward": 0.84674072265625, "reward_std": 0.01939518004655838, "rewards//mean": 0.84674072265625, "rewards//std": 0.03892425075173378, "step": 1411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2824, "grad_norm": 0.9248620867729187, "kl": 0.11490076640620828, "learning_rate": 4.123921727381234e-06, "loss": 0.0115, "num_tokens": 9210728.0, "reward": 0.86944580078125, "reward_std": 0.013988449238240719, "rewards//mean": 0.86944580078125, "rewards//std": 0.020947011187672615, "step": 1412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2826, "grad_norm": 1.0765790939331055, "kl": 0.11172952502965927, "learning_rate": 4.122715055940759e-06, "loss": 0.0112, "num_tokens": 9217296.0, "reward": 0.869384765625, "reward_std": 0.018982261419296265, "rewards//mean": 0.869384765625, "rewards//std": 0.03477788344025612, "step": 1413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2828, "grad_norm": 1.047537088394165, "kl": 0.10603108396753669, "learning_rate": 4.121507730870853e-06, "loss": 0.0106, "num_tokens": 9223880.0, "reward": 0.85418701171875, "reward_std": 0.00943954661488533, "rewards//mean": 0.85418701171875, "rewards//std": 0.0278923399746418, "step": 1414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.283, "grad_norm": 1.198005199432373, "kl": 0.12700159661471844, "learning_rate": 4.120299752657828e-06, "loss": 0.0127, "num_tokens": 9230448.0, "reward": 0.85137939453125, "reward_std": 0.018470413982868195, "rewards//mean": 0.85137939453125, "rewards//std": 0.0236988328397274, "step": 1415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2832, "grad_norm": 0.9795251488685608, "kl": 0.10209903120994568, "learning_rate": 4.119091121788256e-06, "loss": 0.0102, "num_tokens": 9236960.0, "reward": 0.85247802734375, "reward_std": 0.015660984441637993, "rewards//mean": 0.85247802734375, "rewards//std": 0.03295319899916649, "step": 1416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2834, "grad_norm": 0.982731819152832, "kl": 0.10011581797152758, "learning_rate": 4.117881838748972e-06, "loss": 0.01, "num_tokens": 9243408.0, "reward": 0.877197265625, "reward_std": 0.015080027282238007, "rewards//mean": 0.877197265625, "rewards//std": 0.026999453082680702, "step": 1417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2836, "grad_norm": 1.0815963745117188, "kl": 0.12661637552082539, "learning_rate": 4.116671904027079e-06, "loss": 0.0127, "num_tokens": 9249952.0, "reward": 0.837890625, "reward_std": 0.011933853849768639, "rewards//mean": 0.837890625, "rewards//std": 0.028399579226970673, "step": 1418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2838, "grad_norm": 1.220989465713501, "kl": 0.10614203102886677, "learning_rate": 4.115461318109936e-06, "loss": 0.0106, "num_tokens": 9256496.0, "reward": 0.84246826171875, "reward_std": 0.011132098734378815, "rewards//mean": 0.84246826171875, "rewards//std": 0.015932835638523102, "step": 1419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.284, "grad_norm": 0.9442338347434998, "kl": 0.09198569133877754, "learning_rate": 4.114250081485166e-06, "loss": 0.0092, "num_tokens": 9263048.0, "reward": 0.86602783203125, "reward_std": 0.011320341378450394, "rewards//mean": 0.86602783203125, "rewards//std": 0.030873224139213562, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2842, "grad_norm": 0.9709688425064087, "kl": 0.1107373433187604, "learning_rate": 4.113038194640658e-06, "loss": 0.0111, "num_tokens": 9269632.0, "reward": 0.83465576171875, "reward_std": 0.019071731716394424, "rewards//mean": 0.83465576171875, "rewards//std": 0.030065396800637245, "step": 1421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2844, "grad_norm": 1.2176882028579712, "kl": 0.09772455412894487, "learning_rate": 4.111825658064557e-06, "loss": 0.0098, "num_tokens": 9276144.0, "reward": 0.8070068359375, "reward_std": 0.01232879888266325, "rewards//mean": 0.8070068359375, "rewards//std": 0.028499344363808632, "step": 1422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2846, "grad_norm": 1.1489514112472534, "kl": 0.12684042751789093, "learning_rate": 4.110612472245274e-06, "loss": 0.0127, "num_tokens": 9282688.0, "reward": 0.8575439453125, "reward_std": 0.016326384618878365, "rewards//mean": 0.8575439453125, "rewards//std": 0.04632912576198578, "step": 1423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2848, "grad_norm": 1.0903536081314087, "kl": 0.1013258439488709, "learning_rate": 4.10939863767148e-06, "loss": 0.0101, "num_tokens": 9289208.0, "reward": 0.76788330078125, "reward_std": 0.011656071059405804, "rewards//mean": 0.76788330078125, "rewards//std": 0.025198446586728096, "step": 1424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.285, "grad_norm": 1.0070606470108032, "kl": 0.11491417232900858, "learning_rate": 4.108184154832106e-06, "loss": 0.0115, "num_tokens": 9295616.0, "reward": 0.86956787109375, "reward_std": 0.014879025518894196, "rewards//mean": 0.86956787109375, "rewards//std": 0.023454206064343452, "step": 1425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2852, "grad_norm": 1.1153665781021118, "kl": 0.10679158009588718, "learning_rate": 4.106969024216348e-06, "loss": 0.0107, "num_tokens": 9302168.0, "reward": 0.8209228515625, "reward_std": 0.013330909423530102, "rewards//mean": 0.8209228515625, "rewards//std": 0.02246902324259281, "step": 1426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2854, "grad_norm": 1.4217824935913086, "kl": 0.11787516716867685, "learning_rate": 4.1057532463136594e-06, "loss": 0.0118, "num_tokens": 9308688.0, "reward": 0.8704833984375, "reward_std": 0.010804399847984314, "rewards//mean": 0.8704833984375, "rewards//std": 0.025753792375326157, "step": 1427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2856, "grad_norm": 0.9635083675384521, "kl": 0.09567527193576097, "learning_rate": 4.104536821613755e-06, "loss": 0.0096, "num_tokens": 9315248.0, "reward": 0.7930908203125, "reward_std": 0.0072563644498586655, "rewards//mean": 0.7930908203125, "rewards//std": 0.01996624656021595, "step": 1428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2858, "grad_norm": 1.0851579904556274, "kl": 0.1065157214179635, "learning_rate": 4.10331975060661e-06, "loss": 0.0107, "num_tokens": 9321856.0, "reward": 0.83807373046875, "reward_std": 0.009960665367543697, "rewards//mean": 0.83807373046875, "rewards//std": 0.018294548615813255, "step": 1429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.286, "grad_norm": 1.0098700523376465, "kl": 0.11234741192311049, "learning_rate": 4.102102033782462e-06, "loss": 0.0112, "num_tokens": 9328360.0, "reward": 0.86376953125, "reward_std": 0.015780659392476082, "rewards//mean": 0.86376953125, "rewards//std": 0.025756437331438065, "step": 1430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2862, "grad_norm": 1.071488618850708, "kl": 0.11517471633851528, "learning_rate": 4.100883671631806e-06, "loss": 0.0115, "num_tokens": 9334920.0, "reward": 0.845947265625, "reward_std": 0.014144480228424072, "rewards//mean": 0.845947265625, "rewards//std": 0.01640942320227623, "step": 1431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2864, "grad_norm": 0.9159356951713562, "kl": 0.09234226401895285, "learning_rate": 4.099664664645399e-06, "loss": 0.0092, "num_tokens": 9341528.0, "reward": 0.84173583984375, "reward_std": 0.012502588331699371, "rewards//mean": 0.84173583984375, "rewards//std": 0.025562861934304237, "step": 1432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2866, "grad_norm": 1.0430186986923218, "kl": 0.10675412509590387, "learning_rate": 4.098445013314255e-06, "loss": 0.0107, "num_tokens": 9347984.0, "reward": 0.86834716796875, "reward_std": 0.010800573043525219, "rewards//mean": 0.86834716796875, "rewards//std": 0.027087265625596046, "step": 1433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2868, "grad_norm": 0.8774861693382263, "kl": 0.0912235789000988, "learning_rate": 4.097224718129652e-06, "loss": 0.0091, "num_tokens": 9354440.0, "reward": 0.83782958984375, "reward_std": 0.008915307931602001, "rewards//mean": 0.83782958984375, "rewards//std": 0.019111210480332375, "step": 1434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.287, "grad_norm": 1.161855936050415, "kl": 0.11477775778621435, "learning_rate": 4.096003779583123e-06, "loss": 0.0115, "num_tokens": 9360952.0, "reward": 0.822998046875, "reward_std": 0.008658187463879585, "rewards//mean": 0.822998046875, "rewards//std": 0.012467307969927788, "step": 1435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2872, "grad_norm": 0.9545578360557556, "kl": 0.10096718603745103, "learning_rate": 4.094782198166463e-06, "loss": 0.0101, "num_tokens": 9367472.0, "reward": 0.844970703125, "reward_std": 0.015348583459854126, "rewards//mean": 0.844970703125, "rewards//std": 0.036497343331575394, "step": 1436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2874, "grad_norm": 0.8353918790817261, "kl": 0.11153100337833166, "learning_rate": 4.093559974371725e-06, "loss": 0.0112, "num_tokens": 9374040.0, "reward": 0.849853515625, "reward_std": 0.013006769120693207, "rewards//mean": 0.849853515625, "rewards//std": 0.03421620652079582, "step": 1437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2876, "grad_norm": 0.9415484666824341, "kl": 0.09260338824242353, "learning_rate": 4.092337108691219e-06, "loss": 0.0093, "num_tokens": 9380608.0, "reward": 0.8663330078125, "reward_std": 0.010838965885341167, "rewards//mean": 0.8663330078125, "rewards//std": 0.015048079192638397, "step": 1438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2878, "grad_norm": 0.9631484746932983, "kl": 0.08959976863116026, "learning_rate": 4.091113601617516e-06, "loss": 0.009, "num_tokens": 9387208.0, "reward": 0.864013671875, "reward_std": 0.01239719893783331, "rewards//mean": 0.864013671875, "rewards//std": 0.028372913599014282, "step": 1439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.288, "grad_norm": 1.1706578731536865, "kl": 0.10352026671171188, "learning_rate": 4.0898894536434445e-06, "loss": 0.0104, "num_tokens": 9393760.0, "reward": 0.83428955078125, "reward_std": 0.012415911071002483, "rewards//mean": 0.83428955078125, "rewards//std": 0.023331904783844948, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2882, "grad_norm": 0.9097560048103333, "kl": 0.12394083477556705, "learning_rate": 4.088664665262091e-06, "loss": 0.0124, "num_tokens": 9400600.0, "reward": 0.84613037109375, "reward_std": 0.014827309176325798, "rewards//mean": 0.84613037109375, "rewards//std": 0.0342073030769825, "step": 1441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2884, "grad_norm": 1.0180933475494385, "kl": 0.13066105311736465, "learning_rate": 4.0874392369668005e-06, "loss": 0.0131, "num_tokens": 9407112.0, "reward": 0.83953857421875, "reward_std": 0.011881370097398758, "rewards//mean": 0.83953857421875, "rewards//std": 0.02084849588572979, "step": 1442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2886, "grad_norm": 1.1155046224594116, "kl": 0.12286428781226277, "learning_rate": 4.0862131692511755e-06, "loss": 0.0123, "num_tokens": 9413616.0, "reward": 0.85260009765625, "reward_std": 0.011454642750322819, "rewards//mean": 0.85260009765625, "rewards//std": 0.017277482897043228, "step": 1443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2888, "grad_norm": 1.07012140750885, "kl": 0.11291062738746405, "learning_rate": 4.084986462609075e-06, "loss": 0.0113, "num_tokens": 9420136.0, "reward": 0.8671875, "reward_std": 0.014170526526868343, "rewards//mean": 0.8671875, "rewards//std": 0.02720610983669758, "step": 1444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.289, "grad_norm": 1.1895315647125244, "kl": 0.12126316828653216, "learning_rate": 4.083759117534617e-06, "loss": 0.0121, "num_tokens": 9426648.0, "reward": 0.83001708984375, "reward_std": 0.01551617868244648, "rewards//mean": 0.83001708984375, "rewards//std": 0.030401363968849182, "step": 1445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2892, "grad_norm": 0.9804394245147705, "kl": 0.11598385497927666, "learning_rate": 4.082531134522176e-06, "loss": 0.0116, "num_tokens": 9433064.0, "reward": 0.782470703125, "reward_std": 0.009592229500412941, "rewards//mean": 0.782470703125, "rewards//std": 0.016231337562203407, "step": 1446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2894, "grad_norm": 0.8221905827522278, "kl": 0.09029260091483593, "learning_rate": 4.081302514066384e-06, "loss": 0.009, "num_tokens": 9439632.0, "reward": 0.86090087890625, "reward_std": 0.011869443580508232, "rewards//mean": 0.86090087890625, "rewards//std": 0.022256728261709213, "step": 1447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2896, "grad_norm": 0.9797345995903015, "kl": 0.1768848728388548, "learning_rate": 4.080073256662128e-06, "loss": 0.0177, "num_tokens": 9446184.0, "reward": 0.86468505859375, "reward_std": 0.011002801358699799, "rewards//mean": 0.86468505859375, "rewards//std": 0.01709604263305664, "step": 1448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2898, "grad_norm": 1.0773303508758545, "kl": 0.16483253054320812, "learning_rate": 4.078843362804553e-06, "loss": 0.0165, "num_tokens": 9452696.0, "reward": 0.84228515625, "reward_std": 0.01050544623285532, "rewards//mean": 0.84228515625, "rewards//std": 0.02405458316206932, "step": 1449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.29, "grad_norm": 0.9164630174636841, "kl": 0.1643889332190156, "learning_rate": 4.07761283298906e-06, "loss": 0.0164, "num_tokens": 9459280.0, "reward": 0.829345703125, "reward_std": 0.011544741690158844, "rewards//mean": 0.829345703125, "rewards//std": 0.029305141419172287, "step": 1450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2902, "grad_norm": 1.370638370513916, "kl": 0.13925799448043108, "learning_rate": 4.076381667711306e-06, "loss": 0.0139, "num_tokens": 9465736.0, "reward": 0.8695068359375, "reward_std": 0.01142690610140562, "rewards//mean": 0.8695068359375, "rewards//std": 0.022985881194472313, "step": 1451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2904, "grad_norm": 1.2325938940048218, "kl": 0.14621046837419271, "learning_rate": 4.075149867467206e-06, "loss": 0.0146, "num_tokens": 9472264.0, "reward": 0.73541259765625, "reward_std": 0.010383212007582188, "rewards//mean": 0.73541259765625, "rewards//std": 0.021284664049744606, "step": 1452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2906, "grad_norm": 1.0405800342559814, "kl": 0.14867439214140177, "learning_rate": 4.073917432752927e-06, "loss": 0.0149, "num_tokens": 9478728.0, "reward": 0.861572265625, "reward_std": 0.015261801891028881, "rewards//mean": 0.861572265625, "rewards//std": 0.02539062686264515, "step": 1453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2908, "grad_norm": 1.4667929410934448, "kl": 0.1414516782388091, "learning_rate": 4.072684364064895e-06, "loss": 0.0141, "num_tokens": 9485168.0, "reward": 0.80865478515625, "reward_std": 0.011553778313100338, "rewards//mean": 0.80865478515625, "rewards//std": 0.026652909815311432, "step": 1454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.291, "grad_norm": 1.2408013343811035, "kl": 0.19575621094554663, "learning_rate": 4.071450661899789e-06, "loss": 0.0196, "num_tokens": 9491688.0, "reward": 0.87451171875, "reward_std": 0.015168572776019573, "rewards//mean": 0.87451171875, "rewards//std": 0.02346332184970379, "step": 1455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2912, "grad_norm": 0.930512011051178, "kl": 0.18390180077403784, "learning_rate": 4.070216326754544e-06, "loss": 0.0184, "num_tokens": 9498224.0, "reward": 0.81024169921875, "reward_std": 0.008332496508955956, "rewards//mean": 0.81024169921875, "rewards//std": 0.024722980335354805, "step": 1456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2914, "grad_norm": 1.0420191287994385, "kl": 0.1482674479484558, "learning_rate": 4.06898135912635e-06, "loss": 0.0148, "num_tokens": 9504728.0, "reward": 0.8685302734375, "reward_std": 0.011950846761465073, "rewards//mean": 0.8685302734375, "rewards//std": 0.028032347559928894, "step": 1457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2916, "grad_norm": 1.2089178562164307, "kl": 0.2419847995042801, "learning_rate": 4.067745759512654e-06, "loss": 0.0242, "num_tokens": 9511320.0, "reward": 0.85015869140625, "reward_std": 0.010974111966788769, "rewards//mean": 0.85015869140625, "rewards//std": 0.024913892149925232, "step": 1458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2918, "grad_norm": 1.3839377164840698, "kl": 0.2210477478802204, "learning_rate": 4.066509528411151e-06, "loss": 0.0221, "num_tokens": 9517864.0, "reward": 0.84588623046875, "reward_std": 0.013671338558197021, "rewards//mean": 0.84588623046875, "rewards//std": 0.025295577943325043, "step": 1459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.292, "grad_norm": 1.127463698387146, "kl": 0.12716227397322655, "learning_rate": 4.065272666319799e-06, "loss": 0.0127, "num_tokens": 9524384.0, "reward": 0.88916015625, "reward_std": 0.011658545583486557, "rewards//mean": 0.88916015625, "rewards//std": 0.01899053156375885, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2922, "grad_norm": 1.8438172340393066, "kl": 0.2706862650811672, "learning_rate": 4.064035173736804e-06, "loss": 0.0271, "num_tokens": 9530992.0, "reward": 0.84002685546875, "reward_std": 0.01677975431084633, "rewards//mean": 0.84002685546875, "rewards//std": 0.024758469313383102, "step": 1461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2924, "grad_norm": 1.0116584300994873, "kl": 0.12787321861833334, "learning_rate": 4.062797051160628e-06, "loss": 0.0128, "num_tokens": 9537504.0, "reward": 0.86090087890625, "reward_std": 0.010842365212738514, "rewards//mean": 0.86090087890625, "rewards//std": 0.0290706567466259, "step": 1462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2926, "grad_norm": 1.48806893825531, "kl": 0.25177933368831873, "learning_rate": 4.061558299089986e-06, "loss": 0.0252, "num_tokens": 9544128.0, "reward": 0.8865966796875, "reward_std": 0.015859346836805344, "rewards//mean": 0.8865966796875, "rewards//std": 0.031342629343271255, "step": 1463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2928, "grad_norm": 1.2298635244369507, "kl": 0.1497166072949767, "learning_rate": 4.060318918023849e-06, "loss": 0.015, "num_tokens": 9550584.0, "reward": 0.8763427734375, "reward_std": 0.008863152004778385, "rewards//mean": 0.8763427734375, "rewards//std": 0.015859151259064674, "step": 1464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.293, "grad_norm": 1.2120453119277954, "kl": 0.1496733631938696, "learning_rate": 4.059078908461437e-06, "loss": 0.015, "num_tokens": 9557136.0, "reward": 0.83984375, "reward_std": 0.00886900257319212, "rewards//mean": 0.83984375, "rewards//std": 0.019966626539826393, "step": 1465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2932, "grad_norm": 1.4771562814712524, "kl": 0.22556419111788273, "learning_rate": 4.057838270902228e-06, "loss": 0.0226, "num_tokens": 9563624.0, "reward": 0.83905029296875, "reward_std": 0.016592662781476974, "rewards//mean": 0.83905029296875, "rewards//std": 0.028486261144280434, "step": 1466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2934, "grad_norm": 1.3420743942260742, "kl": 0.1893452014774084, "learning_rate": 4.05659700584595e-06, "loss": 0.0189, "num_tokens": 9570088.0, "reward": 0.8336181640625, "reward_std": 0.014053305611014366, "rewards//mean": 0.8336181640625, "rewards//std": 0.016930213198065758, "step": 1467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2936, "grad_norm": 1.3484526872634888, "kl": 0.23788007255643606, "learning_rate": 4.055355113792584e-06, "loss": 0.0238, "num_tokens": 9576560.0, "reward": 0.85858154296875, "reward_std": 0.013363005593419075, "rewards//mean": 0.85858154296875, "rewards//std": 0.023315031081438065, "step": 1468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2938, "grad_norm": 1.4559316635131836, "kl": 0.21919224690645933, "learning_rate": 4.054112595242364e-06, "loss": 0.0219, "num_tokens": 9583120.0, "reward": 0.8697509765625, "reward_std": 0.013238479383289814, "rewards//mean": 0.8697509765625, "rewards//std": 0.019639072939753532, "step": 1469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.294, "grad_norm": 1.623126745223999, "kl": 0.18563608825206757, "learning_rate": 4.052869450695776e-06, "loss": 0.0186, "num_tokens": 9589752.0, "reward": 0.8463134765625, "reward_std": 0.013819476589560509, "rewards//mean": 0.8463134765625, "rewards//std": 0.02644973061978817, "step": 1470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2942, "grad_norm": 1.6477928161621094, "kl": 0.23351212963461876, "learning_rate": 4.05162568065356e-06, "loss": 0.0234, "num_tokens": 9596312.0, "reward": 0.8597412109375, "reward_std": 0.010276904329657555, "rewards//mean": 0.8597412109375, "rewards//std": 0.022975342348217964, "step": 1471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2944, "grad_norm": 1.236037254333496, "kl": 0.14013463538140059, "learning_rate": 4.050381285616704e-06, "loss": 0.014, "num_tokens": 9602800.0, "reward": 0.858154296875, "reward_std": 0.014526089653372765, "rewards//mean": 0.858154296875, "rewards//std": 0.023734018206596375, "step": 1472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2946, "grad_norm": 1.1711252927780151, "kl": 0.26334461383521557, "learning_rate": 4.049136266086453e-06, "loss": 0.0263, "num_tokens": 9609496.0, "reward": 0.82861328125, "reward_std": 0.013160791248083115, "rewards//mean": 0.82861328125, "rewards//std": 0.018512625247240067, "step": 1473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2948, "grad_norm": 1.3799346685409546, "kl": 0.2583620361983776, "learning_rate": 4.047890622564299e-06, "loss": 0.0258, "num_tokens": 9616096.0, "reward": 0.85455322265625, "reward_std": 0.016093868762254715, "rewards//mean": 0.85455322265625, "rewards//std": 0.044104985892772675, "step": 1474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.295, "grad_norm": 1.2277685403823853, "kl": 0.18356910441070795, "learning_rate": 4.046644355551986e-06, "loss": 0.0184, "num_tokens": 9622536.0, "reward": 0.86181640625, "reward_std": 0.013910414651036263, "rewards//mean": 0.86181640625, "rewards//std": 0.04232700914144516, "step": 1475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2952, "grad_norm": 1.0714882612228394, "kl": 0.18714805413037539, "learning_rate": 4.045397465551513e-06, "loss": 0.0187, "num_tokens": 9629032.0, "reward": 0.854248046875, "reward_std": 0.012925941497087479, "rewards//mean": 0.854248046875, "rewards//std": 0.01992868259549141, "step": 1476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2954, "grad_norm": 1.0526518821716309, "kl": 0.1630002660676837, "learning_rate": 4.044149953065126e-06, "loss": 0.0163, "num_tokens": 9635512.0, "reward": 0.836669921875, "reward_std": 0.014097323641180992, "rewards//mean": 0.836669921875, "rewards//std": 0.027803802862763405, "step": 1477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2956, "grad_norm": 1.1917614936828613, "kl": 0.11780481319874525, "learning_rate": 4.042901818595321e-06, "loss": 0.0118, "num_tokens": 9642024.0, "reward": 0.82391357421875, "reward_std": 0.011092804372310638, "rewards//mean": 0.82391357421875, "rewards//std": 0.021968523040413857, "step": 1478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2958, "grad_norm": 1.118537425994873, "kl": 0.15136539097875357, "learning_rate": 4.0416530626448495e-06, "loss": 0.0151, "num_tokens": 9648432.0, "reward": 0.87408447265625, "reward_std": 0.011182261630892754, "rewards//mean": 0.87408447265625, "rewards//std": 0.030740059912204742, "step": 1479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.296, "grad_norm": 0.9869058132171631, "kl": 0.15003001503646374, "learning_rate": 4.040403685716708e-06, "loss": 0.015, "num_tokens": 9654928.0, "reward": 0.832763671875, "reward_std": 0.012067338451743126, "rewards//mean": 0.832763671875, "rewards//std": 0.031000016257166862, "step": 1480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2962, "grad_norm": 0.9904590845108032, "kl": 0.1259972406551242, "learning_rate": 4.039153688314146e-06, "loss": 0.0126, "num_tokens": 9661480.0, "reward": 0.86383056640625, "reward_std": 0.011252230033278465, "rewards//mean": 0.86383056640625, "rewards//std": 0.02313448302447796, "step": 1481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2964, "grad_norm": 1.0344334840774536, "kl": 0.15467227064073086, "learning_rate": 4.037903070940663e-06, "loss": 0.0155, "num_tokens": 9668096.0, "reward": 0.84698486328125, "reward_std": 0.01276254653930664, "rewards//mean": 0.84698486328125, "rewards//std": 0.0314551405608654, "step": 1482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2966, "grad_norm": 1.000920057296753, "kl": 0.11305194813758135, "learning_rate": 4.036651834100006e-06, "loss": 0.0113, "num_tokens": 9674664.0, "reward": 0.85479736328125, "reward_std": 0.013624632731080055, "rewards//mean": 0.85479736328125, "rewards//std": 0.02857433818280697, "step": 1483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2968, "grad_norm": 1.1631202697753906, "kl": 0.21629176568239927, "learning_rate": 4.035399978296175e-06, "loss": 0.0216, "num_tokens": 9681104.0, "reward": 0.85479736328125, "reward_std": 0.014385364949703217, "rewards//mean": 0.85479736328125, "rewards//std": 0.0204763263463974, "step": 1484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.297, "grad_norm": 0.9765872955322266, "kl": 0.20351298339664936, "learning_rate": 4.034147504033416e-06, "loss": 0.0204, "num_tokens": 9687624.0, "reward": 0.8299560546875, "reward_std": 0.017347533255815506, "rewards//mean": 0.8299560546875, "rewards//std": 0.039093296974897385, "step": 1485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2972, "grad_norm": 1.2227191925048828, "kl": 0.20875768177211285, "learning_rate": 4.032894411816226e-06, "loss": 0.0209, "num_tokens": 9694224.0, "reward": 0.842041015625, "reward_std": 0.011289146728813648, "rewards//mean": 0.842041015625, "rewards//std": 0.018828826025128365, "step": 1486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2974, "grad_norm": 1.0543596744537354, "kl": 0.13768827077001333, "learning_rate": 4.03164070214935e-06, "loss": 0.0138, "num_tokens": 9700784.0, "reward": 0.84869384765625, "reward_std": 0.013281045481562614, "rewards//mean": 0.84869384765625, "rewards//std": 0.04151666909456253, "step": 1487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2976, "grad_norm": 1.121751070022583, "kl": 0.18548458442091942, "learning_rate": 4.030386375537782e-06, "loss": 0.0185, "num_tokens": 9707312.0, "reward": 0.83837890625, "reward_std": 0.011025451123714447, "rewards//mean": 0.83837890625, "rewards//std": 0.018222521990537643, "step": 1488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2978, "grad_norm": 0.9521909952163696, "kl": 0.1483901534229517, "learning_rate": 4.029131432486765e-06, "loss": 0.0148, "num_tokens": 9713856.0, "reward": 0.86669921875, "reward_std": 0.011670071631669998, "rewards//mean": 0.86669921875, "rewards//std": 0.02853146195411682, "step": 1489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.298, "grad_norm": 0.9832926988601685, "kl": 0.10297422017902136, "learning_rate": 4.02787587350179e-06, "loss": 0.0103, "num_tokens": 9720320.0, "reward": 0.8720703125, "reward_std": 0.013584131374955177, "rewards//mean": 0.8720703125, "rewards//std": 0.024705294519662857, "step": 1490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2982, "grad_norm": 1.035644292831421, "kl": 0.13656291086226702, "learning_rate": 4.0266196990885955e-06, "loss": 0.0137, "num_tokens": 9726784.0, "reward": 0.81396484375, "reward_std": 0.014303882606327534, "rewards//mean": 0.81396484375, "rewards//std": 0.020129719749093056, "step": 1491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2984, "grad_norm": 1.6437016725540161, "kl": 0.13031875900924206, "learning_rate": 4.02536290975317e-06, "loss": 0.013, "num_tokens": 9733304.0, "reward": 0.77197265625, "reward_std": 0.009252749383449554, "rewards//mean": 0.77197265625, "rewards//std": 0.020628880709409714, "step": 1492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2986, "grad_norm": 0.964187502861023, "kl": 0.10361974500119686, "learning_rate": 4.024105506001745e-06, "loss": 0.0104, "num_tokens": 9739872.0, "reward": 0.78948974609375, "reward_std": 0.013057501055300236, "rewards//mean": 0.78948974609375, "rewards//std": 0.030908016487956047, "step": 1493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2988, "grad_norm": 0.8852583169937134, "kl": 0.10270308936014771, "learning_rate": 4.022847488340806e-06, "loss": 0.0103, "num_tokens": 9746448.0, "reward": 0.8092041015625, "reward_std": 0.00957825779914856, "rewards//mean": 0.8092041015625, "rewards//std": 0.021170299500226974, "step": 1494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.299, "grad_norm": 1.188012957572937, "kl": 0.10409956611692905, "learning_rate": 4.02158885727708e-06, "loss": 0.0104, "num_tokens": 9752976.0, "reward": 0.89398193359375, "reward_std": 0.010849881917238235, "rewards//mean": 0.89398193359375, "rewards//std": 0.028132811188697815, "step": 1495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2992, "grad_norm": 0.8904669880867004, "kl": 0.08989690616726875, "learning_rate": 4.020329613317545e-06, "loss": 0.009, "num_tokens": 9759560.0, "reward": 0.86859130859375, "reward_std": 0.012392792850732803, "rewards//mean": 0.86859130859375, "rewards//std": 0.02895168773829937, "step": 1496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2994, "grad_norm": 0.9864640235900879, "kl": 0.11153603252023458, "learning_rate": 4.0190697569694235e-06, "loss": 0.0112, "num_tokens": 9766144.0, "reward": 0.82501220703125, "reward_std": 0.011273293755948544, "rewards//mean": 0.82501220703125, "rewards//std": 0.02582855336368084, "step": 1497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2996, "grad_norm": 0.8567317724227905, "kl": 0.0999477356672287, "learning_rate": 4.0178092887401845e-06, "loss": 0.01, "num_tokens": 9772656.0, "reward": 0.84063720703125, "reward_std": 0.011627024039626122, "rewards//mean": 0.84063720703125, "rewards//std": 0.030664624646306038, "step": 1498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.2998, "grad_norm": 1.0833796262741089, "kl": 0.11925695557147264, "learning_rate": 4.0165482091375466e-06, "loss": 0.0119, "num_tokens": 9779224.0, "reward": 0.827880859375, "reward_std": 0.00963970459997654, "rewards//mean": 0.827880859375, "rewards//std": 0.02509317174553871, "step": 1499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3, "grad_norm": 1.155179500579834, "kl": 0.12464006897062063, "learning_rate": 4.015286518669471e-06, "loss": 0.0125, "num_tokens": 9785768.0, "reward": 0.79083251953125, "reward_std": 0.010890122503042221, "rewards//mean": 0.79083251953125, "rewards//std": 0.02218383364379406, "step": 1500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3002, "grad_norm": 0.9555351734161377, "kl": 0.11808125302195549, "learning_rate": 4.014024217844167e-06, "loss": 0.0118, "num_tokens": 9792232.0, "reward": 0.824462890625, "reward_std": 0.013646856881678104, "rewards//mean": 0.824462890625, "rewards//std": 0.02582569606602192, "step": 1501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3004, "grad_norm": 1.0187458992004395, "kl": 0.09524169331416488, "learning_rate": 4.012761307170089e-06, "loss": 0.0095, "num_tokens": 9798640.0, "reward": 0.811279296875, "reward_std": 0.008143045008182526, "rewards//mean": 0.811279296875, "rewards//std": 0.027151526883244514, "step": 1502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3006, "grad_norm": 0.9885643720626831, "kl": 0.11309793032705784, "learning_rate": 4.011497787155938e-06, "loss": 0.0113, "num_tokens": 9805192.0, "reward": 0.7860107421875, "reward_std": 0.011032561771571636, "rewards//mean": 0.7860107421875, "rewards//std": 0.020675424486398697, "step": 1503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3008, "grad_norm": 1.1193957328796387, "kl": 0.09058209974318743, "learning_rate": 4.010233658310658e-06, "loss": 0.0091, "num_tokens": 9811656.0, "reward": 0.86651611328125, "reward_std": 0.010552256368100643, "rewards//mean": 0.86651611328125, "rewards//std": 0.022332094609737396, "step": 1504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.301, "grad_norm": 0.9449204206466675, "kl": 0.10706906672567129, "learning_rate": 4.008968921143441e-06, "loss": 0.0107, "num_tokens": 9818208.0, "reward": 0.8438720703125, "reward_std": 0.01311251800507307, "rewards//mean": 0.8438720703125, "rewards//std": 0.03281492739915848, "step": 1505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3012, "grad_norm": 0.8751006722450256, "kl": 0.0867734020575881, "learning_rate": 4.007703576163724e-06, "loss": 0.0087, "num_tokens": 9824688.0, "reward": 0.826171875, "reward_std": 0.006749882362782955, "rewards//mean": 0.826171875, "rewards//std": 0.021654214709997177, "step": 1506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3014, "grad_norm": 1.2223790884017944, "kl": 0.09092782577499747, "learning_rate": 4.006437623881186e-06, "loss": 0.0091, "num_tokens": 9831160.0, "reward": 0.892333984375, "reward_std": 0.014247046783566475, "rewards//mean": 0.892333984375, "rewards//std": 0.03086690977215767, "step": 1507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3016, "grad_norm": 0.9416851997375488, "kl": 0.11521990271285176, "learning_rate": 4.005171064805754e-06, "loss": 0.0115, "num_tokens": 9837664.0, "reward": 0.80084228515625, "reward_std": 0.015122679993510246, "rewards//mean": 0.80084228515625, "rewards//std": 0.024265030398964882, "step": 1508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3018, "grad_norm": 0.9480648636817932, "kl": 0.09349404834210873, "learning_rate": 4.003903899447597e-06, "loss": 0.0093, "num_tokens": 9844288.0, "reward": 0.82733154296875, "reward_std": 0.012186985462903976, "rewards//mean": 0.82733154296875, "rewards//std": 0.015934735536575317, "step": 1509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.302, "grad_norm": 0.8254448175430298, "kl": 0.07622414035722613, "learning_rate": 4.0026361283171285e-06, "loss": 0.0076, "num_tokens": 9850808.0, "reward": 0.80224609375, "reward_std": 0.010978786274790764, "rewards//mean": 0.80224609375, "rewards//std": 0.020105641335248947, "step": 1510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3022, "grad_norm": 1.0643914937973022, "kl": 0.09281850652769208, "learning_rate": 4.001367751925008e-06, "loss": 0.0093, "num_tokens": 9857376.0, "reward": 0.80853271484375, "reward_std": 0.010640128515660763, "rewards//mean": 0.80853271484375, "rewards//std": 0.02167089842259884, "step": 1511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3024, "grad_norm": 1.0243295431137085, "kl": 0.10585340484976768, "learning_rate": 4.000098770782136e-06, "loss": 0.0106, "num_tokens": 9863928.0, "reward": 0.83380126953125, "reward_std": 0.013894623145461082, "rewards//mean": 0.83380126953125, "rewards//std": 0.027223283424973488, "step": 1512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3026, "grad_norm": 0.9573279023170471, "kl": 0.11550587136298418, "learning_rate": 3.998829185399659e-06, "loss": 0.0116, "num_tokens": 9870408.0, "reward": 0.8316650390625, "reward_std": 0.016471361741423607, "rewards//mean": 0.8316650390625, "rewards//std": 0.023049017414450645, "step": 1513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3028, "grad_norm": 0.9079932570457458, "kl": 0.09464981965720654, "learning_rate": 3.997558996288965e-06, "loss": 0.0095, "num_tokens": 9876872.0, "reward": 0.79937744140625, "reward_std": 0.010661354288458824, "rewards//mean": 0.79937744140625, "rewards//std": 0.02342062070965767, "step": 1514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.303, "grad_norm": 0.8314977288246155, "kl": 0.11573614086955786, "learning_rate": 3.996288203961686e-06, "loss": 0.0116, "num_tokens": 9883400.0, "reward": 0.834716796875, "reward_std": 0.009656884707510471, "rewards//mean": 0.834716796875, "rewards//std": 0.022303014993667603, "step": 1515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3032, "grad_norm": 0.9864110946655273, "kl": 0.11670671310275793, "learning_rate": 3.995016808929698e-06, "loss": 0.0117, "num_tokens": 9889824.0, "reward": 0.85137939453125, "reward_std": 0.011265913024544716, "rewards//mean": 0.85137939453125, "rewards//std": 0.02554805390536785, "step": 1516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3034, "grad_norm": 0.9441384673118591, "kl": 0.09770321287214756, "learning_rate": 3.993744811705118e-06, "loss": 0.0098, "num_tokens": 9896312.0, "reward": 0.84283447265625, "reward_std": 0.011553101241588593, "rewards//mean": 0.84283447265625, "rewards//std": 0.029868897050619125, "step": 1517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3036, "grad_norm": 0.9125849008560181, "kl": 0.09580626897513866, "learning_rate": 3.992472212800307e-06, "loss": 0.0096, "num_tokens": 9902784.0, "reward": 0.85498046875, "reward_std": 0.015424443408846855, "rewards//mean": 0.85498046875, "rewards//std": 0.04143642261624336, "step": 1518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3038, "grad_norm": 0.8712005615234375, "kl": 0.10071872919797897, "learning_rate": 3.991199012727867e-06, "loss": 0.0101, "num_tokens": 9909440.0, "reward": 0.8607177734375, "reward_std": 0.009875558316707611, "rewards//mean": 0.8607177734375, "rewards//std": 0.019265538081526756, "step": 1519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.304, "grad_norm": 0.9775246977806091, "kl": 0.09578885696828365, "learning_rate": 3.989925212000641e-06, "loss": 0.0096, "num_tokens": 9915952.0, "reward": 0.8408203125, "reward_std": 0.012960642576217651, "rewards//mean": 0.8408203125, "rewards//std": 0.029264822602272034, "step": 1520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3042, "grad_norm": 1.0084975957870483, "kl": 0.10804043291136622, "learning_rate": 3.98865081113172e-06, "loss": 0.0108, "num_tokens": 9922552.0, "reward": 0.8486328125, "reward_std": 0.010642222128808498, "rewards//mean": 0.8486328125, "rewards//std": 0.026143772527575493, "step": 1521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3044, "grad_norm": 1.0175291299819946, "kl": 0.10143769904971123, "learning_rate": 3.98737581063443e-06, "loss": 0.0101, "num_tokens": 9929112.0, "reward": 0.88055419921875, "reward_std": 0.014652518555521965, "rewards//mean": 0.88055419921875, "rewards//std": 0.03307104855775833, "step": 1522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3046, "grad_norm": 1.0959678888320923, "kl": 0.09590136911720037, "learning_rate": 3.986100211022341e-06, "loss": 0.0096, "num_tokens": 9935616.0, "reward": 0.86181640625, "reward_std": 0.01263584103435278, "rewards//mean": 0.86181640625, "rewards//std": 0.0288186427205801, "step": 1523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3048, "grad_norm": 0.8565276265144348, "kl": 0.09777226811274886, "learning_rate": 3.984824012809265e-06, "loss": 0.0098, "num_tokens": 9942128.0, "reward": 0.8427734375, "reward_std": 0.011245591565966606, "rewards//mean": 0.8427734375, "rewards//std": 0.02083333395421505, "step": 1524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.305, "grad_norm": 1.2849348783493042, "kl": 0.12948387954384089, "learning_rate": 3.983547216509254e-06, "loss": 0.0129, "num_tokens": 9948592.0, "reward": 0.86907958984375, "reward_std": 0.020118363201618195, "rewards//mean": 0.86907958984375, "rewards//std": 0.02806493267416954, "step": 1525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3052, "grad_norm": 0.8933499455451965, "kl": 0.11073662992566824, "learning_rate": 3.982269822636602e-06, "loss": 0.0111, "num_tokens": 9955032.0, "reward": 0.779052734375, "reward_std": 0.006543848663568497, "rewards//mean": 0.779052734375, "rewards//std": 0.015420207753777504, "step": 1526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3054, "grad_norm": 1.0669031143188477, "kl": 0.1041721673682332, "learning_rate": 3.980991831705842e-06, "loss": 0.0104, "num_tokens": 9961552.0, "reward": 0.82623291015625, "reward_std": 0.010609949938952923, "rewards//mean": 0.82623291015625, "rewards//std": 0.021564461290836334, "step": 1527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3056, "grad_norm": 1.0093790292739868, "kl": 0.11958145722746849, "learning_rate": 3.97971324423175e-06, "loss": 0.012, "num_tokens": 9968072.0, "reward": 0.82122802734375, "reward_std": 0.010390098206698895, "rewards//mean": 0.82122802734375, "rewards//std": 0.015667449682950974, "step": 1528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3058, "grad_norm": 1.0176607370376587, "kl": 0.114197903778404, "learning_rate": 3.97843406072934e-06, "loss": 0.0114, "num_tokens": 9974520.0, "reward": 0.8182373046875, "reward_std": 0.009466897696256638, "rewards//mean": 0.8182373046875, "rewards//std": 0.017033616080880165, "step": 1529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.306, "grad_norm": 0.9734399318695068, "kl": 0.1230583181604743, "learning_rate": 3.977154281713866e-06, "loss": 0.0123, "num_tokens": 9981016.0, "reward": 0.84307861328125, "reward_std": 0.01285941805690527, "rewards//mean": 0.84307861328125, "rewards//std": 0.025624966248869896, "step": 1530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3062, "grad_norm": 0.9041325449943542, "kl": 0.11644632322713733, "learning_rate": 3.9758739077008256e-06, "loss": 0.0116, "num_tokens": 9987600.0, "reward": 0.84881591796875, "reward_std": 0.013761574402451515, "rewards//mean": 0.84881591796875, "rewards//std": 0.03623630106449127, "step": 1531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3064, "grad_norm": 1.1362683773040771, "kl": 0.13513679057359695, "learning_rate": 3.97459293920595e-06, "loss": 0.0135, "num_tokens": 9994120.0, "reward": 0.861572265625, "reward_std": 0.013692237436771393, "rewards//mean": 0.861572265625, "rewards//std": 0.02397768571972847, "step": 1532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3066, "grad_norm": 0.9370095729827881, "kl": 0.1517316708341241, "learning_rate": 3.9733113767452165e-06, "loss": 0.0152, "num_tokens": 10000624.0, "reward": 0.83917236328125, "reward_std": 0.012271118350327015, "rewards//mean": 0.83917236328125, "rewards//std": 0.037824857980012894, "step": 1533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3068, "grad_norm": 0.9835516810417175, "kl": 0.10404063668102026, "learning_rate": 3.972029220834836e-06, "loss": 0.0104, "num_tokens": 10007136.0, "reward": 0.837158203125, "reward_std": 0.011765485629439354, "rewards//mean": 0.837158203125, "rewards//std": 0.020913109183311462, "step": 1534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.307, "grad_norm": 1.086817979812622, "kl": 0.14606587681919336, "learning_rate": 3.970746471991261e-06, "loss": 0.0146, "num_tokens": 10013744.0, "reward": 0.870361328125, "reward_std": 0.015291957184672356, "rewards//mean": 0.870361328125, "rewards//std": 0.03600291535258293, "step": 1535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3072, "grad_norm": 1.3119032382965088, "kl": 0.1423627771437168, "learning_rate": 3.969463130731183e-06, "loss": 0.0142, "num_tokens": 10020248.0, "reward": 0.85736083984375, "reward_std": 0.012822803109884262, "rewards//mean": 0.85736083984375, "rewards//std": 0.022412613034248352, "step": 1536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3074, "grad_norm": 1.0490734577178955, "kl": 0.08914090041071177, "learning_rate": 3.968179197571532e-06, "loss": 0.0089, "num_tokens": 10026768.0, "reward": 0.84765625, "reward_std": 0.008574573323130608, "rewards//mean": 0.84765625, "rewards//std": 0.015717731788754463, "step": 1537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3076, "grad_norm": 1.0039983987808228, "kl": 0.11316837649792433, "learning_rate": 3.966894673029476e-06, "loss": 0.0113, "num_tokens": 10033336.0, "reward": 0.8018798828125, "reward_std": 0.01197902113199234, "rewards//mean": 0.8018798828125, "rewards//std": 0.027449561282992363, "step": 1538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3078, "grad_norm": 1.0520570278167725, "kl": 0.11704367212951183, "learning_rate": 3.965609557622421e-06, "loss": 0.0117, "num_tokens": 10039800.0, "reward": 0.85040283203125, "reward_std": 0.011031118221580982, "rewards//mean": 0.85040283203125, "rewards//std": 0.024076521396636963, "step": 1539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.308, "grad_norm": 1.1553947925567627, "kl": 0.14275426231324673, "learning_rate": 3.964323851868012e-06, "loss": 0.0143, "num_tokens": 10046416.0, "reward": 0.83209228515625, "reward_std": 0.012375419959425926, "rewards//mean": 0.83209228515625, "rewards//std": 0.027405615895986557, "step": 1540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3082, "grad_norm": 1.0867881774902344, "kl": 0.12534686364233494, "learning_rate": 3.96303755628413e-06, "loss": 0.0125, "num_tokens": 10052888.0, "reward": 0.830078125, "reward_std": 0.013117661699652672, "rewards//mean": 0.830078125, "rewards//std": 0.025808105245232582, "step": 1541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3084, "grad_norm": 1.0147706270217896, "kl": 0.12783116288483143, "learning_rate": 3.961750671388894e-06, "loss": 0.0128, "num_tokens": 10059336.0, "reward": 0.87933349609375, "reward_std": 0.013535341247916222, "rewards//mean": 0.87933349609375, "rewards//std": 0.029166312888264656, "step": 1542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3086, "grad_norm": 1.1973650455474854, "kl": 0.12111096177250147, "learning_rate": 3.960463197700664e-06, "loss": 0.0121, "num_tokens": 10065784.0, "reward": 0.85302734375, "reward_std": 0.012875966727733612, "rewards//mean": 0.85302734375, "rewards//std": 0.03488653153181076, "step": 1543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3088, "grad_norm": 1.0904971361160278, "kl": 0.12519002333283424, "learning_rate": 3.959175135738032e-06, "loss": 0.0125, "num_tokens": 10072320.0, "reward": 0.81646728515625, "reward_std": 0.015413210727274418, "rewards//mean": 0.81646728515625, "rewards//std": 0.03787564858794212, "step": 1544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.309, "grad_norm": 1.1575191020965576, "kl": 0.1391693903133273, "learning_rate": 3.95788648601983e-06, "loss": 0.0139, "num_tokens": 10078792.0, "reward": 0.87799072265625, "reward_std": 0.014824006706476212, "rewards//mean": 0.87799072265625, "rewards//std": 0.03309255465865135, "step": 1545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3092, "grad_norm": 1.1259450912475586, "kl": 0.10671613551676273, "learning_rate": 3.956597249065126e-06, "loss": 0.0107, "num_tokens": 10085360.0, "reward": 0.84185791015625, "reward_std": 0.015546904876828194, "rewards//mean": 0.84185791015625, "rewards//std": 0.03328412026166916, "step": 1546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3094, "grad_norm": 0.9413769245147705, "kl": 0.10240978840738535, "learning_rate": 3.955307425393224e-06, "loss": 0.0102, "num_tokens": 10091856.0, "reward": 0.84832763671875, "reward_std": 0.010495718568563461, "rewards//mean": 0.84832763671875, "rewards//std": 0.03229102864861488, "step": 1547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3096, "grad_norm": 1.1100678443908691, "kl": 0.11617637844756246, "learning_rate": 3.954017015523665e-06, "loss": 0.0116, "num_tokens": 10098392.0, "reward": 0.88360595703125, "reward_std": 0.012345760129392147, "rewards//mean": 0.88360595703125, "rewards//std": 0.02323177456855774, "step": 1548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3098, "grad_norm": 0.9572536945343018, "kl": 0.09792736452072859, "learning_rate": 3.9527260199762266e-06, "loss": 0.0098, "num_tokens": 10104888.0, "reward": 0.822509765625, "reward_std": 0.011632275767624378, "rewards//mean": 0.822509765625, "rewards//std": 0.041915152221918106, "step": 1549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.31, "grad_norm": 1.044927954673767, "kl": 0.12094098795205355, "learning_rate": 3.95143443927092e-06, "loss": 0.0121, "num_tokens": 10111384.0, "reward": 0.8411865234375, "reward_std": 0.013515893369913101, "rewards//mean": 0.8411865234375, "rewards//std": 0.01394104678183794, "step": 1550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3102, "grad_norm": 1.51292884349823, "kl": 0.15140191745012999, "learning_rate": 3.950142273927996e-06, "loss": 0.0151, "num_tokens": 10117952.0, "reward": 0.83819580078125, "reward_std": 0.012147306464612484, "rewards//mean": 0.83819580078125, "rewards//std": 0.02490842342376709, "step": 1551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3104, "grad_norm": 0.9936661720275879, "kl": 0.2027598526328802, "learning_rate": 3.948849524467937e-06, "loss": 0.0203, "num_tokens": 10124488.0, "reward": 0.85675048828125, "reward_std": 0.015573893673717976, "rewards//mean": 0.85675048828125, "rewards//std": 0.029512515291571617, "step": 1552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3106, "grad_norm": 1.174791693687439, "kl": 0.11028601508587599, "learning_rate": 3.9475561914114625e-06, "loss": 0.011, "num_tokens": 10130952.0, "reward": 0.87298583984375, "reward_std": 0.01029521320015192, "rewards//mean": 0.87298583984375, "rewards//std": 0.03199290856719017, "step": 1553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3108, "grad_norm": 0.9310947060585022, "kl": 0.09671257017180324, "learning_rate": 3.946262275279528e-06, "loss": 0.0097, "num_tokens": 10137472.0, "reward": 0.78741455078125, "reward_std": 0.009314518421888351, "rewards//mean": 0.78741455078125, "rewards//std": 0.02134786732494831, "step": 1554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.311, "grad_norm": 1.284087061882019, "kl": 0.1145801180973649, "learning_rate": 3.944967776593321e-06, "loss": 0.0115, "num_tokens": 10144032.0, "reward": 0.8720703125, "reward_std": 0.015393547713756561, "rewards//mean": 0.8720703125, "rewards//std": 0.02720610983669758, "step": 1555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3112, "grad_norm": 1.058629035949707, "kl": 0.1322183571755886, "learning_rate": 3.9436726958742665e-06, "loss": 0.0132, "num_tokens": 10150616.0, "reward": 0.80389404296875, "reward_std": 0.012231405824422836, "rewards//mean": 0.80389404296875, "rewards//std": 0.028324255719780922, "step": 1556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3114, "grad_norm": 1.1307373046875, "kl": 0.13148376159369946, "learning_rate": 3.9423770336440235e-06, "loss": 0.0131, "num_tokens": 10157000.0, "reward": 0.8385009765625, "reward_std": 0.015050876885652542, "rewards//mean": 0.8385009765625, "rewards//std": 0.019920704886317253, "step": 1557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3116, "grad_norm": 1.360014796257019, "kl": 0.09901539469137788, "learning_rate": 3.941080790424483e-06, "loss": 0.0099, "num_tokens": 10163520.0, "reward": 0.85760498046875, "reward_std": 0.01079031266272068, "rewards//mean": 0.85760498046875, "rewards//std": 0.025957170873880386, "step": 1558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3118, "grad_norm": 0.9540613293647766, "kl": 0.13096563797444105, "learning_rate": 3.939783966737774e-06, "loss": 0.0131, "num_tokens": 10170016.0, "reward": 0.86480712890625, "reward_std": 0.013692352920770645, "rewards//mean": 0.86480712890625, "rewards//std": 0.022687841206789017, "step": 1559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.312, "grad_norm": 0.9531645774841309, "kl": 0.13835171330720186, "learning_rate": 3.938486563106254e-06, "loss": 0.0138, "num_tokens": 10176480.0, "reward": 0.84735107421875, "reward_std": 0.01101384125649929, "rewards//mean": 0.84735107421875, "rewards//std": 0.033133238554000854, "step": 1560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3122, "grad_norm": 1.0841401815414429, "kl": 0.14647202752530575, "learning_rate": 3.937188580052518e-06, "loss": 0.0146, "num_tokens": 10183048.0, "reward": 0.79193115234375, "reward_std": 0.009339461103081703, "rewards//mean": 0.79193115234375, "rewards//std": 0.018285444006323814, "step": 1561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3124, "grad_norm": 1.0599943399429321, "kl": 0.20825077965855598, "learning_rate": 3.935890018099395e-06, "loss": 0.0208, "num_tokens": 10189584.0, "reward": 0.86541748046875, "reward_std": 0.01859995350241661, "rewards//mean": 0.86541748046875, "rewards//std": 0.024899912998080254, "step": 1562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3126, "grad_norm": 1.1814680099487305, "kl": 0.141653535887599, "learning_rate": 3.934590877769944e-06, "loss": 0.0142, "num_tokens": 10196040.0, "reward": 0.8367919921875, "reward_std": 0.009873385541141033, "rewards//mean": 0.8367919921875, "rewards//std": 0.023151244968175888, "step": 1563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3128, "grad_norm": 1.4059680700302124, "kl": 0.1675069872289896, "learning_rate": 3.933291159587459e-06, "loss": 0.0168, "num_tokens": 10202624.0, "reward": 0.8426513671875, "reward_std": 0.012220920994877815, "rewards//mean": 0.8426513671875, "rewards//std": 0.016667084768414497, "step": 1564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.313, "grad_norm": 1.194278597831726, "kl": 0.14656903874129057, "learning_rate": 3.931990864075465e-06, "loss": 0.0147, "num_tokens": 10209200.0, "reward": 0.8702392578125, "reward_std": 0.014691924676299095, "rewards//mean": 0.8702392578125, "rewards//std": 0.031323302537202835, "step": 1565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3132, "grad_norm": 1.0327028036117554, "kl": 0.15431713964790106, "learning_rate": 3.9306899917577245e-06, "loss": 0.0154, "num_tokens": 10215896.0, "reward": 0.857421875, "reward_std": 0.013767905533313751, "rewards//mean": 0.857421875, "rewards//std": 0.021281909197568893, "step": 1566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3134, "grad_norm": 1.2217936515808105, "kl": 0.10981670394539833, "learning_rate": 3.929388543158225e-06, "loss": 0.011, "num_tokens": 10222560.0, "reward": 0.87493896484375, "reward_std": 0.013651605695486069, "rewards//mean": 0.87493896484375, "rewards//std": 0.035939738154411316, "step": 1567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3136, "grad_norm": 1.081897497177124, "kl": 0.1916325893253088, "learning_rate": 3.928086518801192e-06, "loss": 0.0192, "num_tokens": 10229112.0, "reward": 0.89166259765625, "reward_std": 0.00864003598690033, "rewards//mean": 0.89166259765625, "rewards//std": 0.013571720570325851, "step": 1568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3138, "grad_norm": 1.444561243057251, "kl": 0.21791773475706577, "learning_rate": 3.92678391921108e-06, "loss": 0.0218, "num_tokens": 10235648.0, "reward": 0.83172607421875, "reward_std": 0.009457984939217567, "rewards//mean": 0.83172607421875, "rewards//std": 0.011735369451344013, "step": 1569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.314, "grad_norm": 0.9707148671150208, "kl": 0.15562077052891254, "learning_rate": 3.925480744912575e-06, "loss": 0.0156, "num_tokens": 10242224.0, "reward": 0.84814453125, "reward_std": 0.011352070607244968, "rewards//mean": 0.84814453125, "rewards//std": 0.03064345009624958, "step": 1570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3142, "grad_norm": 1.494766354560852, "kl": 0.2010530512779951, "learning_rate": 3.924176996430597e-06, "loss": 0.0201, "num_tokens": 10248800.0, "reward": 0.8199462890625, "reward_std": 0.008230272680521011, "rewards//mean": 0.8199462890625, "rewards//std": 0.014857755973935127, "step": 1571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3144, "grad_norm": 1.3175941705703735, "kl": 0.20267202984541655, "learning_rate": 3.922872674290296e-06, "loss": 0.0203, "num_tokens": 10255224.0, "reward": 0.88153076171875, "reward_std": 0.014309214428067207, "rewards//mean": 0.88153076171875, "rewards//std": 0.021910564973950386, "step": 1572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3146, "grad_norm": 1.185042142868042, "kl": 0.16586732771247625, "learning_rate": 3.921567779017051e-06, "loss": 0.0166, "num_tokens": 10261808.0, "reward": 0.8033447265625, "reward_std": 0.01597771793603897, "rewards//mean": 0.8033447265625, "rewards//std": 0.041743434965610504, "step": 1573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3148, "grad_norm": 1.0905088186264038, "kl": 0.1943365652114153, "learning_rate": 3.9202623111364745e-06, "loss": 0.0194, "num_tokens": 10268344.0, "reward": 0.8514404296875, "reward_std": 0.009620440192520618, "rewards//mean": 0.8514404296875, "rewards//std": 0.02421983890235424, "step": 1574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.315, "grad_norm": 1.1202824115753174, "kl": 0.1653131451457739, "learning_rate": 3.918956271174409e-06, "loss": 0.0165, "num_tokens": 10274856.0, "reward": 0.86968994140625, "reward_std": 0.011868776753544807, "rewards//mean": 0.86968994140625, "rewards//std": 0.03686375916004181, "step": 1575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3152, "grad_norm": 1.2422899007797241, "kl": 0.19478945434093475, "learning_rate": 3.917649659656927e-06, "loss": 0.0195, "num_tokens": 10281280.0, "reward": 0.866455078125, "reward_std": 0.01647929847240448, "rewards//mean": 0.866455078125, "rewards//std": 0.031046859920024872, "step": 1576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3154, "grad_norm": 1.199830412864685, "kl": 0.21953730657696724, "learning_rate": 3.916342477110332e-06, "loss": 0.022, "num_tokens": 10287968.0, "reward": 0.8291015625, "reward_std": 0.014582466334104538, "rewards//mean": 0.8291015625, "rewards//std": 0.03074996918439865, "step": 1577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3156, "grad_norm": 1.3040071725845337, "kl": 0.21313344594091177, "learning_rate": 3.915034724061157e-06, "loss": 0.0213, "num_tokens": 10294448.0, "reward": 0.8565673828125, "reward_std": 0.015878789126873016, "rewards//mean": 0.8565673828125, "rewards//std": 0.03322204202413559, "step": 1578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3158, "grad_norm": 1.3451532125473022, "kl": 0.20193507708609104, "learning_rate": 3.913726401036164e-06, "loss": 0.0202, "num_tokens": 10301056.0, "reward": 0.8218994140625, "reward_std": 0.009965074248611927, "rewards//mean": 0.8218994140625, "rewards//std": 0.013418748043477535, "step": 1579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.316, "grad_norm": 1.521202564239502, "kl": 0.29137206450104713, "learning_rate": 3.912417508562345e-06, "loss": 0.0291, "num_tokens": 10307552.0, "reward": 0.84954833984375, "reward_std": 0.01774267852306366, "rewards//mean": 0.84954833984375, "rewards//std": 0.03565897047519684, "step": 1580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3162, "grad_norm": 1.3244349956512451, "kl": 0.22236711345613003, "learning_rate": 3.911108047166924e-06, "loss": 0.0222, "num_tokens": 10314024.0, "reward": 0.87835693359375, "reward_std": 0.013048689812421799, "rewards//mean": 0.87835693359375, "rewards//std": 0.01797316037118435, "step": 1581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3164, "grad_norm": 1.547742247581482, "kl": 0.2273577768355608, "learning_rate": 3.909798017377348e-06, "loss": 0.0227, "num_tokens": 10320496.0, "reward": 0.85882568359375, "reward_std": 0.011823354288935661, "rewards//mean": 0.85882568359375, "rewards//std": 0.024801841005682945, "step": 1582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3166, "grad_norm": 1.4738612174987793, "kl": 0.297494251281023, "learning_rate": 3.908487419721302e-06, "loss": 0.0297, "num_tokens": 10327040.0, "reward": 0.8375244140625, "reward_std": 0.012024518102407455, "rewards//mean": 0.8375244140625, "rewards//std": 0.025274399667978287, "step": 1583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3168, "grad_norm": 1.581113338470459, "kl": 0.37137704342603683, "learning_rate": 3.90717625472669e-06, "loss": 0.0371, "num_tokens": 10333504.0, "reward": 0.8240966796875, "reward_std": 0.011749541386961937, "rewards//mean": 0.8240966796875, "rewards//std": 0.025609971955418587, "step": 1584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.317, "grad_norm": 1.4921132326126099, "kl": 0.30917175859212875, "learning_rate": 3.9058645229216515e-06, "loss": 0.0309, "num_tokens": 10339984.0, "reward": 0.79119873046875, "reward_std": 0.010327380150556564, "rewards//mean": 0.79119873046875, "rewards//std": 0.017301123589277267, "step": 1585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3172, "grad_norm": 1.763503074645996, "kl": 0.280387694016099, "learning_rate": 3.90455222483455e-06, "loss": 0.028, "num_tokens": 10346496.0, "reward": 0.86541748046875, "reward_std": 0.011608521454036236, "rewards//mean": 0.86541748046875, "rewards//std": 0.02044525183737278, "step": 1586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3174, "grad_norm": 2.9891467094421387, "kl": 0.31675155088305473, "learning_rate": 3.903239360993982e-06, "loss": 0.0317, "num_tokens": 10353120.0, "reward": 0.848876953125, "reward_std": 0.010927378199994564, "rewards//mean": 0.848876953125, "rewards//std": 0.024058358743786812, "step": 1587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3176, "grad_norm": 1.3137307167053223, "kl": 0.3084825146943331, "learning_rate": 3.9019259319287666e-06, "loss": 0.0308, "num_tokens": 10359736.0, "reward": 0.83538818359375, "reward_std": 0.012250076979398727, "rewards//mean": 0.83538818359375, "rewards//std": 0.02125762216746807, "step": 1588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3178, "grad_norm": 2.3434829711914062, "kl": 0.3836858058348298, "learning_rate": 3.900611938167953e-06, "loss": 0.0384, "num_tokens": 10366312.0, "reward": 0.85748291015625, "reward_std": 0.011049291118979454, "rewards//mean": 0.85748291015625, "rewards//std": 0.025957753881812096, "step": 1589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.318, "grad_norm": 1.5555602312088013, "kl": 0.28151683136820793, "learning_rate": 3.899297380240819e-06, "loss": 0.0282, "num_tokens": 10372776.0, "reward": 0.8275146484375, "reward_std": 0.009996293112635612, "rewards//mean": 0.8275146484375, "rewards//std": 0.02338803932070732, "step": 1590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3182, "grad_norm": 1.5242350101470947, "kl": 0.2793434113264084, "learning_rate": 3.897982258676867e-06, "loss": 0.0279, "num_tokens": 10379368.0, "reward": 0.83135986328125, "reward_std": 0.009575911797583103, "rewards//mean": 0.83135986328125, "rewards//std": 0.01838204823434353, "step": 1591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3184, "grad_norm": 1.6228594779968262, "kl": 0.2863396443426609, "learning_rate": 3.896666574005829e-06, "loss": 0.0286, "num_tokens": 10385840.0, "reward": 0.83306884765625, "reward_std": 0.01187505479902029, "rewards//mean": 0.83306884765625, "rewards//std": 0.022501591593027115, "step": 1592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3186, "grad_norm": 1.8107728958129883, "kl": 0.554836381226778, "learning_rate": 3.895350326757662e-06, "loss": 0.0555, "num_tokens": 10392264.0, "reward": 0.8460693359375, "reward_std": 0.016593921929597855, "rewards//mean": 0.8460693359375, "rewards//std": 0.03656923398375511, "step": 1593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3188, "grad_norm": 1.3721973896026611, "kl": 0.3221135176718235, "learning_rate": 3.89403351746255e-06, "loss": 0.0322, "num_tokens": 10398720.0, "reward": 0.88671875, "reward_std": 0.013877788558602333, "rewards//mean": 0.88671875, "rewards//std": 0.019930202513933182, "step": 1594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.319, "grad_norm": 2.2039554119110107, "kl": 0.2943542618304491, "learning_rate": 3.892716146650903e-06, "loss": 0.0294, "num_tokens": 10405208.0, "reward": 0.83837890625, "reward_std": 0.016938013955950737, "rewards//mean": 0.83837890625, "rewards//std": 0.033555950969457626, "step": 1595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3192, "grad_norm": 1.6362485885620117, "kl": 0.3125859946012497, "learning_rate": 3.8913982148533605e-06, "loss": 0.0313, "num_tokens": 10411656.0, "reward": 0.85650634765625, "reward_std": 0.014011642895638943, "rewards//mean": 0.85650634765625, "rewards//std": 0.02661426179111004, "step": 1596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3194, "grad_norm": 3.1087827682495117, "kl": 0.2995978221297264, "learning_rate": 3.890079722600781e-06, "loss": 0.03, "num_tokens": 10418208.0, "reward": 0.7996826171875, "reward_std": 0.0077233766205608845, "rewards//mean": 0.7996826171875, "rewards//std": 0.014771925285458565, "step": 1597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3196, "grad_norm": 2.0437378883361816, "kl": 0.3898289669305086, "learning_rate": 3.888760670424257e-06, "loss": 0.039, "num_tokens": 10424736.0, "reward": 0.87957763671875, "reward_std": 0.013866064138710499, "rewards//mean": 0.87957763671875, "rewards//std": 0.025070765987038612, "step": 1598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3198, "grad_norm": 1.75062894821167, "kl": 0.21383224613964558, "learning_rate": 3.887441058855102e-06, "loss": 0.0214, "num_tokens": 10431200.0, "reward": 0.86383056640625, "reward_std": 0.008367249742150307, "rewards//mean": 0.86383056640625, "rewards//std": 0.028930766507983208, "step": 1599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.32, "grad_norm": 2.5839340686798096, "kl": 0.46917199343442917, "learning_rate": 3.8861208884248526e-06, "loss": 0.0469, "num_tokens": 10437632.0, "reward": 0.85357666015625, "reward_std": 0.01555424090474844, "rewards//mean": 0.85357666015625, "rewards//std": 0.022434215992689133, "step": 1600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3202, "grad_norm": 2.180337429046631, "kl": 0.3641548529267311, "learning_rate": 3.8848001596652765e-06, "loss": 0.0364, "num_tokens": 10444176.0, "reward": 0.7913818359375, "reward_std": 0.012111423537135124, "rewards//mean": 0.7913818359375, "rewards//std": 0.028226081281900406, "step": 1601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3204, "grad_norm": 1.5325684547424316, "kl": 0.2617505071684718, "learning_rate": 3.88347887310836e-06, "loss": 0.0262, "num_tokens": 10450712.0, "reward": 0.8709716796875, "reward_std": 0.0117114232853055, "rewards//mean": 0.8709716796875, "rewards//std": 0.023083142936229706, "step": 1602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3206, "grad_norm": 1.3849873542785645, "kl": 0.29397496208548546, "learning_rate": 3.882157029286321e-06, "loss": 0.0294, "num_tokens": 10457232.0, "reward": 0.83367919921875, "reward_std": 0.009234374389052391, "rewards//mean": 0.83367919921875, "rewards//std": 0.022721843793988228, "step": 1603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3208, "grad_norm": 1.487534523010254, "kl": 0.2960985638201237, "learning_rate": 3.880834628731594e-06, "loss": 0.0296, "num_tokens": 10463912.0, "reward": 0.83544921875, "reward_std": 0.013643302954733372, "rewards//mean": 0.83544921875, "rewards//std": 0.038016121834516525, "step": 1604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.321, "grad_norm": 1.6184959411621094, "kl": 0.29642074555158615, "learning_rate": 3.8795116719768445e-06, "loss": 0.0296, "num_tokens": 10470320.0, "reward": 0.8839111328125, "reward_std": 0.013438718393445015, "rewards//mean": 0.8839111328125, "rewards//std": 0.02137807197868824, "step": 1605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3212, "grad_norm": 1.7646229267120361, "kl": 0.2807923462241888, "learning_rate": 3.8781881595549585e-06, "loss": 0.0281, "num_tokens": 10476864.0, "reward": 0.83984375, "reward_std": 0.010423191823065281, "rewards//mean": 0.83984375, "rewards//std": 0.024577517062425613, "step": 1606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3214, "grad_norm": 1.7403249740600586, "kl": 0.259729765355587, "learning_rate": 3.876864091999046e-06, "loss": 0.026, "num_tokens": 10483384.0, "reward": 0.8280029296875, "reward_std": 0.009250469505786896, "rewards//mean": 0.8280029296875, "rewards//std": 0.0167323499917984, "step": 1607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3216, "grad_norm": 1.4189034700393677, "kl": 0.24482745770365, "learning_rate": 3.875539469842443e-06, "loss": 0.0245, "num_tokens": 10489840.0, "reward": 0.85430908203125, "reward_std": 0.009904876351356506, "rewards//mean": 0.85430908203125, "rewards//std": 0.024790242314338684, "step": 1608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3218, "grad_norm": 1.5223784446716309, "kl": 0.40021322946995497, "learning_rate": 3.874214293618706e-06, "loss": 0.04, "num_tokens": 10496320.0, "reward": 0.889892578125, "reward_std": 0.012777541764080524, "rewards//mean": 0.889892578125, "rewards//std": 0.025179890915751457, "step": 1609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.322, "grad_norm": 1.7800970077514648, "kl": 0.22602685168385506, "learning_rate": 3.872888563861615e-06, "loss": 0.0226, "num_tokens": 10502816.0, "reward": 0.816650390625, "reward_std": 0.008710541762411594, "rewards//mean": 0.816650390625, "rewards//std": 0.014396708458662033, "step": 1610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3222, "grad_norm": 1.7651360034942627, "kl": 0.2483451496809721, "learning_rate": 3.8715622811051754e-06, "loss": 0.0248, "num_tokens": 10509320.0, "reward": 0.8411865234375, "reward_std": 0.013786174356937408, "rewards//mean": 0.8411865234375, "rewards//std": 0.018639342859387398, "step": 1611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3224, "grad_norm": 1.2564246654510498, "kl": 0.14153725700452924, "learning_rate": 3.8702354458836124e-06, "loss": 0.0142, "num_tokens": 10515792.0, "reward": 0.873291015625, "reward_std": 0.01036643423140049, "rewards//mean": 0.873291015625, "rewards//std": 0.01756434142589569, "step": 1612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3226, "grad_norm": 1.5291967391967773, "kl": 0.16267961356788874, "learning_rate": 3.868908058731376e-06, "loss": 0.0163, "num_tokens": 10522208.0, "reward": 0.83026123046875, "reward_std": 0.0093242097645998, "rewards//mean": 0.83026123046875, "rewards//std": 0.01745443604886532, "step": 1613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3228, "grad_norm": 0.969484269618988, "kl": 0.19823438487946987, "learning_rate": 3.867580120183138e-06, "loss": 0.0198, "num_tokens": 10528832.0, "reward": 0.854736328125, "reward_std": 0.011797342449426651, "rewards//mean": 0.854736328125, "rewards//std": 0.021097596734762192, "step": 1614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.323, "grad_norm": 1.2242486476898193, "kl": 0.1758004892617464, "learning_rate": 3.86625163077379e-06, "loss": 0.0176, "num_tokens": 10535384.0, "reward": 0.7921142578125, "reward_std": 0.009723196737468243, "rewards//mean": 0.7921142578125, "rewards//std": 0.01895183138549328, "step": 1615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3232, "grad_norm": 1.259074330329895, "kl": 0.1677936716005206, "learning_rate": 3.864922591038448e-06, "loss": 0.0168, "num_tokens": 10541888.0, "reward": 0.83563232421875, "reward_std": 0.014554427936673164, "rewards//mean": 0.83563232421875, "rewards//std": 0.029349982738494873, "step": 1616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3234, "grad_norm": 1.3640682697296143, "kl": 0.26752010080963373, "learning_rate": 3.863593001512451e-06, "loss": 0.0268, "num_tokens": 10548400.0, "reward": 0.86651611328125, "reward_std": 0.012483533471822739, "rewards//mean": 0.86651611328125, "rewards//std": 0.02434537559747696, "step": 1617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3236, "grad_norm": 1.3536274433135986, "kl": 0.21294006519019604, "learning_rate": 3.862262862731355e-06, "loss": 0.0213, "num_tokens": 10555000.0, "reward": 0.85455322265625, "reward_std": 0.01098201610147953, "rewards//mean": 0.85455322265625, "rewards//std": 0.02201739139854908, "step": 1618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3238, "grad_norm": 1.2604089975357056, "kl": 0.1933983825147152, "learning_rate": 3.860932175230941e-06, "loss": 0.0193, "num_tokens": 10561536.0, "reward": 0.8463134765625, "reward_std": 0.014932648278772831, "rewards//mean": 0.8463134765625, "rewards//std": 0.01998140476644039, "step": 1619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.324, "grad_norm": 1.0606210231781006, "kl": 0.14916081633418798, "learning_rate": 3.85960093954721e-06, "loss": 0.0149, "num_tokens": 10568016.0, "reward": 0.85498046875, "reward_std": 0.012895429506897926, "rewards//mean": 0.85498046875, "rewards//std": 0.03829542174935341, "step": 1620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3242, "grad_norm": 1.5457760095596313, "kl": 0.18304038047790527, "learning_rate": 3.858269156216383e-06, "loss": 0.0183, "num_tokens": 10574400.0, "reward": 0.84527587890625, "reward_std": 0.017079979181289673, "rewards//mean": 0.84527587890625, "rewards//std": 0.020956402644515038, "step": 1621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3244, "grad_norm": 1.046340823173523, "kl": 0.17034246772527695, "learning_rate": 3.8569368257749025e-06, "loss": 0.017, "num_tokens": 10580872.0, "reward": 0.775146484375, "reward_std": 0.011773445643484592, "rewards//mean": 0.775146484375, "rewards//std": 0.015592026524245739, "step": 1622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3246, "grad_norm": 1.0418622493743896, "kl": 0.13584748608991504, "learning_rate": 3.855603948759431e-06, "loss": 0.0136, "num_tokens": 10587304.0, "reward": 0.8558349609375, "reward_std": 0.01224471814930439, "rewards//mean": 0.8558349609375, "rewards//std": 0.025645412504673004, "step": 1623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3248, "grad_norm": 1.1217701435089111, "kl": 0.09765125159174204, "learning_rate": 3.85427052570685e-06, "loss": 0.0098, "num_tokens": 10593920.0, "reward": 0.84716796875, "reward_std": 0.011668877676129341, "rewards//mean": 0.84716796875, "rewards//std": 0.022047707810997963, "step": 1624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.325, "grad_norm": 1.3880126476287842, "kl": 0.14052546955645084, "learning_rate": 3.8529365571542645e-06, "loss": 0.0141, "num_tokens": 10600480.0, "reward": 0.7803955078125, "reward_std": 0.011577043682336807, "rewards//mean": 0.7803955078125, "rewards//std": 0.026166634634137154, "step": 1625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3252, "grad_norm": 1.0971704721450806, "kl": 0.14983691833913326, "learning_rate": 3.8516020436389945e-06, "loss": 0.015, "num_tokens": 10607040.0, "reward": 0.81805419921875, "reward_std": 0.015540399588644505, "rewards//mean": 0.81805419921875, "rewards//std": 0.016526052728295326, "step": 1626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3254, "grad_norm": 1.2790805101394653, "kl": 0.1348266862332821, "learning_rate": 3.850266985698583e-06, "loss": 0.0135, "num_tokens": 10613560.0, "reward": 0.849853515625, "reward_std": 0.010666648857295513, "rewards//mean": 0.849853515625, "rewards//std": 0.020158275961875916, "step": 1627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3256, "grad_norm": 0.992993950843811, "kl": 0.1500972593203187, "learning_rate": 3.848931383870792e-06, "loss": 0.015, "num_tokens": 10620056.0, "reward": 0.87591552734375, "reward_std": 0.01517472229897976, "rewards//mean": 0.87591552734375, "rewards//std": 0.026132117956876755, "step": 1628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3258, "grad_norm": 0.9758333563804626, "kl": 0.1466165203601122, "learning_rate": 3.8475952386936e-06, "loss": 0.0147, "num_tokens": 10626552.0, "reward": 0.8533935546875, "reward_std": 0.009698385372757912, "rewards//mean": 0.8533935546875, "rewards//std": 0.041672300547361374, "step": 1629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.326, "grad_norm": 1.2106187343597412, "kl": 0.15590465255081654, "learning_rate": 3.846258550705207e-06, "loss": 0.0156, "num_tokens": 10633160.0, "reward": 0.8604736328125, "reward_std": 0.012996692210435867, "rewards//mean": 0.8604736328125, "rewards//std": 0.022003378719091415, "step": 1630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3262, "grad_norm": 1.2773569822311401, "kl": 0.14580817613750696, "learning_rate": 3.844921320444031e-06, "loss": 0.0146, "num_tokens": 10639616.0, "reward": 0.8297119140625, "reward_std": 0.014681543223559856, "rewards//mean": 0.8297119140625, "rewards//std": 0.031263317912817, "step": 1631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3264, "grad_norm": 1.0682847499847412, "kl": 0.12519427947700024, "learning_rate": 3.84358354844871e-06, "loss": 0.0125, "num_tokens": 10646232.0, "reward": 0.7886962890625, "reward_std": 0.009556448087096214, "rewards//mean": 0.7886962890625, "rewards//std": 0.01895502768456936, "step": 1632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3266, "grad_norm": 1.2429144382476807, "kl": 0.1677149897441268, "learning_rate": 3.842245235258093e-06, "loss": 0.0168, "num_tokens": 10652680.0, "reward": 0.8212890625, "reward_std": 0.008451156318187714, "rewards//mean": 0.8212890625, "rewards//std": 0.02178802154958248, "step": 1633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3268, "grad_norm": 1.0727429389953613, "kl": 0.13021681271493435, "learning_rate": 3.840906381411258e-06, "loss": 0.013, "num_tokens": 10659216.0, "reward": 0.86065673828125, "reward_std": 0.011397959664463997, "rewards//mean": 0.86065673828125, "rewards//std": 0.03059246577322483, "step": 1634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.327, "grad_norm": 1.343408465385437, "kl": 0.14459174685180187, "learning_rate": 3.839566987447492e-06, "loss": 0.0145, "num_tokens": 10665688.0, "reward": 0.87799072265625, "reward_std": 0.012052120640873909, "rewards//mean": 0.87799072265625, "rewards//std": 0.021884987130761147, "step": 1635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3272, "grad_norm": 1.0196846723556519, "kl": 0.12612706422805786, "learning_rate": 3.838227053906304e-06, "loss": 0.0126, "num_tokens": 10672192.0, "reward": 0.86492919921875, "reward_std": 0.00849381648004055, "rewards//mean": 0.86492919921875, "rewards//std": 0.01626010611653328, "step": 1636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3274, "grad_norm": 1.1647162437438965, "kl": 0.15662445314228535, "learning_rate": 3.836886581327418e-06, "loss": 0.0157, "num_tokens": 10678776.0, "reward": 0.8204345703125, "reward_std": 0.014544153586030006, "rewards//mean": 0.8204345703125, "rewards//std": 0.021820997819304466, "step": 1637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3276, "grad_norm": 1.2039580345153809, "kl": 0.15561936236917973, "learning_rate": 3.835545570250778e-06, "loss": 0.0156, "num_tokens": 10685264.0, "reward": 0.81463623046875, "reward_std": 0.012347047217190266, "rewards//mean": 0.81463623046875, "rewards//std": 0.01577816903591156, "step": 1638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3278, "grad_norm": 1.129900336265564, "kl": 0.1493422193452716, "learning_rate": 3.834204021216541e-06, "loss": 0.0149, "num_tokens": 10691744.0, "reward": 0.862548828125, "reward_std": 0.01393700111657381, "rewards//mean": 0.862548828125, "rewards//std": 0.024792149662971497, "step": 1639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.328, "grad_norm": 1.2402477264404297, "kl": 0.15178625378757715, "learning_rate": 3.832861934765085e-06, "loss": 0.0152, "num_tokens": 10698176.0, "reward": 0.86187744140625, "reward_std": 0.01918061450123787, "rewards//mean": 0.86187744140625, "rewards//std": 0.04252605885267258, "step": 1640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3282, "grad_norm": 1.0745972394943237, "kl": 0.16789849940687418, "learning_rate": 3.8315193114369995e-06, "loss": 0.0168, "num_tokens": 10704664.0, "reward": 0.785888671875, "reward_std": 0.008465569466352463, "rewards//mean": 0.785888671875, "rewards//std": 0.02293478697538376, "step": 1641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3284, "grad_norm": 1.079355239868164, "kl": 0.16650789231061935, "learning_rate": 3.830176151773096e-06, "loss": 0.0167, "num_tokens": 10711192.0, "reward": 0.83087158203125, "reward_std": 0.01002349890768528, "rewards//mean": 0.83087158203125, "rewards//std": 0.023351360112428665, "step": 1642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3286, "grad_norm": 1.02597975730896, "kl": 0.17875688336789608, "learning_rate": 3.828832456314397e-06, "loss": 0.0179, "num_tokens": 10717840.0, "reward": 0.84735107421875, "reward_std": 0.012389512732625008, "rewards//mean": 0.84735107421875, "rewards//std": 0.02529916912317276, "step": 1643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3288, "grad_norm": 1.1863553524017334, "kl": 0.2037382423877716, "learning_rate": 3.827488225602144e-06, "loss": 0.0204, "num_tokens": 10724288.0, "reward": 0.84442138671875, "reward_std": 0.016151918098330498, "rewards//mean": 0.84442138671875, "rewards//std": 0.022418692708015442, "step": 1644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.329, "grad_norm": 1.5287710428237915, "kl": 0.20985182374715805, "learning_rate": 3.8261434601777916e-06, "loss": 0.021, "num_tokens": 10730888.0, "reward": 0.85833740234375, "reward_std": 0.01537338923662901, "rewards//mean": 0.85833740234375, "rewards//std": 0.02309650182723999, "step": 1645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3292, "grad_norm": 1.1089560985565186, "kl": 0.21017701923847198, "learning_rate": 3.824798160583012e-06, "loss": 0.021, "num_tokens": 10737536.0, "reward": 0.84735107421875, "reward_std": 0.009916501119732857, "rewards//mean": 0.84735107421875, "rewards//std": 0.01903899386525154, "step": 1646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3294, "grad_norm": 1.0912302732467651, "kl": 0.18353916332125664, "learning_rate": 3.823452327359693e-06, "loss": 0.0184, "num_tokens": 10744080.0, "reward": 0.83544921875, "reward_std": 0.012215595692396164, "rewards//mean": 0.83544921875, "rewards//std": 0.028734475374221802, "step": 1647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3296, "grad_norm": 1.111501693725586, "kl": 0.12131490558385849, "learning_rate": 3.8221059610499336e-06, "loss": 0.0121, "num_tokens": 10750576.0, "reward": 0.84429931640625, "reward_std": 0.012313824146986008, "rewards//mean": 0.84429931640625, "rewards//std": 0.021484287455677986, "step": 1648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3298, "grad_norm": 1.2223269939422607, "kl": 0.1280033839866519, "learning_rate": 3.820759062196052e-06, "loss": 0.0128, "num_tokens": 10757088.0, "reward": 0.843994140625, "reward_std": 0.01149844378232956, "rewards//mean": 0.843994140625, "rewards//std": 0.018982557579874992, "step": 1649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.33, "grad_norm": 1.521353006362915, "kl": 0.20365995727479458, "learning_rate": 3.819411631340577e-06, "loss": 0.0204, "num_tokens": 10763552.0, "reward": 0.85345458984375, "reward_std": 0.01380252093076706, "rewards//mean": 0.85345458984375, "rewards//std": 0.03458315506577492, "step": 1650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3302, "grad_norm": 1.178860068321228, "kl": 0.13951645186170936, "learning_rate": 3.8180636690262565e-06, "loss": 0.014, "num_tokens": 10770040.0, "reward": 0.83551025390625, "reward_std": 0.008601983077824116, "rewards//mean": 0.83551025390625, "rewards//std": 0.023552104830741882, "step": 1651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3304, "grad_norm": 1.1294639110565186, "kl": 0.15050000324845314, "learning_rate": 3.8167151757960466e-06, "loss": 0.015, "num_tokens": 10776544.0, "reward": 0.853759765625, "reward_std": 0.0082255182787776, "rewards//mean": 0.853759765625, "rewards//std": 0.02568463608622551, "step": 1652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3306, "grad_norm": 1.2284032106399536, "kl": 0.17394139431416988, "learning_rate": 3.815366152193122e-06, "loss": 0.0174, "num_tokens": 10783040.0, "reward": 0.86273193359375, "reward_std": 0.016201257705688477, "rewards//mean": 0.86273193359375, "rewards//std": 0.02932986058294773, "step": 1653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3308, "grad_norm": 1.04451584815979, "kl": 0.19539512880146503, "learning_rate": 3.8140165987608678e-06, "loss": 0.0195, "num_tokens": 10789584.0, "reward": 0.78173828125, "reward_std": 0.011390826664865017, "rewards//mean": 0.78173828125, "rewards//std": 0.02557714469730854, "step": 1654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.331, "grad_norm": 1.1904040575027466, "kl": 0.13732353318482637, "learning_rate": 3.812666516042885e-06, "loss": 0.0137, "num_tokens": 10796096.0, "reward": 0.87579345703125, "reward_std": 0.011900639161467552, "rewards//mean": 0.87579345703125, "rewards//std": 0.026284025982022285, "step": 1655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3312, "grad_norm": 1.226128101348877, "kl": 0.1864331504330039, "learning_rate": 3.811315904582986e-06, "loss": 0.0186, "num_tokens": 10802616.0, "reward": 0.83831787109375, "reward_std": 0.010644596070051193, "rewards//mean": 0.83831787109375, "rewards//std": 0.039687566459178925, "step": 1656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3314, "grad_norm": 1.3116858005523682, "kl": 0.13595839962363243, "learning_rate": 3.8099647649251984e-06, "loss": 0.0136, "num_tokens": 10809088.0, "reward": 0.80694580078125, "reward_std": 0.009059847332537174, "rewards//mean": 0.80694580078125, "rewards//std": 0.01201076153665781, "step": 1657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3316, "grad_norm": 1.0426509380340576, "kl": 0.14694624859839678, "learning_rate": 3.808613097613759e-06, "loss": 0.0147, "num_tokens": 10815600.0, "reward": 0.8099365234375, "reward_std": 0.012357229366898537, "rewards//mean": 0.8099365234375, "rewards//std": 0.020766014233231544, "step": 1658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3318, "grad_norm": 1.240186333656311, "kl": 0.22811613697558641, "learning_rate": 3.807260903193122e-06, "loss": 0.0228, "num_tokens": 10822072.0, "reward": 0.8546142578125, "reward_std": 0.014348466880619526, "rewards//mean": 0.8546142578125, "rewards//std": 0.020733915269374847, "step": 1659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.332, "grad_norm": 1.1840341091156006, "kl": 0.22027858067303896, "learning_rate": 3.805908182207948e-06, "loss": 0.022, "num_tokens": 10828632.0, "reward": 0.87176513671875, "reward_std": 0.012350248172879219, "rewards//mean": 0.87176513671875, "rewards//std": 0.029244577512145042, "step": 1660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3322, "grad_norm": 1.092741847038269, "kl": 0.20715150330215693, "learning_rate": 3.804554935203115e-06, "loss": 0.0207, "num_tokens": 10835104.0, "reward": 0.8253173828125, "reward_std": 0.009723320603370667, "rewards//mean": 0.8253173828125, "rewards//std": 0.02126447483897209, "step": 1661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3324, "grad_norm": 0.9915041327476501, "kl": 0.11787826521322131, "learning_rate": 3.8032011627237105e-06, "loss": 0.0118, "num_tokens": 10841720.0, "reward": 0.82415771484375, "reward_std": 0.013901704922318459, "rewards//mean": 0.82415771484375, "rewards//std": 0.02289903350174427, "step": 1662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3326, "grad_norm": 1.223789930343628, "kl": 0.17681985534727573, "learning_rate": 3.801846865315033e-06, "loss": 0.0177, "num_tokens": 10848240.0, "reward": 0.848876953125, "reward_std": 0.009956100024282932, "rewards//mean": 0.848876953125, "rewards//std": 0.023332634940743446, "step": 1663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3328, "grad_norm": 1.0605173110961914, "kl": 0.12429733760654926, "learning_rate": 3.8004920435225934e-06, "loss": 0.0124, "num_tokens": 10854784.0, "reward": 0.8564453125, "reward_std": 0.012144333682954311, "rewards//mean": 0.8564453125, "rewards//std": 0.02365351840853691, "step": 1664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.333, "grad_norm": 1.2286394834518433, "kl": 0.18913033977150917, "learning_rate": 3.7991366978921152e-06, "loss": 0.0189, "num_tokens": 10861272.0, "reward": 0.87469482421875, "reward_std": 0.011257180944085121, "rewards//mean": 0.87469482421875, "rewards//std": 0.023543747141957283, "step": 1665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3332, "grad_norm": 1.2374653816223145, "kl": 0.20200098492205143, "learning_rate": 3.7977808289695306e-06, "loss": 0.0202, "num_tokens": 10867744.0, "reward": 0.81927490234375, "reward_std": 0.013876447454094887, "rewards//mean": 0.81927490234375, "rewards//std": 0.018872089684009552, "step": 1666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3334, "grad_norm": 1.0597271919250488, "kl": 0.12655257433652878, "learning_rate": 3.796424437300982e-06, "loss": 0.0127, "num_tokens": 10874256.0, "reward": 0.8951416015625, "reward_std": 0.013223649933934212, "rewards//mean": 0.8951416015625, "rewards//std": 0.031041739508509636, "step": 1667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3336, "grad_norm": 1.2580838203430176, "kl": 0.1858372949063778, "learning_rate": 3.795067523432826e-06, "loss": 0.0186, "num_tokens": 10880760.0, "reward": 0.85858154296875, "reward_std": 0.014656871557235718, "rewards//mean": 0.85858154296875, "rewards//std": 0.03016291745007038, "step": 1668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3338, "grad_norm": 0.9753212332725525, "kl": 0.13140025502070785, "learning_rate": 3.793710087911626e-06, "loss": 0.0131, "num_tokens": 10887240.0, "reward": 0.85797119140625, "reward_std": 0.011839199811220169, "rewards//mean": 0.85797119140625, "rewards//std": 0.026313383132219315, "step": 1669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.334, "grad_norm": 1.1487759351730347, "kl": 0.18213751167058945, "learning_rate": 3.7923521312841575e-06, "loss": 0.0182, "num_tokens": 10893696.0, "reward": 0.8538818359375, "reward_std": 0.01351674273610115, "rewards//mean": 0.8538818359375, "rewards//std": 0.01850239932537079, "step": 1670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3342, "grad_norm": 1.3839889764785767, "kl": 0.2473511416465044, "learning_rate": 3.7909936540974052e-06, "loss": 0.0247, "num_tokens": 10900224.0, "reward": 0.83331298828125, "reward_std": 0.013229642994701862, "rewards//mean": 0.83331298828125, "rewards//std": 0.018946539610624313, "step": 1671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3344, "grad_norm": 1.0848548412322998, "kl": 0.17123944591730833, "learning_rate": 3.789634656898563e-06, "loss": 0.0171, "num_tokens": 10906640.0, "reward": 0.86029052734375, "reward_std": 0.012325625866651535, "rewards//mean": 0.86029052734375, "rewards//std": 0.019409868866205215, "step": 1672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3346, "grad_norm": 1.2640920877456665, "kl": 0.14798249956220388, "learning_rate": 3.788275140235036e-06, "loss": 0.0148, "num_tokens": 10913264.0, "reward": 0.84747314453125, "reward_std": 0.017889846116304398, "rewards//mean": 0.84747314453125, "rewards//std": 0.03498310595750809, "step": 1673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3348, "grad_norm": 0.9946934580802917, "kl": 0.1204144824296236, "learning_rate": 3.786915104654436e-06, "loss": 0.012, "num_tokens": 10919776.0, "reward": 0.8782958984375, "reward_std": 0.008448861539363861, "rewards//mean": 0.8782958984375, "rewards//std": 0.0297915730625391, "step": 1674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.335, "grad_norm": 1.1531076431274414, "kl": 0.13881013542413712, "learning_rate": 3.7855545507045856e-06, "loss": 0.0139, "num_tokens": 10926280.0, "reward": 0.865966796875, "reward_std": 0.012167485430836678, "rewards//mean": 0.865966796875, "rewards//std": 0.021831056103110313, "step": 1675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3352, "grad_norm": 1.0336984395980835, "kl": 0.1341300057247281, "learning_rate": 3.7841934789335167e-06, "loss": 0.0134, "num_tokens": 10932808.0, "reward": 0.87640380859375, "reward_std": 0.013194960542023182, "rewards//mean": 0.87640380859375, "rewards//std": 0.023698195815086365, "step": 1676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3354, "grad_norm": 1.4901009798049927, "kl": 0.1281805383041501, "learning_rate": 3.7828318898894667e-06, "loss": 0.0128, "num_tokens": 10939280.0, "reward": 0.8697509765625, "reward_std": 0.013518409803509712, "rewards//mean": 0.8697509765625, "rewards//std": 0.020258279517292976, "step": 1677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3356, "grad_norm": 1.2215501070022583, "kl": 0.15847931988537312, "learning_rate": 3.781469784120886e-06, "loss": 0.0158, "num_tokens": 10945768.0, "reward": 0.822509765625, "reward_std": 0.012979680672287941, "rewards//mean": 0.822509765625, "rewards//std": 0.03228592872619629, "step": 1678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3358, "grad_norm": 1.084694266319275, "kl": 0.14642178267240524, "learning_rate": 3.780107162176429e-06, "loss": 0.0146, "num_tokens": 10952288.0, "reward": 0.83087158203125, "reward_std": 0.011702725663781166, "rewards//mean": 0.83087158203125, "rewards//std": 0.03303349018096924, "step": 1679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.336, "grad_norm": 1.594547986984253, "kl": 0.10404386650770903, "learning_rate": 3.7787440246049606e-06, "loss": 0.0104, "num_tokens": 10958880.0, "reward": 0.85986328125, "reward_std": 0.013462567701935768, "rewards//mean": 0.85986328125, "rewards//std": 0.024324923753738403, "step": 1680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3362, "grad_norm": 1.268694519996643, "kl": 0.15274635143578053, "learning_rate": 3.777380371955552e-06, "loss": 0.0153, "num_tokens": 10965472.0, "reward": 0.82958984375, "reward_std": 0.01850060001015663, "rewards//mean": 0.82958984375, "rewards//std": 0.023036206141114235, "step": 1681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3364, "grad_norm": 1.0746183395385742, "kl": 0.11498851794749498, "learning_rate": 3.7760162047774816e-06, "loss": 0.0115, "num_tokens": 10971944.0, "reward": 0.8028564453125, "reward_std": 0.008092767558991909, "rewards//mean": 0.8028564453125, "rewards//std": 0.01833473891019821, "step": 1682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3366, "grad_norm": 1.2225549221038818, "kl": 0.1401190608739853, "learning_rate": 3.7746515236202374e-06, "loss": 0.014, "num_tokens": 10978432.0, "reward": 0.84124755859375, "reward_std": 0.010202763602137566, "rewards//mean": 0.84124755859375, "rewards//std": 0.027395671233534813, "step": 1683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3368, "grad_norm": 1.5883640050888062, "kl": 0.1795216165482998, "learning_rate": 3.773286329033511e-06, "loss": 0.018, "num_tokens": 10984920.0, "reward": 0.87451171875, "reward_std": 0.01330691296607256, "rewards//mean": 0.87451171875, "rewards//std": 0.028369713574647903, "step": 1684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.337, "grad_norm": 1.2008116245269775, "kl": 0.1556827574968338, "learning_rate": 3.7719206215672046e-06, "loss": 0.0156, "num_tokens": 10991432.0, "reward": 0.84136962890625, "reward_std": 0.010501531884074211, "rewards//mean": 0.84136962890625, "rewards//std": 0.030310604721307755, "step": 1685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3372, "grad_norm": 1.1222370862960815, "kl": 0.12442900985479355, "learning_rate": 3.770554401771423e-06, "loss": 0.0124, "num_tokens": 10997920.0, "reward": 0.822265625, "reward_std": 0.012988422065973282, "rewards//mean": 0.822265625, "rewards//std": 0.023602265864610672, "step": 1686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3374, "grad_norm": 1.357275366783142, "kl": 0.14468492195010185, "learning_rate": 3.769187670196481e-06, "loss": 0.0145, "num_tokens": 11004464.0, "reward": 0.85101318359375, "reward_std": 0.01251860149204731, "rewards//mean": 0.85101318359375, "rewards//std": 0.03203735500574112, "step": 1687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3376, "grad_norm": 1.3790971040725708, "kl": 0.13679356407374144, "learning_rate": 3.7678204273928965e-06, "loss": 0.0137, "num_tokens": 11010952.0, "reward": 0.84967041015625, "reward_std": 0.016929667443037033, "rewards//mean": 0.84967041015625, "rewards//std": 0.028881540521979332, "step": 1688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3378, "grad_norm": 1.4123620986938477, "kl": 0.16785067785531282, "learning_rate": 3.766452673911396e-06, "loss": 0.0168, "num_tokens": 11017448.0, "reward": 0.85931396484375, "reward_std": 0.015209059230983257, "rewards//mean": 0.85931396484375, "rewards//std": 0.022471971809864044, "step": 1689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.338, "grad_norm": 1.264365315437317, "kl": 0.17842304427176714, "learning_rate": 3.7650844103029093e-06, "loss": 0.0178, "num_tokens": 11023992.0, "reward": 0.864013671875, "reward_std": 0.012815456837415695, "rewards//mean": 0.864013671875, "rewards//std": 0.028449639678001404, "step": 1690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3382, "grad_norm": 1.6134607791900635, "kl": 0.16491785272955894, "learning_rate": 3.7637156371185744e-06, "loss": 0.0165, "num_tokens": 11030600.0, "reward": 0.85723876953125, "reward_std": 0.012963094748556614, "rewards//mean": 0.85723876953125, "rewards//std": 0.01888091117143631, "step": 1691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3384, "grad_norm": 1.9365715980529785, "kl": 0.1763475425541401, "learning_rate": 3.7623463549097318e-06, "loss": 0.0176, "num_tokens": 11037096.0, "reward": 0.830322265625, "reward_std": 0.014493362046778202, "rewards//mean": 0.830322265625, "rewards//std": 0.02541922777891159, "step": 1692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3386, "grad_norm": 1.900067687034607, "kl": 0.18936949223279953, "learning_rate": 3.760976564227928e-06, "loss": 0.0189, "num_tokens": 11043584.0, "reward": 0.8150634765625, "reward_std": 0.014577506110072136, "rewards//mean": 0.8150634765625, "rewards//std": 0.019749755039811134, "step": 1693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3388, "grad_norm": 1.3144900798797607, "kl": 0.1887713335454464, "learning_rate": 3.759606265624915e-06, "loss": 0.0189, "num_tokens": 11050064.0, "reward": 0.79498291015625, "reward_std": 0.014395808801054955, "rewards//mean": 0.79498291015625, "rewards//std": 0.0337466262280941, "step": 1694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.339, "grad_norm": 1.3897720575332642, "kl": 0.19331872276961803, "learning_rate": 3.7582354596526493e-06, "loss": 0.0193, "num_tokens": 11056568.0, "reward": 0.83001708984375, "reward_std": 0.011129519902169704, "rewards//mean": 0.83001708984375, "rewards//std": 0.02671191282570362, "step": 1695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3392, "grad_norm": 1.028624176979065, "kl": 0.16877240128815174, "learning_rate": 3.7568641468632898e-06, "loss": 0.0169, "num_tokens": 11063008.0, "reward": 0.85614013671875, "reward_std": 0.01190134510397911, "rewards//mean": 0.85614013671875, "rewards//std": 0.023944122716784477, "step": 1696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3394, "grad_norm": 1.386691689491272, "kl": 0.18964815139770508, "learning_rate": 3.7554923278092037e-06, "loss": 0.019, "num_tokens": 11069528.0, "reward": 0.8572998046875, "reward_std": 0.015087602660059929, "rewards//mean": 0.8572998046875, "rewards//std": 0.030096912756562233, "step": 1697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3396, "grad_norm": 1.8007807731628418, "kl": 0.1702565187588334, "learning_rate": 3.754120003042957e-06, "loss": 0.017, "num_tokens": 11076104.0, "reward": 0.83660888671875, "reward_std": 0.010566698387265205, "rewards//mean": 0.83660888671875, "rewards//std": 0.025215260684490204, "step": 1698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3398, "grad_norm": 1.2226858139038086, "kl": 0.1682624090462923, "learning_rate": 3.752747173117324e-06, "loss": 0.0168, "num_tokens": 11082552.0, "reward": 0.88848876953125, "reward_std": 0.010222594253718853, "rewards//mean": 0.88848876953125, "rewards//std": 0.025647403672337532, "step": 1699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.34, "grad_norm": 1.1546058654785156, "kl": 0.14353855978697538, "learning_rate": 3.751373838585278e-06, "loss": 0.0144, "num_tokens": 11089080.0, "reward": 0.79400634765625, "reward_std": 0.008299615234136581, "rewards//mean": 0.79400634765625, "rewards//std": 0.020836150273680687, "step": 1700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3402, "grad_norm": 1.3053923845291138, "kl": 0.20300929248332977, "learning_rate": 3.7500000000000005e-06, "loss": 0.0203, "num_tokens": 11095592.0, "reward": 0.87542724609375, "reward_std": 0.010183681733906269, "rewards//mean": 0.87542724609375, "rewards//std": 0.023355897516012192, "step": 1701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3404, "grad_norm": 1.223368763923645, "kl": 0.1632108986377716, "learning_rate": 3.748625657914872e-06, "loss": 0.0163, "num_tokens": 11102128.0, "reward": 0.86151123046875, "reward_std": 0.015512991696596146, "rewards//mean": 0.86151123046875, "rewards//std": 0.03522803261876106, "step": 1702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3406, "grad_norm": 2.405582904815674, "kl": 0.16091686952859163, "learning_rate": 3.747250812883478e-06, "loss": 0.0161, "num_tokens": 11108720.0, "reward": 0.853759765625, "reward_std": 0.012506428174674511, "rewards//mean": 0.853759765625, "rewards//std": 0.03202228993177414, "step": 1703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3408, "grad_norm": 1.4617900848388672, "kl": 0.13038993813097477, "learning_rate": 3.7458754654596063e-06, "loss": 0.013, "num_tokens": 11115224.0, "reward": 0.8494873046875, "reward_std": 0.012755920179188251, "rewards//mean": 0.8494873046875, "rewards//std": 0.029098566621541977, "step": 1704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.341, "grad_norm": 1.0595983266830444, "kl": 0.1840918157249689, "learning_rate": 3.744499616197246e-06, "loss": 0.0184, "num_tokens": 11121800.0, "reward": 0.8050537109375, "reward_std": 0.011159805580973625, "rewards//mean": 0.8050537109375, "rewards//std": 0.02213781140744686, "step": 1705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3412, "grad_norm": 1.4514061212539673, "kl": 0.21766717731952667, "learning_rate": 3.743123265650589e-06, "loss": 0.0218, "num_tokens": 11128344.0, "reward": 0.84228515625, "reward_std": 0.011966688558459282, "rewards//mean": 0.84228515625, "rewards//std": 0.023730190470814705, "step": 1706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3414, "grad_norm": 1.345911979675293, "kl": 0.23854824621230364, "learning_rate": 3.7417464143740283e-06, "loss": 0.0239, "num_tokens": 11134952.0, "reward": 0.84320068359375, "reward_std": 0.012141108512878418, "rewards//mean": 0.84320068359375, "rewards//std": 0.02657441794872284, "step": 1707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3416, "grad_norm": 2.0054266452789307, "kl": 0.15904941130429506, "learning_rate": 3.740369062922161e-06, "loss": 0.0159, "num_tokens": 11141456.0, "reward": 0.8409423828125, "reward_std": 0.011485688388347626, "rewards//mean": 0.8409423828125, "rewards//std": 0.02016540989279747, "step": 1708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3418, "grad_norm": 1.5950835943222046, "kl": 0.19463621079921722, "learning_rate": 3.738991211849784e-06, "loss": 0.0195, "num_tokens": 11147992.0, "reward": 0.86370849609375, "reward_std": 0.013644029386341572, "rewards//mean": 0.86370849609375, "rewards//std": 0.03186490014195442, "step": 1709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.342, "grad_norm": 1.5421680212020874, "kl": 0.17644426971673965, "learning_rate": 3.7376128617118943e-06, "loss": 0.0176, "num_tokens": 11154560.0, "reward": 0.806640625, "reward_std": 0.011663274839520454, "rewards//mean": 0.806640625, "rewards//std": 0.02343750186264515, "step": 1710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3422, "grad_norm": 1.4455673694610596, "kl": 0.23444785550236702, "learning_rate": 3.7362340130636926e-06, "loss": 0.0234, "num_tokens": 11161208.0, "reward": 0.81536865234375, "reward_std": 0.011241000145673752, "rewards//mean": 0.81536865234375, "rewards//std": 0.02741224318742752, "step": 1711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3424, "grad_norm": 1.3178112506866455, "kl": 0.22703893575817347, "learning_rate": 3.7348546664605777e-06, "loss": 0.0227, "num_tokens": 11167648.0, "reward": 0.845703125, "reward_std": 0.01521255448460579, "rewards//mean": 0.845703125, "rewards//std": 0.04049338772892952, "step": 1712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3426, "grad_norm": 1.121376395225525, "kl": 0.1903177471831441, "learning_rate": 3.7334748224581507e-06, "loss": 0.019, "num_tokens": 11174168.0, "reward": 0.83489990234375, "reward_std": 0.012005062773823738, "rewards//mean": 0.83489990234375, "rewards//std": 0.024051988497376442, "step": 1713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3428, "grad_norm": 1.6415040493011475, "kl": 0.13600875157862902, "learning_rate": 3.732094481612214e-06, "loss": 0.0136, "num_tokens": 11180656.0, "reward": 0.82110595703125, "reward_std": 0.010404227301478386, "rewards//mean": 0.82110595703125, "rewards//std": 0.022468602284789085, "step": 1714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.343, "grad_norm": 0.9488919973373413, "kl": 0.1049012178555131, "learning_rate": 3.730713644478766e-06, "loss": 0.0105, "num_tokens": 11187144.0, "reward": 0.86126708984375, "reward_std": 0.009814893826842308, "rewards//mean": 0.86126708984375, "rewards//std": 0.020457835868000984, "step": 1715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3432, "grad_norm": 1.118933916091919, "kl": 0.23077546618878841, "learning_rate": 3.72933231161401e-06, "loss": 0.0231, "num_tokens": 11193592.0, "reward": 0.855712890625, "reward_std": 0.010181194171309471, "rewards//mean": 0.855712890625, "rewards//std": 0.024566426873207092, "step": 1716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3434, "grad_norm": 1.3336840867996216, "kl": 0.19039161736145616, "learning_rate": 3.7279504835743453e-06, "loss": 0.019, "num_tokens": 11200048.0, "reward": 0.8172607421875, "reward_std": 0.013428143225610256, "rewards//mean": 0.8172607421875, "rewards//std": 0.017196359112858772, "step": 1717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3436, "grad_norm": 0.9654021263122559, "kl": 0.14221018180251122, "learning_rate": 3.726568160916373e-06, "loss": 0.0142, "num_tokens": 11206536.0, "reward": 0.86181640625, "reward_std": 0.011711150407791138, "rewards//mean": 0.86181640625, "rewards//std": 0.02371998317539692, "step": 1718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3438, "grad_norm": 1.1966793537139893, "kl": 0.12163388077169657, "learning_rate": 3.725185344196892e-06, "loss": 0.0122, "num_tokens": 11213024.0, "reward": 0.8004150390625, "reward_std": 0.008779608644545078, "rewards//mean": 0.8004150390625, "rewards//std": 0.018172195181250572, "step": 1719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.344, "grad_norm": 1.4464792013168335, "kl": 0.2638219762593508, "learning_rate": 3.7238020339729015e-06, "loss": 0.0264, "num_tokens": 11219600.0, "reward": 0.84039306640625, "reward_std": 0.015688665211200714, "rewards//mean": 0.84039306640625, "rewards//std": 0.019671708345413208, "step": 1720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3442, "grad_norm": 1.2265249490737915, "kl": 0.1198842627927661, "learning_rate": 3.7224182308015977e-06, "loss": 0.012, "num_tokens": 11226312.0, "reward": 0.87054443359375, "reward_std": 0.01065195444971323, "rewards//mean": 0.87054443359375, "rewards//std": 0.021246224641799927, "step": 1721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3444, "grad_norm": 1.2761610746383667, "kl": 0.16249667573720217, "learning_rate": 3.721033935240376e-06, "loss": 0.0162, "num_tokens": 11232768.0, "reward": 0.862548828125, "reward_std": 0.01527030672878027, "rewards//mean": 0.862548828125, "rewards//std": 0.02704427018761635, "step": 1722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3446, "grad_norm": 1.365539789199829, "kl": 0.22107641631737351, "learning_rate": 3.7196491478468322e-06, "loss": 0.0221, "num_tokens": 11239328.0, "reward": 0.8441162109375, "reward_std": 0.013919533230364323, "rewards//mean": 0.8441162109375, "rewards//std": 0.026362596079707146, "step": 1723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3448, "grad_norm": 1.1789425611495972, "kl": 0.15124674141407013, "learning_rate": 3.718263869178757e-06, "loss": 0.0151, "num_tokens": 11245816.0, "reward": 0.8443603515625, "reward_std": 0.013836739584803581, "rewards//mean": 0.8443603515625, "rewards//std": 0.0309244804084301, "step": 1724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.345, "grad_norm": 1.2515184879302979, "kl": 0.22264001425355673, "learning_rate": 3.716878099794141e-06, "loss": 0.0223, "num_tokens": 11252352.0, "reward": 0.79888916015625, "reward_std": 0.014435089193284512, "rewards//mean": 0.79888916015625, "rewards//std": 0.02820267528295517, "step": 1725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3452, "grad_norm": 1.2482409477233887, "kl": 0.13545828079804778, "learning_rate": 3.715491840251172e-06, "loss": 0.0135, "num_tokens": 11258904.0, "reward": 0.86285400390625, "reward_std": 0.013001851737499237, "rewards//mean": 0.86285400390625, "rewards//std": 0.020589124411344528, "step": 1726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3454, "grad_norm": 1.420387864112854, "kl": 0.19200598821043968, "learning_rate": 3.7141050911082357e-06, "loss": 0.0192, "num_tokens": 11265536.0, "reward": 0.83453369140625, "reward_std": 0.018285386264324188, "rewards//mean": 0.83453369140625, "rewards//std": 0.040201447904109955, "step": 1727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3456, "grad_norm": 1.2384271621704102, "kl": 0.12268851697444916, "learning_rate": 3.7127178529239126e-06, "loss": 0.0123, "num_tokens": 11272112.0, "reward": 0.8193359375, "reward_std": 0.012207691557705402, "rewards//mean": 0.8193359375, "rewards//std": 0.020002983510494232, "step": 1728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3458, "grad_norm": 1.0250064134597778, "kl": 0.12968253623694181, "learning_rate": 3.7113301262569845e-06, "loss": 0.013, "num_tokens": 11278640.0, "reward": 0.8134765625, "reward_std": 0.011040281504392624, "rewards//mean": 0.8134765625, "rewards//std": 0.028125345706939697, "step": 1729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.346, "grad_norm": 1.2409635782241821, "kl": 0.16847587376832962, "learning_rate": 3.7099419116664255e-06, "loss": 0.0168, "num_tokens": 11285208.0, "reward": 0.84649658203125, "reward_std": 0.015550029464066029, "rewards//mean": 0.84649658203125, "rewards//std": 0.03176259621977806, "step": 1730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3462, "grad_norm": 1.1247594356536865, "kl": 0.13143022172152996, "learning_rate": 3.7085532097114098e-06, "loss": 0.0131, "num_tokens": 11291648.0, "reward": 0.80645751953125, "reward_std": 0.014143487438559532, "rewards//mean": 0.80645751953125, "rewards//std": 0.025835584849119186, "step": 1731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3464, "grad_norm": 1.028573989868164, "kl": 0.1156574496999383, "learning_rate": 3.7071640209513054e-06, "loss": 0.0116, "num_tokens": 11298120.0, "reward": 0.8671875, "reward_std": 0.011460047215223312, "rewards//mean": 0.8671875, "rewards//std": 0.017524652183055878, "step": 1732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3466, "grad_norm": 1.4063475131988525, "kl": 0.14531050063669682, "learning_rate": 3.7057743459456786e-06, "loss": 0.0145, "num_tokens": 11304648.0, "reward": 0.8577880859375, "reward_std": 0.01344828587025404, "rewards//mean": 0.8577880859375, "rewards//std": 0.02870258502662182, "step": 1733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3468, "grad_norm": 1.2181246280670166, "kl": 0.13775143213570118, "learning_rate": 3.7043841852542884e-06, "loss": 0.0138, "num_tokens": 11311248.0, "reward": 0.8074951171875, "reward_std": 0.009472533129155636, "rewards//mean": 0.8074951171875, "rewards//std": 0.02667086571455002, "step": 1734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.347, "grad_norm": 2.348598003387451, "kl": 0.1715964525938034, "learning_rate": 3.702993539437093e-06, "loss": 0.0172, "num_tokens": 11317728.0, "reward": 0.87213134765625, "reward_std": 0.01654331013560295, "rewards//mean": 0.87213134765625, "rewards//std": 0.024352213367819786, "step": 1735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3472, "grad_norm": 1.0899401903152466, "kl": 0.10013393219560385, "learning_rate": 3.7016024090542436e-06, "loss": 0.01, "num_tokens": 11324264.0, "reward": 0.87847900390625, "reward_std": 0.012722134590148926, "rewards//mean": 0.87847900390625, "rewards//std": 0.025809206068515778, "step": 1736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3474, "grad_norm": 1.524187684059143, "kl": 0.15203462820500135, "learning_rate": 3.7002107946660874e-06, "loss": 0.0152, "num_tokens": 11330728.0, "reward": 0.890869140625, "reward_std": 0.012501136399805546, "rewards//mean": 0.890869140625, "rewards//std": 0.020420897752046585, "step": 1737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3476, "grad_norm": 1.082101583480835, "kl": 0.13634513597935438, "learning_rate": 3.6988186968331667e-06, "loss": 0.0136, "num_tokens": 11337384.0, "reward": 0.8331298828125, "reward_std": 0.007592954207211733, "rewards//mean": 0.8331298828125, "rewards//std": 0.015112322755157948, "step": 1738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3478, "grad_norm": 1.2019413709640503, "kl": 0.17733390675857663, "learning_rate": 3.6974261161162182e-06, "loss": 0.0177, "num_tokens": 11343944.0, "reward": 0.8505859375, "reward_std": 0.010301269590854645, "rewards//mean": 0.8505859375, "rewards//std": 0.020493414252996445, "step": 1739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.348, "grad_norm": 1.2776867151260376, "kl": 0.21086320839822292, "learning_rate": 3.6960330530761735e-06, "loss": 0.0211, "num_tokens": 11350480.0, "reward": 0.85504150390625, "reward_std": 0.017650537192821503, "rewards//mean": 0.85504150390625, "rewards//std": 0.037687335163354874, "step": 1740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3482, "grad_norm": 1.1269402503967285, "kl": 0.139348984695971, "learning_rate": 3.6946395082741582e-06, "loss": 0.0139, "num_tokens": 11356960.0, "reward": 0.83502197265625, "reward_std": 0.010757364332675934, "rewards//mean": 0.83502197265625, "rewards//std": 0.027898307889699936, "step": 1741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3484, "grad_norm": 1.0949785709381104, "kl": 0.15068648289889097, "learning_rate": 3.6932454822714915e-06, "loss": 0.0151, "num_tokens": 11363472.0, "reward": 0.86700439453125, "reward_std": 0.015072057023644447, "rewards//mean": 0.86700439453125, "rewards//std": 0.043109044432640076, "step": 1742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3486, "grad_norm": 1.0529556274414062, "kl": 0.123992002569139, "learning_rate": 3.6918509756296876e-06, "loss": 0.0124, "num_tokens": 11369912.0, "reward": 0.8145751953125, "reward_std": 0.011777686886489391, "rewards//mean": 0.8145751953125, "rewards//std": 0.0208213422447443, "step": 1743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3488, "grad_norm": 1.297634482383728, "kl": 0.18713740166276693, "learning_rate": 3.6904559889104534e-06, "loss": 0.0187, "num_tokens": 11376480.0, "reward": 0.8505859375, "reward_std": 0.01412765122950077, "rewards//mean": 0.8505859375, "rewards//std": 0.021609429270029068, "step": 1744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.349, "grad_norm": 1.4217078685760498, "kl": 0.2007905002683401, "learning_rate": 3.689060522675689e-06, "loss": 0.0201, "num_tokens": 11383000.0, "reward": 0.87518310546875, "reward_std": 0.014552782289683819, "rewards//mean": 0.87518310546875, "rewards//std": 0.030420776456594467, "step": 1745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3492, "grad_norm": 1.4188538789749146, "kl": 0.19861986488103867, "learning_rate": 3.6876645774874882e-06, "loss": 0.0199, "num_tokens": 11389504.0, "reward": 0.89190673828125, "reward_std": 0.010491138324141502, "rewards//mean": 0.89190673828125, "rewards//std": 0.018150001764297485, "step": 1746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3494, "grad_norm": 1.4209965467453003, "kl": 0.18480386771261692, "learning_rate": 3.686268153908137e-06, "loss": 0.0185, "num_tokens": 11395976.0, "reward": 0.8609619140625, "reward_std": 0.01842474937438965, "rewards//mean": 0.8609619140625, "rewards//std": 0.028499344363808632, "step": 1747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3496, "grad_norm": 1.1185232400894165, "kl": 0.24174714088439941, "learning_rate": 3.684871252500116e-06, "loss": 0.0242, "num_tokens": 11402488.0, "reward": 0.8946533203125, "reward_std": 0.01326714176684618, "rewards//mean": 0.8946533203125, "rewards//std": 0.02971627563238144, "step": 1748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3498, "grad_norm": 1.312836766242981, "kl": 0.17498574033379555, "learning_rate": 3.6834738738260955e-06, "loss": 0.0175, "num_tokens": 11408936.0, "reward": 0.7664794921875, "reward_std": 0.01111630443483591, "rewards//mean": 0.7664794921875, "rewards//std": 0.019923744723200798, "step": 1749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.35, "grad_norm": 1.2983803749084473, "kl": 0.15632094908505678, "learning_rate": 3.6820760184489413e-06, "loss": 0.0156, "num_tokens": 11415432.0, "reward": 0.87603759765625, "reward_std": 0.01099543459713459, "rewards//mean": 0.87603759765625, "rewards//std": 0.014963719993829727, "step": 1750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3502, "grad_norm": 1.2889938354492188, "kl": 0.21109430585056543, "learning_rate": 3.6806776869317074e-06, "loss": 0.0211, "num_tokens": 11421888.0, "reward": 0.85308837890625, "reward_std": 0.009937352500855923, "rewards//mean": 0.85308837890625, "rewards//std": 0.018748154863715172, "step": 1751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3504, "grad_norm": 1.6869314908981323, "kl": 0.16373597458004951, "learning_rate": 3.679278879837642e-06, "loss": 0.0164, "num_tokens": 11428496.0, "reward": 0.826416015625, "reward_std": 0.011646917089819908, "rewards//mean": 0.826416015625, "rewards//std": 0.022966446354985237, "step": 1752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3506, "grad_norm": 1.52003014087677, "kl": 0.23184552043676376, "learning_rate": 3.6778795977301856e-06, "loss": 0.0232, "num_tokens": 11435192.0, "reward": 0.7420654296875, "reward_std": 0.00913048442453146, "rewards//mean": 0.7420654296875, "rewards//std": 0.02402908354997635, "step": 1753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3508, "grad_norm": 1.4098858833312988, "kl": 0.27462242962792516, "learning_rate": 3.676479841172968e-06, "loss": 0.0275, "num_tokens": 11441704.0, "reward": 0.86895751953125, "reward_std": 0.014187974855303764, "rewards//mean": 0.86895751953125, "rewards//std": 0.03078877367079258, "step": 1754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.351, "grad_norm": 1.2183738946914673, "kl": 0.2601485443301499, "learning_rate": 3.675079610729811e-06, "loss": 0.026, "num_tokens": 11448224.0, "reward": 0.850341796875, "reward_std": 0.011326085776090622, "rewards//mean": 0.850341796875, "rewards//std": 0.032105378806591034, "step": 1755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3512, "grad_norm": 1.687752604484558, "kl": 0.24641847796738148, "learning_rate": 3.6736789069647273e-06, "loss": 0.0246, "num_tokens": 11454624.0, "reward": 0.77880859375, "reward_std": 0.007466997019946575, "rewards//mean": 0.77880859375, "rewards//std": 0.02201472781598568, "step": 1756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3514, "grad_norm": 1.1488102674484253, "kl": 0.16935606393963099, "learning_rate": 3.67227773044192e-06, "loss": 0.0169, "num_tokens": 11461080.0, "reward": 0.8524169921875, "reward_std": 0.011813574470579624, "rewards//mean": 0.8524169921875, "rewards//std": 0.025243235751986504, "step": 1757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3516, "grad_norm": 1.2389283180236816, "kl": 0.23943752888590097, "learning_rate": 3.670876081725784e-06, "loss": 0.0239, "num_tokens": 11467608.0, "reward": 0.855712890625, "reward_std": 0.010093769058585167, "rewards//mean": 0.855712890625, "rewards//std": 0.023906873539090157, "step": 1758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3518, "grad_norm": 1.5454990863800049, "kl": 0.2193242134526372, "learning_rate": 3.6694739613809015e-06, "loss": 0.0219, "num_tokens": 11474128.0, "reward": 0.8360595703125, "reward_std": 0.00892927497625351, "rewards//mean": 0.8360595703125, "rewards//std": 0.021395059302449226, "step": 1759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.352, "grad_norm": 1.3443801403045654, "kl": 0.21726640639826655, "learning_rate": 3.6680713699720473e-06, "loss": 0.0217, "num_tokens": 11480736.0, "reward": 0.86761474609375, "reward_std": 0.01264202781021595, "rewards//mean": 0.86761474609375, "rewards//std": 0.027543358504772186, "step": 1760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3522, "grad_norm": 1.4039642810821533, "kl": 0.13493547588586807, "learning_rate": 3.6666683080641846e-06, "loss": 0.0135, "num_tokens": 11487240.0, "reward": 0.86419677734375, "reward_std": 0.009548654779791832, "rewards//mean": 0.86419677734375, "rewards//std": 0.017115512862801552, "step": 1761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3524, "grad_norm": 1.3398916721343994, "kl": 0.167149699293077, "learning_rate": 3.665264776222467e-06, "loss": 0.0167, "num_tokens": 11493800.0, "reward": 0.84521484375, "reward_std": 0.0087888203561306, "rewards//mean": 0.84521484375, "rewards//std": 0.03050878830254078, "step": 1762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3526, "grad_norm": 1.6382014751434326, "kl": 0.22538587357848883, "learning_rate": 3.663860775012238e-06, "loss": 0.0225, "num_tokens": 11500376.0, "reward": 0.8282470703125, "reward_std": 0.011912903748452663, "rewards//mean": 0.8282470703125, "rewards//std": 0.024239830672740936, "step": 1763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3528, "grad_norm": 2.2193081378936768, "kl": 0.2061766628175974, "learning_rate": 3.662456304999027e-06, "loss": 0.0206, "num_tokens": 11506936.0, "reward": 0.85577392578125, "reward_std": 0.016354814171791077, "rewards//mean": 0.85577392578125, "rewards//std": 0.02880018576979637, "step": 1764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.353, "grad_norm": 1.2258036136627197, "kl": 0.278794564306736, "learning_rate": 3.661051366748555e-06, "loss": 0.0279, "num_tokens": 11513480.0, "reward": 0.81939697265625, "reward_std": 0.01539636217057705, "rewards//mean": 0.81939697265625, "rewards//std": 0.0241073090583086, "step": 1765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3532, "grad_norm": 1.278024673461914, "kl": 0.3183693792670965, "learning_rate": 3.659645960826732e-06, "loss": 0.0318, "num_tokens": 11519992.0, "reward": 0.80706787109375, "reward_std": 0.010747987776994705, "rewards//mean": 0.80706787109375, "rewards//std": 0.019358327612280846, "step": 1766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3534, "grad_norm": 1.2269108295440674, "kl": 0.17436159402132034, "learning_rate": 3.658240087799655e-06, "loss": 0.0174, "num_tokens": 11526536.0, "reward": 0.82928466796875, "reward_std": 0.008262701332569122, "rewards//mean": 0.82928466796875, "rewards//std": 0.020155929028987885, "step": 1767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3536, "grad_norm": 2.063746452331543, "kl": 0.25868869200348854, "learning_rate": 3.656833748233608e-06, "loss": 0.0259, "num_tokens": 11533008.0, "reward": 0.8712158203125, "reward_std": 0.010420592501759529, "rewards//mean": 0.8712158203125, "rewards//std": 0.027054063975811005, "step": 1768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3538, "grad_norm": 1.617897391319275, "kl": 0.20325280260294676, "learning_rate": 3.6554269426950666e-06, "loss": 0.0203, "num_tokens": 11539496.0, "reward": 0.85601806640625, "reward_std": 0.011717898771166801, "rewards//mean": 0.85601806640625, "rewards//std": 0.02896423637866974, "step": 1769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.354, "grad_norm": 1.0743240118026733, "kl": 0.1971705248579383, "learning_rate": 3.6540196717506905e-06, "loss": 0.0197, "num_tokens": 11546080.0, "reward": 0.8319091796875, "reward_std": 0.014976841397583485, "rewards//mean": 0.8319091796875, "rewards//std": 0.0424610897898674, "step": 1770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3542, "grad_norm": 1.4917166233062744, "kl": 0.2164121177047491, "learning_rate": 3.6526119359673283e-06, "loss": 0.0216, "num_tokens": 11552632.0, "reward": 0.84783935546875, "reward_std": 0.015939585864543915, "rewards//mean": 0.84783935546875, "rewards//std": 0.023203087970614433, "step": 1771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3544, "grad_norm": 1.0517910718917847, "kl": 0.2149463901296258, "learning_rate": 3.651203735912017e-06, "loss": 0.0215, "num_tokens": 11559200.0, "reward": 0.8424072265625, "reward_std": 0.010603864677250385, "rewards//mean": 0.8424072265625, "rewards//std": 0.01664891093969345, "step": 1772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3546, "grad_norm": 1.5117005109786987, "kl": 0.2513514244928956, "learning_rate": 3.6497950721519777e-06, "loss": 0.0251, "num_tokens": 11565688.0, "reward": 0.84356689453125, "reward_std": 0.010663229040801525, "rewards//mean": 0.84356689453125, "rewards//std": 0.025943171232938766, "step": 1773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3548, "grad_norm": 1.967753529548645, "kl": 0.28735970240086317, "learning_rate": 3.648385945254621e-06, "loss": 0.0287, "num_tokens": 11572152.0, "reward": 0.830078125, "reward_std": 0.015031819231808186, "rewards//mean": 0.830078125, "rewards//std": 0.028064999729394913, "step": 1774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.355, "grad_norm": 1.397688865661621, "kl": 0.2901907442137599, "learning_rate": 3.646976355787543e-06, "loss": 0.029, "num_tokens": 11578624.0, "reward": 0.849365234375, "reward_std": 0.012732641771435738, "rewards//mean": 0.849365234375, "rewards//std": 0.027856020256876945, "step": 1775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3552, "grad_norm": 1.706593632698059, "kl": 0.16542614251375198, "learning_rate": 3.6455663043185264e-06, "loss": 0.0165, "num_tokens": 11585128.0, "reward": 0.88580322265625, "reward_std": 0.011283708736300468, "rewards//mean": 0.88580322265625, "rewards//std": 0.03524048998951912, "step": 1776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3554, "grad_norm": 1.980695366859436, "kl": 0.17941990494728088, "learning_rate": 3.644155791415539e-06, "loss": 0.0179, "num_tokens": 11591712.0, "reward": 0.82769775390625, "reward_std": 0.012753150425851345, "rewards//mean": 0.82769775390625, "rewards//std": 0.018896138295531273, "step": 1777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3556, "grad_norm": 1.5060824155807495, "kl": 0.2359601091593504, "learning_rate": 3.642744817646736e-06, "loss": 0.0236, "num_tokens": 11598200.0, "reward": 0.833251953125, "reward_std": 0.011173607781529427, "rewards//mean": 0.833251953125, "rewards//std": 0.016231337562203407, "step": 1778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3558, "grad_norm": 1.4473319053649902, "kl": 0.14659902080893517, "learning_rate": 3.6413333835804567e-06, "loss": 0.0147, "num_tokens": 11604752.0, "reward": 0.80645751953125, "reward_std": 0.011671839281916618, "rewards//mean": 0.80645751953125, "rewards//std": 0.02234700322151184, "step": 1779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.356, "grad_norm": 1.4868983030319214, "kl": 0.14992115460336208, "learning_rate": 3.639921489785227e-06, "loss": 0.015, "num_tokens": 11611336.0, "reward": 0.84490966796875, "reward_std": 0.013040047138929367, "rewards//mean": 0.84490966796875, "rewards//std": 0.022409237921237946, "step": 1780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3562, "grad_norm": 1.4008314609527588, "kl": 0.16750399116426706, "learning_rate": 3.6385091368297582e-06, "loss": 0.0168, "num_tokens": 11617824.0, "reward": 0.84521484375, "reward_std": 0.011480474844574928, "rewards//mean": 0.84521484375, "rewards//std": 0.023025689646601677, "step": 1781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3564, "grad_norm": 1.7435966730117798, "kl": 0.18702357728034258, "learning_rate": 3.637096325282945e-06, "loss": 0.0187, "num_tokens": 11624320.0, "reward": 0.881103515625, "reward_std": 0.013019557110965252, "rewards//mean": 0.881103515625, "rewards//std": 0.024586137384176254, "step": 1782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3566, "grad_norm": 1.4326269626617432, "kl": 0.18929530680179596, "learning_rate": 3.6356830557138673e-06, "loss": 0.0189, "num_tokens": 11630888.0, "reward": 0.89874267578125, "reward_std": 0.015047634020447731, "rewards//mean": 0.89874267578125, "rewards//std": 0.03292056918144226, "step": 1783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3568, "grad_norm": 1.294507622718811, "kl": 0.2925416938960552, "learning_rate": 3.6342693286917906e-06, "loss": 0.0293, "num_tokens": 11637424.0, "reward": 0.8089599609375, "reward_std": 0.012834219262003899, "rewards//mean": 0.8089599609375, "rewards//std": 0.0204071793705225, "step": 1784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.357, "grad_norm": 1.5203204154968262, "kl": 0.16781086660921574, "learning_rate": 3.632855144786164e-06, "loss": 0.0168, "num_tokens": 11643880.0, "reward": 0.83636474609375, "reward_std": 0.011852528899908066, "rewards//mean": 0.83636474609375, "rewards//std": 0.023449042811989784, "step": 1785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3572, "grad_norm": 1.3981322050094604, "kl": 0.17373729264363647, "learning_rate": 3.631440504566621e-06, "loss": 0.0174, "num_tokens": 11650400.0, "reward": 0.8720703125, "reward_std": 0.011949749663472176, "rewards//mean": 0.8720703125, "rewards//std": 0.021167796105146408, "step": 1786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3574, "grad_norm": 1.5132122039794922, "kl": 0.23027662001550198, "learning_rate": 3.630025408602978e-06, "loss": 0.023, "num_tokens": 11656888.0, "reward": 0.8841552734375, "reward_std": 0.01609812118113041, "rewards//mean": 0.8841552734375, "rewards//std": 0.03747531399130821, "step": 1787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3576, "grad_norm": 1.4388235807418823, "kl": 0.21792693249881268, "learning_rate": 3.6286098574652358e-06, "loss": 0.0218, "num_tokens": 11663320.0, "reward": 0.85369873046875, "reward_std": 0.014487190172076225, "rewards//mean": 0.85369873046875, "rewards//std": 0.02969149686396122, "step": 1788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3578, "grad_norm": 1.8549010753631592, "kl": 0.25830122362822294, "learning_rate": 3.627193851723577e-06, "loss": 0.0258, "num_tokens": 11669792.0, "reward": 0.81048583984375, "reward_std": 0.010726533830165863, "rewards//mean": 0.81048583984375, "rewards//std": 0.02675721049308777, "step": 1789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.358, "grad_norm": 1.4389604330062866, "kl": 0.29452634416520596, "learning_rate": 3.6257773919483706e-06, "loss": 0.0295, "num_tokens": 11676312.0, "reward": 0.808837890625, "reward_std": 0.012015480548143387, "rewards//mean": 0.808837890625, "rewards//std": 0.026756148785352707, "step": 1790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3582, "grad_norm": 1.717443823814392, "kl": 0.16759195178747177, "learning_rate": 3.624360478710165e-06, "loss": 0.0168, "num_tokens": 11682872.0, "reward": 0.79730224609375, "reward_std": 0.011939364485442638, "rewards//mean": 0.79730224609375, "rewards//std": 0.020304834470152855, "step": 1791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3584, "grad_norm": 1.7203381061553955, "kl": 0.47356833331286907, "learning_rate": 3.622943112579693e-06, "loss": 0.0474, "num_tokens": 11689376.0, "reward": 0.8453369140625, "reward_std": 0.015297693200409412, "rewards//mean": 0.8453369140625, "rewards//std": 0.04424193128943443, "step": 1792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3586, "grad_norm": 1.6547951698303223, "kl": 0.26839224342256784, "learning_rate": 3.621525294127869e-06, "loss": 0.0268, "num_tokens": 11695880.0, "reward": 0.83233642578125, "reward_std": 0.012475302442908287, "rewards//mean": 0.83233642578125, "rewards//std": 0.024022389203310013, "step": 1793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3588, "grad_norm": 1.2556886672973633, "kl": 0.21179917082190514, "learning_rate": 3.6201070239257914e-06, "loss": 0.0212, "num_tokens": 11702408.0, "reward": 0.876220703125, "reward_std": 0.012393772602081299, "rewards//mean": 0.876220703125, "rewards//std": 0.01862187311053276, "step": 1794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.359, "grad_norm": 1.382396936416626, "kl": 0.19932799506932497, "learning_rate": 3.6186883025447382e-06, "loss": 0.0199, "num_tokens": 11708960.0, "reward": 0.88922119140625, "reward_std": 0.013840220868587494, "rewards//mean": 0.88922119140625, "rewards//std": 0.02854730747640133, "step": 1795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3592, "grad_norm": 1.2955398559570312, "kl": 0.29645396769046783, "learning_rate": 3.617269130556171e-06, "loss": 0.0296, "num_tokens": 11715432.0, "reward": 0.85430908203125, "reward_std": 0.013686101883649826, "rewards//mean": 0.85430908203125, "rewards//std": 0.02731875702738762, "step": 1796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3594, "grad_norm": 1.0753570795059204, "kl": 0.19395678117871284, "learning_rate": 3.61584950853173e-06, "loss": 0.0194, "num_tokens": 11721912.0, "reward": 0.83209228515625, "reward_std": 0.01332775503396988, "rewards//mean": 0.83209228515625, "rewards//std": 0.018075620755553246, "step": 1797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3596, "grad_norm": 2.036764144897461, "kl": 0.3178403824567795, "learning_rate": 3.6144294370432427e-06, "loss": 0.0318, "num_tokens": 11728368.0, "reward": 0.8101806640625, "reward_std": 0.010789253748953342, "rewards//mean": 0.8101806640625, "rewards//std": 0.016018707305192947, "step": 1798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3598, "grad_norm": 1.9756271839141846, "kl": 0.1898160008713603, "learning_rate": 3.6130089166627102e-06, "loss": 0.019, "num_tokens": 11734816.0, "reward": 0.85443115234375, "reward_std": 0.014439236372709274, "rewards//mean": 0.85443115234375, "rewards//std": 0.031088702380657196, "step": 1799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.36, "grad_norm": 1.4417182207107544, "kl": 0.31596218422055244, "learning_rate": 3.611587947962319e-06, "loss": 0.0316, "num_tokens": 11741272.0, "reward": 0.8343505859375, "reward_std": 0.01093918364495039, "rewards//mean": 0.8343505859375, "rewards//std": 0.016258826479315758, "step": 1800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3602, "grad_norm": 1.7080986499786377, "kl": 0.21893007960170507, "learning_rate": 3.6101665315144357e-06, "loss": 0.0219, "num_tokens": 11747672.0, "reward": 0.784912109375, "reward_std": 0.014078229665756226, "rewards//mean": 0.784912109375, "rewards//std": 0.023764614015817642, "step": 1801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3604, "grad_norm": 1.4053176641464233, "kl": 0.2934566307812929, "learning_rate": 3.608744667891606e-06, "loss": 0.0293, "num_tokens": 11754344.0, "reward": 0.7884521484375, "reward_std": 0.016613099724054337, "rewards//mean": 0.7884521484375, "rewards//std": 0.04327608644962311, "step": 1802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3606, "grad_norm": 2.3433194160461426, "kl": 0.29821125231683254, "learning_rate": 3.607322357666557e-06, "loss": 0.0298, "num_tokens": 11760824.0, "reward": 0.82965087890625, "reward_std": 0.014315242879092693, "rewards//mean": 0.82965087890625, "rewards//std": 0.02180599234998226, "step": 1803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3608, "grad_norm": 1.3814597129821777, "kl": 0.21451814286410809, "learning_rate": 3.6058996014121944e-06, "loss": 0.0215, "num_tokens": 11767336.0, "reward": 0.8458251953125, "reward_std": 0.01219137478619814, "rewards//mean": 0.8458251953125, "rewards//std": 0.02330244705080986, "step": 1804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.361, "grad_norm": 1.354205846786499, "kl": 0.22424293402582407, "learning_rate": 3.6044763997016054e-06, "loss": 0.0224, "num_tokens": 11773808.0, "reward": 0.88311767578125, "reward_std": 0.011723622679710388, "rewards//mean": 0.88311767578125, "rewards//std": 0.018692361190915108, "step": 1805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3612, "grad_norm": 1.401512861251831, "kl": 0.20975149795413017, "learning_rate": 3.6030527531080533e-06, "loss": 0.021, "num_tokens": 11780304.0, "reward": 0.87054443359375, "reward_std": 0.014817184768617153, "rewards//mean": 0.87054443359375, "rewards//std": 0.026846449822187424, "step": 1806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3614, "grad_norm": 1.374791145324707, "kl": 0.18597988225519657, "learning_rate": 3.6016286622049857e-06, "loss": 0.0186, "num_tokens": 11786784.0, "reward": 0.85760498046875, "reward_std": 0.017611630260944366, "rewards//mean": 0.85760498046875, "rewards//std": 0.03412444889545441, "step": 1807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3616, "grad_norm": 1.386276364326477, "kl": 0.2653267178684473, "learning_rate": 3.600204127566023e-06, "loss": 0.0265, "num_tokens": 11793208.0, "reward": 0.82080078125, "reward_std": 0.009636849164962769, "rewards//mean": 0.82080078125, "rewards//std": 0.016195859760046005, "step": 1808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3618, "grad_norm": 1.5966846942901611, "kl": 0.15748619940131903, "learning_rate": 3.5987791497649705e-06, "loss": 0.0157, "num_tokens": 11799792.0, "reward": 0.86566162109375, "reward_std": 0.013208129443228245, "rewards//mean": 0.86566162109375, "rewards//std": 0.030501779168844223, "step": 1809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.362, "grad_norm": 2.5419254302978516, "kl": 0.2868408281356096, "learning_rate": 3.5973537293758076e-06, "loss": 0.0287, "num_tokens": 11806376.0, "reward": 0.82720947265625, "reward_std": 0.012264983728528023, "rewards//mean": 0.82720947265625, "rewards//std": 0.026436205953359604, "step": 1810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3622, "grad_norm": 1.3745231628417969, "kl": 0.2988817114382982, "learning_rate": 3.595927866972694e-06, "loss": 0.0299, "num_tokens": 11813128.0, "reward": 0.84503173828125, "reward_std": 0.01289953663945198, "rewards//mean": 0.84503173828125, "rewards//std": 0.04176568612456322, "step": 1811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3624, "grad_norm": 1.289957046508789, "kl": 0.2103986470028758, "learning_rate": 3.594501563129966e-06, "loss": 0.021, "num_tokens": 11819736.0, "reward": 0.800537109375, "reward_std": 0.012647994793951511, "rewards//mean": 0.800537109375, "rewards//std": 0.027320489287376404, "step": 1812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3626, "grad_norm": 2.153311014175415, "kl": 0.3064973447471857, "learning_rate": 3.59307481842214e-06, "loss": 0.0306, "num_tokens": 11826344.0, "reward": 0.84326171875, "reward_std": 0.013538805767893791, "rewards//mean": 0.84326171875, "rewards//std": 0.03340402990579605, "step": 1813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3628, "grad_norm": 1.2736635208129883, "kl": 0.24693956691771746, "learning_rate": 3.5916476334239077e-06, "loss": 0.0247, "num_tokens": 11832848.0, "reward": 0.84320068359375, "reward_std": 0.01287849061191082, "rewards//mean": 0.84320068359375, "rewards//std": 0.03103899583220482, "step": 1814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.363, "grad_norm": 2.0980098247528076, "kl": 0.25316454749554396, "learning_rate": 3.5902200087101386e-06, "loss": 0.0253, "num_tokens": 11839304.0, "reward": 0.8660888671875, "reward_std": 0.014943184331059456, "rewards//mean": 0.8660888671875, "rewards//std": 0.01739242486655712, "step": 1815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3632, "grad_norm": 2.218087673187256, "kl": 0.16594486124813557, "learning_rate": 3.5887919448558813e-06, "loss": 0.0166, "num_tokens": 11845840.0, "reward": 0.88604736328125, "reward_std": 0.011849489063024521, "rewards//mean": 0.88604736328125, "rewards//std": 0.039809055626392365, "step": 1816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3634, "grad_norm": 1.2509801387786865, "kl": 0.15681287180632353, "learning_rate": 3.587363442436358e-06, "loss": 0.0157, "num_tokens": 11852296.0, "reward": 0.8724365234375, "reward_std": 0.013778559863567352, "rewards//mean": 0.8724365234375, "rewards//std": 0.03232599049806595, "step": 1817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3636, "grad_norm": 1.5533673763275146, "kl": 0.20095501095056534, "learning_rate": 3.5859345020269702e-06, "loss": 0.0201, "num_tokens": 11858760.0, "reward": 0.82867431640625, "reward_std": 0.01434343121945858, "rewards//mean": 0.82867431640625, "rewards//std": 0.02971544675529003, "step": 1818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3638, "grad_norm": 1.6392464637756348, "kl": 0.16612649988383055, "learning_rate": 3.584505124203295e-06, "loss": 0.0166, "num_tokens": 11865256.0, "reward": 0.8612060546875, "reward_std": 0.010885052382946014, "rewards//mean": 0.8612060546875, "rewards//std": 0.021832095459103584, "step": 1819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.364, "grad_norm": 1.2631516456604004, "kl": 0.18263808358460665, "learning_rate": 3.5830753095410857e-06, "loss": 0.0183, "num_tokens": 11871816.0, "reward": 0.8427734375, "reward_std": 0.010348541662096977, "rewards//mean": 0.8427734375, "rewards//std": 0.026493476703763008, "step": 1820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3642, "grad_norm": 1.1717936992645264, "kl": 0.16634648386389017, "learning_rate": 3.581645058616271e-06, "loss": 0.0166, "num_tokens": 11878264.0, "reward": 0.85009765625, "reward_std": 0.011172271333634853, "rewards//mean": 0.85009765625, "rewards//std": 0.022385666146874428, "step": 1821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3644, "grad_norm": 1.970152735710144, "kl": 0.19964067079126835, "learning_rate": 3.5802143720049565e-06, "loss": 0.02, "num_tokens": 11884760.0, "reward": 0.81768798828125, "reward_std": 0.009602159261703491, "rewards//mean": 0.81768798828125, "rewards//std": 0.02082234062254429, "step": 1822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3646, "grad_norm": 1.6004148721694946, "kl": 0.31915620900690556, "learning_rate": 3.5787832502834214e-06, "loss": 0.0319, "num_tokens": 11891288.0, "reward": 0.83563232421875, "reward_std": 0.01112605445086956, "rewards//mean": 0.83563232421875, "rewards//std": 0.03228915110230446, "step": 1823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3648, "grad_norm": 1.818578839302063, "kl": 0.2475033737719059, "learning_rate": 3.577351694028123e-06, "loss": 0.0248, "num_tokens": 11897752.0, "reward": 0.8475341796875, "reward_std": 0.010888276621699333, "rewards//mean": 0.8475341796875, "rewards//std": 0.021865351125597954, "step": 1824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.365, "grad_norm": 1.8619863986968994, "kl": 0.15754984132945538, "learning_rate": 3.57591970381569e-06, "loss": 0.0158, "num_tokens": 11904256.0, "reward": 0.82440185546875, "reward_std": 0.00855446606874466, "rewards//mean": 0.82440185546875, "rewards//std": 0.01580117829144001, "step": 1825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3652, "grad_norm": 1.438535213470459, "kl": 0.21449177525937557, "learning_rate": 3.5744872802229296e-06, "loss": 0.0214, "num_tokens": 11910824.0, "reward": 0.83526611328125, "reward_std": 0.011501898057758808, "rewards//mean": 0.83526611328125, "rewards//std": 0.037028878927230835, "step": 1826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3654, "grad_norm": 1.5154943466186523, "kl": 0.1983030242845416, "learning_rate": 3.573054423826821e-06, "loss": 0.0198, "num_tokens": 11917296.0, "reward": 0.859375, "reward_std": 0.01579335518181324, "rewards//mean": 0.859375, "rewards//std": 0.020002983510494232, "step": 1827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3656, "grad_norm": 1.6630445718765259, "kl": 0.21915276907384396, "learning_rate": 3.5716211352045194e-06, "loss": 0.0219, "num_tokens": 11923920.0, "reward": 0.85650634765625, "reward_std": 0.014565490186214447, "rewards//mean": 0.85650634765625, "rewards//std": 0.03734597936272621, "step": 1828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3658, "grad_norm": 1.972080945968628, "kl": 0.1890709325671196, "learning_rate": 3.5701874149333515e-06, "loss": 0.0189, "num_tokens": 11930384.0, "reward": 0.88043212890625, "reward_std": 0.01287904754281044, "rewards//mean": 0.88043212890625, "rewards//std": 0.02602880261838436, "step": 1829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.366, "grad_norm": 1.5100934505462646, "kl": 0.28246919997036457, "learning_rate": 3.5687532635908216e-06, "loss": 0.0282, "num_tokens": 11936912.0, "reward": 0.7618408203125, "reward_std": 0.010890690609812737, "rewards//mean": 0.7618408203125, "rewards//std": 0.020751429721713066, "step": 1830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3662, "grad_norm": 1.4070303440093994, "kl": 0.21846398245543242, "learning_rate": 3.5673186817546047e-06, "loss": 0.0218, "num_tokens": 11943400.0, "reward": 0.7908935546875, "reward_std": 0.013800546526908875, "rewards//mean": 0.7908935546875, "rewards//std": 0.018996508792042732, "step": 1831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3664, "grad_norm": 1.3824055194854736, "kl": 0.20048269908875227, "learning_rate": 3.565883670002551e-06, "loss": 0.02, "num_tokens": 11949912.0, "reward": 0.77813720703125, "reward_std": 0.009928274899721146, "rewards//mean": 0.77813720703125, "rewards//std": 0.030955485999584198, "step": 1832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3666, "grad_norm": 1.4420313835144043, "kl": 0.18756195716559887, "learning_rate": 3.564448228912682e-06, "loss": 0.0188, "num_tokens": 11956496.0, "reward": 0.85943603515625, "reward_std": 0.01079621259123087, "rewards//mean": 0.85943603515625, "rewards//std": 0.018446989357471466, "step": 1833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3668, "grad_norm": 1.4740324020385742, "kl": 0.19386128894984722, "learning_rate": 3.563012359063194e-06, "loss": 0.0194, "num_tokens": 11963088.0, "reward": 0.86541748046875, "reward_std": 0.009346428327262402, "rewards//mean": 0.86541748046875, "rewards//std": 0.01678958721458912, "step": 1834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.367, "grad_norm": 1.3641743659973145, "kl": 0.28146378323435783, "learning_rate": 3.5615760610324547e-06, "loss": 0.0281, "num_tokens": 11969536.0, "reward": 0.84002685546875, "reward_std": 0.020294351503252983, "rewards//mean": 0.84002685546875, "rewards//std": 0.033992890268564224, "step": 1835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3672, "grad_norm": 1.1049847602844238, "kl": 0.18979441840201616, "learning_rate": 3.560139335399005e-06, "loss": 0.019, "num_tokens": 11976032.0, "reward": 0.84173583984375, "reward_std": 0.009652558714151382, "rewards//mean": 0.84173583984375, "rewards//std": 0.028518659994006157, "step": 1836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3674, "grad_norm": 1.1105765104293823, "kl": 0.19871950428932905, "learning_rate": 3.558702182741558e-06, "loss": 0.0199, "num_tokens": 11982656.0, "reward": 0.84674072265625, "reward_std": 0.012555320747196674, "rewards//mean": 0.84674072265625, "rewards//std": 0.029649149626493454, "step": 1837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3676, "grad_norm": 1.1528319120407104, "kl": 0.16698293946683407, "learning_rate": 3.557264603638998e-06, "loss": 0.0167, "num_tokens": 11989200.0, "reward": 0.8636474609375, "reward_std": 0.009904362261295319, "rewards//mean": 0.8636474609375, "rewards//std": 0.019284386187791824, "step": 1838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3678, "grad_norm": 1.3622164726257324, "kl": 0.20566106867045164, "learning_rate": 3.555826598670382e-06, "loss": 0.0206, "num_tokens": 11995696.0, "reward": 0.80657958984375, "reward_std": 0.007618977688252926, "rewards//mean": 0.80657958984375, "rewards//std": 0.01789381541311741, "step": 1839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.368, "grad_norm": 1.7446454763412476, "kl": 0.19049219693988562, "learning_rate": 3.5543881684149382e-06, "loss": 0.019, "num_tokens": 12002128.0, "reward": 0.8590087890625, "reward_std": 0.015145896002650261, "rewards//mean": 0.8590087890625, "rewards//std": 0.02212686650454998, "step": 1840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3682, "grad_norm": 1.3005292415618896, "kl": 0.15110753010958433, "learning_rate": 3.552949313452067e-06, "loss": 0.0151, "num_tokens": 12008648.0, "reward": 0.8240966796875, "reward_std": 0.014386220835149288, "rewards//mean": 0.8240966796875, "rewards//std": 0.027000294998288155, "step": 1841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3684, "grad_norm": 1.496935248374939, "kl": 0.23303412273526192, "learning_rate": 3.5515100343613375e-06, "loss": 0.0233, "num_tokens": 12015232.0, "reward": 0.79278564453125, "reward_std": 0.011723525822162628, "rewards//mean": 0.79278564453125, "rewards//std": 0.027343127876520157, "step": 1842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3686, "grad_norm": 1.0947133302688599, "kl": 0.1684664161875844, "learning_rate": 3.5500703317224926e-06, "loss": 0.0168, "num_tokens": 12021792.0, "reward": 0.85870361328125, "reward_std": 0.00980221014469862, "rewards//mean": 0.85870361328125, "rewards//std": 0.03772226348519325, "step": 1843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3688, "grad_norm": 1.4166611433029175, "kl": 0.19942076969891787, "learning_rate": 3.5486302061154433e-06, "loss": 0.0199, "num_tokens": 12028312.0, "reward": 0.86810302734375, "reward_std": 0.015355045907199383, "rewards//mean": 0.86810302734375, "rewards//std": 0.03101460076868534, "step": 1844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.369, "grad_norm": 1.422276258468628, "kl": 0.13354660430923104, "learning_rate": 3.5471896581202724e-06, "loss": 0.0134, "num_tokens": 12034816.0, "reward": 0.82098388671875, "reward_std": 0.0076361484825611115, "rewards//mean": 0.82098388671875, "rewards//std": 0.019143657758831978, "step": 1845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3692, "grad_norm": 1.2726631164550781, "kl": 0.12937258929014206, "learning_rate": 3.5457486883172323e-06, "loss": 0.0129, "num_tokens": 12041368.0, "reward": 0.87255859375, "reward_std": 0.013409263454377651, "rewards//mean": 0.87255859375, "rewards//std": 0.02667115069925785, "step": 1846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3694, "grad_norm": 1.2186793088912964, "kl": 0.13429692247882485, "learning_rate": 3.544307297286746e-06, "loss": 0.0134, "num_tokens": 12047864.0, "reward": 0.82977294921875, "reward_std": 0.01812170445919037, "rewards//mean": 0.82977294921875, "rewards//std": 0.031132007017731667, "step": 1847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3696, "grad_norm": 0.9945281744003296, "kl": 0.14247377309948206, "learning_rate": 3.5428654856094047e-06, "loss": 0.0142, "num_tokens": 12054368.0, "reward": 0.823486328125, "reward_std": 0.01067198347300291, "rewards//mean": 0.823486328125, "rewards//std": 0.0316188782453537, "step": 1848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3698, "grad_norm": 1.1626938581466675, "kl": 0.12195639312267303, "learning_rate": 3.541423253865971e-06, "loss": 0.0122, "num_tokens": 12060888.0, "reward": 0.837890625, "reward_std": 0.012817876413464546, "rewards//mean": 0.837890625, "rewards//std": 0.018996907398104668, "step": 1849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.37, "grad_norm": 1.7529188394546509, "kl": 0.1643361458554864, "learning_rate": 3.5399806026373746e-06, "loss": 0.0164, "num_tokens": 12067448.0, "reward": 0.86236572265625, "reward_std": 0.013955667614936829, "rewards//mean": 0.86236572265625, "rewards//std": 0.023049263283610344, "step": 1850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3702, "grad_norm": 1.2025814056396484, "kl": 0.13457837281748652, "learning_rate": 3.5385375325047167e-06, "loss": 0.0135, "num_tokens": 12073952.0, "reward": 0.88702392578125, "reward_std": 0.010873900726437569, "rewards//mean": 0.88702392578125, "rewards//std": 0.027877138927578926, "step": 1851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3704, "grad_norm": 1.0852248668670654, "kl": 0.14479498751461506, "learning_rate": 3.537094044049264e-06, "loss": 0.0145, "num_tokens": 12080416.0, "reward": 0.81268310546875, "reward_std": 0.012398038059473038, "rewards//mean": 0.81268310546875, "rewards//std": 0.023441940546035767, "step": 1852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3706, "grad_norm": 1.7674623727798462, "kl": 0.15091689303517342, "learning_rate": 3.535650137852455e-06, "loss": 0.0151, "num_tokens": 12087040.0, "reward": 0.86883544921875, "reward_std": 0.018203843384981155, "rewards//mean": 0.86883544921875, "rewards//std": 0.029395336285233498, "step": 1853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3708, "grad_norm": 1.2897779941558838, "kl": 0.15425454452633858, "learning_rate": 3.5342058144958943e-06, "loss": 0.0154, "num_tokens": 12093656.0, "reward": 0.815673828125, "reward_std": 0.009151878766715527, "rewards//mean": 0.815673828125, "rewards//std": 0.02317640371620655, "step": 1854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.371, "grad_norm": 1.2268383502960205, "kl": 0.14002483198419213, "learning_rate": 3.532761074561355e-06, "loss": 0.014, "num_tokens": 12100152.0, "reward": 0.84063720703125, "reward_std": 0.011004693806171417, "rewards//mean": 0.84063720703125, "rewards//std": 0.022033199667930603, "step": 1855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3712, "grad_norm": 1.4838786125183105, "kl": 0.1412762808613479, "learning_rate": 3.531315918630778e-06, "loss": 0.0141, "num_tokens": 12106536.0, "reward": 0.84844970703125, "reward_std": 0.014354107901453972, "rewards//mean": 0.84844970703125, "rewards//std": 0.034352585673332214, "step": 1856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3714, "grad_norm": 1.4979288578033447, "kl": 0.17813465651124716, "learning_rate": 3.5298703472862725e-06, "loss": 0.0178, "num_tokens": 12113040.0, "reward": 0.8050537109375, "reward_std": 0.010361743159592152, "rewards//mean": 0.8050537109375, "rewards//std": 0.013200376182794571, "step": 1857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3716, "grad_norm": 2.225863456726074, "kl": 0.2578902021050453, "learning_rate": 3.528424361110115e-06, "loss": 0.0258, "num_tokens": 12119576.0, "reward": 0.8414306640625, "reward_std": 0.013399234041571617, "rewards//mean": 0.8414306640625, "rewards//std": 0.030700454488396645, "step": 1858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3718, "grad_norm": 1.0961664915084839, "kl": 0.15653812512755394, "learning_rate": 3.526977960684747e-06, "loss": 0.0157, "num_tokens": 12126040.0, "reward": 0.845947265625, "reward_std": 0.011344525031745434, "rewards//mean": 0.845947265625, "rewards//std": 0.01680321805179119, "step": 1859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.372, "grad_norm": 1.4447729587554932, "kl": 0.15414735116064548, "learning_rate": 3.52553114659278e-06, "loss": 0.0154, "num_tokens": 12132560.0, "reward": 0.8643798828125, "reward_std": 0.010230681858956814, "rewards//mean": 0.8643798828125, "rewards//std": 0.02213507518172264, "step": 1860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3722, "grad_norm": 1.2703537940979004, "kl": 0.1451016804203391, "learning_rate": 3.5240839194169885e-06, "loss": 0.0145, "num_tokens": 12139008.0, "reward": 0.8671875, "reward_std": 0.012119496241211891, "rewards//mean": 0.8671875, "rewards//std": 0.02123633772134781, "step": 1861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3724, "grad_norm": 1.2037148475646973, "kl": 0.1670379526913166, "learning_rate": 3.522636279740318e-06, "loss": 0.0167, "num_tokens": 12145480.0, "reward": 0.8551025390625, "reward_std": 0.009730571880936623, "rewards//mean": 0.8551025390625, "rewards//std": 0.017944185063242912, "step": 1862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3726, "grad_norm": 1.1831048727035522, "kl": 0.22351480834186077, "learning_rate": 3.521188228145876e-06, "loss": 0.0224, "num_tokens": 12152024.0, "reward": 0.828125, "reward_std": 0.008525419980287552, "rewards//mean": 0.828125, "rewards//std": 0.023714875802397728, "step": 1863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3728, "grad_norm": 2.2635245323181152, "kl": 0.1528142262250185, "learning_rate": 3.5197397652169375e-06, "loss": 0.0153, "num_tokens": 12158512.0, "reward": 0.83233642578125, "reward_std": 0.012617976404726505, "rewards//mean": 0.83233642578125, "rewards//std": 0.023728197440505028, "step": 1864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.373, "grad_norm": 1.2801164388656616, "kl": 0.1460969289764762, "learning_rate": 3.518290891536944e-06, "loss": 0.0146, "num_tokens": 12165008.0, "reward": 0.83837890625, "reward_std": 0.009285153821110725, "rewards//mean": 0.83837890625, "rewards//std": 0.013504772447049618, "step": 1865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3732, "grad_norm": 1.6569485664367676, "kl": 0.18008715752512217, "learning_rate": 3.516841607689501e-06, "loss": 0.018, "num_tokens": 12171640.0, "reward": 0.87310791015625, "reward_std": 0.0142369968816638, "rewards//mean": 0.87310791015625, "rewards//std": 0.022541919723153114, "step": 1866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3734, "grad_norm": 1.1149349212646484, "kl": 0.15350383054465055, "learning_rate": 3.5153919142583797e-06, "loss": 0.0154, "num_tokens": 12178224.0, "reward": 0.8638916015625, "reward_std": 0.013952887617051601, "rewards//mean": 0.8638916015625, "rewards//std": 0.02604370191693306, "step": 1867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3736, "grad_norm": 1.2068285942077637, "kl": 0.1637724945321679, "learning_rate": 3.5139418118275174e-06, "loss": 0.0164, "num_tokens": 12184664.0, "reward": 0.86114501953125, "reward_std": 0.015023384243249893, "rewards//mean": 0.86114501953125, "rewards//std": 0.03377487510442734, "step": 1868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3738, "grad_norm": 1.1385362148284912, "kl": 0.13408851064741611, "learning_rate": 3.5124913009810137e-06, "loss": 0.0134, "num_tokens": 12191232.0, "reward": 0.8447265625, "reward_std": 0.011140041053295135, "rewards//mean": 0.8447265625, "rewards//std": 0.023735294118523598, "step": 1869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.374, "grad_norm": 1.2288206815719604, "kl": 0.2179594156332314, "learning_rate": 3.511040382303136e-06, "loss": 0.0218, "num_tokens": 12197680.0, "reward": 0.81658935546875, "reward_std": 0.017253538593649864, "rewards//mean": 0.81658935546875, "rewards//std": 0.024334179237484932, "step": 1870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3742, "grad_norm": 1.9958826303482056, "kl": 0.12442289479076862, "learning_rate": 3.5095890563783124e-06, "loss": 0.0124, "num_tokens": 12204232.0, "reward": 0.83721923828125, "reward_std": 0.012860724702477455, "rewards//mean": 0.83721923828125, "rewards//std": 0.021271146833896637, "step": 1871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3744, "grad_norm": 1.2777856588363647, "kl": 0.1404807111248374, "learning_rate": 3.508137323791138e-06, "loss": 0.014, "num_tokens": 12210792.0, "reward": 0.83367919921875, "reward_std": 0.014106115326285362, "rewards//mean": 0.83367919921875, "rewards//std": 0.024859756231307983, "step": 1872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3746, "grad_norm": 1.023818016052246, "kl": 0.13428121525794268, "learning_rate": 3.50668518512637e-06, "loss": 0.0134, "num_tokens": 12217200.0, "reward": 0.80511474609375, "reward_std": 0.008685903623700142, "rewards//mean": 0.80511474609375, "rewards//std": 0.023147566244006157, "step": 1873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3748, "grad_norm": 1.1434919834136963, "kl": 0.14941918291151524, "learning_rate": 3.5052326409689296e-06, "loss": 0.0149, "num_tokens": 12223656.0, "reward": 0.8685302734375, "reward_std": 0.011002862825989723, "rewards//mean": 0.8685302734375, "rewards//std": 0.015782605856657028, "step": 1874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.375, "grad_norm": 1.617285966873169, "kl": 0.14714120887219906, "learning_rate": 3.503779691903902e-06, "loss": 0.0147, "num_tokens": 12230216.0, "reward": 0.85919189453125, "reward_std": 0.013259627856314182, "rewards//mean": 0.85919189453125, "rewards//std": 0.03544394299387932, "step": 1875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3752, "grad_norm": 1.7108086347579956, "kl": 0.110241558868438, "learning_rate": 3.5023263385165346e-06, "loss": 0.011, "num_tokens": 12236712.0, "reward": 0.86474609375, "reward_std": 0.008531774394214153, "rewards//mean": 0.86474609375, "rewards//std": 0.023297574371099472, "step": 1876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3754, "grad_norm": 1.6628937721252441, "kl": 0.11157500930130482, "learning_rate": 3.5008725813922383e-06, "loss": 0.0112, "num_tokens": 12243176.0, "reward": 0.85400390625, "reward_std": 0.008369793184101582, "rewards//mean": 0.85400390625, "rewards//std": 0.020093591883778572, "step": 1877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3756, "grad_norm": 0.9294976592063904, "kl": 0.12484822236001492, "learning_rate": 3.499418421116585e-06, "loss": 0.0125, "num_tokens": 12249728.0, "reward": 0.85491943359375, "reward_std": 0.011745519004762173, "rewards//mean": 0.85491943359375, "rewards//std": 0.022324638441205025, "step": 1878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3758, "grad_norm": 1.2579690217971802, "kl": 0.14521337486803532, "learning_rate": 3.4979638582753117e-06, "loss": 0.0145, "num_tokens": 12256248.0, "reward": 0.84521484375, "reward_std": 0.01485304906964302, "rewards//mean": 0.84521484375, "rewards//std": 0.027546724304556847, "step": 1879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.376, "grad_norm": 1.1864783763885498, "kl": 0.12853500340133905, "learning_rate": 3.4965088934543153e-06, "loss": 0.0129, "num_tokens": 12262768.0, "reward": 0.8741455078125, "reward_std": 0.012940873391926289, "rewards//mean": 0.8741455078125, "rewards//std": 0.030397193506360054, "step": 1880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3762, "grad_norm": 1.25899338722229, "kl": 0.2011901279911399, "learning_rate": 3.4950535272396564e-06, "loss": 0.0201, "num_tokens": 12269304.0, "reward": 0.82745361328125, "reward_std": 0.010217287577688694, "rewards//mean": 0.82745361328125, "rewards//std": 0.024175656959414482, "step": 1881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3764, "grad_norm": 1.3644109964370728, "kl": 0.15865014912560582, "learning_rate": 3.4935977602175547e-06, "loss": 0.0159, "num_tokens": 12275856.0, "reward": 0.846923828125, "reward_std": 0.015185525640845299, "rewards//mean": 0.846923828125, "rewards//std": 0.030614785850048065, "step": 1882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3766, "grad_norm": 1.038100004196167, "kl": 0.12509902194142342, "learning_rate": 3.492141592974395e-06, "loss": 0.0125, "num_tokens": 12282376.0, "reward": 0.87103271484375, "reward_std": 0.014049486257135868, "rewards//mean": 0.87103271484375, "rewards//std": 0.024503417313098907, "step": 1883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3768, "grad_norm": 1.1797133684158325, "kl": 0.14367573335766792, "learning_rate": 3.4906850260967197e-06, "loss": 0.0144, "num_tokens": 12289096.0, "reward": 0.80816650390625, "reward_std": 0.009251104667782784, "rewards//mean": 0.80816650390625, "rewards//std": 0.023603467270731926, "step": 1884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.377, "grad_norm": 1.0537770986557007, "kl": 0.1153098102658987, "learning_rate": 3.4892280601712346e-06, "loss": 0.0115, "num_tokens": 12295688.0, "reward": 0.835205078125, "reward_std": 0.008667953312397003, "rewards//mean": 0.835205078125, "rewards//std": 0.016081426292657852, "step": 1885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3772, "grad_norm": 1.3601089715957642, "kl": 0.14573818165808916, "learning_rate": 3.4877706957848052e-06, "loss": 0.0146, "num_tokens": 12302152.0, "reward": 0.86114501953125, "reward_std": 0.012371938675642014, "rewards//mean": 0.86114501953125, "rewards//std": 0.030694229528307915, "step": 1886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3774, "grad_norm": 1.2649428844451904, "kl": 0.18420946691185236, "learning_rate": 3.486312933524457e-06, "loss": 0.0184, "num_tokens": 12308608.0, "reward": 0.8455810546875, "reward_std": 0.009151825681328773, "rewards//mean": 0.8455810546875, "rewards//std": 0.019868964329361916, "step": 1887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3776, "grad_norm": 1.3410557508468628, "kl": 0.14798563532531261, "learning_rate": 3.4848547739773782e-06, "loss": 0.0148, "num_tokens": 12315096.0, "reward": 0.8472900390625, "reward_std": 0.010493988171219826, "rewards//mean": 0.8472900390625, "rewards//std": 0.03204379230737686, "step": 1888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3778, "grad_norm": 1.1807827949523926, "kl": 0.17069992050528526, "learning_rate": 3.4833962177309137e-06, "loss": 0.0171, "num_tokens": 12321616.0, "reward": 0.75628662109375, "reward_std": 0.012952504679560661, "rewards//mean": 0.75628662109375, "rewards//std": 0.023495476692914963, "step": 1889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.378, "grad_norm": 1.392043948173523, "kl": 0.11674785520881414, "learning_rate": 3.4819372653725704e-06, "loss": 0.0117, "num_tokens": 12328064.0, "reward": 0.84515380859375, "reward_std": 0.0132078155875206, "rewards//mean": 0.84515380859375, "rewards//std": 0.01575416512787342, "step": 1890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3782, "grad_norm": 1.0803405046463013, "kl": 0.15719668846577406, "learning_rate": 3.480477917490014e-06, "loss": 0.0157, "num_tokens": 12334632.0, "reward": 0.83465576171875, "reward_std": 0.014684981666505337, "rewards//mean": 0.83465576171875, "rewards//std": 0.030915362760424614, "step": 1891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3784, "grad_norm": 1.7734837532043457, "kl": 0.25403459183871746, "learning_rate": 3.47901817467107e-06, "loss": 0.0254, "num_tokens": 12341072.0, "reward": 0.85028076171875, "reward_std": 0.011008698493242264, "rewards//mean": 0.85028076171875, "rewards//std": 0.02481343410909176, "step": 1892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3786, "grad_norm": 1.1913044452667236, "kl": 0.14972811471670866, "learning_rate": 3.4775580375037217e-06, "loss": 0.015, "num_tokens": 12347600.0, "reward": 0.862060546875, "reward_std": 0.016076069325208664, "rewards//mean": 0.862060546875, "rewards//std": 0.02488965168595314, "step": 1893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3788, "grad_norm": 1.6779652833938599, "kl": 0.16856642346829176, "learning_rate": 3.4760975065761134e-06, "loss": 0.0169, "num_tokens": 12354056.0, "reward": 0.88104248046875, "reward_std": 0.01040204893797636, "rewards//mean": 0.88104248046875, "rewards//std": 0.02278704009950161, "step": 1894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.379, "grad_norm": 1.5841668844223022, "kl": 0.17384642641991377, "learning_rate": 3.4746365824765455e-06, "loss": 0.0174, "num_tokens": 12360568.0, "reward": 0.82257080078125, "reward_std": 0.010689996182918549, "rewards//mean": 0.82257080078125, "rewards//std": 0.021050816401839256, "step": 1895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3792, "grad_norm": 1.221721887588501, "kl": 0.14468539599329233, "learning_rate": 3.4731752657934793e-06, "loss": 0.0145, "num_tokens": 12367088.0, "reward": 0.8094482421875, "reward_std": 0.011023780331015587, "rewards//mean": 0.8094482421875, "rewards//std": 0.027299148961901665, "step": 1896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3794, "grad_norm": 1.7015875577926636, "kl": 0.17052426654845476, "learning_rate": 3.471713557115532e-06, "loss": 0.0171, "num_tokens": 12373584.0, "reward": 0.8048095703125, "reward_std": 0.010979904793202877, "rewards//mean": 0.8048095703125, "rewards//std": 0.017175218090415, "step": 1897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3796, "grad_norm": 1.2400166988372803, "kl": 0.1724272733554244, "learning_rate": 3.4702514570314804e-06, "loss": 0.0172, "num_tokens": 12380112.0, "reward": 0.816650390625, "reward_std": 0.015190564095973969, "rewards//mean": 0.816650390625, "rewards//std": 0.026161137968301773, "step": 1898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3798, "grad_norm": 2.1519312858581543, "kl": 0.20773764606565237, "learning_rate": 3.4687889661302577e-06, "loss": 0.0208, "num_tokens": 12386728.0, "reward": 0.830078125, "reward_std": 0.014840161427855492, "rewards//mean": 0.830078125, "rewards//std": 0.023958738893270493, "step": 1899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.38, "grad_norm": 1.1740214824676514, "kl": 0.1385835986584425, "learning_rate": 3.4673260850009553e-06, "loss": 0.0139, "num_tokens": 12393208.0, "reward": 0.8221435546875, "reward_std": 0.0101883914321661, "rewards//mean": 0.8221435546875, "rewards//std": 0.0312342531979084, "step": 1900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3802, "grad_norm": 1.2194256782531738, "kl": 0.18403546512126923, "learning_rate": 3.4658628142328215e-06, "loss": 0.0184, "num_tokens": 12399720.0, "reward": 0.8287353515625, "reward_std": 0.012656591832637787, "rewards//mean": 0.8287353515625, "rewards//std": 0.02380884252488613, "step": 1901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3804, "grad_norm": 1.3995733261108398, "kl": 0.11859565880149603, "learning_rate": 3.464399154415262e-06, "loss": 0.0119, "num_tokens": 12406224.0, "reward": 0.8394775390625, "reward_std": 0.00825486145913601, "rewards//mean": 0.8394775390625, "rewards//std": 0.019706785678863525, "step": 1902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3806, "grad_norm": 1.0950136184692383, "kl": 0.13619355112314224, "learning_rate": 3.462935106137838e-06, "loss": 0.0136, "num_tokens": 12412720.0, "reward": 0.81146240234375, "reward_std": 0.013532737269997597, "rewards//mean": 0.81146240234375, "rewards//std": 0.02171277068555355, "step": 1903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3808, "grad_norm": 1.2996381521224976, "kl": 0.14935880061239004, "learning_rate": 3.461470669990269e-06, "loss": 0.0149, "num_tokens": 12419224.0, "reward": 0.8828125, "reward_std": 0.011350594460964203, "rewards//mean": 0.8828125, "rewards//std": 0.02051703818142414, "step": 1904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.381, "grad_norm": 1.342714786529541, "kl": 0.10617937101051211, "learning_rate": 3.4600058465624288e-06, "loss": 0.0106, "num_tokens": 12425792.0, "reward": 0.82733154296875, "reward_std": 0.012024933472275734, "rewards//mean": 0.82733154296875, "rewards//std": 0.03704604133963585, "step": 1905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3812, "grad_norm": 1.8753525018692017, "kl": 0.1565959556028247, "learning_rate": 3.4585406364443484e-06, "loss": 0.0157, "num_tokens": 12432312.0, "reward": 0.83209228515625, "reward_std": 0.014785107225179672, "rewards//mean": 0.83209228515625, "rewards//std": 0.039090346544981, "step": 1906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3814, "grad_norm": 1.4190987348556519, "kl": 0.16530808340758085, "learning_rate": 3.457075040226214e-06, "loss": 0.0165, "num_tokens": 12438832.0, "reward": 0.86383056640625, "reward_std": 0.01243327185511589, "rewards//mean": 0.86383056640625, "rewards//std": 0.024507123976945877, "step": 1907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3816, "grad_norm": 1.409058928489685, "kl": 0.21940014604479074, "learning_rate": 3.455609058498369e-06, "loss": 0.0219, "num_tokens": 12445336.0, "reward": 0.8741455078125, "reward_std": 0.015806537121534348, "rewards//mean": 0.8741455078125, "rewards//std": 0.023983681574463844, "step": 1908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3818, "grad_norm": 1.7757790088653564, "kl": 0.18474542908370495, "learning_rate": 3.4541426918513084e-06, "loss": 0.0185, "num_tokens": 12451848.0, "reward": 0.86431884765625, "reward_std": 0.013601141050457954, "rewards//mean": 0.86431884765625, "rewards//std": 0.03212323412299156, "step": 1909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.382, "grad_norm": 1.4408786296844482, "kl": 0.1952875154092908, "learning_rate": 3.452675940875686e-06, "loss": 0.0195, "num_tokens": 12458312.0, "reward": 0.84722900390625, "reward_std": 0.018021192401647568, "rewards//mean": 0.84722900390625, "rewards//std": 0.03746819123625755, "step": 1910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3822, "grad_norm": 1.2277010679244995, "kl": 0.16415668092668056, "learning_rate": 3.4512088061623077e-06, "loss": 0.0164, "num_tokens": 12464816.0, "reward": 0.852294921875, "reward_std": 0.017646048218011856, "rewards//mean": 0.852294921875, "rewards//std": 0.03109363093972206, "step": 1911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3824, "grad_norm": 1.7312166690826416, "kl": 0.2223032657057047, "learning_rate": 3.4497412883021375e-06, "loss": 0.0222, "num_tokens": 12471400.0, "reward": 0.864013671875, "reward_std": 0.011610273271799088, "rewards//mean": 0.864013671875, "rewards//std": 0.024517083540558815, "step": 1912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3826, "grad_norm": 1.274600625038147, "kl": 0.18283769767731428, "learning_rate": 3.4482733878862885e-06, "loss": 0.0183, "num_tokens": 12477872.0, "reward": 0.8079833984375, "reward_std": 0.008838141337037086, "rewards//mean": 0.8079833984375, "rewards//std": 0.018138844519853592, "step": 1913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3828, "grad_norm": 1.8426188230514526, "kl": 0.22701702918857336, "learning_rate": 3.4468051055060335e-06, "loss": 0.0227, "num_tokens": 12484400.0, "reward": 0.774658203125, "reward_std": 0.010469979606568813, "rewards//mean": 0.774658203125, "rewards//std": 0.019058961421251297, "step": 1914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.383, "grad_norm": 1.7794902324676514, "kl": 0.20635292399674654, "learning_rate": 3.4453364417527944e-06, "loss": 0.0206, "num_tokens": 12490968.0, "reward": 0.8402099609375, "reward_std": 0.013707267120480537, "rewards//mean": 0.8402099609375, "rewards//std": 0.019471868872642517, "step": 1915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3832, "grad_norm": 1.0929782390594482, "kl": 0.15116431657224894, "learning_rate": 3.4438673972181503e-06, "loss": 0.0151, "num_tokens": 12497480.0, "reward": 0.79510498046875, "reward_std": 0.011386433616280556, "rewards//mean": 0.79510498046875, "rewards//std": 0.017329098656773567, "step": 1916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3834, "grad_norm": 1.200981616973877, "kl": 0.24124240968376398, "learning_rate": 3.4423979724938305e-06, "loss": 0.0241, "num_tokens": 12504040.0, "reward": 0.799560546875, "reward_std": 0.010009681805968285, "rewards//mean": 0.799560546875, "rewards//std": 0.036103684455156326, "step": 1917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3836, "grad_norm": 1.1968566179275513, "kl": 0.24149654246866703, "learning_rate": 3.440928168171721e-06, "loss": 0.0241, "num_tokens": 12510488.0, "reward": 0.85321044921875, "reward_std": 0.012754389084875584, "rewards//mean": 0.85321044921875, "rewards//std": 0.02321743406355381, "step": 1918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3838, "grad_norm": 1.602799892425537, "kl": 0.20601962134242058, "learning_rate": 3.4394579848438573e-06, "loss": 0.0206, "num_tokens": 12517064.0, "reward": 0.8431396484375, "reward_std": 0.013294117525219917, "rewards//mean": 0.8431396484375, "rewards//std": 0.03029143437743187, "step": 1919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.384, "grad_norm": 1.6847951412200928, "kl": 0.2000315533950925, "learning_rate": 3.4379874231024297e-06, "loss": 0.02, "num_tokens": 12523704.0, "reward": 0.85711669921875, "reward_std": 0.013041604310274124, "rewards//mean": 0.85711669921875, "rewards//std": 0.01667196676135063, "step": 1920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3842, "grad_norm": 2.2290494441986084, "kl": 0.2846827022731304, "learning_rate": 3.436516483539781e-06, "loss": 0.0285, "num_tokens": 12530160.0, "reward": 0.83953857421875, "reward_std": 0.014225217513740063, "rewards//mean": 0.83953857421875, "rewards//std": 0.02396181784570217, "step": 1921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3844, "grad_norm": 1.3658255338668823, "kl": 0.24594670347869396, "learning_rate": 3.4350451667484035e-06, "loss": 0.0246, "num_tokens": 12536752.0, "reward": 0.83502197265625, "reward_std": 0.009193542413413525, "rewards//mean": 0.83502197265625, "rewards//std": 0.011536323465406895, "step": 1922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3846, "grad_norm": 2.1321451663970947, "kl": 0.17843194212764502, "learning_rate": 3.4335734733209457e-06, "loss": 0.0178, "num_tokens": 12543344.0, "reward": 0.78839111328125, "reward_std": 0.009488465264439583, "rewards//mean": 0.78839111328125, "rewards//std": 0.015049462206661701, "step": 1923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3848, "grad_norm": 1.362433671951294, "kl": 0.2269473373889923, "learning_rate": 3.4321014038502036e-06, "loss": 0.0227, "num_tokens": 12549848.0, "reward": 0.85162353515625, "reward_std": 0.01840696670114994, "rewards//mean": 0.85162353515625, "rewards//std": 0.04488205164670944, "step": 1924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.385, "grad_norm": 2.1588237285614014, "kl": 0.22999593149870634, "learning_rate": 3.4306289589291287e-06, "loss": 0.023, "num_tokens": 12556264.0, "reward": 0.83843994140625, "reward_std": 0.010497717186808586, "rewards//mean": 0.83843994140625, "rewards//std": 0.02343096025288105, "step": 1925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3852, "grad_norm": 1.451374888420105, "kl": 0.1861251052469015, "learning_rate": 3.429156139150819e-06, "loss": 0.0186, "num_tokens": 12562768.0, "reward": 0.847412109375, "reward_std": 0.010170169174671173, "rewards//mean": 0.847412109375, "rewards//std": 0.017973264679312706, "step": 1926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3854, "grad_norm": 1.853372573852539, "kl": 0.21883191727101803, "learning_rate": 3.4276829451085287e-06, "loss": 0.0219, "num_tokens": 12569272.0, "reward": 0.8270263671875, "reward_std": 0.010472087189555168, "rewards//mean": 0.8270263671875, "rewards//std": 0.014739097096025944, "step": 1927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3856, "grad_norm": 2.0180160999298096, "kl": 0.3084346204996109, "learning_rate": 3.4262093773956583e-06, "loss": 0.0308, "num_tokens": 12575848.0, "reward": 0.7763671875, "reward_std": 0.014981066808104515, "rewards//mean": 0.7763671875, "rewards//std": 0.023499423637986183, "step": 1928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3858, "grad_norm": 2.0500059127807617, "kl": 0.28097373247146606, "learning_rate": 3.4247354366057618e-06, "loss": 0.0281, "num_tokens": 12582400.0, "reward": 0.85626220703125, "reward_std": 0.0148585494607687, "rewards//mean": 0.85626220703125, "rewards//std": 0.022789698094129562, "step": 1929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.386, "grad_norm": 2.189487934112549, "kl": 0.2600579373538494, "learning_rate": 3.4232611233325418e-06, "loss": 0.026, "num_tokens": 12588880.0, "reward": 0.82550048828125, "reward_std": 0.012923184782266617, "rewards//mean": 0.82550048828125, "rewards//std": 0.030033156275749207, "step": 1930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3862, "grad_norm": 2.6917362213134766, "kl": 0.2806738205254078, "learning_rate": 3.4217864381698523e-06, "loss": 0.0281, "num_tokens": 12595368.0, "reward": 0.855712890625, "reward_std": 0.0123723354190588, "rewards//mean": 0.855712890625, "rewards//std": 0.022754548117518425, "step": 1931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3864, "grad_norm": 1.5503488779067993, "kl": 0.25199113320559263, "learning_rate": 3.4203113817116955e-06, "loss": 0.0252, "num_tokens": 12601848.0, "reward": 0.821044921875, "reward_std": 0.0136185921728611, "rewards//mean": 0.821044921875, "rewards//std": 0.020158275961875916, "step": 1932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3866, "grad_norm": 1.4460183382034302, "kl": 0.3144112806767225, "learning_rate": 3.4188359545522235e-06, "loss": 0.0314, "num_tokens": 12608400.0, "reward": 0.8726806640625, "reward_std": 0.011937587521970272, "rewards//mean": 0.8726806640625, "rewards//std": 0.029534369707107544, "step": 1933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3868, "grad_norm": 2.37680721282959, "kl": 0.21671432629227638, "learning_rate": 3.41736015728574e-06, "loss": 0.0217, "num_tokens": 12615000.0, "reward": 0.85540771484375, "reward_std": 0.01025642640888691, "rewards//mean": 0.85540771484375, "rewards//std": 0.015523795038461685, "step": 1934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.387, "grad_norm": 1.9547513723373413, "kl": 0.3038980048149824, "learning_rate": 3.415883990506694e-06, "loss": 0.0304, "num_tokens": 12621584.0, "reward": 0.81048583984375, "reward_std": 0.00796540267765522, "rewards//mean": 0.81048583984375, "rewards//std": 0.02837391570210457, "step": 1935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3872, "grad_norm": 1.4822955131530762, "kl": 0.22928674891591072, "learning_rate": 3.414407454809687e-06, "loss": 0.0229, "num_tokens": 12628128.0, "reward": 0.8531494140625, "reward_std": 0.010823153890669346, "rewards//mean": 0.8531494140625, "rewards//std": 0.02665496990084648, "step": 1936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3874, "grad_norm": 1.4620147943496704, "kl": 0.2864753436297178, "learning_rate": 3.4129305507894657e-06, "loss": 0.0286, "num_tokens": 12634672.0, "reward": 0.8603515625, "reward_std": 0.013154091313481331, "rewards//mean": 0.8603515625, "rewards//std": 0.02027956396341324, "step": 1937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3876, "grad_norm": 1.3887264728546143, "kl": 0.336206279695034, "learning_rate": 3.411453279040928e-06, "loss": 0.0336, "num_tokens": 12641240.0, "reward": 0.83380126953125, "reward_std": 0.012275397777557373, "rewards//mean": 0.83380126953125, "rewards//std": 0.016291730105876923, "step": 1938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3878, "grad_norm": 2.0351128578186035, "kl": 0.3460333039984107, "learning_rate": 3.4099756401591182e-06, "loss": 0.0346, "num_tokens": 12647744.0, "reward": 0.8282470703125, "reward_std": 0.012835802510380745, "rewards//mean": 0.8282470703125, "rewards//std": 0.01775762438774109, "step": 1939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.388, "grad_norm": 1.5595088005065918, "kl": 0.2505854363553226, "learning_rate": 3.40849763473923e-06, "loss": 0.0251, "num_tokens": 12654312.0, "reward": 0.86199951171875, "reward_std": 0.016162022948265076, "rewards//mean": 0.86199951171875, "rewards//std": 0.031879622489213943, "step": 1940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3882, "grad_norm": 1.711012840270996, "kl": 0.3680433165282011, "learning_rate": 3.4070192633766025e-06, "loss": 0.0368, "num_tokens": 12660872.0, "reward": 0.8690185546875, "reward_std": 0.015204604715108871, "rewards//mean": 0.8690185546875, "rewards//std": 0.033262114971876144, "step": 1941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3884, "grad_norm": 1.8827543258666992, "kl": 0.3054479844868183, "learning_rate": 3.405540526666725e-06, "loss": 0.0305, "num_tokens": 12667376.0, "reward": 0.8314208984375, "reward_std": 0.011353651992976665, "rewards//mean": 0.8314208984375, "rewards//std": 0.022490572184324265, "step": 1942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3886, "grad_norm": 1.3246738910675049, "kl": 0.14262975566089153, "learning_rate": 3.4040614252052305e-06, "loss": 0.0143, "num_tokens": 12673872.0, "reward": 0.766357421875, "reward_std": 0.00805084127932787, "rewards//mean": 0.766357421875, "rewards//std": 0.01395244337618351, "step": 1943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3888, "grad_norm": 2.293337821960449, "kl": 0.20539615117013454, "learning_rate": 3.4025819595879033e-06, "loss": 0.0205, "num_tokens": 12680368.0, "reward": 0.802978515625, "reward_std": 0.010548784397542477, "rewards//mean": 0.802978515625, "rewards//std": 0.0229453444480896, "step": 1944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.389, "grad_norm": 1.505679965019226, "kl": 0.2883669575676322, "learning_rate": 3.40110213041067e-06, "loss": 0.0288, "num_tokens": 12686952.0, "reward": 0.83135986328125, "reward_std": 0.01155521534383297, "rewards//mean": 0.83135986328125, "rewards//std": 0.021871840581297874, "step": 1945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3892, "grad_norm": 2.5515248775482178, "kl": 0.42939623445272446, "learning_rate": 3.3996219382696066e-06, "loss": 0.0429, "num_tokens": 12693456.0, "reward": 0.861083984375, "reward_std": 0.011626130901277065, "rewards//mean": 0.861083984375, "rewards//std": 0.025863181799650192, "step": 1946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3894, "grad_norm": 1.6958684921264648, "kl": 0.35294659808278084, "learning_rate": 3.3981413837609346e-06, "loss": 0.0353, "num_tokens": 12699960.0, "reward": 0.83740234375, "reward_std": 0.014239251613616943, "rewards//mean": 0.83740234375, "rewards//std": 0.018525702878832817, "step": 1947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3896, "grad_norm": 1.3509844541549683, "kl": 0.3307188879698515, "learning_rate": 3.3966604674810193e-06, "loss": 0.0331, "num_tokens": 12706480.0, "reward": 0.86419677734375, "reward_std": 0.012420088052749634, "rewards//mean": 0.86419677734375, "rewards//std": 0.02024061791598797, "step": 1948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3898, "grad_norm": 1.6209245920181274, "kl": 0.31537160743027925, "learning_rate": 3.395179190026376e-06, "loss": 0.0315, "num_tokens": 12712992.0, "reward": 0.8546142578125, "reward_std": 0.01275199931114912, "rewards//mean": 0.8546142578125, "rewards//std": 0.022632813081145287, "step": 1949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.39, "grad_norm": 1.516732096672058, "kl": 0.35833659768104553, "learning_rate": 3.3936975519936615e-06, "loss": 0.0358, "num_tokens": 12719504.0, "reward": 0.8416748046875, "reward_std": 0.010418053716421127, "rewards//mean": 0.8416748046875, "rewards//std": 0.01995411328971386, "step": 1950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3902, "grad_norm": 1.561761736869812, "kl": 0.1813245676457882, "learning_rate": 3.39221555397968e-06, "loss": 0.0181, "num_tokens": 12726056.0, "reward": 0.8702392578125, "reward_std": 0.013150820508599281, "rewards//mean": 0.8702392578125, "rewards//std": 0.02973664551973343, "step": 1951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3904, "grad_norm": 1.4619077444076538, "kl": 0.25907554663717747, "learning_rate": 3.3907331965813807e-06, "loss": 0.0259, "num_tokens": 12732488.0, "reward": 0.838134765625, "reward_std": 0.012800414115190506, "rewards//mean": 0.838134765625, "rewards//std": 0.028492173179984093, "step": 1952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3906, "grad_norm": 2.099212646484375, "kl": 0.23495186027139425, "learning_rate": 3.3892504803958547e-06, "loss": 0.0235, "num_tokens": 12739016.0, "reward": 0.887451171875, "reward_std": 0.015376945957541466, "rewards//mean": 0.887451171875, "rewards//std": 0.02527589723467827, "step": 1953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3908, "grad_norm": 1.7655950784683228, "kl": 0.26156890019774437, "learning_rate": 3.387767406020343e-06, "loss": 0.0262, "num_tokens": 12745592.0, "reward": 0.782470703125, "reward_std": 0.006867438089102507, "rewards//mean": 0.782470703125, "rewards//std": 0.01665845327079296, "step": 1954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.391, "grad_norm": 1.4790065288543701, "kl": 0.20613016840070486, "learning_rate": 3.386283974052226e-06, "loss": 0.0206, "num_tokens": 12752048.0, "reward": 0.8759765625, "reward_std": 0.012111521326005459, "rewards//mean": 0.8759765625, "rewards//std": 0.03280777856707573, "step": 1955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3912, "grad_norm": 1.1200881004333496, "kl": 0.13084584847092628, "learning_rate": 3.38480018508903e-06, "loss": 0.0131, "num_tokens": 12758600.0, "reward": 0.86456298828125, "reward_std": 0.011789888143539429, "rewards//mean": 0.86456298828125, "rewards//std": 0.02483294904232025, "step": 1956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3914, "grad_norm": 1.3309682607650757, "kl": 0.2469489872455597, "learning_rate": 3.383316039728426e-06, "loss": 0.0247, "num_tokens": 12765144.0, "reward": 0.84112548828125, "reward_std": 0.014004849828779697, "rewards//mean": 0.84112548828125, "rewards//std": 0.031242674216628075, "step": 1957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3916, "grad_norm": 1.9101786613464355, "kl": 0.17485249042510986, "learning_rate": 3.3818315385682255e-06, "loss": 0.0175, "num_tokens": 12771608.0, "reward": 0.8515625, "reward_std": 0.009614004753530025, "rewards//mean": 0.8515625, "rewards//std": 0.02896534651517868, "step": 1958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3918, "grad_norm": 1.5089843273162842, "kl": 0.26226514764130116, "learning_rate": 3.380346682206388e-06, "loss": 0.0262, "num_tokens": 12778144.0, "reward": 0.8685302734375, "reward_std": 0.014687031507492065, "rewards//mean": 0.8685302734375, "rewards//std": 0.03327121585607529, "step": 1959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.392, "grad_norm": 1.6584827899932861, "kl": 0.1928934883326292, "learning_rate": 3.378861471241011e-06, "loss": 0.0193, "num_tokens": 12784728.0, "reward": 0.81329345703125, "reward_std": 0.010870056226849556, "rewards//mean": 0.81329345703125, "rewards//std": 0.016055872663855553, "step": 1960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3922, "grad_norm": 1.572653889656067, "kl": 0.24441533535718918, "learning_rate": 3.37737590627034e-06, "loss": 0.0244, "num_tokens": 12791272.0, "reward": 0.83074951171875, "reward_std": 0.014522993005812168, "rewards//mean": 0.83074951171875, "rewards//std": 0.020764101296663284, "step": 1961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3924, "grad_norm": 1.2992830276489258, "kl": 0.28006161097437143, "learning_rate": 3.3758899878927574e-06, "loss": 0.028, "num_tokens": 12797792.0, "reward": 0.859619140625, "reward_std": 0.01354979071766138, "rewards//mean": 0.859619140625, "rewards//std": 0.03559018298983574, "step": 1962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3926, "grad_norm": 1.2050861120224, "kl": 0.19530109502375126, "learning_rate": 3.3744037167067933e-06, "loss": 0.0195, "num_tokens": 12804288.0, "reward": 0.86431884765625, "reward_std": 0.015202890150249004, "rewards//mean": 0.86431884765625, "rewards//std": 0.02990536577999592, "step": 1963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3928, "grad_norm": 1.384436011314392, "kl": 0.19659986533224583, "learning_rate": 3.372917093311116e-06, "loss": 0.0197, "num_tokens": 12810992.0, "reward": 0.83294677734375, "reward_std": 0.014496484771370888, "rewards//mean": 0.83294677734375, "rewards//std": 0.03886197507381439, "step": 1964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.393, "grad_norm": 1.9128657579421997, "kl": 0.1770204994827509, "learning_rate": 3.3714301183045382e-06, "loss": 0.0177, "num_tokens": 12817536.0, "reward": 0.85687255859375, "reward_std": 0.013299301266670227, "rewards//mean": 0.85687255859375, "rewards//std": 0.032580845057964325, "step": 1965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3932, "grad_norm": 1.3095289468765259, "kl": 0.1655904920771718, "learning_rate": 3.369942792286013e-06, "loss": 0.0166, "num_tokens": 12824080.0, "reward": 0.8709716796875, "reward_std": 0.016635611653327942, "rewards//mean": 0.8709716796875, "rewards//std": 0.026593564078211784, "step": 1966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3934, "grad_norm": 1.338943362236023, "kl": 0.2355615384876728, "learning_rate": 3.3684551158546354e-06, "loss": 0.0236, "num_tokens": 12830576.0, "reward": 0.85693359375, "reward_std": 0.013074936345219612, "rewards//mean": 0.85693359375, "rewards//std": 0.02687019109725952, "step": 1967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3936, "grad_norm": 1.6923797130584717, "kl": 0.2725093010812998, "learning_rate": 3.3669670896096406e-06, "loss": 0.0273, "num_tokens": 12837088.0, "reward": 0.836181640625, "reward_std": 0.012646114453673363, "rewards//mean": 0.836181640625, "rewards//std": 0.016932448372244835, "step": 1968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3938, "grad_norm": 1.7711920738220215, "kl": 0.19956402853131294, "learning_rate": 3.3654787141504062e-06, "loss": 0.02, "num_tokens": 12843488.0, "reward": 0.86865234375, "reward_std": 0.014071707613766193, "rewards//mean": 0.86865234375, "rewards//std": 0.026139140129089355, "step": 1969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.394, "grad_norm": 1.6579523086547852, "kl": 0.240700613707304, "learning_rate": 3.3639899900764496e-06, "loss": 0.0241, "num_tokens": 12849960.0, "reward": 0.8570556640625, "reward_std": 0.009335736744105816, "rewards//mean": 0.8570556640625, "rewards//std": 0.015323207713663578, "step": 1970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3942, "grad_norm": 1.3138344287872314, "kl": 0.20460982620716095, "learning_rate": 3.362500917987427e-06, "loss": 0.0205, "num_tokens": 12856440.0, "reward": 0.84454345703125, "reward_std": 0.01122652180492878, "rewards//mean": 0.84454345703125, "rewards//std": 0.019859343767166138, "step": 1971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3944, "grad_norm": 1.9974066019058228, "kl": 0.23226803727447987, "learning_rate": 3.3610114984831388e-06, "loss": 0.0232, "num_tokens": 12862920.0, "reward": 0.81475830078125, "reward_std": 0.010275142267346382, "rewards//mean": 0.81475830078125, "rewards//std": 0.019007163122296333, "step": 1972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3946, "grad_norm": 1.550940990447998, "kl": 0.23780939169228077, "learning_rate": 3.3595217321635217e-06, "loss": 0.0238, "num_tokens": 12869368.0, "reward": 0.7830810546875, "reward_std": 0.01112491637468338, "rewards//mean": 0.7830810546875, "rewards//std": 0.02396852895617485, "step": 1973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3948, "grad_norm": 1.516589641571045, "kl": 0.1728332620114088, "learning_rate": 3.3580316196286534e-06, "loss": 0.0173, "num_tokens": 12875896.0, "reward": 0.841552734375, "reward_std": 0.01551414467394352, "rewards//mean": 0.841552734375, "rewards//std": 0.034251581877470016, "step": 1974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.395, "grad_norm": 1.2579668760299683, "kl": 0.2020314261317253, "learning_rate": 3.356541161478751e-06, "loss": 0.0202, "num_tokens": 12882376.0, "reward": 0.8841552734375, "reward_std": 0.01052119955420494, "rewards//mean": 0.8841552734375, "rewards//std": 0.02622903883457184, "step": 1975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3952, "grad_norm": 1.2929043769836426, "kl": 0.31892747059464455, "learning_rate": 3.3550503583141726e-06, "loss": 0.0319, "num_tokens": 12888912.0, "reward": 0.85748291015625, "reward_std": 0.01697753183543682, "rewards//mean": 0.85748291015625, "rewards//std": 0.02570459246635437, "step": 1976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3954, "grad_norm": 1.3856309652328491, "kl": 0.19151218701153994, "learning_rate": 3.353559210735411e-06, "loss": 0.0192, "num_tokens": 12895352.0, "reward": 0.8629150390625, "reward_std": 0.012094013392925262, "rewards//mean": 0.8629150390625, "rewards//std": 0.0221159178763628, "step": 1977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3956, "grad_norm": 1.5917448997497559, "kl": 0.29140064725652337, "learning_rate": 3.3520677193431017e-06, "loss": 0.0291, "num_tokens": 12901864.0, "reward": 0.8282470703125, "reward_std": 0.008313345722854137, "rewards//mean": 0.8282470703125, "rewards//std": 0.021720873191952705, "step": 1978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3958, "grad_norm": 1.3049300909042358, "kl": 0.15975985024124384, "learning_rate": 3.3505758847380163e-06, "loss": 0.016, "num_tokens": 12908336.0, "reward": 0.8271484375, "reward_std": 0.009737499058246613, "rewards//mean": 0.8271484375, "rewards//std": 0.028151167556643486, "step": 1979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.396, "grad_norm": 1.6307432651519775, "kl": 0.3250326681882143, "learning_rate": 3.3490837075210677e-06, "loss": 0.0325, "num_tokens": 12914928.0, "reward": 0.850830078125, "reward_std": 0.013365332037210464, "rewards//mean": 0.850830078125, "rewards//std": 0.027611492201685905, "step": 1980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3962, "grad_norm": 1.5126124620437622, "kl": 0.2642070781439543, "learning_rate": 3.3475911882933014e-06, "loss": 0.0264, "num_tokens": 12921592.0, "reward": 0.87786865234375, "reward_std": 0.01585608720779419, "rewards//mean": 0.87786865234375, "rewards//std": 0.0379953607916832, "step": 1981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3964, "grad_norm": 2.0392467975616455, "kl": 0.21102296747267246, "learning_rate": 3.346098327655907e-06, "loss": 0.0211, "num_tokens": 12928032.0, "reward": 0.85205078125, "reward_std": 0.015003281645476818, "rewards//mean": 0.85205078125, "rewards//std": 0.02330796793103218, "step": 1982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3966, "grad_norm": 1.4264273643493652, "kl": 0.21913836523890495, "learning_rate": 3.3446051262102076e-06, "loss": 0.0219, "num_tokens": 12934592.0, "reward": 0.8392333984375, "reward_std": 0.010099314153194427, "rewards//mean": 0.8392333984375, "rewards//std": 0.02323218248784542, "step": 1983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3968, "grad_norm": 1.6798410415649414, "kl": 0.204536653123796, "learning_rate": 3.343111584557664e-06, "loss": 0.0205, "num_tokens": 12941256.0, "reward": 0.808837890625, "reward_std": 0.008469732478260994, "rewards//mean": 0.808837890625, "rewards//std": 0.01176773477345705, "step": 1984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.397, "grad_norm": 1.5059850215911865, "kl": 0.17779461201280355, "learning_rate": 3.341617703299875e-06, "loss": 0.0178, "num_tokens": 12947800.0, "reward": 0.85125732421875, "reward_std": 0.012839564122259617, "rewards//mean": 0.85125732421875, "rewards//std": 0.02829110063612461, "step": 1985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3972, "grad_norm": 1.1993932723999023, "kl": 0.2466699993237853, "learning_rate": 3.3401234830385753e-06, "loss": 0.0247, "num_tokens": 12954400.0, "reward": 0.8543701171875, "reward_std": 0.0099581778049469, "rewards//mean": 0.8543701171875, "rewards//std": 0.012842318043112755, "step": 1986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3974, "grad_norm": 1.2308577299118042, "kl": 0.2063814615830779, "learning_rate": 3.338628924375638e-06, "loss": 0.0206, "num_tokens": 12960912.0, "reward": 0.852783203125, "reward_std": 0.012494321912527084, "rewards//mean": 0.852783203125, "rewards//std": 0.019880009815096855, "step": 1987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3976, "grad_norm": 1.7159438133239746, "kl": 0.1892592143267393, "learning_rate": 3.3371340279130694e-06, "loss": 0.0189, "num_tokens": 12967512.0, "reward": 0.8826904296875, "reward_std": 0.013886124826967716, "rewards//mean": 0.8826904296875, "rewards//std": 0.02398873120546341, "step": 1988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3978, "grad_norm": 1.1668554544448853, "kl": 0.15343563351780176, "learning_rate": 3.335638794253015e-06, "loss": 0.0153, "num_tokens": 12974072.0, "reward": 0.8197021484375, "reward_std": 0.011763466522097588, "rewards//mean": 0.8197021484375, "rewards//std": 0.03676409646868706, "step": 1989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.398, "grad_norm": 1.8344471454620361, "kl": 0.21648874785751104, "learning_rate": 3.3341432239977537e-06, "loss": 0.0216, "num_tokens": 12980576.0, "reward": 0.83807373046875, "reward_std": 0.011384960263967514, "rewards//mean": 0.83807373046875, "rewards//std": 0.021864918991923332, "step": 1990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3982, "grad_norm": 1.3712502717971802, "kl": 0.20847327634692192, "learning_rate": 3.332647317749702e-06, "loss": 0.0208, "num_tokens": 12987104.0, "reward": 0.85302734375, "reward_std": 0.014032524079084396, "rewards//mean": 0.85302734375, "rewards//std": 0.0377475880086422, "step": 1991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3984, "grad_norm": 1.9870731830596924, "kl": 0.2579640205949545, "learning_rate": 3.33115107611141e-06, "loss": 0.0258, "num_tokens": 12993584.0, "reward": 0.89398193359375, "reward_std": 0.014900933019816875, "rewards//mean": 0.89398193359375, "rewards//std": 0.025881830602884293, "step": 1992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3986, "grad_norm": 1.5930770635604858, "kl": 0.2581451702862978, "learning_rate": 3.329654499685565e-06, "loss": 0.0258, "num_tokens": 13000128.0, "reward": 0.83447265625, "reward_std": 0.014952573925256729, "rewards//mean": 0.83447265625, "rewards//std": 0.027458660304546356, "step": 1993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3988, "grad_norm": 1.5184813737869263, "kl": 0.2204946344718337, "learning_rate": 3.3281575890749857e-06, "loss": 0.022, "num_tokens": 13006640.0, "reward": 0.8671875, "reward_std": 0.012683672830462456, "rewards//mean": 0.8671875, "rewards//std": 0.014532756991684437, "step": 1994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.399, "grad_norm": 2.3581702709198, "kl": 0.23843526747077703, "learning_rate": 3.3266603448826286e-06, "loss": 0.0238, "num_tokens": 13013144.0, "reward": 0.8470458984375, "reward_std": 0.014393327757716179, "rewards//mean": 0.8470458984375, "rewards//std": 0.021723661571741104, "step": 1995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3992, "grad_norm": 1.4219532012939453, "kl": 0.2572242086753249, "learning_rate": 3.325162767711583e-06, "loss": 0.0257, "num_tokens": 13019640.0, "reward": 0.85394287109375, "reward_std": 0.013244499452412128, "rewards//mean": 0.85394287109375, "rewards//std": 0.02064199186861515, "step": 1996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3994, "grad_norm": 1.884981632232666, "kl": 0.23352040629833937, "learning_rate": 3.3236648581650743e-06, "loss": 0.0234, "num_tokens": 13026136.0, "reward": 0.7840576171875, "reward_std": 0.009789198637008667, "rewards//mean": 0.7840576171875, "rewards//std": 0.017054932191967964, "step": 1997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3996, "grad_norm": 1.7079914808273315, "kl": 0.17976296413689852, "learning_rate": 3.3221666168464584e-06, "loss": 0.018, "num_tokens": 13032824.0, "reward": 0.83587646484375, "reward_std": 0.009672037325799465, "rewards//mean": 0.83587646484375, "rewards//std": 0.0198509581387043, "step": 1998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.3998, "grad_norm": 1.5797020196914673, "kl": 0.22703632153570652, "learning_rate": 3.3206680443592283e-06, "loss": 0.0227, "num_tokens": 13039344.0, "reward": 0.829345703125, "reward_std": 0.012659886851906776, "rewards//mean": 0.829345703125, "rewards//std": 0.02200785093009472, "step": 1999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4, "grad_norm": 1.9587570428848267, "kl": 0.3071395689621568, "learning_rate": 3.319169141307007e-06, "loss": 0.0307, "num_tokens": 13045864.0, "reward": 0.8388671875, "reward_std": 0.010451741516590118, "rewards//mean": 0.8388671875, "rewards//std": 0.018453655764460564, "step": 2000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4002, "grad_norm": 1.3456932306289673, "kl": 0.25920157227665186, "learning_rate": 3.3176699082935546e-06, "loss": 0.0259, "num_tokens": 13052304.0, "reward": 0.84210205078125, "reward_std": 0.010754143819212914, "rewards//mean": 0.84210205078125, "rewards//std": 0.0211197379976511, "step": 2001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4004, "grad_norm": 1.3019630908966064, "kl": 0.2230223622173071, "learning_rate": 3.3161703459227602e-06, "loss": 0.0223, "num_tokens": 13058792.0, "reward": 0.8690185546875, "reward_std": 0.013275819830596447, "rewards//mean": 0.8690185546875, "rewards//std": 0.025845322757959366, "step": 2002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4006, "grad_norm": 1.781256914138794, "kl": 0.21144264936447144, "learning_rate": 3.3146704547986487e-06, "loss": 0.0211, "num_tokens": 13065344.0, "reward": 0.85198974609375, "reward_std": 0.012219378724694252, "rewards//mean": 0.85198974609375, "rewards//std": 0.021475831046700478, "step": 2003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4008, "grad_norm": 1.9967092275619507, "kl": 0.22869620472192764, "learning_rate": 3.3131702355253747e-06, "loss": 0.0229, "num_tokens": 13071880.0, "reward": 0.87481689453125, "reward_std": 0.010182630270719528, "rewards//mean": 0.87481689453125, "rewards//std": 0.024729104712605476, "step": 2004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.401, "grad_norm": 1.7654813528060913, "kl": 0.18828272446990013, "learning_rate": 3.311669688707228e-06, "loss": 0.0188, "num_tokens": 13078328.0, "reward": 0.8564453125, "reward_std": 0.016941428184509277, "rewards//mean": 0.8564453125, "rewards//std": 0.03787250071763992, "step": 2005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4012, "grad_norm": 2.0383007526397705, "kl": 0.23879436124116182, "learning_rate": 3.3101688149486268e-06, "loss": 0.0239, "num_tokens": 13084808.0, "reward": 0.79791259765625, "reward_std": 0.011856112629175186, "rewards//mean": 0.79791259765625, "rewards//std": 0.02678661234676838, "step": 2006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4014, "grad_norm": 1.4209285974502563, "kl": 0.14875551220029593, "learning_rate": 3.3086676148541245e-06, "loss": 0.0149, "num_tokens": 13091384.0, "reward": 0.84814453125, "reward_std": 0.009802783839404583, "rewards//mean": 0.84814453125, "rewards//std": 0.02705881930887699, "step": 2007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4016, "grad_norm": 1.3769190311431885, "kl": 0.23808326572179794, "learning_rate": 3.307166089028403e-06, "loss": 0.0238, "num_tokens": 13097840.0, "reward": 0.8480224609375, "reward_std": 0.01237301342189312, "rewards//mean": 0.8480224609375, "rewards//std": 0.04644269496202469, "step": 2008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4018, "grad_norm": 1.4063600301742554, "kl": 0.3354848735034466, "learning_rate": 3.3056642380762783e-06, "loss": 0.0335, "num_tokens": 13104384.0, "reward": 0.79559326171875, "reward_std": 0.013242160901427269, "rewards//mean": 0.79559326171875, "rewards//std": 0.025967666879296303, "step": 2009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.402, "grad_norm": 1.4500303268432617, "kl": 0.2387236673384905, "learning_rate": 3.3041620626026943e-06, "loss": 0.0239, "num_tokens": 13110920.0, "reward": 0.86297607421875, "reward_std": 0.012746579013764858, "rewards//mean": 0.86297607421875, "rewards//std": 0.024913284927606583, "step": 2010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4022, "grad_norm": 1.4739997386932373, "kl": 0.254985511302948, "learning_rate": 3.3026595632127274e-06, "loss": 0.0255, "num_tokens": 13117456.0, "reward": 0.831787109375, "reward_std": 0.01050491351634264, "rewards//mean": 0.831787109375, "rewards//std": 0.03276252746582031, "step": 2011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4024, "grad_norm": 1.4527848958969116, "kl": 0.27210069447755814, "learning_rate": 3.301156740511585e-06, "loss": 0.0272, "num_tokens": 13123896.0, "reward": 0.8419189453125, "reward_std": 0.011728821322321892, "rewards//mean": 0.8419189453125, "rewards//std": 0.01840067096054554, "step": 2012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4026, "grad_norm": 1.5649949312210083, "kl": 0.26365606021136045, "learning_rate": 3.299653595104603e-06, "loss": 0.0264, "num_tokens": 13130384.0, "reward": 0.810791015625, "reward_std": 0.007449167314916849, "rewards//mean": 0.810791015625, "rewards//std": 0.026999453082680702, "step": 2013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4028, "grad_norm": 1.718432068824768, "kl": 0.31725412979722023, "learning_rate": 3.298150127597248e-06, "loss": 0.0317, "num_tokens": 13136784.0, "reward": 0.840087890625, "reward_std": 0.01505677867680788, "rewards//mean": 0.840087890625, "rewards//std": 0.02826172299683094, "step": 2014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.403, "grad_norm": 2.5385167598724365, "kl": 0.33394272066652775, "learning_rate": 3.2966463385951174e-06, "loss": 0.0334, "num_tokens": 13143304.0, "reward": 0.80914306640625, "reward_std": 0.014810748398303986, "rewards//mean": 0.80914306640625, "rewards//std": 0.023467110469937325, "step": 2015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4032, "grad_norm": 1.6591646671295166, "kl": 0.42761330865323544, "learning_rate": 3.295142228703938e-06, "loss": 0.0428, "num_tokens": 13149880.0, "reward": 0.83502197265625, "reward_std": 0.013734391890466213, "rewards//mean": 0.83502197265625, "rewards//std": 0.022431517019867897, "step": 2016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4034, "grad_norm": 1.476294994354248, "kl": 0.2717515863478184, "learning_rate": 3.2936377985295627e-06, "loss": 0.0272, "num_tokens": 13156376.0, "reward": 0.810302734375, "reward_std": 0.01000109501183033, "rewards//mean": 0.810302734375, "rewards//std": 0.013760167174041271, "step": 2017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4036, "grad_norm": 1.2854851484298706, "kl": 0.19790327362716198, "learning_rate": 3.2921330486779773e-06, "loss": 0.0198, "num_tokens": 13162968.0, "reward": 0.832763671875, "reward_std": 0.012579068541526794, "rewards//mean": 0.832763671875, "rewards//std": 0.03136507794260979, "step": 2018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4038, "grad_norm": 1.9215617179870605, "kl": 0.35040474124252796, "learning_rate": 3.290627979755295e-06, "loss": 0.035, "num_tokens": 13169512.0, "reward": 0.86651611328125, "reward_std": 0.014830604195594788, "rewards//mean": 0.86651611328125, "rewards//std": 0.02094412036240101, "step": 2019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.404, "grad_norm": 1.7257351875305176, "kl": 0.31828534230589867, "learning_rate": 3.2891225923677565e-06, "loss": 0.0318, "num_tokens": 13176104.0, "reward": 0.84716796875, "reward_std": 0.015367535874247551, "rewards//mean": 0.84716796875, "rewards//std": 0.030682945623993874, "step": 2020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4042, "grad_norm": 1.7571234703063965, "kl": 0.3570402767509222, "learning_rate": 3.2876168871217322e-06, "loss": 0.0357, "num_tokens": 13182608.0, "reward": 0.85906982421875, "reward_std": 0.012716399505734444, "rewards//mean": 0.85906982421875, "rewards//std": 0.025308741256594658, "step": 2021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4044, "grad_norm": 2.792107582092285, "kl": 0.28927917033433914, "learning_rate": 3.2861108646237205e-06, "loss": 0.0289, "num_tokens": 13189144.0, "reward": 0.83392333984375, "reward_std": 0.008419015444815159, "rewards//mean": 0.83392333984375, "rewards//std": 0.015422048047184944, "step": 2022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4046, "grad_norm": 3.165228843688965, "kl": 0.22225462831556797, "learning_rate": 3.284604525480346e-06, "loss": 0.0222, "num_tokens": 13195664.0, "reward": 0.84051513671875, "reward_std": 0.013562703505158424, "rewards//mean": 0.84051513671875, "rewards//std": 0.03941160440444946, "step": 2023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4048, "grad_norm": 2.365527391433716, "kl": 0.35371775925159454, "learning_rate": 3.2830978702983628e-06, "loss": 0.0354, "num_tokens": 13202168.0, "reward": 0.8096923828125, "reward_std": 0.011431550607085228, "rewards//mean": 0.8096923828125, "rewards//std": 0.018446682021021843, "step": 2024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.405, "grad_norm": 2.6078109741210938, "kl": 0.3188340775668621, "learning_rate": 3.2815908996846506e-06, "loss": 0.0319, "num_tokens": 13208696.0, "reward": 0.87213134765625, "reward_std": 0.014622559770941734, "rewards//mean": 0.87213134765625, "rewards//std": 0.03584357723593712, "step": 2025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4052, "grad_norm": 1.65888512134552, "kl": 0.26908246614038944, "learning_rate": 3.280083614246218e-06, "loss": 0.0269, "num_tokens": 13215224.0, "reward": 0.8387451171875, "reward_std": 0.013668091967701912, "rewards//mean": 0.8387451171875, "rewards//std": 0.03208722174167633, "step": 2026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4054, "grad_norm": 2.273232936859131, "kl": 0.36289042234420776, "learning_rate": 3.278576014590199e-06, "loss": 0.0363, "num_tokens": 13221824.0, "reward": 0.8482666015625, "reward_std": 0.010314177721738815, "rewards//mean": 0.8482666015625, "rewards//std": 0.0188203826546669, "step": 2027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4056, "grad_norm": 1.9291726350784302, "kl": 0.2656873846426606, "learning_rate": 3.2770681013238535e-06, "loss": 0.0266, "num_tokens": 13228432.0, "reward": 0.8695068359375, "reward_std": 0.013299573212862015, "rewards//mean": 0.8695068359375, "rewards//std": 0.02595520205795765, "step": 2028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4058, "grad_norm": 1.6598447561264038, "kl": 0.26030414272099733, "learning_rate": 3.275559875054571e-06, "loss": 0.026, "num_tokens": 13234920.0, "reward": 0.8841552734375, "reward_std": 0.009872987866401672, "rewards//mean": 0.8841552734375, "rewards//std": 0.020126337185502052, "step": 2029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.406, "grad_norm": 1.6411429643630981, "kl": 0.26598589308559895, "learning_rate": 3.2740513363898624e-06, "loss": 0.0266, "num_tokens": 13241408.0, "reward": 0.8648681640625, "reward_std": 0.013604866340756416, "rewards//mean": 0.8648681640625, "rewards//std": 0.032111745327711105, "step": 2030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4062, "grad_norm": 1.8488880395889282, "kl": 0.26669708639383316, "learning_rate": 3.272542485937369e-06, "loss": 0.0267, "num_tokens": 13247976.0, "reward": 0.86767578125, "reward_std": 0.010786222293972969, "rewards//mean": 0.86767578125, "rewards//std": 0.021981697529554367, "step": 2031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4064, "grad_norm": 2.344836711883545, "kl": 0.26758462004363537, "learning_rate": 3.2710333243048542e-06, "loss": 0.0268, "num_tokens": 13254576.0, "reward": 0.88128662109375, "reward_std": 0.017161568626761436, "rewards//mean": 0.88128662109375, "rewards//std": 0.028515474870800972, "step": 2032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4066, "grad_norm": 2.291576385498047, "kl": 0.2800061032176018, "learning_rate": 3.2695238521002086e-06, "loss": 0.028, "num_tokens": 13261024.0, "reward": 0.79803466796875, "reward_std": 0.008982343599200249, "rewards//mean": 0.79803466796875, "rewards//std": 0.01911992020905018, "step": 2033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4068, "grad_norm": 3.060637950897217, "kl": 0.25515037775039673, "learning_rate": 3.2680140699314474e-06, "loss": 0.0255, "num_tokens": 13267544.0, "reward": 0.8502197265625, "reward_std": 0.009249523282051086, "rewards//mean": 0.8502197265625, "rewards//std": 0.014010367915034294, "step": 2034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.407, "grad_norm": 1.7126872539520264, "kl": 0.26818475872278214, "learning_rate": 3.2665039784067103e-06, "loss": 0.0268, "num_tokens": 13274064.0, "reward": 0.84381103515625, "reward_std": 0.011746569536626339, "rewards//mean": 0.84381103515625, "rewards//std": 0.015523795038461685, "step": 2035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4072, "grad_norm": 2.1263577938079834, "kl": 0.34108745213598013, "learning_rate": 3.2649935781342635e-06, "loss": 0.0341, "num_tokens": 13280592.0, "reward": 0.81988525390625, "reward_std": 0.00924522615969181, "rewards//mean": 0.81988525390625, "rewards//std": 0.01613018289208412, "step": 2036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4074, "grad_norm": 1.753426432609558, "kl": 0.29421536065638065, "learning_rate": 3.263482869722494e-06, "loss": 0.0294, "num_tokens": 13287200.0, "reward": 0.8409423828125, "reward_std": 0.01285061240196228, "rewards//mean": 0.8409423828125, "rewards//std": 0.03519734740257263, "step": 2037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4076, "grad_norm": 1.7158691883087158, "kl": 0.23440835997462273, "learning_rate": 3.261971853779916e-06, "loss": 0.0234, "num_tokens": 13293768.0, "reward": 0.81890869140625, "reward_std": 0.010628972202539444, "rewards//mean": 0.81890869140625, "rewards//std": 0.023952340707182884, "step": 2038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4078, "grad_norm": 1.636093258857727, "kl": 0.24359282292425632, "learning_rate": 3.2604605309151667e-06, "loss": 0.0244, "num_tokens": 13300360.0, "reward": 0.84521484375, "reward_std": 0.013960248790681362, "rewards//mean": 0.84521484375, "rewards//std": 0.020640617236495018, "step": 2039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.408, "grad_norm": 1.5822540521621704, "kl": 0.3625267930328846, "learning_rate": 3.2589489017370056e-06, "loss": 0.0363, "num_tokens": 13306864.0, "reward": 0.8514404296875, "reward_std": 0.01039578765630722, "rewards//mean": 0.8514404296875, "rewards//std": 0.025619426742196083, "step": 2040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4082, "grad_norm": 1.8198933601379395, "kl": 0.2542379666119814, "learning_rate": 3.2574369668543187e-06, "loss": 0.0254, "num_tokens": 13313352.0, "reward": 0.81341552734375, "reward_std": 0.007212778087705374, "rewards//mean": 0.81341552734375, "rewards//std": 0.013473208993673325, "step": 2041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4084, "grad_norm": 1.5425877571105957, "kl": 0.27958838548511267, "learning_rate": 3.2559247268761117e-06, "loss": 0.028, "num_tokens": 13319880.0, "reward": 0.81982421875, "reward_std": 0.009508145973086357, "rewards//mean": 0.81982421875, "rewards//std": 0.016884103417396545, "step": 2042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4086, "grad_norm": 1.6781865358352661, "kl": 0.40033346228301525, "learning_rate": 3.2544121824115154e-06, "loss": 0.04, "num_tokens": 13326360.0, "reward": 0.85186767578125, "reward_std": 0.013849182985723019, "rewards//mean": 0.85186767578125, "rewards//std": 0.020591329783201218, "step": 2043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4088, "grad_norm": 1.517913818359375, "kl": 0.23705233447253704, "learning_rate": 3.2528993340697817e-06, "loss": 0.0237, "num_tokens": 13332912.0, "reward": 0.76361083984375, "reward_std": 0.009549669921398163, "rewards//mean": 0.76361083984375, "rewards//std": 0.01945038139820099, "step": 2044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.409, "grad_norm": 2.5470054149627686, "kl": 0.369333453476429, "learning_rate": 3.2513861824602866e-06, "loss": 0.0369, "num_tokens": 13339504.0, "reward": 0.8250732421875, "reward_std": 0.01498548872768879, "rewards//mean": 0.8250732421875, "rewards//std": 0.022867033258080482, "step": 2045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4092, "grad_norm": 2.4156148433685303, "kl": 0.3742985315620899, "learning_rate": 3.249872728192527e-06, "loss": 0.0374, "num_tokens": 13345992.0, "reward": 0.77850341796875, "reward_std": 0.012915275990962982, "rewards//mean": 0.77850341796875, "rewards//std": 0.023090600967407227, "step": 2046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4094, "grad_norm": 1.9096599817276, "kl": 0.2871760977432132, "learning_rate": 3.248358971876122e-06, "loss": 0.0287, "num_tokens": 13352504.0, "reward": 0.8294677734375, "reward_std": 0.017156630754470825, "rewards//mean": 0.8294677734375, "rewards//std": 0.024990104138851166, "step": 2047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4096, "grad_norm": 1.4348558187484741, "kl": 0.27135795168578625, "learning_rate": 3.2468449141208124e-06, "loss": 0.0271, "num_tokens": 13359056.0, "reward": 0.82037353515625, "reward_std": 0.011023739352822304, "rewards//mean": 0.82037353515625, "rewards//std": 0.013997801579535007, "step": 2048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4098, "grad_norm": 1.9253655672073364, "kl": 0.2840028144419193, "learning_rate": 3.2453305555364616e-06, "loss": 0.0284, "num_tokens": 13365528.0, "reward": 0.7987060546875, "reward_std": 0.010803207755088806, "rewards//mean": 0.7987060546875, "rewards//std": 0.02748042717576027, "step": 2049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.41, "grad_norm": 1.513474941253662, "kl": 0.23330248519778252, "learning_rate": 3.2438158967330518e-06, "loss": 0.0233, "num_tokens": 13372032.0, "reward": 0.83148193359375, "reward_std": 0.008672202937304974, "rewards//mean": 0.83148193359375, "rewards//std": 0.01602095179259777, "step": 2050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4102, "grad_norm": 1.7274426221847534, "kl": 0.3164318408817053, "learning_rate": 3.2423009383206876e-06, "loss": 0.0316, "num_tokens": 13378544.0, "reward": 0.8348388671875, "reward_std": 0.017357971519231796, "rewards//mean": 0.8348388671875, "rewards//std": 0.029712200164794922, "step": 2051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4104, "grad_norm": 1.366492509841919, "kl": 0.2599536459892988, "learning_rate": 3.2407856809095945e-06, "loss": 0.026, "num_tokens": 13384904.0, "reward": 0.87469482421875, "reward_std": 0.017850616946816444, "rewards//mean": 0.87469482421875, "rewards//std": 0.029978672042489052, "step": 2052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4106, "grad_norm": 1.3832217454910278, "kl": 0.2599945068359375, "learning_rate": 3.2392701251101172e-06, "loss": 0.026, "num_tokens": 13391392.0, "reward": 0.85296630859375, "reward_std": 0.009855691343545914, "rewards//mean": 0.85296630859375, "rewards//std": 0.012379425577819347, "step": 2053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4108, "grad_norm": 1.406179666519165, "kl": 0.19157631509006023, "learning_rate": 3.2377542715327227e-06, "loss": 0.0192, "num_tokens": 13397928.0, "reward": 0.8515625, "reward_std": 0.009162312373518944, "rewards//mean": 0.8515625, "rewards//std": 0.021259134635329247, "step": 2054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.411, "grad_norm": 1.3894538879394531, "kl": 0.22542461194097996, "learning_rate": 3.2362381207879946e-06, "loss": 0.0225, "num_tokens": 13404376.0, "reward": 0.85614013671875, "reward_std": 0.014883531257510185, "rewards//mean": 0.85614013671875, "rewards//std": 0.03728959709405899, "step": 2055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4112, "grad_norm": 1.5166605710983276, "kl": 0.1997812082991004, "learning_rate": 3.2347216734866406e-06, "loss": 0.02, "num_tokens": 13410832.0, "reward": 0.84259033203125, "reward_std": 0.0126652792096138, "rewards//mean": 0.84259033203125, "rewards//std": 0.026187092065811157, "step": 2056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4114, "grad_norm": 1.4042149782180786, "kl": 0.17251752130687237, "learning_rate": 3.2332049302394834e-06, "loss": 0.0173, "num_tokens": 13417328.0, "reward": 0.88214111328125, "reward_std": 0.011162126436829567, "rewards//mean": 0.88214111328125, "rewards//std": 0.022103166207671165, "step": 2057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4116, "grad_norm": 1.3422216176986694, "kl": 0.22853142954409122, "learning_rate": 3.231687891657469e-06, "loss": 0.0229, "num_tokens": 13423864.0, "reward": 0.84564208984375, "reward_std": 0.012234932743012905, "rewards//mean": 0.84564208984375, "rewards//std": 0.03313826769590378, "step": 2058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4118, "grad_norm": 1.626787781715393, "kl": 0.2158260215073824, "learning_rate": 3.2301705583516586e-06, "loss": 0.0216, "num_tokens": 13430344.0, "reward": 0.86981201171875, "reward_std": 0.013794959522783756, "rewards//mean": 0.86981201171875, "rewards//std": 0.018908150494098663, "step": 2059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.412, "grad_norm": 1.630737543106079, "kl": 0.19805716630071402, "learning_rate": 3.2286529309332353e-06, "loss": 0.0198, "num_tokens": 13436848.0, "reward": 0.8701171875, "reward_std": 0.012534265406429768, "rewards//mean": 0.8701171875, "rewards//std": 0.02896534651517868, "step": 2060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4122, "grad_norm": 2.5440428256988525, "kl": 0.17938098032027483, "learning_rate": 3.227135010013498e-06, "loss": 0.0179, "num_tokens": 13443336.0, "reward": 0.823974609375, "reward_std": 0.013195358216762543, "rewards//mean": 0.823974609375, "rewards//std": 0.026810409501194954, "step": 2061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4124, "grad_norm": 1.3605693578720093, "kl": 0.19136207923293114, "learning_rate": 3.225616796203866e-06, "loss": 0.0191, "num_tokens": 13449848.0, "reward": 0.7930908203125, "reward_std": 0.009676920250058174, "rewards//mean": 0.7930908203125, "rewards//std": 0.018108775839209557, "step": 2062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4126, "grad_norm": 1.6545403003692627, "kl": 0.21054410189390182, "learning_rate": 3.2240982901158747e-06, "loss": 0.0211, "num_tokens": 13456424.0, "reward": 0.856201171875, "reward_std": 0.013996101915836334, "rewards//mean": 0.856201171875, "rewards//std": 0.022605039179325104, "step": 2063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4128, "grad_norm": 1.6466785669326782, "kl": 0.21981132496148348, "learning_rate": 3.222579492361179e-06, "loss": 0.022, "num_tokens": 13462944.0, "reward": 0.8726806640625, "reward_std": 0.015301883220672607, "rewards//mean": 0.8726806640625, "rewards//std": 0.03139474615454674, "step": 2064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.413, "grad_norm": 1.662522315979004, "kl": 0.17663805559277534, "learning_rate": 3.221060403551551e-06, "loss": 0.0177, "num_tokens": 13469472.0, "reward": 0.8197021484375, "reward_std": 0.014798833057284355, "rewards//mean": 0.8197021484375, "rewards//std": 0.027789371088147163, "step": 2065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4132, "grad_norm": 1.3355202674865723, "kl": 0.22046801261603832, "learning_rate": 3.2195410242988777e-06, "loss": 0.022, "num_tokens": 13476024.0, "reward": 0.79901123046875, "reward_std": 0.014740463346242905, "rewards//mean": 0.79901123046875, "rewards//std": 0.023887157440185547, "step": 2066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4134, "grad_norm": 1.4280939102172852, "kl": 0.1846131719648838, "learning_rate": 3.2180213552151663e-06, "loss": 0.0185, "num_tokens": 13482616.0, "reward": 0.76397705078125, "reward_std": 0.008580896072089672, "rewards//mean": 0.76397705078125, "rewards//std": 0.02393653616309166, "step": 2067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4136, "grad_norm": 1.4044177532196045, "kl": 0.14527849201112986, "learning_rate": 3.216501396912538e-06, "loss": 0.0145, "num_tokens": 13489216.0, "reward": 0.86529541015625, "reward_std": 0.012955695390701294, "rewards//mean": 0.86529541015625, "rewards//std": 0.031288668513298035, "step": 2068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4138, "grad_norm": 1.405395269393921, "kl": 0.2329601002857089, "learning_rate": 3.214981150003234e-06, "loss": 0.0233, "num_tokens": 13495728.0, "reward": 0.82171630859375, "reward_std": 0.014029848389327526, "rewards//mean": 0.82171630859375, "rewards//std": 0.0248164851218462, "step": 2069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.414, "grad_norm": 1.1646020412445068, "kl": 0.2001659469678998, "learning_rate": 3.2134606150996075e-06, "loss": 0.02, "num_tokens": 13502248.0, "reward": 0.84747314453125, "reward_std": 0.013385271653532982, "rewards//mean": 0.84747314453125, "rewards//std": 0.019480708986520767, "step": 2070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4142, "grad_norm": 1.1917154788970947, "kl": 0.26797593757510185, "learning_rate": 3.211939792814131e-06, "loss": 0.0268, "num_tokens": 13508752.0, "reward": 0.86505126953125, "reward_std": 0.013713480904698372, "rewards//mean": 0.86505126953125, "rewards//std": 0.025122638791799545, "step": 2071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4144, "grad_norm": 1.85873544216156, "kl": 0.20681273750960827, "learning_rate": 3.2104186837593904e-06, "loss": 0.0207, "num_tokens": 13515368.0, "reward": 0.83294677734375, "reward_std": 0.010367844253778458, "rewards//mean": 0.83294677734375, "rewards//std": 0.030279625207185745, "step": 2072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4146, "grad_norm": 1.3769853115081787, "kl": 0.2513934811577201, "learning_rate": 3.2088972885480894e-06, "loss": 0.0251, "num_tokens": 13521928.0, "reward": 0.84759521484375, "reward_std": 0.011005290783941746, "rewards//mean": 0.84759521484375, "rewards//std": 0.02661653608083725, "step": 2073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4148, "grad_norm": 1.3269444704055786, "kl": 0.19434460531920195, "learning_rate": 3.2073756077930453e-06, "loss": 0.0194, "num_tokens": 13528528.0, "reward": 0.837158203125, "reward_std": 0.010176599025726318, "rewards//mean": 0.837158203125, "rewards//std": 0.023363754153251648, "step": 2074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.415, "grad_norm": 1.6925891637802124, "kl": 0.28913682140409946, "learning_rate": 3.205853642107192e-06, "loss": 0.0289, "num_tokens": 13535032.0, "reward": 0.87896728515625, "reward_std": 0.015126490034162998, "rewards//mean": 0.87896728515625, "rewards//std": 0.0251473318785429, "step": 2075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4152, "grad_norm": 1.628798484802246, "kl": 0.21951354760676622, "learning_rate": 3.2043313921035747e-06, "loss": 0.022, "num_tokens": 13541536.0, "reward": 0.8468017578125, "reward_std": 0.009496030397713184, "rewards//mean": 0.8468017578125, "rewards//std": 0.02260068617761135, "step": 2076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4154, "grad_norm": 1.2498492002487183, "kl": 0.2394741252064705, "learning_rate": 3.202808858395357e-06, "loss": 0.0239, "num_tokens": 13548040.0, "reward": 0.84716796875, "reward_std": 0.007779507432132959, "rewards//mean": 0.84716796875, "rewards//std": 0.0158634465187788, "step": 2077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4156, "grad_norm": 1.3040999174118042, "kl": 0.31618971936404705, "learning_rate": 3.201286041595816e-06, "loss": 0.0316, "num_tokens": 13554568.0, "reward": 0.8446044921875, "reward_std": 0.014241158962249756, "rewards//mean": 0.8446044921875, "rewards//std": 0.024513069540262222, "step": 2078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4158, "grad_norm": 1.5534573793411255, "kl": 0.3234667554497719, "learning_rate": 3.1997629423183407e-06, "loss": 0.0323, "num_tokens": 13561112.0, "reward": 0.83782958984375, "reward_std": 0.010756529867649078, "rewards//mean": 0.83782958984375, "rewards//std": 0.016833709552884102, "step": 2079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.416, "grad_norm": 2.0763978958129883, "kl": 0.28366887755692005, "learning_rate": 3.198239561176436e-06, "loss": 0.0284, "num_tokens": 13567592.0, "reward": 0.849365234375, "reward_std": 0.016259770840406418, "rewards//mean": 0.849365234375, "rewards//std": 0.029813161119818687, "step": 2080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4162, "grad_norm": 1.4891360998153687, "kl": 0.3444456681609154, "learning_rate": 3.19671589878372e-06, "loss": 0.0344, "num_tokens": 13574080.0, "reward": 0.852294921875, "reward_std": 0.013064688071608543, "rewards//mean": 0.852294921875, "rewards//std": 0.022259533405303955, "step": 2081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4164, "grad_norm": 2.192216396331787, "kl": 0.3460389245301485, "learning_rate": 3.1951919557539225e-06, "loss": 0.0346, "num_tokens": 13580608.0, "reward": 0.80291748046875, "reward_std": 0.012370647862553596, "rewards//mean": 0.80291748046875, "rewards//std": 0.01946205087006092, "step": 2082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4166, "grad_norm": 2.437055826187134, "kl": 0.308832511305809, "learning_rate": 3.1936677327008887e-06, "loss": 0.0309, "num_tokens": 13587120.0, "reward": 0.82257080078125, "reward_std": 0.013976707123219967, "rewards//mean": 0.82257080078125, "rewards//std": 0.019842568784952164, "step": 2083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4168, "grad_norm": 1.420653223991394, "kl": 0.23718150332570076, "learning_rate": 3.1921432302385747e-06, "loss": 0.0237, "num_tokens": 13593712.0, "reward": 0.83258056640625, "reward_std": 0.009794378653168678, "rewards//mean": 0.83258056640625, "rewards//std": 0.027549952268600464, "step": 2084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.417, "grad_norm": 1.4400264024734497, "kl": 0.13714737724512815, "learning_rate": 3.190618448981051e-06, "loss": 0.0137, "num_tokens": 13600200.0, "reward": 0.852783203125, "reward_std": 0.012979315593838692, "rewards//mean": 0.852783203125, "rewards//std": 0.032052528113126755, "step": 2085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4172, "grad_norm": 1.4468188285827637, "kl": 0.2410000916570425, "learning_rate": 3.189093389542498e-06, "loss": 0.0241, "num_tokens": 13606728.0, "reward": 0.85137939453125, "reward_std": 0.016755683347582817, "rewards//mean": 0.85137939453125, "rewards//std": 0.04388856887817383, "step": 2086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4174, "grad_norm": 1.4794448614120483, "kl": 0.21392268780618906, "learning_rate": 3.187568052537211e-06, "loss": 0.0214, "num_tokens": 13613224.0, "reward": 0.85931396484375, "reward_std": 0.019102074205875397, "rewards//mean": 0.85931396484375, "rewards//std": 0.03438297659158707, "step": 2087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4176, "grad_norm": 1.405073642730713, "kl": 0.2221553884446621, "learning_rate": 3.1860424385795942e-06, "loss": 0.0222, "num_tokens": 13619744.0, "reward": 0.8380126953125, "reward_std": 0.01602233201265335, "rewards//mean": 0.8380126953125, "rewards//std": 0.03128848969936371, "step": 2088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4178, "grad_norm": 1.3318150043487549, "kl": 0.24795302283018827, "learning_rate": 3.1845165482841652e-06, "loss": 0.0248, "num_tokens": 13626256.0, "reward": 0.84722900390625, "reward_std": 0.013562886044383049, "rewards//mean": 0.84722900390625, "rewards//std": 0.031009720638394356, "step": 2089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.418, "grad_norm": 1.8149176836013794, "kl": 0.19102980941534042, "learning_rate": 3.1829903822655528e-06, "loss": 0.0191, "num_tokens": 13632888.0, "reward": 0.8653564453125, "reward_std": 0.010225332342088223, "rewards//mean": 0.8653564453125, "rewards//std": 0.01886857859790325, "step": 2090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4182, "grad_norm": 1.9892795085906982, "kl": 0.1565965609624982, "learning_rate": 3.1814639411384953e-06, "loss": 0.0157, "num_tokens": 13639376.0, "reward": 0.8104248046875, "reward_std": 0.010508917272090912, "rewards//mean": 0.8104248046875, "rewards//std": 0.03174954652786255, "step": 2091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4184, "grad_norm": 1.7367751598358154, "kl": 0.22108756564557552, "learning_rate": 3.179937225517844e-06, "loss": 0.0221, "num_tokens": 13645904.0, "reward": 0.81121826171875, "reward_std": 0.009627393446862698, "rewards//mean": 0.81121826171875, "rewards//std": 0.016293587163090706, "step": 2092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4186, "grad_norm": 2.1851015090942383, "kl": 0.22052592039108276, "learning_rate": 3.17841023601856e-06, "loss": 0.0221, "num_tokens": 13652344.0, "reward": 0.86334228515625, "reward_std": 0.014036190696060658, "rewards//mean": 0.86334228515625, "rewards//std": 0.03294401243329048, "step": 2093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4188, "grad_norm": 1.544690728187561, "kl": 0.2891365271061659, "learning_rate": 3.1768829732557137e-06, "loss": 0.0289, "num_tokens": 13658864.0, "reward": 0.84820556640625, "reward_std": 0.014961971901357174, "rewards//mean": 0.84820556640625, "rewards//std": 0.022367993369698524, "step": 2094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.419, "grad_norm": 1.479034185409546, "kl": 0.19239099882543087, "learning_rate": 3.175355437844486e-06, "loss": 0.0192, "num_tokens": 13665376.0, "reward": 0.82647705078125, "reward_std": 0.016724567860364914, "rewards//mean": 0.82647705078125, "rewards//std": 0.02209494635462761, "step": 2095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4192, "grad_norm": 1.4734145402908325, "kl": 0.18972532358020544, "learning_rate": 3.17382763040017e-06, "loss": 0.019, "num_tokens": 13671944.0, "reward": 0.80352783203125, "reward_std": 0.008237404748797417, "rewards//mean": 0.80352783203125, "rewards//std": 0.022430842742323875, "step": 2096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4194, "grad_norm": 2.2686874866485596, "kl": 0.3365077879279852, "learning_rate": 3.1722995515381644e-06, "loss": 0.0337, "num_tokens": 13678424.0, "reward": 0.84881591796875, "reward_std": 0.01495739072561264, "rewards//mean": 0.84881591796875, "rewards//std": 0.029754647985100746, "step": 2097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4196, "grad_norm": 2.5084452629089355, "kl": 0.1884743170812726, "learning_rate": 3.17077120187398e-06, "loss": 0.0188, "num_tokens": 13685000.0, "reward": 0.88043212890625, "reward_std": 0.013117094524204731, "rewards//mean": 0.88043212890625, "rewards//std": 0.02844744361937046, "step": 2098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4198, "grad_norm": 1.4009900093078613, "kl": 0.2393751461058855, "learning_rate": 3.169242582023236e-06, "loss": 0.0239, "num_tokens": 13691496.0, "reward": 0.83514404296875, "reward_std": 0.015473881736397743, "rewards//mean": 0.83514404296875, "rewards//std": 0.02608051337301731, "step": 2099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.42, "grad_norm": 1.4392926692962646, "kl": 0.25286889635026455, "learning_rate": 3.16771369260166e-06, "loss": 0.0253, "num_tokens": 13697936.0, "reward": 0.84014892578125, "reward_std": 0.012339215725660324, "rewards//mean": 0.84014892578125, "rewards//std": 0.020079178735613823, "step": 2100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4202, "grad_norm": 1.6008802652359009, "kl": 0.32890768721699715, "learning_rate": 3.1661845342250874e-06, "loss": 0.0329, "num_tokens": 13704352.0, "reward": 0.85870361328125, "reward_std": 0.00948946550488472, "rewards//mean": 0.85870361328125, "rewards//std": 0.015910016372799873, "step": 2101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4204, "grad_norm": 1.4424753189086914, "kl": 0.17100952751934528, "learning_rate": 3.1646551075094656e-06, "loss": 0.0171, "num_tokens": 13710848.0, "reward": 0.78924560546875, "reward_std": 0.011202545836567879, "rewards//mean": 0.78924560546875, "rewards//std": 0.024393824860453606, "step": 2102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4206, "grad_norm": 1.5895953178405762, "kl": 0.24572067894041538, "learning_rate": 3.1631254130708446e-06, "loss": 0.0246, "num_tokens": 13717408.0, "reward": 0.84619140625, "reward_std": 0.014255987480282784, "rewards//mean": 0.84619140625, "rewards//std": 0.028008848428726196, "step": 2103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4208, "grad_norm": 1.4043093919754028, "kl": 0.3761600125581026, "learning_rate": 3.161595451525388e-06, "loss": 0.0376, "num_tokens": 13723968.0, "reward": 0.827392578125, "reward_std": 0.021004393696784973, "rewards//mean": 0.827392578125, "rewards//std": 0.02657448872923851, "step": 2104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.421, "grad_norm": 1.8078935146331787, "kl": 0.31286709755659103, "learning_rate": 3.160065223489361e-06, "loss": 0.0313, "num_tokens": 13730600.0, "reward": 0.873779296875, "reward_std": 0.015432553365826607, "rewards//mean": 0.873779296875, "rewards//std": 0.021652817726135254, "step": 2105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4212, "grad_norm": 1.3489049673080444, "kl": 0.3047799728810787, "learning_rate": 3.158534729579142e-06, "loss": 0.0305, "num_tokens": 13737104.0, "reward": 0.85601806640625, "reward_std": 0.012817729264497757, "rewards//mean": 0.85601806640625, "rewards//std": 0.030535507947206497, "step": 2106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4214, "grad_norm": 1.3224866390228271, "kl": 0.24450539145618677, "learning_rate": 3.1570039704112127e-06, "loss": 0.0245, "num_tokens": 13743672.0, "reward": 0.84429931640625, "reward_std": 0.011430873535573483, "rewards//mean": 0.84429931640625, "rewards//std": 0.033217653632164, "step": 2107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4216, "grad_norm": 1.4761995077133179, "kl": 0.2610027426853776, "learning_rate": 3.155472946602162e-06, "loss": 0.0261, "num_tokens": 13750216.0, "reward": 0.8363037109375, "reward_std": 0.010564275085926056, "rewards//mean": 0.8363037109375, "rewards//std": 0.02053731679916382, "step": 2108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4218, "grad_norm": 2.0035955905914307, "kl": 0.22906195558607578, "learning_rate": 3.1539416587686876e-06, "loss": 0.0229, "num_tokens": 13756816.0, "reward": 0.83941650390625, "reward_std": 0.011643676087260246, "rewards//mean": 0.83941650390625, "rewards//std": 0.01935207098722458, "step": 2109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.422, "grad_norm": 2.0968916416168213, "kl": 0.31746518053114414, "learning_rate": 3.15241010752759e-06, "loss": 0.0317, "num_tokens": 13763288.0, "reward": 0.7811279296875, "reward_std": 0.012517087161540985, "rewards//mean": 0.7811279296875, "rewards//std": 0.027533257380127907, "step": 2110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4222, "grad_norm": 1.7464072704315186, "kl": 0.2873392151668668, "learning_rate": 3.1508782934957804e-06, "loss": 0.0287, "num_tokens": 13769832.0, "reward": 0.88665771484375, "reward_std": 0.010747982189059258, "rewards//mean": 0.88665771484375, "rewards//std": 0.01647743210196495, "step": 2111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4224, "grad_norm": 1.6001224517822266, "kl": 0.26417588349431753, "learning_rate": 3.1493462172902713e-06, "loss": 0.0264, "num_tokens": 13776328.0, "reward": 0.85589599609375, "reward_std": 0.013890923000872135, "rewards//mean": 0.85589599609375, "rewards//std": 0.0293154064565897, "step": 2112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4226, "grad_norm": 1.4509334564208984, "kl": 0.28111970983445644, "learning_rate": 3.147813879528184e-06, "loss": 0.0281, "num_tokens": 13782792.0, "reward": 0.81549072265625, "reward_std": 0.00917040929198265, "rewards//mean": 0.81549072265625, "rewards//std": 0.021290352568030357, "step": 2113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4228, "grad_norm": 1.4758511781692505, "kl": 0.2716400036588311, "learning_rate": 3.146281280826743e-06, "loss": 0.0272, "num_tokens": 13789352.0, "reward": 0.88507080078125, "reward_std": 0.014986925758421421, "rewards//mean": 0.88507080078125, "rewards//std": 0.03968871012330055, "step": 2114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.423, "grad_norm": 2.834211826324463, "kl": 0.42628304101526737, "learning_rate": 3.14474842180328e-06, "loss": 0.0426, "num_tokens": 13795960.0, "reward": 0.81976318359375, "reward_std": 0.011750428937375546, "rewards//mean": 0.81976318359375, "rewards//std": 0.026683563366532326, "step": 2115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4232, "grad_norm": 2.326777219772339, "kl": 0.22385219857096672, "learning_rate": 3.143215303075229e-06, "loss": 0.0224, "num_tokens": 13802480.0, "reward": 0.8594970703125, "reward_std": 0.0116764847189188, "rewards//mean": 0.8594970703125, "rewards//std": 0.016804570332169533, "step": 2116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4234, "grad_norm": 1.3091679811477661, "kl": 0.34994947351515293, "learning_rate": 3.1416819252601318e-06, "loss": 0.035, "num_tokens": 13809008.0, "reward": 0.8375244140625, "reward_std": 0.015554312616586685, "rewards//mean": 0.8375244140625, "rewards//std": 0.026380963623523712, "step": 2117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4236, "grad_norm": 1.6539829969406128, "kl": 0.2981468290090561, "learning_rate": 3.140148288975631e-06, "loss": 0.0298, "num_tokens": 13815544.0, "reward": 0.80841064453125, "reward_std": 0.009314393624663353, "rewards//mean": 0.80841064453125, "rewards//std": 0.02383132465183735, "step": 2118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4238, "grad_norm": 2.689807653427124, "kl": 0.3918081447482109, "learning_rate": 3.1386143948394764e-06, "loss": 0.0392, "num_tokens": 13822136.0, "reward": 0.7828369140625, "reward_std": 0.011183375492691994, "rewards//mean": 0.7828369140625, "rewards//std": 0.019183648750185966, "step": 2119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.424, "grad_norm": 1.2850412130355835, "kl": 0.17867718264460564, "learning_rate": 3.13708024346952e-06, "loss": 0.0179, "num_tokens": 13828680.0, "reward": 0.85589599609375, "reward_std": 0.00867018848657608, "rewards//mean": 0.85589599609375, "rewards//std": 0.024221321567893028, "step": 2120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4242, "grad_norm": 2.148141860961914, "kl": 0.23242129106074572, "learning_rate": 3.1355458354837183e-06, "loss": 0.0232, "num_tokens": 13835256.0, "reward": 0.8653564453125, "reward_std": 0.0167667455971241, "rewards//mean": 0.8653564453125, "rewards//std": 0.03109436109662056, "step": 2121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4244, "grad_norm": 2.464419364929199, "kl": 0.34251460433006287, "learning_rate": 3.134011171500129e-06, "loss": 0.0343, "num_tokens": 13841696.0, "reward": 0.82611083984375, "reward_std": 0.010273282416164875, "rewards//mean": 0.82611083984375, "rewards//std": 0.02480306103825569, "step": 2122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4246, "grad_norm": 1.6713422536849976, "kl": 0.30788818560540676, "learning_rate": 3.1324762521369156e-06, "loss": 0.0308, "num_tokens": 13848232.0, "reward": 0.86090087890625, "reward_std": 0.013405179604887962, "rewards//mean": 0.86090087890625, "rewards//std": 0.026997141540050507, "step": 2123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4248, "grad_norm": 1.5575813055038452, "kl": 0.24563703685998917, "learning_rate": 3.130941078012344e-06, "loss": 0.0246, "num_tokens": 13854848.0, "reward": 0.8245849609375, "reward_std": 0.014153433963656425, "rewards//mean": 0.8245849609375, "rewards//std": 0.030327394604682922, "step": 2124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.425, "grad_norm": 1.7807587385177612, "kl": 0.31352563574910164, "learning_rate": 3.1294056497447818e-06, "loss": 0.0314, "num_tokens": 13861408.0, "reward": 0.85980224609375, "reward_std": 0.014478510245680809, "rewards//mean": 0.85980224609375, "rewards//std": 0.03420376405119896, "step": 2125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4252, "grad_norm": 1.5111557245254517, "kl": 0.30207211896777153, "learning_rate": 3.127869967952698e-06, "loss": 0.0302, "num_tokens": 13867856.0, "reward": 0.84075927734375, "reward_std": 0.009212872013449669, "rewards//mean": 0.84075927734375, "rewards//std": 0.020702039822936058, "step": 2126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4254, "grad_norm": 2.4485764503479004, "kl": 0.2635139934718609, "learning_rate": 3.126334033254668e-06, "loss": 0.0264, "num_tokens": 13874416.0, "reward": 0.86248779296875, "reward_std": 0.013469710946083069, "rewards//mean": 0.86248779296875, "rewards//std": 0.03006640449166298, "step": 2127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4256, "grad_norm": 1.3360071182250977, "kl": 0.39939786680042744, "learning_rate": 3.124797846269363e-06, "loss": 0.0399, "num_tokens": 13880920.0, "reward": 0.8001708984375, "reward_std": 0.016106948256492615, "rewards//mean": 0.8001708984375, "rewards//std": 0.025555534288287163, "step": 2128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4258, "grad_norm": 1.5685603618621826, "kl": 0.3719202559441328, "learning_rate": 3.1232614076155605e-06, "loss": 0.0372, "num_tokens": 13887440.0, "reward": 0.865478515625, "reward_std": 0.012503368780016899, "rewards//mean": 0.865478515625, "rewards//std": 0.017973264679312706, "step": 2129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.426, "grad_norm": 1.758812665939331, "kl": 0.4209972769021988, "learning_rate": 3.121724717912138e-06, "loss": 0.0421, "num_tokens": 13893864.0, "reward": 0.82049560546875, "reward_std": 0.011843436397612095, "rewards//mean": 0.82049560546875, "rewards//std": 0.019480708986520767, "step": 2130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4262, "grad_norm": 1.6792635917663574, "kl": 0.38652216270565987, "learning_rate": 3.1201877777780724e-06, "loss": 0.0387, "num_tokens": 13900344.0, "reward": 0.764404296875, "reward_std": 0.009559590369462967, "rewards//mean": 0.764404296875, "rewards//std": 0.012906398624181747, "step": 2131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4264, "grad_norm": 1.8772845268249512, "kl": 0.27993398159742355, "learning_rate": 3.1186505878324452e-06, "loss": 0.028, "num_tokens": 13906752.0, "reward": 0.80841064453125, "reward_std": 0.010026413016021252, "rewards//mean": 0.80841064453125, "rewards//std": 0.022898372262716293, "step": 2132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4266, "grad_norm": 1.7502858638763428, "kl": 0.38925628550350666, "learning_rate": 3.1171131486944345e-06, "loss": 0.0389, "num_tokens": 13913312.0, "reward": 0.8450927734375, "reward_std": 0.013456946238875389, "rewards//mean": 0.8450927734375, "rewards//std": 0.025057855993509293, "step": 2133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4268, "grad_norm": 1.4037811756134033, "kl": 0.3435385674238205, "learning_rate": 3.1155754609833216e-06, "loss": 0.0344, "num_tokens": 13919904.0, "reward": 0.8585205078125, "reward_std": 0.016124019399285316, "rewards//mean": 0.8585205078125, "rewards//std": 0.036936625838279724, "step": 2134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.427, "grad_norm": 1.5913851261138916, "kl": 0.2764491084963083, "learning_rate": 3.1140375253184863e-06, "loss": 0.0276, "num_tokens": 13926280.0, "reward": 0.8275146484375, "reward_std": 0.01105770468711853, "rewards//mean": 0.8275146484375, "rewards//std": 0.02622903883457184, "step": 2135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4272, "grad_norm": 2.057889699935913, "kl": 0.2817895682528615, "learning_rate": 3.11249934231941e-06, "loss": 0.0282, "num_tokens": 13932792.0, "reward": 0.85009765625, "reward_std": 0.017210427671670914, "rewards//mean": 0.85009765625, "rewards//std": 0.036594267934560776, "step": 2136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4274, "grad_norm": 1.7748407125473022, "kl": 0.18714800477027893, "learning_rate": 3.110960912605671e-06, "loss": 0.0187, "num_tokens": 13939384.0, "reward": 0.8626708984375, "reward_std": 0.012404970824718475, "rewards//mean": 0.8626708984375, "rewards//std": 0.02664133720099926, "step": 2137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4276, "grad_norm": 1.5337979793548584, "kl": 0.3879495169967413, "learning_rate": 3.10942223679695e-06, "loss": 0.0388, "num_tokens": 13945920.0, "reward": 0.843505859375, "reward_std": 0.013442842289805412, "rewards//mean": 0.843505859375, "rewards//std": 0.023165950551629066, "step": 2138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4278, "grad_norm": 1.563681721687317, "kl": 0.3264656402170658, "learning_rate": 3.1078833155130246e-06, "loss": 0.0326, "num_tokens": 13952432.0, "reward": 0.8734130859375, "reward_std": 0.013773305341601372, "rewards//mean": 0.8734130859375, "rewards//std": 0.02054910734295845, "step": 2139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.428, "grad_norm": 1.558983325958252, "kl": 0.20966578647494316, "learning_rate": 3.106344149373773e-06, "loss": 0.021, "num_tokens": 13959024.0, "reward": 0.864013671875, "reward_std": 0.014885405078530312, "rewards//mean": 0.864013671875, "rewards//std": 0.027716554701328278, "step": 2140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4282, "grad_norm": 1.5087130069732666, "kl": 0.31845899671316147, "learning_rate": 3.1048047389991693e-06, "loss": 0.0318, "num_tokens": 13965584.0, "reward": 0.8709716796875, "reward_std": 0.013624060899019241, "rewards//mean": 0.8709716796875, "rewards//std": 0.024832110852003098, "step": 2141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4284, "grad_norm": 1.8251286745071411, "kl": 0.2186506288126111, "learning_rate": 3.1032650850092906e-06, "loss": 0.0219, "num_tokens": 13972184.0, "reward": 0.88433837890625, "reward_std": 0.012737477198243141, "rewards//mean": 0.88433837890625, "rewards//std": 0.0409727469086647, "step": 2142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4286, "grad_norm": 2.15902042388916, "kl": 0.26439279690384865, "learning_rate": 3.101725188024307e-06, "loss": 0.0264, "num_tokens": 13978688.0, "reward": 0.86566162109375, "reward_std": 0.013225117698311806, "rewards//mean": 0.86566162109375, "rewards//std": 0.03083348087966442, "step": 2143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4288, "grad_norm": 2.2630209922790527, "kl": 0.19989563897252083, "learning_rate": 3.1001850486644893e-06, "loss": 0.02, "num_tokens": 13985096.0, "reward": 0.86431884765625, "reward_std": 0.013283808715641499, "rewards//mean": 0.86431884765625, "rewards//std": 0.03006690926849842, "step": 2144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.429, "grad_norm": 2.8927502632141113, "kl": 0.40046823769807816, "learning_rate": 3.098644667550206e-06, "loss": 0.04, "num_tokens": 13991648.0, "reward": 0.8740234375, "reward_std": 0.012530426494777203, "rewards//mean": 0.8740234375, "rewards//std": 0.026153037324547768, "step": 2145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4292, "grad_norm": 1.97957444190979, "kl": 0.23854135535657406, "learning_rate": 3.0971040453019225e-06, "loss": 0.0239, "num_tokens": 13998112.0, "reward": 0.86309814453125, "reward_std": 0.010485764592885971, "rewards//mean": 0.86309814453125, "rewards//std": 0.01856396533548832, "step": 2146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4294, "grad_norm": 1.7374829053878784, "kl": 0.2616093959659338, "learning_rate": 3.095563182540201e-06, "loss": 0.0262, "num_tokens": 14004624.0, "reward": 0.837646484375, "reward_std": 0.01182644348591566, "rewards//mean": 0.837646484375, "rewards//std": 0.023363754153251648, "step": 2147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4296, "grad_norm": 1.678256630897522, "kl": 0.3142957352101803, "learning_rate": 3.0940220798857e-06, "loss": 0.0314, "num_tokens": 14011040.0, "reward": 0.83953857421875, "reward_std": 0.012463623657822609, "rewards//mean": 0.83953857421875, "rewards//std": 0.01793859526515007, "step": 2148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4298, "grad_norm": 1.711432933807373, "kl": 0.42819410376250744, "learning_rate": 3.0924807379591775e-06, "loss": 0.0428, "num_tokens": 14017760.0, "reward": 0.83734130859375, "reward_std": 0.013482763431966305, "rewards//mean": 0.83734130859375, "rewards//std": 0.03854045644402504, "step": 2149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.43, "grad_norm": 1.9702438116073608, "kl": 0.31508114002645016, "learning_rate": 3.090939157381484e-06, "loss": 0.0315, "num_tokens": 14024240.0, "reward": 0.83770751953125, "reward_std": 0.013504298403859138, "rewards//mean": 0.83770751953125, "rewards//std": 0.03800611197948456, "step": 2150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4302, "grad_norm": 2.4760539531707764, "kl": 0.24360336177051067, "learning_rate": 3.089397338773569e-06, "loss": 0.0244, "num_tokens": 14030808.0, "reward": 0.88494873046875, "reward_std": 0.017287708818912506, "rewards//mean": 0.88494873046875, "rewards//std": 0.02974039874970913, "step": 2151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4304, "grad_norm": 1.3248170614242554, "kl": 0.1667246287688613, "learning_rate": 3.087855282756475e-06, "loss": 0.0167, "num_tokens": 14037360.0, "reward": 0.86029052734375, "reward_std": 0.012907970696687698, "rewards//mean": 0.86029052734375, "rewards//std": 0.02221451885998249, "step": 2152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4306, "grad_norm": 1.5107572078704834, "kl": 0.36442319210618734, "learning_rate": 3.086312989951345e-06, "loss": 0.0364, "num_tokens": 14043856.0, "reward": 0.827880859375, "reward_std": 0.011507580056786537, "rewards//mean": 0.827880859375, "rewards//std": 0.016614779829978943, "step": 2153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4308, "grad_norm": 1.542646050453186, "kl": 0.28487165085971355, "learning_rate": 3.0847704609794116e-06, "loss": 0.0285, "num_tokens": 14050320.0, "reward": 0.8385009765625, "reward_std": 0.015429973602294922, "rewards//mean": 0.8385009765625, "rewards//std": 0.024873530492186546, "step": 2154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.431, "grad_norm": 1.827907681465149, "kl": 0.22492705145850778, "learning_rate": 3.0832276964620074e-06, "loss": 0.0225, "num_tokens": 14056832.0, "reward": 0.86822509765625, "reward_std": 0.012362740933895111, "rewards//mean": 0.86822509765625, "rewards//std": 0.03658594191074371, "step": 2155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4312, "grad_norm": 2.020228147506714, "kl": 0.2773941121995449, "learning_rate": 3.081684697020556e-06, "loss": 0.0277, "num_tokens": 14063432.0, "reward": 0.8291015625, "reward_std": 0.013643607497215271, "rewards//mean": 0.8291015625, "rewards//std": 0.03377537801861763, "step": 2156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4314, "grad_norm": 3.0724129676818848, "kl": 0.3555173669010401, "learning_rate": 3.0801414632765792e-06, "loss": 0.0356, "num_tokens": 14069968.0, "reward": 0.82464599609375, "reward_std": 0.009953189641237259, "rewards//mean": 0.82464599609375, "rewards//std": 0.013799577951431274, "step": 2157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4316, "grad_norm": 1.372650384902954, "kl": 0.3189261872321367, "learning_rate": 3.078597995851689e-06, "loss": 0.0319, "num_tokens": 14076440.0, "reward": 0.84893798828125, "reward_std": 0.00932818278670311, "rewards//mean": 0.84893798828125, "rewards//std": 0.01950090378522873, "step": 2158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4318, "grad_norm": 2.3562371730804443, "kl": 0.2914155311882496, "learning_rate": 3.0770542953675963e-06, "loss": 0.0291, "num_tokens": 14082952.0, "reward": 0.84283447265625, "reward_std": 0.013126096688210964, "rewards//mean": 0.84283447265625, "rewards//std": 0.02603927068412304, "step": 2159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.432, "grad_norm": 2.4705958366394043, "kl": 0.3675695061683655, "learning_rate": 3.0755103624461023e-06, "loss": 0.0368, "num_tokens": 14089496.0, "reward": 0.8187255859375, "reward_std": 0.01403226051479578, "rewards//mean": 0.8187255859375, "rewards//std": 0.02662769705057144, "step": 2160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4322, "grad_norm": 2.279218912124634, "kl": 0.4487595334649086, "learning_rate": 3.0739661977091027e-06, "loss": 0.0449, "num_tokens": 14096032.0, "reward": 0.80859375, "reward_std": 0.010604945942759514, "rewards//mean": 0.80859375, "rewards//std": 0.02041051536798477, "step": 2161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4324, "grad_norm": 2.5013363361358643, "kl": 0.4397276248782873, "learning_rate": 3.072421801778588e-06, "loss": 0.044, "num_tokens": 14102504.0, "reward": 0.86199951171875, "reward_std": 0.015753978863358498, "rewards//mean": 0.86199951171875, "rewards//std": 0.037776801735162735, "step": 2162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4326, "grad_norm": 3.0332248210906982, "kl": 0.31096627190709114, "learning_rate": 3.0708771752766397e-06, "loss": 0.0311, "num_tokens": 14108984.0, "reward": 0.8740234375, "reward_std": 0.009036701172590256, "rewards//mean": 0.8740234375, "rewards//std": 0.021854618564248085, "step": 2163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4328, "grad_norm": 1.9653762578964233, "kl": 0.42729904502630234, "learning_rate": 3.0693323188254333e-06, "loss": 0.0427, "num_tokens": 14115488.0, "reward": 0.8109130859375, "reward_std": 0.010579624213278294, "rewards//mean": 0.8109130859375, "rewards//std": 0.02218152955174446, "step": 2164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.433, "grad_norm": 1.6814950704574585, "kl": 0.31860833056271076, "learning_rate": 3.0677872330472376e-06, "loss": 0.0319, "num_tokens": 14122048.0, "reward": 0.83392333984375, "reward_std": 0.010468876920640469, "rewards//mean": 0.83392333984375, "rewards//std": 0.02861192636191845, "step": 2165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4332, "grad_norm": 1.656169056892395, "kl": 0.3427421981468797, "learning_rate": 3.0662419185644117e-06, "loss": 0.0343, "num_tokens": 14128600.0, "reward": 0.8685302734375, "reward_std": 0.01563132368028164, "rewards//mean": 0.8685302734375, "rewards//std": 0.03219836950302124, "step": 2166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4334, "grad_norm": 1.480324387550354, "kl": 0.18083972297608852, "learning_rate": 3.064696375999409e-06, "loss": 0.0181, "num_tokens": 14135152.0, "reward": 0.8621826171875, "reward_std": 0.012180369347333908, "rewards//mean": 0.8621826171875, "rewards//std": 0.01572495326399803, "step": 2167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4336, "grad_norm": 2.0552866458892822, "kl": 0.2552765514701605, "learning_rate": 3.0631506059747727e-06, "loss": 0.0255, "num_tokens": 14141688.0, "reward": 0.8818359375, "reward_std": 0.010299960151314735, "rewards//mean": 0.8818359375, "rewards//std": 0.01840108260512352, "step": 2168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4338, "grad_norm": 2.3517467975616455, "kl": 0.18299156054854393, "learning_rate": 3.061604609113141e-06, "loss": 0.0183, "num_tokens": 14148256.0, "reward": 0.8768310546875, "reward_std": 0.010893914848566055, "rewards//mean": 0.8768310546875, "rewards//std": 0.01568640023469925, "step": 2169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.434, "grad_norm": 2.8943934440612793, "kl": 0.22401544731110334, "learning_rate": 3.060058386037239e-06, "loss": 0.0224, "num_tokens": 14154808.0, "reward": 0.84747314453125, "reward_std": 0.014472988434135914, "rewards//mean": 0.84747314453125, "rewards//std": 0.026950562372803688, "step": 2170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4342, "grad_norm": 1.579474687576294, "kl": 0.32400694489479065, "learning_rate": 3.0585119373698858e-06, "loss": 0.0324, "num_tokens": 14161256.0, "reward": 0.8704833984375, "reward_std": 0.012072117999196053, "rewards//mean": 0.8704833984375, "rewards//std": 0.024589523673057556, "step": 2171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4344, "grad_norm": 2.6347146034240723, "kl": 0.33904603123664856, "learning_rate": 3.0569652637339917e-06, "loss": 0.0339, "num_tokens": 14167736.0, "reward": 0.84783935546875, "reward_std": 0.01612505316734314, "rewards//mean": 0.84783935546875, "rewards//std": 0.03268521651625633, "step": 2172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4346, "grad_norm": 1.485530138015747, "kl": 0.25789869390428066, "learning_rate": 3.055418365752556e-06, "loss": 0.0258, "num_tokens": 14174208.0, "reward": 0.86566162109375, "reward_std": 0.008784761652350426, "rewards//mean": 0.86566162109375, "rewards//std": 0.0230584554374218, "step": 2173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4348, "grad_norm": 1.4034143686294556, "kl": 0.23824886605143547, "learning_rate": 3.053871244048669e-06, "loss": 0.0238, "num_tokens": 14180792.0, "reward": 0.85821533203125, "reward_std": 0.009220361709594727, "rewards//mean": 0.85821533203125, "rewards//std": 0.022376788780093193, "step": 2174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.435, "grad_norm": 1.470288872718811, "kl": 0.3579995520412922, "learning_rate": 3.0523238992455108e-06, "loss": 0.0358, "num_tokens": 14187288.0, "reward": 0.84576416015625, "reward_std": 0.017460890114307404, "rewards//mean": 0.84576416015625, "rewards//std": 0.031265437602996826, "step": 2175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4352, "grad_norm": 1.5897024869918823, "kl": 0.26855282858014107, "learning_rate": 3.050776331966352e-06, "loss": 0.0269, "num_tokens": 14193728.0, "reward": 0.8184814453125, "reward_std": 0.012674350291490555, "rewards//mean": 0.8184814453125, "rewards//std": 0.019300080835819244, "step": 2176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4354, "grad_norm": 1.6211918592453003, "kl": 0.2820090651512146, "learning_rate": 3.0492285428345523e-06, "loss": 0.0282, "num_tokens": 14200280.0, "reward": 0.8328857421875, "reward_std": 0.015255695208907127, "rewards//mean": 0.8328857421875, "rewards//std": 0.01999654993414879, "step": 2177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4356, "grad_norm": 2.0101318359375, "kl": 0.1882610497996211, "learning_rate": 3.047680532473562e-06, "loss": 0.0188, "num_tokens": 14207056.0, "reward": 0.8568115234375, "reward_std": 0.014643016271293163, "rewards//mean": 0.8568115234375, "rewards//std": 0.029913272708654404, "step": 2178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4358, "grad_norm": 1.7630410194396973, "kl": 0.2706904448568821, "learning_rate": 3.0461323015069184e-06, "loss": 0.0271, "num_tokens": 14213536.0, "reward": 0.845947265625, "reward_std": 0.011161372996866703, "rewards//mean": 0.845947265625, "rewards//std": 0.0174259003251791, "step": 2179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.436, "grad_norm": 1.3942729234695435, "kl": 0.24558944907039404, "learning_rate": 3.044583850558249e-06, "loss": 0.0246, "num_tokens": 14219992.0, "reward": 0.8701171875, "reward_std": 0.012805221602320671, "rewards//mean": 0.8701171875, "rewards//std": 0.030504819005727768, "step": 2180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4362, "grad_norm": 2.0173428058624268, "kl": 0.21347941365092993, "learning_rate": 3.04303518025127e-06, "loss": 0.0213, "num_tokens": 14226528.0, "reward": 0.85931396484375, "reward_std": 0.01155479159206152, "rewards//mean": 0.85931396484375, "rewards//std": 0.023525094613432884, "step": 2181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4364, "grad_norm": 1.7359927892684937, "kl": 0.22651927918195724, "learning_rate": 3.0414862912097846e-06, "loss": 0.0227, "num_tokens": 14233120.0, "reward": 0.75897216796875, "reward_std": 0.009247688576579094, "rewards//mean": 0.75897216796875, "rewards//std": 0.01742318645119667, "step": 2182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4366, "grad_norm": 1.591886043548584, "kl": 0.3197274524718523, "learning_rate": 3.039937184057687e-06, "loss": 0.032, "num_tokens": 14239656.0, "reward": 0.855224609375, "reward_std": 0.011472209356725216, "rewards//mean": 0.855224609375, "rewards//std": 0.026747096329927444, "step": 2183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4368, "grad_norm": 2.1276378631591797, "kl": 0.24722513277083635, "learning_rate": 3.0383878594189553e-06, "loss": 0.0247, "num_tokens": 14246280.0, "reward": 0.864013671875, "reward_std": 0.016147062182426453, "rewards//mean": 0.864013671875, "rewards//std": 0.03239826112985611, "step": 2184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.437, "grad_norm": 1.7047417163848877, "kl": 0.3075410323217511, "learning_rate": 3.0368383179176584e-06, "loss": 0.0308, "num_tokens": 14252824.0, "reward": 0.76104736328125, "reward_std": 0.009549632668495178, "rewards//mean": 0.76104736328125, "rewards//std": 0.018263079226017, "step": 2185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4372, "grad_norm": 1.4994796514511108, "kl": 0.29973055236041546, "learning_rate": 3.0352885601779514e-06, "loss": 0.03, "num_tokens": 14259384.0, "reward": 0.85302734375, "reward_std": 0.010325837880373001, "rewards//mean": 0.85302734375, "rewards//std": 0.02676180750131607, "step": 2186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4374, "grad_norm": 2.4391119480133057, "kl": 0.262607017531991, "learning_rate": 3.0337385868240765e-06, "loss": 0.0263, "num_tokens": 14265960.0, "reward": 0.8836669921875, "reward_std": 0.010698621161282063, "rewards//mean": 0.8836669921875, "rewards//std": 0.028463203459978104, "step": 2187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4376, "grad_norm": 1.7519035339355469, "kl": 0.2756522987037897, "learning_rate": 3.0321883984803617e-06, "loss": 0.0276, "num_tokens": 14272448.0, "reward": 0.83245849609375, "reward_std": 0.009290201589465141, "rewards//mean": 0.83245849609375, "rewards//std": 0.021294619888067245, "step": 2188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4378, "grad_norm": 1.7906337976455688, "kl": 0.24486299883574247, "learning_rate": 3.030637995771225e-06, "loss": 0.0245, "num_tokens": 14278952.0, "reward": 0.80462646484375, "reward_std": 0.011783318594098091, "rewards//mean": 0.80462646484375, "rewards//std": 0.025348784402012825, "step": 2189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.438, "grad_norm": 1.5803560018539429, "kl": 0.3154057525098324, "learning_rate": 3.029087379321166e-06, "loss": 0.0315, "num_tokens": 14285472.0, "reward": 0.83026123046875, "reward_std": 0.008977801539003849, "rewards//mean": 0.83026123046875, "rewards//std": 0.0156239103525877, "step": 2190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4382, "grad_norm": 1.5623222589492798, "kl": 0.41956206783652306, "learning_rate": 3.0275365497547747e-06, "loss": 0.042, "num_tokens": 14292016.0, "reward": 0.84002685546875, "reward_std": 0.009899454191327095, "rewards//mean": 0.84002685546875, "rewards//std": 0.023431606590747833, "step": 2191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4384, "grad_norm": 2.3592681884765625, "kl": 0.4376298524439335, "learning_rate": 3.0259855076967235e-06, "loss": 0.0438, "num_tokens": 14298552.0, "reward": 0.82159423828125, "reward_std": 0.0132528617978096, "rewards//mean": 0.82159423828125, "rewards//std": 0.02385798841714859, "step": 2192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4386, "grad_norm": 1.9205858707427979, "kl": 0.4869076795876026, "learning_rate": 3.0244342537717735e-06, "loss": 0.0487, "num_tokens": 14305136.0, "reward": 0.84222412109375, "reward_std": 0.009756450541317463, "rewards//mean": 0.84222412109375, "rewards//std": 0.025502389296889305, "step": 2193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4388, "grad_norm": 1.614143967628479, "kl": 0.20401273341849446, "learning_rate": 3.022882788604768e-06, "loss": 0.0204, "num_tokens": 14311640.0, "reward": 0.86761474609375, "reward_std": 0.009398632682859898, "rewards//mean": 0.86761474609375, "rewards//std": 0.016734499484300613, "step": 2194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.439, "grad_norm": 1.3239657878875732, "kl": 0.2172070126980543, "learning_rate": 3.0213311128206385e-06, "loss": 0.0217, "num_tokens": 14318208.0, "reward": 0.77850341796875, "reward_std": 0.011279908940196037, "rewards//mean": 0.77850341796875, "rewards//std": 0.01754785142838955, "step": 2195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4392, "grad_norm": 1.8925012350082397, "kl": 0.24171410873532295, "learning_rate": 3.019779227044398e-06, "loss": 0.0242, "num_tokens": 14324656.0, "reward": 0.85675048828125, "reward_std": 0.01214037649333477, "rewards//mean": 0.85675048828125, "rewards//std": 0.02798769436776638, "step": 2196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4394, "grad_norm": 1.7848438024520874, "kl": 0.4073919504880905, "learning_rate": 3.0182271319011486e-06, "loss": 0.0407, "num_tokens": 14331176.0, "reward": 0.83843994140625, "reward_std": 0.016687525436282158, "rewards//mean": 0.83843994140625, "rewards//std": 0.037469808012247086, "step": 2197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4396, "grad_norm": 1.604193091392517, "kl": 0.47554923221468925, "learning_rate": 3.0166748280160716e-06, "loss": 0.0476, "num_tokens": 14337696.0, "reward": 0.84979248046875, "reward_std": 0.014256896451115608, "rewards//mean": 0.84979248046875, "rewards//std": 0.023652799427509308, "step": 2198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4398, "grad_norm": 1.8840214014053345, "kl": 0.40426860097795725, "learning_rate": 3.0151223160144353e-06, "loss": 0.0404, "num_tokens": 14344224.0, "reward": 0.84619140625, "reward_std": 0.017747381702065468, "rewards//mean": 0.84619140625, "rewards//std": 0.030421337112784386, "step": 2199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.44, "grad_norm": 2.2957606315612793, "kl": 0.30617956444621086, "learning_rate": 3.0135695965215924e-06, "loss": 0.0306, "num_tokens": 14350800.0, "reward": 0.88861083984375, "reward_std": 0.012981479056179523, "rewards//mean": 0.88861083984375, "rewards//std": 0.03199290856719017, "step": 2200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4402, "grad_norm": 2.006725788116455, "kl": 0.2555235577747226, "learning_rate": 3.012016670162977e-06, "loss": 0.0256, "num_tokens": 14357320.0, "reward": 0.77581787109375, "reward_std": 0.00787123292684555, "rewards//mean": 0.77581787109375, "rewards//std": 0.017505530267953873, "step": 2201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4404, "grad_norm": 1.6151883602142334, "kl": 0.3507488239556551, "learning_rate": 3.0104635375641083e-06, "loss": 0.0351, "num_tokens": 14363864.0, "reward": 0.8331298828125, "reward_std": 0.013516631908714771, "rewards//mean": 0.8331298828125, "rewards//std": 0.031004654243588448, "step": 2202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4406, "grad_norm": 2.5642285346984863, "kl": 0.39766893722116947, "learning_rate": 3.0089101993505877e-06, "loss": 0.0398, "num_tokens": 14370424.0, "reward": 0.8397216796875, "reward_std": 0.01602315530180931, "rewards//mean": 0.8397216796875, "rewards//std": 0.026143483817577362, "step": 2203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4408, "grad_norm": 1.5073381662368774, "kl": 0.24660158902406693, "learning_rate": 3.007356656148099e-06, "loss": 0.0247, "num_tokens": 14376984.0, "reward": 0.86712646484375, "reward_std": 0.010578399524092674, "rewards//mean": 0.86712646484375, "rewards//std": 0.031423844397068024, "step": 2204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.441, "grad_norm": 1.8950413465499878, "kl": 0.4511853065341711, "learning_rate": 3.005802908582411e-06, "loss": 0.0451, "num_tokens": 14383408.0, "reward": 0.7923583984375, "reward_std": 0.010220017284154892, "rewards//mean": 0.7923583984375, "rewards//std": 0.01967603527009487, "step": 2205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4412, "grad_norm": 1.5232435464859009, "kl": 0.3234141804277897, "learning_rate": 3.0042489572793722e-06, "loss": 0.0323, "num_tokens": 14389968.0, "reward": 0.89019775390625, "reward_std": 0.020121948793530464, "rewards//mean": 0.89019775390625, "rewards//std": 0.03334319591522217, "step": 2206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4414, "grad_norm": 3.1079630851745605, "kl": 0.4958507977426052, "learning_rate": 3.002694802864912e-06, "loss": 0.0496, "num_tokens": 14396680.0, "reward": 0.84228515625, "reward_std": 0.016255352646112442, "rewards//mean": 0.84228515625, "rewards//std": 0.02557714469730854, "step": 2207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4416, "grad_norm": 1.5372247695922852, "kl": 0.319557779468596, "learning_rate": 3.0011404459650467e-06, "loss": 0.032, "num_tokens": 14403152.0, "reward": 0.8616943359375, "reward_std": 0.014440163969993591, "rewards//mean": 0.8616943359375, "rewards//std": 0.02053731679916382, "step": 2208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4418, "grad_norm": 4.121426582336426, "kl": 0.4293886311352253, "learning_rate": 2.9995858872058686e-06, "loss": 0.0429, "num_tokens": 14409728.0, "reward": 0.85357666015625, "reward_std": 0.014783421531319618, "rewards//mean": 0.85357666015625, "rewards//std": 0.035841889679431915, "step": 2209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.442, "grad_norm": 1.8765480518341064, "kl": 0.33298663701862097, "learning_rate": 2.998031127213556e-06, "loss": 0.0333, "num_tokens": 14416288.0, "reward": 0.875244140625, "reward_std": 0.012766627594828606, "rewards//mean": 0.875244140625, "rewards//std": 0.02662002108991146, "step": 2210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4422, "grad_norm": 1.9876054525375366, "kl": 0.3685247153043747, "learning_rate": 2.9964761666143638e-06, "loss": 0.0369, "num_tokens": 14422808.0, "reward": 0.8265380859375, "reward_std": 0.0162955392152071, "rewards//mean": 0.8265380859375, "rewards//std": 0.021049829199910164, "step": 2211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4424, "grad_norm": 1.9164963960647583, "kl": 0.38894451782107353, "learning_rate": 2.9949210060346323e-06, "loss": 0.0389, "num_tokens": 14429352.0, "reward": 0.86175537109375, "reward_std": 0.0180559940636158, "rewards//mean": 0.86175537109375, "rewards//std": 0.02414620853960514, "step": 2212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4426, "grad_norm": 1.830180287361145, "kl": 0.3133532302454114, "learning_rate": 2.9933656461007775e-06, "loss": 0.0313, "num_tokens": 14435840.0, "reward": 0.85125732421875, "reward_std": 0.012855417095124722, "rewards//mean": 0.85125732421875, "rewards//std": 0.023738402873277664, "step": 2213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4428, "grad_norm": 1.6213860511779785, "kl": 0.39351621083915234, "learning_rate": 2.9918100874393007e-06, "loss": 0.0394, "num_tokens": 14442264.0, "reward": 0.84375, "reward_std": 0.014166067354381084, "rewards//mean": 0.84375, "rewards//std": 0.0390625, "step": 2214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.443, "grad_norm": 1.8051910400390625, "kl": 0.2557607628405094, "learning_rate": 2.9902543306767786e-06, "loss": 0.0256, "num_tokens": 14448752.0, "reward": 0.8021240234375, "reward_std": 0.00795908272266388, "rewards//mean": 0.8021240234375, "rewards//std": 0.015108315274119377, "step": 2215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4432, "grad_norm": 1.9518810510635376, "kl": 0.26210843678563833, "learning_rate": 2.988698376439871e-06, "loss": 0.0262, "num_tokens": 14455200.0, "reward": 0.8592529296875, "reward_std": 0.01577504351735115, "rewards//mean": 0.8592529296875, "rewards//std": 0.02232573926448822, "step": 2216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4434, "grad_norm": 1.494371771812439, "kl": 0.21407626010477543, "learning_rate": 2.9871422253553154e-06, "loss": 0.0214, "num_tokens": 14461744.0, "reward": 0.84991455078125, "reward_std": 0.01413970347493887, "rewards//mean": 0.84991455078125, "rewards//std": 0.02802012860774994, "step": 2217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4436, "grad_norm": 1.5967673063278198, "kl": 0.278025371953845, "learning_rate": 2.98558587804993e-06, "loss": 0.0278, "num_tokens": 14468168.0, "reward": 0.83685302734375, "reward_std": 0.011657357215881348, "rewards//mean": 0.83685302734375, "rewards//std": 0.01864614337682724, "step": 2218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4438, "grad_norm": 1.9295581579208374, "kl": 0.34050197154283524, "learning_rate": 2.9840293351506113e-06, "loss": 0.0341, "num_tokens": 14474656.0, "reward": 0.8819580078125, "reward_std": 0.016042090952396393, "rewards//mean": 0.8819580078125, "rewards//std": 0.024204833433032036, "step": 2219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.444, "grad_norm": 1.6776727437973022, "kl": 0.2917546294629574, "learning_rate": 2.9824725972843344e-06, "loss": 0.0292, "num_tokens": 14481152.0, "reward": 0.84527587890625, "reward_std": 0.012526944279670715, "rewards//mean": 0.84527587890625, "rewards//std": 0.01846175454556942, "step": 2220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4442, "grad_norm": 1.882965326309204, "kl": 0.20281281834468246, "learning_rate": 2.980915665078153e-06, "loss": 0.0203, "num_tokens": 14487656.0, "reward": 0.8555908203125, "reward_std": 0.010586272925138474, "rewards//mean": 0.8555908203125, "rewards//std": 0.016557738184928894, "step": 2221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4444, "grad_norm": 1.3039051294326782, "kl": 0.28880898375064135, "learning_rate": 2.9793585391591984e-06, "loss": 0.0289, "num_tokens": 14494208.0, "reward": 0.82696533203125, "reward_std": 0.011577642522752285, "rewards//mean": 0.82696533203125, "rewards//std": 0.019321538507938385, "step": 2222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4446, "grad_norm": 2.179978609085083, "kl": 0.19057163782417774, "learning_rate": 2.9778012201546825e-06, "loss": 0.0191, "num_tokens": 14500752.0, "reward": 0.81689453125, "reward_std": 0.011199300177395344, "rewards//mean": 0.81689453125, "rewards//std": 0.020862378180027008, "step": 2223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4448, "grad_norm": 1.5030924081802368, "kl": 0.19148099049925804, "learning_rate": 2.976243708691891e-06, "loss": 0.0191, "num_tokens": 14507224.0, "reward": 0.8922119140625, "reward_std": 0.009759771637618542, "rewards//mean": 0.8922119140625, "rewards//std": 0.018697723746299744, "step": 2224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.445, "grad_norm": 1.4079809188842773, "kl": 0.23316137585788965, "learning_rate": 2.974686005398192e-06, "loss": 0.0233, "num_tokens": 14513728.0, "reward": 0.8511962890625, "reward_std": 0.01158439926803112, "rewards//mean": 0.8511962890625, "rewards//std": 0.020818432793021202, "step": 2225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4452, "grad_norm": 2.050485134124756, "kl": 0.29163285717368126, "learning_rate": 2.973128110901026e-06, "loss": 0.0292, "num_tokens": 14520264.0, "reward": 0.86029052734375, "reward_std": 0.013933689333498478, "rewards//mean": 0.86029052734375, "rewards//std": 0.01982959359884262, "step": 2226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4454, "grad_norm": 1.7337063550949097, "kl": 0.28068370558321476, "learning_rate": 2.9715700258279138e-06, "loss": 0.0281, "num_tokens": 14526752.0, "reward": 0.813232421875, "reward_std": 0.014671631157398224, "rewards//mean": 0.813232421875, "rewards//std": 0.02236807718873024, "step": 2227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4456, "grad_norm": 1.8769738674163818, "kl": 0.37867451272904873, "learning_rate": 2.970011750806451e-06, "loss": 0.0379, "num_tokens": 14533240.0, "reward": 0.78924560546875, "reward_std": 0.011825019493699074, "rewards//mean": 0.78924560546875, "rewards//std": 0.02491449937224388, "step": 2228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4458, "grad_norm": 1.4227157831192017, "kl": 0.27587628923356533, "learning_rate": 2.9684532864643123e-06, "loss": 0.0276, "num_tokens": 14539688.0, "reward": 0.79278564453125, "reward_std": 0.00960316602140665, "rewards//mean": 0.79278564453125, "rewards//std": 0.018354028463363647, "step": 2229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.446, "grad_norm": 1.5988483428955078, "kl": 0.4480137303471565, "learning_rate": 2.9668946334292448e-06, "loss": 0.0448, "num_tokens": 14546240.0, "reward": 0.85589599609375, "reward_std": 0.014239007607102394, "rewards//mean": 0.85589599609375, "rewards//std": 0.029863828793168068, "step": 2230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4462, "grad_norm": 1.9311983585357666, "kl": 0.44090455397963524, "learning_rate": 2.9653357923290753e-06, "loss": 0.0441, "num_tokens": 14552752.0, "reward": 0.82745361328125, "reward_std": 0.01407945342361927, "rewards//mean": 0.82745361328125, "rewards//std": 0.021047940477728844, "step": 2231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4464, "grad_norm": 1.835129976272583, "kl": 0.30922626703977585, "learning_rate": 2.9637767637917035e-06, "loss": 0.0309, "num_tokens": 14559264.0, "reward": 0.83099365234375, "reward_std": 0.013506224378943443, "rewards//mean": 0.83099365234375, "rewards//std": 0.016281504184007645, "step": 2232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4466, "grad_norm": 1.5437012910842896, "kl": 0.30611105635762215, "learning_rate": 2.962217548445108e-06, "loss": 0.0306, "num_tokens": 14565728.0, "reward": 0.85009765625, "reward_std": 0.013665108010172844, "rewards//mean": 0.85009765625, "rewards//std": 0.02771327830851078, "step": 2233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4468, "grad_norm": 1.5087783336639404, "kl": 0.37221899442374706, "learning_rate": 2.9606581469173383e-06, "loss": 0.0372, "num_tokens": 14572248.0, "reward": 0.828857421875, "reward_std": 0.012186943553388119, "rewards//mean": 0.828857421875, "rewards//std": 0.018841683864593506, "step": 2234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.447, "grad_norm": 2.915076971054077, "kl": 0.5696072839200497, "learning_rate": 2.959098559836523e-06, "loss": 0.057, "num_tokens": 14578816.0, "reward": 0.77227783203125, "reward_std": 0.009388620033860207, "rewards//mean": 0.77227783203125, "rewards//std": 0.017114628106355667, "step": 2235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4472, "grad_norm": 2.784641981124878, "kl": 0.3570241704583168, "learning_rate": 2.9575387878308617e-06, "loss": 0.0357, "num_tokens": 14585320.0, "reward": 0.8544921875, "reward_std": 0.014209440909326077, "rewards//mean": 0.8544921875, "rewards//std": 0.02909882739186287, "step": 2236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4474, "grad_norm": 1.8110226392745972, "kl": 0.37919948250055313, "learning_rate": 2.955978831528632e-06, "loss": 0.0379, "num_tokens": 14591952.0, "reward": 0.881591796875, "reward_std": 0.009335657581686974, "rewards//mean": 0.881591796875, "rewards//std": 0.020242206752300262, "step": 2237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4476, "grad_norm": 1.8311983346939087, "kl": 0.36984019074589014, "learning_rate": 2.9544186915581835e-06, "loss": 0.037, "num_tokens": 14598632.0, "reward": 0.841796875, "reward_std": 0.01759893074631691, "rewards//mean": 0.841796875, "rewards//std": 0.026346798986196518, "step": 2238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4478, "grad_norm": 1.383833885192871, "kl": 0.239517230540514, "learning_rate": 2.95285836854794e-06, "loss": 0.024, "num_tokens": 14605272.0, "reward": 0.85260009765625, "reward_std": 0.01806078851222992, "rewards//mean": 0.85260009765625, "rewards//std": 0.03191900998353958, "step": 2239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.448, "grad_norm": 2.9157514572143555, "kl": 0.27215602062642574, "learning_rate": 2.9512978631264006e-06, "loss": 0.0272, "num_tokens": 14611728.0, "reward": 0.83209228515625, "reward_std": 0.010033085942268372, "rewards//mean": 0.83209228515625, "rewards//std": 0.02503148838877678, "step": 2240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4482, "grad_norm": 2.9061686992645264, "kl": 0.21391806565225124, "learning_rate": 2.949737175922135e-06, "loss": 0.0214, "num_tokens": 14618336.0, "reward": 0.89007568359375, "reward_std": 0.014275779947638512, "rewards//mean": 0.89007568359375, "rewards//std": 0.01790565438568592, "step": 2241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4484, "grad_norm": 2.2548840045928955, "kl": 0.2845702338963747, "learning_rate": 2.948176307563789e-06, "loss": 0.0285, "num_tokens": 14624944.0, "reward": 0.8350830078125, "reward_std": 0.015707073733210564, "rewards//mean": 0.8350830078125, "rewards//std": 0.026231348514556885, "step": 2242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4486, "grad_norm": 2.592463254928589, "kl": 0.30520729161798954, "learning_rate": 2.94661525868008e-06, "loss": 0.0305, "num_tokens": 14631560.0, "reward": 0.8563232421875, "reward_std": 0.014344178140163422, "rewards//mean": 0.8563232421875, "rewards//std": 0.033031947910785675, "step": 2243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4488, "grad_norm": 3.0242691040039062, "kl": 0.5152493566274643, "learning_rate": 2.945054029899798e-06, "loss": 0.0515, "num_tokens": 14638064.0, "reward": 0.84759521484375, "reward_std": 0.011402152478694916, "rewards//mean": 0.84759521484375, "rewards//std": 0.022761117666959763, "step": 2244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.449, "grad_norm": 2.6028687953948975, "kl": 0.34910872764885426, "learning_rate": 2.943492621851806e-06, "loss": 0.0349, "num_tokens": 14644536.0, "reward": 0.849853515625, "reward_std": 0.012492219917476177, "rewards//mean": 0.849853515625, "rewards//std": 0.030424321070313454, "step": 2245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4492, "grad_norm": 1.6336338520050049, "kl": 0.2447541318833828, "learning_rate": 2.9419310351650395e-06, "loss": 0.0245, "num_tokens": 14650944.0, "reward": 0.86248779296875, "reward_std": 0.014639736153185368, "rewards//mean": 0.86248779296875, "rewards//std": 0.01952727697789669, "step": 2246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4494, "grad_norm": 2.9781789779663086, "kl": 0.3521153014153242, "learning_rate": 2.940369270468504e-06, "loss": 0.0352, "num_tokens": 14657432.0, "reward": 0.86810302734375, "reward_std": 0.011371064931154251, "rewards//mean": 0.86810302734375, "rewards//std": 0.0231331754475832, "step": 2247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4496, "grad_norm": 1.6722413301467896, "kl": 0.28446963243186474, "learning_rate": 2.9388073283912797e-06, "loss": 0.0284, "num_tokens": 14663968.0, "reward": 0.83209228515625, "reward_std": 0.010295522399246693, "rewards//mean": 0.83209228515625, "rewards//std": 0.02146878093481064, "step": 2248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4498, "grad_norm": 1.4043786525726318, "kl": 0.3331361096352339, "learning_rate": 2.9372452095625163e-06, "loss": 0.0333, "num_tokens": 14670544.0, "reward": 0.8760986328125, "reward_std": 0.01299562118947506, "rewards//mean": 0.8760986328125, "rewards//std": 0.022771505638957024, "step": 2249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.45, "grad_norm": 1.5551691055297852, "kl": 0.4798464570194483, "learning_rate": 2.9356829146114347e-06, "loss": 0.048, "num_tokens": 14677048.0, "reward": 0.8328857421875, "reward_std": 0.009672300890088081, "rewards//mean": 0.8328857421875, "rewards//std": 0.023993778973817825, "step": 2250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4502, "grad_norm": 1.450987696647644, "kl": 0.2654816219583154, "learning_rate": 2.9341204441673267e-06, "loss": 0.0265, "num_tokens": 14683672.0, "reward": 0.85382080078125, "reward_std": 0.011795315891504288, "rewards//mean": 0.85382080078125, "rewards//std": 0.022116858512163162, "step": 2251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4504, "grad_norm": 2.203139066696167, "kl": 0.2903859280049801, "learning_rate": 2.9325577988595556e-06, "loss": 0.029, "num_tokens": 14690120.0, "reward": 0.82177734375, "reward_std": 0.013740688562393188, "rewards//mean": 0.82177734375, "rewards//std": 0.01906690187752247, "step": 2252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4506, "grad_norm": 1.5978888273239136, "kl": 0.3097352422773838, "learning_rate": 2.9309949793175544e-06, "loss": 0.031, "num_tokens": 14696632.0, "reward": 0.8681640625, "reward_std": 0.016814611852169037, "rewards//mean": 0.8681640625, "rewards//std": 0.031079022213816643, "step": 2253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4508, "grad_norm": 1.5835446119308472, "kl": 0.3321505328640342, "learning_rate": 2.929431986170828e-06, "loss": 0.0332, "num_tokens": 14703120.0, "reward": 0.8597412109375, "reward_std": 0.014268564060330391, "rewards//mean": 0.8597412109375, "rewards//std": 0.03359359875321388, "step": 2254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.451, "grad_norm": 1.5448793172836304, "kl": 0.3178654611110687, "learning_rate": 2.9278688200489487e-06, "loss": 0.0318, "num_tokens": 14709592.0, "reward": 0.8734130859375, "reward_std": 0.012209124863147736, "rewards//mean": 0.8734130859375, "rewards//std": 0.020096229389309883, "step": 2255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4512, "grad_norm": 2.7039437294006348, "kl": 0.2593133896589279, "learning_rate": 2.92630548158156e-06, "loss": 0.0259, "num_tokens": 14716096.0, "reward": 0.81610107421875, "reward_std": 0.010906334966421127, "rewards//mean": 0.81610107421875, "rewards//std": 0.01744576171040535, "step": 2256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4514, "grad_norm": 2.008620023727417, "kl": 0.3268382716923952, "learning_rate": 2.9247419713983736e-06, "loss": 0.0327, "num_tokens": 14722576.0, "reward": 0.8253173828125, "reward_std": 0.0133244963362813, "rewards//mean": 0.8253173828125, "rewards//std": 0.035966526716947556, "step": 2257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4516, "grad_norm": 2.8732705116271973, "kl": 0.3959777280688286, "learning_rate": 2.9231782901291726e-06, "loss": 0.0396, "num_tokens": 14729112.0, "reward": 0.83648681640625, "reward_std": 0.011491058394312859, "rewards//mean": 0.83648681640625, "rewards//std": 0.016171421855688095, "step": 2258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4518, "grad_norm": 1.5704731941223145, "kl": 0.24499783292412758, "learning_rate": 2.921614438403807e-06, "loss": 0.0245, "num_tokens": 14735528.0, "reward": 0.81011962890625, "reward_std": 0.00973108783364296, "rewards//mean": 0.81011962890625, "rewards//std": 0.018579453229904175, "step": 2259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.452, "grad_norm": 2.101139783859253, "kl": 0.35850689746439457, "learning_rate": 2.920050416852196e-06, "loss": 0.0359, "num_tokens": 14742064.0, "reward": 0.87957763671875, "reward_std": 0.010287009179592133, "rewards//mean": 0.87957763671875, "rewards//std": 0.02089781127870083, "step": 2260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4522, "grad_norm": 1.5805819034576416, "kl": 0.3814091421663761, "learning_rate": 2.9184862261043272e-06, "loss": 0.0381, "num_tokens": 14748536.0, "reward": 0.80108642578125, "reward_std": 0.013931550085544586, "rewards//mean": 0.80108642578125, "rewards//std": 0.020223408937454224, "step": 2261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4524, "grad_norm": 1.8312634229660034, "kl": 0.2904959311708808, "learning_rate": 2.9169218667902562e-06, "loss": 0.029, "num_tokens": 14755056.0, "reward": 0.8914794921875, "reward_std": 0.010376534424722195, "rewards//mean": 0.8914794921875, "rewards//std": 0.023576276376843452, "step": 2262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4526, "grad_norm": 2.017184019088745, "kl": 0.34412345103919506, "learning_rate": 2.9153573395401076e-06, "loss": 0.0344, "num_tokens": 14761512.0, "reward": 0.8363037109375, "reward_std": 0.015289170667529106, "rewards//mean": 0.8363037109375, "rewards//std": 0.029138077050447464, "step": 2263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4528, "grad_norm": 1.7015048265457153, "kl": 0.488469909876585, "learning_rate": 2.9137926449840714e-06, "loss": 0.0488, "num_tokens": 14767984.0, "reward": 0.8548583984375, "reward_std": 0.014826931990683079, "rewards//mean": 0.8548583984375, "rewards//std": 0.023930605500936508, "step": 2264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.453, "grad_norm": 1.5288481712341309, "kl": 0.31413935869932175, "learning_rate": 2.9122277837524087e-06, "loss": 0.0314, "num_tokens": 14774592.0, "reward": 0.85772705078125, "reward_std": 0.014920426532626152, "rewards//mean": 0.85772705078125, "rewards//std": 0.022410588338971138, "step": 2265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4532, "grad_norm": 3.070589303970337, "kl": 0.48405493795871735, "learning_rate": 2.910662756475443e-06, "loss": 0.0484, "num_tokens": 14781152.0, "reward": 0.84356689453125, "reward_std": 0.014428988099098206, "rewards//mean": 0.84356689453125, "rewards//std": 0.023790637031197548, "step": 2266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4534, "grad_norm": 3.8048651218414307, "kl": 0.35858581587672234, "learning_rate": 2.909097563783568e-06, "loss": 0.0359, "num_tokens": 14787672.0, "reward": 0.8421630859375, "reward_std": 0.011905834078788757, "rewards//mean": 0.8421630859375, "rewards//std": 0.034307003021240234, "step": 2267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4536, "grad_norm": 3.048924207687378, "kl": 0.3658562656491995, "learning_rate": 2.9075322063072437e-06, "loss": 0.0366, "num_tokens": 14794288.0, "reward": 0.8319091796875, "reward_std": 0.010986391454935074, "rewards//mean": 0.8319091796875, "rewards//std": 0.029740717262029648, "step": 2268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4538, "grad_norm": 1.467453122138977, "kl": 0.45963216479867697, "learning_rate": 2.905966684676996e-06, "loss": 0.046, "num_tokens": 14800784.0, "reward": 0.83734130859375, "reward_std": 0.013034502044320107, "rewards//mean": 0.83734130859375, "rewards//std": 0.020627319812774658, "step": 2269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.454, "grad_norm": 2.182546854019165, "kl": 0.4854056518524885, "learning_rate": 2.904400999523415e-06, "loss": 0.0485, "num_tokens": 14807352.0, "reward": 0.88665771484375, "reward_std": 0.011842403560876846, "rewards//mean": 0.88665771484375, "rewards//std": 0.024124884977936745, "step": 2270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4542, "grad_norm": 1.8068294525146484, "kl": 0.3480653837323189, "learning_rate": 2.902835151477161e-06, "loss": 0.0348, "num_tokens": 14813912.0, "reward": 0.84686279296875, "reward_std": 0.012591374106705189, "rewards//mean": 0.84686279296875, "rewards//std": 0.03165183588862419, "step": 2271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4544, "grad_norm": 2.0750396251678467, "kl": 0.3787130592390895, "learning_rate": 2.901269141168955e-06, "loss": 0.0379, "num_tokens": 14820432.0, "reward": 0.8487548828125, "reward_std": 0.015321217477321625, "rewards//mean": 0.8487548828125, "rewards//std": 0.038155827671289444, "step": 2272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4546, "grad_norm": 2.195941209793091, "kl": 0.3128690756857395, "learning_rate": 2.8997029692295875e-06, "loss": 0.0313, "num_tokens": 14826952.0, "reward": 0.8172607421875, "reward_std": 0.008684767410159111, "rewards//mean": 0.8172607421875, "rewards//std": 0.015507819131016731, "step": 2273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4548, "grad_norm": 3.111254930496216, "kl": 0.38781092688441277, "learning_rate": 2.8981366362899116e-06, "loss": 0.0388, "num_tokens": 14833448.0, "reward": 0.87347412109375, "reward_std": 0.012650043703615665, "rewards//mean": 0.87347412109375, "rewards//std": 0.019131792709231377, "step": 2274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.455, "grad_norm": 1.77406644821167, "kl": 0.6927803419530392, "learning_rate": 2.896570142980846e-06, "loss": 0.0693, "num_tokens": 14840144.0, "reward": 0.86590576171875, "reward_std": 0.016118574887514114, "rewards//mean": 0.86590576171875, "rewards//std": 0.03987288475036621, "step": 2275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4552, "grad_norm": 1.299218773841858, "kl": 0.20992653630673885, "learning_rate": 2.895003489933375e-06, "loss": 0.021, "num_tokens": 14846632.0, "reward": 0.84283447265625, "reward_std": 0.01040612906217575, "rewards//mean": 0.84283447265625, "rewards//std": 0.026536794379353523, "step": 2276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4554, "grad_norm": 1.4672002792358398, "kl": 0.26456642150878906, "learning_rate": 2.893436677778545e-06, "loss": 0.0265, "num_tokens": 14853192.0, "reward": 0.78851318359375, "reward_std": 0.0088205486536026, "rewards//mean": 0.78851318359375, "rewards//std": 0.02106303721666336, "step": 2277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4556, "grad_norm": 1.4670863151550293, "kl": 0.30191001016646624, "learning_rate": 2.891869707147469e-06, "loss": 0.0302, "num_tokens": 14859696.0, "reward": 0.8594970703125, "reward_std": 0.009994002990424633, "rewards//mean": 0.8594970703125, "rewards//std": 0.025276795029640198, "step": 2278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4558, "grad_norm": 2.477640151977539, "kl": 0.29489048570394516, "learning_rate": 2.8903025786713205e-06, "loss": 0.0295, "num_tokens": 14866200.0, "reward": 0.82830810546875, "reward_std": 0.015864258632063866, "rewards//mean": 0.82830810546875, "rewards//std": 0.022167451679706573, "step": 2279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.456, "grad_norm": 1.5503143072128296, "kl": 0.21885040029883385, "learning_rate": 2.888735292981341e-06, "loss": 0.0219, "num_tokens": 14872712.0, "reward": 0.843994140625, "reward_std": 0.011640174314379692, "rewards//mean": 0.843994140625, "rewards//std": 0.025965990498661995, "step": 2280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4562, "grad_norm": 2.0898635387420654, "kl": 0.34651374258100986, "learning_rate": 2.887167850708831e-06, "loss": 0.0347, "num_tokens": 14879152.0, "reward": 0.81756591796875, "reward_std": 0.01585220731794834, "rewards//mean": 0.81756591796875, "rewards//std": 0.027993101626634598, "step": 2281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4564, "grad_norm": 1.6308037042617798, "kl": 0.31566675193607807, "learning_rate": 2.885600252485158e-06, "loss": 0.0316, "num_tokens": 14885736.0, "reward": 0.80157470703125, "reward_std": 0.010930366814136505, "rewards//mean": 0.80157470703125, "rewards//std": 0.023356545716524124, "step": 2282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4566, "grad_norm": 2.5096116065979004, "kl": 0.2764302287250757, "learning_rate": 2.8840324989417488e-06, "loss": 0.0276, "num_tokens": 14892192.0, "reward": 0.8521728515625, "reward_std": 0.011292753741145134, "rewards//mean": 0.8521728515625, "rewards//std": 0.02877422235906124, "step": 2283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4568, "grad_norm": 1.8721249103546143, "kl": 0.4098262581974268, "learning_rate": 2.8824645907100957e-06, "loss": 0.041, "num_tokens": 14898640.0, "reward": 0.8568115234375, "reward_std": 0.015497831627726555, "rewards//mean": 0.8568115234375, "rewards//std": 0.03330759331583977, "step": 2284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.457, "grad_norm": 1.6521573066711426, "kl": 0.3646479193121195, "learning_rate": 2.8808965284217503e-06, "loss": 0.0365, "num_tokens": 14905216.0, "reward": 0.8238525390625, "reward_std": 0.014188975095748901, "rewards//mean": 0.8238525390625, "rewards//std": 0.030849985778331757, "step": 2285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4572, "grad_norm": 2.7284412384033203, "kl": 0.5517546162009239, "learning_rate": 2.8793283127083295e-06, "loss": 0.0552, "num_tokens": 14911784.0, "reward": 0.844482421875, "reward_std": 0.012454396113753319, "rewards//mean": 0.844482421875, "rewards//std": 0.03729167580604553, "step": 2286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4574, "grad_norm": 1.8137531280517578, "kl": 0.35993669647723436, "learning_rate": 2.8777599442015093e-06, "loss": 0.036, "num_tokens": 14918280.0, "reward": 0.8834228515625, "reward_std": 0.010538595728576183, "rewards//mean": 0.8834228515625, "rewards//std": 0.03345271199941635, "step": 2287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4576, "grad_norm": 1.3111284971237183, "kl": 0.23163478635251522, "learning_rate": 2.876191423533029e-06, "loss": 0.0232, "num_tokens": 14924776.0, "reward": 0.847412109375, "reward_std": 0.010528365150094032, "rewards//mean": 0.847412109375, "rewards//std": 0.01996511034667492, "step": 2288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4578, "grad_norm": 1.7068698406219482, "kl": 0.2827287605032325, "learning_rate": 2.8746227513346873e-06, "loss": 0.0283, "num_tokens": 14931352.0, "reward": 0.83587646484375, "reward_std": 0.010831426829099655, "rewards//mean": 0.83587646484375, "rewards//std": 0.024154985323548317, "step": 2289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.458, "grad_norm": 2.28053879737854, "kl": 0.26576652005314827, "learning_rate": 2.8730539282383473e-06, "loss": 0.0266, "num_tokens": 14937920.0, "reward": 0.88726806640625, "reward_std": 0.013688004575669765, "rewards//mean": 0.88726806640625, "rewards//std": 0.031155822798609734, "step": 2290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4582, "grad_norm": 2.7248106002807617, "kl": 0.45498624816536903, "learning_rate": 2.8714849548759293e-06, "loss": 0.0455, "num_tokens": 14944392.0, "reward": 0.8525390625, "reward_std": 0.014116987586021423, "rewards//mean": 0.8525390625, "rewards//std": 0.03363165259361267, "step": 2291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4584, "grad_norm": 2.0290744304656982, "kl": 0.26190187223255634, "learning_rate": 2.869915831879417e-06, "loss": 0.0262, "num_tokens": 14950920.0, "reward": 0.8397216796875, "reward_std": 0.011525067500770092, "rewards//mean": 0.8397216796875, "rewards//std": 0.032068345695734024, "step": 2292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4586, "grad_norm": 1.9918924570083618, "kl": 0.2749956864863634, "learning_rate": 2.86834655988085e-06, "loss": 0.0275, "num_tokens": 14957464.0, "reward": 0.83074951171875, "reward_std": 0.018292753025889397, "rewards//mean": 0.83074951171875, "rewards//std": 0.03736421465873718, "step": 2293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4588, "grad_norm": 1.3656247854232788, "kl": 0.19544134382158518, "learning_rate": 2.866777139512334e-06, "loss": 0.0195, "num_tokens": 14964056.0, "reward": 0.84027099609375, "reward_std": 0.008758781477808952, "rewards//mean": 0.84027099609375, "rewards//std": 0.020352492108941078, "step": 2294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.459, "grad_norm": 1.5922166109085083, "kl": 0.2512696338817477, "learning_rate": 2.8652075714060296e-06, "loss": 0.0251, "num_tokens": 14970544.0, "reward": 0.87347412109375, "reward_std": 0.015145315788686275, "rewards//mean": 0.87347412109375, "rewards//std": 0.022376112639904022, "step": 2295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4592, "grad_norm": 2.7561933994293213, "kl": 0.36159806698560715, "learning_rate": 2.863637856194159e-06, "loss": 0.0362, "num_tokens": 14977024.0, "reward": 0.83673095703125, "reward_std": 0.013522964902222157, "rewards//mean": 0.83673095703125, "rewards//std": 0.01937786675989628, "step": 2296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4594, "grad_norm": 2.6371536254882812, "kl": 0.26697196811437607, "learning_rate": 2.8620679945090035e-06, "loss": 0.0267, "num_tokens": 14983544.0, "reward": 0.85748291015625, "reward_std": 0.011502518318593502, "rewards//mean": 0.85748291015625, "rewards//std": 0.024577438831329346, "step": 2297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4596, "grad_norm": 1.5838829278945923, "kl": 0.27268497459590435, "learning_rate": 2.860497986982903e-06, "loss": 0.0273, "num_tokens": 14990064.0, "reward": 0.8060302734375, "reward_std": 0.010947845876216888, "rewards//mean": 0.8060302734375, "rewards//std": 0.026265950873494148, "step": 2298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4598, "grad_norm": 1.8953540325164795, "kl": 0.2731747757643461, "learning_rate": 2.8589278342482567e-06, "loss": 0.0273, "num_tokens": 14996592.0, "reward": 0.85162353515625, "reward_std": 0.011167670600116253, "rewards//mean": 0.85162353515625, "rewards//std": 0.017614150419831276, "step": 2299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.46, "grad_norm": 2.6708381175994873, "kl": 0.33245973102748394, "learning_rate": 2.8573575369375215e-06, "loss": 0.0332, "num_tokens": 15003112.0, "reward": 0.79522705078125, "reward_std": 0.010586029849946499, "rewards//mean": 0.79522705078125, "rewards//std": 0.016584571450948715, "step": 2300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4602, "grad_norm": 2.412337303161621, "kl": 0.3333996757864952, "learning_rate": 2.8557870956832135e-06, "loss": 0.0333, "num_tokens": 15009600.0, "reward": 0.83758544921875, "reward_std": 0.013943308964371681, "rewards//mean": 0.83758544921875, "rewards//std": 0.020205436274409294, "step": 2301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4604, "grad_norm": 1.6750023365020752, "kl": 0.3424206040799618, "learning_rate": 2.8542165111179053e-06, "loss": 0.0342, "num_tokens": 15016184.0, "reward": 0.82403564453125, "reward_std": 0.010416394099593163, "rewards//mean": 0.82403564453125, "rewards//std": 0.020896362140774727, "step": 2302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4606, "grad_norm": 1.6780766248703003, "kl": 0.2294640326872468, "learning_rate": 2.8526457838742292e-06, "loss": 0.0229, "num_tokens": 15022696.0, "reward": 0.8375244140625, "reward_std": 0.013319874182343483, "rewards//mean": 0.8375244140625, "rewards//std": 0.03421510010957718, "step": 2303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4608, "grad_norm": 3.901198387145996, "kl": 0.459273848682642, "learning_rate": 2.8510749145848732e-06, "loss": 0.0459, "num_tokens": 15029200.0, "reward": 0.85540771484375, "reward_std": 0.010380043648183346, "rewards//mean": 0.85540771484375, "rewards//std": 0.016667425632476807, "step": 2304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.461, "grad_norm": 1.5497561693191528, "kl": 0.3981144241988659, "learning_rate": 2.8495039038825845e-06, "loss": 0.0398, "num_tokens": 15035624.0, "reward": 0.90130615234375, "reward_std": 0.012122973799705505, "rewards//mean": 0.90130615234375, "rewards//std": 0.01839110441505909, "step": 2305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4612, "grad_norm": 1.6883964538574219, "kl": 0.2448892965912819, "learning_rate": 2.847932752400164e-06, "loss": 0.0245, "num_tokens": 15042160.0, "reward": 0.85516357421875, "reward_std": 0.011999912559986115, "rewards//mean": 0.85516357421875, "rewards//std": 0.02573578618466854, "step": 2306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4614, "grad_norm": 2.3302884101867676, "kl": 0.3480874849483371, "learning_rate": 2.846361460770473e-06, "loss": 0.0348, "num_tokens": 15048784.0, "reward": 0.85076904296875, "reward_std": 0.013706710189580917, "rewards//mean": 0.85076904296875, "rewards//std": 0.031243642792105675, "step": 2307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4616, "grad_norm": 1.6402302980422974, "kl": 0.42943240888416767, "learning_rate": 2.844790029626426e-06, "loss": 0.0429, "num_tokens": 15055248.0, "reward": 0.848876953125, "reward_std": 0.013231305405497551, "rewards//mean": 0.848876953125, "rewards//std": 0.027942834421992302, "step": 2308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4618, "grad_norm": 1.5864450931549072, "kl": 0.3447018302977085, "learning_rate": 2.843218459600998e-06, "loss": 0.0345, "num_tokens": 15061784.0, "reward": 0.8468017578125, "reward_std": 0.01525229774415493, "rewards//mean": 0.8468017578125, "rewards//std": 0.02467065118253231, "step": 2309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.462, "grad_norm": 1.8248820304870605, "kl": 0.5567512419074774, "learning_rate": 2.8416467513272147e-06, "loss": 0.0557, "num_tokens": 15068256.0, "reward": 0.8460693359375, "reward_std": 0.013468947261571884, "rewards//mean": 0.8460693359375, "rewards//std": 0.029287315905094147, "step": 2310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4622, "grad_norm": 1.9592379331588745, "kl": 0.2757602818310261, "learning_rate": 2.840074905438161e-06, "loss": 0.0276, "num_tokens": 15074776.0, "reward": 0.8272705078125, "reward_std": 0.009734965860843658, "rewards//mean": 0.8272705078125, "rewards//std": 0.01975282095372677, "step": 2311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4624, "grad_norm": 1.7543302774429321, "kl": 0.3184617254883051, "learning_rate": 2.8385029225669757e-06, "loss": 0.0318, "num_tokens": 15081280.0, "reward": 0.7900390625, "reward_std": 0.014894036576151848, "rewards//mean": 0.7900390625, "rewards//std": 0.019648704677820206, "step": 2312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4626, "grad_norm": 2.1127309799194336, "kl": 0.26603878382593393, "learning_rate": 2.836930803346854e-06, "loss": 0.0266, "num_tokens": 15087752.0, "reward": 0.868896484375, "reward_std": 0.015715233981609344, "rewards//mean": 0.868896484375, "rewards//std": 0.02546682395040989, "step": 2313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4628, "grad_norm": 1.8155341148376465, "kl": 0.4525589942932129, "learning_rate": 2.8353585484110447e-06, "loss": 0.0453, "num_tokens": 15094232.0, "reward": 0.84930419921875, "reward_std": 0.011345282196998596, "rewards//mean": 0.84930419921875, "rewards//std": 0.021517377346754074, "step": 2314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.463, "grad_norm": 1.5350840091705322, "kl": 0.3053242489695549, "learning_rate": 2.833786158392853e-06, "loss": 0.0305, "num_tokens": 15100800.0, "reward": 0.8170166015625, "reward_std": 0.010302696377038956, "rewards//mean": 0.8170166015625, "rewards//std": 0.02115885354578495, "step": 2315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4632, "grad_norm": 1.7779126167297363, "kl": 0.30903394147753716, "learning_rate": 2.8322136339256355e-06, "loss": 0.0309, "num_tokens": 15107296.0, "reward": 0.853271484375, "reward_std": 0.009990358725190163, "rewards//mean": 0.853271484375, "rewards//std": 0.020633282139897346, "step": 2316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4634, "grad_norm": 2.3730788230895996, "kl": 0.34933993592858315, "learning_rate": 2.8306409756428067e-06, "loss": 0.0349, "num_tokens": 15113816.0, "reward": 0.8798828125, "reward_std": 0.013761315494775772, "rewards//mean": 0.8798828125, "rewards//std": 0.033065155148506165, "step": 2317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4636, "grad_norm": 2.8795058727264404, "kl": 0.32124610617756844, "learning_rate": 2.8290681841778324e-06, "loss": 0.0321, "num_tokens": 15120360.0, "reward": 0.82403564453125, "reward_std": 0.014315012842416763, "rewards//mean": 0.82403564453125, "rewards//std": 0.046680472791194916, "step": 2318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4638, "grad_norm": 1.8747925758361816, "kl": 0.2814974281936884, "learning_rate": 2.8274952601642327e-06, "loss": 0.0281, "num_tokens": 15126936.0, "reward": 0.8233642578125, "reward_std": 0.014590087346732616, "rewards//mean": 0.8233642578125, "rewards//std": 0.02903607301414013, "step": 2319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.464, "grad_norm": 1.6748340129852295, "kl": 0.31420791894197464, "learning_rate": 2.825922204235581e-06, "loss": 0.0314, "num_tokens": 15133480.0, "reward": 0.85784912109375, "reward_std": 0.014356816187500954, "rewards//mean": 0.85784912109375, "rewards//std": 0.03052559122443199, "step": 2320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4642, "grad_norm": 2.005253553390503, "kl": 0.3390680365264416, "learning_rate": 2.8243490170255046e-06, "loss": 0.0339, "num_tokens": 15140040.0, "reward": 0.82586669921875, "reward_std": 0.012085824273526669, "rewards//mean": 0.82586669921875, "rewards//std": 0.018853630870580673, "step": 2321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4644, "grad_norm": 3.336803674697876, "kl": 0.42440252751111984, "learning_rate": 2.8227756991676837e-06, "loss": 0.0424, "num_tokens": 15146632.0, "reward": 0.86407470703125, "reward_std": 0.01610143482685089, "rewards//mean": 0.86407470703125, "rewards//std": 0.03923066332936287, "step": 2322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4646, "grad_norm": 1.4296468496322632, "kl": 0.2465143147855997, "learning_rate": 2.8212022512958487e-06, "loss": 0.0247, "num_tokens": 15153104.0, "reward": 0.8538818359375, "reward_std": 0.010695122182369232, "rewards//mean": 0.8538818359375, "rewards//std": 0.02155015431344509, "step": 2323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4648, "grad_norm": 1.8018661737442017, "kl": 0.3501425627619028, "learning_rate": 2.8196286740437862e-06, "loss": 0.035, "num_tokens": 15159584.0, "reward": 0.859375, "reward_std": 0.014410493895411491, "rewards//mean": 0.859375, "rewards//std": 0.0327264703810215, "step": 2324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.465, "grad_norm": 2.117105007171631, "kl": 0.25518124364316463, "learning_rate": 2.8180549680453305e-06, "loss": 0.0255, "num_tokens": 15166096.0, "reward": 0.84136962890625, "reward_std": 0.013998198322951794, "rewards//mean": 0.84136962890625, "rewards//std": 0.027442047372460365, "step": 2325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4652, "grad_norm": 2.3203978538513184, "kl": 0.2830195464193821, "learning_rate": 2.8164811339343736e-06, "loss": 0.0283, "num_tokens": 15172640.0, "reward": 0.8292236328125, "reward_std": 0.01384476013481617, "rewards//mean": 0.8292236328125, "rewards//std": 0.03184476122260094, "step": 2326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4654, "grad_norm": 2.0100834369659424, "kl": 0.2563795894384384, "learning_rate": 2.814907172344853e-06, "loss": 0.0256, "num_tokens": 15179088.0, "reward": 0.81622314453125, "reward_std": 0.01211756095290184, "rewards//mean": 0.81622314453125, "rewards//std": 0.020332399755716324, "step": 2327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4656, "grad_norm": 1.5841726064682007, "kl": 0.3750372715294361, "learning_rate": 2.813333083910761e-06, "loss": 0.0375, "num_tokens": 15185704.0, "reward": 0.85748291015625, "reward_std": 0.013800503686070442, "rewards//mean": 0.85748291015625, "rewards//std": 0.042988430708646774, "step": 2328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4658, "grad_norm": 1.9085698127746582, "kl": 0.3271010536700487, "learning_rate": 2.81175886926614e-06, "loss": 0.0327, "num_tokens": 15192232.0, "reward": 0.87542724609375, "reward_std": 0.015510495752096176, "rewards//mean": 0.87542724609375, "rewards//std": 0.022873898968100548, "step": 2329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.466, "grad_norm": 2.143233060836792, "kl": 0.22193099930882454, "learning_rate": 2.810184529045084e-06, "loss": 0.0222, "num_tokens": 15198712.0, "reward": 0.875244140625, "reward_std": 0.011654970236122608, "rewards//mean": 0.875244140625, "rewards//std": 0.026105530560016632, "step": 2330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4662, "grad_norm": 1.7174605131149292, "kl": 0.33018490206450224, "learning_rate": 2.808610063881737e-06, "loss": 0.033, "num_tokens": 15205224.0, "reward": 0.86981201171875, "reward_std": 0.010603249073028564, "rewards//mean": 0.86981201171875, "rewards//std": 0.03096233308315277, "step": 2331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4664, "grad_norm": 2.1562132835388184, "kl": 0.25923182256519794, "learning_rate": 2.8070354744102927e-06, "loss": 0.0259, "num_tokens": 15211744.0, "reward": 0.80126953125, "reward_std": 0.010674623772501945, "rewards//mean": 0.80126953125, "rewards//std": 0.027308298274874687, "step": 2332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4666, "grad_norm": 1.6243113279342651, "kl": 0.19191200472414494, "learning_rate": 2.805460761264997e-06, "loss": 0.0192, "num_tokens": 15218184.0, "reward": 0.88531494140625, "reward_std": 0.0110977403819561, "rewards//mean": 0.88531494140625, "rewards//std": 0.01998320408165455, "step": 2333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4668, "grad_norm": 2.0076448917388916, "kl": 0.2277565011754632, "learning_rate": 2.8038859250801425e-06, "loss": 0.0228, "num_tokens": 15224792.0, "reward": 0.81805419921875, "reward_std": 0.009410821832716465, "rewards//mean": 0.81805419921875, "rewards//std": 0.023579727858304977, "step": 2334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.467, "grad_norm": 1.5530483722686768, "kl": 0.20274651423096657, "learning_rate": 2.802310966490074e-06, "loss": 0.0203, "num_tokens": 15231312.0, "reward": 0.83087158203125, "reward_std": 0.017818404361605644, "rewards//mean": 0.83087158203125, "rewards//std": 0.033694103360176086, "step": 2335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4672, "grad_norm": 1.9626697301864624, "kl": 0.24996915273368359, "learning_rate": 2.800735886129184e-06, "loss": 0.025, "num_tokens": 15237776.0, "reward": 0.8646240234375, "reward_std": 0.015506412833929062, "rewards//mean": 0.8646240234375, "rewards//std": 0.019774267449975014, "step": 2336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4674, "grad_norm": 1.7959585189819336, "kl": 0.26152317970991135, "learning_rate": 2.799160684631915e-06, "loss": 0.0262, "num_tokens": 15244344.0, "reward": 0.86376953125, "reward_std": 0.01399032212793827, "rewards//mean": 0.86376953125, "rewards//std": 0.025643346831202507, "step": 2337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4676, "grad_norm": 1.4810444116592407, "kl": 0.241928955540061, "learning_rate": 2.7975853626327583e-06, "loss": 0.0242, "num_tokens": 15250832.0, "reward": 0.83172607421875, "reward_std": 0.010056865401566029, "rewards//mean": 0.83172607421875, "rewards//std": 0.01934189908206463, "step": 2338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4678, "grad_norm": 1.5231400728225708, "kl": 0.34193415753543377, "learning_rate": 2.7960099207662535e-06, "loss": 0.0342, "num_tokens": 15257384.0, "reward": 0.8565673828125, "reward_std": 0.012211402878165245, "rewards//mean": 0.8565673828125, "rewards//std": 0.01934395357966423, "step": 2339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.468, "grad_norm": 1.4253034591674805, "kl": 0.37333555798977613, "learning_rate": 2.794434359666987e-06, "loss": 0.0373, "num_tokens": 15263848.0, "reward": 0.8546142578125, "reward_std": 0.012120727449655533, "rewards//mean": 0.8546142578125, "rewards//std": 0.023077895864844322, "step": 2340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4682, "grad_norm": 1.474371075630188, "kl": 0.41606544982641935, "learning_rate": 2.792858679969596e-06, "loss": 0.0416, "num_tokens": 15270376.0, "reward": 0.81060791015625, "reward_std": 0.011086873710155487, "rewards//mean": 0.81060791015625, "rewards//std": 0.028603460639715195, "step": 2341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4684, "grad_norm": 1.817245602607727, "kl": 0.24297047965228558, "learning_rate": 2.791282882308764e-06, "loss": 0.0243, "num_tokens": 15276856.0, "reward": 0.88275146484375, "reward_std": 0.01531982235610485, "rewards//mean": 0.88275146484375, "rewards//std": 0.024963663890957832, "step": 2342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4686, "grad_norm": 1.390491247177124, "kl": 0.26794518902897835, "learning_rate": 2.789706967319222e-06, "loss": 0.0268, "num_tokens": 15283424.0, "reward": 0.8209228515625, "reward_std": 0.009524652734398842, "rewards//mean": 0.8209228515625, "rewards//std": 0.01491470169275999, "step": 2343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4688, "grad_norm": 3.3259854316711426, "kl": 0.4248095974326134, "learning_rate": 2.788130935635747e-06, "loss": 0.0425, "num_tokens": 15289960.0, "reward": 0.8472900390625, "reward_std": 0.010576330125331879, "rewards//mean": 0.8472900390625, "rewards//std": 0.019755885004997253, "step": 2344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.469, "grad_norm": 1.4788705110549927, "kl": 0.41137322038412094, "learning_rate": 2.786554787893167e-06, "loss": 0.0411, "num_tokens": 15296520.0, "reward": 0.8521728515625, "reward_std": 0.011926948092877865, "rewards//mean": 0.8521728515625, "rewards//std": 0.03773294761776924, "step": 2345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4692, "grad_norm": 1.5931633710861206, "kl": 0.3283001910895109, "learning_rate": 2.7849785247263515e-06, "loss": 0.0328, "num_tokens": 15303032.0, "reward": 0.827392578125, "reward_std": 0.00862909760326147, "rewards//mean": 0.827392578125, "rewards//std": 0.017060698941349983, "step": 2346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4694, "grad_norm": 1.4131499528884888, "kl": 0.21294193714857101, "learning_rate": 2.7834021467702217e-06, "loss": 0.0213, "num_tokens": 15309632.0, "reward": 0.84735107421875, "reward_std": 0.01187670323997736, "rewards//mean": 0.84735107421875, "rewards//std": 0.023717988282442093, "step": 2347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4696, "grad_norm": 1.9488465785980225, "kl": 0.33602960780262947, "learning_rate": 2.78182565465974e-06, "loss": 0.0336, "num_tokens": 15316184.0, "reward": 0.83709716796875, "reward_std": 0.010473818518221378, "rewards//mean": 0.83709716796875, "rewards//std": 0.03604866564273834, "step": 2348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4698, "grad_norm": 1.9797089099884033, "kl": 0.28010083083063364, "learning_rate": 2.780249049029919e-06, "loss": 0.028, "num_tokens": 15322688.0, "reward": 0.8519287109375, "reward_std": 0.010505106300115585, "rewards//mean": 0.8519287109375, "rewards//std": 0.019446976482868195, "step": 2349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.47, "grad_norm": 1.5532265901565552, "kl": 0.3054502345621586, "learning_rate": 2.778672330515814e-06, "loss": 0.0305, "num_tokens": 15329264.0, "reward": 0.8309326171875, "reward_std": 0.012682946398854256, "rewards//mean": 0.8309326171875, "rewards//std": 0.019418932497501373, "step": 2350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4702, "grad_norm": 1.7247121334075928, "kl": 0.2941478118300438, "learning_rate": 2.7770954997525277e-06, "loss": 0.0294, "num_tokens": 15335768.0, "reward": 0.833740234375, "reward_std": 0.012626966461539268, "rewards//mean": 0.833740234375, "rewards//std": 0.02397768571972847, "step": 2351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4704, "grad_norm": 1.7113028764724731, "kl": 0.3429913632571697, "learning_rate": 2.7755185573752075e-06, "loss": 0.0343, "num_tokens": 15342304.0, "reward": 0.854736328125, "reward_std": 0.018488038331270218, "rewards//mean": 0.854736328125, "rewards//std": 0.030827650800347328, "step": 2352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4706, "grad_norm": 1.3178462982177734, "kl": 0.321854704990983, "learning_rate": 2.7739415040190456e-06, "loss": 0.0322, "num_tokens": 15348800.0, "reward": 0.88671875, "reward_std": 0.015623497776687145, "rewards//mean": 0.88671875, "rewards//std": 0.03351622819900513, "step": 2353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4708, "grad_norm": 2.2573373317718506, "kl": 0.3132162867113948, "learning_rate": 2.7723643403192786e-06, "loss": 0.0313, "num_tokens": 15355296.0, "reward": 0.82373046875, "reward_std": 0.013489196076989174, "rewards//mean": 0.82373046875, "rewards//std": 0.03774116933345795, "step": 2354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.471, "grad_norm": 1.5513920783996582, "kl": 0.2369818277657032, "learning_rate": 2.770787066911187e-06, "loss": 0.0237, "num_tokens": 15361808.0, "reward": 0.861083984375, "reward_std": 0.010772403329610825, "rewards//mean": 0.861083984375, "rewards//std": 0.0248409491032362, "step": 2355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4712, "grad_norm": 1.5831325054168701, "kl": 0.3383005568757653, "learning_rate": 2.769209684430098e-06, "loss": 0.0338, "num_tokens": 15368320.0, "reward": 0.83807373046875, "reward_std": 0.012488718144595623, "rewards//mean": 0.83807373046875, "rewards//std": 0.03778241202235222, "step": 2356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4714, "grad_norm": 1.6255704164505005, "kl": 0.21038724668323994, "learning_rate": 2.76763219351138e-06, "loss": 0.021, "num_tokens": 15374736.0, "reward": 0.873779296875, "reward_std": 0.013316929340362549, "rewards//mean": 0.873779296875, "rewards//std": 0.030168499797582626, "step": 2357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4716, "grad_norm": 2.6984057426452637, "kl": 0.19226098712533712, "learning_rate": 2.7660545947904464e-06, "loss": 0.0192, "num_tokens": 15381280.0, "reward": 0.8543701171875, "reward_std": 0.013540985062718391, "rewards//mean": 0.8543701171875, "rewards//std": 0.028843581676483154, "step": 2358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4718, "grad_norm": 1.3169896602630615, "kl": 0.18819042947143316, "learning_rate": 2.764476888902754e-06, "loss": 0.0188, "num_tokens": 15387712.0, "reward": 0.8883056640625, "reward_std": 0.010334202088415623, "rewards//mean": 0.8883056640625, "rewards//std": 0.018225429579615593, "step": 2359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.472, "grad_norm": 1.5994449853897095, "kl": 0.20398014411330223, "learning_rate": 2.7628990764838026e-06, "loss": 0.0204, "num_tokens": 15394192.0, "reward": 0.859375, "reward_std": 0.014193999581038952, "rewards//mean": 0.859375, "rewards//std": 0.03428780287504196, "step": 2360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4722, "grad_norm": 1.6290333271026611, "kl": 0.1957937516272068, "learning_rate": 2.761321158169134e-06, "loss": 0.0196, "num_tokens": 15400808.0, "reward": 0.85546875, "reward_std": 0.009828787297010422, "rewards//mean": 0.85546875, "rewards//std": 0.026584740728139877, "step": 2361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4724, "grad_norm": 1.4685982465744019, "kl": 0.30867206025868654, "learning_rate": 2.7597431345943346e-06, "loss": 0.0309, "num_tokens": 15407360.0, "reward": 0.83819580078125, "reward_std": 0.014325186610221863, "rewards//mean": 0.83819580078125, "rewards//std": 0.02887996844947338, "step": 2362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4726, "grad_norm": 1.6837579011917114, "kl": 0.17965969722718, "learning_rate": 2.7581650063950316e-06, "loss": 0.018, "num_tokens": 15413936.0, "reward": 0.77252197265625, "reward_std": 0.00861920416355133, "rewards//mean": 0.77252197265625, "rewards//std": 0.015943283215165138, "step": 2363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4728, "grad_norm": 1.6919846534729004, "kl": 0.2522948235273361, "learning_rate": 2.7565867742068947e-06, "loss": 0.0252, "num_tokens": 15420472.0, "reward": 0.823486328125, "reward_std": 0.013783026486635208, "rewards//mean": 0.823486328125, "rewards//std": 0.02269059419631958, "step": 2364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.473, "grad_norm": 1.5316702127456665, "kl": 0.22833935357630253, "learning_rate": 2.7550084386656355e-06, "loss": 0.0228, "num_tokens": 15426976.0, "reward": 0.83477783203125, "reward_std": 0.009985314682126045, "rewards//mean": 0.83477783203125, "rewards//std": 0.02253856137394905, "step": 2365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4732, "grad_norm": 1.6725367307662964, "kl": 0.20393128413707018, "learning_rate": 2.7534300004070084e-06, "loss": 0.0204, "num_tokens": 15433480.0, "reward": 0.831298828125, "reward_std": 0.010526251047849655, "rewards//mean": 0.831298828125, "rewards//std": 0.02082025073468685, "step": 2366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4734, "grad_norm": 3.418830156326294, "kl": 0.32940297946333885, "learning_rate": 2.751851460066807e-06, "loss": 0.0329, "num_tokens": 15440000.0, "reward": 0.8328857421875, "reward_std": 0.014490539208054543, "rewards//mean": 0.8328857421875, "rewards//std": 0.031500644981861115, "step": 2367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4736, "grad_norm": 1.6849770545959473, "kl": 0.144375029951334, "learning_rate": 2.7502728182808685e-06, "loss": 0.0144, "num_tokens": 15446456.0, "reward": 0.85882568359375, "reward_std": 0.012603196315467358, "rewards//mean": 0.85882568359375, "rewards//std": 0.023155411705374718, "step": 2368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4738, "grad_norm": 1.4028066396713257, "kl": 0.2957917544990778, "learning_rate": 2.748694075685068e-06, "loss": 0.0296, "num_tokens": 15452840.0, "reward": 0.82537841796875, "reward_std": 0.00857553631067276, "rewards//mean": 0.82537841796875, "rewards//std": 0.0122676445171237, "step": 2369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.474, "grad_norm": 1.708282709121704, "kl": 0.2280195839703083, "learning_rate": 2.747115232915324e-06, "loss": 0.0228, "num_tokens": 15459568.0, "reward": 0.81256103515625, "reward_std": 0.009335273876786232, "rewards//mean": 0.81256103515625, "rewards//std": 0.019294099882245064, "step": 2370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4742, "grad_norm": 1.3447688817977905, "kl": 0.29526165314018726, "learning_rate": 2.745536290607593e-06, "loss": 0.0295, "num_tokens": 15466080.0, "reward": 0.7386474609375, "reward_std": 0.012222347781062126, "rewards//mean": 0.7386474609375, "rewards//std": 0.028226081281900406, "step": 2371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4744, "grad_norm": 1.8655414581298828, "kl": 0.23333128355443478, "learning_rate": 2.743957249397874e-06, "loss": 0.0233, "num_tokens": 15472512.0, "reward": 0.834228515625, "reward_std": 0.012041263282299042, "rewards//mean": 0.834228515625, "rewards//std": 0.02363174967467785, "step": 2372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4746, "grad_norm": 1.325015664100647, "kl": 0.29618992656469345, "learning_rate": 2.742378109922204e-06, "loss": 0.0296, "num_tokens": 15479088.0, "reward": 0.81341552734375, "reward_std": 0.010269047692418098, "rewards//mean": 0.81341552734375, "rewards//std": 0.018172508105635643, "step": 2373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4748, "grad_norm": 2.101470947265625, "kl": 0.2430214062333107, "learning_rate": 2.7407988728166603e-06, "loss": 0.0243, "num_tokens": 15485608.0, "reward": 0.8048095703125, "reward_std": 0.015124920755624771, "rewards//mean": 0.8048095703125, "rewards//std": 0.0338396318256855, "step": 2374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.475, "grad_norm": 2.0811336040496826, "kl": 0.329216405749321, "learning_rate": 2.7392195387173593e-06, "loss": 0.0329, "num_tokens": 15492024.0, "reward": 0.8795166015625, "reward_std": 0.009608948603272438, "rewards//mean": 0.8795166015625, "rewards//std": 0.012238767929375172, "step": 2375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4752, "grad_norm": 2.28615403175354, "kl": 0.2334832027554512, "learning_rate": 2.7376401082604563e-06, "loss": 0.0233, "num_tokens": 15498512.0, "reward": 0.8094482421875, "reward_std": 0.00859777070581913, "rewards//mean": 0.8094482421875, "rewards//std": 0.016637995839118958, "step": 2376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4754, "grad_norm": 2.320150136947632, "kl": 0.30715748481452465, "learning_rate": 2.7360605820821477e-06, "loss": 0.0307, "num_tokens": 15505040.0, "reward": 0.79620361328125, "reward_std": 0.010844048112630844, "rewards//mean": 0.79620361328125, "rewards//std": 0.017995886504650116, "step": 2377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4756, "grad_norm": 1.3040324449539185, "kl": 0.18004060629755259, "learning_rate": 2.7344809608186635e-06, "loss": 0.018, "num_tokens": 15511560.0, "reward": 0.8187255859375, "reward_std": 0.008511773310601711, "rewards//mean": 0.8187255859375, "rewards//std": 0.029779374599456787, "step": 2378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4758, "grad_norm": 2.6275084018707275, "kl": 0.19322673976421356, "learning_rate": 2.732901245106277e-06, "loss": 0.0193, "num_tokens": 15518064.0, "reward": 0.84307861328125, "reward_std": 0.012537302449345589, "rewards//mean": 0.84307861328125, "rewards//std": 0.0331350676715374, "step": 2379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.476, "grad_norm": 1.583221197128296, "kl": 0.2365482784807682, "learning_rate": 2.731321435581296e-06, "loss": 0.0237, "num_tokens": 15524536.0, "reward": 0.8681640625, "reward_std": 0.012260264717042446, "rewards//mean": 0.8681640625, "rewards//std": 0.028898373246192932, "step": 2380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4762, "grad_norm": 1.8808507919311523, "kl": 0.18844317272305489, "learning_rate": 2.729741532880069e-06, "loss": 0.0188, "num_tokens": 15531080.0, "reward": 0.892578125, "reward_std": 0.013373354449868202, "rewards//mean": 0.892578125, "rewards//std": 0.024170098826289177, "step": 2381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4764, "grad_norm": 1.655403971672058, "kl": 0.23444505967199802, "learning_rate": 2.7281615376389797e-06, "loss": 0.0234, "num_tokens": 15537528.0, "reward": 0.8323974609375, "reward_std": 0.012602083384990692, "rewards//mean": 0.8323974609375, "rewards//std": 0.018118804320693016, "step": 2382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4766, "grad_norm": 2.3190808296203613, "kl": 0.3176463730633259, "learning_rate": 2.726581450494451e-06, "loss": 0.0318, "num_tokens": 15544104.0, "reward": 0.845947265625, "reward_std": 0.013540494255721569, "rewards//mean": 0.845947265625, "rewards//std": 0.02161923423409462, "step": 2383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4768, "grad_norm": 2.384873390197754, "kl": 0.17464586161077023, "learning_rate": 2.7250012720829403e-06, "loss": 0.0175, "num_tokens": 15550672.0, "reward": 0.78240966796875, "reward_std": 0.010309628210961819, "rewards//mean": 0.78240966796875, "rewards//std": 0.020642725750803947, "step": 2384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.477, "grad_norm": 1.4247220754623413, "kl": 0.30143312830477953, "learning_rate": 2.723421003040945e-06, "loss": 0.0301, "num_tokens": 15557272.0, "reward": 0.827880859375, "reward_std": 0.010887732729315758, "rewards//mean": 0.827880859375, "rewards//std": 0.014159221202135086, "step": 2385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4772, "grad_norm": 1.886326551437378, "kl": 0.3604503236711025, "learning_rate": 2.7218406440049954e-06, "loss": 0.036, "num_tokens": 15563720.0, "reward": 0.8616943359375, "reward_std": 0.01355401985347271, "rewards//mean": 0.8616943359375, "rewards//std": 0.0261133573949337, "step": 2386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4774, "grad_norm": 2.251972198486328, "kl": 0.22407624777406454, "learning_rate": 2.7202601956116613e-06, "loss": 0.0224, "num_tokens": 15570272.0, "reward": 0.81768798828125, "reward_std": 0.01263450551778078, "rewards//mean": 0.81768798828125, "rewards//std": 0.0275955218821764, "step": 2387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4776, "grad_norm": 1.4858670234680176, "kl": 0.24360890686511993, "learning_rate": 2.718679658497547e-06, "loss": 0.0244, "num_tokens": 15576736.0, "reward": 0.82696533203125, "reward_std": 0.013746175915002823, "rewards//mean": 0.82696533203125, "rewards//std": 0.03178546577692032, "step": 2388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4778, "grad_norm": 1.9869011640548706, "kl": 0.16046570893377066, "learning_rate": 2.7170990332992923e-06, "loss": 0.016, "num_tokens": 15583288.0, "reward": 0.83624267578125, "reward_std": 0.011555353179574013, "rewards//mean": 0.83624267578125, "rewards//std": 0.0283338725566864, "step": 2389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.478, "grad_norm": 2.1419198513031006, "kl": 0.2320765694603324, "learning_rate": 2.715518320653573e-06, "loss": 0.0232, "num_tokens": 15589728.0, "reward": 0.84588623046875, "reward_std": 0.012889220379292965, "rewards//mean": 0.84588623046875, "rewards//std": 0.02670227736234665, "step": 2390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4782, "grad_norm": 1.5776150226593018, "kl": 0.19620709959417582, "learning_rate": 2.7139375211971e-06, "loss": 0.0196, "num_tokens": 15596288.0, "reward": 0.86181640625, "reward_std": 0.01747399941086769, "rewards//mean": 0.86181640625, "rewards//std": 0.03432663157582283, "step": 2391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4784, "grad_norm": 1.5757005214691162, "kl": 0.21802632696926594, "learning_rate": 2.7123566355666184e-06, "loss": 0.0218, "num_tokens": 15602784.0, "reward": 0.86614990234375, "reward_std": 0.011363329365849495, "rewards//mean": 0.86614990234375, "rewards//std": 0.02074294723570347, "step": 2392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4786, "grad_norm": 1.8117733001708984, "kl": 0.3130333162844181, "learning_rate": 2.7107756643989104e-06, "loss": 0.0313, "num_tokens": 15609304.0, "reward": 0.87713623046875, "reward_std": 0.01507486216723919, "rewards//mean": 0.87713623046875, "rewards//std": 0.022314464673399925, "step": 2393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4788, "grad_norm": 1.5721406936645508, "kl": 0.21983103640377522, "learning_rate": 2.70919460833079e-06, "loss": 0.022, "num_tokens": 15615936.0, "reward": 0.8770751953125, "reward_std": 0.009607934392988682, "rewards//mean": 0.8770751953125, "rewards//std": 0.026013460010290146, "step": 2394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.479, "grad_norm": 1.20684814453125, "kl": 0.24341735430061817, "learning_rate": 2.707613467999105e-06, "loss": 0.0243, "num_tokens": 15622384.0, "reward": 0.8636474609375, "reward_std": 0.01036902703344822, "rewards//mean": 0.8636474609375, "rewards//std": 0.019780389964580536, "step": 2395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4792, "grad_norm": 2.1193888187408447, "kl": 0.1595313847064972, "learning_rate": 2.706032244040741e-06, "loss": 0.016, "num_tokens": 15628968.0, "reward": 0.8419189453125, "reward_std": 0.010126039385795593, "rewards//mean": 0.8419189453125, "rewards//std": 0.024587061256170273, "step": 2396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4794, "grad_norm": 1.5503590106964111, "kl": 0.3058337066322565, "learning_rate": 2.7044509370926132e-06, "loss": 0.0306, "num_tokens": 15635496.0, "reward": 0.85284423828125, "reward_std": 0.012326473370194435, "rewards//mean": 0.85284423828125, "rewards//std": 0.029082629829645157, "step": 2397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4796, "grad_norm": 2.012948989868164, "kl": 0.2899816622957587, "learning_rate": 2.7028695477916727e-06, "loss": 0.029, "num_tokens": 15641856.0, "reward": 0.78619384765625, "reward_std": 0.013845352455973625, "rewards//mean": 0.78619384765625, "rewards//std": 0.019453493878245354, "step": 2398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4798, "grad_norm": 1.5062297582626343, "kl": 0.37479428946971893, "learning_rate": 2.7012880767749024e-06, "loss": 0.0375, "num_tokens": 15648360.0, "reward": 0.84283447265625, "reward_std": 0.016565997153520584, "rewards//mean": 0.84283447265625, "rewards//std": 0.0282862838357687, "step": 2399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.48, "grad_norm": 2.449092388153076, "kl": 0.38347289711236954, "learning_rate": 2.6997065246793193e-06, "loss": 0.0383, "num_tokens": 15654896.0, "reward": 0.86962890625, "reward_std": 0.0150529183447361, "rewards//mean": 0.86962890625, "rewards//std": 0.029127944260835648, "step": 2400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4802, "grad_norm": 1.9646985530853271, "kl": 0.30560007970780134, "learning_rate": 2.6981248921419713e-06, "loss": 0.0306, "num_tokens": 15661464.0, "reward": 0.8765869140625, "reward_std": 0.013739099726080894, "rewards//mean": 0.8765869140625, "rewards//std": 0.02294369414448738, "step": 2401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4804, "grad_norm": 2.118495225906372, "kl": 0.41044921800494194, "learning_rate": 2.696543179799941e-06, "loss": 0.041, "num_tokens": 15667920.0, "reward": 0.85614013671875, "reward_std": 0.012275456450879574, "rewards//mean": 0.85614013671875, "rewards//std": 0.025906968861818314, "step": 2402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4806, "grad_norm": 1.9458847045898438, "kl": 0.307394789531827, "learning_rate": 2.694961388290341e-06, "loss": 0.0307, "num_tokens": 15674424.0, "reward": 0.83087158203125, "reward_std": 0.010456299409270287, "rewards//mean": 0.83087158203125, "rewards//std": 0.013756729662418365, "step": 2403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4808, "grad_norm": 2.844421863555908, "kl": 0.2799907885491848, "learning_rate": 2.6933795182503177e-06, "loss": 0.028, "num_tokens": 15680944.0, "reward": 0.838623046875, "reward_std": 0.014400897547602654, "rewards//mean": 0.838623046875, "rewards//std": 0.03658350929617882, "step": 2404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.481, "grad_norm": 1.297037124633789, "kl": 0.37469591200351715, "learning_rate": 2.6917975703170466e-06, "loss": 0.0375, "num_tokens": 15687472.0, "reward": 0.87445068359375, "reward_std": 0.013739842921495438, "rewards//mean": 0.87445068359375, "rewards//std": 0.02617032267153263, "step": 2405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4812, "grad_norm": 2.0613291263580322, "kl": 0.42648087814450264, "learning_rate": 2.6902155451277377e-06, "loss": 0.0426, "num_tokens": 15693936.0, "reward": 0.84698486328125, "reward_std": 0.015534252859652042, "rewards//mean": 0.84698486328125, "rewards//std": 0.019778380170464516, "step": 2406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4814, "grad_norm": 1.1875375509262085, "kl": 0.3921100376173854, "learning_rate": 2.68863344331963e-06, "loss": 0.0392, "num_tokens": 15700480.0, "reward": 0.814453125, "reward_std": 0.013264492154121399, "rewards//mean": 0.814453125, "rewards//std": 0.022809036076068878, "step": 2407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4816, "grad_norm": 1.9913605451583862, "kl": 0.51880968734622, "learning_rate": 2.6870512655299942e-06, "loss": 0.0519, "num_tokens": 15706920.0, "reward": 0.84466552734375, "reward_std": 0.013452282175421715, "rewards//mean": 0.84466552734375, "rewards//std": 0.01769988238811493, "step": 2408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4818, "grad_norm": 1.8405109643936157, "kl": 0.4557125549763441, "learning_rate": 2.685469012396131e-06, "loss": 0.0456, "num_tokens": 15713424.0, "reward": 0.857666015625, "reward_std": 0.013097536750137806, "rewards//mean": 0.857666015625, "rewards//std": 0.02215045690536499, "step": 2409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.482, "grad_norm": 1.63455331325531, "kl": 0.35498199705034494, "learning_rate": 2.6838866845553705e-06, "loss": 0.0355, "num_tokens": 15719936.0, "reward": 0.83502197265625, "reward_std": 0.013921674340963364, "rewards//mean": 0.83502197265625, "rewards//std": 0.027924340218305588, "step": 2410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4822, "grad_norm": 3.1267154216766357, "kl": 0.680381903424859, "learning_rate": 2.682304282645077e-06, "loss": 0.068, "num_tokens": 15726408.0, "reward": 0.82354736328125, "reward_std": 0.015975670889019966, "rewards//mean": 0.82354736328125, "rewards//std": 0.037094227969646454, "step": 2411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4824, "grad_norm": 2.5858166217803955, "kl": 0.4004299622029066, "learning_rate": 2.6807218073026394e-06, "loss": 0.04, "num_tokens": 15732960.0, "reward": 0.83599853515625, "reward_std": 0.008504129946231842, "rewards//mean": 0.83599853515625, "rewards//std": 0.012786056846380234, "step": 2412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4826, "grad_norm": 2.5139615535736084, "kl": 0.5483271107077599, "learning_rate": 2.67913925916548e-06, "loss": 0.0548, "num_tokens": 15739584.0, "reward": 0.8182373046875, "reward_std": 0.010090961121022701, "rewards//mean": 0.8182373046875, "rewards//std": 0.024119628593325615, "step": 2413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4828, "grad_norm": 2.24466609954834, "kl": 0.3128954553976655, "learning_rate": 2.6775566388710476e-06, "loss": 0.0313, "num_tokens": 15746136.0, "reward": 0.83343505859375, "reward_std": 0.011378685012459755, "rewards//mean": 0.83343505859375, "rewards//std": 0.03483697772026062, "step": 2414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.483, "grad_norm": 1.771301031112671, "kl": 0.29547942243516445, "learning_rate": 2.675973947056823e-06, "loss": 0.0295, "num_tokens": 15752664.0, "reward": 0.8009033203125, "reward_std": 0.01318027637898922, "rewards//mean": 0.8009033203125, "rewards//std": 0.04823470860719681, "step": 2415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4832, "grad_norm": 1.6323559284210205, "kl": 0.48642288614064455, "learning_rate": 2.6743911843603134e-06, "loss": 0.0486, "num_tokens": 15759200.0, "reward": 0.84674072265625, "reward_std": 0.012531675398349762, "rewards//mean": 0.84674072265625, "rewards//std": 0.01823902502655983, "step": 2416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4834, "grad_norm": 2.303833484649658, "kl": 0.48703732900321484, "learning_rate": 2.6728083514190555e-06, "loss": 0.0487, "num_tokens": 15765680.0, "reward": 0.8509521484375, "reward_std": 0.01613321714103222, "rewards//mean": 0.8509521484375, "rewards//std": 0.02807551622390747, "step": 2417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4836, "grad_norm": 1.6915593147277832, "kl": 0.32283988408744335, "learning_rate": 2.6712254488706143e-06, "loss": 0.0323, "num_tokens": 15772256.0, "reward": 0.8603515625, "reward_std": 0.012081949971616268, "rewards//mean": 0.8603515625, "rewards//std": 0.03538629040122032, "step": 2418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4838, "grad_norm": 2.2461087703704834, "kl": 0.36161368153989315, "learning_rate": 2.669642477352583e-06, "loss": 0.0362, "num_tokens": 15778744.0, "reward": 0.83807373046875, "reward_std": 0.012470964342355728, "rewards//mean": 0.83807373046875, "rewards//std": 0.030607305467128754, "step": 2419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.484, "grad_norm": 2.92684006690979, "kl": 0.24087615311145782, "learning_rate": 2.6680594375025824e-06, "loss": 0.0241, "num_tokens": 15785320.0, "reward": 0.84796142578125, "reward_std": 0.012385494075715542, "rewards//mean": 0.84796142578125, "rewards//std": 0.022275084629654884, "step": 2420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4842, "grad_norm": 1.625237226486206, "kl": 0.1809593988582492, "learning_rate": 2.66647632995826e-06, "loss": 0.0181, "num_tokens": 15791824.0, "reward": 0.8616943359375, "reward_std": 0.01287923939526081, "rewards//mean": 0.8616943359375, "rewards//std": 0.04132063686847687, "step": 2421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4844, "grad_norm": 1.9562008380889893, "kl": 0.2578487070277333, "learning_rate": 2.6648931553572916e-06, "loss": 0.0258, "num_tokens": 15798408.0, "reward": 0.793212890625, "reward_std": 0.010302145034074783, "rewards//mean": 0.793212890625, "rewards//std": 0.018386267125606537, "step": 2422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4846, "grad_norm": 2.563681125640869, "kl": 0.2528790067881346, "learning_rate": 2.663309914337381e-06, "loss": 0.0253, "num_tokens": 15804904.0, "reward": 0.78271484375, "reward_std": 0.0077979653142392635, "rewards//mean": 0.78271484375, "rewards//std": 0.021116243675351143, "step": 2423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4848, "grad_norm": 1.9776242971420288, "kl": 0.37594619765877724, "learning_rate": 2.6617266075362548e-06, "loss": 0.0376, "num_tokens": 15811480.0, "reward": 0.83294677734375, "reward_std": 0.011736126616597176, "rewards//mean": 0.83294677734375, "rewards//std": 0.02340381033718586, "step": 2424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.485, "grad_norm": 2.4013473987579346, "kl": 0.39352574944496155, "learning_rate": 2.6601432355916716e-06, "loss": 0.0394, "num_tokens": 15818016.0, "reward": 0.8316650390625, "reward_std": 0.015280114486813545, "rewards//mean": 0.8316650390625, "rewards//std": 0.03236717730760574, "step": 2425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4852, "grad_norm": 1.5858350992202759, "kl": 0.19993457105010748, "learning_rate": 2.6585597991414115e-06, "loss": 0.02, "num_tokens": 15824424.0, "reward": 0.86358642578125, "reward_std": 0.011547300964593887, "rewards//mean": 0.86358642578125, "rewards//std": 0.01647467538714409, "step": 2426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4854, "grad_norm": 1.9068140983581543, "kl": 0.2184878159314394, "learning_rate": 2.6569762988232838e-06, "loss": 0.0218, "num_tokens": 15830952.0, "reward": 0.84832763671875, "reward_std": 0.011036115698516369, "rewards//mean": 0.84832763671875, "rewards//std": 0.025358933955430984, "step": 2427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4856, "grad_norm": 2.2455801963806152, "kl": 0.21074817795306444, "learning_rate": 2.6553927352751214e-06, "loss": 0.0211, "num_tokens": 15837456.0, "reward": 0.85260009765625, "reward_std": 0.011180685833096504, "rewards//mean": 0.85260009765625, "rewards//std": 0.018948936834931374, "step": 2428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4858, "grad_norm": 1.5306835174560547, "kl": 0.24880517832934856, "learning_rate": 2.6538091091347843e-06, "loss": 0.0249, "num_tokens": 15844040.0, "reward": 0.83648681640625, "reward_std": 0.009518531151115894, "rewards//mean": 0.83648681640625, "rewards//std": 0.02126901224255562, "step": 2429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.486, "grad_norm": 1.4361473321914673, "kl": 0.34012316819280386, "learning_rate": 2.652225421040156e-06, "loss": 0.034, "num_tokens": 15850616.0, "reward": 0.8209228515625, "reward_std": 0.01481681689620018, "rewards//mean": 0.8209228515625, "rewards//std": 0.019278105348348618, "step": 2430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4862, "grad_norm": 1.4668046236038208, "kl": 0.21556077059358358, "learning_rate": 2.6506416716291466e-06, "loss": 0.0216, "num_tokens": 15857088.0, "reward": 0.84521484375, "reward_std": 0.01269693672657013, "rewards//mean": 0.84521484375, "rewards//std": 0.02566222846508026, "step": 2431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4864, "grad_norm": 1.960253357887268, "kl": 0.30731135979294777, "learning_rate": 2.64905786153969e-06, "loss": 0.0307, "num_tokens": 15863656.0, "reward": 0.83099365234375, "reward_std": 0.014684576541185379, "rewards//mean": 0.83099365234375, "rewards//std": 0.026568720117211342, "step": 2432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4866, "grad_norm": 1.8147683143615723, "kl": 0.26309714559465647, "learning_rate": 2.647473991409744e-06, "loss": 0.0263, "num_tokens": 15870200.0, "reward": 0.82916259765625, "reward_std": 0.008542644791305065, "rewards//mean": 0.82916259765625, "rewards//std": 0.02059059403836727, "step": 2433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4868, "grad_norm": 1.9573638439178467, "kl": 0.32636053301393986, "learning_rate": 2.6458900618772926e-06, "loss": 0.0326, "num_tokens": 15876672.0, "reward": 0.86614990234375, "reward_std": 0.016203606501221657, "rewards//mean": 0.86614990234375, "rewards//std": 0.03252783417701721, "step": 2434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.487, "grad_norm": 1.4037529230117798, "kl": 0.2537020705640316, "learning_rate": 2.6443060735803405e-06, "loss": 0.0254, "num_tokens": 15883304.0, "reward": 0.8292236328125, "reward_std": 0.009884541854262352, "rewards//mean": 0.8292236328125, "rewards//std": 0.019406456500291824, "step": 2435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4872, "grad_norm": 2.4551124572753906, "kl": 0.48267779126763344, "learning_rate": 2.6427220271569206e-06, "loss": 0.0483, "num_tokens": 15889760.0, "reward": 0.781982421875, "reward_std": 0.01397321093827486, "rewards//mean": 0.781982421875, "rewards//std": 0.03944428637623787, "step": 2436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4874, "grad_norm": 1.4766411781311035, "kl": 0.3503557462245226, "learning_rate": 2.6411379232450844e-06, "loss": 0.035, "num_tokens": 15896264.0, "reward": 0.82989501953125, "reward_std": 0.015760352835059166, "rewards//mean": 0.82989501953125, "rewards//std": 0.029609810560941696, "step": 2437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4876, "grad_norm": 1.235368251800537, "kl": 0.16894059162586927, "learning_rate": 2.63955376248291e-06, "loss": 0.0169, "num_tokens": 15902824.0, "reward": 0.8646240234375, "reward_std": 0.009502649307250977, "rewards//mean": 0.8646240234375, "rewards//std": 0.03327121585607529, "step": 2438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4878, "grad_norm": 1.7899178266525269, "kl": 0.434480257332325, "learning_rate": 2.6379695455084963e-06, "loss": 0.0434, "num_tokens": 15909384.0, "reward": 0.87432861328125, "reward_std": 0.014413990080356598, "rewards//mean": 0.87432861328125, "rewards//std": 0.02858281321823597, "step": 2439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.488, "grad_norm": 1.5874227285385132, "kl": 0.3356715030968189, "learning_rate": 2.6363852729599672e-06, "loss": 0.0336, "num_tokens": 15915952.0, "reward": 0.84197998046875, "reward_std": 0.01369521114975214, "rewards//mean": 0.84197998046875, "rewards//std": 0.02630014717578888, "step": 2440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4882, "grad_norm": 1.5935243368148804, "kl": 0.2207133900374174, "learning_rate": 2.634800945475465e-06, "loss": 0.0221, "num_tokens": 15922496.0, "reward": 0.85931396484375, "reward_std": 0.008061347529292107, "rewards//mean": 0.85931396484375, "rewards//std": 0.03084820695221424, "step": 2441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4884, "grad_norm": 1.5645921230316162, "kl": 0.2781666871160269, "learning_rate": 2.6332165636931593e-06, "loss": 0.0278, "num_tokens": 15928984.0, "reward": 0.87371826171875, "reward_std": 0.015357434749603271, "rewards//mean": 0.87371826171875, "rewards//std": 0.03245935216546059, "step": 2442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4886, "grad_norm": 1.43972909450531, "kl": 0.2997987885028124, "learning_rate": 2.6316321282512368e-06, "loss": 0.03, "num_tokens": 15935600.0, "reward": 0.84063720703125, "reward_std": 0.009399522095918655, "rewards//mean": 0.84063720703125, "rewards//std": 0.021066632121801376, "step": 2443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4888, "grad_norm": 1.505717396736145, "kl": 0.33386301063001156, "learning_rate": 2.630047639787909e-06, "loss": 0.0334, "num_tokens": 15942056.0, "reward": 0.8404541015625, "reward_std": 0.01086465921252966, "rewards//mean": 0.8404541015625, "rewards//std": 0.021745948120951653, "step": 2444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.489, "grad_norm": 1.2849024534225464, "kl": 0.32227552495896816, "learning_rate": 2.6284630989414078e-06, "loss": 0.0322, "num_tokens": 15948544.0, "reward": 0.8677978515625, "reward_std": 0.010741427540779114, "rewards//mean": 0.8677978515625, "rewards//std": 0.019639072939753532, "step": 2445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4892, "grad_norm": 2.6308796405792236, "kl": 0.25684032402932644, "learning_rate": 2.626878506349986e-06, "loss": 0.0257, "num_tokens": 15955032.0, "reward": 0.86553955078125, "reward_std": 0.00945857260376215, "rewards//mean": 0.86553955078125, "rewards//std": 0.021663213148713112, "step": 2446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4894, "grad_norm": 1.9056003093719482, "kl": 0.22163926158100367, "learning_rate": 2.625293862651916e-06, "loss": 0.0222, "num_tokens": 15961576.0, "reward": 0.8785400390625, "reward_std": 0.01035432331264019, "rewards//mean": 0.8785400390625, "rewards//std": 0.032074011862277985, "step": 2447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4896, "grad_norm": 2.570564031600952, "kl": 0.28995821345597506, "learning_rate": 2.6237091684854945e-06, "loss": 0.029, "num_tokens": 15968120.0, "reward": 0.82879638671875, "reward_std": 0.01371969748288393, "rewards//mean": 0.82879638671875, "rewards//std": 0.02547328732907772, "step": 2448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4898, "grad_norm": 1.790452003479004, "kl": 0.45712690986692905, "learning_rate": 2.6221244244890336e-06, "loss": 0.0457, "num_tokens": 15974616.0, "reward": 0.84002685546875, "reward_std": 0.012459748424589634, "rewards//mean": 0.84002685546875, "rewards//std": 0.02603636309504509, "step": 2449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.49, "grad_norm": 2.1952457427978516, "kl": 0.3013730179518461, "learning_rate": 2.620539631300869e-06, "loss": 0.0301, "num_tokens": 15981200.0, "reward": 0.8204345703125, "reward_std": 0.009451804682612419, "rewards//mean": 0.8204345703125, "rewards//std": 0.019722143188118935, "step": 2450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4902, "grad_norm": 1.7211582660675049, "kl": 0.1996807949617505, "learning_rate": 2.6189547895593565e-06, "loss": 0.02, "num_tokens": 15987808.0, "reward": 0.85302734375, "reward_std": 0.009768860414624214, "rewards//mean": 0.85302734375, "rewards//std": 0.02826707810163498, "step": 2451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4904, "grad_norm": 1.7095606327056885, "kl": 0.3498337157070637, "learning_rate": 2.6173698999028676e-06, "loss": 0.035, "num_tokens": 15994328.0, "reward": 0.86566162109375, "reward_std": 0.013514718040823936, "rewards//mean": 0.86566162109375, "rewards//std": 0.02341286465525627, "step": 2452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4906, "grad_norm": 2.1271109580993652, "kl": 0.32594658341258764, "learning_rate": 2.615784962969798e-06, "loss": 0.0326, "num_tokens": 16000848.0, "reward": 0.7989501953125, "reward_std": 0.015415003523230553, "rewards//mean": 0.7989501953125, "rewards//std": 0.033479850739240646, "step": 2453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4908, "grad_norm": 1.7912023067474365, "kl": 0.3334022965282202, "learning_rate": 2.6141999793985586e-06, "loss": 0.0333, "num_tokens": 16007336.0, "reward": 0.8489990234375, "reward_std": 0.01317882165312767, "rewards//mean": 0.8489990234375, "rewards//std": 0.023798668757081032, "step": 2454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.491, "grad_norm": 1.4344667196273804, "kl": 0.2903171870857477, "learning_rate": 2.6126149498275814e-06, "loss": 0.029, "num_tokens": 16013800.0, "reward": 0.82232666015625, "reward_std": 0.013900223188102245, "rewards//mean": 0.82232666015625, "rewards//std": 0.03879491984844208, "step": 2455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4912, "grad_norm": 2.376922845840454, "kl": 0.5250178445130587, "learning_rate": 2.6110298748953158e-06, "loss": 0.0525, "num_tokens": 16020296.0, "reward": 0.85626220703125, "reward_std": 0.013904450461268425, "rewards//mean": 0.85626220703125, "rewards//std": 0.025403067469596863, "step": 2456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4914, "grad_norm": 1.8872917890548706, "kl": 0.3556964322924614, "learning_rate": 2.60944475524023e-06, "loss": 0.0356, "num_tokens": 16026856.0, "reward": 0.83544921875, "reward_std": 0.017335427924990654, "rewards//mean": 0.83544921875, "rewards//std": 0.03304683789610863, "step": 2457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4916, "grad_norm": 1.4046353101730347, "kl": 0.2166057089343667, "learning_rate": 2.6078595915008096e-06, "loss": 0.0217, "num_tokens": 16033480.0, "reward": 0.83050537109375, "reward_std": 0.010851139202713966, "rewards//mean": 0.83050537109375, "rewards//std": 0.019395826384425163, "step": 2458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4918, "grad_norm": 2.7267603874206543, "kl": 0.2755761370062828, "learning_rate": 2.606274384315559e-06, "loss": 0.0276, "num_tokens": 16039968.0, "reward": 0.8155517578125, "reward_std": 0.011606285348534584, "rewards//mean": 0.8155517578125, "rewards//std": 0.026194389909505844, "step": 2459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.492, "grad_norm": 1.854804515838623, "kl": 0.41427729092538357, "learning_rate": 2.604689134322999e-06, "loss": 0.0414, "num_tokens": 16046432.0, "reward": 0.81951904296875, "reward_std": 0.011512323282659054, "rewards//mean": 0.81951904296875, "rewards//std": 0.02530754543840885, "step": 2460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4922, "grad_norm": 1.6037660837173462, "kl": 0.2892428273335099, "learning_rate": 2.6031038421616684e-06, "loss": 0.0289, "num_tokens": 16052976.0, "reward": 0.828857421875, "reward_std": 0.010024795308709145, "rewards//mean": 0.828857421875, "rewards//std": 0.015838617458939552, "step": 2461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4924, "grad_norm": 1.3342516422271729, "kl": 0.3512964677065611, "learning_rate": 2.6015185084701226e-06, "loss": 0.0351, "num_tokens": 16059536.0, "reward": 0.838623046875, "reward_std": 0.009889259934425354, "rewards//mean": 0.838623046875, "rewards//std": 0.021551910787820816, "step": 2462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4926, "grad_norm": 2.3771471977233887, "kl": 0.22252903878688812, "learning_rate": 2.599933133886934e-06, "loss": 0.0223, "num_tokens": 16065984.0, "reward": 0.87359619140625, "reward_std": 0.014277622103691101, "rewards//mean": 0.87359619140625, "rewards//std": 0.03518976643681526, "step": 2463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4928, "grad_norm": 2.7124555110931396, "kl": 0.27332983911037445, "learning_rate": 2.59834771905069e-06, "loss": 0.0273, "num_tokens": 16072480.0, "reward": 0.8603515625, "reward_std": 0.013286899775266647, "rewards//mean": 0.8603515625, "rewards//std": 0.04054718464612961, "step": 2464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.493, "grad_norm": 1.6197770833969116, "kl": 0.3143019061535597, "learning_rate": 2.5967622645999972e-06, "loss": 0.0314, "num_tokens": 16078992.0, "reward": 0.86676025390625, "reward_std": 0.009944534860551357, "rewards//mean": 0.86676025390625, "rewards//std": 0.027849432080984116, "step": 2465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4932, "grad_norm": 1.7265771627426147, "kl": 0.46843980997800827, "learning_rate": 2.5951767711734756e-06, "loss": 0.0468, "num_tokens": 16085536.0, "reward": 0.84967041015625, "reward_std": 0.010644908994436264, "rewards//mean": 0.84967041015625, "rewards//std": 0.03169628232717514, "step": 2466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4934, "grad_norm": 1.5965112447738647, "kl": 0.3645903719589114, "learning_rate": 2.5935912394097616e-06, "loss": 0.0365, "num_tokens": 16091944.0, "reward": 0.87738037109375, "reward_std": 0.015844646841287613, "rewards//mean": 0.87738037109375, "rewards//std": 0.03229899704456329, "step": 2467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4936, "grad_norm": 2.3450405597686768, "kl": 0.36557460203766823, "learning_rate": 2.5920056699475076e-06, "loss": 0.0366, "num_tokens": 16098384.0, "reward": 0.81927490234375, "reward_std": 0.011214300990104675, "rewards//mean": 0.81927490234375, "rewards//std": 0.016102004796266556, "step": 2468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4938, "grad_norm": 1.632002592086792, "kl": 0.33931236527860165, "learning_rate": 2.590420063425382e-06, "loss": 0.0339, "num_tokens": 16104832.0, "reward": 0.8209228515625, "reward_std": 0.013540804386138916, "rewards//mean": 0.8209228515625, "rewards//std": 0.030331386253237724, "step": 2469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.494, "grad_norm": 1.961845874786377, "kl": 0.2962764548137784, "learning_rate": 2.588834420482064e-06, "loss": 0.0296, "num_tokens": 16111352.0, "reward": 0.8404541015625, "reward_std": 0.0118065495043993, "rewards//mean": 0.8404541015625, "rewards//std": 0.028367312625050545, "step": 2470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4942, "grad_norm": 1.5983009338378906, "kl": 0.23230336979031563, "learning_rate": 2.587248741756253e-06, "loss": 0.0232, "num_tokens": 16117888.0, "reward": 0.84356689453125, "reward_std": 0.010063902474939823, "rewards//mean": 0.84356689453125, "rewards//std": 0.0235141534358263, "step": 2471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4944, "grad_norm": 2.3154795169830322, "kl": 0.21641121618449688, "learning_rate": 2.585663027886659e-06, "loss": 0.0216, "num_tokens": 16124352.0, "reward": 0.840576171875, "reward_std": 0.012653951533138752, "rewards//mean": 0.840576171875, "rewards//std": 0.031148109585046768, "step": 2472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4946, "grad_norm": 1.9953831434249878, "kl": 0.26242612674832344, "learning_rate": 2.584077279512007e-06, "loss": 0.0262, "num_tokens": 16130872.0, "reward": 0.82208251953125, "reward_std": 0.012262675911188126, "rewards//mean": 0.82208251953125, "rewards//std": 0.020936891436576843, "step": 2473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4948, "grad_norm": 1.7465953826904297, "kl": 0.24859404936432838, "learning_rate": 2.582491497271038e-06, "loss": 0.0249, "num_tokens": 16137360.0, "reward": 0.77520751953125, "reward_std": 0.007223078981041908, "rewards//mean": 0.77520751953125, "rewards//std": 0.015452446416020393, "step": 2474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.495, "grad_norm": 2.1239845752716064, "kl": 0.26639459654688835, "learning_rate": 2.580905681802503e-06, "loss": 0.0266, "num_tokens": 16143872.0, "reward": 0.81719970703125, "reward_std": 0.010705179534852505, "rewards//mean": 0.81719970703125, "rewards//std": 0.020281709730625153, "step": 2475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4952, "grad_norm": 1.7472479343414307, "kl": 0.1606267960742116, "learning_rate": 2.57931983374517e-06, "loss": 0.0161, "num_tokens": 16150320.0, "reward": 0.8531494140625, "reward_std": 0.014356125146150589, "rewards//mean": 0.8531494140625, "rewards//std": 0.03317279368638992, "step": 2476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4954, "grad_norm": 1.5441423654556274, "kl": 0.1811736999079585, "learning_rate": 2.577733953737816e-06, "loss": 0.0181, "num_tokens": 16156776.0, "reward": 0.78668212890625, "reward_std": 0.011386802420020103, "rewards//mean": 0.78668212890625, "rewards//std": 0.02101411111652851, "step": 2477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4956, "grad_norm": 1.7555235624313354, "kl": 0.29185268469154835, "learning_rate": 2.5761480424192358e-06, "loss": 0.0292, "num_tokens": 16163336.0, "reward": 0.8629150390625, "reward_std": 0.012290574610233307, "rewards//mean": 0.8629150390625, "rewards//std": 0.023834262043237686, "step": 2478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4958, "grad_norm": 1.777313470840454, "kl": 0.22826204914599657, "learning_rate": 2.574562100428233e-06, "loss": 0.0228, "num_tokens": 16169896.0, "reward": 0.8271484375, "reward_std": 0.009191330522298813, "rewards//mean": 0.8271484375, "rewards//std": 0.029528474435210228, "step": 2479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.496, "grad_norm": 1.8449537754058838, "kl": 0.30622424744069576, "learning_rate": 2.5729761284036247e-06, "loss": 0.0306, "num_tokens": 16176448.0, "reward": 0.81939697265625, "reward_std": 0.012415748089551926, "rewards//mean": 0.81939697265625, "rewards//std": 0.014511778950691223, "step": 2480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4962, "grad_norm": 3.0311455726623535, "kl": 0.2736195642501116, "learning_rate": 2.5713901269842405e-06, "loss": 0.0274, "num_tokens": 16182968.0, "reward": 0.85992431640625, "reward_std": 0.012117796577513218, "rewards//mean": 0.85992431640625, "rewards//std": 0.030999954789876938, "step": 2481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4964, "grad_norm": 2.5516581535339355, "kl": 0.21123144682496786, "learning_rate": 2.569804096808923e-06, "loss": 0.0211, "num_tokens": 16189544.0, "reward": 0.84375, "reward_std": 0.012208626605570316, "rewards//mean": 0.84375, "rewards//std": 0.03689418360590935, "step": 2482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4966, "grad_norm": 1.262876033782959, "kl": 0.17070193774998188, "learning_rate": 2.568218038516522e-06, "loss": 0.0171, "num_tokens": 16195960.0, "reward": 0.85821533203125, "reward_std": 0.008298475295305252, "rewards//mean": 0.85821533203125, "rewards//std": 0.01698678359389305, "step": 2483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4968, "grad_norm": 1.690809965133667, "kl": 0.24147260934114456, "learning_rate": 2.5666319527459044e-06, "loss": 0.0241, "num_tokens": 16202488.0, "reward": 0.86517333984375, "reward_std": 0.014171771705150604, "rewards//mean": 0.86517333984375, "rewards//std": 0.03122667968273163, "step": 2484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.497, "grad_norm": 3.7948496341705322, "kl": 0.2529966337606311, "learning_rate": 2.5650458401359436e-06, "loss": 0.0253, "num_tokens": 16209056.0, "reward": 0.86767578125, "reward_std": 0.015222111716866493, "rewards//mean": 0.86767578125, "rewards//std": 0.02643398754298687, "step": 2485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4972, "grad_norm": 1.7960100173950195, "kl": 0.25113015016540885, "learning_rate": 2.5634597013255265e-06, "loss": 0.0251, "num_tokens": 16215592.0, "reward": 0.8656005859375, "reward_std": 0.01393476314842701, "rewards//mean": 0.8656005859375, "rewards//std": 0.025569746270775795, "step": 2486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4974, "grad_norm": 1.7831249237060547, "kl": 0.41491211391985416, "learning_rate": 2.561873536953549e-06, "loss": 0.0415, "num_tokens": 16222040.0, "reward": 0.8348388671875, "reward_std": 0.01631031557917595, "rewards//mean": 0.8348388671875, "rewards//std": 0.03368539735674858, "step": 2487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4976, "grad_norm": 1.410488247871399, "kl": 0.15417403541505337, "learning_rate": 2.5602873476589186e-06, "loss": 0.0154, "num_tokens": 16228680.0, "reward": 0.8084716796875, "reward_std": 0.014457312412559986, "rewards//mean": 0.8084716796875, "rewards//std": 0.029528219252824783, "step": 2488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4978, "grad_norm": 3.017153024673462, "kl": 0.2626640284433961, "learning_rate": 2.5587011340805514e-06, "loss": 0.0263, "num_tokens": 16235208.0, "reward": 0.8388671875, "reward_std": 0.009593318216502666, "rewards//mean": 0.8388671875, "rewards//std": 0.020623009651899338, "step": 2489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.498, "grad_norm": 1.728162169456482, "kl": 0.21758228540420532, "learning_rate": 2.5571148968573747e-06, "loss": 0.0218, "num_tokens": 16241704.0, "reward": 0.81103515625, "reward_std": 0.011358867399394512, "rewards//mean": 0.81103515625, "rewards//std": 0.02182689495384693, "step": 2490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4982, "grad_norm": 1.3808461427688599, "kl": 0.16961194248870015, "learning_rate": 2.555528636628324e-06, "loss": 0.017, "num_tokens": 16248200.0, "reward": 0.79937744140625, "reward_std": 0.011301981285214424, "rewards//mean": 0.79937744140625, "rewards//std": 0.02623964287340641, "step": 2491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4984, "grad_norm": 1.574618935585022, "kl": 0.19624423701316118, "learning_rate": 2.5539423540323447e-06, "loss": 0.0196, "num_tokens": 16254664.0, "reward": 0.8580322265625, "reward_std": 0.010723263025283813, "rewards//mean": 0.8580322265625, "rewards//std": 0.022980613633990288, "step": 2492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4986, "grad_norm": 2.9131853580474854, "kl": 0.24487788695842028, "learning_rate": 2.5523560497083927e-06, "loss": 0.0245, "num_tokens": 16261152.0, "reward": 0.8094482421875, "reward_std": 0.011428981088101864, "rewards//mean": 0.8094482421875, "rewards//std": 0.019368978217244148, "step": 2493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4988, "grad_norm": 1.9206171035766602, "kl": 0.20338453818112612, "learning_rate": 2.5507697242954295e-06, "loss": 0.0203, "num_tokens": 16267672.0, "reward": 0.84326171875, "reward_std": 0.009627661667764187, "rewards//mean": 0.84326171875, "rewards//std": 0.02183798886835575, "step": 2494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.499, "grad_norm": 1.8427461385726929, "kl": 0.28964888677001, "learning_rate": 2.549183378432428e-06, "loss": 0.029, "num_tokens": 16274152.0, "reward": 0.8316650390625, "reward_std": 0.011716336943209171, "rewards//mean": 0.8316650390625, "rewards//std": 0.02162868343293667, "step": 2495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4992, "grad_norm": 1.797951579093933, "kl": 0.218647507019341, "learning_rate": 2.5475970127583665e-06, "loss": 0.0219, "num_tokens": 16280648.0, "reward": 0.84954833984375, "reward_std": 0.010953174903988838, "rewards//mean": 0.84954833984375, "rewards//std": 0.020207684487104416, "step": 2496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4994, "grad_norm": 1.4754337072372437, "kl": 0.3366709742695093, "learning_rate": 2.5460106279122354e-06, "loss": 0.0337, "num_tokens": 16287144.0, "reward": 0.8389892578125, "reward_std": 0.01165071316063404, "rewards//mean": 0.8389892578125, "rewards//std": 0.016768498346209526, "step": 2497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4996, "grad_norm": 1.6104755401611328, "kl": 0.33308495953679085, "learning_rate": 2.5444242245330276e-06, "loss": 0.0333, "num_tokens": 16293616.0, "reward": 0.78973388671875, "reward_std": 0.012372547760605812, "rewards//mean": 0.78973388671875, "rewards//std": 0.014458481222391129, "step": 2498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.4998, "grad_norm": 2.391277551651001, "kl": 0.200754608027637, "learning_rate": 2.542837803259748e-06, "loss": 0.0201, "num_tokens": 16300104.0, "reward": 0.84637451171875, "reward_std": 0.016695328056812286, "rewards//mean": 0.84637451171875, "rewards//std": 0.02537325583398342, "step": 2499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5, "grad_norm": 2.914837121963501, "kl": 0.26235833764076233, "learning_rate": 2.5412513647314065e-06, "loss": 0.0262, "num_tokens": 16306688.0, "reward": 0.85101318359375, "reward_std": 0.010424918495118618, "rewards//mean": 0.85101318359375, "rewards//std": 0.022248564288020134, "step": 2500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5002, "grad_norm": 1.5018295049667358, "kl": 0.1874726889654994, "learning_rate": 2.53966490958702e-06, "loss": 0.0187, "num_tokens": 16313288.0, "reward": 0.83056640625, "reward_std": 0.010075242258608341, "rewards//mean": 0.83056640625, "rewards//std": 0.030157459899783134, "step": 2501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5004, "grad_norm": 1.8648275136947632, "kl": 0.3198117120191455, "learning_rate": 2.5380784384656126e-06, "loss": 0.032, "num_tokens": 16319816.0, "reward": 0.8209228515625, "reward_std": 0.012409290298819542, "rewards//mean": 0.8209228515625, "rewards//std": 0.020555000752210617, "step": 2502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5006, "grad_norm": 3.101459264755249, "kl": 0.281138701364398, "learning_rate": 2.536491952006215e-06, "loss": 0.0281, "num_tokens": 16326320.0, "reward": 0.87060546875, "reward_std": 0.013392644934356213, "rewards//mean": 0.87060546875, "rewards//std": 0.022951940074563026, "step": 2503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5008, "grad_norm": 1.5123884677886963, "kl": 0.29189431108534336, "learning_rate": 2.5349054508478636e-06, "loss": 0.0292, "num_tokens": 16332808.0, "reward": 0.80712890625, "reward_std": 0.012681880034506321, "rewards//mean": 0.80712890625, "rewards//std": 0.03591282665729523, "step": 2504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.501, "grad_norm": 1.732906699180603, "kl": 0.2455336255952716, "learning_rate": 2.5333189356296006e-06, "loss": 0.0246, "num_tokens": 16339264.0, "reward": 0.7940673828125, "reward_std": 0.011640122160315514, "rewards//mean": 0.7940673828125, "rewards//std": 0.027765393257141113, "step": 2505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5012, "grad_norm": 1.818814992904663, "kl": 0.31241329573094845, "learning_rate": 2.531732406990474e-06, "loss": 0.0312, "num_tokens": 16345776.0, "reward": 0.8834228515625, "reward_std": 0.013374852947890759, "rewards//mean": 0.8834228515625, "rewards//std": 0.024980410933494568, "step": 2506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5014, "grad_norm": 1.4015955924987793, "kl": 0.3067450728267431, "learning_rate": 2.530145865569538e-06, "loss": 0.0307, "num_tokens": 16352296.0, "reward": 0.8597412109375, "reward_std": 0.015514080412685871, "rewards//mean": 0.8597412109375, "rewards//std": 0.032930970191955566, "step": 2507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5016, "grad_norm": 1.3870824575424194, "kl": 0.34432306606322527, "learning_rate": 2.528559312005851e-06, "loss": 0.0344, "num_tokens": 16358784.0, "reward": 0.856201171875, "reward_std": 0.01626926101744175, "rewards//mean": 0.856201171875, "rewards//std": 0.030614785850048065, "step": 2508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5018, "grad_norm": 3.091921806335449, "kl": 0.3104264670982957, "learning_rate": 2.5269727469384762e-06, "loss": 0.031, "num_tokens": 16365320.0, "reward": 0.80584716796875, "reward_std": 0.01213865540921688, "rewards//mean": 0.80584716796875, "rewards//std": 0.01778094470500946, "step": 2509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.502, "grad_norm": 1.8312230110168457, "kl": 0.2971277590841055, "learning_rate": 2.525386171006483e-06, "loss": 0.0297, "num_tokens": 16371840.0, "reward": 0.88372802734375, "reward_std": 0.014626404270529747, "rewards//mean": 0.88372802734375, "rewards//std": 0.01595846749842167, "step": 2510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5022, "grad_norm": 1.3745338916778564, "kl": 0.37570967990905046, "learning_rate": 2.5237995848489422e-06, "loss": 0.0376, "num_tokens": 16378464.0, "reward": 0.82464599609375, "reward_std": 0.015737462788820267, "rewards//mean": 0.82464599609375, "rewards//std": 0.03617902472615242, "step": 2511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5024, "grad_norm": 2.297320604324341, "kl": 0.302663866430521, "learning_rate": 2.522212989104932e-06, "loss": 0.0303, "num_tokens": 16384944.0, "reward": 0.81915283203125, "reward_std": 0.009844222106039524, "rewards//mean": 0.81915283203125, "rewards//std": 0.01995060406625271, "step": 2512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5026, "grad_norm": 1.718304991722107, "kl": 0.40152551606297493, "learning_rate": 2.520626384413532e-06, "loss": 0.0402, "num_tokens": 16391480.0, "reward": 0.80377197265625, "reward_std": 0.009933361783623695, "rewards//mean": 0.80377197265625, "rewards//std": 0.019484594464302063, "step": 2513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5028, "grad_norm": 1.630189061164856, "kl": 0.36287038680166006, "learning_rate": 2.519039771413827e-06, "loss": 0.0363, "num_tokens": 16398056.0, "reward": 0.8494873046875, "reward_std": 0.011809838004410267, "rewards//mean": 0.8494873046875, "rewards//std": 0.01902199164032936, "step": 2514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.503, "grad_norm": 1.3997116088867188, "kl": 0.2814068514853716, "learning_rate": 2.517453150744904e-06, "loss": 0.0281, "num_tokens": 16404552.0, "reward": 0.8568115234375, "reward_std": 0.01294899545609951, "rewards//mean": 0.8568115234375, "rewards//std": 0.05568472668528557, "step": 2515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5032, "grad_norm": 1.558886170387268, "kl": 0.24359482619911432, "learning_rate": 2.515866523045855e-06, "loss": 0.0244, "num_tokens": 16411112.0, "reward": 0.812255859375, "reward_std": 0.01147528737783432, "rewards//mean": 0.812255859375, "rewards//std": 0.0183994360268116, "step": 2516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5034, "grad_norm": 2.1354639530181885, "kl": 0.14960728213191032, "learning_rate": 2.5142798889557713e-06, "loss": 0.015, "num_tokens": 16417640.0, "reward": 0.8638916015625, "reward_std": 0.00968625582754612, "rewards//mean": 0.8638916015625, "rewards//std": 0.0188203826546669, "step": 2517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5036, "grad_norm": 1.9576085805892944, "kl": 0.3380892714485526, "learning_rate": 2.5126932491137507e-06, "loss": 0.0338, "num_tokens": 16424176.0, "reward": 0.76416015625, "reward_std": 0.011740263551473618, "rewards//mean": 0.76416015625, "rewards//std": 0.019838847219944, "step": 2518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5038, "grad_norm": 2.098404884338379, "kl": 0.24595889635384083, "learning_rate": 2.5111066041588905e-06, "loss": 0.0246, "num_tokens": 16430752.0, "reward": 0.8221435546875, "reward_std": 0.012425771914422512, "rewards//mean": 0.8221435546875, "rewards//std": 0.029223153367638588, "step": 2519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.504, "grad_norm": 2.04781174659729, "kl": 0.44675134494900703, "learning_rate": 2.509519954730292e-06, "loss": 0.0447, "num_tokens": 16437184.0, "reward": 0.82916259765625, "reward_std": 0.009838099591434002, "rewards//mean": 0.82916259765625, "rewards//std": 0.026813725009560585, "step": 2520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5042, "grad_norm": 1.4970394372940063, "kl": 0.3407612731680274, "learning_rate": 2.507933301467056e-06, "loss": 0.0341, "num_tokens": 16443760.0, "reward": 0.83526611328125, "reward_std": 0.012729013338685036, "rewards//mean": 0.83526611328125, "rewards//std": 0.029409751296043396, "step": 2521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5044, "grad_norm": 1.5638236999511719, "kl": 0.3877902403473854, "learning_rate": 2.506346645008288e-06, "loss": 0.0388, "num_tokens": 16450272.0, "reward": 0.86962890625, "reward_std": 0.01711748167872429, "rewards//mean": 0.86962890625, "rewards//std": 0.024305002763867378, "step": 2522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5046, "grad_norm": 1.705910086631775, "kl": 0.18201319687068462, "learning_rate": 2.5047599859930916e-06, "loss": 0.0182, "num_tokens": 16456792.0, "reward": 0.85784912109375, "reward_std": 0.009183516725897789, "rewards//mean": 0.85784912109375, "rewards//std": 0.022332772612571716, "step": 2523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5048, "grad_norm": 2.046365737915039, "kl": 0.2254131156951189, "learning_rate": 2.503173325060574e-06, "loss": 0.0225, "num_tokens": 16463328.0, "reward": 0.86114501953125, "reward_std": 0.012010055594146252, "rewards//mean": 0.86114501953125, "rewards//std": 0.023302040994167328, "step": 2524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.505, "grad_norm": 2.2138376235961914, "kl": 0.42002639453858137, "learning_rate": 2.501586662849841e-06, "loss": 0.042, "num_tokens": 16469888.0, "reward": 0.8507080078125, "reward_std": 0.011498721316456795, "rewards//mean": 0.8507080078125, "rewards//std": 0.02011730894446373, "step": 2525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5052, "grad_norm": 1.3715267181396484, "kl": 0.2763461507856846, "learning_rate": 2.5e-06, "loss": 0.0276, "num_tokens": 16476408.0, "reward": 0.84051513671875, "reward_std": 0.015005196444690228, "rewards//mean": 0.84051513671875, "rewards//std": 0.02110539749264717, "step": 2526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5054, "grad_norm": 2.0013267993927, "kl": 0.2325794380158186, "learning_rate": 2.49841333715016e-06, "loss": 0.0233, "num_tokens": 16482856.0, "reward": 0.85723876953125, "reward_std": 0.008064309135079384, "rewards//mean": 0.85723876953125, "rewards//std": 0.016904599964618683, "step": 2527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5056, "grad_norm": 1.6709753274917603, "kl": 0.2320391875691712, "learning_rate": 2.496826674939427e-06, "loss": 0.0232, "num_tokens": 16489384.0, "reward": 0.85943603515625, "reward_std": 0.011077716015279293, "rewards//mean": 0.85943603515625, "rewards//std": 0.014886637218296528, "step": 2528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5058, "grad_norm": 1.7893208265304565, "kl": 0.3001213651150465, "learning_rate": 2.495240014006909e-06, "loss": 0.03, "num_tokens": 16495904.0, "reward": 0.86737060546875, "reward_std": 0.009862218052148819, "rewards//mean": 0.86737060546875, "rewards//std": 0.02859022654592991, "step": 2529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.506, "grad_norm": 1.8512728214263916, "kl": 0.3713970258831978, "learning_rate": 2.493653354991713e-06, "loss": 0.0371, "num_tokens": 16502384.0, "reward": 0.86065673828125, "reward_std": 0.015258923172950745, "rewards//mean": 0.86065673828125, "rewards//std": 0.027726860716938972, "step": 2530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5062, "grad_norm": 1.5370680093765259, "kl": 0.23986042384058237, "learning_rate": 2.4920666985329446e-06, "loss": 0.024, "num_tokens": 16508880.0, "reward": 0.84771728515625, "reward_std": 0.012663004919886589, "rewards//mean": 0.84771728515625, "rewards//std": 0.023307885974645615, "step": 2531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5064, "grad_norm": 2.3581624031066895, "kl": 0.5914188530296087, "learning_rate": 2.4904800452697085e-06, "loss": 0.0591, "num_tokens": 16515320.0, "reward": 0.82952880859375, "reward_std": 0.01230580173432827, "rewards//mean": 0.82952880859375, "rewards//std": 0.021703006699681282, "step": 2532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5066, "grad_norm": 2.032431125640869, "kl": 0.5449427869170904, "learning_rate": 2.4888933958411104e-06, "loss": 0.0545, "num_tokens": 16521840.0, "reward": 0.8553466796875, "reward_std": 0.015318848192691803, "rewards//mean": 0.8553466796875, "rewards//std": 0.025219237431883812, "step": 2533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5068, "grad_norm": 1.9194674491882324, "kl": 0.49969081953167915, "learning_rate": 2.48730675088625e-06, "loss": 0.05, "num_tokens": 16528288.0, "reward": 0.87823486328125, "reward_std": 0.011283699423074722, "rewards//mean": 0.87823486328125, "rewards//std": 0.02020243927836418, "step": 2534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.507, "grad_norm": 1.7263127565383911, "kl": 0.3564156610518694, "learning_rate": 2.4857201110442295e-06, "loss": 0.0356, "num_tokens": 16534928.0, "reward": 0.86285400390625, "reward_std": 0.017202507704496384, "rewards//mean": 0.86285400390625, "rewards//std": 0.033740345388650894, "step": 2535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5072, "grad_norm": 2.60752272605896, "kl": 0.4010701794177294, "learning_rate": 2.4841334769541457e-06, "loss": 0.0401, "num_tokens": 16541512.0, "reward": 0.8675537109375, "reward_std": 0.014426898211240768, "rewards//mean": 0.8675537109375, "rewards//std": 0.03596147894859314, "step": 2536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5074, "grad_norm": 1.4221830368041992, "kl": 0.25643387250602245, "learning_rate": 2.482546849255096e-06, "loss": 0.0256, "num_tokens": 16548072.0, "reward": 0.86676025390625, "reward_std": 0.011343313381075859, "rewards//mean": 0.86676025390625, "rewards//std": 0.017484763637185097, "step": 2537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5076, "grad_norm": 1.5597951412200928, "kl": 0.21451689954847097, "learning_rate": 2.4809602285861737e-06, "loss": 0.0215, "num_tokens": 16554600.0, "reward": 0.861572265625, "reward_std": 0.015151053667068481, "rewards//mean": 0.861572265625, "rewards//std": 0.04139183089137077, "step": 2538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5078, "grad_norm": 3.0436723232269287, "kl": 0.4672063812613487, "learning_rate": 2.479373615586469e-06, "loss": 0.0467, "num_tokens": 16561064.0, "reward": 0.7939453125, "reward_std": 0.014334510080516338, "rewards//mean": 0.7939453125, "rewards//std": 0.024220149964094162, "step": 2539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.508, "grad_norm": 1.8942006826400757, "kl": 0.2902680393308401, "learning_rate": 2.477787010895069e-06, "loss": 0.029, "num_tokens": 16567584.0, "reward": 0.76708984375, "reward_std": 0.012336043640971184, "rewards//mean": 0.76708984375, "rewards//std": 0.022536631673574448, "step": 2540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5082, "grad_norm": 1.9670993089675903, "kl": 0.5222773216664791, "learning_rate": 2.4762004151510586e-06, "loss": 0.0522, "num_tokens": 16574040.0, "reward": 0.82867431640625, "reward_std": 0.010806123726069927, "rewards//mean": 0.82867431640625, "rewards//std": 0.031023385003209114, "step": 2541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5084, "grad_norm": 2.3889992237091064, "kl": 0.3000019518658519, "learning_rate": 2.4746138289935177e-06, "loss": 0.03, "num_tokens": 16580648.0, "reward": 0.83099365234375, "reward_std": 0.012368901632726192, "rewards//mean": 0.83099365234375, "rewards//std": 0.022980859503149986, "step": 2542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5086, "grad_norm": 2.527829885482788, "kl": 0.23390883393585682, "learning_rate": 2.473027253061524e-06, "loss": 0.0234, "num_tokens": 16587208.0, "reward": 0.8544921875, "reward_std": 0.01200925000011921, "rewards//mean": 0.8544921875, "rewards//std": 0.02995196171104908, "step": 2543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5088, "grad_norm": 1.8188652992248535, "kl": 0.4250277355313301, "learning_rate": 2.47144068799415e-06, "loss": 0.0425, "num_tokens": 16593696.0, "reward": 0.84588623046875, "reward_std": 0.01012780237942934, "rewards//mean": 0.84588623046875, "rewards//std": 0.01696181297302246, "step": 2544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.509, "grad_norm": 1.7192133665084839, "kl": 0.5106096044182777, "learning_rate": 2.4698541344304623e-06, "loss": 0.0511, "num_tokens": 16600128.0, "reward": 0.85089111328125, "reward_std": 0.013833921402692795, "rewards//mean": 0.85089111328125, "rewards//std": 0.02991599403321743, "step": 2545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5092, "grad_norm": 2.9199490547180176, "kl": 0.5789858568459749, "learning_rate": 2.4682675930095264e-06, "loss": 0.0579, "num_tokens": 16606720.0, "reward": 0.8565673828125, "reward_std": 0.018710341304540634, "rewards//mean": 0.8565673828125, "rewards//std": 0.04291781410574913, "step": 2546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5094, "grad_norm": 1.7597541809082031, "kl": 0.565124798566103, "learning_rate": 2.4666810643704003e-06, "loss": 0.0565, "num_tokens": 16613216.0, "reward": 0.86767578125, "reward_std": 0.015368029475212097, "rewards//mean": 0.86767578125, "rewards//std": 0.026816053315997124, "step": 2547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5096, "grad_norm": 1.871014952659607, "kl": 0.3030029609799385, "learning_rate": 2.4650945491521372e-06, "loss": 0.0303, "num_tokens": 16619648.0, "reward": 0.852294921875, "reward_std": 0.00981568731367588, "rewards//mean": 0.852294921875, "rewards//std": 0.021630434319376945, "step": 2548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5098, "grad_norm": 1.7850871086120605, "kl": 0.3168558217585087, "learning_rate": 2.463508047993785e-06, "loss": 0.0317, "num_tokens": 16626072.0, "reward": 0.8931884765625, "reward_std": 0.0143305454403162, "rewards//mean": 0.8931884765625, "rewards//std": 0.030171260237693787, "step": 2549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.51, "grad_norm": 1.6420392990112305, "kl": 0.3282525483518839, "learning_rate": 2.461921561534388e-06, "loss": 0.0328, "num_tokens": 16632584.0, "reward": 0.8424072265625, "reward_std": 0.01302638091146946, "rewards//mean": 0.8424072265625, "rewards//std": 0.02241506241261959, "step": 2550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5102, "grad_norm": 2.061525344848633, "kl": 0.3507862724363804, "learning_rate": 2.4603350904129802e-06, "loss": 0.0351, "num_tokens": 16639104.0, "reward": 0.839111328125, "reward_std": 0.014945794828236103, "rewards//mean": 0.839111328125, "rewards//std": 0.03345022350549698, "step": 2551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5104, "grad_norm": 1.6126154661178589, "kl": 0.340866481885314, "learning_rate": 2.458748635268594e-06, "loss": 0.0341, "num_tokens": 16645576.0, "reward": 0.87347412109375, "reward_std": 0.015506146475672722, "rewards//mean": 0.87347412109375, "rewards//std": 0.020082946866750717, "step": 2552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5106, "grad_norm": 1.6989431381225586, "kl": 0.32655486837029457, "learning_rate": 2.457162196740252e-06, "loss": 0.0327, "num_tokens": 16652048.0, "reward": 0.850830078125, "reward_std": 0.012990662828087807, "rewards//mean": 0.850830078125, "rewards//std": 0.03998098894953728, "step": 2553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5108, "grad_norm": 1.3935903310775757, "kl": 0.40316721238195896, "learning_rate": 2.455575775466973e-06, "loss": 0.0403, "num_tokens": 16658432.0, "reward": 0.85986328125, "reward_std": 0.012846007943153381, "rewards//mean": 0.85986328125, "rewards//std": 0.019494013860821724, "step": 2554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.511, "grad_norm": 3.4051382541656494, "kl": 0.27457467280328274, "learning_rate": 2.453989372087765e-06, "loss": 0.0275, "num_tokens": 16664992.0, "reward": 0.86572265625, "reward_std": 0.013376735150814056, "rewards//mean": 0.86572265625, "rewards//std": 0.01913031004369259, "step": 2555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5112, "grad_norm": 2.027010202407837, "kl": 0.45589890517294407, "learning_rate": 2.4524029872416335e-06, "loss": 0.0456, "num_tokens": 16671608.0, "reward": 0.84027099609375, "reward_std": 0.009386691264808178, "rewards//mean": 0.84027099609375, "rewards//std": 0.02837498113512993, "step": 2556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5114, "grad_norm": 2.770504951477051, "kl": 0.48822442069649696, "learning_rate": 2.450816621567572e-06, "loss": 0.0488, "num_tokens": 16678152.0, "reward": 0.82470703125, "reward_std": 0.010983502492308617, "rewards//mean": 0.82470703125, "rewards//std": 0.015367105603218079, "step": 2557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5116, "grad_norm": 2.358412504196167, "kl": 0.3954878933727741, "learning_rate": 2.4492302757045705e-06, "loss": 0.0395, "num_tokens": 16684600.0, "reward": 0.82568359375, "reward_std": 0.011059756390750408, "rewards//mean": 0.82568359375, "rewards//std": 0.021547695621848106, "step": 2558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5118, "grad_norm": 1.507461667060852, "kl": 0.4474807409569621, "learning_rate": 2.447643950291608e-06, "loss": 0.0447, "num_tokens": 16691096.0, "reward": 0.8341064453125, "reward_std": 0.016117027029395103, "rewards//mean": 0.8341064453125, "rewards//std": 0.025751441717147827, "step": 2559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.512, "grad_norm": 2.0088775157928467, "kl": 0.4540459802374244, "learning_rate": 2.4460576459676553e-06, "loss": 0.0454, "num_tokens": 16697720.0, "reward": 0.84228515625, "reward_std": 0.013288947753608227, "rewards//mean": 0.84228515625, "rewards//std": 0.029011299833655357, "step": 2560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5122, "grad_norm": 1.6629273891448975, "kl": 0.395614180713892, "learning_rate": 2.4444713633716764e-06, "loss": 0.0396, "num_tokens": 16704224.0, "reward": 0.8406982421875, "reward_std": 0.011362366378307343, "rewards//mean": 0.8406982421875, "rewards//std": 0.020722229033708572, "step": 2561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5124, "grad_norm": 2.4014792442321777, "kl": 0.47180740162730217, "learning_rate": 2.4428851031426257e-06, "loss": 0.0472, "num_tokens": 16710688.0, "reward": 0.84619140625, "reward_std": 0.01635817252099514, "rewards//mean": 0.84619140625, "rewards//std": 0.031222863122820854, "step": 2562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5126, "grad_norm": 1.889607548713684, "kl": 0.4418380782008171, "learning_rate": 2.441298865919449e-06, "loss": 0.0442, "num_tokens": 16717208.0, "reward": 0.8280029296875, "reward_std": 0.012547997757792473, "rewards//mean": 0.8280029296875, "rewards//std": 0.03253324329853058, "step": 2563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5128, "grad_norm": 1.790810227394104, "kl": 0.45814836770296097, "learning_rate": 2.439712652341082e-06, "loss": 0.0458, "num_tokens": 16723648.0, "reward": 0.89202880859375, "reward_std": 0.015180263668298721, "rewards//mean": 0.89202880859375, "rewards//std": 0.026868995279073715, "step": 2564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.513, "grad_norm": 2.361876964569092, "kl": 0.5088536217808723, "learning_rate": 2.4381264630464517e-06, "loss": 0.0509, "num_tokens": 16730144.0, "reward": 0.7645263671875, "reward_std": 0.012460285797715187, "rewards//mean": 0.7645263671875, "rewards//std": 0.01928124763071537, "step": 2565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5132, "grad_norm": 2.0941150188446045, "kl": 0.39964594691991806, "learning_rate": 2.436540298674474e-06, "loss": 0.04, "num_tokens": 16736712.0, "reward": 0.85760498046875, "reward_std": 0.01673029363155365, "rewards//mean": 0.85760498046875, "rewards//std": 0.03528299182653427, "step": 2566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5134, "grad_norm": 2.2572505474090576, "kl": 0.6004703678190708, "learning_rate": 2.434954159864057e-06, "loss": 0.06, "num_tokens": 16743144.0, "reward": 0.8662109375, "reward_std": 0.014148936606943607, "rewards//mean": 0.8662109375, "rewards//std": 0.021179234609007835, "step": 2567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5136, "grad_norm": 3.4373607635498047, "kl": 0.38323466014117, "learning_rate": 2.4333680472540956e-06, "loss": 0.0383, "num_tokens": 16749736.0, "reward": 0.867919921875, "reward_std": 0.018559733405709267, "rewards//mean": 0.867919921875, "rewards//std": 0.03808852285146713, "step": 2568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5138, "grad_norm": 2.3535878658294678, "kl": 0.3642215747386217, "learning_rate": 2.4317819614834786e-06, "loss": 0.0364, "num_tokens": 16756272.0, "reward": 0.843994140625, "reward_std": 0.013670571148395538, "rewards//mean": 0.843994140625, "rewards//std": 0.021864313632249832, "step": 2569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.514, "grad_norm": 2.6937880516052246, "kl": 0.4433259107172489, "learning_rate": 2.4301959031910785e-06, "loss": 0.0443, "num_tokens": 16762832.0, "reward": 0.850341796875, "reward_std": 0.014039508998394012, "rewards//mean": 0.850341796875, "rewards//std": 0.026105530560016632, "step": 2570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5142, "grad_norm": 2.9774675369262695, "kl": 0.28302899841219187, "learning_rate": 2.42860987301576e-06, "loss": 0.0283, "num_tokens": 16769304.0, "reward": 0.86639404296875, "reward_std": 0.012589499354362488, "rewards//mean": 0.86639404296875, "rewards//std": 0.03243042528629303, "step": 2571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5144, "grad_norm": 2.3669724464416504, "kl": 0.36494111828505993, "learning_rate": 2.427023871596376e-06, "loss": 0.0365, "num_tokens": 16775832.0, "reward": 0.84124755859375, "reward_std": 0.013097953051328659, "rewards//mean": 0.84124755859375, "rewards//std": 0.02371862716972828, "step": 2572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5146, "grad_norm": 2.89581561088562, "kl": 0.3963787127286196, "learning_rate": 2.4254378995717685e-06, "loss": 0.0396, "num_tokens": 16782288.0, "reward": 0.8480224609375, "reward_std": 0.012762291356921196, "rewards//mean": 0.8480224609375, "rewards//std": 0.03698413819074631, "step": 2573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5148, "grad_norm": 1.7533481121063232, "kl": 0.41386733390390873, "learning_rate": 2.4238519575807655e-06, "loss": 0.0414, "num_tokens": 16788872.0, "reward": 0.85467529296875, "reward_std": 0.014324396848678589, "rewards//mean": 0.85467529296875, "rewards//std": 0.034584470093250275, "step": 2574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.515, "grad_norm": 2.6514172554016113, "kl": 0.32465624064207077, "learning_rate": 2.422266046262185e-06, "loss": 0.0325, "num_tokens": 16795408.0, "reward": 0.84771728515625, "reward_std": 0.01869121752679348, "rewards//mean": 0.84771728515625, "rewards//std": 0.03429127857089043, "step": 2575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5152, "grad_norm": 2.084843397140503, "kl": 0.45401959493756294, "learning_rate": 2.4206801662548314e-06, "loss": 0.0454, "num_tokens": 16801984.0, "reward": 0.861572265625, "reward_std": 0.012142452411353588, "rewards//mean": 0.861572265625, "rewards//std": 0.03396042063832283, "step": 2576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5154, "grad_norm": 1.878481388092041, "kl": 0.43649598956108093, "learning_rate": 2.4190943181974978e-06, "loss": 0.0436, "num_tokens": 16808520.0, "reward": 0.84027099609375, "reward_std": 0.011062515899538994, "rewards//mean": 0.84027099609375, "rewards//std": 0.029455525800585747, "step": 2577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5156, "grad_norm": 1.676449179649353, "kl": 0.27688300237059593, "learning_rate": 2.417508502728963e-06, "loss": 0.0277, "num_tokens": 16815088.0, "reward": 0.815673828125, "reward_std": 0.012808827683329582, "rewards//mean": 0.815673828125, "rewards//std": 0.01845201663672924, "step": 2578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5158, "grad_norm": 1.9111791849136353, "kl": 0.5248861238360405, "learning_rate": 2.4159227204879938e-06, "loss": 0.0525, "num_tokens": 16821736.0, "reward": 0.83697509765625, "reward_std": 0.010675502941012383, "rewards//mean": 0.83697509765625, "rewards//std": 0.026220019906759262, "step": 2579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.516, "grad_norm": 1.642434000968933, "kl": 0.3368194494396448, "learning_rate": 2.414336972113342e-06, "loss": 0.0337, "num_tokens": 16828264.0, "reward": 0.77496337890625, "reward_std": 0.01217586174607277, "rewards//mean": 0.77496337890625, "rewards//std": 0.013878334313631058, "step": 2580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5162, "grad_norm": 1.7217192649841309, "kl": 0.5467094238847494, "learning_rate": 2.4127512582437486e-06, "loss": 0.0547, "num_tokens": 16834720.0, "reward": 0.818359375, "reward_std": 0.013633167371153831, "rewards//mean": 0.818359375, "rewards//std": 0.0392850823700428, "step": 2581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5164, "grad_norm": 2.886094331741333, "kl": 0.3828388527035713, "learning_rate": 2.4111655795179366e-06, "loss": 0.0383, "num_tokens": 16841232.0, "reward": 0.82366943359375, "reward_std": 0.011894600465893745, "rewards//mean": 0.82366943359375, "rewards//std": 0.026142543181777, "step": 2582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5166, "grad_norm": 2.148555278778076, "kl": 0.34369961731135845, "learning_rate": 2.4095799365746198e-06, "loss": 0.0344, "num_tokens": 16847768.0, "reward": 0.857177734375, "reward_std": 0.01335289515554905, "rewards//mean": 0.857177734375, "rewards//std": 0.03215061128139496, "step": 2583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5168, "grad_norm": 2.4140403270721436, "kl": 0.6605085544288158, "learning_rate": 2.407994330052493e-06, "loss": 0.0661, "num_tokens": 16854152.0, "reward": 0.81390380859375, "reward_std": 0.013937309384346008, "rewards//mean": 0.81390380859375, "rewards//std": 0.037772390991449356, "step": 2584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.517, "grad_norm": 1.5945017337799072, "kl": 0.28601563442498446, "learning_rate": 2.4064087605902396e-06, "loss": 0.0286, "num_tokens": 16860664.0, "reward": 0.78912353515625, "reward_std": 0.008265987038612366, "rewards//mean": 0.78912353515625, "rewards//std": 0.029177729040384293, "step": 2585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5172, "grad_norm": 2.057435989379883, "kl": 0.3864058805629611, "learning_rate": 2.4048232288265257e-06, "loss": 0.0386, "num_tokens": 16867208.0, "reward": 0.8271484375, "reward_std": 0.011500965803861618, "rewards//mean": 0.8271484375, "rewards//std": 0.017133302986621857, "step": 2586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5174, "grad_norm": 2.3972082138061523, "kl": 0.3933129385113716, "learning_rate": 2.403237735400004e-06, "loss": 0.0393, "num_tokens": 16873736.0, "reward": 0.76959228515625, "reward_std": 0.009115630760788918, "rewards//mean": 0.76959228515625, "rewards//std": 0.012974102050065994, "step": 2587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5176, "grad_norm": 2.9266507625579834, "kl": 0.3566048387438059, "learning_rate": 2.401652280949311e-06, "loss": 0.0357, "num_tokens": 16880328.0, "reward": 0.831298828125, "reward_std": 0.009933270514011383, "rewards//mean": 0.831298828125, "rewards//std": 0.023050658404827118, "step": 2588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5178, "grad_norm": 2.3014323711395264, "kl": 0.4288256047293544, "learning_rate": 2.4000668661130674e-06, "loss": 0.0429, "num_tokens": 16886840.0, "reward": 0.8414306640625, "reward_std": 0.013899263925850391, "rewards//mean": 0.8414306640625, "rewards//std": 0.030857834964990616, "step": 2589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.518, "grad_norm": 1.8213921785354614, "kl": 0.6037158370018005, "learning_rate": 2.3984814915298787e-06, "loss": 0.0604, "num_tokens": 16893352.0, "reward": 0.85675048828125, "reward_std": 0.012809434905648232, "rewards//mean": 0.85675048828125, "rewards//std": 0.02545902132987976, "step": 2590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5182, "grad_norm": 1.7193224430084229, "kl": 0.3022696105763316, "learning_rate": 2.3968961578383324e-06, "loss": 0.0302, "num_tokens": 16899928.0, "reward": 0.879150390625, "reward_std": 0.012008359655737877, "rewards//mean": 0.879150390625, "rewards//std": 0.031565211713314056, "step": 2591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5184, "grad_norm": 1.8697164058685303, "kl": 0.5323450900614262, "learning_rate": 2.3953108656770018e-06, "loss": 0.0532, "num_tokens": 16906432.0, "reward": 0.8604736328125, "reward_std": 0.008741352707147598, "rewards//mean": 0.8604736328125, "rewards//std": 0.019230935722589493, "step": 2592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5186, "grad_norm": 2.153576612472534, "kl": 0.45148106664419174, "learning_rate": 2.3937256156844415e-06, "loss": 0.0451, "num_tokens": 16912904.0, "reward": 0.8829345703125, "reward_std": 0.016787730157375336, "rewards//mean": 0.8829345703125, "rewards//std": 0.02822822704911232, "step": 2593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5188, "grad_norm": 1.7590614557266235, "kl": 0.2480978136882186, "learning_rate": 2.392140408499191e-06, "loss": 0.0248, "num_tokens": 16919408.0, "reward": 0.8294677734375, "reward_std": 0.012419600039720535, "rewards//mean": 0.8294677734375, "rewards//std": 0.03360440954566002, "step": 2594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.519, "grad_norm": 2.1630964279174805, "kl": 0.2498441506177187, "learning_rate": 2.3905552447597707e-06, "loss": 0.025, "num_tokens": 16925928.0, "reward": 0.82305908203125, "reward_std": 0.012689357623457909, "rewards//mean": 0.82305908203125, "rewards//std": 0.02472175657749176, "step": 2595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5192, "grad_norm": 1.56257963180542, "kl": 0.4648524271324277, "learning_rate": 2.3889701251046847e-06, "loss": 0.0465, "num_tokens": 16932432.0, "reward": 0.87603759765625, "reward_std": 0.013579490594565868, "rewards//mean": 0.87603759765625, "rewards//std": 0.028273437172174454, "step": 2596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5194, "grad_norm": 1.9681445360183716, "kl": 0.5007281098514795, "learning_rate": 2.387385050172419e-06, "loss": 0.0501, "num_tokens": 16938904.0, "reward": 0.85919189453125, "reward_std": 0.011497464962303638, "rewards//mean": 0.85919189453125, "rewards//std": 0.022525126114487648, "step": 2597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5196, "grad_norm": 1.8420177698135376, "kl": 0.25826001912355423, "learning_rate": 2.385800020601442e-06, "loss": 0.0258, "num_tokens": 16945400.0, "reward": 0.81903076171875, "reward_std": 0.014715030789375305, "rewards//mean": 0.81903076171875, "rewards//std": 0.01871502213180065, "step": 2598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5198, "grad_norm": 2.1889190673828125, "kl": 0.32333469204604626, "learning_rate": 2.384215037030203e-06, "loss": 0.0323, "num_tokens": 16951904.0, "reward": 0.8739013671875, "reward_std": 0.012030133046209812, "rewards//mean": 0.8739013671875, "rewards//std": 0.026616323739290237, "step": 2599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.52, "grad_norm": 2.0067076683044434, "kl": 0.28800967894494534, "learning_rate": 2.382630100097133e-06, "loss": 0.0288, "num_tokens": 16958368.0, "reward": 0.853271484375, "reward_std": 0.013203928247094154, "rewards//mean": 0.853271484375, "rewards//std": 0.028124269098043442, "step": 2600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5202, "grad_norm": 1.8403135538101196, "kl": 0.3377582747489214, "learning_rate": 2.3810452104406444e-06, "loss": 0.0338, "num_tokens": 16964856.0, "reward": 0.8182373046875, "reward_std": 0.00821585301309824, "rewards//mean": 0.8182373046875, "rewards//std": 0.01821546070277691, "step": 2601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5204, "grad_norm": 1.7944402694702148, "kl": 0.2981593320146203, "learning_rate": 2.3794603686991317e-06, "loss": 0.0298, "num_tokens": 16971328.0, "reward": 0.85675048828125, "reward_std": 0.01307620108127594, "rewards//mean": 0.85675048828125, "rewards//std": 0.025930330157279968, "step": 2602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5206, "grad_norm": 1.5637933015823364, "kl": 0.3212653771042824, "learning_rate": 2.377875575510967e-06, "loss": 0.0321, "num_tokens": 16977752.0, "reward": 0.83001708984375, "reward_std": 0.011010969989001751, "rewards//mean": 0.83001708984375, "rewards//std": 0.019161837175488472, "step": 2603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5208, "grad_norm": 1.4927823543548584, "kl": 0.23079501558095217, "learning_rate": 2.3762908315145068e-06, "loss": 0.0231, "num_tokens": 16984384.0, "reward": 0.792236328125, "reward_std": 0.0102676497772336, "rewards//mean": 0.792236328125, "rewards//std": 0.01873856410384178, "step": 2604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.521, "grad_norm": 2.362586498260498, "kl": 0.4125477857887745, "learning_rate": 2.3747061373480844e-06, "loss": 0.0413, "num_tokens": 16990880.0, "reward": 0.8809814453125, "reward_std": 0.016498545184731483, "rewards//mean": 0.8809814453125, "rewards//std": 0.0391242615878582, "step": 2605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5212, "grad_norm": 1.5961962938308716, "kl": 0.3757031913846731, "learning_rate": 2.373121493650015e-06, "loss": 0.0376, "num_tokens": 16997432.0, "reward": 0.81884765625, "reward_std": 0.010432162322103977, "rewards//mean": 0.81884765625, "rewards//std": 0.01716860756278038, "step": 2606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5214, "grad_norm": 2.533118486404419, "kl": 0.41614619828760624, "learning_rate": 2.3715369010585927e-06, "loss": 0.0416, "num_tokens": 17003920.0, "reward": 0.84429931640625, "reward_std": 0.012866564095020294, "rewards//mean": 0.84429931640625, "rewards//std": 0.02575989067554474, "step": 2607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5216, "grad_norm": 3.3336503505706787, "kl": 0.3569672107696533, "learning_rate": 2.3699523602120913e-06, "loss": 0.0357, "num_tokens": 17010432.0, "reward": 0.8394775390625, "reward_std": 0.01260696817189455, "rewards//mean": 0.8394775390625, "rewards//std": 0.0224312637001276, "step": 2608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5218, "grad_norm": 1.8417168855667114, "kl": 0.26251781918108463, "learning_rate": 2.368367871748764e-06, "loss": 0.0263, "num_tokens": 17017024.0, "reward": 0.8577880859375, "reward_std": 0.016704466193914413, "rewards//mean": 0.8577880859375, "rewards//std": 0.0391845740377903, "step": 2609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.522, "grad_norm": 1.7155882120132446, "kl": 0.2539467643946409, "learning_rate": 2.3667834363068415e-06, "loss": 0.0254, "num_tokens": 17023480.0, "reward": 0.8597412109375, "reward_std": 0.00823557935655117, "rewards//mean": 0.8597412109375, "rewards//std": 0.021973086521029472, "step": 2610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5222, "grad_norm": 2.0578196048736572, "kl": 0.22822071239352226, "learning_rate": 2.3651990545245357e-06, "loss": 0.0228, "num_tokens": 17029944.0, "reward": 0.86846923828125, "reward_std": 0.01494415383785963, "rewards//mean": 0.86846923828125, "rewards//std": 0.02285205014050007, "step": 2611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5224, "grad_norm": 1.3919881582260132, "kl": 0.4088557958602905, "learning_rate": 2.363614727040034e-06, "loss": 0.0409, "num_tokens": 17036456.0, "reward": 0.8211669921875, "reward_std": 0.009862177073955536, "rewards//mean": 0.8211669921875, "rewards//std": 0.02127586118876934, "step": 2612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5226, "grad_norm": 1.5566200017929077, "kl": 0.30592497531324625, "learning_rate": 2.362030454491504e-06, "loss": 0.0306, "num_tokens": 17042896.0, "reward": 0.850830078125, "reward_std": 0.011495653539896011, "rewards//mean": 0.850830078125, "rewards//std": 0.021360008046030998, "step": 2613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5228, "grad_norm": 1.5386337041854858, "kl": 0.4200058802962303, "learning_rate": 2.3604462375170905e-06, "loss": 0.042, "num_tokens": 17049392.0, "reward": 0.84344482421875, "reward_std": 0.010706042870879173, "rewards//mean": 0.84344482421875, "rewards//std": 0.024022389203310013, "step": 2614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.523, "grad_norm": 1.6585655212402344, "kl": 0.29339405708014965, "learning_rate": 2.3588620767549165e-06, "loss": 0.0293, "num_tokens": 17055904.0, "reward": 0.8343505859375, "reward_std": 0.011426800861954689, "rewards//mean": 0.8343505859375, "rewards//std": 0.02242046408355236, "step": 2615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5232, "grad_norm": 2.4468724727630615, "kl": 0.41609243489801884, "learning_rate": 2.35727797284308e-06, "loss": 0.0416, "num_tokens": 17062384.0, "reward": 0.82952880859375, "reward_std": 0.011757058091461658, "rewards//mean": 0.82952880859375, "rewards//std": 0.022674493491649628, "step": 2616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5234, "grad_norm": 2.5183417797088623, "kl": 0.609455019235611, "learning_rate": 2.35569392641966e-06, "loss": 0.0609, "num_tokens": 17068880.0, "reward": 0.84259033203125, "reward_std": 0.011354807764291763, "rewards//mean": 0.84259033203125, "rewards//std": 0.017561648041009903, "step": 2617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5236, "grad_norm": 1.6119438409805298, "kl": 0.3623243719339371, "learning_rate": 2.3541099381227083e-06, "loss": 0.0362, "num_tokens": 17075280.0, "reward": 0.85986328125, "reward_std": 0.013821976259350777, "rewards//mean": 0.85986328125, "rewards//std": 0.03227561339735985, "step": 2618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5238, "grad_norm": 4.189740180969238, "kl": 0.7513459399342537, "learning_rate": 2.352526008590257e-06, "loss": 0.0751, "num_tokens": 17081776.0, "reward": 0.86871337890625, "reward_std": 0.012158507481217384, "rewards//mean": 0.86871337890625, "rewards//std": 0.020382221788167953, "step": 2619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.524, "grad_norm": 3.0685131549835205, "kl": 0.33906853944063187, "learning_rate": 2.350942138460311e-06, "loss": 0.0339, "num_tokens": 17088312.0, "reward": 0.8245849609375, "reward_std": 0.01626267470419407, "rewards//mean": 0.8245849609375, "rewards//std": 0.02852482907474041, "step": 2620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5242, "grad_norm": 2.085426092147827, "kl": 0.3784342650324106, "learning_rate": 2.3493583283708542e-06, "loss": 0.0378, "num_tokens": 17094808.0, "reward": 0.83465576171875, "reward_std": 0.011567550711333752, "rewards//mean": 0.83465576171875, "rewards//std": 0.025742841884493828, "step": 2621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5244, "grad_norm": 1.6271640062332153, "kl": 0.5229867342859507, "learning_rate": 2.347774578959845e-06, "loss": 0.0523, "num_tokens": 17101256.0, "reward": 0.8665771484375, "reward_std": 0.013842158019542694, "rewards//mean": 0.8665771484375, "rewards//std": 0.0329052172601223, "step": 2622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5246, "grad_norm": 2.712383985519409, "kl": 0.6072989590466022, "learning_rate": 2.3461908908652165e-06, "loss": 0.0607, "num_tokens": 17107784.0, "reward": 0.8446044921875, "reward_std": 0.01671091467142105, "rewards//mean": 0.8446044921875, "rewards//std": 0.028420625254511833, "step": 2623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5248, "grad_norm": 2.125272512435913, "kl": 0.5981442667543888, "learning_rate": 2.344607264724879e-06, "loss": 0.0598, "num_tokens": 17114336.0, "reward": 0.8609619140625, "reward_std": 0.014889363199472427, "rewards//mean": 0.8609619140625, "rewards//std": 0.027319103479385376, "step": 2624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.525, "grad_norm": 2.167851448059082, "kl": 0.4934689626097679, "learning_rate": 2.3430237011767166e-06, "loss": 0.0493, "num_tokens": 17120864.0, "reward": 0.8590087890625, "reward_std": 0.014637220650911331, "rewards//mean": 0.8590087890625, "rewards//std": 0.03367280960083008, "step": 2625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5252, "grad_norm": 2.455810546875, "kl": 0.33282077498734, "learning_rate": 2.341440200858589e-06, "loss": 0.0333, "num_tokens": 17127376.0, "reward": 0.79412841796875, "reward_std": 0.009287979453802109, "rewards//mean": 0.79412841796875, "rewards//std": 0.018761876970529556, "step": 2626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5254, "grad_norm": 1.7258468866348267, "kl": 0.29713178239762783, "learning_rate": 2.3398567644083293e-06, "loss": 0.0297, "num_tokens": 17133816.0, "reward": 0.87158203125, "reward_std": 0.013294169679284096, "rewards//mean": 0.87158203125, "rewards//std": 0.01945670321583748, "step": 2627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5256, "grad_norm": 1.8708112239837646, "kl": 0.3417679946869612, "learning_rate": 2.3382733924637456e-06, "loss": 0.0342, "num_tokens": 17140408.0, "reward": 0.8841552734375, "reward_std": 0.007780161686241627, "rewards//mean": 0.8841552734375, "rewards//std": 0.016836969181895256, "step": 2628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5258, "grad_norm": 2.579324245452881, "kl": 0.37971700355410576, "learning_rate": 2.3366900856626203e-06, "loss": 0.038, "num_tokens": 17146928.0, "reward": 0.7701416015625, "reward_std": 0.010907369665801525, "rewards//mean": 0.7701416015625, "rewards//std": 0.02257656119763851, "step": 2629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.526, "grad_norm": 1.808257818222046, "kl": 0.3786275014281273, "learning_rate": 2.335106844642709e-06, "loss": 0.0379, "num_tokens": 17153400.0, "reward": 0.84075927734375, "reward_std": 0.012688912451267242, "rewards//mean": 0.84075927734375, "rewards//std": 0.02701619826257229, "step": 2630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5262, "grad_norm": 2.36411714553833, "kl": 0.3307806337252259, "learning_rate": 2.3335236700417404e-06, "loss": 0.0331, "num_tokens": 17159936.0, "reward": 0.87158203125, "reward_std": 0.011139369569718838, "rewards//mean": 0.87158203125, "rewards//std": 0.024611983448266983, "step": 2631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5264, "grad_norm": 1.541189193725586, "kl": 0.4006677381694317, "learning_rate": 2.3319405624974184e-06, "loss": 0.0401, "num_tokens": 17166440.0, "reward": 0.81573486328125, "reward_std": 0.012904379516839981, "rewards//mean": 0.81573486328125, "rewards//std": 0.030966242775321007, "step": 2632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5266, "grad_norm": 2.255964994430542, "kl": 0.3909178674221039, "learning_rate": 2.330357522647417e-06, "loss": 0.0391, "num_tokens": 17173008.0, "reward": 0.78851318359375, "reward_std": 0.007817204110324383, "rewards//mean": 0.78851318359375, "rewards//std": 0.012310757301747799, "step": 2633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5268, "grad_norm": 1.7225563526153564, "kl": 0.2993033640086651, "learning_rate": 2.3287745511293857e-06, "loss": 0.0299, "num_tokens": 17179512.0, "reward": 0.8306884765625, "reward_std": 0.008923912420868874, "rewards//mean": 0.8306884765625, "rewards//std": 0.025077180936932564, "step": 2634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.527, "grad_norm": 2.1878278255462646, "kl": 0.21479713637381792, "learning_rate": 2.3271916485809444e-06, "loss": 0.0215, "num_tokens": 17186024.0, "reward": 0.8656005859375, "reward_std": 0.010531878098845482, "rewards//mean": 0.8656005859375, "rewards//std": 0.021884728223085403, "step": 2635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5272, "grad_norm": 1.940641164779663, "kl": 0.4521284746006131, "learning_rate": 2.325608815639687e-06, "loss": 0.0452, "num_tokens": 17192632.0, "reward": 0.86016845703125, "reward_std": 0.014933791011571884, "rewards//mean": 0.86016845703125, "rewards//std": 0.03715050220489502, "step": 2636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5274, "grad_norm": 1.409749984741211, "kl": 0.23707370553165674, "learning_rate": 2.3240260529431773e-06, "loss": 0.0237, "num_tokens": 17199168.0, "reward": 0.79876708984375, "reward_std": 0.007644031196832657, "rewards//mean": 0.79876708984375, "rewards//std": 0.04107532650232315, "step": 2637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5276, "grad_norm": 3.2103710174560547, "kl": 0.28153146244585514, "learning_rate": 2.3224433611289523e-06, "loss": 0.0282, "num_tokens": 17205592.0, "reward": 0.8280029296875, "reward_std": 0.0166248120367527, "rewards//mean": 0.8280029296875, "rewards//std": 0.040090449154376984, "step": 2638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5278, "grad_norm": 2.6666061878204346, "kl": 0.23942931555211544, "learning_rate": 2.3208607408345206e-06, "loss": 0.0239, "num_tokens": 17212104.0, "reward": 0.8460693359375, "reward_std": 0.011717099696397781, "rewards//mean": 0.8460693359375, "rewards//std": 0.0274517685174942, "step": 2639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.528, "grad_norm": 2.3511416912078857, "kl": 0.2664095088839531, "learning_rate": 2.3192781926973606e-06, "loss": 0.0266, "num_tokens": 17218752.0, "reward": 0.8304443359375, "reward_std": 0.011443043127655983, "rewards//mean": 0.8304443359375, "rewards//std": 0.03047080710530281, "step": 2640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5282, "grad_norm": 1.5550352334976196, "kl": 0.3346384400501847, "learning_rate": 2.3176957173549236e-06, "loss": 0.0335, "num_tokens": 17225296.0, "reward": 0.8074951171875, "reward_std": 0.00877461489289999, "rewards//mean": 0.8074951171875, "rewards//std": 0.019923744723200798, "step": 2641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5284, "grad_norm": 1.6594741344451904, "kl": 0.33884879015386105, "learning_rate": 2.316113315444629e-06, "loss": 0.0339, "num_tokens": 17231824.0, "reward": 0.8604736328125, "reward_std": 0.009361013770103455, "rewards//mean": 0.8604736328125, "rewards//std": 0.020344773307442665, "step": 2642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5286, "grad_norm": 1.781470537185669, "kl": 0.25903381407260895, "learning_rate": 2.31453098760387e-06, "loss": 0.0259, "num_tokens": 17238336.0, "reward": 0.85650634765625, "reward_std": 0.01106211543083191, "rewards//mean": 0.85650634765625, "rewards//std": 0.023940961807966232, "step": 2643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5288, "grad_norm": 3.8277125358581543, "kl": 0.2615869101136923, "learning_rate": 2.312948734470006e-06, "loss": 0.0262, "num_tokens": 17244896.0, "reward": 0.84588623046875, "reward_std": 0.0152097437530756, "rewards//mean": 0.84588623046875, "rewards//std": 0.020715927705168724, "step": 2644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.529, "grad_norm": 1.81606125831604, "kl": 0.26836335100233555, "learning_rate": 2.31136655668037e-06, "loss": 0.0268, "num_tokens": 17251304.0, "reward": 0.87066650390625, "reward_std": 0.014746678993105888, "rewards//mean": 0.87066650390625, "rewards//std": 0.04114087298512459, "step": 2645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5292, "grad_norm": 1.9318512678146362, "kl": 0.31720766611397266, "learning_rate": 2.309784454872262e-06, "loss": 0.0317, "num_tokens": 17257744.0, "reward": 0.8350830078125, "reward_std": 0.011904140003025532, "rewards//mean": 0.8350830078125, "rewards//std": 0.03527295961976051, "step": 2646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5294, "grad_norm": 1.787901759147644, "kl": 0.2546895993873477, "learning_rate": 2.3082024296829538e-06, "loss": 0.0255, "num_tokens": 17264264.0, "reward": 0.8720703125, "reward_std": 0.014698406681418419, "rewards//mean": 0.8720703125, "rewards//std": 0.02477382682263851, "step": 2647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5296, "grad_norm": 1.886016845703125, "kl": 0.22238098923116922, "learning_rate": 2.3066204817496828e-06, "loss": 0.0222, "num_tokens": 17270808.0, "reward": 0.8643798828125, "reward_std": 0.014394732192158699, "rewards//mean": 0.8643798828125, "rewards//std": 0.02243936061859131, "step": 2648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5298, "grad_norm": 3.136828899383545, "kl": 0.2619193270802498, "learning_rate": 2.3050386117096594e-06, "loss": 0.0262, "num_tokens": 17277288.0, "reward": 0.8414306640625, "reward_std": 0.011762051843106747, "rewards//mean": 0.8414306640625, "rewards//std": 0.015323207713663578, "step": 2649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.53, "grad_norm": 1.5005671977996826, "kl": 0.2689260393381119, "learning_rate": 2.303456820200059e-06, "loss": 0.0269, "num_tokens": 17283968.0, "reward": 0.83062744140625, "reward_std": 0.012485150247812271, "rewards//mean": 0.83062744140625, "rewards//std": 0.018273850902915, "step": 2650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5302, "grad_norm": 1.4647550582885742, "kl": 0.34090571478009224, "learning_rate": 2.3018751078580287e-06, "loss": 0.0341, "num_tokens": 17290496.0, "reward": 0.86767578125, "reward_std": 0.014503785409033298, "rewards//mean": 0.86767578125, "rewards//std": 0.023770984262228012, "step": 2651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5304, "grad_norm": 2.4913737773895264, "kl": 0.3705272227525711, "learning_rate": 2.300293475320681e-06, "loss": 0.0371, "num_tokens": 17297096.0, "reward": 0.83282470703125, "reward_std": 0.014619313180446625, "rewards//mean": 0.83282470703125, "rewards//std": 0.025959502905607224, "step": 2652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5306, "grad_norm": 2.5475640296936035, "kl": 0.2882338594645262, "learning_rate": 2.298711923225098e-06, "loss": 0.0288, "num_tokens": 17303544.0, "reward": 0.8385009765625, "reward_std": 0.014480341225862503, "rewards//mean": 0.8385009765625, "rewards//std": 0.024510597810149193, "step": 2653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5308, "grad_norm": 1.536615252494812, "kl": 0.33061631489545107, "learning_rate": 2.297130452208328e-06, "loss": 0.0331, "num_tokens": 17310040.0, "reward": 0.8524169921875, "reward_std": 0.010235469788312912, "rewards//mean": 0.8524169921875, "rewards//std": 0.02102392353117466, "step": 2654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.531, "grad_norm": 1.802592158317566, "kl": 0.3539694529026747, "learning_rate": 2.295549062907388e-06, "loss": 0.0354, "num_tokens": 17316608.0, "reward": 0.8731689453125, "reward_std": 0.011722204275429249, "rewards//mean": 0.8731689453125, "rewards//std": 0.019549455493688583, "step": 2655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5312, "grad_norm": 1.7474466562271118, "kl": 0.3086931807920337, "learning_rate": 2.2939677559592606e-06, "loss": 0.0309, "num_tokens": 17323096.0, "reward": 0.86407470703125, "reward_std": 0.013047978281974792, "rewards//mean": 0.86407470703125, "rewards//std": 0.024091605097055435, "step": 2656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5314, "grad_norm": 1.4441128969192505, "kl": 0.2846041936427355, "learning_rate": 2.2923865320008963e-06, "loss": 0.0285, "num_tokens": 17329632.0, "reward": 0.83135986328125, "reward_std": 0.01091306284070015, "rewards//mean": 0.83135986328125, "rewards//std": 0.025186428800225258, "step": 2657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5316, "grad_norm": 2.4797866344451904, "kl": 0.35613850876688957, "learning_rate": 2.290805391669212e-06, "loss": 0.0356, "num_tokens": 17336184.0, "reward": 0.8675537109375, "reward_std": 0.007925625890493393, "rewards//mean": 0.8675537109375, "rewards//std": 0.015728803351521492, "step": 2658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5318, "grad_norm": 2.4773266315460205, "kl": 0.5070906467735767, "learning_rate": 2.2892243356010913e-06, "loss": 0.0507, "num_tokens": 17342712.0, "reward": 0.888916015625, "reward_std": 0.013379091396927834, "rewards//mean": 0.888916015625, "rewards//std": 0.02548583783209324, "step": 2659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.532, "grad_norm": 2.2979629039764404, "kl": 0.28642532601952553, "learning_rate": 2.2876433644333824e-06, "loss": 0.0286, "num_tokens": 17349232.0, "reward": 0.78387451171875, "reward_std": 0.012101277709007263, "rewards//mean": 0.78387451171875, "rewards//std": 0.02360282652080059, "step": 2660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5322, "grad_norm": 2.806227684020996, "kl": 0.5794506222009659, "learning_rate": 2.2860624788029013e-06, "loss": 0.0579, "num_tokens": 17355664.0, "reward": 0.8463134765625, "reward_std": 0.016868894919753075, "rewards//mean": 0.8463134765625, "rewards//std": 0.027902444824576378, "step": 2661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5324, "grad_norm": 2.0719521045684814, "kl": 0.43460038490593433, "learning_rate": 2.284481679346428e-06, "loss": 0.0435, "num_tokens": 17362328.0, "reward": 0.80657958984375, "reward_std": 0.011294210329651833, "rewards//mean": 0.80657958984375, "rewards//std": 0.02686561457812786, "step": 2662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5326, "grad_norm": 1.82942533493042, "kl": 0.2862794790416956, "learning_rate": 2.282900966700709e-06, "loss": 0.0286, "num_tokens": 17368832.0, "reward": 0.89178466796875, "reward_std": 0.011110563762485981, "rewards//mean": 0.89178466796875, "rewards//std": 0.015972690656781197, "step": 2663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5328, "grad_norm": 1.733947992324829, "kl": 0.30434744618833065, "learning_rate": 2.2813203415024537e-06, "loss": 0.0304, "num_tokens": 17375368.0, "reward": 0.8466796875, "reward_std": 0.018485255539417267, "rewards//mean": 0.8466796875, "rewards//std": 0.03355234116315842, "step": 2664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.533, "grad_norm": 1.5846587419509888, "kl": 0.2303962865844369, "learning_rate": 2.2797398043883395e-06, "loss": 0.023, "num_tokens": 17381928.0, "reward": 0.86944580078125, "reward_std": 0.013738680630922318, "rewards//mean": 0.86944580078125, "rewards//std": 0.030423762276768684, "step": 2665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5332, "grad_norm": 3.210442304611206, "kl": 0.347423754632473, "learning_rate": 2.278159355995005e-06, "loss": 0.0347, "num_tokens": 17388528.0, "reward": 0.81866455078125, "reward_std": 0.010115688666701317, "rewards//mean": 0.81866455078125, "rewards//std": 0.013474333100020885, "step": 2666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5334, "grad_norm": 1.6977559328079224, "kl": 0.19961673021316528, "learning_rate": 2.2765789969590564e-06, "loss": 0.02, "num_tokens": 17395160.0, "reward": 0.81298828125, "reward_std": 0.010267006233334541, "rewards//mean": 0.81298828125, "rewards//std": 0.036094460636377335, "step": 2667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5336, "grad_norm": 1.6780002117156982, "kl": 0.3049915265291929, "learning_rate": 2.2749987279170605e-06, "loss": 0.0305, "num_tokens": 17401656.0, "reward": 0.8646240234375, "reward_std": 0.014035902917385101, "rewards//mean": 0.8646240234375, "rewards//std": 0.02655026689171791, "step": 2668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5338, "grad_norm": 1.6690456867218018, "kl": 0.3041702639311552, "learning_rate": 2.2734185495055503e-06, "loss": 0.0304, "num_tokens": 17408104.0, "reward": 0.85101318359375, "reward_std": 0.015988042578101158, "rewards//mean": 0.85101318359375, "rewards//std": 0.02827022410929203, "step": 2669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.534, "grad_norm": 1.6171016693115234, "kl": 0.3361935019493103, "learning_rate": 2.271838462361021e-06, "loss": 0.0336, "num_tokens": 17414720.0, "reward": 0.839599609375, "reward_std": 0.01142318919301033, "rewards//mean": 0.839599609375, "rewards//std": 0.022292152047157288, "step": 2670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5342, "grad_norm": 2.2569873332977295, "kl": 0.1690401779487729, "learning_rate": 2.2702584671199317e-06, "loss": 0.0169, "num_tokens": 17421144.0, "reward": 0.8900146484375, "reward_std": 0.012728402391076088, "rewards//mean": 0.8900146484375, "rewards//std": 0.02621980383992195, "step": 2671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5344, "grad_norm": 1.7024372816085815, "kl": 0.26089052204042673, "learning_rate": 2.268678564418705e-06, "loss": 0.0261, "num_tokens": 17427664.0, "reward": 0.82122802734375, "reward_std": 0.014987068250775337, "rewards//mean": 0.82122802734375, "rewards//std": 0.029221275821328163, "step": 2672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5346, "grad_norm": 1.511756181716919, "kl": 0.46139800921082497, "learning_rate": 2.267098754893724e-06, "loss": 0.0461, "num_tokens": 17434200.0, "reward": 0.85760498046875, "reward_std": 0.012617886066436768, "rewards//mean": 0.85760498046875, "rewards//std": 0.023897293955087662, "step": 2673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5348, "grad_norm": 2.6373119354248047, "kl": 0.338581133633852, "learning_rate": 2.2655190391813373e-06, "loss": 0.0339, "num_tokens": 17440848.0, "reward": 0.86273193359375, "reward_std": 0.012530416250228882, "rewards//mean": 0.86273193359375, "rewards//std": 0.03503456339240074, "step": 2674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.535, "grad_norm": 1.3911439180374146, "kl": 0.16793415881693363, "learning_rate": 2.2639394179178536e-06, "loss": 0.0168, "num_tokens": 17447360.0, "reward": 0.84393310546875, "reward_std": 0.012136043980717659, "rewards//mean": 0.84393310546875, "rewards//std": 0.024552173912525177, "step": 2675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5352, "grad_norm": 2.8344063758850098, "kl": 0.28881784435361624, "learning_rate": 2.262359891739544e-06, "loss": 0.0289, "num_tokens": 17453856.0, "reward": 0.816650390625, "reward_std": 0.009155151434242725, "rewards//mean": 0.816650390625, "rewards//std": 0.015945300459861755, "step": 2676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5354, "grad_norm": 1.5385926961898804, "kl": 0.2924559507519007, "learning_rate": 2.260780461282641e-06, "loss": 0.0292, "num_tokens": 17460288.0, "reward": 0.86663818359375, "reward_std": 0.010241298004984856, "rewards//mean": 0.86663818359375, "rewards//std": 0.017959678545594215, "step": 2677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5356, "grad_norm": 1.332816481590271, "kl": 0.3039694409817457, "learning_rate": 2.2592011271833406e-06, "loss": 0.0304, "num_tokens": 17466816.0, "reward": 0.85247802734375, "reward_std": 0.01019319612532854, "rewards//mean": 0.85247802734375, "rewards//std": 0.04110921546816826, "step": 2678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5358, "grad_norm": 4.057806968688965, "kl": 0.33656573854386806, "learning_rate": 2.257621890077797e-06, "loss": 0.0337, "num_tokens": 17473288.0, "reward": 0.8216552734375, "reward_std": 0.013989482074975967, "rewards//mean": 0.8216552734375, "rewards//std": 0.018311606720089912, "step": 2679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.536, "grad_norm": 3.0237419605255127, "kl": 0.26028473023325205, "learning_rate": 2.256042750602127e-06, "loss": 0.026, "num_tokens": 17479888.0, "reward": 0.85650634765625, "reward_std": 0.014389966614544392, "rewards//mean": 0.85650634765625, "rewards//std": 0.028118818998336792, "step": 2680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5362, "grad_norm": 1.685168981552124, "kl": 0.33306901901960373, "learning_rate": 2.2544637093924072e-06, "loss": 0.0333, "num_tokens": 17486448.0, "reward": 0.82391357421875, "reward_std": 0.012234707362949848, "rewards//mean": 0.82391357421875, "rewards//std": 0.01898803934454918, "step": 2681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5364, "grad_norm": 1.4057021141052246, "kl": 0.13320612628012896, "learning_rate": 2.252884767084677e-06, "loss": 0.0133, "num_tokens": 17493056.0, "reward": 0.84991455078125, "reward_std": 0.01079094223678112, "rewards//mean": 0.84991455078125, "rewards//std": 0.019298020750284195, "step": 2682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5366, "grad_norm": 3.0752296447753906, "kl": 0.2651050053536892, "learning_rate": 2.251305924314933e-06, "loss": 0.0265, "num_tokens": 17499696.0, "reward": 0.87957763671875, "reward_std": 0.011752190068364143, "rewards//mean": 0.87957763671875, "rewards//std": 0.03406495600938797, "step": 2683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5368, "grad_norm": 1.639016032218933, "kl": 0.37284934893250465, "learning_rate": 2.2497271817191323e-06, "loss": 0.0373, "num_tokens": 17506272.0, "reward": 0.83465576171875, "reward_std": 0.011059429496526718, "rewards//mean": 0.83465576171875, "rewards//std": 0.0347791388630867, "step": 2684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.537, "grad_norm": 1.8991674184799194, "kl": 0.27694827876985073, "learning_rate": 2.2481485399331933e-06, "loss": 0.0277, "num_tokens": 17512760.0, "reward": 0.82891845703125, "reward_std": 0.010286202654242516, "rewards//mean": 0.82891845703125, "rewards//std": 0.021453969180583954, "step": 2685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5372, "grad_norm": 1.6425343751907349, "kl": 0.2923229783773422, "learning_rate": 2.246569999592992e-06, "loss": 0.0292, "num_tokens": 17519240.0, "reward": 0.78265380859375, "reward_std": 0.012538396753370762, "rewards//mean": 0.78265380859375, "rewards//std": 0.01592238061130047, "step": 2686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5374, "grad_norm": 2.0773653984069824, "kl": 0.36656503193080425, "learning_rate": 2.244991561334365e-06, "loss": 0.0367, "num_tokens": 17525712.0, "reward": 0.84600830078125, "reward_std": 0.011753687635064125, "rewards//mean": 0.84600830078125, "rewards//std": 0.01986696571111679, "step": 2687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5376, "grad_norm": 1.744006633758545, "kl": 0.2583205811679363, "learning_rate": 2.2434132257931057e-06, "loss": 0.0258, "num_tokens": 17532184.0, "reward": 0.847900390625, "reward_std": 0.01388879306614399, "rewards//mean": 0.847900390625, "rewards//std": 0.027089012786746025, "step": 2688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5378, "grad_norm": 2.2504935264587402, "kl": 0.31632302794605494, "learning_rate": 2.241834993604969e-06, "loss": 0.0316, "num_tokens": 17538680.0, "reward": 0.88201904296875, "reward_std": 0.010065781883895397, "rewards//mean": 0.88201904296875, "rewards//std": 0.020916637033224106, "step": 2689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.538, "grad_norm": 1.634213924407959, "kl": 0.23530986346304417, "learning_rate": 2.240256865405666e-06, "loss": 0.0235, "num_tokens": 17545232.0, "reward": 0.868408203125, "reward_std": 0.013366134837269783, "rewards//mean": 0.868408203125, "rewards//std": 0.029560241848230362, "step": 2690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5382, "grad_norm": 1.7584229707717896, "kl": 0.2741362638771534, "learning_rate": 2.238678841830867e-06, "loss": 0.0274, "num_tokens": 17551832.0, "reward": 0.87994384765625, "reward_std": 0.007968941703438759, "rewards//mean": 0.87994384765625, "rewards//std": 0.020460795611143112, "step": 2691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5384, "grad_norm": 2.4928126335144043, "kl": 0.25760788191109896, "learning_rate": 2.237100923516198e-06, "loss": 0.0258, "num_tokens": 17558328.0, "reward": 0.873779296875, "reward_std": 0.012347468174993992, "rewards//mean": 0.873779296875, "rewards//std": 0.02394736371934414, "step": 2692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5386, "grad_norm": 2.1569483280181885, "kl": 0.5414028540253639, "learning_rate": 2.235523111097247e-06, "loss": 0.0541, "num_tokens": 17564856.0, "reward": 0.86700439453125, "reward_std": 0.014869908802211285, "rewards//mean": 0.86700439453125, "rewards//std": 0.03946610540151596, "step": 2693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5388, "grad_norm": 2.032477855682373, "kl": 0.3292141407728195, "learning_rate": 2.233945405209554e-06, "loss": 0.0329, "num_tokens": 17571416.0, "reward": 0.81878662109375, "reward_std": 0.01006835326552391, "rewards//mean": 0.81878662109375, "rewards//std": 0.017477836459875107, "step": 2694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.539, "grad_norm": 1.9090543985366821, "kl": 0.4091068748384714, "learning_rate": 2.232367806488621e-06, "loss": 0.0409, "num_tokens": 17577920.0, "reward": 0.84368896484375, "reward_std": 0.013082598336040974, "rewards//mean": 0.84368896484375, "rewards//std": 0.029293710365891457, "step": 2695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5392, "grad_norm": 1.6322174072265625, "kl": 0.4747640397399664, "learning_rate": 2.230790315569903e-06, "loss": 0.0475, "num_tokens": 17584432.0, "reward": 0.87237548828125, "reward_std": 0.009977024979889393, "rewards//mean": 0.87237548828125, "rewards//std": 0.01871502213180065, "step": 2696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5394, "grad_norm": 2.174102306365967, "kl": 0.7970433607697487, "learning_rate": 2.229212933088814e-06, "loss": 0.0797, "num_tokens": 17590952.0, "reward": 0.84368896484375, "reward_std": 0.015541739761829376, "rewards//mean": 0.84368896484375, "rewards//std": 0.026268471032381058, "step": 2697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5396, "grad_norm": 1.4071910381317139, "kl": 0.4102098122239113, "learning_rate": 2.2276356596807226e-06, "loss": 0.041, "num_tokens": 17597504.0, "reward": 0.86541748046875, "reward_std": 0.012964694760739803, "rewards//mean": 0.86541748046875, "rewards//std": 0.04741407558321953, "step": 2698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5398, "grad_norm": 1.9158782958984375, "kl": 0.48656902462244034, "learning_rate": 2.2260584959809553e-06, "loss": 0.0487, "num_tokens": 17603976.0, "reward": 0.84112548828125, "reward_std": 0.012534516863524914, "rewards//mean": 0.84112548828125, "rewards//std": 0.028442654758691788, "step": 2699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.54, "grad_norm": 1.4652632474899292, "kl": 0.5533108934760094, "learning_rate": 2.2244814426247934e-06, "loss": 0.0553, "num_tokens": 17610456.0, "reward": 0.84375, "reward_std": 0.011707383207976818, "rewards//mean": 0.84375, "rewards//std": 0.021327383816242218, "step": 2700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5402, "grad_norm": 1.4427696466445923, "kl": 0.32131601218134165, "learning_rate": 2.2229045002474727e-06, "loss": 0.0321, "num_tokens": 17617032.0, "reward": 0.82373046875, "reward_std": 0.01123142521828413, "rewards//mean": 0.82373046875, "rewards//std": 0.01566370576620102, "step": 2701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5404, "grad_norm": 2.0731940269470215, "kl": 0.4151822626590729, "learning_rate": 2.2213276694841866e-06, "loss": 0.0415, "num_tokens": 17623592.0, "reward": 0.85455322265625, "reward_std": 0.015862220898270607, "rewards//mean": 0.85455322265625, "rewards//std": 0.032717157155275345, "step": 2702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5406, "grad_norm": 4.144421577453613, "kl": 0.7315444014966488, "learning_rate": 2.219750950970082e-06, "loss": 0.0732, "num_tokens": 17630200.0, "reward": 0.84033203125, "reward_std": 0.015536182560026646, "rewards//mean": 0.84033203125, "rewards//std": 0.019117645919322968, "step": 2703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5408, "grad_norm": 2.6050689220428467, "kl": 0.34435400180518627, "learning_rate": 2.2181743453402605e-06, "loss": 0.0344, "num_tokens": 17636728.0, "reward": 0.796630859375, "reward_std": 0.01125125028192997, "rewards//mean": 0.796630859375, "rewards//std": 0.018751485273241997, "step": 2704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.541, "grad_norm": 1.6396629810333252, "kl": 0.2939010187983513, "learning_rate": 2.216597853229779e-06, "loss": 0.0294, "num_tokens": 17643240.0, "reward": 0.828125, "reward_std": 0.012548625469207764, "rewards//mean": 0.828125, "rewards//std": 0.02365351840853691, "step": 2705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5412, "grad_norm": 1.90702223777771, "kl": 0.41984913125634193, "learning_rate": 2.215021475273649e-06, "loss": 0.042, "num_tokens": 17649744.0, "reward": 0.84881591796875, "reward_std": 0.01628176122903824, "rewards//mean": 0.84881591796875, "rewards//std": 0.024574359878897667, "step": 2706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5414, "grad_norm": 4.266636848449707, "kl": 0.5369407711550593, "learning_rate": 2.2134452121068337e-06, "loss": 0.0537, "num_tokens": 17656232.0, "reward": 0.8475341796875, "reward_std": 0.013721132650971413, "rewards//mean": 0.8475341796875, "rewards//std": 0.024249820038676262, "step": 2707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5416, "grad_norm": 2.020404815673828, "kl": 0.3594751302152872, "learning_rate": 2.2118690643642533e-06, "loss": 0.0359, "num_tokens": 17662760.0, "reward": 0.80133056640625, "reward_std": 0.008947620168328285, "rewards//mean": 0.80133056640625, "rewards//std": 0.022743819281458855, "step": 2708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5418, "grad_norm": 1.7026344537734985, "kl": 0.3874022141098976, "learning_rate": 2.210293032680779e-06, "loss": 0.0387, "num_tokens": 17669336.0, "reward": 0.84149169921875, "reward_std": 0.0103412801399827, "rewards//mean": 0.84149169921875, "rewards//std": 0.030645858496427536, "step": 2709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.542, "grad_norm": 2.4247255325317383, "kl": 0.22128173895180225, "learning_rate": 2.208717117691237e-06, "loss": 0.0221, "num_tokens": 17675920.0, "reward": 0.79541015625, "reward_std": 0.011463804170489311, "rewards//mean": 0.79541015625, "rewards//std": 0.022750556468963623, "step": 2710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5422, "grad_norm": 3.068688154220581, "kl": 0.42983203660696745, "learning_rate": 2.2071413200304046e-06, "loss": 0.043, "num_tokens": 17682416.0, "reward": 0.86480712890625, "reward_std": 0.012933049350976944, "rewards//mean": 0.86480712890625, "rewards//std": 0.02808757685124874, "step": 2711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5424, "grad_norm": 1.9769434928894043, "kl": 0.4256030526012182, "learning_rate": 2.205565640333014e-06, "loss": 0.0426, "num_tokens": 17688920.0, "reward": 0.86358642578125, "reward_std": 0.011334730312228203, "rewards//mean": 0.86358642578125, "rewards//std": 0.019203661009669304, "step": 2712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5426, "grad_norm": 1.5977753400802612, "kl": 0.3877729494124651, "learning_rate": 2.2039900792337477e-06, "loss": 0.0388, "num_tokens": 17695368.0, "reward": 0.8739013671875, "reward_std": 0.009439677931368351, "rewards//mean": 0.8739013671875, "rewards//std": 0.028335275128483772, "step": 2713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5428, "grad_norm": 1.9001837968826294, "kl": 0.30606089159846306, "learning_rate": 2.2024146373672425e-06, "loss": 0.0306, "num_tokens": 17701888.0, "reward": 0.87274169921875, "reward_std": 0.010293031111359596, "rewards//mean": 0.87274169921875, "rewards//std": 0.03084280900657177, "step": 2714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.543, "grad_norm": 1.772414207458496, "kl": 0.29850054159760475, "learning_rate": 2.200839315368086e-06, "loss": 0.0299, "num_tokens": 17708432.0, "reward": 0.8551025390625, "reward_std": 0.011883324012160301, "rewards//mean": 0.8551025390625, "rewards//std": 0.022517479956150055, "step": 2715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5432, "grad_norm": 2.179696559906006, "kl": 0.3372396770864725, "learning_rate": 2.199264113870817e-06, "loss": 0.0337, "num_tokens": 17714952.0, "reward": 0.80963134765625, "reward_std": 0.012188287451863289, "rewards//mean": 0.80963134765625, "rewards//std": 0.019453493878245354, "step": 2716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5434, "grad_norm": 2.146873712539673, "kl": 0.22098803706467152, "learning_rate": 2.197689033509927e-06, "loss": 0.0221, "num_tokens": 17721464.0, "reward": 0.833251953125, "reward_std": 0.010490244254469872, "rewards//mean": 0.833251953125, "rewards//std": 0.018147604539990425, "step": 2717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5436, "grad_norm": 1.5210456848144531, "kl": 0.3028183737769723, "learning_rate": 2.196114074919858e-06, "loss": 0.0303, "num_tokens": 17727976.0, "reward": 0.87091064453125, "reward_std": 0.012011967599391937, "rewards//mean": 0.87091064453125, "rewards//std": 0.027738871052861214, "step": 2718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5438, "grad_norm": 1.4079958200454712, "kl": 0.30798448622226715, "learning_rate": 2.194539238735004e-06, "loss": 0.0308, "num_tokens": 17734496.0, "reward": 0.8548583984375, "reward_std": 0.012459421530365944, "rewards//mean": 0.8548583984375, "rewards//std": 0.021656662225723267, "step": 2719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.544, "grad_norm": 2.246630907058716, "kl": 0.2655184231698513, "learning_rate": 2.1929645255897073e-06, "loss": 0.0266, "num_tokens": 17740984.0, "reward": 0.82080078125, "reward_std": 0.009219856932759285, "rewards//mean": 0.82080078125, "rewards//std": 0.019319286569952965, "step": 2720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5442, "grad_norm": 2.363102674484253, "kl": 0.28014803770929575, "learning_rate": 2.1913899361182634e-06, "loss": 0.028, "num_tokens": 17747536.0, "reward": 0.8388671875, "reward_std": 0.01596798375248909, "rewards//mean": 0.8388671875, "rewards//std": 0.029626740142703056, "step": 2721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5444, "grad_norm": 1.7048940658569336, "kl": 0.20704139629378915, "learning_rate": 2.189815470954916e-06, "loss": 0.0207, "num_tokens": 17754120.0, "reward": 0.83544921875, "reward_std": 0.013447495177388191, "rewards//mean": 0.83544921875, "rewards//std": 0.034185223281383514, "step": 2722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5446, "grad_norm": 1.9328293800354004, "kl": 0.32670245319604874, "learning_rate": 2.1882411307338604e-06, "loss": 0.0327, "num_tokens": 17760632.0, "reward": 0.8238525390625, "reward_std": 0.012412184849381447, "rewards//mean": 0.8238525390625, "rewards//std": 0.026141168549656868, "step": 2723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5448, "grad_norm": 1.9688076972961426, "kl": 0.27909900434315205, "learning_rate": 2.186666916089239e-06, "loss": 0.0279, "num_tokens": 17767080.0, "reward": 0.85443115234375, "reward_std": 0.01526486687362194, "rewards//mean": 0.85443115234375, "rewards//std": 0.019503232091665268, "step": 2724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.545, "grad_norm": 1.7394418716430664, "kl": 0.3434292655438185, "learning_rate": 2.1850928276551473e-06, "loss": 0.0343, "num_tokens": 17773624.0, "reward": 0.860107421875, "reward_std": 0.012777173891663551, "rewards//mean": 0.860107421875, "rewards//std": 0.031349629163742065, "step": 2725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5452, "grad_norm": 2.16192889213562, "kl": 0.3330814726650715, "learning_rate": 2.183518866065627e-06, "loss": 0.0333, "num_tokens": 17780096.0, "reward": 0.82476806640625, "reward_std": 0.010509390383958817, "rewards//mean": 0.82476806640625, "rewards//std": 0.0157310888171196, "step": 2726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5454, "grad_norm": 2.1763970851898193, "kl": 0.38438572362065315, "learning_rate": 2.181945031954669e-06, "loss": 0.0384, "num_tokens": 17786656.0, "reward": 0.84405517578125, "reward_std": 0.010951744392514229, "rewards//mean": 0.84405517578125, "rewards//std": 0.025019995868206024, "step": 2727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5456, "grad_norm": 1.5821623802185059, "kl": 0.28456786274909973, "learning_rate": 2.180371325956214e-06, "loss": 0.0285, "num_tokens": 17793280.0, "reward": 0.80670166015625, "reward_std": 0.012029403820633888, "rewards//mean": 0.80670166015625, "rewards//std": 0.021810157224535942, "step": 2728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5458, "grad_norm": 1.9322948455810547, "kl": 0.40266600623726845, "learning_rate": 2.1787977487041517e-06, "loss": 0.0403, "num_tokens": 17799696.0, "reward": 0.83880615234375, "reward_std": 0.011950632557272911, "rewards//mean": 0.83880615234375, "rewards//std": 0.029472993686795235, "step": 2729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.546, "grad_norm": 1.5945934057235718, "kl": 0.2813077075406909, "learning_rate": 2.1772243008323167e-06, "loss": 0.0281, "num_tokens": 17806184.0, "reward": 0.82916259765625, "reward_std": 0.015093011781573296, "rewards//mean": 0.82916259765625, "rewards//std": 0.023809080943465233, "step": 2730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5462, "grad_norm": 2.512582302093506, "kl": 0.3372381888329983, "learning_rate": 2.1756509829744958e-06, "loss": 0.0337, "num_tokens": 17812712.0, "reward": 0.8289794921875, "reward_std": 0.012917822226881981, "rewards//mean": 0.8289794921875, "rewards//std": 0.035750385373830795, "step": 2731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5464, "grad_norm": 1.9530895948410034, "kl": 0.41231667064130306, "learning_rate": 2.17407779576442e-06, "loss": 0.0412, "num_tokens": 17819256.0, "reward": 0.8104248046875, "reward_std": 0.0122939208522439, "rewards//mean": 0.8104248046875, "rewards//std": 0.02315647527575493, "step": 2732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5466, "grad_norm": 1.603867769241333, "kl": 0.22878009686246514, "learning_rate": 2.1725047398357677e-06, "loss": 0.0229, "num_tokens": 17825920.0, "reward": 0.8720703125, "reward_std": 0.012404918670654297, "rewards//mean": 0.8720703125, "rewards//std": 0.03186400979757309, "step": 2733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5468, "grad_norm": 2.2596688270568848, "kl": 0.31683386489748955, "learning_rate": 2.1709318158221684e-06, "loss": 0.0317, "num_tokens": 17832336.0, "reward": 0.82171630859375, "reward_std": 0.008711433038115501, "rewards//mean": 0.82171630859375, "rewards//std": 0.022254006937146187, "step": 2734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.547, "grad_norm": 1.963685154914856, "kl": 0.28239005617797375, "learning_rate": 2.1693590243571937e-06, "loss": 0.0282, "num_tokens": 17838768.0, "reward": 0.8843994140625, "reward_std": 0.016152461990714073, "rewards//mean": 0.8843994140625, "rewards//std": 0.024994950741529465, "step": 2735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5472, "grad_norm": 1.4212199449539185, "kl": 0.2517299931496382, "learning_rate": 2.167786366074365e-06, "loss": 0.0252, "num_tokens": 17845264.0, "reward": 0.86053466796875, "reward_std": 0.01543545164167881, "rewards//mean": 0.86053466796875, "rewards//std": 0.02558831311762333, "step": 2736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5474, "grad_norm": 2.506329298019409, "kl": 0.44260415621101856, "learning_rate": 2.1662138416071475e-06, "loss": 0.0443, "num_tokens": 17851824.0, "reward": 0.87030029296875, "reward_std": 0.009350113570690155, "rewards//mean": 0.87030029296875, "rewards//std": 0.02376771904528141, "step": 2737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5476, "grad_norm": 1.697428822517395, "kl": 0.40795487724244595, "learning_rate": 2.1646414515889557e-06, "loss": 0.0408, "num_tokens": 17858376.0, "reward": 0.8365478515625, "reward_std": 0.010844911448657513, "rewards//mean": 0.8365478515625, "rewards//std": 0.016488108783960342, "step": 2738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5478, "grad_norm": 1.703357458114624, "kl": 0.25492018181830645, "learning_rate": 2.163069196653146e-06, "loss": 0.0255, "num_tokens": 17864904.0, "reward": 0.85400390625, "reward_std": 0.014388026669621468, "rewards//mean": 0.85400390625, "rewards//std": 0.015064667910337448, "step": 2739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.548, "grad_norm": 2.494652509689331, "kl": 0.2664229925721884, "learning_rate": 2.161497077433025e-06, "loss": 0.0266, "num_tokens": 17871456.0, "reward": 0.7825927734375, "reward_std": 0.011605273932218552, "rewards//mean": 0.7825927734375, "rewards//std": 0.023336201906204224, "step": 2740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5482, "grad_norm": 2.7510640621185303, "kl": 0.3616100074723363, "learning_rate": 2.1599250945618404e-06, "loss": 0.0362, "num_tokens": 17877976.0, "reward": 0.83697509765625, "reward_std": 0.01493571512401104, "rewards//mean": 0.83697509765625, "rewards//std": 0.025479823350906372, "step": 2741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5484, "grad_norm": 2.162010669708252, "kl": 0.3047668244689703, "learning_rate": 2.158353248672786e-06, "loss": 0.0305, "num_tokens": 17884424.0, "reward": 0.8553466796875, "reward_std": 0.014404110610485077, "rewards//mean": 0.8553466796875, "rewards//std": 0.02719692699611187, "step": 2742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5486, "grad_norm": 2.424870252609253, "kl": 0.5817837445065379, "learning_rate": 2.156781540399003e-06, "loss": 0.0582, "num_tokens": 17890968.0, "reward": 0.83795166015625, "reward_std": 0.013328604400157928, "rewards//mean": 0.83795166015625, "rewards//std": 0.02198711968958378, "step": 2743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5488, "grad_norm": 1.710442304611206, "kl": 0.4293752871453762, "learning_rate": 2.1552099703735742e-06, "loss": 0.0429, "num_tokens": 17897520.0, "reward": 0.86212158203125, "reward_std": 0.014160207472741604, "rewards//mean": 0.86212158203125, "rewards//std": 0.02551247924566269, "step": 2744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.549, "grad_norm": 2.0250155925750732, "kl": 0.3636801037937403, "learning_rate": 2.1536385392295283e-06, "loss": 0.0364, "num_tokens": 17904008.0, "reward": 0.84637451171875, "reward_std": 0.012255113571882248, "rewards//mean": 0.84637451171875, "rewards//std": 0.03102484904229641, "step": 2745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5492, "grad_norm": 2.797884941101074, "kl": 0.3806733600795269, "learning_rate": 2.1520672475998374e-06, "loss": 0.0381, "num_tokens": 17910504.0, "reward": 0.7291259765625, "reward_std": 0.012150906026363373, "rewards//mean": 0.7291259765625, "rewards//std": 0.029092323035001755, "step": 2746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5494, "grad_norm": 1.5125242471694946, "kl": 0.3385340813547373, "learning_rate": 2.150496096117417e-06, "loss": 0.0339, "num_tokens": 17916992.0, "reward": 0.86651611328125, "reward_std": 0.010560679249465466, "rewards//mean": 0.86651611328125, "rewards//std": 0.029136713594198227, "step": 2747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5496, "grad_norm": 1.5509902238845825, "kl": 0.2659501153975725, "learning_rate": 2.1489250854151276e-06, "loss": 0.0266, "num_tokens": 17923608.0, "reward": 0.8709716796875, "reward_std": 0.013701347634196281, "rewards//mean": 0.8709716796875, "rewards//std": 0.021417688578367233, "step": 2748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5498, "grad_norm": 1.3203288316726685, "kl": 0.24575026333332062, "learning_rate": 2.147354216125772e-06, "loss": 0.0246, "num_tokens": 17930104.0, "reward": 0.85662841796875, "reward_std": 0.010841279290616512, "rewards//mean": 0.85662841796875, "rewards//std": 0.015573447570204735, "step": 2749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.55, "grad_norm": 2.825763463973999, "kl": 0.37644192203879356, "learning_rate": 2.145783488882096e-06, "loss": 0.0376, "num_tokens": 17936544.0, "reward": 0.861328125, "reward_std": 0.01173393428325653, "rewards//mean": 0.861328125, "rewards//std": 0.022851470857858658, "step": 2750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5502, "grad_norm": 1.4390629529953003, "kl": 0.2921880939975381, "learning_rate": 2.1442129043167877e-06, "loss": 0.0292, "num_tokens": 17943208.0, "reward": 0.823974609375, "reward_std": 0.015232466161251068, "rewards//mean": 0.823974609375, "rewards//std": 0.03223337233066559, "step": 2751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5504, "grad_norm": 1.4214346408843994, "kl": 0.17900089733302593, "learning_rate": 2.1426424630624797e-06, "loss": 0.0179, "num_tokens": 17949632.0, "reward": 0.801513671875, "reward_std": 0.008692212402820587, "rewards//mean": 0.801513671875, "rewards//std": 0.021063128486275673, "step": 2752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5506, "grad_norm": 2.6955084800720215, "kl": 0.40627087466418743, "learning_rate": 2.141072165751744e-06, "loss": 0.0406, "num_tokens": 17956160.0, "reward": 0.82965087890625, "reward_std": 0.014534296467900276, "rewards//mean": 0.82965087890625, "rewards//std": 0.024789631366729736, "step": 2753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5508, "grad_norm": 1.6859873533248901, "kl": 0.1837122500874102, "learning_rate": 2.139502013017098e-06, "loss": 0.0184, "num_tokens": 17962664.0, "reward": 0.88134765625, "reward_std": 0.010720271617174149, "rewards//mean": 0.88134765625, "rewards//std": 0.016565507277846336, "step": 2754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.551, "grad_norm": 1.4431986808776855, "kl": 0.2401575120165944, "learning_rate": 2.1379320054909973e-06, "loss": 0.024, "num_tokens": 17969088.0, "reward": 0.8165283203125, "reward_std": 0.010225853882730007, "rewards//mean": 0.8165283203125, "rewards//std": 0.020786413922905922, "step": 2755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5512, "grad_norm": 1.5322259664535522, "kl": 0.2736801886931062, "learning_rate": 2.136362143805842e-06, "loss": 0.0274, "num_tokens": 17975560.0, "reward": 0.870849609375, "reward_std": 0.010498326271772385, "rewards//mean": 0.870849609375, "rewards//std": 0.019684111699461937, "step": 2756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5514, "grad_norm": 1.4667491912841797, "kl": 0.18936146330088377, "learning_rate": 2.134792428593971e-06, "loss": 0.0189, "num_tokens": 17982120.0, "reward": 0.8389892578125, "reward_std": 0.009833618067204952, "rewards//mean": 0.8389892578125, "rewards//std": 0.014540567062795162, "step": 2757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5516, "grad_norm": 1.7233893871307373, "kl": 0.322734915651381, "learning_rate": 2.133222860487667e-06, "loss": 0.0323, "num_tokens": 17988816.0, "reward": 0.81353759765625, "reward_std": 0.009019844233989716, "rewards//mean": 0.81353759765625, "rewards//std": 0.013836824335157871, "step": 2758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5518, "grad_norm": 1.6291356086730957, "kl": 0.24868507869541645, "learning_rate": 2.1316534401191504e-06, "loss": 0.0249, "num_tokens": 17995392.0, "reward": 0.846923828125, "reward_std": 0.012043749913573265, "rewards//mean": 0.846923828125, "rewards//std": 0.033188533037900925, "step": 2759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.552, "grad_norm": 1.7882100343704224, "kl": 0.2672593081369996, "learning_rate": 2.1300841681205844e-06, "loss": 0.0267, "num_tokens": 18001872.0, "reward": 0.84783935546875, "reward_std": 0.009833501651883125, "rewards//mean": 0.84783935546875, "rewards//std": 0.023307237774133682, "step": 2760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5522, "grad_norm": 1.9165858030319214, "kl": 0.22205201163887978, "learning_rate": 2.128515045124071e-06, "loss": 0.0222, "num_tokens": 18008416.0, "reward": 0.84002685546875, "reward_std": 0.01219249702990055, "rewards//mean": 0.84002685546875, "rewards//std": 0.0244632288813591, "step": 2761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5524, "grad_norm": 1.5670595169067383, "kl": 0.21257047075778246, "learning_rate": 2.126946071761653e-06, "loss": 0.0213, "num_tokens": 18014976.0, "reward": 0.87249755859375, "reward_std": 0.014223047532141209, "rewards//mean": 0.87249755859375, "rewards//std": 0.022918857634067535, "step": 2762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5526, "grad_norm": 2.689857006072998, "kl": 0.2401174157857895, "learning_rate": 2.125377248665313e-06, "loss": 0.024, "num_tokens": 18021448.0, "reward": 0.87579345703125, "reward_std": 0.0155106820166111, "rewards//mean": 0.87579345703125, "rewards//std": 0.023686055094003677, "step": 2763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5528, "grad_norm": 2.9252421855926514, "kl": 0.22952975891530514, "learning_rate": 2.123808576466972e-06, "loss": 0.023, "num_tokens": 18028104.0, "reward": 0.822509765625, "reward_std": 0.013323286548256874, "rewards//mean": 0.822509765625, "rewards//std": 0.032502755522727966, "step": 2764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.553, "grad_norm": 1.8146675825119019, "kl": 0.2515881396830082, "learning_rate": 2.122240055798492e-06, "loss": 0.0252, "num_tokens": 18034680.0, "reward": 0.8818359375, "reward_std": 0.015920104458928108, "rewards//mean": 0.8818359375, "rewards//std": 0.02865428850054741, "step": 2765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5532, "grad_norm": 1.744505763053894, "kl": 0.2599931079894304, "learning_rate": 2.1206716872916713e-06, "loss": 0.026, "num_tokens": 18041232.0, "reward": 0.7940673828125, "reward_std": 0.0121388528496027, "rewards//mean": 0.7940673828125, "rewards//std": 0.029704047366976738, "step": 2766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5534, "grad_norm": 1.6262613534927368, "kl": 0.2906060256063938, "learning_rate": 2.1191034715782505e-06, "loss": 0.0291, "num_tokens": 18047688.0, "reward": 0.8890380859375, "reward_std": 0.009850949048995972, "rewards//mean": 0.8890380859375, "rewards//std": 0.015667088329792023, "step": 2767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5536, "grad_norm": 1.7978684902191162, "kl": 0.39791457913815975, "learning_rate": 2.117535409289905e-06, "loss": 0.0398, "num_tokens": 18054200.0, "reward": 0.8543701171875, "reward_std": 0.016843993216753006, "rewards//mean": 0.8543701171875, "rewards//std": 0.04015986621379852, "step": 2768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5538, "grad_norm": 2.709608554840088, "kl": 0.4049006514251232, "learning_rate": 2.115967501058252e-06, "loss": 0.0405, "num_tokens": 18060616.0, "reward": 0.87060546875, "reward_std": 0.012675169855356216, "rewards//mean": 0.87060546875, "rewards//std": 0.02799154818058014, "step": 2769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.554, "grad_norm": 1.808256983757019, "kl": 0.19101037550717592, "learning_rate": 2.1143997475148424e-06, "loss": 0.0191, "num_tokens": 18067168.0, "reward": 0.86102294921875, "reward_std": 0.011935347691178322, "rewards//mean": 0.86102294921875, "rewards//std": 0.02509249374270439, "step": 2770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5542, "grad_norm": 2.1386635303497314, "kl": 0.23312299605458975, "learning_rate": 2.1128321492911697e-06, "loss": 0.0233, "num_tokens": 18073744.0, "reward": 0.77911376953125, "reward_std": 0.013266094028949738, "rewards//mean": 0.77911376953125, "rewards//std": 0.019709378480911255, "step": 2771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5544, "grad_norm": 1.935979962348938, "kl": 0.3111722944304347, "learning_rate": 2.1112647070186597e-06, "loss": 0.0311, "num_tokens": 18080224.0, "reward": 0.87091064453125, "reward_std": 0.016218971461057663, "rewards//mean": 0.87091064453125, "rewards//std": 0.039820462465286255, "step": 2772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5546, "grad_norm": 1.627297043800354, "kl": 0.1860641185194254, "learning_rate": 2.1096974213286803e-06, "loss": 0.0186, "num_tokens": 18086768.0, "reward": 0.85772705078125, "reward_std": 0.01169888861477375, "rewards//mean": 0.85772705078125, "rewards//std": 0.03691915422677994, "step": 2773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5548, "grad_norm": 1.6692906618118286, "kl": 0.21927758678793907, "learning_rate": 2.1081302928525323e-06, "loss": 0.0219, "num_tokens": 18093384.0, "reward": 0.8409423828125, "reward_std": 0.012402446940541267, "rewards//mean": 0.8409423828125, "rewards//std": 0.022728921845555305, "step": 2774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.555, "grad_norm": 1.861154317855835, "kl": 0.2927249316126108, "learning_rate": 2.1065633222214555e-06, "loss": 0.0293, "num_tokens": 18099944.0, "reward": 0.82879638671875, "reward_std": 0.010571196675300598, "rewards//mean": 0.82879638671875, "rewards//std": 0.01909298449754715, "step": 2775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5552, "grad_norm": 1.8184797763824463, "kl": 0.26383697614073753, "learning_rate": 2.1049965100666255e-06, "loss": 0.0264, "num_tokens": 18106400.0, "reward": 0.8663330078125, "reward_std": 0.011407744139432907, "rewards//mean": 0.8663330078125, "rewards//std": 0.020427938550710678, "step": 2776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5554, "grad_norm": 2.6112241744995117, "kl": 0.2848534947261214, "learning_rate": 2.1034298570191542e-06, "loss": 0.0285, "num_tokens": 18112968.0, "reward": 0.885009765625, "reward_std": 0.015326723456382751, "rewards//mean": 0.885009765625, "rewards//std": 0.026656389236450195, "step": 2777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5556, "grad_norm": 2.55448842048645, "kl": 0.5408084243535995, "learning_rate": 2.1018633637100892e-06, "loss": 0.0541, "num_tokens": 18119448.0, "reward": 0.86224365234375, "reward_std": 0.017955873161554337, "rewards//mean": 0.86224365234375, "rewards//std": 0.025706946849822998, "step": 2778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5558, "grad_norm": 1.4748960733413696, "kl": 0.2727449405938387, "learning_rate": 2.1002970307704134e-06, "loss": 0.0273, "num_tokens": 18126040.0, "reward": 0.86273193359375, "reward_std": 0.012766191735863686, "rewards//mean": 0.86273193359375, "rewards//std": 0.033195775002241135, "step": 2779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.556, "grad_norm": 1.9746555089950562, "kl": 0.4187017949298024, "learning_rate": 2.098730858831046e-06, "loss": 0.0419, "num_tokens": 18132584.0, "reward": 0.8656005859375, "reward_std": 0.014332510530948639, "rewards//mean": 0.8656005859375, "rewards//std": 0.03951694443821907, "step": 2780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5562, "grad_norm": 2.6885170936584473, "kl": 0.24640971049666405, "learning_rate": 2.0971648485228404e-06, "loss": 0.0246, "num_tokens": 18139016.0, "reward": 0.84912109375, "reward_std": 0.012926872819662094, "rewards//mean": 0.84912109375, "rewards//std": 0.021298972889780998, "step": 2781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5564, "grad_norm": 1.3791922330856323, "kl": 0.2978736599907279, "learning_rate": 2.0955990004765857e-06, "loss": 0.0298, "num_tokens": 18145576.0, "reward": 0.884521484375, "reward_std": 0.012912002392113209, "rewards//mean": 0.884521484375, "rewards//std": 0.023363754153251648, "step": 2782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5566, "grad_norm": 1.5120372772216797, "kl": 0.27655358519405127, "learning_rate": 2.094033315323005e-06, "loss": 0.0277, "num_tokens": 18152120.0, "reward": 0.80078125, "reward_std": 0.00766813475638628, "rewards//mean": 0.80078125, "rewards//std": 0.019098632037639618, "step": 2783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5568, "grad_norm": 2.178332567214966, "kl": 0.4281632397323847, "learning_rate": 2.0924677936927567e-06, "loss": 0.0428, "num_tokens": 18158648.0, "reward": 0.8629150390625, "reward_std": 0.01762513816356659, "rewards//mean": 0.8629150390625, "rewards//std": 0.028260383754968643, "step": 2784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.557, "grad_norm": 2.2599310874938965, "kl": 0.35593687929213047, "learning_rate": 2.0909024362164323e-06, "loss": 0.0356, "num_tokens": 18165152.0, "reward": 0.77764892578125, "reward_std": 0.008508952334523201, "rewards//mean": 0.77764892578125, "rewards//std": 0.016401004046201706, "step": 2785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5572, "grad_norm": 1.9904800653457642, "kl": 0.43947565369307995, "learning_rate": 2.089337243524558e-06, "loss": 0.0439, "num_tokens": 18171712.0, "reward": 0.77301025390625, "reward_std": 0.009836118668317795, "rewards//mean": 0.77301025390625, "rewards//std": 0.01947682350873947, "step": 2786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5574, "grad_norm": 1.7544703483581543, "kl": 0.35265037417411804, "learning_rate": 2.087772216247592e-06, "loss": 0.0353, "num_tokens": 18178224.0, "reward": 0.87109375, "reward_std": 0.012455819174647331, "rewards//mean": 0.87109375, "rewards//std": 0.02403947524726391, "step": 2787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5576, "grad_norm": 2.887469530105591, "kl": 0.2521998845040798, "learning_rate": 2.086207355015929e-06, "loss": 0.0252, "num_tokens": 18184736.0, "reward": 0.83343505859375, "reward_std": 0.014749730005860329, "rewards//mean": 0.83343505859375, "rewards//std": 0.029946843162178993, "step": 2788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5578, "grad_norm": 1.5823661088943481, "kl": 0.31834359280765057, "learning_rate": 2.0846426604598932e-06, "loss": 0.0318, "num_tokens": 18191200.0, "reward": 0.87567138671875, "reward_std": 0.009988244622945786, "rewards//mean": 0.87567138671875, "rewards//std": 0.02554924041032791, "step": 2789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.558, "grad_norm": 3.1820790767669678, "kl": 0.4507823567837477, "learning_rate": 2.0830781332097446e-06, "loss": 0.0451, "num_tokens": 18197680.0, "reward": 0.82122802734375, "reward_std": 0.014466448687016964, "rewards//mean": 0.82122802734375, "rewards//std": 0.023834500461816788, "step": 2790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5582, "grad_norm": 2.0599639415740967, "kl": 0.42233809642493725, "learning_rate": 2.0815137738956736e-06, "loss": 0.0422, "num_tokens": 18204088.0, "reward": 0.84283447265625, "reward_std": 0.014082375913858414, "rewards//mean": 0.84283447265625, "rewards//std": 0.02076045423746109, "step": 2791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5584, "grad_norm": 1.7829320430755615, "kl": 0.2721722610294819, "learning_rate": 2.079949583147805e-06, "loss": 0.0272, "num_tokens": 18210536.0, "reward": 0.87890625, "reward_std": 0.012034351006150246, "rewards//mean": 0.87890625, "rewards//std": 0.02340647764503956, "step": 2792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5586, "grad_norm": 1.5859808921813965, "kl": 0.3157652039080858, "learning_rate": 2.0783855615961938e-06, "loss": 0.0316, "num_tokens": 18217024.0, "reward": 0.81842041015625, "reward_std": 0.011793545447289944, "rewards//mean": 0.81842041015625, "rewards//std": 0.02107597142457962, "step": 2793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5588, "grad_norm": 2.0936577320098877, "kl": 0.33291765581816435, "learning_rate": 2.076821709870828e-06, "loss": 0.0333, "num_tokens": 18223512.0, "reward": 0.86187744140625, "reward_std": 0.01852244883775711, "rewards//mean": 0.86187744140625, "rewards//std": 0.04362252727150917, "step": 2794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.559, "grad_norm": 2.178611993789673, "kl": 0.29304058477282524, "learning_rate": 2.075258028601627e-06, "loss": 0.0293, "num_tokens": 18230000.0, "reward": 0.860107421875, "reward_std": 0.010560743510723114, "rewards//mean": 0.860107421875, "rewards//std": 0.029047802090644836, "step": 2795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5592, "grad_norm": 1.9282448291778564, "kl": 0.2878599837422371, "learning_rate": 2.0736945184184406e-06, "loss": 0.0288, "num_tokens": 18236528.0, "reward": 0.85699462890625, "reward_std": 0.013776151463389397, "rewards//mean": 0.85699462890625, "rewards//std": 0.03358565643429756, "step": 2796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5594, "grad_norm": 2.4503023624420166, "kl": 0.23530966602265835, "learning_rate": 2.072131179951052e-06, "loss": 0.0235, "num_tokens": 18243144.0, "reward": 0.86456298828125, "reward_std": 0.015571060590445995, "rewards//mean": 0.86456298828125, "rewards//std": 0.02727772295475006, "step": 2797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5596, "grad_norm": 2.794254779815674, "kl": 0.3079934846609831, "learning_rate": 2.0705680138291724e-06, "loss": 0.0308, "num_tokens": 18249704.0, "reward": 0.81268310546875, "reward_std": 0.009605806320905685, "rewards//mean": 0.81268310546875, "rewards//std": 0.021023474633693695, "step": 2798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5598, "grad_norm": 1.561891794204712, "kl": 0.4154408434405923, "learning_rate": 2.069005020682446e-06, "loss": 0.0415, "num_tokens": 18256176.0, "reward": 0.78961181640625, "reward_std": 0.010574070736765862, "rewards//mean": 0.78961181640625, "rewards//std": 0.01872553490102291, "step": 2799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.56, "grad_norm": 1.6736783981323242, "kl": 0.34489313792437315, "learning_rate": 2.067442201140445e-06, "loss": 0.0345, "num_tokens": 18262672.0, "reward": 0.86383056640625, "reward_std": 0.013642764650285244, "rewards//mean": 0.86383056640625, "rewards//std": 0.024168768897652626, "step": 2800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5602, "grad_norm": 1.682488203048706, "kl": 0.37077015917748213, "learning_rate": 2.0658795558326745e-06, "loss": 0.0371, "num_tokens": 18269176.0, "reward": 0.866943359375, "reward_std": 0.012578604742884636, "rewards//mean": 0.866943359375, "rewards//std": 0.02743549644947052, "step": 2801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5604, "grad_norm": 3.5041756629943848, "kl": 0.4512365162372589, "learning_rate": 2.0643170853885665e-06, "loss": 0.0451, "num_tokens": 18275728.0, "reward": 0.83001708984375, "reward_std": 0.016096124425530434, "rewards//mean": 0.83001708984375, "rewards//std": 0.023673908784985542, "step": 2802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5606, "grad_norm": 1.8885712623596191, "kl": 0.3163228742778301, "learning_rate": 2.0627547904374846e-06, "loss": 0.0316, "num_tokens": 18282200.0, "reward": 0.84124755859375, "reward_std": 0.011756704188883305, "rewards//mean": 0.84124755859375, "rewards//std": 0.016810311004519463, "step": 2803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5608, "grad_norm": 1.9203038215637207, "kl": 0.3495679283514619, "learning_rate": 2.0611926716087203e-06, "loss": 0.035, "num_tokens": 18288824.0, "reward": 0.859130859375, "reward_std": 0.01118490006774664, "rewards//mean": 0.859130859375, "rewards//std": 0.03059895895421505, "step": 2804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.561, "grad_norm": 2.5079219341278076, "kl": 0.38363502733409405, "learning_rate": 2.059630729531496e-06, "loss": 0.0384, "num_tokens": 18295352.0, "reward": 0.83929443359375, "reward_std": 0.011413225904107094, "rewards//mean": 0.83929443359375, "rewards//std": 0.02448735013604164, "step": 2805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5612, "grad_norm": 1.617093563079834, "kl": 0.22164367698132992, "learning_rate": 2.0580689648349605e-06, "loss": 0.0222, "num_tokens": 18301848.0, "reward": 0.85107421875, "reward_std": 0.012276215478777885, "rewards//mean": 0.85107421875, "rewards//std": 0.03260412439703941, "step": 2806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5614, "grad_norm": 1.5436896085739136, "kl": 0.36944496631622314, "learning_rate": 2.0565073781481943e-06, "loss": 0.0369, "num_tokens": 18308320.0, "reward": 0.8448486328125, "reward_std": 0.012660009786486626, "rewards//mean": 0.8448486328125, "rewards//std": 0.026887930929660797, "step": 2807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5616, "grad_norm": 1.861499547958374, "kl": 0.2932739267125726, "learning_rate": 2.054945970100202e-06, "loss": 0.0293, "num_tokens": 18314808.0, "reward": 0.8780517578125, "reward_std": 0.012834051623940468, "rewards//mean": 0.8780517578125, "rewards//std": 0.016135461628437042, "step": 2808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5618, "grad_norm": 2.086439371109009, "kl": 0.2438540132716298, "learning_rate": 2.0533847413199202e-06, "loss": 0.0244, "num_tokens": 18321280.0, "reward": 0.85693359375, "reward_std": 0.017634930089116096, "rewards//mean": 0.85693359375, "rewards//std": 0.027423353865742683, "step": 2809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.562, "grad_norm": 2.6038453578948975, "kl": 0.4882434867322445, "learning_rate": 2.0518236924362118e-06, "loss": 0.0488, "num_tokens": 18327832.0, "reward": 0.78082275390625, "reward_std": 0.009107234887778759, "rewards//mean": 0.78082275390625, "rewards//std": 0.013389809988439083, "step": 2810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5622, "grad_norm": 2.096039295196533, "kl": 0.5229883044958115, "learning_rate": 2.0502628240778655e-06, "loss": 0.0523, "num_tokens": 18334400.0, "reward": 0.83740234375, "reward_std": 0.012887082993984222, "rewards//mean": 0.83740234375, "rewards//std": 0.019704097881913185, "step": 2811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5624, "grad_norm": 2.3062119483947754, "kl": 0.4734035525470972, "learning_rate": 2.0487021368736002e-06, "loss": 0.0473, "num_tokens": 18340968.0, "reward": 0.8460693359375, "reward_std": 0.015495773404836655, "rewards//mean": 0.8460693359375, "rewards//std": 0.032930970191955566, "step": 2812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5626, "grad_norm": 1.9029260873794556, "kl": 0.1699219266884029, "learning_rate": 2.04714163145206e-06, "loss": 0.017, "num_tokens": 18347408.0, "reward": 0.87408447265625, "reward_std": 0.01141197606921196, "rewards//mean": 0.87408447265625, "rewards//std": 0.025861937552690506, "step": 2813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5628, "grad_norm": 2.019824504852295, "kl": 0.355945591814816, "learning_rate": 2.045581308441817e-06, "loss": 0.0356, "num_tokens": 18354016.0, "reward": 0.84381103515625, "reward_std": 0.012773682363331318, "rewards//mean": 0.84381103515625, "rewards//std": 0.031638920307159424, "step": 2814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.563, "grad_norm": 1.5178500413894653, "kl": 0.23861664440482855, "learning_rate": 2.044021168471368e-06, "loss": 0.0239, "num_tokens": 18360568.0, "reward": 0.835693359375, "reward_std": 0.013922713696956635, "rewards//mean": 0.835693359375, "rewards//std": 0.02603120170533657, "step": 2815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5632, "grad_norm": 1.9562556743621826, "kl": 0.2615255769342184, "learning_rate": 2.0424612121691383e-06, "loss": 0.0262, "num_tokens": 18367008.0, "reward": 0.86236572265625, "reward_std": 0.011265065521001816, "rewards//mean": 0.86236572265625, "rewards//std": 0.01923437789082527, "step": 2816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5634, "grad_norm": 2.1965296268463135, "kl": 0.3983521070331335, "learning_rate": 2.0409014401634773e-06, "loss": 0.0398, "num_tokens": 18373592.0, "reward": 0.80462646484375, "reward_std": 0.01215679943561554, "rewards//mean": 0.80462646484375, "rewards//std": 0.02916112169623375, "step": 2817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5636, "grad_norm": 1.7736155986785889, "kl": 0.37301212921738625, "learning_rate": 2.0393418530826617e-06, "loss": 0.0373, "num_tokens": 18380152.0, "reward": 0.85906982421875, "reward_std": 0.014091813936829567, "rewards//mean": 0.85906982421875, "rewards//std": 0.03067350760102272, "step": 2818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5638, "grad_norm": 2.78847336769104, "kl": 0.3128879275172949, "learning_rate": 2.0377824515548923e-06, "loss": 0.0313, "num_tokens": 18386896.0, "reward": 0.837158203125, "reward_std": 0.011339973658323288, "rewards//mean": 0.837158203125, "rewards//std": 0.028218841180205345, "step": 2819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.564, "grad_norm": 2.0876760482788086, "kl": 0.22145891468971968, "learning_rate": 2.036223236208296e-06, "loss": 0.0221, "num_tokens": 18393400.0, "reward": 0.82318115234375, "reward_std": 0.011123064905405045, "rewards//mean": 0.82318115234375, "rewards//std": 0.01978679746389389, "step": 2820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5642, "grad_norm": 2.1770637035369873, "kl": 0.5075486805289984, "learning_rate": 2.034664207670925e-06, "loss": 0.0508, "num_tokens": 18399960.0, "reward": 0.84210205078125, "reward_std": 0.01431229617446661, "rewards//mean": 0.84210205078125, "rewards//std": 0.019310567528009415, "step": 2821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5644, "grad_norm": 2.4944581985473633, "kl": 0.3260407727211714, "learning_rate": 2.0331053665707556e-06, "loss": 0.0326, "num_tokens": 18406432.0, "reward": 0.8055419921875, "reward_std": 0.010118969716131687, "rewards//mean": 0.8055419921875, "rewards//std": 0.01944386214017868, "step": 2822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5646, "grad_norm": 1.7906310558319092, "kl": 0.3701071720570326, "learning_rate": 2.031546713535688e-06, "loss": 0.037, "num_tokens": 18412912.0, "reward": 0.84515380859375, "reward_std": 0.017547607421875, "rewards//mean": 0.84515380859375, "rewards//std": 0.0359119288623333, "step": 2823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5648, "grad_norm": 1.7331050634384155, "kl": 0.3822266608476639, "learning_rate": 2.0299882491935492e-06, "loss": 0.0382, "num_tokens": 18419360.0, "reward": 0.83740234375, "reward_std": 0.012641796842217445, "rewards//mean": 0.83740234375, "rewards//std": 0.02974504418671131, "step": 2824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.565, "grad_norm": 3.3479583263397217, "kl": 0.39208398573100567, "learning_rate": 2.0284299741720875e-06, "loss": 0.0392, "num_tokens": 18425888.0, "reward": 0.845458984375, "reward_std": 0.0196065790951252, "rewards//mean": 0.845458984375, "rewards//std": 0.03876296058297157, "step": 2825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5652, "grad_norm": 1.7005152702331543, "kl": 0.41619318071752787, "learning_rate": 2.0268718890989754e-06, "loss": 0.0416, "num_tokens": 18432392.0, "reward": 0.8544921875, "reward_std": 0.010762276127934456, "rewards//mean": 0.8544921875, "rewards//std": 0.021190667524933815, "step": 2826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5654, "grad_norm": 2.9131317138671875, "kl": 0.2476744530722499, "learning_rate": 2.0253139946018093e-06, "loss": 0.0248, "num_tokens": 18438896.0, "reward": 0.869873046875, "reward_std": 0.0135627631098032, "rewards//mean": 0.869873046875, "rewards//std": 0.024258870631456375, "step": 2827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5656, "grad_norm": 2.4183719158172607, "kl": 0.32592375949025154, "learning_rate": 2.02375629130811e-06, "loss": 0.0326, "num_tokens": 18445432.0, "reward": 0.8302001953125, "reward_std": 0.011712683364748955, "rewards//mean": 0.8302001953125, "rewards//std": 0.01822875253856182, "step": 2828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5658, "grad_norm": 1.7280998229980469, "kl": 0.25227757170796394, "learning_rate": 2.022198779845319e-06, "loss": 0.0252, "num_tokens": 18451984.0, "reward": 0.85699462890625, "reward_std": 0.011648004874587059, "rewards//mean": 0.85699462890625, "rewards//std": 0.03700311481952667, "step": 2829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.566, "grad_norm": 2.022475481033325, "kl": 0.398098049685359, "learning_rate": 2.020641460840803e-06, "loss": 0.0398, "num_tokens": 18458512.0, "reward": 0.84442138671875, "reward_std": 0.01042837742716074, "rewards//mean": 0.84442138671875, "rewards//std": 0.019569095224142075, "step": 2830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5662, "grad_norm": 2.5086843967437744, "kl": 0.37785210832953453, "learning_rate": 2.019084334921849e-06, "loss": 0.0378, "num_tokens": 18464992.0, "reward": 0.77593994140625, "reward_std": 0.010385055094957352, "rewards//mean": 0.77593994140625, "rewards//std": 0.02341027744114399, "step": 2831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5664, "grad_norm": 2.6404781341552734, "kl": 0.3636457081884146, "learning_rate": 2.0175274027156673e-06, "loss": 0.0364, "num_tokens": 18471520.0, "reward": 0.79327392578125, "reward_std": 0.010034476406872272, "rewards//mean": 0.79327392578125, "rewards//std": 0.021979544311761856, "step": 2832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5666, "grad_norm": 2.583080530166626, "kl": 0.36448066122829914, "learning_rate": 2.015970664849389e-06, "loss": 0.0364, "num_tokens": 18478008.0, "reward": 0.8577880859375, "reward_std": 0.01369045302271843, "rewards//mean": 0.8577880859375, "rewards//std": 0.026545705273747444, "step": 2833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5668, "grad_norm": 1.6818803548812866, "kl": 0.37824024073779583, "learning_rate": 2.0144141219500707e-06, "loss": 0.0378, "num_tokens": 18484600.0, "reward": 0.84906005859375, "reward_std": 0.010223206132650375, "rewards//mean": 0.84906005859375, "rewards//std": 0.02121770568192005, "step": 2834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.567, "grad_norm": 2.606274366378784, "kl": 0.2858577538281679, "learning_rate": 2.0128577746446854e-06, "loss": 0.0286, "num_tokens": 18491088.0, "reward": 0.865478515625, "reward_std": 0.011398808099329472, "rewards//mean": 0.865478515625, "rewards//std": 0.02818448841571808, "step": 2835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5672, "grad_norm": 3.407790422439575, "kl": 0.4382147118449211, "learning_rate": 2.01130162356013e-06, "loss": 0.0438, "num_tokens": 18497648.0, "reward": 0.812255859375, "reward_std": 0.01388513669371605, "rewards//mean": 0.812255859375, "rewards//std": 0.029967118054628372, "step": 2836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5674, "grad_norm": 1.5586738586425781, "kl": 0.18488372303545475, "learning_rate": 2.0097456693232222e-06, "loss": 0.0185, "num_tokens": 18504280.0, "reward": 0.83935546875, "reward_std": 0.01019925158470869, "rewards//mean": 0.83935546875, "rewards//std": 0.021558932960033417, "step": 2837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5676, "grad_norm": 1.7547770738601685, "kl": 0.330087972804904, "learning_rate": 2.0081899125607006e-06, "loss": 0.033, "num_tokens": 18510736.0, "reward": 0.8094482421875, "reward_std": 0.012299812398850918, "rewards//mean": 0.8094482421875, "rewards//std": 0.02102392353117466, "step": 2838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5678, "grad_norm": 1.3871917724609375, "kl": 0.18589959666132927, "learning_rate": 2.0066343538992237e-06, "loss": 0.0186, "num_tokens": 18517192.0, "reward": 0.80914306640625, "reward_std": 0.01168447732925415, "rewards//mean": 0.80914306640625, "rewards//std": 0.022007765248417854, "step": 2839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.568, "grad_norm": 2.0061252117156982, "kl": 0.3835411136969924, "learning_rate": 2.0050789939653694e-06, "loss": 0.0384, "num_tokens": 18523680.0, "reward": 0.846923828125, "reward_std": 0.01268633920699358, "rewards//mean": 0.846923828125, "rewards//std": 0.0221941526979208, "step": 2840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5682, "grad_norm": 1.5667272806167603, "kl": 0.46895091235637665, "learning_rate": 2.003523833385637e-06, "loss": 0.0469, "num_tokens": 18530168.0, "reward": 0.836181640625, "reward_std": 0.01137080043554306, "rewards//mean": 0.836181640625, "rewards//std": 0.019185619428753853, "step": 2841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5684, "grad_norm": 1.5946049690246582, "kl": 0.46989670395851135, "learning_rate": 2.0019688727864453e-06, "loss": 0.047, "num_tokens": 18536584.0, "reward": 0.79803466796875, "reward_std": 0.011583784595131874, "rewards//mean": 0.79803466796875, "rewards//std": 0.02182888798415661, "step": 2842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5686, "grad_norm": 1.9692376852035522, "kl": 0.316482225432992, "learning_rate": 2.0004141127941322e-06, "loss": 0.0316, "num_tokens": 18543128.0, "reward": 0.84228515625, "reward_std": 0.010870113037526608, "rewards//mean": 0.84228515625, "rewards//std": 0.018222521990537643, "step": 2843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5688, "grad_norm": 1.7705665826797485, "kl": 0.23573397658765316, "learning_rate": 1.9988595540349546e-06, "loss": 0.0236, "num_tokens": 18549600.0, "reward": 0.86846923828125, "reward_std": 0.013795951381325722, "rewards//mean": 0.86846923828125, "rewards//std": 0.02959446795284748, "step": 2844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.569, "grad_norm": 2.6521542072296143, "kl": 0.4081556936725974, "learning_rate": 1.997305197135089e-06, "loss": 0.0408, "num_tokens": 18556104.0, "reward": 0.82159423828125, "reward_std": 0.01704772561788559, "rewards//mean": 0.82159423828125, "rewards//std": 0.03549600765109062, "step": 2845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5692, "grad_norm": 1.6381239891052246, "kl": 0.20320568792521954, "learning_rate": 1.9957510427206295e-06, "loss": 0.0203, "num_tokens": 18562592.0, "reward": 0.8670654296875, "reward_std": 0.009473703801631927, "rewards//mean": 0.8670654296875, "rewards//std": 0.013954070396721363, "step": 2846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5694, "grad_norm": 2.241696834564209, "kl": 0.3075030483305454, "learning_rate": 1.9941970914175902e-06, "loss": 0.0308, "num_tokens": 18569088.0, "reward": 0.8406982421875, "reward_std": 0.01183834858238697, "rewards//mean": 0.8406982421875, "rewards//std": 0.017097482457756996, "step": 2847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5696, "grad_norm": 2.3617289066314697, "kl": 0.35171512607485056, "learning_rate": 1.9926433438519013e-06, "loss": 0.0352, "num_tokens": 18575544.0, "reward": 0.840087890625, "reward_std": 0.012795694172382355, "rewards//mean": 0.840087890625, "rewards//std": 0.03310084342956543, "step": 2848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5698, "grad_norm": 1.5003111362457275, "kl": 0.26101219467818737, "learning_rate": 1.9910898006494135e-06, "loss": 0.0261, "num_tokens": 18582064.0, "reward": 0.85687255859375, "reward_std": 0.016328658908605576, "rewards//mean": 0.85687255859375, "rewards//std": 0.0265059731900692, "step": 2849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.57, "grad_norm": 2.83984112739563, "kl": 0.30579084530472755, "learning_rate": 1.989536462435892e-06, "loss": 0.0306, "num_tokens": 18588560.0, "reward": 0.85711669921875, "reward_std": 0.013638116419315338, "rewards//mean": 0.85711669921875, "rewards//std": 0.031733036041259766, "step": 2850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5702, "grad_norm": 1.2914718389511108, "kl": 0.20834377501159906, "learning_rate": 1.987983329837024e-06, "loss": 0.0208, "num_tokens": 18595032.0, "reward": 0.8685302734375, "reward_std": 0.008773893117904663, "rewards//mean": 0.8685302734375, "rewards//std": 0.023439761251211166, "step": 2851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5704, "grad_norm": 2.0279672145843506, "kl": 0.1590603655204177, "learning_rate": 1.986430403478408e-06, "loss": 0.0159, "num_tokens": 18601504.0, "reward": 0.839111328125, "reward_std": 0.010550418868660927, "rewards//mean": 0.839111328125, "rewards//std": 0.016453644260764122, "step": 2852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5706, "grad_norm": 2.5972352027893066, "kl": 0.36180374026298523, "learning_rate": 1.9848776839855655e-06, "loss": 0.0362, "num_tokens": 18608040.0, "reward": 0.85406494140625, "reward_std": 0.011659136973321438, "rewards//mean": 0.85406494140625, "rewards//std": 0.025642091408371925, "step": 2853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5708, "grad_norm": 2.463425636291504, "kl": 0.38413443230092525, "learning_rate": 1.9833251719839292e-06, "loss": 0.0384, "num_tokens": 18614616.0, "reward": 0.86492919921875, "reward_std": 0.010853348299860954, "rewards//mean": 0.86492919921875, "rewards//std": 0.021131202578544617, "step": 2854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.571, "grad_norm": 1.7727768421173096, "kl": 0.21198936365544796, "learning_rate": 1.981772868098852e-06, "loss": 0.0212, "num_tokens": 18621048.0, "reward": 0.86907958984375, "reward_std": 0.009306253865361214, "rewards//mean": 0.86907958984375, "rewards//std": 0.0141441123560071, "step": 2855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5712, "grad_norm": 1.485203504562378, "kl": 0.3663062732666731, "learning_rate": 1.9802207729556023e-06, "loss": 0.0366, "num_tokens": 18627568.0, "reward": 0.85284423828125, "reward_std": 0.010410632006824017, "rewards//mean": 0.85284423828125, "rewards//std": 0.017134075984358788, "step": 2856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5714, "grad_norm": 1.7590725421905518, "kl": 0.4445856623351574, "learning_rate": 1.9786688871793623e-06, "loss": 0.0445, "num_tokens": 18634160.0, "reward": 0.8621826171875, "reward_std": 0.01593841053545475, "rewards//mean": 0.8621826171875, "rewards//std": 0.026166634634137154, "step": 2857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5716, "grad_norm": 2.8758177757263184, "kl": 0.5777189303189516, "learning_rate": 1.9771172113952327e-06, "loss": 0.0578, "num_tokens": 18640728.0, "reward": 0.82659912109375, "reward_std": 0.017544452100992203, "rewards//mean": 0.82659912109375, "rewards//std": 0.022942621260881424, "step": 2858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5718, "grad_norm": 1.623428225517273, "kl": 0.5016990136355162, "learning_rate": 1.9755657462282273e-06, "loss": 0.0502, "num_tokens": 18647440.0, "reward": 0.8720703125, "reward_std": 0.016019845381379128, "rewards//mean": 0.8720703125, "rewards//std": 0.04693180322647095, "step": 2859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.572, "grad_norm": 2.0242860317230225, "kl": 0.41456918604671955, "learning_rate": 1.9740144923032773e-06, "loss": 0.0415, "num_tokens": 18653920.0, "reward": 0.83709716796875, "reward_std": 0.01080261543393135, "rewards//mean": 0.83709716796875, "rewards//std": 0.017767317593097687, "step": 2860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5722, "grad_norm": 2.51069974899292, "kl": 0.3196733519434929, "learning_rate": 1.972463450245226e-06, "loss": 0.032, "num_tokens": 18660440.0, "reward": 0.88299560546875, "reward_std": 0.015948936343193054, "rewards//mean": 0.88299560546875, "rewards//std": 0.0399479866027832, "step": 2861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5724, "grad_norm": 1.8896172046661377, "kl": 0.2712840735912323, "learning_rate": 1.9709126206788347e-06, "loss": 0.0271, "num_tokens": 18666920.0, "reward": 0.82440185546875, "reward_std": 0.01087958738207817, "rewards//mean": 0.82440185546875, "rewards//std": 0.02736968919634819, "step": 2862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5726, "grad_norm": 1.6533724069595337, "kl": 0.2832153094932437, "learning_rate": 1.969362004228776e-06, "loss": 0.0283, "num_tokens": 18673496.0, "reward": 0.8604736328125, "reward_std": 0.011993307620286942, "rewards//mean": 0.8604736328125, "rewards//std": 0.035238612443208694, "step": 2863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5728, "grad_norm": 1.7784078121185303, "kl": 0.46245223470032215, "learning_rate": 1.9678116015196387e-06, "loss": 0.0462, "num_tokens": 18680008.0, "reward": 0.8514404296875, "reward_std": 0.013494458049535751, "rewards//mean": 0.8514404296875, "rewards//std": 0.03147372230887413, "step": 2864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.573, "grad_norm": 1.5519423484802246, "kl": 0.3786465637385845, "learning_rate": 1.9662614131759243e-06, "loss": 0.0379, "num_tokens": 18686544.0, "reward": 0.87261962890625, "reward_std": 0.010604584589600563, "rewards//mean": 0.87261962890625, "rewards//std": 0.022473318502306938, "step": 2865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5732, "grad_norm": 1.415085792541504, "kl": 0.2632770296186209, "learning_rate": 1.9647114398220494e-06, "loss": 0.0263, "num_tokens": 18693008.0, "reward": 0.761474609375, "reward_std": 0.010835697874426842, "rewards//mean": 0.761474609375, "rewards//std": 0.034847456961870193, "step": 2866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5734, "grad_norm": 2.4145021438598633, "kl": 0.3491997513920069, "learning_rate": 1.963161682082342e-06, "loss": 0.0349, "num_tokens": 18699480.0, "reward": 0.8460693359375, "reward_std": 0.011227492243051529, "rewards//mean": 0.8460693359375, "rewards//std": 0.01859380677342415, "step": 2867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5736, "grad_norm": 1.8243117332458496, "kl": 0.45009888149797916, "learning_rate": 1.9616121405810455e-06, "loss": 0.045, "num_tokens": 18705976.0, "reward": 0.83795166015625, "reward_std": 0.014016980305314064, "rewards//mean": 0.83795166015625, "rewards//std": 0.02102995291352272, "step": 2868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5738, "grad_norm": 1.7589702606201172, "kl": 0.3110180087387562, "learning_rate": 1.960062815942314e-06, "loss": 0.0311, "num_tokens": 18712456.0, "reward": 0.88702392578125, "reward_std": 0.011276209726929665, "rewards//mean": 0.88702392578125, "rewards//std": 0.03036399558186531, "step": 2869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.574, "grad_norm": 1.8351705074310303, "kl": 0.3043543891981244, "learning_rate": 1.958513708790216e-06, "loss": 0.0304, "num_tokens": 18719048.0, "reward": 0.8599853515625, "reward_std": 0.011120544746518135, "rewards//mean": 0.8599853515625, "rewards//std": 0.027403198182582855, "step": 2870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5742, "grad_norm": 1.9185649156570435, "kl": 0.41009137500077486, "learning_rate": 1.956964819748731e-06, "loss": 0.041, "num_tokens": 18725504.0, "reward": 0.85614013671875, "reward_std": 0.013390857726335526, "rewards//mean": 0.85614013671875, "rewards//std": 0.019692473113536835, "step": 2871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5744, "grad_norm": 1.8613545894622803, "kl": 0.36305925622582436, "learning_rate": 1.955416149441752e-06, "loss": 0.0363, "num_tokens": 18731968.0, "reward": 0.87109375, "reward_std": 0.012364154681563377, "rewards//mean": 0.87109375, "rewards//std": 0.020775124430656433, "step": 2872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5746, "grad_norm": 1.247689127922058, "kl": 0.34422713331878185, "learning_rate": 1.953867698493083e-06, "loss": 0.0344, "num_tokens": 18738536.0, "reward": 0.87286376953125, "reward_std": 0.01050717942416668, "rewards//mean": 0.87286376953125, "rewards//std": 0.02345743216574192, "step": 2873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5748, "grad_norm": 1.858486294746399, "kl": 0.5167370177805424, "learning_rate": 1.9523194675264385e-06, "loss": 0.0517, "num_tokens": 18745088.0, "reward": 0.8504638671875, "reward_std": 0.017890438437461853, "rewards//mean": 0.8504638671875, "rewards//std": 0.028514213860034943, "step": 2874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.575, "grad_norm": 2.9529404640197754, "kl": 0.5239101406186819, "learning_rate": 1.950771457165448e-06, "loss": 0.0524, "num_tokens": 18751624.0, "reward": 0.85821533203125, "reward_std": 0.01503017358481884, "rewards//mean": 0.85821533203125, "rewards//std": 0.02901279926300049, "step": 2875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5752, "grad_norm": 1.5768191814422607, "kl": 0.29102776385843754, "learning_rate": 1.9492236680336486e-06, "loss": 0.0291, "num_tokens": 18758216.0, "reward": 0.8563232421875, "reward_std": 0.016212470829486847, "rewards//mean": 0.8563232421875, "rewards//std": 0.0310982558876276, "step": 2876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5754, "grad_norm": 1.684763789176941, "kl": 0.31169069185853004, "learning_rate": 1.9476761007544905e-06, "loss": 0.0312, "num_tokens": 18764720.0, "reward": 0.86285400390625, "reward_std": 0.014366457238793373, "rewards//mean": 0.86285400390625, "rewards//std": 0.03502937778830528, "step": 2877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5756, "grad_norm": 1.5986583232879639, "kl": 0.35907834954559803, "learning_rate": 1.946128755951332e-06, "loss": 0.0359, "num_tokens": 18771248.0, "reward": 0.82940673828125, "reward_std": 0.013002458959817886, "rewards//mean": 0.82940673828125, "rewards//std": 0.019979415461421013, "step": 2878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5758, "grad_norm": 1.870845079421997, "kl": 0.4441049862653017, "learning_rate": 1.944581634247445e-06, "loss": 0.0444, "num_tokens": 18777744.0, "reward": 0.771728515625, "reward_std": 0.012067369185388088, "rewards//mean": 0.771728515625, "rewards//std": 0.03154218569397926, "step": 2879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.576, "grad_norm": 1.9940099716186523, "kl": 0.3504444230347872, "learning_rate": 1.9430347362660087e-06, "loss": 0.035, "num_tokens": 18784240.0, "reward": 0.8392333984375, "reward_std": 0.015166912227869034, "rewards//mean": 0.8392333984375, "rewards//std": 0.019428284838795662, "step": 2880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5762, "grad_norm": 1.6411126852035522, "kl": 0.48319728672504425, "learning_rate": 1.9414880626301147e-06, "loss": 0.0483, "num_tokens": 18790744.0, "reward": 0.8543701171875, "reward_std": 0.012831599451601505, "rewards//mean": 0.8543701171875, "rewards//std": 0.017940809950232506, "step": 2881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5764, "grad_norm": 1.914994478225708, "kl": 0.2802959457039833, "learning_rate": 1.9399416139627617e-06, "loss": 0.028, "num_tokens": 18797280.0, "reward": 0.84051513671875, "reward_std": 0.011445973068475723, "rewards//mean": 0.84051513671875, "rewards//std": 0.024125512689352036, "step": 2882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5766, "grad_norm": 1.664781928062439, "kl": 0.34557780623435974, "learning_rate": 1.9383953908868604e-06, "loss": 0.0346, "num_tokens": 18803816.0, "reward": 0.83636474609375, "reward_std": 0.0103932935744524, "rewards//mean": 0.83636474609375, "rewards//std": 0.02061263844370842, "step": 2883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5768, "grad_norm": 1.6120036840438843, "kl": 0.27963039092719555, "learning_rate": 1.9368493940252277e-06, "loss": 0.028, "num_tokens": 18810400.0, "reward": 0.84222412109375, "reward_std": 0.01391511969268322, "rewards//mean": 0.84222412109375, "rewards//std": 0.02404569275677204, "step": 2884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.577, "grad_norm": 2.536445379257202, "kl": 0.3399261571466923, "learning_rate": 1.935303624000592e-06, "loss": 0.034, "num_tokens": 18817000.0, "reward": 0.84014892578125, "reward_std": 0.010786477476358414, "rewards//mean": 0.84014892578125, "rewards//std": 0.029250789433717728, "step": 2885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5772, "grad_norm": 2.7077622413635254, "kl": 0.34501572512090206, "learning_rate": 1.9337580814355887e-06, "loss": 0.0345, "num_tokens": 18823488.0, "reward": 0.8406982421875, "reward_std": 0.012321336194872856, "rewards//mean": 0.8406982421875, "rewards//std": 0.0270137470215559, "step": 2886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5774, "grad_norm": 2.4065403938293457, "kl": 0.4612739197909832, "learning_rate": 1.9322127669527637e-06, "loss": 0.0461, "num_tokens": 18829976.0, "reward": 0.80999755859375, "reward_std": 0.010902556590735912, "rewards//mean": 0.80999755859375, "rewards//std": 0.020662516355514526, "step": 2887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5776, "grad_norm": 2.601701498031616, "kl": 0.37575820833444595, "learning_rate": 1.930667681174567e-06, "loss": 0.0376, "num_tokens": 18836440.0, "reward": 0.87255859375, "reward_std": 0.016602689400315285, "rewards//mean": 0.87255859375, "rewards//std": 0.024473823606967926, "step": 2888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5778, "grad_norm": 1.5052471160888672, "kl": 0.3349686274304986, "learning_rate": 1.9291228247233607e-06, "loss": 0.0335, "num_tokens": 18843048.0, "reward": 0.82647705078125, "reward_std": 0.009442489594221115, "rewards//mean": 0.82647705078125, "rewards//std": 0.019720127806067467, "step": 2889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.578, "grad_norm": 4.00409460067749, "kl": 0.25457396544516087, "learning_rate": 1.9275781982214126e-06, "loss": 0.0255, "num_tokens": 18849472.0, "reward": 0.85455322265625, "reward_std": 0.014076260849833488, "rewards//mean": 0.85455322265625, "rewards//std": 0.02940305881202221, "step": 2890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5782, "grad_norm": 2.3441288471221924, "kl": 0.35950091294944286, "learning_rate": 1.9260338022908972e-06, "loss": 0.036, "num_tokens": 18855928.0, "reward": 0.86871337890625, "reward_std": 0.011091098189353943, "rewards//mean": 0.86871337890625, "rewards//std": 0.02189466916024685, "step": 2891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5784, "grad_norm": 2.203530788421631, "kl": 0.6375265643000603, "learning_rate": 1.924489637553898e-06, "loss": 0.0638, "num_tokens": 18862536.0, "reward": 0.846923828125, "reward_std": 0.01369117759168148, "rewards//mean": 0.846923828125, "rewards//std": 0.04118651896715164, "step": 2892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5786, "grad_norm": 2.726480007171631, "kl": 0.5136549137532711, "learning_rate": 1.9229457046324037e-06, "loss": 0.0514, "num_tokens": 18869024.0, "reward": 0.8499755859375, "reward_std": 0.017336096614599228, "rewards//mean": 0.8499755859375, "rewards//std": 0.025139881297945976, "step": 2893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5788, "grad_norm": 3.6082680225372314, "kl": 0.6754504404962063, "learning_rate": 1.9214020041483112e-06, "loss": 0.0675, "num_tokens": 18875512.0, "reward": 0.83978271484375, "reward_std": 0.013487354852259159, "rewards//mean": 0.83978271484375, "rewards//std": 0.02600902132689953, "step": 2894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.579, "grad_norm": 3.403373956680298, "kl": 0.36005739122629166, "learning_rate": 1.919858536723421e-06, "loss": 0.036, "num_tokens": 18881976.0, "reward": 0.870849609375, "reward_std": 0.013030588626861572, "rewards//mean": 0.870849609375, "rewards//std": 0.031786952167749405, "step": 2895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5792, "grad_norm": 1.7392476797103882, "kl": 0.48344587348401546, "learning_rate": 1.9183153029794443e-06, "loss": 0.0483, "num_tokens": 18888392.0, "reward": 0.8760986328125, "reward_std": 0.016403626650571823, "rewards//mean": 0.8760986328125, "rewards//std": 0.03294384106993675, "step": 2896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5794, "grad_norm": 1.8865134716033936, "kl": 0.33422818034887314, "learning_rate": 1.916772303537993e-06, "loss": 0.0334, "num_tokens": 18895048.0, "reward": 0.878662109375, "reward_std": 0.010293712839484215, "rewards//mean": 0.878662109375, "rewards//std": 0.02585381455719471, "step": 2897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5796, "grad_norm": 1.8279670476913452, "kl": 0.5770630221813917, "learning_rate": 1.9152295390205884e-06, "loss": 0.0577, "num_tokens": 18901600.0, "reward": 0.84930419921875, "reward_std": 0.016179390251636505, "rewards//mean": 0.84930419921875, "rewards//std": 0.03569164127111435, "step": 2898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5798, "grad_norm": 1.8473116159439087, "kl": 0.5625094641000032, "learning_rate": 1.9136870100486554e-06, "loss": 0.0563, "num_tokens": 18908008.0, "reward": 0.846435546875, "reward_std": 0.014800790697336197, "rewards//mean": 0.846435546875, "rewards//std": 0.025900613516569138, "step": 2899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.58, "grad_norm": 2.076357126235962, "kl": 0.36933496221899986, "learning_rate": 1.912144717243525e-06, "loss": 0.0369, "num_tokens": 18914536.0, "reward": 0.86126708984375, "reward_std": 0.009642798453569412, "rewards//mean": 0.86126708984375, "rewards//std": 0.022358516231179237, "step": 2900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5802, "grad_norm": 1.605246901512146, "kl": 0.4590171417221427, "learning_rate": 1.9106026612264316e-06, "loss": 0.0459, "num_tokens": 18921032.0, "reward": 0.82159423828125, "reward_std": 0.014328929595649242, "rewards//mean": 0.82159423828125, "rewards//std": 0.030097603797912598, "step": 2901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5804, "grad_norm": 1.7777332067489624, "kl": 0.5273631252348423, "learning_rate": 1.9090608426185163e-06, "loss": 0.0527, "num_tokens": 18927480.0, "reward": 0.84893798828125, "reward_std": 0.01609497331082821, "rewards//mean": 0.84893798828125, "rewards//std": 0.02959446795284748, "step": 2902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5806, "grad_norm": 3.268005847930908, "kl": 0.24524891283363104, "learning_rate": 1.907519262040823e-06, "loss": 0.0245, "num_tokens": 18933896.0, "reward": 0.849609375, "reward_std": 0.013459498062729836, "rewards//mean": 0.849609375, "rewards//std": 0.021179234609007835, "step": 2903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5808, "grad_norm": 1.7078856229782104, "kl": 0.2888923157006502, "learning_rate": 1.9059779201142997e-06, "loss": 0.0289, "num_tokens": 18940376.0, "reward": 0.8450927734375, "reward_std": 0.015006184577941895, "rewards//mean": 0.8450927734375, "rewards//std": 0.02540343999862671, "step": 2904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.581, "grad_norm": 1.9891105890274048, "kl": 0.24344342667609453, "learning_rate": 1.9044368174597997e-06, "loss": 0.0243, "num_tokens": 18946984.0, "reward": 0.82159423828125, "reward_std": 0.009586825966835022, "rewards//mean": 0.82159423828125, "rewards//std": 0.015656817704439163, "step": 2905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5812, "grad_norm": 1.665045976638794, "kl": 0.31856018863618374, "learning_rate": 1.9028959546980777e-06, "loss": 0.0319, "num_tokens": 18953544.0, "reward": 0.79779052734375, "reward_std": 0.010324240662157536, "rewards//mean": 0.79779052734375, "rewards//std": 0.028567451983690262, "step": 2906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5814, "grad_norm": 5.530695915222168, "kl": 0.4343784749507904, "learning_rate": 1.901355332449794e-06, "loss": 0.0434, "num_tokens": 18960208.0, "reward": 0.81280517578125, "reward_std": 0.011601315811276436, "rewards//mean": 0.81280517578125, "rewards//std": 0.02480611391365528, "step": 2907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5816, "grad_norm": 4.0258402824401855, "kl": 0.3804450314491987, "learning_rate": 1.8998149513355107e-06, "loss": 0.038, "num_tokens": 18966808.0, "reward": 0.849365234375, "reward_std": 0.009412665851414204, "rewards//mean": 0.849365234375, "rewards//std": 0.03128775954246521, "step": 2908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5818, "grad_norm": 2.1070690155029297, "kl": 0.20760388392955065, "learning_rate": 1.8982748119756938e-06, "loss": 0.0208, "num_tokens": 18973312.0, "reward": 0.8551025390625, "reward_std": 0.01107096392661333, "rewards//mean": 0.8551025390625, "rewards//std": 0.03081856667995453, "step": 2909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.582, "grad_norm": 3.8707735538482666, "kl": 0.46708159521222115, "learning_rate": 1.8967349149907099e-06, "loss": 0.0467, "num_tokens": 18979808.0, "reward": 0.82110595703125, "reward_std": 0.011509295552968979, "rewards//mean": 0.82110595703125, "rewards//std": 0.021363461390137672, "step": 2910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5822, "grad_norm": 2.9444820880889893, "kl": 0.3344595991075039, "learning_rate": 1.895195261000831e-06, "loss": 0.0334, "num_tokens": 18986352.0, "reward": 0.84375, "reward_std": 0.012842820957303047, "rewards//mean": 0.84375, "rewards//std": 0.01582523062825203, "step": 2911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5824, "grad_norm": 2.04758620262146, "kl": 0.4331577531993389, "learning_rate": 1.8936558506262287e-06, "loss": 0.0433, "num_tokens": 18993040.0, "reward": 0.851318359375, "reward_std": 0.012007124722003937, "rewards//mean": 0.851318359375, "rewards//std": 0.040270719677209854, "step": 2912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5826, "grad_norm": 2.725952386856079, "kl": 0.411660872399807, "learning_rate": 1.8921166844869762e-06, "loss": 0.0412, "num_tokens": 18999496.0, "reward": 0.8363037109375, "reward_std": 0.014737111516296864, "rewards//mean": 0.8363037109375, "rewards//std": 0.036209817975759506, "step": 2913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5828, "grad_norm": 1.9523098468780518, "kl": 0.8323369231075048, "learning_rate": 1.8905777632030514e-06, "loss": 0.0832, "num_tokens": 19006064.0, "reward": 0.85516357421875, "reward_std": 0.019743770360946655, "rewards//mean": 0.85516357421875, "rewards//std": 0.03207843378186226, "step": 2914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.583, "grad_norm": 3.6246917247772217, "kl": 0.6383101046085358, "learning_rate": 1.8890390873943299e-06, "loss": 0.0638, "num_tokens": 19012544.0, "reward": 0.81927490234375, "reward_std": 0.013968314975500107, "rewards//mean": 0.81927490234375, "rewards//std": 0.030115202069282532, "step": 2915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5832, "grad_norm": 2.3453900814056396, "kl": 0.588603600859642, "learning_rate": 1.8875006576805915e-06, "loss": 0.0589, "num_tokens": 19019032.0, "reward": 0.87384033203125, "reward_std": 0.010941860266029835, "rewards//mean": 0.87384033203125, "rewards//std": 0.02169533260166645, "step": 2916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5834, "grad_norm": 1.919100046157837, "kl": 0.586994101293385, "learning_rate": 1.885962474681515e-06, "loss": 0.0587, "num_tokens": 19025576.0, "reward": 0.8521728515625, "reward_std": 0.01426271814852953, "rewards//mean": 0.8521728515625, "rewards//std": 0.0212160125374794, "step": 2917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5836, "grad_norm": 2.6457300186157227, "kl": 0.5415442865341902, "learning_rate": 1.8844245390166797e-06, "loss": 0.0542, "num_tokens": 19032152.0, "reward": 0.8077392578125, "reward_std": 0.012018021196126938, "rewards//mean": 0.8077392578125, "rewards//std": 0.020463477820158005, "step": 2918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5838, "grad_norm": 5.104342937469482, "kl": 0.6369865424931049, "learning_rate": 1.882886851305567e-06, "loss": 0.0637, "num_tokens": 19038752.0, "reward": 0.844970703125, "reward_std": 0.013071296736598015, "rewards//mean": 0.844970703125, "rewards//std": 0.016141558066010475, "step": 2919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.584, "grad_norm": 3.8094286918640137, "kl": 0.4771609827876091, "learning_rate": 1.8813494121675562e-06, "loss": 0.0477, "num_tokens": 19045200.0, "reward": 0.80694580078125, "reward_std": 0.01309327594935894, "rewards//mean": 0.80694580078125, "rewards//std": 0.03027212619781494, "step": 2920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5842, "grad_norm": 1.93894362449646, "kl": 0.36882166750729084, "learning_rate": 1.8798122222219288e-06, "loss": 0.0369, "num_tokens": 19051664.0, "reward": 0.832275390625, "reward_std": 0.015044484287500381, "rewards//mean": 0.832275390625, "rewards//std": 0.021963784471154213, "step": 2921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5844, "grad_norm": 2.022705554962158, "kl": 0.2895257552154362, "learning_rate": 1.8782752820878636e-06, "loss": 0.029, "num_tokens": 19058160.0, "reward": 0.8724365234375, "reward_std": 0.014696146361529827, "rewards//mean": 0.8724365234375, "rewards//std": 0.02323218248784542, "step": 2922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5846, "grad_norm": 3.668586254119873, "kl": 0.40954902209341526, "learning_rate": 1.876738592384441e-06, "loss": 0.041, "num_tokens": 19064640.0, "reward": 0.87860107421875, "reward_std": 0.015478519722819328, "rewards//mean": 0.87860107421875, "rewards//std": 0.028572220355272293, "step": 2923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5848, "grad_norm": 1.4564876556396484, "kl": 0.30120954662561417, "learning_rate": 1.875202153730638e-06, "loss": 0.0301, "num_tokens": 19071120.0, "reward": 0.85009765625, "reward_std": 0.01520561520010233, "rewards//mean": 0.85009765625, "rewards//std": 0.04548247903585434, "step": 2924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.585, "grad_norm": 3.151355504989624, "kl": 0.4334563836455345, "learning_rate": 1.8736659667453339e-06, "loss": 0.0433, "num_tokens": 19077656.0, "reward": 0.8330078125, "reward_std": 0.018301602452993393, "rewards//mean": 0.8330078125, "rewards//std": 0.03118016943335533, "step": 2925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5852, "grad_norm": 2.1281955242156982, "kl": 0.34791621193289757, "learning_rate": 1.8721300320473023e-06, "loss": 0.0348, "num_tokens": 19084192.0, "reward": 0.8489990234375, "reward_std": 0.01591341197490692, "rewards//mean": 0.8489990234375, "rewards//std": 0.03525750711560249, "step": 2926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5854, "grad_norm": 2.7004525661468506, "kl": 0.5954413246363401, "learning_rate": 1.87059435025522e-06, "loss": 0.0595, "num_tokens": 19090728.0, "reward": 0.81292724609375, "reward_std": 0.013163812458515167, "rewards//mean": 0.81292724609375, "rewards//std": 0.019551293924450874, "step": 2927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5856, "grad_norm": 1.803842544555664, "kl": 0.4211144894361496, "learning_rate": 1.8690589219876571e-06, "loss": 0.0421, "num_tokens": 19097328.0, "reward": 0.82476806640625, "reward_std": 0.010989975184202194, "rewards//mean": 0.82476806640625, "rewards//std": 0.018333397805690765, "step": 2928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5858, "grad_norm": 1.7488584518432617, "kl": 0.2855754643678665, "learning_rate": 1.8675237478630853e-06, "loss": 0.0286, "num_tokens": 19103832.0, "reward": 0.81549072265625, "reward_std": 0.015468508005142212, "rewards//mean": 0.81549072265625, "rewards//std": 0.03577255830168724, "step": 2929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.586, "grad_norm": 1.5411194562911987, "kl": 0.5870329085737467, "learning_rate": 1.865988828499872e-06, "loss": 0.0587, "num_tokens": 19110336.0, "reward": 0.86431884765625, "reward_std": 0.01175253838300705, "rewards//mean": 0.86431884765625, "rewards//std": 0.03257991746068001, "step": 2930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5862, "grad_norm": 2.1147918701171875, "kl": 0.4635418001562357, "learning_rate": 1.8644541645162834e-06, "loss": 0.0464, "num_tokens": 19116800.0, "reward": 0.861328125, "reward_std": 0.013438438065350056, "rewards//mean": 0.861328125, "rewards//std": 0.02592981979250908, "step": 2931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5864, "grad_norm": 3.554961919784546, "kl": 0.41283707320690155, "learning_rate": 1.8629197565304805e-06, "loss": 0.0413, "num_tokens": 19123304.0, "reward": 0.7777099609375, "reward_std": 0.008295231498777866, "rewards//mean": 0.7777099609375, "rewards//std": 0.01769956201314926, "step": 2932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5866, "grad_norm": 1.7157822847366333, "kl": 0.37759754806756973, "learning_rate": 1.8613856051605242e-06, "loss": 0.0378, "num_tokens": 19129872.0, "reward": 0.82293701171875, "reward_std": 0.011888885870575905, "rewards//mean": 0.82293701171875, "rewards//std": 0.026983119547367096, "step": 2933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5868, "grad_norm": 3.6489529609680176, "kl": 0.39295760076493025, "learning_rate": 1.8598517110243696e-06, "loss": 0.0393, "num_tokens": 19136336.0, "reward": 0.80462646484375, "reward_std": 0.011913858354091644, "rewards//mean": 0.80462646484375, "rewards//std": 0.02499275468289852, "step": 2934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.587, "grad_norm": 1.943169355392456, "kl": 0.45668802969157696, "learning_rate": 1.8583180747398689e-06, "loss": 0.0457, "num_tokens": 19142840.0, "reward": 0.84722900390625, "reward_std": 0.014190929010510445, "rewards//mean": 0.84722900390625, "rewards//std": 0.028477227315306664, "step": 2935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5872, "grad_norm": 1.8945813179016113, "kl": 0.5546719655394554, "learning_rate": 1.8567846969247716e-06, "loss": 0.0555, "num_tokens": 19149328.0, "reward": 0.85980224609375, "reward_std": 0.010419157333672047, "rewards//mean": 0.85980224609375, "rewards//std": 0.017963049933314323, "step": 2936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5874, "grad_norm": 3.3148837089538574, "kl": 0.3710269667208195, "learning_rate": 1.8552515781967206e-06, "loss": 0.0371, "num_tokens": 19155848.0, "reward": 0.8251953125, "reward_std": 0.011846828274428844, "rewards//mean": 0.8251953125, "rewards//std": 0.02350972779095173, "step": 2937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5876, "grad_norm": 2.0110244750976562, "kl": 0.5225689690560102, "learning_rate": 1.8537187191732576e-06, "loss": 0.0523, "num_tokens": 19162536.0, "reward": 0.8687744140625, "reward_std": 0.01609228551387787, "rewards//mean": 0.8687744140625, "rewards//std": 0.02761012129485607, "step": 2938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5878, "grad_norm": 1.8505017757415771, "kl": 0.37448460049927235, "learning_rate": 1.8521861204718167e-06, "loss": 0.0374, "num_tokens": 19169008.0, "reward": 0.8463134765625, "reward_std": 0.012522203847765923, "rewards//mean": 0.8463134765625, "rewards//std": 0.02438179962337017, "step": 2939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.588, "grad_norm": 1.6708961725234985, "kl": 0.28730368986725807, "learning_rate": 1.8506537827097293e-06, "loss": 0.0287, "num_tokens": 19175544.0, "reward": 0.86199951171875, "reward_std": 0.010719786398112774, "rewards//mean": 0.86199951171875, "rewards//std": 0.03221217542886734, "step": 2940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5882, "grad_norm": 1.89682936668396, "kl": 0.48323126696050167, "learning_rate": 1.84912170650422e-06, "loss": 0.0483, "num_tokens": 19181960.0, "reward": 0.864990234375, "reward_std": 0.011373411864042282, "rewards//mean": 0.864990234375, "rewards//std": 0.024713872000575066, "step": 2941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5884, "grad_norm": 2.7225615978240967, "kl": 0.27006442844867706, "learning_rate": 1.8475898924724103e-06, "loss": 0.027, "num_tokens": 19188488.0, "reward": 0.8359375, "reward_std": 0.012841040268540382, "rewards//mean": 0.8359375, "rewards//std": 0.022830262780189514, "step": 2942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5886, "grad_norm": 1.84710693359375, "kl": 0.3757104789838195, "learning_rate": 1.8460583412313132e-06, "loss": 0.0376, "num_tokens": 19195104.0, "reward": 0.83074951171875, "reward_std": 0.011004715226590633, "rewards//mean": 0.83074951171875, "rewards//std": 0.026190560311079025, "step": 2943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5888, "grad_norm": 3.124927282333374, "kl": 0.38298711739480495, "learning_rate": 1.8445270533978387e-06, "loss": 0.0383, "num_tokens": 19201600.0, "reward": 0.8067626953125, "reward_std": 0.011041136458516121, "rewards//mean": 0.8067626953125, "rewards//std": 0.016528455540537834, "step": 2944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.589, "grad_norm": 2.168368339538574, "kl": 0.36029950343072414, "learning_rate": 1.8429960295887881e-06, "loss": 0.036, "num_tokens": 19208152.0, "reward": 0.8231201171875, "reward_std": 0.01638726145029068, "rewards//mean": 0.8231201171875, "rewards//std": 0.03589237481355667, "step": 2945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5892, "grad_norm": 2.3935370445251465, "kl": 0.3250211011618376, "learning_rate": 1.8414652704208586e-06, "loss": 0.0325, "num_tokens": 19214720.0, "reward": 0.82574462890625, "reward_std": 0.010132629424333572, "rewards//mean": 0.82574462890625, "rewards//std": 0.02607528679072857, "step": 2946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5894, "grad_norm": 2.901034116744995, "kl": 0.4114565532654524, "learning_rate": 1.839934776510639e-06, "loss": 0.0411, "num_tokens": 19221112.0, "reward": 0.8707275390625, "reward_std": 0.0132225276902318, "rewards//mean": 0.8707275390625, "rewards//std": 0.01681537553668022, "step": 2947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5896, "grad_norm": 2.061276435852051, "kl": 0.31466901674866676, "learning_rate": 1.8384045484746133e-06, "loss": 0.0315, "num_tokens": 19227592.0, "reward": 0.84820556640625, "reward_std": 0.010947022587060928, "rewards//mean": 0.84820556640625, "rewards//std": 0.017131425440311432, "step": 2948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5898, "grad_norm": 2.029113292694092, "kl": 0.25304450187832117, "learning_rate": 1.836874586929156e-06, "loss": 0.0253, "num_tokens": 19234112.0, "reward": 0.85565185546875, "reward_std": 0.010975787416100502, "rewards//mean": 0.85565185546875, "rewards//std": 0.01898963563144207, "step": 2949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.59, "grad_norm": 2.0001232624053955, "kl": 0.40892164409160614, "learning_rate": 1.8353448924905354e-06, "loss": 0.0409, "num_tokens": 19240640.0, "reward": 0.852294921875, "reward_std": 0.011643966659903526, "rewards//mean": 0.852294921875, "rewards//std": 0.026828469708561897, "step": 2950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5902, "grad_norm": 2.0536110401153564, "kl": 0.35986788012087345, "learning_rate": 1.833815465774913e-06, "loss": 0.036, "num_tokens": 19247112.0, "reward": 0.84423828125, "reward_std": 0.011790689080953598, "rewards//mean": 0.84423828125, "rewards//std": 0.01641864702105522, "step": 2951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5904, "grad_norm": 1.9136900901794434, "kl": 0.47314727306365967, "learning_rate": 1.832286307398341e-06, "loss": 0.0473, "num_tokens": 19253616.0, "reward": 0.84686279296875, "reward_std": 0.01815611682832241, "rewards//mean": 0.84686279296875, "rewards//std": 0.036744069308042526, "step": 2952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5906, "grad_norm": 2.132089376449585, "kl": 0.6060861833393574, "learning_rate": 1.830757417976765e-06, "loss": 0.0606, "num_tokens": 19260192.0, "reward": 0.8369140625, "reward_std": 0.01573074609041214, "rewards//mean": 0.8369140625, "rewards//std": 0.022531256079673767, "step": 2953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5908, "grad_norm": 1.961999773979187, "kl": 0.39367105066776276, "learning_rate": 1.8292287981260204e-06, "loss": 0.0394, "num_tokens": 19266664.0, "reward": 0.8505859375, "reward_std": 0.015850882977247238, "rewards//mean": 0.8505859375, "rewards//std": 0.027761302888393402, "step": 2954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.591, "grad_norm": 2.556004047393799, "kl": 0.3196861380711198, "learning_rate": 1.827700448461836e-06, "loss": 0.032, "num_tokens": 19273224.0, "reward": 0.88262939453125, "reward_std": 0.010574322193861008, "rewards//mean": 0.88262939453125, "rewards//std": 0.02656131237745285, "step": 2955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5912, "grad_norm": 1.725843906402588, "kl": 0.42815123219043016, "learning_rate": 1.8261723695998306e-06, "loss": 0.0428, "num_tokens": 19279736.0, "reward": 0.8631591796875, "reward_std": 0.01628986746072769, "rewards//mean": 0.8631591796875, "rewards//std": 0.025477223098278046, "step": 2956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5914, "grad_norm": 2.64385986328125, "kl": 0.37322137504816055, "learning_rate": 1.8246445621555141e-06, "loss": 0.0373, "num_tokens": 19286216.0, "reward": 0.828125, "reward_std": 0.013540945947170258, "rewards//mean": 0.828125, "rewards//std": 0.017427638173103333, "step": 2957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5916, "grad_norm": 1.9067411422729492, "kl": 0.5179773960262537, "learning_rate": 1.823117026744287e-06, "loss": 0.0518, "num_tokens": 19292776.0, "reward": 0.86419677734375, "reward_std": 0.014006253331899643, "rewards//mean": 0.86419677734375, "rewards//std": 0.02847403660416603, "step": 2958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5918, "grad_norm": 2.8062093257904053, "kl": 0.3575042197480798, "learning_rate": 1.821589763981441e-06, "loss": 0.0358, "num_tokens": 19299328.0, "reward": 0.8511962890625, "reward_std": 0.012065602466464043, "rewards//mean": 0.8511962890625, "rewards//std": 0.022837886586785316, "step": 2959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.592, "grad_norm": 1.936678409576416, "kl": 0.3161815423518419, "learning_rate": 1.8200627744821564e-06, "loss": 0.0316, "num_tokens": 19305872.0, "reward": 0.855712890625, "reward_std": 0.01326943188905716, "rewards//mean": 0.855712890625, "rewards//std": 0.03324686363339424, "step": 2960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5922, "grad_norm": 1.9039298295974731, "kl": 0.3841035831719637, "learning_rate": 1.818536058861506e-06, "loss": 0.0384, "num_tokens": 19312312.0, "reward": 0.77313232421875, "reward_std": 0.012860340066254139, "rewards//mean": 0.77313232421875, "rewards//std": 0.01769389398396015, "step": 2961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5924, "grad_norm": 2.038519859313965, "kl": 0.34489806089550257, "learning_rate": 1.8170096177344483e-06, "loss": 0.0345, "num_tokens": 19318776.0, "reward": 0.88165283203125, "reward_std": 0.010428138077259064, "rewards//mean": 0.88165283203125, "rewards//std": 0.014121619053184986, "step": 2962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5926, "grad_norm": 1.600033164024353, "kl": 0.34466217644512653, "learning_rate": 1.8154834517158356e-06, "loss": 0.0345, "num_tokens": 19325232.0, "reward": 0.869140625, "reward_std": 0.011422245763242245, "rewards//mean": 0.869140625, "rewards//std": 0.018881812691688538, "step": 2963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5928, "grad_norm": 1.5880694389343262, "kl": 0.4274451844394207, "learning_rate": 1.8139575614204064e-06, "loss": 0.0427, "num_tokens": 19331776.0, "reward": 0.83740234375, "reward_std": 0.014891304075717926, "rewards//mean": 0.83740234375, "rewards//std": 0.025205127894878387, "step": 2964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.593, "grad_norm": 1.8936426639556885, "kl": 0.35823851451277733, "learning_rate": 1.8124319474627897e-06, "loss": 0.0358, "num_tokens": 19338264.0, "reward": 0.84112548828125, "reward_std": 0.01425710879266262, "rewards//mean": 0.84112548828125, "rewards//std": 0.02419067919254303, "step": 2965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5932, "grad_norm": 2.3296971321105957, "kl": 0.2774760676547885, "learning_rate": 1.8109066104575023e-06, "loss": 0.0277, "num_tokens": 19344696.0, "reward": 0.845947265625, "reward_std": 0.00974242202937603, "rewards//mean": 0.845947265625, "rewards//std": 0.022018853574991226, "step": 2966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5934, "grad_norm": 1.8140983581542969, "kl": 0.3947795508429408, "learning_rate": 1.8093815510189495e-06, "loss": 0.0395, "num_tokens": 19351216.0, "reward": 0.83648681640625, "reward_std": 0.011932332068681717, "rewards//mean": 0.83648681640625, "rewards//std": 0.02106303721666336, "step": 2967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5936, "grad_norm": 2.910041332244873, "kl": 0.572484927251935, "learning_rate": 1.8078567697614257e-06, "loss": 0.0572, "num_tokens": 19357768.0, "reward": 0.83978271484375, "reward_std": 0.015714092180132866, "rewards//mean": 0.83978271484375, "rewards//std": 0.025050833821296692, "step": 2968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5938, "grad_norm": 2.2959816455841064, "kl": 0.3941840725019574, "learning_rate": 1.8063322672991121e-06, "loss": 0.0394, "num_tokens": 19364272.0, "reward": 0.822998046875, "reward_std": 0.010190421715378761, "rewards//mean": 0.822998046875, "rewards//std": 0.019696412608027458, "step": 2969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.594, "grad_norm": 2.4106850624084473, "kl": 0.527946725487709, "learning_rate": 1.8048080442460786e-06, "loss": 0.0528, "num_tokens": 19370784.0, "reward": 0.868408203125, "reward_std": 0.020412320271134377, "rewards//mean": 0.868408203125, "rewards//std": 0.02546682395040989, "step": 2970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5942, "grad_norm": 3.031607151031494, "kl": 0.3738797642290592, "learning_rate": 1.803284101216281e-06, "loss": 0.0374, "num_tokens": 19377280.0, "reward": 0.87213134765625, "reward_std": 0.014429919421672821, "rewards//mean": 0.87213134765625, "rewards//std": 0.024011673405766487, "step": 2971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5944, "grad_norm": 2.0258214473724365, "kl": 0.2537171486765146, "learning_rate": 1.801760438823565e-06, "loss": 0.0254, "num_tokens": 19383800.0, "reward": 0.81158447265625, "reward_std": 0.010455965064466, "rewards//mean": 0.81158447265625, "rewards//std": 0.0171861220151186, "step": 2972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5946, "grad_norm": 2.939030408859253, "kl": 0.3227442540228367, "learning_rate": 1.8002370576816597e-06, "loss": 0.0323, "num_tokens": 19390264.0, "reward": 0.84307861328125, "reward_std": 0.010508473962545395, "rewards//mean": 0.84307861328125, "rewards//std": 0.021413005888462067, "step": 2973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5948, "grad_norm": 2.8408429622650146, "kl": 0.4557964652776718, "learning_rate": 1.798713958404185e-06, "loss": 0.0456, "num_tokens": 19396864.0, "reward": 0.81640625, "reward_std": 0.013365349732339382, "rewards//mean": 0.81640625, "rewards//std": 0.018466776236891747, "step": 2974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.595, "grad_norm": 1.3785338401794434, "kl": 0.35648508556187153, "learning_rate": 1.7971911416046431e-06, "loss": 0.0356, "num_tokens": 19403408.0, "reward": 0.8548583984375, "reward_std": 0.013763458468019962, "rewards//mean": 0.8548583984375, "rewards//std": 0.028891824185848236, "step": 2975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5952, "grad_norm": 2.3953983783721924, "kl": 0.29510789923369884, "learning_rate": 1.7956686078964257e-06, "loss": 0.0295, "num_tokens": 19409992.0, "reward": 0.8463134765625, "reward_std": 0.011698022484779358, "rewards//mean": 0.8463134765625, "rewards//std": 0.020424973219633102, "step": 2976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5954, "grad_norm": 1.7171069383621216, "kl": 0.3687427267432213, "learning_rate": 1.7941463578928088e-06, "loss": 0.0369, "num_tokens": 19416528.0, "reward": 0.8651123046875, "reward_std": 0.008347579278051853, "rewards//mean": 0.8651123046875, "rewards//std": 0.02121315710246563, "step": 2977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5956, "grad_norm": 1.6158063411712646, "kl": 0.3643795056268573, "learning_rate": 1.7926243922069547e-06, "loss": 0.0364, "num_tokens": 19423000.0, "reward": 0.85235595703125, "reward_std": 0.012056034058332443, "rewards//mean": 0.85235595703125, "rewards//std": 0.02765798568725586, "step": 2978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5958, "grad_norm": 2.9416821002960205, "kl": 0.26001967769116163, "learning_rate": 1.7911027114519104e-06, "loss": 0.026, "num_tokens": 19429608.0, "reward": 0.82855224609375, "reward_std": 0.009236112236976624, "rewards//mean": 0.82855224609375, "rewards//std": 0.024390723556280136, "step": 2979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.596, "grad_norm": 2.0782270431518555, "kl": 0.3594694323837757, "learning_rate": 1.7895813162406094e-06, "loss": 0.0359, "num_tokens": 19436056.0, "reward": 0.83612060546875, "reward_std": 0.011925783008337021, "rewards//mean": 0.83612060546875, "rewards//std": 0.03173017129302025, "step": 2980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5962, "grad_norm": 3.0604658126831055, "kl": 0.18584522604942322, "learning_rate": 1.7880602071858694e-06, "loss": 0.0186, "num_tokens": 19442528.0, "reward": 0.8719482421875, "reward_std": 0.012059607543051243, "rewards//mean": 0.8719482421875, "rewards//std": 0.026716234162449837, "step": 2981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5964, "grad_norm": 1.9830005168914795, "kl": 0.26687466353178024, "learning_rate": 1.7865393849003925e-06, "loss": 0.0267, "num_tokens": 19448960.0, "reward": 0.855224609375, "reward_std": 0.016560740768909454, "rewards//mean": 0.855224609375, "rewards//std": 0.029379431158304214, "step": 2982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5966, "grad_norm": 2.8475728034973145, "kl": 0.23591611441224813, "learning_rate": 1.785018849996767e-06, "loss": 0.0236, "num_tokens": 19455512.0, "reward": 0.84844970703125, "reward_std": 0.0101094925776124, "rewards//mean": 0.84844970703125, "rewards//std": 0.021453969180583954, "step": 2983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5968, "grad_norm": 1.9590389728546143, "kl": 0.4225652925670147, "learning_rate": 1.7834986030874617e-06, "loss": 0.0423, "num_tokens": 19462088.0, "reward": 0.84869384765625, "reward_std": 0.017152247950434685, "rewards//mean": 0.84869384765625, "rewards//std": 0.02372756041586399, "step": 2984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.597, "grad_norm": 3.023703098297119, "kl": 0.2317569088190794, "learning_rate": 1.7819786447848346e-06, "loss": 0.0232, "num_tokens": 19468648.0, "reward": 0.81378173828125, "reward_std": 0.012868855148553848, "rewards//mean": 0.81378173828125, "rewards//std": 0.022009827196598053, "step": 2985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5972, "grad_norm": 1.6550133228302002, "kl": 0.2738159978762269, "learning_rate": 1.7804589757011225e-06, "loss": 0.0274, "num_tokens": 19475136.0, "reward": 0.82391357421875, "reward_std": 0.00981660932302475, "rewards//mean": 0.82391357421875, "rewards//std": 0.020544227212667465, "step": 2986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5974, "grad_norm": 1.724603295326233, "kl": 0.346660939976573, "learning_rate": 1.7789395964484498e-06, "loss": 0.0347, "num_tokens": 19481904.0, "reward": 0.862060546875, "reward_std": 0.010921983048319817, "rewards//mean": 0.862060546875, "rewards//std": 0.017536740750074387, "step": 2987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5976, "grad_norm": 1.4627597332000732, "kl": 0.2664725612848997, "learning_rate": 1.7774205076388207e-06, "loss": 0.0266, "num_tokens": 19488352.0, "reward": 0.812744140625, "reward_std": 0.013089988380670547, "rewards//mean": 0.812744140625, "rewards//std": 0.022215967997908592, "step": 2988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5978, "grad_norm": 1.8169364929199219, "kl": 0.35024567507207394, "learning_rate": 1.7759017098841255e-06, "loss": 0.035, "num_tokens": 19494824.0, "reward": 0.8546142578125, "reward_std": 0.013189675286412239, "rewards//mean": 0.8546142578125, "rewards//std": 0.0312613807618618, "step": 2989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.598, "grad_norm": 1.7234477996826172, "kl": 0.236354673281312, "learning_rate": 1.7743832037961346e-06, "loss": 0.0236, "num_tokens": 19501360.0, "reward": 0.816650390625, "reward_std": 0.011838211677968502, "rewards//mean": 0.816650390625, "rewards//std": 0.017851572483778, "step": 2990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5982, "grad_norm": 2.8421287536621094, "kl": 0.3502425644546747, "learning_rate": 1.7728649899865024e-06, "loss": 0.035, "num_tokens": 19507840.0, "reward": 0.8359375, "reward_std": 0.015302229672670364, "rewards//mean": 0.8359375, "rewards//std": 0.027647653594613075, "step": 2991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5984, "grad_norm": 1.928076982498169, "kl": 0.22978410497307777, "learning_rate": 1.771347069066765e-06, "loss": 0.023, "num_tokens": 19514360.0, "reward": 0.82940673828125, "reward_std": 0.010801446624100208, "rewards//mean": 0.82940673828125, "rewards//std": 0.029635362327098846, "step": 2992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5986, "grad_norm": 1.9289555549621582, "kl": 0.2580075915902853, "learning_rate": 1.7698294416483414e-06, "loss": 0.0258, "num_tokens": 19520896.0, "reward": 0.840087890625, "reward_std": 0.010981438681483269, "rewards//mean": 0.840087890625, "rewards//std": 0.022259533405303955, "step": 2993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5988, "grad_norm": 1.623026967048645, "kl": 0.28232607804238796, "learning_rate": 1.7683121083425312e-06, "loss": 0.0282, "num_tokens": 19527384.0, "reward": 0.87274169921875, "reward_std": 0.009611171670258045, "rewards//mean": 0.87274169921875, "rewards//std": 0.01882792077958584, "step": 2994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.599, "grad_norm": 1.8799617290496826, "kl": 0.4315887689590454, "learning_rate": 1.7667950697605166e-06, "loss": 0.0432, "num_tokens": 19533936.0, "reward": 0.82977294921875, "reward_std": 0.01318407617509365, "rewards//mean": 0.82977294921875, "rewards//std": 0.024614982306957245, "step": 2995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5992, "grad_norm": 1.6462132930755615, "kl": 0.1851809537038207, "learning_rate": 1.7652783265133608e-06, "loss": 0.0185, "num_tokens": 19540472.0, "reward": 0.8548583984375, "reward_std": 0.01404733769595623, "rewards//mean": 0.8548583984375, "rewards//std": 0.02883308380842209, "step": 2996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5994, "grad_norm": 2.0227701663970947, "kl": 0.4222105238586664, "learning_rate": 1.7637618792120065e-06, "loss": 0.0422, "num_tokens": 19547104.0, "reward": 0.80584716796875, "reward_std": 0.01145744975656271, "rewards//mean": 0.80584716796875, "rewards//std": 0.019145239144563675, "step": 2997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5996, "grad_norm": 1.9756724834442139, "kl": 0.5309354644268751, "learning_rate": 1.762245728467279e-06, "loss": 0.0531, "num_tokens": 19553616.0, "reward": 0.86529541015625, "reward_std": 0.017458435148000717, "rewards//mean": 0.86529541015625, "rewards//std": 0.03790441155433655, "step": 2998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.5998, "grad_norm": 1.8834006786346436, "kl": 0.306193582713604, "learning_rate": 1.7607298748898844e-06, "loss": 0.0306, "num_tokens": 19560184.0, "reward": 0.82440185546875, "reward_std": 0.008078502491116524, "rewards//mean": 0.82440185546875, "rewards//std": 0.01578584313392639, "step": 2999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.0, "epoch": 0.6, "grad_norm": 1.7700424194335938, "kl": 0.3641325943171978, "learning_rate": 1.7592143190904068e-06, "loss": 0.0364, "num_tokens": 19566608.0, "reward": 0.83099365234375, "reward_std": 0.012255895882844925, "rewards//mean": 0.83099365234375, "rewards//std": 0.023798907175660133, "step": 3000 } ], "logging_steps": 1, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }