{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9999146539216524,
  "eval_steps": 500,
  "global_step": 2929,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 1750.845703125,
      "completions/mean_terminated_length": 1302.2010498046875,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "epoch": 0.0003413843133907997,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 0.05033346607156781,
      "learning_rate": 0.0,
      "loss": 0.0074,
      "num_tokens": 975041.0,
      "reward": 0.072265625,
      "reward_std": 0.1443791538476944,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 1
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1776.15625,
      "completions/mean_terminated_length": 1248.0919189453125,
      "completions/min_length": 210.0,
      "completions/min_terminated_length": 210.0,
      "epoch": 0.0006827686267815994,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.049110022117787845,
      "learning_rate": 3.4129692832764506e-09,
      "loss": 0.0006,
      "num_tokens": 1957665.0,
      "reward": 0.033203125,
      "reward_std": 0.0756097063422203,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 2
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.59765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1758.369140625,
      "completions/mean_terminated_length": 1328.1407470703125,
      "completions/min_length": 215.0,
      "completions/min_terminated_length": 215.0,
      "epoch": 0.001024152940172399,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.04241046785362354,
      "learning_rate": 6.825938566552901e-09,
      "loss": 0.0081,
      "num_tokens": 2939822.0,
      "reward": 0.080078125,
      "reward_std": 0.1211543008685112,
      "rewards/accuracy_reward/mean": 0.080078125,
      "rewards/accuracy_reward/std": 0.271679550409317,
      "step": 3
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 1809.0078125,
      "completions/mean_terminated_length": 1273.5443115234375,
      "completions/min_length": 51.0,
      "completions/min_terminated_length": 51.0,
      "epoch": 0.0013655372535631989,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 0.04091706716817529,
      "learning_rate": 1.023890784982935e-08,
      "loss": 0.0069,
      "num_tokens": 3946434.0,
      "reward": 0.041015625,
      "reward_std": 0.07328042387962341,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 4
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.587890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1707.787109375,
      "completions/mean_terminated_length": 1222.459716796875,
      "completions/min_length": 347.0,
      "completions/min_terminated_length": 347.0,
      "epoch": 0.0017069215669539984,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.05690206911194408,
      "learning_rate": 1.3651877133105802e-08,
      "loss": 0.022,
      "num_tokens": 4902325.0,
      "reward": 0.109375,
      "reward_std": 0.1755613535642624,
      "rewards/accuracy_reward/mean": 0.11290322244167328,
      "rewards/accuracy_reward/std": 0.3167939782142639,
      "step": 5
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1717.732421875,
      "completions/mean_terminated_length": 1219.3236083984375,
      "completions/min_length": 42.0,
      "completions/min_terminated_length": 42.0,
      "epoch": 0.002048305880344798,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.05573824427783701,
      "learning_rate": 1.706484641638225e-08,
      "loss": 0.0135,
      "num_tokens": 5869084.0,
      "reward": 0.041015625,
      "reward_std": 0.09644745290279388,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 6
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.642578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2035.0,
      "completions/mean_length": 1754.251953125,
      "completions/mean_terminated_length": 1226.1474609375,
      "completions/min_length": 88.0,
      "completions/min_terminated_length": 88.0,
      "epoch": 0.0023896901937355977,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 0.03002513688396799,
      "learning_rate": 2.04778156996587e-08,
      "loss": 0.0121,
      "num_tokens": 6839645.0,
      "reward": 0.044921875,
      "reward_std": 0.060957908630371094,
      "rewards/accuracy_reward/mean": 0.0463709682226181,
      "rewards/accuracy_reward/std": 0.21049949526786804,
      "step": 7
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.599609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1797.53125,
      "completions/mean_terminated_length": 1422.43896484375,
      "completions/min_length": 300.0,
      "completions/min_terminated_length": 300.0,
      "epoch": 0.0027310745071263977,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.060326288268162755,
      "learning_rate": 2.3890784982935154e-08,
      "loss": 0.0274,
      "num_tokens": 7836685.0,
      "reward": 0.123046875,
      "reward_std": 0.18169808387756348,
      "rewards/accuracy_reward/mean": 0.123046875,
      "rewards/accuracy_reward/std": 0.32881227135658264,
      "step": 8
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.595703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1707.265625,
      "completions/mean_terminated_length": 1205.2174072265625,
      "completions/min_length": 219.0,
      "completions/min_terminated_length": 219.0,
      "epoch": 0.0030724588205171973,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.050131841466508904,
      "learning_rate": 2.7303754266211605e-08,
      "loss": 0.026,
      "num_tokens": 8784597.0,
      "reward": 0.048828125,
      "reward_std": 0.10684061795473099,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 9
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.63671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2031.0,
      "completions/mean_length": 1774.05859375,
      "completions/mean_terminated_length": 1293.9246826171875,
      "completions/min_length": 376.0,
      "completions/min_terminated_length": 376.0,
      "epoch": 0.003413843133907997,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.041968355334221295,
      "learning_rate": 3.071672354948805e-08,
      "loss": 0.0199,
      "num_tokens": 9764851.0,
      "reward": 0.0390625,
      "reward_std": 0.0969453901052475,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 10
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.591796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1727.42578125,
      "completions/mean_terminated_length": 1262.6697998046875,
      "completions/min_length": 241.0,
      "completions/min_terminated_length": 241.0,
      "epoch": 0.003755227447298797,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.04477430838067097,
      "learning_rate": 3.41296928327645e-08,
      "loss": 0.0079,
      "num_tokens": 10724877.0,
      "reward": 0.05859375,
      "reward_std": 0.09534769505262375,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 11
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.59375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1760.837890625,
      "completions/mean_terminated_length": 1341.1395263671875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.004096611760689596,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.04142656941993241,
      "learning_rate": 3.754266211604096e-08,
      "loss": 0.0103,
      "num_tokens": 11697802.0,
      "reward": 0.07421875,
      "reward_std": 0.09876786172389984,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "step": 12
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.64453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2023.0,
      "completions/mean_length": 1716.64453125,
      "completions/mean_terminated_length": 1115.835205078125,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "epoch": 0.004437996074080396,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.043864214373648215,
      "learning_rate": 4.09556313993174e-08,
      "loss": 0.0081,
      "num_tokens": 12652532.0,
      "reward": 0.08203125,
      "reward_std": 0.1194583922624588,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 13
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.595703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1758.34375,
      "completions/mean_terminated_length": 1331.5555419921875,
      "completions/min_length": 192.0,
      "completions/min_terminated_length": 192.0,
      "epoch": 0.0047793803874711955,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 0.03417215435914826,
      "learning_rate": 4.436860068259386e-08,
      "loss": 0.0078,
      "num_tokens": 13630068.0,
      "reward": 0.05859375,
      "reward_std": 0.06805649399757385,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 14
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2008.0,
      "completions/mean_length": 1646.5703125,
      "completions/mean_terminated_length": 1130.446533203125,
      "completions/min_length": 46.0,
      "completions/min_terminated_length": 46.0,
      "epoch": 0.005120764700861995,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.04510596346055707,
      "learning_rate": 4.778156996587031e-08,
      "loss": 0.0198,
      "num_tokens": 14550264.0,
      "reward": 0.0859375,
      "reward_std": 0.12361770868301392,
      "rewards/accuracy_reward/mean": 0.0859375,
      "rewards/accuracy_reward/std": 0.28054583072662354,
      "step": 15
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.638671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 1752.3125,
      "completions/mean_terminated_length": 1229.6649169921875,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "epoch": 0.0054621490142527955,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.054358656074527854,
      "learning_rate": 5.119453924914675e-08,
      "loss": 0.023,
      "num_tokens": 15522616.0,
      "reward": 0.119140625,
      "reward_std": 0.16773086786270142,
      "rewards/accuracy_reward/mean": 0.119140625,
      "rewards/accuracy_reward/std": 0.32427072525024414,
      "step": 16
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.62890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1725.244140625,
      "completions/mean_terminated_length": 1178.2579345703125,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "epoch": 0.005803533327643595,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.04532171535738384,
      "learning_rate": 5.460750853242321e-08,
      "loss": 0.0112,
      "num_tokens": 16481669.0,
      "reward": 0.06640625,
      "reward_std": 0.09589291363954544,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "step": 17
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.673828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1799.037109375,
      "completions/mean_terminated_length": 1284.712646484375,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.006144917641034395,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.04252846636231565,
      "learning_rate": 5.802047781569966e-08,
      "loss": 0.0188,
      "num_tokens": 17485736.0,
      "reward": 0.0625,
      "reward_std": 0.08450747281312943,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 18
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1799.37890625,
      "completions/mean_terminated_length": 1356.184814453125,
      "completions/min_length": 360.0,
      "completions/min_terminated_length": 360.0,
      "epoch": 0.006486301954425194,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.044007690247254184,
      "learning_rate": 6.14334470989761e-08,
      "loss": 0.01,
      "num_tokens": 18484362.0,
      "reward": 0.068359375,
      "reward_std": 0.1117907464504242,
      "rewards/accuracy_reward/mean": 0.068359375,
      "rewards/accuracy_reward/std": 0.25260838866233826,
      "step": 19
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.58984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1708.611328125,
      "completions/mean_terminated_length": 1220.5380859375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.006827686267815994,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 0.04488698234518163,
      "learning_rate": 6.484641638225255e-08,
      "loss": 0.0103,
      "num_tokens": 19436387.0,
      "reward": 0.08984375,
      "reward_std": 0.09848526120185852,
      "rewards/accuracy_reward/mean": 0.08984375,
      "rewards/accuracy_reward/std": 0.2862374484539032,
      "step": 20
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1770.4296875,
      "completions/mean_terminated_length": 1307.8125,
      "completions/min_length": 581.0,
      "completions/min_terminated_length": 581.0,
      "epoch": 0.007169070581206793,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 0.04861825600744672,
      "learning_rate": 6.8259385665529e-08,
      "loss": 0.0229,
      "num_tokens": 20416207.0,
      "reward": 0.087890625,
      "reward_std": 0.13715961575508118,
      "rewards/accuracy_reward/mean": 0.087890625,
      "rewards/accuracy_reward/std": 0.2834126651287079,
      "step": 21
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.603515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1756.564453125,
      "completions/mean_terminated_length": 1312.95068359375,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "epoch": 0.007510454894597594,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 0.052662806445118816,
      "learning_rate": 7.167235494880546e-08,
      "loss": 0.0276,
      "num_tokens": 21398480.0,
      "reward": 0.107421875,
      "reward_std": 0.1405652016401291,
      "rewards/accuracy_reward/mean": 0.107421875,
      "rewards/accuracy_reward/std": 0.30995169281959534,
      "step": 22
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.662109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1814.40234375,
      "completions/mean_terminated_length": 1356.658935546875,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "epoch": 0.007851839207988393,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.04479486754650139,
      "learning_rate": 7.508532423208192e-08,
      "loss": 0.0178,
      "num_tokens": 22399118.0,
      "reward": 0.09375,
      "reward_std": 0.12595151364803314,
      "rewards/accuracy_reward/mean": 0.09375,
      "rewards/accuracy_reward/std": 0.29176566004753113,
      "step": 23
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.630859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1717.69921875,
      "completions/mean_terminated_length": 1153.2169189453125,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "epoch": 0.008193223521379193,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.03288501841840136,
      "learning_rate": 7.849829351535836e-08,
      "loss": 0.0085,
      "num_tokens": 23358196.0,
      "reward": 0.04296875,
      "reward_std": 0.0457378551363945,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 24
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.607421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1744.763671875,
      "completions/mean_terminated_length": 1275.5770263671875,
      "completions/min_length": 34.0,
      "completions/min_terminated_length": 34.0,
      "epoch": 0.008534607834769992,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.041978908017609194,
      "learning_rate": 8.19112627986348e-08,
      "loss": 0.0196,
      "num_tokens": 24328619.0,
      "reward": 0.07421875,
      "reward_std": 0.09606722742319107,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "step": 25
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 1626.05859375,
      "completions/mean_terminated_length": 1132.6016845703125,
      "completions/min_length": 218.0,
      "completions/min_terminated_length": 218.0,
      "epoch": 0.008875992148160792,
      "frac_reward_zero_std": 0.53125,
      "grad_norm": 0.062100550345779035,
      "learning_rate": 8.532423208191126e-08,
      "loss": 0.0166,
      "num_tokens": 25239705.0,
      "reward": 0.13671875,
      "reward_std": 0.18890517950057983,
      "rewards/accuracy_reward/mean": 0.1411290317773819,
      "rewards/accuracy_reward/std": 0.3485061228275299,
      "step": 26
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1607.740234375,
      "completions/mean_terminated_length": 1092.8602294921875,
      "completions/min_length": 32.0,
      "completions/min_terminated_length": 32.0,
      "epoch": 0.009217376461551591,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 0.038504286857851476,
      "learning_rate": 8.873720136518772e-08,
      "loss": 0.0141,
      "num_tokens": 26132148.0,
      "reward": 0.021484375,
      "reward_std": 0.06425705552101135,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 27
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.650390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2008.0,
      "completions/mean_length": 1741.521484375,
      "completions/mean_terminated_length": 1171.36865234375,
      "completions/min_length": 300.0,
      "completions/min_terminated_length": 300.0,
      "epoch": 0.009558760774942391,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.047420119841969456,
      "learning_rate": 9.215017064846416e-08,
      "loss": 0.0179,
      "num_tokens": 27112895.0,
      "reward": 0.048828125,
      "reward_std": 0.11079943180084229,
      "rewards/accuracy_reward/mean": 0.05040322616696358,
      "rewards/accuracy_reward/std": 0.21899642050266266,
      "step": 28
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.685546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 1787.126953125,
      "completions/mean_terminated_length": 1218.391357421875,
      "completions/min_length": 215.0,
      "completions/min_terminated_length": 215.0,
      "epoch": 0.00990014508833319,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 0.03030950325457361,
      "learning_rate": 9.556313993174062e-08,
      "loss": 0.0093,
      "num_tokens": 28103168.0,
      "reward": 0.0546875,
      "reward_std": 0.056411758065223694,
      "rewards/accuracy_reward/mean": 0.05645161122083664,
      "rewards/accuracy_reward/std": 0.23102475702762604,
      "step": 29
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.63671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1760.376953125,
      "completions/mean_terminated_length": 1256.263427734375,
      "completions/min_length": 280.0,
      "completions/min_terminated_length": 280.0,
      "epoch": 0.01024152940172399,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 0.03741549775073281,
      "learning_rate": 9.897610921501706e-08,
      "loss": 0.0107,
      "num_tokens": 29084449.0,
      "reward": 0.125,
      "reward_std": 0.08109388500452042,
      "rewards/accuracy_reward/mean": 0.125,
      "rewards/accuracy_reward/std": 0.3310423493385315,
      "step": 30
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.595703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2036.0,
      "completions/mean_length": 1744.7421875,
      "completions/mean_terminated_length": 1297.9130859375,
      "completions/min_length": 275.0,
      "completions/min_terminated_length": 275.0,
      "epoch": 0.01058291371511479,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 0.05665272966941569,
      "learning_rate": 1.023890784982935e-07,
      "loss": 0.0223,
      "num_tokens": 30056541.0,
      "reward": 0.10546875,
      "reward_std": 0.15262386202812195,
      "rewards/accuracy_reward/mean": 0.10546875,
      "rewards/accuracy_reward/std": 0.3074568510055542,
      "step": 31
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.58203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1744.771484375,
      "completions/mean_terminated_length": 1322.5186767578125,
      "completions/min_length": 274.0,
      "completions/min_terminated_length": 274.0,
      "epoch": 0.010924298028505591,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.04584619056826458,
      "learning_rate": 1.0580204778156996e-07,
      "loss": 0.0298,
      "num_tokens": 31027288.0,
      "reward": 0.0703125,
      "reward_std": 0.12140804529190063,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 32
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.67578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1746.130859375,
      "completions/mean_terminated_length": 1116.9337158203125,
      "completions/min_length": 334.0,
      "completions/min_terminated_length": 334.0,
      "epoch": 0.01126568234189639,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 0.03438908031100313,
      "learning_rate": 1.0921501706484642e-07,
      "loss": 0.007,
      "num_tokens": 32006043.0,
      "reward": 0.0390625,
      "reward_std": 0.06299237906932831,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 33
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1743.775390625,
      "completions/mean_terminated_length": 1269.1849365234375,
      "completions/min_length": 254.0,
      "completions/min_terminated_length": 254.0,
      "epoch": 0.01160706665528719,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0473121665769505,
      "learning_rate": 1.1262798634812286e-07,
      "loss": 0.0165,
      "num_tokens": 32972136.0,
      "reward": 0.056640625,
      "reward_std": 0.09433712065219879,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 34
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.541015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2031.0,
      "completions/mean_length": 1680.3125,
      "completions/mean_terminated_length": 1246.91064453125,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "epoch": 0.01194845096867799,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.046558777905087036,
      "learning_rate": 1.1604095563139932e-07,
      "loss": 0.0117,
      "num_tokens": 33914472.0,
      "reward": 0.095703125,
      "reward_std": 0.12388914078474045,
      "rewards/accuracy_reward/mean": 0.095703125,
      "rewards/accuracy_reward/std": 0.2944713830947876,
      "step": 35
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1696.216796875,
      "completions/mean_terminated_length": 1214.1435546875,
      "completions/min_length": 90.0,
      "completions/min_terminated_length": 90.0,
      "epoch": 0.01228983528206879,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.041928760536107765,
      "learning_rate": 1.1945392491467578e-07,
      "loss": 0.0105,
      "num_tokens": 34866631.0,
      "reward": 0.041015625,
      "reward_std": 0.08648413419723511,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 36
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1830.650390625,
      "completions/mean_terminated_length": 1385.6011962890625,
      "completions/min_length": 248.0,
      "completions/min_terminated_length": 248.0,
      "epoch": 0.012631219595459589,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 0.03890801617300073,
      "learning_rate": 1.228668941979522e-07,
      "loss": 0.0066,
      "num_tokens": 35878868.0,
      "reward": 0.0625,
      "reward_std": 0.09350419044494629,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 37
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.638671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2033.0,
      "completions/mean_length": 1743.234375,
      "completions/mean_terminated_length": 1204.54052734375,
      "completions/min_length": 371.0,
      "completions/min_terminated_length": 371.0,
      "epoch": 0.012972603908850388,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.03957306765731773,
      "learning_rate": 1.2627986348122866e-07,
      "loss": 0.0086,
      "num_tokens": 36852124.0,
      "reward": 0.041015625,
      "reward_std": 0.09110259264707565,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 38
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.642578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1774.359375,
      "completions/mean_terminated_length": 1282.404296875,
      "completions/min_length": 244.0,
      "completions/min_terminated_length": 244.0,
      "epoch": 0.013313988222241188,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.05463734396955025,
      "learning_rate": 1.296928327645051e-07,
      "loss": 0.028,
      "num_tokens": 37838500.0,
      "reward": 0.08203125,
      "reward_std": 0.12384280562400818,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 39
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.736328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2042.0,
      "completions/mean_length": 1852.732421875,
      "completions/mean_terminated_length": 1307.4295654296875,
      "completions/min_length": 270.0,
      "completions/min_terminated_length": 270.0,
      "epoch": 0.013655372535631987,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.04512206696134928,
      "learning_rate": 1.3310580204778158e-07,
      "loss": 0.0088,
      "num_tokens": 38867691.0,
      "reward": 0.046875,
      "reward_std": 0.0908287987112999,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 40
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.607421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2033.0,
      "completions/mean_length": 1745.134765625,
      "completions/mean_terminated_length": 1276.5223388671875,
      "completions/min_length": 426.0,
      "completions/min_terminated_length": 426.0,
      "epoch": 0.013996756849022787,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.04932882394737749,
      "learning_rate": 1.36518771331058e-07,
      "loss": 0.013,
      "num_tokens": 39832064.0,
      "reward": 0.037109375,
      "reward_std": 0.09809703379869461,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 41
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1776.326171875,
      "completions/mean_terminated_length": 1199.8475341796875,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "epoch": 0.014338141162413586,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 0.061108309601674154,
      "learning_rate": 1.3993174061433446e-07,
      "loss": 0.0093,
      "num_tokens": 40826903.0,
      "reward": 0.068359375,
      "reward_std": 0.12104017287492752,
      "rewards/accuracy_reward/mean": 0.068359375,
      "rewards/accuracy_reward/std": 0.25260838866233826,
      "step": 42
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.681640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2038.0,
      "completions/mean_length": 1788.171875,
      "completions/mean_terminated_length": 1231.852783203125,
      "completions/min_length": 294.0,
      "completions/min_terminated_length": 294.0,
      "epoch": 0.014679525475804386,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 0.03589300034594603,
      "learning_rate": 1.4334470989761092e-07,
      "loss": 0.0086,
      "num_tokens": 41816047.0,
      "reward": 0.052734375,
      "reward_std": 0.07894542813301086,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 43
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.62890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1756.146484375,
      "completions/mean_terminated_length": 1261.5316162109375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.015020909789195187,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.0509461360080981,
      "learning_rate": 1.4675767918088735e-07,
      "loss": 0.0216,
      "num_tokens": 42785402.0,
      "reward": 0.099609375,
      "reward_std": 0.14260855317115784,
      "rewards/accuracy_reward/mean": 0.099609375,
      "rewards/accuracy_reward/std": 0.29977133870124817,
      "step": 44
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1785.208984375,
      "completions/mean_terminated_length": 1361.5255126953125,
      "completions/min_length": 223.0,
      "completions/min_terminated_length": 223.0,
      "epoch": 0.015362294102585987,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 0.03747732180213399,
      "learning_rate": 1.5017064846416383e-07,
      "loss": 0.0082,
      "num_tokens": 43781189.0,
      "reward": 0.04296875,
      "reward_std": 0.06233368441462517,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 45
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.666015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1801.880859375,
      "completions/mean_terminated_length": 1311.0819091796875,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "epoch": 0.015703678415976786,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.04224816975820266,
      "learning_rate": 1.5358361774744026e-07,
      "loss": 0.0164,
      "num_tokens": 44778200.0,
      "reward": 0.0703125,
      "reward_std": 0.10864999890327454,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 46
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.630859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2024.0,
      "completions/mean_length": 1731.1796875,
      "completions/mean_terminated_length": 1189.7354736328125,
      "completions/min_length": 55.0,
      "completions/min_terminated_length": 55.0,
      "epoch": 0.016045062729367586,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 0.04174360995094499,
      "learning_rate": 1.5699658703071672e-07,
      "loss": 0.0133,
      "num_tokens": 45740404.0,
      "reward": 0.0546875,
      "reward_std": 0.05311024188995361,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 47
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.705078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1836.826171875,
      "completions/mean_terminated_length": 1331.9669189453125,
      "completions/min_length": 191.0,
      "completions/min_terminated_length": 191.0,
      "epoch": 0.016386447042758386,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.024488356216432854,
      "learning_rate": 1.6040955631399318e-07,
      "loss": 0.0053,
      "num_tokens": 46761563.0,
      "reward": 0.033203125,
      "reward_std": 0.04219770431518555,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 48
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.638671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1769.5390625,
      "completions/mean_terminated_length": 1277.340576171875,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "epoch": 0.016727831356149185,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 0.03501499334951841,
      "learning_rate": 1.638225255972696e-07,
      "loss": 0.0151,
      "num_tokens": 47740959.0,
      "reward": 0.046875,
      "reward_std": 0.07861834019422531,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 49
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.55859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1681.6953125,
      "completions/mean_terminated_length": 1218.1416015625,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.017069215669539985,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.04553192175857216,
      "learning_rate": 1.6723549488054606e-07,
      "loss": 0.021,
      "num_tokens": 48670051.0,
      "reward": 0.09765625,
      "reward_std": 0.1125703826546669,
      "rewards/accuracy_reward/mean": 0.09765625,
      "rewards/accuracy_reward/std": 0.29713961482048035,
      "step": 50
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.62109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1785.296875,
      "completions/mean_terminated_length": 1354.680419921875,
      "completions/min_length": 319.0,
      "completions/min_terminated_length": 319.0,
      "epoch": 0.017410599982930784,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.05871973844523566,
      "learning_rate": 1.7064846416382252e-07,
      "loss": 0.0233,
      "num_tokens": 49663355.0,
      "reward": 0.109375,
      "reward_std": 0.13375157117843628,
      "rewards/accuracy_reward/mean": 0.109375,
      "rewards/accuracy_reward/std": 0.31241437792778015,
      "step": 51
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.576171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2033.0,
      "completions/mean_length": 1647.58984375,
      "completions/mean_terminated_length": 1103.25341796875,
      "completions/min_length": 53.0,
      "completions/min_terminated_length": 53.0,
      "epoch": 0.017751984296321584,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 0.04705927425045244,
      "learning_rate": 1.7406143344709898e-07,
      "loss": 0.0102,
      "num_tokens": 50585241.0,
      "reward": 0.037109375,
      "reward_std": 0.069866843521595,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 52
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.630859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2031.0,
      "completions/mean_length": 1742.458984375,
      "completions/mean_terminated_length": 1220.291015625,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "epoch": 0.018093368609712383,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 0.04988088049501189,
      "learning_rate": 1.7747440273037543e-07,
      "loss": 0.0094,
      "num_tokens": 51551540.0,
      "reward": 0.080078125,
      "reward_std": 0.12724943459033966,
      "rewards/accuracy_reward/mean": 0.080078125,
      "rewards/accuracy_reward/std": 0.271679550409317,
      "step": 53
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.53515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1652.599609375,
      "completions/mean_terminated_length": 1197.390869140625,
      "completions/min_length": 55.0,
      "completions/min_terminated_length": 55.0,
      "epoch": 0.018434752923103183,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.04925080043249225,
      "learning_rate": 1.8088737201365186e-07,
      "loss": 0.0129,
      "num_tokens": 52480679.0,
      "reward": 0.0703125,
      "reward_std": 0.09820214658975601,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 54
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.662109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1836.859375,
      "completions/mean_terminated_length": 1423.121337890625,
      "completions/min_length": 292.0,
      "completions/min_terminated_length": 292.0,
      "epoch": 0.018776137236493982,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.04356025838592198,
      "learning_rate": 1.8430034129692832e-07,
      "loss": 0.0173,
      "num_tokens": 53494015.0,
      "reward": 0.060546875,
      "reward_std": 0.11366777867078781,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 55
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1998.0,
      "completions/mean_length": 1599.03515625,
      "completions/mean_terminated_length": 1177.2803955078125,
      "completions/min_length": 279.0,
      "completions/min_terminated_length": 279.0,
      "epoch": 0.019117521549884782,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.05553040771364216,
      "learning_rate": 1.8771331058020475e-07,
      "loss": 0.0182,
      "num_tokens": 54389921.0,
      "reward": 0.09375,
      "reward_std": 0.15114058554172516,
      "rewards/accuracy_reward/mean": 0.09375,
      "rewards/accuracy_reward/std": 0.29176566004753113,
      "step": 56
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.662109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2023.0,
      "completions/mean_length": 1784.33984375,
      "completions/mean_terminated_length": 1267.6878662109375,
      "completions/min_length": 249.0,
      "completions/min_terminated_length": 249.0,
      "epoch": 0.01945890586327558,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 0.05150373311041259,
      "learning_rate": 1.9112627986348124e-07,
      "loss": 0.0312,
      "num_tokens": 55381423.0,
      "reward": 0.078125,
      "reward_std": 0.12037593126296997,
      "rewards/accuracy_reward/mean": 0.078125,
      "rewards/accuracy_reward/std": 0.26863065361976624,
      "step": 57
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1810.8515625,
      "completions/mean_terminated_length": 1342.0697021484375,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "epoch": 0.01980029017666638,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.04456284040734546,
      "learning_rate": 1.9453924914675767e-07,
      "loss": 0.0016,
      "num_tokens": 56400915.0,
      "reward": 0.044921875,
      "reward_std": 0.08793555945158005,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 58
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.67578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1803.642578125,
      "completions/mean_terminated_length": 1294.3192138671875,
      "completions/min_length": 407.0,
      "completions/min_terminated_length": 407.0,
      "epoch": 0.02014167449005718,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.04798246912932706,
      "learning_rate": 1.9795221843003412e-07,
      "loss": 0.01,
      "num_tokens": 57392812.0,
      "reward": 0.06640625,
      "reward_std": 0.10369987040758133,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "step": 59
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.556640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1691.6328125,
      "completions/mean_terminated_length": 1244.21142578125,
      "completions/min_length": 257.0,
      "completions/min_terminated_length": 257.0,
      "epoch": 0.02048305880344798,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 0.05220746688153303,
      "learning_rate": 2.0136518771331058e-07,
      "loss": 0.0194,
      "num_tokens": 58342352.0,
      "reward": 0.0625,
      "reward_std": 0.12056370824575424,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 60
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1796.45703125,
      "completions/mean_terminated_length": 1299.220947265625,
      "completions/min_length": 341.0,
      "completions/min_terminated_length": 341.0,
      "epoch": 0.02082444311683878,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.04394834921332462,
      "learning_rate": 2.04778156996587e-07,
      "loss": 0.0229,
      "num_tokens": 59336266.0,
      "reward": 0.099609375,
      "reward_std": 0.11717641353607178,
      "rewards/accuracy_reward/mean": 0.099609375,
      "rewards/accuracy_reward/std": 0.29977133870124817,
      "step": 61
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.458984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1605.990234375,
      "completions/mean_terminated_length": 1231.0,
      "completions/min_length": 234.0,
      "completions/min_terminated_length": 234.0,
      "epoch": 0.02116582743022958,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.04553323110882962,
      "learning_rate": 2.0819112627986347e-07,
      "loss": 0.0171,
      "num_tokens": 60232469.0,
      "reward": 0.150390625,
      "reward_std": 0.1292927861213684,
      "rewards/accuracy_reward/mean": 0.150390625,
      "rewards/accuracy_reward/std": 0.35780346393585205,
      "step": 62
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.51171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2033.0,
      "completions/mean_length": 1584.740234375,
      "completions/mean_terminated_length": 1099.2440185546875,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "epoch": 0.021507211743620382,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 0.04684939911874622,
      "learning_rate": 2.1160409556313992e-07,
      "loss": 0.0042,
      "num_tokens": 61126912.0,
      "reward": 0.060546875,
      "reward_std": 0.0830162912607193,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 63
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.576171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1671.013671875,
      "completions/mean_terminated_length": 1158.520751953125,
      "completions/min_length": 252.0,
      "completions/min_terminated_length": 252.0,
      "epoch": 0.021848596057011182,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.051864508251110504,
      "learning_rate": 2.1501706484641638e-07,
      "loss": 0.033,
      "num_tokens": 62061399.0,
      "reward": 0.11328125,
      "reward_std": 0.1546129435300827,
      "rewards/accuracy_reward/mean": 0.11328125,
      "rewards/accuracy_reward/std": 0.3172462284564972,
      "step": 64
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1997.0,
      "completions/mean_length": 1717.859375,
      "completions/mean_terminated_length": 1167.625,
      "completions/min_length": 226.0,
      "completions/min_terminated_length": 226.0,
      "epoch": 0.02218998037040198,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.046001761013168434,
      "learning_rate": 2.1843003412969284e-07,
      "loss": 0.0309,
      "num_tokens": 63021071.0,
      "reward": 0.09765625,
      "reward_std": 0.12735143303871155,
      "rewards/accuracy_reward/mean": 0.09765625,
      "rewards/accuracy_reward/std": 0.29713961482048035,
      "step": 65
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1701.951171875,
      "completions/mean_terminated_length": 1227.736083984375,
      "completions/min_length": 255.0,
      "completions/min_terminated_length": 255.0,
      "epoch": 0.02253136468379278,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 0.03978169300789664,
      "learning_rate": 2.2184300341296927e-07,
      "loss": 0.0069,
      "num_tokens": 63969830.0,
      "reward": 0.0546875,
      "reward_std": 0.06799772381782532,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 66
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.705078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2038.0,
      "completions/mean_length": 1828.521484375,
      "completions/mean_terminated_length": 1303.8079833984375,
      "completions/min_length": 318.0,
      "completions/min_terminated_length": 318.0,
      "epoch": 0.02287274899718358,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0394878820084567,
      "learning_rate": 2.2525597269624572e-07,
      "loss": 0.0071,
      "num_tokens": 64974513.0,
      "reward": 0.052734375,
      "reward_std": 0.09382468461990356,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 67
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.673828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1762.484375,
      "completions/mean_terminated_length": 1172.646728515625,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "epoch": 0.02321413331057438,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.0487139884405135,
      "learning_rate": 2.2866894197952215e-07,
      "loss": 0.0195,
      "num_tokens": 65954201.0,
      "reward": 0.0703125,
      "reward_std": 0.09743183851242065,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 68
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.712890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1868.390625,
      "completions/mean_terminated_length": 1422.4217529296875,
      "completions/min_length": 42.0,
      "completions/min_terminated_length": 42.0,
      "epoch": 0.02355551762396518,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.026463022480313556,
      "learning_rate": 2.3208191126279864e-07,
      "loss": 0.0093,
      "num_tokens": 66995233.0,
      "reward": 0.015625,
      "reward_std": 0.03839729726314545,
      "rewards/accuracy_reward/mean": 0.016129031777381897,
      "rewards/accuracy_reward/std": 0.12609896063804626,
      "step": 69
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2035.0,
      "completions/mean_length": 1702.6015625,
      "completions/mean_terminated_length": 1285.7413330078125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.02389690193735598,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 0.052620471317906196,
      "learning_rate": 2.354948805460751e-07,
      "loss": 0.0199,
      "num_tokens": 67945605.0,
      "reward": 0.08203125,
      "reward_std": 0.13357868790626526,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 70
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.61328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1719.845703125,
      "completions/mean_terminated_length": 1199.439453125,
      "completions/min_length": 63.0,
      "completions/min_terminated_length": 63.0,
      "epoch": 0.02423828625074678,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.05242549715798892,
      "learning_rate": 2.3890784982935155e-07,
      "loss": 0.0192,
      "num_tokens": 68902294.0,
      "reward": 0.052734375,
      "reward_std": 0.09946727007627487,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 71
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.638671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2036.0,
      "completions/mean_length": 1747.3359375,
      "completions/mean_terminated_length": 1215.8919677734375,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "epoch": 0.02457967056413758,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 0.04516361120468605,
      "learning_rate": 2.42320819112628e-07,
      "loss": 0.007,
      "num_tokens": 69883058.0,
      "reward": 0.08203125,
      "reward_std": 0.08340215682983398,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 72
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2035.0,
      "completions/mean_length": 1658.490234375,
      "completions/mean_terminated_length": 1188.3922119140625,
      "completions/min_length": 368.0,
      "completions/min_terminated_length": 368.0,
      "epoch": 0.024921054877528378,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.04914435813626726,
      "learning_rate": 2.457337883959044e-07,
      "loss": 0.0117,
      "num_tokens": 70818557.0,
      "reward": 0.064453125,
      "reward_std": 0.10293962806463242,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 73
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.55859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1703.453125,
      "completions/mean_terminated_length": 1267.43359375,
      "completions/min_length": 332.0,
      "completions/min_terminated_length": 332.0,
      "epoch": 0.025262439190919177,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.04325843110868303,
      "learning_rate": 2.4914675767918084e-07,
      "loss": 0.006,
      "num_tokens": 71764597.0,
      "reward": 0.111328125,
      "reward_std": 0.1361807882785797,
      "rewards/accuracy_reward/mean": 0.111328125,
      "rewards/accuracy_reward/std": 0.31484565138816833,
      "step": 74
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1792.08203125,
      "completions/mean_terminated_length": 1294.9539794921875,
      "completions/min_length": 266.0,
      "completions/min_terminated_length": 266.0,
      "epoch": 0.025603823504309977,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.04052880308407635,
      "learning_rate": 2.525597269624573e-07,
      "loss": 0.0148,
      "num_tokens": 72762143.0,
      "reward": 0.05078125,
      "reward_std": 0.10270209610462189,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 75
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.623046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 1736.513671875,
      "completions/mean_terminated_length": 1221.673583984375,
      "completions/min_length": 312.0,
      "completions/min_terminated_length": 312.0,
      "epoch": 0.025945207817700777,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.0442077917273844,
      "learning_rate": 2.5597269624573375e-07,
      "loss": 0.0171,
      "num_tokens": 73720918.0,
      "reward": 0.083984375,
      "reward_std": 0.125734344124794,
      "rewards/accuracy_reward/mean": 0.083984375,
      "rewards/accuracy_reward/std": 0.2776356339454651,
      "step": 76
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.708984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 1820.267578125,
      "completions/mean_terminated_length": 1265.4564208984375,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.026286592131091576,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 0.0413919985499403,
      "learning_rate": 2.593856655290102e-07,
      "loss": 0.0086,
      "num_tokens": 74730399.0,
      "reward": 0.056640625,
      "reward_std": 0.08444078266620636,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 77
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.52734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2033.0,
      "completions/mean_length": 1644.21875,
      "completions/mean_terminated_length": 1193.888427734375,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.026627976444482376,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.05256560820797869,
      "learning_rate": 2.627986348122867e-07,
      "loss": 0.0193,
      "num_tokens": 75653551.0,
      "reward": 0.134765625,
      "reward_std": 0.12078087776899338,
      "rewards/accuracy_reward/mean": 0.1391129046678543,
      "rewards/accuracy_reward/std": 0.3464137017726898,
      "step": 78
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1755.6875,
      "completions/mean_terminated_length": 1268.5,
      "completions/min_length": 395.0,
      "completions/min_terminated_length": 395.0,
      "epoch": 0.026969360757873175,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 0.049319138504193165,
      "learning_rate": 2.6621160409556315e-07,
      "loss": 0.0061,
      "num_tokens": 76633087.0,
      "reward": 0.0234375,
      "reward_std": 0.062167368829250336,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 79
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1687.9921875,
      "completions/mean_terminated_length": 1144.691162109375,
      "completions/min_length": 211.0,
      "completions/min_terminated_length": 211.0,
      "epoch": 0.027310745071263975,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.04549203694995152,
      "learning_rate": 2.696245733788396e-07,
      "loss": 0.0162,
      "num_tokens": 77581675.0,
      "reward": 0.041015625,
      "reward_std": 0.0879673957824707,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 80
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.498046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1614.78125,
      "completions/mean_terminated_length": 1184.933837890625,
      "completions/min_length": 294.0,
      "completions/min_terminated_length": 294.0,
      "epoch": 0.027652129384654774,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.04830342329842642,
      "learning_rate": 2.73037542662116e-07,
      "loss": 0.0071,
      "num_tokens": 78480283.0,
      "reward": 0.11328125,
      "reward_std": 0.12803566455841064,
      "rewards/accuracy_reward/mean": 0.11328125,
      "rewards/accuracy_reward/std": 0.3172462284564972,
      "step": 81
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1852.708984375,
      "completions/mean_terminated_length": 1372.398681640625,
      "completions/min_length": 88.0,
      "completions/min_terminated_length": 88.0,
      "epoch": 0.027993513698045574,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.05194084815029283,
      "learning_rate": 2.764505119453925e-07,
      "loss": 0.0145,
      "num_tokens": 79508278.0,
      "reward": 0.08203125,
      "reward_std": 0.1338978409767151,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 82
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.67578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1789.0,
      "completions/mean_terminated_length": 1249.1566162109375,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.028334898011436373,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 0.04420763351849259,
      "learning_rate": 2.798634812286689e-07,
      "loss": 0.0102,
      "num_tokens": 80498390.0,
      "reward": 0.037109375,
      "reward_std": 0.07174387574195862,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 83
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1737.111328125,
      "completions/mean_terminated_length": 1235.882568359375,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "epoch": 0.028676282324827173,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.049489819333455864,
      "learning_rate": 2.8327645051194536e-07,
      "loss": 0.0108,
      "num_tokens": 81462431.0,
      "reward": 0.05859375,
      "reward_std": 0.1049705445766449,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 84
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.658203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1795.79296875,
      "completions/mean_terminated_length": 1310.1142578125,
      "completions/min_length": 215.0,
      "completions/min_terminated_length": 215.0,
      "epoch": 0.029017666638217973,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 0.03697360030207983,
      "learning_rate": 2.8668941979522184e-07,
      "loss": 0.0059,
      "num_tokens": 82457717.0,
      "reward": 0.0625,
      "reward_std": 0.0752037912607193,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 85
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.654296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2028.0,
      "completions/mean_length": 1781.0,
      "completions/mean_terminated_length": 1275.6610107421875,
      "completions/min_length": 256.0,
      "completions/min_terminated_length": 256.0,
      "epoch": 0.029359050951608772,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 0.04519682757594774,
      "learning_rate": 2.9010238907849827e-07,
      "loss": 0.0183,
      "num_tokens": 83454821.0,
      "reward": 0.072265625,
      "reward_std": 0.07834454625844955,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 86
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2035.0,
      "completions/mean_length": 1573.21875,
      "completions/mean_terminated_length": 981.99560546875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.02970043526499957,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 0.0437480076324369,
      "learning_rate": 2.935153583617747e-07,
      "loss": 0.0144,
      "num_tokens": 84339077.0,
      "reward": 0.064453125,
      "reward_std": 0.094484344124794,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 87
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.646484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1779.380859375,
      "completions/mean_terminated_length": 1288.3978271484375,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "epoch": 0.030041819578390375,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.05401587306569221,
      "learning_rate": 2.969283276450512e-07,
      "loss": 0.0219,
      "num_tokens": 85337224.0,
      "reward": 0.109375,
      "reward_std": 0.15756931900978088,
      "rewards/accuracy_reward/mean": 0.109375,
      "rewards/accuracy_reward/std": 0.31241437792778015,
      "step": 88
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.53125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2036.0,
      "completions/mean_length": 1696.1953125,
      "completions/mean_terminated_length": 1297.4833984375,
      "completions/min_length": 369.0,
      "completions/min_terminated_length": 369.0,
      "epoch": 0.030383203891781174,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 0.03941846800047262,
      "learning_rate": 3.0034129692832767e-07,
      "loss": 0.0144,
      "num_tokens": 86279324.0,
      "reward": 0.04296875,
      "reward_std": 0.0828578919172287,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 89
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2017.0,
      "completions/mean_length": 1741.275390625,
      "completions/mean_terminated_length": 1165.7359619140625,
      "completions/min_length": 59.0,
      "completions/min_terminated_length": 59.0,
      "epoch": 0.030724588205171974,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.049521798619494864,
      "learning_rate": 3.037542662116041e-07,
      "loss": -0.0008,
      "num_tokens": 87245913.0,
      "reward": 0.068359375,
      "reward_std": 0.09969472885131836,
      "rewards/accuracy_reward/mean": 0.068359375,
      "rewards/accuracy_reward/std": 0.25260838866233826,
      "step": 90
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.56640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1682.029296875,
      "completions/mean_terminated_length": 1203.95947265625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.031065972518562773,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.04393887888990321,
      "learning_rate": 3.0716723549488053e-07,
      "loss": 0.0182,
      "num_tokens": 88182408.0,
      "reward": 0.044921875,
      "reward_std": 0.09628114849328995,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 91
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.798828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1910.404296875,
      "completions/mean_terminated_length": 1364.59228515625,
      "completions/min_length": 398.0,
      "completions/min_terminated_length": 398.0,
      "epoch": 0.03140735683195357,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 0.03713949427220516,
      "learning_rate": 3.10580204778157e-07,
      "loss": 0.0095,
      "num_tokens": 89245159.0,
      "reward": 0.037109375,
      "reward_std": 0.07117830216884613,
      "rewards/accuracy_reward/mean": 0.03958333283662796,
      "rewards/accuracy_reward/std": 0.19518160820007324,
      "step": 92
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2033.0,
      "completions/mean_length": 1593.330078125,
      "completions/mean_terminated_length": 1124.2261962890625,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.03174874114534437,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 0.057589385873441194,
      "learning_rate": 3.1399317406143344e-07,
      "loss": 0.0408,
      "num_tokens": 90130080.0,
      "reward": 0.087890625,
      "reward_std": 0.13479050993919373,
      "rewards/accuracy_reward/mean": 0.087890625,
      "rewards/accuracy_reward/std": 0.2834126651287079,
      "step": 93
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.70703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1829.37109375,
      "completions/mean_terminated_length": 1301.7467041015625,
      "completions/min_length": 274.0,
      "completions/min_terminated_length": 274.0,
      "epoch": 0.03209012545873517,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 0.03981280813413336,
      "learning_rate": 3.1740614334470987e-07,
      "loss": 0.015,
      "num_tokens": 91144382.0,
      "reward": 0.029296875,
      "reward_std": 0.058760739862918854,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 94
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.638671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1764.8671875,
      "completions/mean_terminated_length": 1264.6378173828125,
      "completions/min_length": 81.0,
      "completions/min_terminated_length": 81.0,
      "epoch": 0.03243150977212597,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 0.047823694560574635,
      "learning_rate": 3.2081911262798635e-07,
      "loss": 0.0279,
      "num_tokens": 92129738.0,
      "reward": 0.076171875,
      "reward_std": 0.1329134702682495,
      "rewards/accuracy_reward/mean": 0.076171875,
      "rewards/accuracy_reward/std": 0.26553234457969666,
      "step": 95
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2033.0,
      "completions/mean_length": 1760.23828125,
      "completions/mean_terminated_length": 1191.406982421875,
      "completions/min_length": 341.0,
      "completions/min_terminated_length": 341.0,
      "epoch": 0.03277289408551677,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 0.053190577942516325,
      "learning_rate": 3.242320819112628e-07,
      "loss": 0.0268,
      "num_tokens": 93100820.0,
      "reward": 0.078125,
      "reward_std": 0.1441391110420227,
      "rewards/accuracy_reward/mean": 0.078125,
      "rewards/accuracy_reward/std": 0.26863065361976624,
      "step": 96
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.595703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1761.7578125,
      "completions/mean_terminated_length": 1340.0,
      "completions/min_length": 295.0,
      "completions/min_terminated_length": 295.0,
      "epoch": 0.03311427839890757,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.04518469814679981,
      "learning_rate": 3.276450511945392e-07,
      "loss": 0.0142,
      "num_tokens": 94078888.0,
      "reward": 0.087890625,
      "reward_std": 0.11125211417675018,
      "rewards/accuracy_reward/mean": 0.087890625,
      "rewards/accuracy_reward/std": 0.2834126651287079,
      "step": 97
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.677734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2000.0,
      "completions/mean_length": 1793.564453125,
      "completions/mean_terminated_length": 1258.478759765625,
      "completions/min_length": 334.0,
      "completions/min_terminated_length": 334.0,
      "epoch": 0.03345566271229837,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.041203920562389446,
      "learning_rate": 3.3105802047781565e-07,
      "loss": 0.0169,
      "num_tokens": 95078089.0,
      "reward": 0.05078125,
      "reward_std": 0.08709507435560226,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 98
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2026.0,
      "completions/mean_length": 1794.5546875,
      "completions/mean_terminated_length": 1256.756103515625,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "epoch": 0.03379704702568917,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.04788091387888716,
      "learning_rate": 3.3447098976109213e-07,
      "loss": 0.0212,
      "num_tokens": 96073781.0,
      "reward": 0.03515625,
      "reward_std": 0.0908288061618805,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 99
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.642578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1782.1171875,
      "completions/mean_terminated_length": 1304.1092529296875,
      "completions/min_length": 184.0,
      "completions/min_terminated_length": 184.0,
      "epoch": 0.03413843133907997,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 0.04097895400370385,
      "learning_rate": 3.3788395904436856e-07,
      "loss": 0.0144,
      "num_tokens": 97063361.0,
      "reward": 0.060546875,
      "reward_std": 0.08279120922088623,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 100
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.72265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1863.099609375,
      "completions/mean_terminated_length": 1381.31689453125,
      "completions/min_length": 331.0,
      "completions/min_terminated_length": 331.0,
      "epoch": 0.03447981565247077,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.04778362901424674,
      "learning_rate": 3.4129692832764504e-07,
      "loss": 0.014,
      "num_tokens": 98088532.0,
      "reward": 0.05078125,
      "reward_std": 0.07300759106874466,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 101
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.73828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2036.0,
      "completions/mean_length": 1841.681640625,
      "completions/mean_terminated_length": 1259.6790771484375,
      "completions/min_length": 321.0,
      "completions/min_terminated_length": 321.0,
      "epoch": 0.03482119996586157,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 0.03397151841054294,
      "learning_rate": 3.447098976109215e-07,
      "loss": 0.0127,
      "num_tokens": 99108657.0,
      "reward": 0.025390625,
      "reward_std": 0.05335709825158119,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 102
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.615234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1732.984375,
      "completions/mean_terminated_length": 1229.2791748046875,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "epoch": 0.03516258427925237,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 0.05057841585195066,
      "learning_rate": 3.4812286689419796e-07,
      "loss": 0.0141,
      "num_tokens": 100073801.0,
      "reward": 0.056640625,
      "reward_std": 0.08042868226766586,
      "rewards/accuracy_reward/mean": 0.058467742055654526,
      "rewards/accuracy_reward/std": 0.23486268520355225,
      "step": 103
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.544921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2025.0,
      "completions/mean_length": 1723.919921875,
      "completions/mean_terminated_length": 1336.0943603515625,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "epoch": 0.03550396859264317,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.06190229552777436,
      "learning_rate": 3.515358361774744e-07,
      "loss": 0.0281,
      "num_tokens": 101037888.0,
      "reward": 0.1484375,
      "reward_std": 0.21379581093788147,
      "rewards/accuracy_reward/mean": 0.15322580933570862,
      "rewards/accuracy_reward/std": 0.36056873202323914,
      "step": 104
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 1829.623046875,
      "completions/mean_terminated_length": 1271.548583984375,
      "completions/min_length": 219.0,
      "completions/min_terminated_length": 219.0,
      "epoch": 0.03584535290603397,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 0.02390952553732917,
      "learning_rate": 3.5494880546075087e-07,
      "loss": 0.0049,
      "num_tokens": 102051039.0,
      "reward": 0.037109375,
      "reward_std": 0.039834219962358475,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 105
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.60546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 1740.14453125,
      "completions/mean_terminated_length": 1267.876220703125,
      "completions/min_length": 92.0,
      "completions/min_terminated_length": 92.0,
      "epoch": 0.03618673721942477,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 0.054417121947166296,
      "learning_rate": 3.583617747440273e-07,
      "loss": 0.0105,
      "num_tokens": 103025513.0,
      "reward": 0.126953125,
      "reward_std": 0.16408398747444153,
      "rewards/accuracy_reward/mean": 0.126953125,
      "rewards/accuracy_reward/std": 0.33324605226516724,
      "step": 106
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 1633.775390625,
      "completions/mean_terminated_length": 1219.55078125,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "epoch": 0.036528121532815566,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.046098620094480694,
      "learning_rate": 3.6177474402730373e-07,
      "loss": 0.0159,
      "num_tokens": 103939222.0,
      "reward": 0.068359375,
      "reward_std": 0.09759023785591125,
      "rewards/accuracy_reward/mean": 0.068359375,
      "rewards/accuracy_reward/std": 0.25260838866233826,
      "step": 107
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1798.673828125,
      "completions/mean_terminated_length": 1305.8197021484375,
      "completions/min_length": 71.0,
      "completions/min_terminated_length": 71.0,
      "epoch": 0.036869505846206366,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 0.034770213970435726,
      "learning_rate": 3.6518771331058016e-07,
      "loss": 0.015,
      "num_tokens": 104942143.0,
      "reward": 0.05859375,
      "reward_std": 0.08103963732719421,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 108
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.576171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1714.1328125,
      "completions/mean_terminated_length": 1260.258056640625,
      "completions/min_length": 215.0,
      "completions/min_terminated_length": 215.0,
      "epoch": 0.037210890159597165,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.04945430879362447,
      "learning_rate": 3.6860068259385664e-07,
      "loss": 0.0106,
      "num_tokens": 105891395.0,
      "reward": 0.119140625,
      "reward_std": 0.11643067002296448,
      "rewards/accuracy_reward/mean": 0.119140625,
      "rewards/accuracy_reward/std": 0.32427072525024414,
      "step": 109
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1700.833984375,
      "completions/mean_terminated_length": 1294.8262939453125,
      "completions/min_length": 332.0,
      "completions/min_terminated_length": 332.0,
      "epoch": 0.037552274472987965,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.0476936256201125,
      "learning_rate": 3.7201365187713307e-07,
      "loss": 0.015,
      "num_tokens": 106837294.0,
      "reward": 0.080078125,
      "reward_std": 0.12584641575813293,
      "rewards/accuracy_reward/mean": 0.080078125,
      "rewards/accuracy_reward/std": 0.271679550409317,
      "step": 110
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.59375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1715.486328125,
      "completions/mean_terminated_length": 1229.5048828125,
      "completions/min_length": 235.0,
      "completions/min_terminated_length": 235.0,
      "epoch": 0.037893658786378764,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.04945403629392554,
      "learning_rate": 3.754266211604095e-07,
      "loss": 0.0163,
      "num_tokens": 107792503.0,
      "reward": 0.052734375,
      "reward_std": 0.10982763022184372,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 111
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.580078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 1707.126953125,
      "completions/mean_terminated_length": 1236.2464599609375,
      "completions/min_length": 355.0,
      "completions/min_terminated_length": 355.0,
      "epoch": 0.038235043099769564,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 0.044412513761630794,
      "learning_rate": 3.78839590443686e-07,
      "loss": 0.026,
      "num_tokens": 108741480.0,
      "reward": 0.076171875,
      "reward_std": 0.09938118606805801,
      "rewards/accuracy_reward/mean": 0.076171875,
      "rewards/accuracy_reward/std": 0.26553234457969666,
      "step": 112
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.580078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2019.0,
      "completions/mean_length": 1656.69140625,
      "completions/mean_terminated_length": 1116.1395263671875,
      "completions/min_length": 250.0,
      "completions/min_terminated_length": 250.0,
      "epoch": 0.038576427413160363,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 0.04192073828764485,
      "learning_rate": 3.8225255972696247e-07,
      "loss": 0.0162,
      "num_tokens": 109663930.0,
      "reward": 0.076171875,
      "reward_std": 0.08862704038619995,
      "rewards/accuracy_reward/mean": 0.0786290317773819,
      "rewards/accuracy_reward/std": 0.26943066716194153,
      "step": 113
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2011.0,
      "completions/mean_length": 1618.369140625,
      "completions/mean_terminated_length": 1214.776611328125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.03891781172655116,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.052787331840070895,
      "learning_rate": 3.856655290102389e-07,
      "loss": 0.0181,
      "num_tokens": 110571415.0,
      "reward": 0.09765625,
      "reward_std": 0.11698848009109497,
      "rewards/accuracy_reward/mean": 0.09765625,
      "rewards/accuracy_reward/std": 0.29713961482048035,
      "step": 114
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.603515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1763.501953125,
      "completions/mean_terminated_length": 1330.4482421875,
      "completions/min_length": 448.0,
      "completions/min_terminated_length": 448.0,
      "epoch": 0.03925919603994196,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.05457150666025581,
      "learning_rate": 3.8907849829351533e-07,
      "loss": 0.0262,
      "num_tokens": 111542552.0,
      "reward": 0.109375,
      "reward_std": 0.14895127713680267,
      "rewards/accuracy_reward/mean": 0.109375,
      "rewards/accuracy_reward/std": 0.31241437792778015,
      "step": 115
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.59375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 1690.7265625,
      "completions/mean_terminated_length": 1168.5577392578125,
      "completions/min_length": 30.0,
      "completions/min_terminated_length": 30.0,
      "epoch": 0.03960058035333276,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.044330639053892756,
      "learning_rate": 3.924914675767918e-07,
      "loss": 0.008,
      "num_tokens": 112478396.0,
      "reward": 0.060546875,
      "reward_std": 0.09655951708555222,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 116
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.53125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1607.65234375,
      "completions/mean_terminated_length": 1108.5916748046875,
      "completions/min_length": 61.0,
      "completions/min_terminated_length": 61.0,
      "epoch": 0.03994196466672356,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 0.05546016146959169,
      "learning_rate": 3.9590443686006824e-07,
      "loss": 0.0166,
      "num_tokens": 113374778.0,
      "reward": 0.064453125,
      "reward_std": 0.12579216063022614,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 117
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.501953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1647.279296875,
      "completions/mean_terminated_length": 1243.415771484375,
      "completions/min_length": 392.0,
      "completions/min_terminated_length": 392.0,
      "epoch": 0.04028334898011436,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.04647878295869337,
      "learning_rate": 3.993174061433447e-07,
      "loss": 0.0206,
      "num_tokens": 114297641.0,
      "reward": 0.0625,
      "reward_std": 0.10300742089748383,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 118
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.720703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 1825.35546875,
      "completions/mean_terminated_length": 1250.839111328125,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "epoch": 0.04062473329350516,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.05134646675164814,
      "learning_rate": 4.0273037542662116e-07,
      "loss": 0.017,
      "num_tokens": 115317759.0,
      "reward": 0.056640625,
      "reward_std": 0.106694795191288,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 119
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.560546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1710.044921875,
      "completions/mean_terminated_length": 1278.9644775390625,
      "completions/min_length": 30.0,
      "completions/min_terminated_length": 30.0,
      "epoch": 0.04096611760689596,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 0.05048273340287397,
      "learning_rate": 4.061433447098976e-07,
      "loss": 0.0092,
      "num_tokens": 116264022.0,
      "reward": 0.123046875,
      "reward_std": 0.15681803226470947,
      "rewards/accuracy_reward/mean": 0.123046875,
      "rewards/accuracy_reward/std": 0.32881227135658264,
      "step": 120
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.58203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1736.603515625,
      "completions/mean_terminated_length": 1302.9765625,
      "completions/min_length": 268.0,
      "completions/min_terminated_length": 268.0,
      "epoch": 0.04130750192028676,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.05672386558413753,
      "learning_rate": 4.09556313993174e-07,
      "loss": 0.0276,
      "num_tokens": 117230011.0,
      "reward": 0.0859375,
      "reward_std": 0.10480518639087677,
      "rewards/accuracy_reward/mean": 0.08870967477560043,
      "rewards/accuracy_reward/std": 0.284611314535141,
      "step": 121
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.630859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1741.84765625,
      "completions/mean_terminated_length": 1218.851806640625,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.04164888623367756,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 0.038448628495947025,
      "learning_rate": 4.1296928327645045e-07,
      "loss": 0.0152,
      "num_tokens": 118195469.0,
      "reward": 0.076171875,
      "reward_std": 0.07191018760204315,
      "rewards/accuracy_reward/mean": 0.076171875,
      "rewards/accuracy_reward/std": 0.26553234457969666,
      "step": 122
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.548828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2014.0,
      "completions/mean_length": 1681.34375,
      "completions/mean_terminated_length": 1235.32470703125,
      "completions/min_length": 92.0,
      "completions/min_terminated_length": 92.0,
      "epoch": 0.04199027054706836,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.054195296284921715,
      "learning_rate": 4.1638225255972693e-07,
      "loss": 0.0155,
      "num_tokens": 119134173.0,
      "reward": 0.103515625,
      "reward_std": 0.15227124094963074,
      "rewards/accuracy_reward/mean": 0.103515625,
      "rewards/accuracy_reward/std": 0.30492907762527466,
      "step": 123
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2038.0,
      "completions/mean_length": 1593.080078125,
      "completions/mean_terminated_length": 1165.7310791015625,
      "completions/min_length": 253.0,
      "completions/min_terminated_length": 253.0,
      "epoch": 0.04233165486045916,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 0.0607902927918571,
      "learning_rate": 4.1979522184300336e-07,
      "loss": 0.0303,
      "num_tokens": 120022598.0,
      "reward": 0.154296875,
      "reward_std": 0.14884379506111145,
      "rewards/accuracy_reward/mean": 0.154296875,
      "rewards/accuracy_reward/std": 0.36158639192581177,
      "step": 124
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2017.0,
      "completions/mean_length": 1650.2265625,
      "completions/mean_terminated_length": 1170.1551513671875,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "epoch": 0.04267303917384996,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.052516768559291355,
      "learning_rate": 4.2320819112627985e-07,
      "loss": 0.0168,
      "num_tokens": 120954314.0,
      "reward": 0.05078125,
      "reward_std": 0.09338457882404327,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 125
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.60546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2027.0,
      "completions/mean_length": 1744.26953125,
      "completions/mean_terminated_length": 1278.1484375,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "epoch": 0.043014423487240765,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.05674187549515702,
      "learning_rate": 4.2662116040955633e-07,
      "loss": 0.0239,
      "num_tokens": 121927956.0,
      "reward": 0.072265625,
      "reward_std": 0.09126891195774078,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 126
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 1589.90625,
      "completions/mean_terminated_length": 1159.5758056640625,
      "completions/min_length": 313.0,
      "completions/min_terminated_length": 313.0,
      "epoch": 0.043355807800631564,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.04441013244691082,
      "learning_rate": 4.3003412969283276e-07,
      "loss": 0.0253,
      "num_tokens": 122812660.0,
      "reward": 0.0859375,
      "reward_std": 0.1043786108493805,
      "rewards/accuracy_reward/mean": 0.0859375,
      "rewards/accuracy_reward/std": 0.28054583072662354,
      "step": 127
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1740.98828125,
      "completions/mean_terminated_length": 1154.875,
      "completions/min_length": 196.0,
      "completions/min_terminated_length": 196.0,
      "epoch": 0.043697192114022364,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 0.03913875714149468,
      "learning_rate": 4.334470989761092e-07,
      "loss": 0.0151,
      "num_tokens": 123785246.0,
      "reward": 0.05859375,
      "reward_std": 0.07630911469459534,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 128
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.525390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1636.26171875,
      "completions/mean_terminated_length": 1180.4691162109375,
      "completions/min_length": 74.0,
      "completions/min_terminated_length": 74.0,
      "epoch": 0.04403857642741316,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.05295970073418793,
      "learning_rate": 4.3686006825938567e-07,
      "loss": 0.0184,
      "num_tokens": 124703780.0,
      "reward": 0.134765625,
      "reward_std": 0.13888943195343018,
      "rewards/accuracy_reward/mean": 0.134765625,
      "rewards/accuracy_reward/std": 0.3418070077896118,
      "step": 129
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.486328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1618.779296875,
      "completions/mean_terminated_length": 1212.4068603515625,
      "completions/min_length": 264.0,
      "completions/min_terminated_length": 264.0,
      "epoch": 0.04437996074080396,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 0.05296995210886687,
      "learning_rate": 4.402730375426621e-07,
      "loss": 0.0167,
      "num_tokens": 125612419.0,
      "reward": 0.08984375,
      "reward_std": 0.15933895111083984,
      "rewards/accuracy_reward/mean": 0.08984375,
      "rewards/accuracy_reward/std": 0.2862374484539032,
      "step": 130
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.603515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 1714.74609375,
      "completions/mean_terminated_length": 1207.477783203125,
      "completions/min_length": 46.0,
      "completions/min_terminated_length": 46.0,
      "epoch": 0.04472134505419476,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 0.05022631992454979,
      "learning_rate": 4.4368600682593853e-07,
      "loss": 0.0216,
      "num_tokens": 126569713.0,
      "reward": 0.072265625,
      "reward_std": 0.12926754355430603,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 131
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.666015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1803.328125,
      "completions/mean_terminated_length": 1315.5906982421875,
      "completions/min_length": 402.0,
      "completions/min_terminated_length": 402.0,
      "epoch": 0.04506272936758556,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.053551969848103516,
      "learning_rate": 4.4709897610921496e-07,
      "loss": 0.0277,
      "num_tokens": 127580665.0,
      "reward": 0.119140625,
      "reward_std": 0.12450800091028214,
      "rewards/accuracy_reward/mean": 0.119140625,
      "rewards/accuracy_reward/std": 0.32427072525024414,
      "step": 132
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1713.65234375,
      "completions/mean_terminated_length": 1156.40625,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.04540411368097636,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 0.042762161364303275,
      "learning_rate": 4.5051194539249145e-07,
      "loss": 0.0082,
      "num_tokens": 128532471.0,
      "reward": 0.05078125,
      "reward_std": 0.0767945945262909,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 133
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.533203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 1646.6328125,
      "completions/mean_terminated_length": 1188.1673583984375,
      "completions/min_length": 197.0,
      "completions/min_terminated_length": 197.0,
      "epoch": 0.04574549799436716,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.048225369539176284,
      "learning_rate": 4.539249146757679e-07,
      "loss": 0.0173,
      "num_tokens": 129449547.0,
      "reward": 0.07421875,
      "reward_std": 0.09341736882925034,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "step": 134
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1629.669921875,
      "completions/mean_terminated_length": 1184.350830078125,
      "completions/min_length": 72.0,
      "completions/min_terminated_length": 72.0,
      "epoch": 0.04608688230775796,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.06083833926021826,
      "learning_rate": 4.573378839590443e-07,
      "loss": 0.0208,
      "num_tokens": 130359138.0,
      "reward": 0.125,
      "reward_std": 0.17249077558517456,
      "rewards/accuracy_reward/mean": 0.125,
      "rewards/accuracy_reward/std": 0.3310423493385315,
      "step": 135
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.548828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2028.0,
      "completions/mean_length": 1708.466796875,
      "completions/mean_terminated_length": 1295.4415283203125,
      "completions/min_length": 52.0,
      "completions/min_terminated_length": 52.0,
      "epoch": 0.04642826662114876,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 0.05601725402255147,
      "learning_rate": 4.6075085324232084e-07,
      "loss": 0.0254,
      "num_tokens": 131312481.0,
      "reward": 0.10546875,
      "reward_std": 0.14787328243255615,
      "rewards/accuracy_reward/mean": 0.1088709682226181,
      "rewards/accuracy_reward/std": 0.31179171800613403,
      "step": 136
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.666015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2038.0,
      "completions/mean_length": 1736.83984375,
      "completions/mean_terminated_length": 1116.9532470703125,
      "completions/min_length": 31.0,
      "completions/min_terminated_length": 31.0,
      "epoch": 0.04676965093453956,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.05588237673953811,
      "learning_rate": 4.641638225255973e-07,
      "loss": 0.0052,
      "num_tokens": 132288063.0,
      "reward": 0.060546875,
      "reward_std": 0.11057336628437042,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 137
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.529296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2035.0,
      "completions/mean_length": 1639.3671875,
      "completions/mean_terminated_length": 1179.8673095703125,
      "completions/min_length": 202.0,
      "completions/min_terminated_length": 202.0,
      "epoch": 0.04711103524793036,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 0.038698838885418696,
      "learning_rate": 4.675767918088737e-07,
      "loss": 0.0134,
      "num_tokens": 133198779.0,
      "reward": 0.041015625,
      "reward_std": 0.07250870764255524,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 138
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.630859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1722.880859375,
      "completions/mean_terminated_length": 1167.25390625,
      "completions/min_length": 283.0,
      "completions/min_terminated_length": 283.0,
      "epoch": 0.04745241956132116,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.04487318351480335,
      "learning_rate": 4.709897610921502e-07,
      "loss": 0.013,
      "num_tokens": 134162318.0,
      "reward": 0.05859375,
      "reward_std": 0.08683578670024872,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 139
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.63671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1734.521484375,
      "completions/mean_terminated_length": 1185.0914306640625,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "epoch": 0.04779380387471196,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.05003938325459597,
      "learning_rate": 4.744027303754266e-07,
      "loss": 0.023,
      "num_tokens": 135121817.0,
      "reward": 0.0703125,
      "reward_std": 0.09244654327630997,
      "rewards/accuracy_reward/mean": 0.07258064299821854,
      "rewards/accuracy_reward/std": 0.25970885157585144,
      "step": 140
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 1661.90234375,
      "completions/mean_terminated_length": 1132.8055419921875,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "epoch": 0.04813518818810276,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 0.05694080088244041,
      "learning_rate": 4.778156996587031e-07,
      "loss": 0.0299,
      "num_tokens": 136050215.0,
      "reward": 0.087890625,
      "reward_std": 0.1455163061618805,
      "rewards/accuracy_reward/mean": 0.087890625,
      "rewards/accuracy_reward/std": 0.2834126651287079,
      "step": 141
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.59765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1720.783203125,
      "completions/mean_terminated_length": 1234.7232666015625,
      "completions/min_length": 368.0,
      "completions/min_terminated_length": 368.0,
      "epoch": 0.04847657250149356,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.0534909894123429,
      "learning_rate": 4.812286689419795e-07,
      "loss": 0.0276,
      "num_tokens": 137014536.0,
      "reward": 0.0703125,
      "reward_std": 0.10674113780260086,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 142
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.58984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1689.265625,
      "completions/mean_terminated_length": 1173.3714599609375,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.04881795681488436,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 0.04011415363157658,
      "learning_rate": 4.84641638225256e-07,
      "loss": 0.0282,
      "num_tokens": 137952640.0,
      "reward": 0.056640625,
      "reward_std": 0.07686128467321396,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 143
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.55078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2021.0,
      "completions/mean_length": 1676.654296875,
      "completions/mean_terminated_length": 1221.5260009765625,
      "completions/min_length": 45.0,
      "completions/min_terminated_length": 45.0,
      "epoch": 0.04915934112827516,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.04582785148181721,
      "learning_rate": 4.880546075085323e-07,
      "loss": 0.0224,
      "num_tokens": 138890623.0,
      "reward": 0.10546875,
      "reward_std": 0.1024416983127594,
      "rewards/accuracy_reward/mean": 0.10546875,
      "rewards/accuracy_reward/std": 0.3074568510055542,
      "step": 144
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.427734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2021.0,
      "completions/mean_length": 1472.955078125,
      "completions/mean_terminated_length": 1043.143310546875,
      "completions/min_length": 90.0,
      "completions/min_terminated_length": 90.0,
      "epoch": 0.049500725441665956,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.07300036922061065,
      "learning_rate": 4.914675767918088e-07,
      "loss": 0.0198,
      "num_tokens": 139715080.0,
      "reward": 0.1328125,
      "reward_std": 0.15780803561210632,
      "rewards/accuracy_reward/mean": 0.1328125,
      "rewards/accuracy_reward/std": 0.33970388770103455,
      "step": 145
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1656.533203125,
      "completions/mean_terminated_length": 1198.716064453125,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.049842109755056756,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.04804002639321947,
      "learning_rate": 4.948805460750853e-07,
      "loss": 0.011,
      "num_tokens": 140637401.0,
      "reward": 0.091796875,
      "reward_std": 0.12312431633472443,
      "rewards/accuracy_reward/mean": 0.091796875,
      "rewards/accuracy_reward/std": 0.289021372795105,
      "step": 146
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1589.837890625,
      "completions/mean_terminated_length": 1117.1309814453125,
      "completions/min_length": 84.0,
      "completions/min_terminated_length": 84.0,
      "epoch": 0.050183494068447555,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.0479645443070329,
      "learning_rate": 4.982935153583617e-07,
      "loss": 0.0256,
      "num_tokens": 141530518.0,
      "reward": 0.0703125,
      "reward_std": 0.10881631821393967,
      "rewards/accuracy_reward/mean": 0.07258064299821854,
      "rewards/accuracy_reward/std": 0.25970885157585144,
      "step": 147
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.52734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1681.123046875,
      "completions/mean_terminated_length": 1271.7974853515625,
      "completions/min_length": 252.0,
      "completions/min_terminated_length": 252.0,
      "epoch": 0.050524878381838355,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.05437882003712324,
      "learning_rate": 5.017064846416383e-07,
      "loss": 0.0127,
      "num_tokens": 142470005.0,
      "reward": 0.1328125,
      "reward_std": 0.14288797974586487,
      "rewards/accuracy_reward/mean": 0.1328125,
      "rewards/accuracy_reward/std": 0.33970388770103455,
      "step": 148
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.607421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1681.205078125,
      "completions/mean_terminated_length": 1113.6766357421875,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.050866262695229154,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.05242939089020235,
      "learning_rate": 5.051194539249146e-07,
      "loss": 0.0275,
      "num_tokens": 143408862.0,
      "reward": 0.099609375,
      "reward_std": 0.09644746035337448,
      "rewards/accuracy_reward/mean": 0.099609375,
      "rewards/accuracy_reward/std": 0.29977133870124817,
      "step": 149
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.57421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1638.263671875,
      "completions/mean_terminated_length": 1085.6834716796875,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "epoch": 0.051207647008619954,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.04690025492945931,
      "learning_rate": 5.085324232081911e-07,
      "loss": 0.0106,
      "num_tokens": 144322853.0,
      "reward": 0.076171875,
      "reward_std": 0.10546942055225372,
      "rewards/accuracy_reward/mean": 0.076171875,
      "rewards/accuracy_reward/std": 0.26553234457969666,
      "step": 150
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.587890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2042.0,
      "completions/mean_length": 1712.775390625,
      "completions/mean_terminated_length": 1234.8199462890625,
      "completions/min_length": 42.0,
      "completions/min_terminated_length": 42.0,
      "epoch": 0.051549031322010753,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.05088571292453204,
      "learning_rate": 5.119453924914675e-07,
      "loss": 0.014,
      "num_tokens": 145280562.0,
      "reward": 0.0546875,
      "reward_std": 0.10469119250774384,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 151
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1651.888671875,
      "completions/mean_terminated_length": 1216.913818359375,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.05189041563540155,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.05625865566795103,
      "learning_rate": 5.15358361774744e-07,
      "loss": 0.0049,
      "num_tokens": 146210665.0,
      "reward": 0.146484375,
      "reward_std": 0.12141455709934235,
      "rewards/accuracy_reward/mean": 0.146484375,
      "rewards/accuracy_reward/std": 0.35393697023391724,
      "step": 152
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.638671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1757.263671875,
      "completions/mean_terminated_length": 1243.3675537109375,
      "completions/min_length": 200.0,
      "completions/min_terminated_length": 200.0,
      "epoch": 0.05223179994879235,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 0.03961015669745025,
      "learning_rate": 5.187713310580204e-07,
      "loss": 0.0176,
      "num_tokens": 147184944.0,
      "reward": 0.044921875,
      "reward_std": 0.08483455330133438,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 153
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.48828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1585.193359375,
      "completions/mean_terminated_length": 1143.583984375,
      "completions/min_length": 47.0,
      "completions/min_terminated_length": 47.0,
      "epoch": 0.05257318426218315,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.05207746909601062,
      "learning_rate": 5.221843003412969e-07,
      "loss": 0.006,
      "num_tokens": 148070723.0,
      "reward": 0.111328125,
      "reward_std": 0.1081790179014206,
      "rewards/accuracy_reward/mean": 0.111328125,
      "rewards/accuracy_reward/std": 0.31484565138816833,
      "step": 154
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 1613.388671875,
      "completions/mean_terminated_length": 1150.7379150390625,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "epoch": 0.05291456857557395,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.05803034337918793,
      "learning_rate": 5.255972696245734e-07,
      "loss": 0.0137,
      "num_tokens": 148964106.0,
      "reward": 0.115234375,
      "reward_std": 0.13362500071525574,
      "rewards/accuracy_reward/mean": 0.11895161122083664,
      "rewards/accuracy_reward/std": 0.3240583837032318,
      "step": 155
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.513671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1575.779296875,
      "completions/mean_terminated_length": 1077.0079345703125,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.05325595288896475,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 0.0683698311192887,
      "learning_rate": 5.290102389078498e-07,
      "loss": 0.0127,
      "num_tokens": 149843913.0,
      "reward": 0.10546875,
      "reward_std": 0.15122081339359283,
      "rewards/accuracy_reward/mean": 0.10546875,
      "rewards/accuracy_reward/std": 0.3074568510055542,
      "step": 156
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.54296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2003.0,
      "completions/mean_length": 1590.623046875,
      "completions/mean_terminated_length": 1047.24365234375,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "epoch": 0.05359733720235555,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 0.05589296281262381,
      "learning_rate": 5.324232081911263e-07,
      "loss": 0.0187,
      "num_tokens": 150728152.0,
      "reward": 0.1171875,
      "reward_std": 0.13104984164237976,
      "rewards/accuracy_reward/mean": 0.1171875,
      "rewards/accuracy_reward/std": 0.32195815443992615,
      "step": 157
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.52734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2017.0,
      "completions/mean_length": 1591.783203125,
      "completions/mean_terminated_length": 1082.781005859375,
      "completions/min_length": 204.0,
      "completions/min_terminated_length": 204.0,
      "epoch": 0.05393872151574635,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.052967163872803126,
      "learning_rate": 5.358361774744027e-07,
      "loss": 0.0251,
      "num_tokens": 151623609.0,
      "reward": 0.056640625,
      "reward_std": 0.08907270431518555,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 158
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.51171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1602.890625,
      "completions/mean_terminated_length": 1136.416015625,
      "completions/min_length": 80.0,
      "completions/min_terminated_length": 80.0,
      "epoch": 0.05428010582913715,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 0.05292779656327808,
      "learning_rate": 5.392491467576792e-07,
      "loss": 0.0165,
      "num_tokens": 152524513.0,
      "reward": 0.087890625,
      "reward_std": 0.12337085604667664,
      "rewards/accuracy_reward/mean": 0.09072580933570862,
      "rewards/accuracy_reward/std": 0.2875087857246399,
      "step": 159
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.517578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1602.990234375,
      "completions/mean_terminated_length": 1125.5506591796875,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.05462149014252795,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.05872180443805259,
      "learning_rate": 5.426621160409555e-07,
      "loss": 0.0063,
      "num_tokens": 153422156.0,
      "reward": 0.169921875,
      "reward_std": 0.16396436095237732,
      "rewards/accuracy_reward/mean": 0.169921875,
      "rewards/accuracy_reward/std": 0.3759314715862274,
      "step": 160
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1602.109375,
      "completions/mean_terminated_length": 1232.6571044921875,
      "completions/min_length": 34.0,
      "completions/min_terminated_length": 34.0,
      "epoch": 0.05496287445591875,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.06237912695561149,
      "learning_rate": 5.46075085324232e-07,
      "loss": 0.0148,
      "num_tokens": 154324212.0,
      "reward": 0.134765625,
      "reward_std": 0.13336601853370667,
      "rewards/accuracy_reward/mean": 0.134765625,
      "rewards/accuracy_reward/std": 0.3418070077896118,
      "step": 161
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.51953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2025.0,
      "completions/mean_length": 1590.720703125,
      "completions/mean_terminated_length": 1096.26416015625,
      "completions/min_length": 46.0,
      "completions/min_terminated_length": 46.0,
      "epoch": 0.05530425876930955,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.04513467933459144,
      "learning_rate": 5.494880546075085e-07,
      "loss": 0.0185,
      "num_tokens": 155215333.0,
      "reward": 0.111328125,
      "reward_std": 0.1183076947927475,
      "rewards/accuracy_reward/mean": 0.111328125,
      "rewards/accuracy_reward/std": 0.31484565138816833,
      "step": 162
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.44921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1538.525390625,
      "completions/mean_terminated_length": 1122.9964599609375,
      "completions/min_length": 83.0,
      "completions/min_terminated_length": 83.0,
      "epoch": 0.05564564308270035,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 0.05635666183435418,
      "learning_rate": 5.52901023890785e-07,
      "loss": 0.028,
      "num_tokens": 156079586.0,
      "reward": 0.126953125,
      "reward_std": 0.13561506569385529,
      "rewards/accuracy_reward/mean": 0.126953125,
      "rewards/accuracy_reward/std": 0.33324605226516724,
      "step": 163
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1500.015625,
      "completions/mean_terminated_length": 1060.08447265625,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.05598702739609115,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.056823796809735344,
      "learning_rate": 5.563139931740614e-07,
      "loss": 0.0213,
      "num_tokens": 156925530.0,
      "reward": 0.13671875,
      "reward_std": 0.09655256569385529,
      "rewards/accuracy_reward/mean": 0.13671875,
      "rewards/accuracy_reward/std": 0.3438861668109894,
      "step": 164
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.576171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1651.23046875,
      "completions/mean_terminated_length": 1111.84326171875,
      "completions/min_length": 61.0,
      "completions/min_terminated_length": 61.0,
      "epoch": 0.05632841170948195,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.04406791313036526,
      "learning_rate": 5.597269624573379e-07,
      "loss": 0.0261,
      "num_tokens": 157846880.0,
      "reward": 0.08203125,
      "reward_std": 0.11667752265930176,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 165
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.345703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2035.0,
      "completions/mean_length": 1348.568359375,
      "completions/mean_terminated_length": 979.0179443359375,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.05666979602287275,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 0.062472373957335216,
      "learning_rate": 5.631399317406143e-07,
      "loss": 0.0255,
      "num_tokens": 158614419.0,
      "reward": 0.146484375,
      "reward_std": 0.18024902045726776,
      "rewards/accuracy_reward/mean": 0.146484375,
      "rewards/accuracy_reward/std": 0.35393697023391724,
      "step": 166
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.427734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2026.0,
      "completions/mean_length": 1526.673828125,
      "completions/mean_terminated_length": 1137.013671875,
      "completions/min_length": 247.0,
      "completions/min_terminated_length": 247.0,
      "epoch": 0.057011180336263546,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.05488752739517643,
      "learning_rate": 5.665529010238907e-07,
      "loss": 0.0176,
      "num_tokens": 159466684.0,
      "reward": 0.125,
      "reward_std": 0.13437044620513916,
      "rewards/accuracy_reward/mean": 0.125,
      "rewards/accuracy_reward/std": 0.3310423493385315,
      "step": 167
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2001.0,
      "completions/mean_length": 1626.142578125,
      "completions/mean_terminated_length": 1048.4305419921875,
      "completions/min_length": 30.0,
      "completions/min_terminated_length": 30.0,
      "epoch": 0.057352564649654346,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.05434770049927932,
      "learning_rate": 5.699658703071673e-07,
      "loss": 0.0121,
      "num_tokens": 160385877.0,
      "reward": 0.0703125,
      "reward_std": 0.09566686302423477,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 168
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2042.0,
      "completions/mean_length": 1486.888671875,
      "completions/mean_terminated_length": 1064.133544921875,
      "completions/min_length": 80.0,
      "completions/min_terminated_length": 80.0,
      "epoch": 0.057693948963045145,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.07295831048210671,
      "learning_rate": 5.733788395904437e-07,
      "loss": 0.0144,
      "num_tokens": 161220524.0,
      "reward": 0.13671875,
      "reward_std": 0.1389889121055603,
      "rewards/accuracy_reward/mean": 0.13671875,
      "rewards/accuracy_reward/std": 0.3438861668109894,
      "step": 169
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.55859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1558.830078125,
      "completions/mean_terminated_length": 939.7920532226562,
      "completions/min_length": 35.0,
      "completions/min_terminated_length": 35.0,
      "epoch": 0.058035333276435945,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.05151259468658889,
      "learning_rate": 5.767918088737202e-07,
      "loss": 0.0026,
      "num_tokens": 162097525.0,
      "reward": 0.0390625,
      "reward_std": 0.08917921781539917,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 170
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2038.0,
      "completions/mean_length": 1555.212890625,
      "completions/mean_terminated_length": 1120.4007568359375,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.058376717589826745,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 0.0532030445654481,
      "learning_rate": 5.802047781569965e-07,
      "loss": 0.0132,
      "num_tokens": 162971298.0,
      "reward": 0.1328125,
      "reward_std": 0.1513998806476593,
      "rewards/accuracy_reward/mean": 0.1328125,
      "rewards/accuracy_reward/std": 0.33970388770103455,
      "step": 171
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.42578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2035.0,
      "completions/mean_length": 1442.48046875,
      "completions/mean_terminated_length": 993.48974609375,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.058718101903217544,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.05577109989592277,
      "learning_rate": 5.83617747440273e-07,
      "loss": 0.0146,
      "num_tokens": 163795480.0,
      "reward": 0.099609375,
      "reward_std": 0.1336861550807953,
      "rewards/accuracy_reward/mean": 0.099609375,
      "rewards/accuracy_reward/std": 0.29977133870124817,
      "step": 172
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.447265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1538.205078125,
      "completions/mean_terminated_length": 1125.685546875,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "epoch": 0.059059486216608344,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.049739956106912495,
      "learning_rate": 5.870307167235494e-07,
      "loss": 0.0068,
      "num_tokens": 164658753.0,
      "reward": 0.1015625,
      "reward_std": 0.14584887027740479,
      "rewards/accuracy_reward/mean": 0.1015625,
      "rewards/accuracy_reward/std": 0.30236753821372986,
      "step": 173
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.388671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1346.8046875,
      "completions/mean_terminated_length": 900.9967651367188,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.05940087052999914,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 0.07032360980457225,
      "learning_rate": 5.904436860068259e-07,
      "loss": 0.0008,
      "num_tokens": 165428333.0,
      "reward": 0.1484375,
      "reward_std": 0.1553596556186676,
      "rewards/accuracy_reward/mean": 0.15322580933570862,
      "rewards/accuracy_reward/std": 0.36056873202323914,
      "step": 174
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.36328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1320.203125,
      "completions/mean_terminated_length": 904.95703125,
      "completions/min_length": 47.0,
      "completions/min_terminated_length": 47.0,
      "epoch": 0.05974225484338994,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.06093328097141317,
      "learning_rate": 5.938566552901024e-07,
      "loss": 0.0237,
      "num_tokens": 166182021.0,
      "reward": 0.10546875,
      "reward_std": 0.11872635036706924,
      "rewards/accuracy_reward/mean": 0.10546875,
      "rewards/accuracy_reward/std": 0.3074568510055542,
      "step": 175
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.458984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1458.333984375,
      "completions/mean_terminated_length": 958.0758056640625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.06008363915678075,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.058548190775278085,
      "learning_rate": 5.972696245733788e-07,
      "loss": 0.0173,
      "num_tokens": 167008544.0,
      "reward": 0.091796875,
      "reward_std": 0.11953167617321014,
      "rewards/accuracy_reward/mean": 0.0947580635547638,
      "rewards/accuracy_reward/std": 0.29317617416381836,
      "step": 176
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.443359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1467.755859375,
      "completions/mean_terminated_length": 1005.5964965820312,
      "completions/min_length": 41.0,
      "completions/min_terminated_length": 41.0,
      "epoch": 0.06042502347017155,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 0.06136394213803249,
      "learning_rate": 6.006825938566553e-07,
      "loss": 0.0237,
      "num_tokens": 167837507.0,
      "reward": 0.125,
      "reward_std": 0.14988242089748383,
      "rewards/accuracy_reward/mean": 0.125,
      "rewards/accuracy_reward/std": 0.3310423493385315,
      "step": 177
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.427734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1411.6171875,
      "completions/mean_terminated_length": 935.9590454101562,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.06076640778356235,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.05365700955680286,
      "learning_rate": 6.040955631399317e-07,
      "loss": 0.0347,
      "num_tokens": 168639391.0,
      "reward": 0.130859375,
      "reward_std": 0.13703955709934235,
      "rewards/accuracy_reward/mean": 0.130859375,
      "rewards/accuracy_reward/std": 0.33757632970809937,
      "step": 178
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1390.310546875,
      "completions/mean_terminated_length": 947.5523071289062,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.06110779209695315,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 0.05777821883833454,
      "learning_rate": 6.075085324232082e-07,
      "loss": 0.03,
      "num_tokens": 169425358.0,
      "reward": 0.09765625,
      "reward_std": 0.09079696238040924,
      "rewards/accuracy_reward/mean": 0.10080645233392715,
      "rewards/accuracy_reward/std": 0.30137622356414795,
      "step": 179
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1482.677734375,
      "completions/mean_terminated_length": 951.617431640625,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.06144917641034395,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 0.049112555183154175,
      "learning_rate": 6.109215017064846e-07,
      "loss": 0.0165,
      "num_tokens": 170273577.0,
      "reward": 0.052734375,
      "reward_std": 0.08887359499931335,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 180
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.271484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2027.0,
      "completions/mean_length": 1139.21875,
      "completions/mean_terminated_length": 800.5576782226562,
      "completions/min_length": 42.0,
      "completions/min_terminated_length": 42.0,
      "epoch": 0.06179056072373475,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 0.06202726901337909,
      "learning_rate": 6.143344709897611e-07,
      "loss": 0.0071,
      "num_tokens": 170938505.0,
      "reward": 0.123046875,
      "reward_std": 0.04973640665411949,
      "rewards/accuracy_reward/mean": 0.123046875,
      "rewards/accuracy_reward/std": 0.32881227135658264,
      "step": 181
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1410.91796875,
      "completions/mean_terminated_length": 982.03271484375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.06213194503712555,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 0.04369301292203226,
      "learning_rate": 6.177474402730375e-07,
      "loss": 0.0071,
      "num_tokens": 171739055.0,
      "reward": 0.048828125,
      "reward_std": 0.08207826316356659,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 182
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.263671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2042.0,
      "completions/mean_length": 1174.68359375,
      "completions/mean_terminated_length": 861.95751953125,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.062473329350516346,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 0.04384792156315624,
      "learning_rate": 6.21160409556314e-07,
      "loss": 0.0163,
      "num_tokens": 172416845.0,
      "reward": 0.083984375,
      "reward_std": 0.08164606243371964,
      "rewards/accuracy_reward/mean": 0.083984375,
      "rewards/accuracy_reward/std": 0.2776356339454651,
      "step": 183
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.37109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2038.0,
      "completions/mean_length": 1412.220703125,
      "completions/mean_terminated_length": 1037.0714111328125,
      "completions/min_length": 191.0,
      "completions/min_terminated_length": 191.0,
      "epoch": 0.06281471366390715,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.0845404281614352,
      "learning_rate": 6.245733788395904e-07,
      "loss": 0.0095,
      "num_tokens": 173215886.0,
      "reward": 0.12890625,
      "reward_std": 0.13614104688167572,
      "rewards/accuracy_reward/mean": 0.12890625,
      "rewards/accuracy_reward/std": 0.33542385697364807,
      "step": 184
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.388671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2042.0,
      "completions/mean_length": 1381.515625,
      "completions/mean_terminated_length": 957.9520874023438,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.06315609797729795,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.04946703476560849,
      "learning_rate": 6.279863481228669e-07,
      "loss": 0.0121,
      "num_tokens": 174002726.0,
      "reward": 0.125,
      "reward_std": 0.1180124506354332,
      "rewards/accuracy_reward/mean": 0.12903225421905518,
      "rewards/accuracy_reward/std": 0.33557409048080444,
      "step": 185
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.408203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1410.26171875,
      "completions/mean_terminated_length": 970.5050048828125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.06349748229068874,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.07915382776717433,
      "learning_rate": 6.313993174061433e-07,
      "loss": 0.0067,
      "num_tokens": 174805436.0,
      "reward": 0.220703125,
      "reward_std": 0.21836557984352112,
      "rewards/accuracy_reward/mean": 0.220703125,
      "rewards/accuracy_reward/std": 0.4151262938976288,
      "step": 186
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.212890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1071.439453125,
      "completions/mean_terminated_length": 807.3076782226562,
      "completions/min_length": 45.0,
      "completions/min_terminated_length": 45.0,
      "epoch": 0.06383886660407954,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.09553243134690065,
      "learning_rate": 6.348122866894197e-07,
      "loss": 0.0328,
      "num_tokens": 175428957.0,
      "reward": 0.185546875,
      "reward_std": 0.1863960325717926,
      "rewards/accuracy_reward/mean": 0.185546875,
      "rewards/accuracy_reward/std": 0.38912075757980347,
      "step": 187
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.33203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1284.94140625,
      "completions/mean_terminated_length": 905.643310546875,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.06418025091747034,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.059261370227590045,
      "learning_rate": 6.382252559726961e-07,
      "loss": 0.0156,
      "num_tokens": 176164719.0,
      "reward": 0.1015625,
      "reward_std": 0.10013246536254883,
      "rewards/accuracy_reward/mean": 0.1015625,
      "rewards/accuracy_reward/std": 0.30236753821372986,
      "step": 188
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.33984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2023.0,
      "completions/mean_length": 1222.591796875,
      "completions/mean_terminated_length": 797.6775512695312,
      "completions/min_length": 50.0,
      "completions/min_terminated_length": 50.0,
      "epoch": 0.06452163523086114,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0598591479349171,
      "learning_rate": 6.416382252559727e-07,
      "loss": 0.0076,
      "num_tokens": 176867214.0,
      "reward": 0.08203125,
      "reward_std": 0.09244653582572937,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 189
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.365234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1270.1171875,
      "completions/mean_terminated_length": 822.535400390625,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "epoch": 0.06486301954425194,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.06982183160504854,
      "learning_rate": 6.450511945392492e-07,
      "loss": 0.0052,
      "num_tokens": 177590010.0,
      "reward": 0.091796875,
      "reward_std": 0.1510140299797058,
      "rewards/accuracy_reward/mean": 0.091796875,
      "rewards/accuracy_reward/std": 0.289021372795105,
      "step": 190
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.388671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1385.404296875,
      "completions/mean_terminated_length": 964.1373901367188,
      "completions/min_length": 49.0,
      "completions/min_terminated_length": 49.0,
      "epoch": 0.06520440385764274,
      "frac_reward_zero_std": 0.46875,
      "grad_norm": 0.07297420217316669,
      "learning_rate": 6.484641638225256e-07,
      "loss": 0.0418,
      "num_tokens": 178380953.0,
      "reward": 0.162109375,
      "reward_std": 0.21253521740436554,
      "rewards/accuracy_reward/mean": 0.162109375,
      "rewards/accuracy_reward/std": 0.3689115643501282,
      "step": 191
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.33984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2018.0,
      "completions/mean_length": 1354.4609375,
      "completions/mean_terminated_length": 997.5621337890625,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.06554578817103354,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 0.07188494202544887,
      "learning_rate": 6.51877133105802e-07,
      "loss": -0.0046,
      "num_tokens": 179149813.0,
      "reward": 0.064453125,
      "reward_std": 0.12636467814445496,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 192
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.318359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 1265.36328125,
      "completions/mean_terminated_length": 899.8338623046875,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.06588717248442434,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 0.06838047937663966,
      "learning_rate": 6.552901023890784e-07,
      "loss": 0.007,
      "num_tokens": 179881423.0,
      "reward": 0.109375,
      "reward_std": 0.1336916983127594,
      "rewards/accuracy_reward/mean": 0.109375,
      "rewards/accuracy_reward/std": 0.31241437792778015,
      "step": 193
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1234.23828125,
      "completions/mean_terminated_length": 890.6500244140625,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.06622855679781514,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.061814204134840234,
      "learning_rate": 6.587030716723549e-07,
      "loss": 0.0132,
      "num_tokens": 180591465.0,
      "reward": 0.123046875,
      "reward_std": 0.13055095076560974,
      "rewards/accuracy_reward/mean": 0.1270161271095276,
      "rewards/accuracy_reward/std": 0.33332720398902893,
      "step": 194
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.306640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2035.0,
      "completions/mean_length": 1238.390625,
      "completions/mean_terminated_length": 880.3380126953125,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.06656994111120594,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 0.06388538599344679,
      "learning_rate": 6.621160409556313e-07,
      "loss": 0.0096,
      "num_tokens": 181298961.0,
      "reward": 0.12109375,
      "reward_std": 0.1446847766637802,
      "rewards/accuracy_reward/mean": 0.12109375,
      "rewards/accuracy_reward/std": 0.3265552520751953,
      "step": 195
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.251953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1193.0,
      "completions/mean_terminated_length": 905.0234985351562,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.06691132542459674,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 0.06061534148759774,
      "learning_rate": 6.655290102389079e-07,
      "loss": 0.0095,
      "num_tokens": 181985825.0,
      "reward": 0.1328125,
      "reward_std": 0.14849212765693665,
      "rewards/accuracy_reward/mean": 0.1328125,
      "rewards/accuracy_reward/std": 0.33970388770103455,
      "step": 196
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2022.0,
      "completions/mean_length": 1370.814453125,
      "completions/mean_terminated_length": 1009.919189453125,
      "completions/min_length": 45.0,
      "completions/min_terminated_length": 45.0,
      "epoch": 0.06725270973798754,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.05111044258348449,
      "learning_rate": 6.689419795221843e-07,
      "loss": 0.0264,
      "num_tokens": 182762882.0,
      "reward": 0.123046875,
      "reward_std": 0.10535639524459839,
      "rewards/accuracy_reward/mean": 0.123046875,
      "rewards/accuracy_reward/std": 0.32881227135658264,
      "step": 197
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.2578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1217.041015625,
      "completions/mean_terminated_length": 928.3921508789062,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.06759409405137834,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.06303566480342586,
      "learning_rate": 6.723549488054607e-07,
      "loss": 0.0126,
      "num_tokens": 183457335.0,
      "reward": 0.130859375,
      "reward_std": 0.12752877175807953,
      "rewards/accuracy_reward/mean": 0.130859375,
      "rewards/accuracy_reward/std": 0.33757632970809937,
      "step": 198
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.318359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2028.0,
      "completions/mean_length": 1354.201171875,
      "completions/mean_terminated_length": 1030.163330078125,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.06793547836476914,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 0.054702744240987866,
      "learning_rate": 6.757679180887371e-07,
      "loss": 0.0171,
      "num_tokens": 184227998.0,
      "reward": 0.1640625,
      "reward_std": 0.12148028612136841,
      "rewards/accuracy_reward/mean": 0.1640625,
      "rewards/accuracy_reward/std": 0.37069445848464966,
      "step": 199
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.369140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1379.123046875,
      "completions/mean_terminated_length": 987.73681640625,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.06827686267815994,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.05259435310981436,
      "learning_rate": 6.791808873720136e-07,
      "loss": 0.0147,
      "num_tokens": 185001821.0,
      "reward": 0.125,
      "reward_std": 0.131615549325943,
      "rewards/accuracy_reward/mean": 0.125,
      "rewards/accuracy_reward/std": 0.3310423493385315,
      "step": 200
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.33984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2042.0,
      "completions/mean_length": 1317.5546875,
      "completions/mean_terminated_length": 941.526611328125,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.06861824699155074,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.05284857738543805,
      "learning_rate": 6.825938566552901e-07,
      "loss": -0.0033,
      "num_tokens": 185753929.0,
      "reward": 0.07421875,
      "reward_std": 0.10067910701036453,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "step": 201
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.43359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 1418.537109375,
      "completions/mean_terminated_length": 936.6724243164062,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.06895963130494154,
      "frac_reward_zero_std": 0.53125,
      "grad_norm": 0.07815481352799261,
      "learning_rate": 6.860068259385665e-07,
      "loss": 0.0351,
      "num_tokens": 186564364.0,
      "reward": 0.158203125,
      "reward_std": 0.17998173832893372,
      "rewards/accuracy_reward/mean": 0.158203125,
      "rewards/accuracy_reward/std": 0.36528825759887695,
      "step": 202
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.36328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1348.0078125,
      "completions/mean_terminated_length": 948.625732421875,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.06930101561833234,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.06857369827275735,
      "learning_rate": 6.89419795221843e-07,
      "loss": 0.0179,
      "num_tokens": 187332912.0,
      "reward": 0.140625,
      "reward_std": 0.1374513953924179,
      "rewards/accuracy_reward/mean": 0.140625,
      "rewards/accuracy_reward/std": 0.3479743003845215,
      "step": 203
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.2265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 1037.267578125,
      "completions/mean_terminated_length": 741.1944580078125,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.06964239993172314,
      "frac_reward_zero_std": 0.46875,
      "grad_norm": 0.09079845289212818,
      "learning_rate": 6.928327645051194e-07,
      "loss": 0.0355,
      "num_tokens": 187936265.0,
      "reward": 0.16015625,
      "reward_std": 0.18469981849193573,
      "rewards/accuracy_reward/mean": 0.16015625,
      "rewards/accuracy_reward/std": 0.3671095669269562,
      "step": 204
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 1330.427734375,
      "completions/mean_terminated_length": 954.5565795898438,
      "completions/min_length": 37.0,
      "completions/min_terminated_length": 37.0,
      "epoch": 0.06998378424511394,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.056106399432478124,
      "learning_rate": 6.962457337883959e-07,
      "loss": 0.0009,
      "num_tokens": 188690836.0,
      "reward": 0.078125,
      "reward_std": 0.09891510009765625,
      "rewards/accuracy_reward/mean": 0.078125,
      "rewards/accuracy_reward/std": 0.26863065361976624,
      "step": 205
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.349609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1353.337890625,
      "completions/mean_terminated_length": 979.9309692382812,
      "completions/min_length": 31.0,
      "completions/min_terminated_length": 31.0,
      "epoch": 0.07032516855850474,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.06861247736470905,
      "learning_rate": 6.996587030716723e-07,
      "loss": 0.0265,
      "num_tokens": 189468993.0,
      "reward": 0.154296875,
      "reward_std": 0.18038935959339142,
      "rewards/accuracy_reward/mean": 0.154296875,
      "rewards/accuracy_reward/std": 0.36158639192581177,
      "step": 206
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.326171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2014.0,
      "completions/mean_length": 1290.751953125,
      "completions/mean_terminated_length": 924.2000122070312,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.07066655287189554,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 0.05447422615165889,
      "learning_rate": 7.030716723549488e-07,
      "loss": 0.001,
      "num_tokens": 190205250.0,
      "reward": 0.095703125,
      "reward_std": 0.09519492089748383,
      "rewards/accuracy_reward/mean": 0.095703125,
      "rewards/accuracy_reward/std": 0.2944713830947876,
      "step": 207
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.33203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1281.751953125,
      "completions/mean_terminated_length": 900.868408203125,
      "completions/min_length": 91.0,
      "completions/min_terminated_length": 91.0,
      "epoch": 0.07100793718528634,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.06166585256041533,
      "learning_rate": 7.064846416382251e-07,
      "loss": 0.0131,
      "num_tokens": 190934291.0,
      "reward": 0.1640625,
      "reward_std": 0.1321541965007782,
      "rewards/accuracy_reward/mean": 0.1640625,
      "rewards/accuracy_reward/std": 0.37069445848464966,
      "step": 208
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2021.0,
      "completions/mean_length": 1163.673828125,
      "completions/mean_terminated_length": 843.8111572265625,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.07134932149867713,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.06370224490183188,
      "learning_rate": 7.098976109215017e-07,
      "loss": 0.0071,
      "num_tokens": 191604732.0,
      "reward": 0.15625,
      "reward_std": 0.1473877727985382,
      "rewards/accuracy_reward/mean": 0.15625,
      "rewards/accuracy_reward/std": 0.36344730854034424,
      "step": 209
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.255859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2036.0,
      "completions/mean_length": 1182.1640625,
      "completions/mean_terminated_length": 884.4619140625,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "epoch": 0.07169070581206793,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.07316503857103573,
      "learning_rate": 7.133105802047781e-07,
      "loss": 0.0287,
      "num_tokens": 192279264.0,
      "reward": 0.078125,
      "reward_std": 0.13055649399757385,
      "rewards/accuracy_reward/mean": 0.08064515888690948,
      "rewards/accuracy_reward/std": 0.2725643217563629,
      "step": 210
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.275390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2007.0,
      "completions/mean_length": 1186.552734375,
      "completions/mean_terminated_length": 859.1563110351562,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.07203209012545873,
      "frac_reward_zero_std": 0.4375,
      "grad_norm": 0.0930444194257224,
      "learning_rate": 7.167235494880546e-07,
      "loss": -0.0049,
      "num_tokens": 192963819.0,
      "reward": 0.177734375,
      "reward_std": 0.21451082825660706,
      "rewards/accuracy_reward/mean": 0.177734375,
      "rewards/accuracy_reward/std": 0.3826628625392914,
      "step": 211
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.330078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1345.55859375,
      "completions/mean_terminated_length": 999.457763671875,
      "completions/min_length": 78.0,
      "completions/min_terminated_length": 78.0,
      "epoch": 0.07237347443884953,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.06156584796150671,
      "learning_rate": 7.201365187713311e-07,
      "loss": 0.0035,
      "num_tokens": 193734761.0,
      "reward": 0.10546875,
      "reward_std": 0.11740703880786896,
      "rewards/accuracy_reward/mean": 0.10546875,
      "rewards/accuracy_reward/std": 0.3074568510055542,
      "step": 212
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.50390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1544.287109375,
      "completions/mean_terminated_length": 1032.6417236328125,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.07271485875224033,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.06710965589022971,
      "learning_rate": 7.235494880546075e-07,
      "loss": 0.0127,
      "num_tokens": 194612476.0,
      "reward": 0.1640625,
      "reward_std": 0.1723763346672058,
      "rewards/accuracy_reward/mean": 0.1640625,
      "rewards/accuracy_reward/std": 0.37069445848464966,
      "step": 213
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.36328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1303.27734375,
      "completions/mean_terminated_length": 878.5091552734375,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.07305624306563113,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 0.04227260922021009,
      "learning_rate": 7.269624573378839e-07,
      "loss": 0.0083,
      "num_tokens": 195351818.0,
      "reward": 0.025390625,
      "reward_std": 0.06354551017284393,
      "rewards/accuracy_reward/mean": 0.026209676638245583,
      "rewards/accuracy_reward/std": 0.1599196493625641,
      "step": 214
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.392578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1416.05859375,
      "completions/mean_terminated_length": 1007.6334228515625,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.07339762737902193,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 0.04471491194159816,
      "learning_rate": 7.303754266211603e-07,
      "loss": 0.0054,
      "num_tokens": 196156232.0,
      "reward": 0.08984375,
      "reward_std": 0.08601218461990356,
      "rewards/accuracy_reward/mean": 0.08984375,
      "rewards/accuracy_reward/std": 0.2862374484539032,
      "step": 215
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.369140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1412.931640625,
      "completions/mean_terminated_length": 1041.328125,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.07373901169241273,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.05314749310020337,
      "learning_rate": 7.337883959044369e-07,
      "loss": -0.0014,
      "num_tokens": 196955029.0,
      "reward": 0.130859375,
      "reward_std": 0.1138748899102211,
      "rewards/accuracy_reward/mean": 0.130859375,
      "rewards/accuracy_reward/std": 0.33757632970809937,
      "step": 216
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1371.814453125,
      "completions/mean_terminated_length": 979.4598999023438,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.07408039600580353,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.06795948970036529,
      "learning_rate": 7.372013651877133e-07,
      "loss": 0.0203,
      "num_tokens": 197733670.0,
      "reward": 0.181640625,
      "reward_std": 0.14458514750003815,
      "rewards/accuracy_reward/mean": 0.181640625,
      "rewards/accuracy_reward/std": 0.38592514395713806,
      "step": 217
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1361.314453125,
      "completions/mean_terminated_length": 976.1005859375,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.07442178031919433,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.06342866930215545,
      "learning_rate": 7.406143344709898e-07,
      "loss": 0.0278,
      "num_tokens": 198505079.0,
      "reward": 0.119140625,
      "reward_std": 0.1513601392507553,
      "rewards/accuracy_reward/mean": 0.119140625,
      "rewards/accuracy_reward/std": 0.32427072525024414,
      "step": 218
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.38671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1344.80859375,
      "completions/mean_terminated_length": 901.5191040039062,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.07476316463258513,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.06039354300396888,
      "learning_rate": 7.440273037542661e-07,
      "loss": 0.0124,
      "num_tokens": 199284229.0,
      "reward": 0.125,
      "reward_std": 0.13164834678173065,
      "rewards/accuracy_reward/mean": 0.125,
      "rewards/accuracy_reward/std": 0.3310423493385315,
      "step": 219
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.37109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1403.40625,
      "completions/mean_terminated_length": 1023.055908203125,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.07510454894597593,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 0.05641142355994621,
      "learning_rate": 7.474402730375426e-07,
      "loss": 0.0158,
      "num_tokens": 200083925.0,
      "reward": 0.109375,
      "reward_std": 0.1270298957824707,
      "rewards/accuracy_reward/mean": 0.109375,
      "rewards/accuracy_reward/std": 0.31241437792778015,
      "step": 220
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1405.111328125,
      "completions/mean_terminated_length": 1068.360107421875,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.07544593325936673,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 0.060386715579833,
      "learning_rate": 7.50853242320819e-07,
      "loss": 0.0072,
      "num_tokens": 200876062.0,
      "reward": 0.15234375,
      "reward_std": 0.12851551175117493,
      "rewards/accuracy_reward/mean": 0.15234375,
      "rewards/accuracy_reward/std": 0.35970520973205566,
      "step": 221
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.330078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1339.912109375,
      "completions/mean_terminated_length": 991.0291748046875,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.07578731757275753,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.05638395971828098,
      "learning_rate": 7.542662116040955e-07,
      "loss": 0.0287,
      "num_tokens": 201640785.0,
      "reward": 0.14453125,
      "reward_std": 0.1186123713850975,
      "rewards/accuracy_reward/mean": 0.14453125,
      "rewards/accuracy_reward/std": 0.35197147727012634,
      "step": 222
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.322265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1306.1484375,
      "completions/mean_terminated_length": 953.394775390625,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.07612870188614833,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 0.07496764447714771,
      "learning_rate": 7.57679180887372e-07,
      "loss": 0.0193,
      "num_tokens": 202388093.0,
      "reward": 0.234375,
      "reward_std": 0.16398611664772034,
      "rewards/accuracy_reward/mean": 0.234375,
      "rewards/accuracy_reward/std": 0.42402184009552,
      "step": 223
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1326.798828125,
      "completions/mean_terminated_length": 974.5842895507812,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.07647008619953913,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.050537073261015864,
      "learning_rate": 7.610921501706485e-07,
      "loss": -0.0044,
      "num_tokens": 203144518.0,
      "reward": 0.0859375,
      "reward_std": 0.08450747281312943,
      "rewards/accuracy_reward/mean": 0.0859375,
      "rewards/accuracy_reward/std": 0.28054583072662354,
      "step": 224
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2042.0,
      "completions/mean_length": 1483.244140625,
      "completions/mean_terminated_length": 1015.3035888671875,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "epoch": 0.07681147051292993,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.059106606224482036,
      "learning_rate": 7.645051194539249e-07,
      "loss": 0.0143,
      "num_tokens": 203984851.0,
      "reward": 0.12890625,
      "reward_std": 0.1426628828048706,
      "rewards/accuracy_reward/mean": 0.12890625,
      "rewards/accuracy_reward/std": 0.33542385697364807,
      "step": 225
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1285.1796875,
      "completions/mean_terminated_length": 885.607177734375,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.07715285482632073,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.082351681932432,
      "learning_rate": 7.679180887372013e-07,
      "loss": 0.0119,
      "num_tokens": 204726367.0,
      "reward": 0.126953125,
      "reward_std": 0.14634227752685547,
      "rewards/accuracy_reward/mean": 0.13104838132858276,
      "rewards/accuracy_reward/std": 0.3377939760684967,
      "step": 226
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.380859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2042.0,
      "completions/mean_length": 1400.884765625,
      "completions/mean_terminated_length": 1002.8170776367188,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.07749423913971153,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.06143391851496004,
      "learning_rate": 7.713310580204778e-07,
      "loss": 0.0109,
      "num_tokens": 205522788.0,
      "reward": 0.115234375,
      "reward_std": 0.1425960510969162,
      "rewards/accuracy_reward/mean": 0.115234375,
      "rewards/accuracy_reward/std": 0.3196168541908264,
      "step": 227
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.384765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1389.75390625,
      "completions/mean_terminated_length": 978.0889282226562,
      "completions/min_length": 34.0,
      "completions/min_terminated_length": 34.0,
      "epoch": 0.07783562345310233,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.06857189924028634,
      "learning_rate": 7.747440273037542e-07,
      "loss": -0.0069,
      "num_tokens": 206305542.0,
      "reward": 0.119140625,
      "reward_std": 0.17003285884857178,
      "rewards/accuracy_reward/mean": 0.119140625,
      "rewards/accuracy_reward/std": 0.32427072525024414,
      "step": 228
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2031.0,
      "completions/mean_length": 1446.267578125,
      "completions/mean_terminated_length": 1060.5416259765625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.07817700776649313,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 0.06060487939431986,
      "learning_rate": 7.781569965870307e-07,
      "loss": 0.026,
      "num_tokens": 207130655.0,
      "reward": 0.185546875,
      "reward_std": 0.1637648046016693,
      "rewards/accuracy_reward/mean": 0.19153225421905518,
      "rewards/accuracy_reward/std": 0.3939041793346405,
      "step": 229
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2024.0,
      "completions/mean_length": 1443.86328125,
      "completions/mean_terminated_length": 1069.1456298828125,
      "completions/min_length": 90.0,
      "completions/min_terminated_length": 90.0,
      "epoch": 0.07851839207988393,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.04680176086940241,
      "learning_rate": 7.815699658703071e-07,
      "loss": 0.0128,
      "num_tokens": 207953817.0,
      "reward": 0.078125,
      "reward_std": 0.1085379421710968,
      "rewards/accuracy_reward/mean": 0.08064515888690948,
      "rewards/accuracy_reward/std": 0.2725643217563629,
      "step": 230
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.412109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1362.62890625,
      "completions/mean_terminated_length": 882.18603515625,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.07885977639327472,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.05815743683902764,
      "learning_rate": 7.849829351535836e-07,
      "loss": 0.0162,
      "num_tokens": 208727835.0,
      "reward": 0.134765625,
      "reward_std": 0.11187098175287247,
      "rewards/accuracy_reward/mean": 0.134765625,
      "rewards/accuracy_reward/std": 0.3418070077896118,
      "step": 231
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.318359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 1386.005859375,
      "completions/mean_terminated_length": 1076.8223876953125,
      "completions/min_length": 55.0,
      "completions/min_terminated_length": 55.0,
      "epoch": 0.07920116070666552,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 0.04254597491119421,
      "learning_rate": 7.8839590443686e-07,
      "loss": 0.0134,
      "num_tokens": 209517294.0,
      "reward": 0.08203125,
      "reward_std": 0.077679343521595,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 232
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.400390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1442.533203125,
      "completions/mean_terminated_length": 1038.231201171875,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "epoch": 0.07954254502005632,
      "frac_reward_zero_std": 0.53125,
      "grad_norm": 0.06810755401398516,
      "learning_rate": 7.918088737201365e-07,
      "loss": -0.001,
      "num_tokens": 210331695.0,
      "reward": 0.19921875,
      "reward_std": 0.20368652045726776,
      "rewards/accuracy_reward/mean": 0.19921875,
      "rewards/accuracy_reward/std": 0.39980348944664,
      "step": 233
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.443359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2026.0,
      "completions/mean_length": 1470.01171875,
      "completions/mean_terminated_length": 1009.649169921875,
      "completions/min_length": 49.0,
      "completions/min_terminated_length": 49.0,
      "epoch": 0.07988392933344712,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 0.03783645855051803,
      "learning_rate": 7.952218430034129e-07,
      "loss": 0.0089,
      "num_tokens": 211165221.0,
      "reward": 0.029296875,
      "reward_std": 0.059079915285110474,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 234
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.404296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1472.572265625,
      "completions/mean_terminated_length": 1082.0360107421875,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.08022531364683792,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 0.05560621980418969,
      "learning_rate": 7.986348122866893e-07,
      "loss": 0.0114,
      "num_tokens": 211991850.0,
      "reward": 0.142578125,
      "reward_std": 0.1544896960258484,
      "rewards/accuracy_reward/mean": 0.142578125,
      "rewards/accuracy_reward/std": 0.3499840497970581,
      "step": 235
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1395.498046875,
      "completions/mean_terminated_length": 1047.757568359375,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.08056669796022872,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.05202985810272377,
      "learning_rate": 8.020477815699659e-07,
      "loss": 0.0306,
      "num_tokens": 212780729.0,
      "reward": 0.16796875,
      "reward_std": 0.14055965840816498,
      "rewards/accuracy_reward/mean": 0.16796875,
      "rewards/accuracy_reward/std": 0.374204158782959,
      "step": 236
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.412109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1510.9765625,
      "completions/mean_terminated_length": 1134.52490234375,
      "completions/min_length": 66.0,
      "completions/min_terminated_length": 66.0,
      "epoch": 0.08090808227361952,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 0.05102068515338055,
      "learning_rate": 8.054607508532423e-07,
      "loss": 0.0064,
      "num_tokens": 213637437.0,
      "reward": 0.162109375,
      "reward_std": 0.13695251941680908,
      "rewards/accuracy_reward/mean": 0.162109375,
      "rewards/accuracy_reward/std": 0.3689115643501282,
      "step": 237
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2031.0,
      "completions/mean_length": 1426.62890625,
      "completions/mean_terminated_length": 927.7816772460938,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "epoch": 0.08124946658701032,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 0.04009850102602469,
      "learning_rate": 8.088737201365188e-07,
      "loss": 0.007,
      "num_tokens": 214441935.0,
      "reward": 0.09375,
      "reward_std": 0.07582925260066986,
      "rewards/accuracy_reward/mean": 0.09375,
      "rewards/accuracy_reward/std": 0.29176566004753113,
      "step": 238
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1509.158203125,
      "completions/mean_terminated_length": 1163.746826171875,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "epoch": 0.08159085090040112,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.07606589914072429,
      "learning_rate": 8.122866894197952e-07,
      "loss": 0.0301,
      "num_tokens": 215295296.0,
      "reward": 0.1875,
      "reward_std": 0.19631412625312805,
      "rewards/accuracy_reward/mean": 0.1875,
      "rewards/accuracy_reward/std": 0.39069411158561707,
      "step": 239
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.431640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1471.509765625,
      "completions/mean_terminated_length": 1033.6942138671875,
      "completions/min_length": 35.0,
      "completions/min_terminated_length": 35.0,
      "epoch": 0.08193223521379192,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.05453874474922659,
      "learning_rate": 8.156996587030717e-07,
      "loss": 0.0143,
      "num_tokens": 216122181.0,
      "reward": 0.1484375,
      "reward_std": 0.14860516786575317,
      "rewards/accuracy_reward/mean": 0.15322580933570862,
      "rewards/accuracy_reward/std": 0.36056873202323914,
      "step": 240
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2023.0,
      "completions/mean_length": 1247.712890625,
      "completions/mean_terminated_length": 813.822265625,
      "completions/min_length": 37.0,
      "completions/min_terminated_length": 37.0,
      "epoch": 0.08227361952718272,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.06660764839208902,
      "learning_rate": 8.19112627986348e-07,
      "loss": -0.0137,
      "num_tokens": 216834370.0,
      "reward": 0.15625,
      "reward_std": 0.12367744743824005,
      "rewards/accuracy_reward/mean": 0.15625,
      "rewards/accuracy_reward/std": 0.36344730854034424,
      "step": 241
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1350.517578125,
      "completions/mean_terminated_length": 985.169677734375,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.08261500384057352,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.06341202553911253,
      "learning_rate": 8.225255972696245e-07,
      "loss": 0.0159,
      "num_tokens": 217604331.0,
      "reward": 0.1171875,
      "reward_std": 0.10480518639087677,
      "rewards/accuracy_reward/mean": 0.1171875,
      "rewards/accuracy_reward/std": 0.32195815443992615,
      "step": 242
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 1428.5546875,
      "completions/mean_terminated_length": 1044.341796875,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "epoch": 0.08295638815396432,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.06168287426030444,
      "learning_rate": 8.259385665529009e-07,
      "loss": 0.0228,
      "num_tokens": 218415223.0,
      "reward": 0.13671875,
      "reward_std": 0.1691007912158966,
      "rewards/accuracy_reward/mean": 0.13671875,
      "rewards/accuracy_reward/std": 0.3438861668109894,
      "step": 243
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.408203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1517.041015625,
      "completions/mean_terminated_length": 1150.802001953125,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.08329777246735512,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 0.061142768295055536,
      "learning_rate": 8.293515358361775e-07,
      "loss": 0.0086,
      "num_tokens": 219270588.0,
      "reward": 0.12890625,
      "reward_std": 0.1570894569158554,
      "rewards/accuracy_reward/mean": 0.12890625,
      "rewards/accuracy_reward/std": 0.33542385697364807,
      "step": 244
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 1333.083984375,
      "completions/mean_terminated_length": 918.2561645507812,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.08363915678074592,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.05292730748114597,
      "learning_rate": 8.327645051194539e-07,
      "loss": 0.0052,
      "num_tokens": 220027383.0,
      "reward": 0.103515625,
      "reward_std": 0.10779774188995361,
      "rewards/accuracy_reward/mean": 0.103515625,
      "rewards/accuracy_reward/std": 0.30492907762527466,
      "step": 245
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2016.0,
      "completions/mean_length": 1571.427734375,
      "completions/mean_terminated_length": 1079.7261962890625,
      "completions/min_length": 31.0,
      "completions/min_terminated_length": 31.0,
      "epoch": 0.08398054109413672,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.042786488185753105,
      "learning_rate": 8.361774744027303e-07,
      "loss": 0.0056,
      "num_tokens": 220911106.0,
      "reward": 0.078125,
      "reward_std": 0.09244653582572937,
      "rewards/accuracy_reward/mean": 0.078125,
      "rewards/accuracy_reward/std": 0.26863065361976624,
      "step": 246
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.392578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1447.17578125,
      "completions/mean_terminated_length": 1058.8616943359375,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.08432192540752752,
      "frac_reward_zero_std": 0.46875,
      "grad_norm": 0.08304467300965157,
      "learning_rate": 8.395904436860067e-07,
      "loss": 0.0114,
      "num_tokens": 221733852.0,
      "reward": 0.17578125,
      "reward_std": 0.19873401522636414,
      "rewards/accuracy_reward/mean": 0.17578125,
      "rewards/accuracy_reward/std": 0.3810062110424042,
      "step": 247
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.478515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 1502.107421875,
      "completions/mean_terminated_length": 1001.1947631835938,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.08466330972091832,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.041970010222907216,
      "learning_rate": 8.430034129692832e-07,
      "loss": 0.0051,
      "num_tokens": 222584451.0,
      "reward": 0.12109375,
      "reward_std": 0.09122256934642792,
      "rewards/accuracy_reward/mean": 0.12109375,
      "rewards/accuracy_reward/std": 0.3265552520751953,
      "step": 248
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1499.66796875,
      "completions/mean_terminated_length": 1030.8043212890625,
      "completions/min_length": 64.0,
      "completions/min_terminated_length": 64.0,
      "epoch": 0.08500469403430912,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.04845680798453503,
      "learning_rate": 8.464163822525597e-07,
      "loss": 0.0139,
      "num_tokens": 223433161.0,
      "reward": 0.08984375,
      "reward_std": 0.10970106720924377,
      "rewards/accuracy_reward/mean": 0.0927419364452362,
      "rewards/accuracy_reward/std": 0.2903633117675781,
      "step": 249
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.373046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1382.42578125,
      "completions/mean_terminated_length": 986.3987426757812,
      "completions/min_length": 33.0,
      "completions/min_terminated_length": 33.0,
      "epoch": 0.08534607834769992,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.05890408312330982,
      "learning_rate": 8.498293515358362e-07,
      "loss": 0.0045,
      "num_tokens": 224213107.0,
      "reward": 0.138671875,
      "reward_std": 0.13934782147407532,
      "rewards/accuracy_reward/mean": 0.14791665971279144,
      "rewards/accuracy_reward/std": 0.35538771748542786,
      "step": 250
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1559.04296875,
      "completions/mean_terminated_length": 1166.5,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.08568746266109073,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.04489745088402795,
      "learning_rate": 8.532423208191127e-07,
      "loss": 0.0038,
      "num_tokens": 225086889.0,
      "reward": 0.13671875,
      "reward_std": 0.13031890988349915,
      "rewards/accuracy_reward/mean": 0.13671875,
      "rewards/accuracy_reward/std": 0.3438861668109894,
      "step": 251
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1456.076171875,
      "completions/mean_terminated_length": 1037.9300537109375,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.08602884697448153,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.040106339863035766,
      "learning_rate": 8.56655290102389e-07,
      "loss": -0.0013,
      "num_tokens": 225919072.0,
      "reward": 0.041015625,
      "reward_std": 0.09016455709934235,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 252
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.501953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1567.36328125,
      "completions/mean_terminated_length": 1082.9569091796875,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.08637023128787233,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.04487838386480456,
      "learning_rate": 8.600682593856655e-07,
      "loss": -0.0016,
      "num_tokens": 226810394.0,
      "reward": 0.080078125,
      "reward_std": 0.08642987906932831,
      "rewards/accuracy_reward/mean": 0.080078125,
      "rewards/accuracy_reward/std": 0.271679550409317,
      "step": 253
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.443359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 1479.53125,
      "completions/mean_terminated_length": 1026.7508544921875,
      "completions/min_length": 78.0,
      "completions/min_terminated_length": 78.0,
      "epoch": 0.08671161560126313,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.061708670608455446,
      "learning_rate": 8.634812286689419e-07,
      "loss": 0.0066,
      "num_tokens": 227636954.0,
      "reward": 0.240234375,
      "reward_std": 0.1975778490304947,
      "rewards/accuracy_reward/mean": 0.240234375,
      "rewards/accuracy_reward/std": 0.4276435375213623,
      "step": 254
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2027.0,
      "completions/mean_length": 1446.666015625,
      "completions/mean_terminated_length": 1041.846435546875,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.08705299991465393,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 0.059667980541240034,
      "learning_rate": 8.668941979522184e-07,
      "loss": 0.0097,
      "num_tokens": 228461391.0,
      "reward": 0.123046875,
      "reward_std": 0.14497441053390503,
      "rewards/accuracy_reward/mean": 0.1270161271095276,
      "rewards/accuracy_reward/std": 0.3333272337913513,
      "step": 255
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.447265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1510.134765625,
      "completions/mean_terminated_length": 1074.901123046875,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.08739438422804473,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 0.03833520472893546,
      "learning_rate": 8.703071672354948e-07,
      "loss": 0.0091,
      "num_tokens": 229314084.0,
      "reward": 0.12890625,
      "reward_std": 0.08792105317115784,
      "rewards/accuracy_reward/mean": 0.12890625,
      "rewards/accuracy_reward/std": 0.33542385697364807,
      "step": 256
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 1529.36328125,
      "completions/mean_terminated_length": 1174.506591796875,
      "completions/min_length": 259.0,
      "completions/min_terminated_length": 259.0,
      "epoch": 0.08773576854143553,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.06010643865404402,
      "learning_rate": 8.737201365187713e-07,
      "loss": 0.0004,
      "num_tokens": 230172318.0,
      "reward": 0.193359375,
      "reward_std": 0.16956712305545807,
      "rewards/accuracy_reward/mean": 0.193359375,
      "rewards/accuracy_reward/std": 0.39531853795051575,
      "step": 257
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1643.654296875,
      "completions/mean_terminated_length": 1263.814453125,
      "completions/min_length": 250.0,
      "completions/min_terminated_length": 250.0,
      "epoch": 0.08807715285482633,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.03753888562273841,
      "learning_rate": 8.771331058020477e-07,
      "loss": 0.0119,
      "num_tokens": 231082845.0,
      "reward": 0.10546875,
      "reward_std": 0.11004265397787094,
      "rewards/accuracy_reward/mean": 0.10546875,
      "rewards/accuracy_reward/std": 0.3074568510055542,
      "step": 258
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.505859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1573.89453125,
      "completions/mean_terminated_length": 1088.5455322265625,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "epoch": 0.08841853716821713,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.04991002387235712,
      "learning_rate": 8.805460750853242e-07,
      "loss": 0.001,
      "num_tokens": 231963111.0,
      "reward": 0.115234375,
      "reward_std": 0.13447695970535278,
      "rewards/accuracy_reward/mean": 0.115234375,
      "rewards/accuracy_reward/std": 0.3196168541908264,
      "step": 259
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.482421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1534.505859375,
      "completions/mean_terminated_length": 1055.890625,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.08875992148160793,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.04476212872883397,
      "learning_rate": 8.839590443686007e-07,
      "loss": 0.0078,
      "num_tokens": 232828042.0,
      "reward": 0.03515625,
      "reward_std": 0.08493967354297638,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 260
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.50390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2038.0,
      "completions/mean_length": 1550.263671875,
      "completions/mean_terminated_length": 1044.68896484375,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.08910130579499873,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.05165846026565672,
      "learning_rate": 8.873720136518771e-07,
      "loss": -0.0062,
      "num_tokens": 233695969.0,
      "reward": 0.087890625,
      "reward_std": 0.10202304273843765,
      "rewards/accuracy_reward/mean": 0.087890625,
      "rewards/accuracy_reward/std": 0.2834126651287079,
      "step": 261
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.353515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2021.0,
      "completions/mean_length": 1383.859375,
      "completions/mean_terminated_length": 1020.6888427734375,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.08944269010838952,
      "frac_reward_zero_std": 0.53125,
      "grad_norm": 0.07046557420444258,
      "learning_rate": 8.907849829351535e-07,
      "loss": 0.039,
      "num_tokens": 234479433.0,
      "reward": 0.18359375,
      "reward_std": 0.18688803911209106,
      "rewards/accuracy_reward/mean": 0.18359375,
      "rewards/accuracy_reward/std": 0.3875311613082886,
      "step": 262
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.400390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1436.2109375,
      "completions/mean_terminated_length": 1027.687255859375,
      "completions/min_length": 42.0,
      "completions/min_terminated_length": 42.0,
      "epoch": 0.08978407442178032,
      "frac_reward_zero_std": 0.53125,
      "grad_norm": 0.05845658105319272,
      "learning_rate": 8.941979522184299e-07,
      "loss": 0.0247,
      "num_tokens": 235288549.0,
      "reward": 0.18359375,
      "reward_std": 0.18581107258796692,
      "rewards/accuracy_reward/mean": 0.18359375,
      "rewards/accuracy_reward/std": 0.3875311613082886,
      "step": 263
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.482421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1513.70703125,
      "completions/mean_terminated_length": 1015.7056884765625,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.09012545873517112,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.05252347749924365,
      "learning_rate": 8.976109215017065e-07,
      "loss": -0.013,
      "num_tokens": 236147135.0,
      "reward": 0.08984375,
      "reward_std": 0.09319227933883667,
      "rewards/accuracy_reward/mean": 0.08984375,
      "rewards/accuracy_reward/std": 0.2862374484539032,
      "step": 264
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.560546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1599.451171875,
      "completions/mean_terminated_length": 1027.30224609375,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.09046684304856192,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 0.056581265366370447,
      "learning_rate": 9.010238907849829e-07,
      "loss": 0.0214,
      "num_tokens": 237047766.0,
      "reward": 0.1171875,
      "reward_std": 0.12962549924850464,
      "rewards/accuracy_reward/mean": 0.1171875,
      "rewards/accuracy_reward/std": 0.32195815443992615,
      "step": 265
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.37890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1544.4765625,
      "completions/mean_terminated_length": 1237.2955322265625,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "epoch": 0.09080822736195272,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.05766085684615005,
      "learning_rate": 9.044368600682594e-07,
      "loss": 0.0043,
      "num_tokens": 237913914.0,
      "reward": 0.15234375,
      "reward_std": 0.17622213065624237,
      "rewards/accuracy_reward/mean": 0.15234375,
      "rewards/accuracy_reward/std": 0.35970520973205566,
      "step": 266
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1478.552734375,
      "completions/mean_terminated_length": 1088.930908203125,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "epoch": 0.09114961167534352,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 0.06155848409463368,
      "learning_rate": 9.078498293515358e-07,
      "loss": 0.0064,
      "num_tokens": 238749141.0,
      "reward": 0.216796875,
      "reward_std": 0.15439340472221375,
      "rewards/accuracy_reward/mean": 0.216796875,
      "rewards/accuracy_reward/std": 0.4124660789966583,
      "step": 267
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.509765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1545.076171875,
      "completions/mean_terminated_length": 1022.1155395507812,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.09149099598873432,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 0.060196660541649086,
      "learning_rate": 9.112627986348122e-07,
      "loss": 0.0132,
      "num_tokens": 239616972.0,
      "reward": 0.138671875,
      "reward_std": 0.12972497940063477,
      "rewards/accuracy_reward/mean": 0.138671875,
      "rewards/accuracy_reward/std": 0.34594178199768066,
      "step": 268
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2042.0,
      "completions/mean_length": 1612.6328125,
      "completions/mean_terminated_length": 1134.4425048828125,
      "completions/min_length": 235.0,
      "completions/min_terminated_length": 235.0,
      "epoch": 0.09183238030212512,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.05984180948321153,
      "learning_rate": 9.146757679180886e-07,
      "loss": 0.03,
      "num_tokens": 240515408.0,
      "reward": 0.1640625,
      "reward_std": 0.17900297045707703,
      "rewards/accuracy_reward/mean": 0.1640625,
      "rewards/accuracy_reward/std": 0.37069445848464966,
      "step": 269
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.48046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1497.748046875,
      "completions/mean_terminated_length": 988.8684692382812,
      "completions/min_length": 44.0,
      "completions/min_terminated_length": 44.0,
      "epoch": 0.09217376461551592,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.056143753697885226,
      "learning_rate": 9.180887372013651e-07,
      "loss": 0.0178,
      "num_tokens": 241353615.0,
      "reward": 0.177734375,
      "reward_std": 0.14117157459259033,
      "rewards/accuracy_reward/mean": 0.177734375,
      "rewards/accuracy_reward/std": 0.3826628625392914,
      "step": 270
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.45703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2036.0,
      "completions/mean_length": 1521.458984375,
      "completions/mean_terminated_length": 1078.25537109375,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.09251514892890672,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 0.05255470997158037,
      "learning_rate": 9.215017064846417e-07,
      "loss": 0.0166,
      "num_tokens": 242206522.0,
      "reward": 0.177734375,
      "reward_std": 0.13407659530639648,
      "rewards/accuracy_reward/mean": 0.177734375,
      "rewards/accuracy_reward/std": 0.3826628625392914,
      "step": 271
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.43359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1488.119140625,
      "completions/mean_terminated_length": 1059.5206298828125,
      "completions/min_length": 34.0,
      "completions/min_terminated_length": 34.0,
      "epoch": 0.09285653324229752,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.05285557067370238,
      "learning_rate": 9.249146757679181e-07,
      "loss": 0.0075,
      "num_tokens": 243042535.0,
      "reward": 0.14453125,
      "reward_std": 0.1233629435300827,
      "rewards/accuracy_reward/mean": 0.14453125,
      "rewards/accuracy_reward/std": 0.35197147727012634,
      "step": 272
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.51171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 1511.04296875,
      "completions/mean_terminated_length": 948.3120727539062,
      "completions/min_length": 30.0,
      "completions/min_terminated_length": 30.0,
      "epoch": 0.09319791755568832,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 0.05525082051472725,
      "learning_rate": 9.283276450511945e-07,
      "loss": 0.0243,
      "num_tokens": 243894253.0,
      "reward": 0.189453125,
      "reward_std": 0.1722542941570282,
      "rewards/accuracy_reward/mean": 0.189453125,
      "rewards/accuracy_reward/std": 0.3922513723373413,
      "step": 273
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.48046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1517.673828125,
      "completions/mean_terminated_length": 1027.2218017578125,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "epoch": 0.09353930186907912,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.04270737137810146,
      "learning_rate": 9.317406143344709e-07,
      "loss": 0.0083,
      "num_tokens": 244742678.0,
      "reward": 0.080078125,
      "reward_std": 0.10896982997655869,
      "rewards/accuracy_reward/mean": 0.08266129344701767,
      "rewards/accuracy_reward/std": 0.2756476104259491,
      "step": 274
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.41796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 1423.15625,
      "completions/mean_terminated_length": 974.4429321289062,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "epoch": 0.09388068618246992,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.04934473310328268,
      "learning_rate": 9.351535836177474e-07,
      "loss": 0.0129,
      "num_tokens": 245554390.0,
      "reward": 0.126953125,
      "reward_std": 0.1299196481704712,
      "rewards/accuracy_reward/mean": 0.13104838132858276,
      "rewards/accuracy_reward/std": 0.3377939760684967,
      "step": 275
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.419921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1535.0546875,
      "completions/mean_terminated_length": 1163.7305908203125,
      "completions/min_length": 50.0,
      "completions/min_terminated_length": 50.0,
      "epoch": 0.09422207049586072,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 0.05899691461891417,
      "learning_rate": 9.385665529010238e-07,
      "loss": 0.0258,
      "num_tokens": 246412674.0,
      "reward": 0.255859375,
      "reward_std": 0.1746368557214737,
      "rewards/accuracy_reward/mean": 0.255859375,
      "rewards/accuracy_reward/std": 0.43676990270614624,
      "step": 276
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.466796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1492.34375,
      "completions/mean_terminated_length": 1005.89013671875,
      "completions/min_length": 36.0,
      "completions/min_terminated_length": 36.0,
      "epoch": 0.09456345480925152,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.048357678495694646,
      "learning_rate": 9.419795221843004e-07,
      "loss": 0.0141,
      "num_tokens": 247251074.0,
      "reward": 0.134765625,
      "reward_std": 0.12556564807891846,
      "rewards/accuracy_reward/mean": 0.1391129046678543,
      "rewards/accuracy_reward/std": 0.34641367197036743,
      "step": 277
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.544921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1628.275390625,
      "completions/mean_terminated_length": 1125.6866455078125,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "epoch": 0.09490483912264232,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.0517344088762816,
      "learning_rate": 9.453924914675768e-07,
      "loss": 0.0079,
      "num_tokens": 248160703.0,
      "reward": 0.162109375,
      "reward_std": 0.15084770321846008,
      "rewards/accuracy_reward/mean": 0.162109375,
      "rewards/accuracy_reward/std": 0.3689115643501282,
      "step": 278
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.587890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1684.41796875,
      "completions/mean_terminated_length": 1165.9573974609375,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.09524622343603312,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.04413058747712155,
      "learning_rate": 9.488054607508532e-07,
      "loss": 0.0103,
      "num_tokens": 249110565.0,
      "reward": 0.111328125,
      "reward_std": 0.11575192213058472,
      "rewards/accuracy_reward/mean": 0.111328125,
      "rewards/accuracy_reward/std": 0.31484565138816833,
      "step": 279
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.48046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1564.01953125,
      "completions/mean_terminated_length": 1116.4285888671875,
      "completions/min_length": 193.0,
      "completions/min_terminated_length": 193.0,
      "epoch": 0.09558760774942392,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.04799554240173732,
      "learning_rate": 9.522184300341296e-07,
      "loss": 0.0057,
      "num_tokens": 249980143.0,
      "reward": 0.08203125,
      "reward_std": 0.0969250351190567,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 280
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.541015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1567.552734375,
      "completions/mean_terminated_length": 1001.23828125,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.09592899206281472,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 0.04377724093945442,
      "learning_rate": 9.556313993174062e-07,
      "loss": 0.0043,
      "num_tokens": 250857450.0,
      "reward": 0.13671875,
      "reward_std": 0.1284025013446808,
      "rewards/accuracy_reward/mean": 0.13671875,
      "rewards/accuracy_reward/std": 0.3438861668109894,
      "step": 281
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.470703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1486.837890625,
      "completions/mean_terminated_length": 987.7970581054688,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.09627037637620552,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.039757610300643106,
      "learning_rate": 9.590443686006826e-07,
      "loss": 0.0124,
      "num_tokens": 251689239.0,
      "reward": 0.158203125,
      "reward_std": 0.10262156277894974,
      "rewards/accuracy_reward/mean": 0.158203125,
      "rewards/accuracy_reward/std": 0.36528825759887695,
      "step": 282
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.560546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1668.14453125,
      "completions/mean_terminated_length": 1183.6177978515625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.09661176068959632,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.03951743461345432,
      "learning_rate": 9.62457337883959e-07,
      "loss": 0.0101,
      "num_tokens": 252631617.0,
      "reward": 0.080078125,
      "reward_std": 0.1160123199224472,
      "rewards/accuracy_reward/mean": 0.080078125,
      "rewards/accuracy_reward/std": 0.271679550409317,
      "step": 283
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.521484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 1545.779296875,
      "completions/mean_terminated_length": 998.461181640625,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.09695314500298712,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 0.05704696957724095,
      "learning_rate": 9.658703071672355e-07,
      "loss": 0.0288,
      "num_tokens": 253491488.0,
      "reward": 0.1640625,
      "reward_std": 0.14701546728610992,
      "rewards/accuracy_reward/mean": 0.1640625,
      "rewards/accuracy_reward/std": 0.37069445848464966,
      "step": 284
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.587890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 1680.263671875,
      "completions/mean_terminated_length": 1155.6729736328125,
      "completions/min_length": 83.0,
      "completions/min_terminated_length": 83.0,
      "epoch": 0.09729452931637791,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.0690785121234627,
      "learning_rate": 9.69283276450512e-07,
      "loss": 0.022,
      "num_tokens": 254435175.0,
      "reward": 0.177734375,
      "reward_std": 0.19123506546020508,
      "rewards/accuracy_reward/mean": 0.177734375,
      "rewards/accuracy_reward/std": 0.3826628625392914,
      "step": 285
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.564453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 1595.998046875,
      "completions/mean_terminated_length": 1010.2197875976562,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.09763591362976871,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.04632495698952849,
      "learning_rate": 9.726962457337883e-07,
      "loss": 0.0206,
      "num_tokens": 255331670.0,
      "reward": 0.1640625,
      "reward_std": 0.12637948989868164,
      "rewards/accuracy_reward/mean": 0.1640625,
      "rewards/accuracy_reward/std": 0.37069445848464966,
      "step": 286
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.52734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2021.0,
      "completions/mean_length": 1582.466796875,
      "completions/mean_terminated_length": 1063.0701904296875,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "epoch": 0.09797729794315951,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.04924463534046396,
      "learning_rate": 9.761092150170647e-07,
      "loss": 0.015,
      "num_tokens": 256228805.0,
      "reward": 0.078125,
      "reward_std": 0.10144393146038055,
      "rewards/accuracy_reward/mean": 0.078125,
      "rewards/accuracy_reward/std": 0.26863065361976624,
      "step": 287
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.521484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1610.263671875,
      "completions/mean_terminated_length": 1133.2203369140625,
      "completions/min_length": 89.0,
      "completions/min_terminated_length": 89.0,
      "epoch": 0.09831868225655031,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.040754809714969024,
      "learning_rate": 9.795221843003413e-07,
      "loss": 0.0043,
      "num_tokens": 257134572.0,
      "reward": 0.083984375,
      "reward_std": 0.1062953919172287,
      "rewards/accuracy_reward/mean": 0.083984375,
      "rewards/accuracy_reward/std": 0.2776356339454651,
      "step": 288
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.626953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 1694.11328125,
      "completions/mean_terminated_length": 1099.361328125,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.09866006656994111,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.04639383445604393,
      "learning_rate": 9.829351535836176e-07,
      "loss": 0.0024,
      "num_tokens": 258079670.0,
      "reward": 0.142578125,
      "reward_std": 0.150249183177948,
      "rewards/accuracy_reward/mean": 0.142578125,
      "rewards/accuracy_reward/std": 0.3499840497970581,
      "step": 289
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.521484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1573.62890625,
      "completions/mean_terminated_length": 1056.6611328125,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.09900145088333191,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.04219558774292927,
      "learning_rate": 9.863481228668942e-07,
      "loss": -0.0027,
      "num_tokens": 258961128.0,
      "reward": 0.119140625,
      "reward_std": 0.08941882103681564,
      "rewards/accuracy_reward/mean": 0.119140625,
      "rewards/accuracy_reward/std": 0.32427072525024414,
      "step": 290
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1706.4921875,
      "completions/mean_terminated_length": 1238.5,
      "completions/min_length": 285.0,
      "completions/min_terminated_length": 285.0,
      "epoch": 0.09934283519672271,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.040709983969185425,
      "learning_rate": 9.897610921501706e-07,
      "loss": 0.0203,
      "num_tokens": 259912292.0,
      "reward": 0.16015625,
      "reward_std": 0.09666463732719421,
      "rewards/accuracy_reward/mean": 0.16015625,
      "rewards/accuracy_reward/std": 0.3671095669269562,
      "step": 291
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.57421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1612.537109375,
      "completions/mean_terminated_length": 1025.261474609375,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.09968421951011351,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.04276884810934766,
      "learning_rate": 9.93174061433447e-07,
      "loss": 0.025,
      "num_tokens": 260813431.0,
      "reward": 0.1015625,
      "reward_std": 0.10107709467411041,
      "rewards/accuracy_reward/mean": 0.1015625,
      "rewards/accuracy_reward/std": 0.30236753821372986,
      "step": 292
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.52734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1566.828125,
      "completions/mean_terminated_length": 1029.9833984375,
      "completions/min_length": 241.0,
      "completions/min_terminated_length": 241.0,
      "epoch": 0.10002560382350431,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.0550766362489316,
      "learning_rate": 9.965870307167234e-07,
      "loss": 0.0223,
      "num_tokens": 261691679.0,
      "reward": 0.166015625,
      "reward_std": 0.16999930143356323,
      "rewards/accuracy_reward/mean": 0.166015625,
      "rewards/accuracy_reward/std": 0.3724585771560669,
      "step": 293
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1609.408203125,
      "completions/mean_terminated_length": 1156.8929443359375,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "epoch": 0.10036698813689511,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 0.06305400424151338,
      "learning_rate": 1e-06,
      "loss": 0.0196,
      "num_tokens": 262592992.0,
      "reward": 0.109375,
      "reward_std": 0.14442548155784607,
      "rewards/accuracy_reward/mean": 0.109375,
      "rewards/accuracy_reward/std": 0.31241437792778015,
      "step": 294
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.59375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1683.541015625,
      "completions/mean_terminated_length": 1150.8702392578125,
      "completions/min_length": 210.0,
      "completions/min_terminated_length": 210.0,
      "epoch": 0.10070837245028591,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 0.03685004347579364,
      "learning_rate": 9.99999680653653e-07,
      "loss": 0.0112,
      "num_tokens": 263528741.0,
      "reward": 0.103515625,
      "reward_std": 0.10834437608718872,
      "rewards/accuracy_reward/mean": 0.103515625,
      "rewards/accuracy_reward/std": 0.30492907762527466,
      "step": 295
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.470703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2035.0,
      "completions/mean_length": 1464.427734375,
      "completions/mean_terminated_length": 945.4575805664062,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.10104975676367671,
      "frac_reward_zero_std": 0.53125,
      "grad_norm": 0.06555009323554643,
      "learning_rate": 9.999987226150655e-07,
      "loss": 0.0293,
      "num_tokens": 264350288.0,
      "reward": 0.18359375,
      "reward_std": 0.18672311305999756,
      "rewards/accuracy_reward/mean": 0.18359375,
      "rewards/accuracy_reward/std": 0.3875311613082886,
      "step": 296
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.666015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1743.46875,
      "completions/mean_terminated_length": 1136.1871337890625,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.10139114107706751,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.05346895227262486,
      "learning_rate": 9.99997125885597e-07,
      "loss": 0.0148,
      "num_tokens": 265320928.0,
      "reward": 0.125,
      "reward_std": 0.1546129435300827,
      "rewards/accuracy_reward/mean": 0.125,
      "rewards/accuracy_reward/std": 0.3310423493385315,
      "step": 297
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.615234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2042.0,
      "completions/mean_length": 1677.2578125,
      "completions/mean_terminated_length": 1084.4466552734375,
      "completions/min_length": 51.0,
      "completions/min_terminated_length": 51.0,
      "epoch": 0.10173252539045831,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 0.03189958739910039,
      "learning_rate": 9.99994890467514e-07,
      "loss": 0.0066,
      "num_tokens": 266253252.0,
      "reward": 0.107421875,
      "reward_std": 0.07014618813991547,
      "rewards/accuracy_reward/mean": 0.107421875,
      "rewards/accuracy_reward/std": 0.30995169281959534,
      "step": 298
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.607421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2035.0,
      "completions/mean_length": 1679.44140625,
      "completions/mean_terminated_length": 1109.18408203125,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "epoch": 0.10207390970384911,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.06666519777262402,
      "learning_rate": 9.999920163639891e-07,
      "loss": 0.0149,
      "num_tokens": 267193526.0,
      "reward": 0.08984375,
      "reward_std": 0.12279174476861954,
      "rewards/accuracy_reward/mean": 0.08984375,
      "rewards/accuracy_reward/std": 0.2862374484539032,
      "step": 299
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.603515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1647.5546875,
      "completions/mean_terminated_length": 1038.0098876953125,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.10241529401723991,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.044562918221771934,
      "learning_rate": 9.999885035791019e-07,
      "loss": 0.0195,
      "num_tokens": 268118130.0,
      "reward": 0.08203125,
      "reward_std": 0.09946031868457794,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 300
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.572265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1697.453125,
      "completions/mean_terminated_length": 1228.45654296875,
      "completions/min_length": 200.0,
      "completions/min_terminated_length": 200.0,
      "epoch": 0.10275667833063071,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.05308401775817464,
      "learning_rate": 9.999843521178375e-07,
      "loss": 0.0152,
      "num_tokens": 269061674.0,
      "reward": 0.171875,
      "reward_std": 0.1400686800479889,
      "rewards/accuracy_reward/mean": 0.171875,
      "rewards/accuracy_reward/std": 0.3776407241821289,
      "step": 301
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2027.0,
      "completions/mean_length": 1699.3046875,
      "completions/mean_terminated_length": 1033.6136474609375,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "epoch": 0.10309806264402151,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.04676814457056771,
      "learning_rate": 9.99979561986089e-07,
      "loss": 0.0067,
      "num_tokens": 270009606.0,
      "reward": 0.12890625,
      "reward_std": 0.10876302421092987,
      "rewards/accuracy_reward/mean": 0.12890625,
      "rewards/accuracy_reward/std": 0.33542385697364807,
      "step": 302
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.68359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1779.71875,
      "completions/mean_terminated_length": 1200.4197998046875,
      "completions/min_length": 71.0,
      "completions/min_terminated_length": 71.0,
      "epoch": 0.1034394469574123,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.044440036096316506,
      "learning_rate": 9.999741331906542e-07,
      "loss": 0.013,
      "num_tokens": 271005766.0,
      "reward": 0.1171875,
      "reward_std": 0.11595635861158371,
      "rewards/accuracy_reward/mean": 0.1171875,
      "rewards/accuracy_reward/std": 0.32195815443992615,
      "step": 303
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1739.099609375,
      "completions/mean_terminated_length": 1139.0517578125,
      "completions/min_length": 60.0,
      "completions/min_terminated_length": 60.0,
      "epoch": 0.1037808312708031,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.050629170052133174,
      "learning_rate": 9.99968065739239e-07,
      "loss": 0.0179,
      "num_tokens": 271972297.0,
      "reward": 0.123046875,
      "reward_std": 0.08224457502365112,
      "rewards/accuracy_reward/mean": 0.123046875,
      "rewards/accuracy_reward/std": 0.32881227135658264,
      "step": 304
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.685546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2015.0,
      "completions/mean_length": 1762.671875,
      "completions/mean_terminated_length": 1140.62109375,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "epoch": 0.1041222155841939,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.047535662566425926,
      "learning_rate": 9.999613596404544e-07,
      "loss": 0.0178,
      "num_tokens": 272948241.0,
      "reward": 0.146484375,
      "reward_std": 0.11751356720924377,
      "rewards/accuracy_reward/mean": 0.146484375,
      "rewards/accuracy_reward/std": 0.35393697023391724,
      "step": 305
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1655.11328125,
      "completions/mean_terminated_length": 878.8488159179688,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "epoch": 0.1044635998975847,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 0.058714538928285265,
      "learning_rate": 9.999540149038193e-07,
      "loss": 0.0141,
      "num_tokens": 273879083.0,
      "reward": 0.171875,
      "reward_std": 0.14168405532836914,
      "rewards/accuracy_reward/mean": 0.171875,
      "rewards/accuracy_reward/std": 0.3776407241821289,
      "step": 306
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.537109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 1527.306640625,
      "completions/mean_terminated_length": 923.1265258789062,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "epoch": 0.1048049842109755,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.05408849262602303,
      "learning_rate": 9.999460315397577e-07,
      "loss": 0.0174,
      "num_tokens": 274733336.0,
      "reward": 0.08984375,
      "reward_std": 0.10480421781539917,
      "rewards/accuracy_reward/mean": 0.09583333134651184,
      "rewards/accuracy_reward/std": 0.29466965794563293,
      "step": 307
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1725.140625,
      "completions/mean_terminated_length": 1108.772705078125,
      "completions/min_length": 30.0,
      "completions/min_terminated_length": 30.0,
      "epoch": 0.1051463685243663,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.04431010932684938,
      "learning_rate": 9.999374095596004e-07,
      "loss": 0.0085,
      "num_tokens": 275696576.0,
      "reward": 0.12109375,
      "reward_std": 0.1114048957824707,
      "rewards/accuracy_reward/mean": 0.12109375,
      "rewards/accuracy_reward/std": 0.3265552520751953,
      "step": 308
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.517578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2001.0,
      "completions/mean_length": 1502.541015625,
      "completions/mean_terminated_length": 917.33203125,
      "completions/min_length": 35.0,
      "completions/min_terminated_length": 35.0,
      "epoch": 0.1054877528377571,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.051023601659573634,
      "learning_rate": 9.99928148975585e-07,
      "loss": 0.0019,
      "num_tokens": 276542629.0,
      "reward": 0.13671875,
      "reward_std": 0.12973949313163757,
      "rewards/accuracy_reward/mean": 0.13671875,
      "rewards/accuracy_reward/std": 0.3438861668109894,
      "step": 309
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.517578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1568.7578125,
      "completions/mean_terminated_length": 1054.591064453125,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "epoch": 0.1058291371511479,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.05142377313796195,
      "learning_rate": 9.99918249800855e-07,
      "loss": 0.0131,
      "num_tokens": 277418153.0,
      "reward": 0.166015625,
      "reward_std": 0.11503897607326508,
      "rewards/accuracy_reward/mean": 0.166015625,
      "rewards/accuracy_reward/std": 0.3724585771560669,
      "step": 310
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.61328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1679.529296875,
      "completions/mean_terminated_length": 1095.1868896484375,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.1061705214645387,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 0.0398715324136502,
      "learning_rate": 9.999077120494608e-07,
      "loss": 0.002,
      "num_tokens": 278357240.0,
      "reward": 0.06640625,
      "reward_std": 0.06904878467321396,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "step": 311
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2042.0,
      "completions/mean_length": 1679.583984375,
      "completions/mean_terminated_length": 963.92529296875,
      "completions/min_length": 45.0,
      "completions/min_terminated_length": 45.0,
      "epoch": 0.1065119057779295,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.04746502445588205,
      "learning_rate": 9.998965357363583e-07,
      "loss": 0.0113,
      "num_tokens": 279291731.0,
      "reward": 0.12109375,
      "reward_std": 0.116529181599617,
      "rewards/accuracy_reward/mean": 0.12109375,
      "rewards/accuracy_reward/std": 0.3265552520751953,
      "step": 312
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.63671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1699.498046875,
      "completions/mean_terminated_length": 1088.6827392578125,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "epoch": 0.1068532900913203,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.05746511849257925,
      "learning_rate": 9.998847208774107e-07,
      "loss": 0.0223,
      "num_tokens": 280234018.0,
      "reward": 0.15234375,
      "reward_std": 0.1403878629207611,
      "rewards/accuracy_reward/mean": 0.15234375,
      "rewards/accuracy_reward/std": 0.35970520973205566,
      "step": 313
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.57421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2023.0,
      "completions/mean_length": 1676.560546875,
      "completions/mean_terminated_length": 1175.62841796875,
      "completions/min_length": 30.0,
      "completions/min_terminated_length": 30.0,
      "epoch": 0.1071946744047111,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.05129375878392557,
      "learning_rate": 9.998722674893869e-07,
      "loss": 0.0165,
      "num_tokens": 281171217.0,
      "reward": 0.14453125,
      "reward_std": 0.13770455121994019,
      "rewards/accuracy_reward/mean": 0.14453125,
      "rewards/accuracy_reward/std": 0.35197147727012634,
      "step": 314
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.646484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2023.0,
      "completions/mean_length": 1711.6640625,
      "completions/mean_terminated_length": 1096.91162109375,
      "completions/min_length": 50.0,
      "completions/min_terminated_length": 50.0,
      "epoch": 0.1075360587181019,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.04940003485514998,
      "learning_rate": 9.99859175589962e-07,
      "loss": 0.0227,
      "num_tokens": 282123077.0,
      "reward": 0.10546875,
      "reward_std": 0.11658906936645508,
      "rewards/accuracy_reward/mean": 0.1088709682226181,
      "rewards/accuracy_reward/std": 0.31179171800613403,
      "step": 315
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.697265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1796.013671875,
      "completions/mean_terminated_length": 1215.6322021484375,
      "completions/min_length": 49.0,
      "completions/min_terminated_length": 49.0,
      "epoch": 0.1078774430314927,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 0.030016141859968596,
      "learning_rate": 9.998454451977178e-07,
      "loss": -0.0008,
      "num_tokens": 283118684.0,
      "reward": 0.09765625,
      "reward_std": 0.06711846590042114,
      "rewards/accuracy_reward/mean": 0.09765625,
      "rewards/accuracy_reward/std": 0.29713961482048035,
      "step": 316
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.685546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1741.78125,
      "completions/mean_terminated_length": 1074.1864013671875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.1082188273448835,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 0.05766121026930601,
      "learning_rate": 9.99831076332142e-07,
      "loss": 0.0159,
      "num_tokens": 284085836.0,
      "reward": 0.142578125,
      "reward_std": 0.14102670550346375,
      "rewards/accuracy_reward/mean": 0.142578125,
      "rewards/accuracy_reward/std": 0.3499840497970581,
      "step": 317
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2035.0,
      "completions/mean_length": 1569.150390625,
      "completions/mean_terminated_length": 953.4866333007812,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "epoch": 0.1085602116582743,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.06286207256995095,
      "learning_rate": 9.998160690136289e-07,
      "loss": 0.0199,
      "num_tokens": 284960489.0,
      "reward": 0.14453125,
      "reward_std": 0.13519641757011414,
      "rewards/accuracy_reward/mean": 0.14919355511665344,
      "rewards/accuracy_reward/std": 0.3566388487815857,
      "step": 318
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.62109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1690.939453125,
      "completions/mean_terminated_length": 1105.6546630859375,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "epoch": 0.1089015959716651,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.060823581177644775,
      "learning_rate": 9.998004232634777e-07,
      "loss": 0.0095,
      "num_tokens": 285897178.0,
      "reward": 0.123046875,
      "reward_std": 0.12482814490795135,
      "rewards/accuracy_reward/mean": 0.1270161271095276,
      "rewards/accuracy_reward/std": 0.33332720398902893,
      "step": 319
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.623046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 1684.44140625,
      "completions/mean_terminated_length": 1083.53369140625,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.1092429802850559,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.04165122038921761,
      "learning_rate": 9.997841391038957e-07,
      "loss": 0.0166,
      "num_tokens": 286842028.0,
      "reward": 0.083984375,
      "reward_std": 0.0961657464504242,
      "rewards/accuracy_reward/mean": 0.083984375,
      "rewards/accuracy_reward/std": 0.2776356339454651,
      "step": 320
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.708984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1748.01953125,
      "completions/mean_terminated_length": 1017.1946411132812,
      "completions/min_length": 207.0,
      "completions/min_terminated_length": 207.0,
      "epoch": 0.1095843645984467,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.04882678845907548,
      "learning_rate": 9.997672165579948e-07,
      "loss": 0.0076,
      "num_tokens": 287812054.0,
      "reward": 0.158203125,
      "reward_std": 0.11791393160820007,
      "rewards/accuracy_reward/mean": 0.158203125,
      "rewards/accuracy_reward/std": 0.36528825759887695,
      "step": 321
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1667.2578125,
      "completions/mean_terminated_length": 1193.0,
      "completions/min_length": 94.0,
      "completions/min_terminated_length": 94.0,
      "epoch": 0.1099257489118375,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.04881625353475447,
      "learning_rate": 9.997496556497934e-07,
      "loss": 0.0031,
      "num_tokens": 288737242.0,
      "reward": 0.166015625,
      "reward_std": 0.08864613622426987,
      "rewards/accuracy_reward/mean": 0.166015625,
      "rewards/accuracy_reward/std": 0.3724585771560669,
      "step": 322
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.615234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1746.681640625,
      "completions/mean_terminated_length": 1264.878173828125,
      "completions/min_length": 191.0,
      "completions/min_terminated_length": 191.0,
      "epoch": 0.1102671332252283,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.055800776784542865,
      "learning_rate": 9.997314564042165e-07,
      "loss": 0.0042,
      "num_tokens": 289710151.0,
      "reward": 0.11328125,
      "reward_std": 0.13326513767242432,
      "rewards/accuracy_reward/mean": 0.11328125,
      "rewards/accuracy_reward/std": 0.3172462284564972,
      "step": 323
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.68359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1765.369140625,
      "completions/mean_terminated_length": 1154.7469482421875,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "epoch": 0.1106085175386191,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 0.036523479547995535,
      "learning_rate": 9.997126188470941e-07,
      "loss": 0.0007,
      "num_tokens": 290704468.0,
      "reward": 0.05078125,
      "reward_std": 0.06100328266620636,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 324
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2035.0,
      "completions/mean_length": 1698.853515625,
      "completions/mean_terminated_length": 1054.8721923828125,
      "completions/min_length": 220.0,
      "completions/min_terminated_length": 220.0,
      "epoch": 0.1109499018520099,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 0.06487477432683009,
      "learning_rate": 9.996931430051626e-07,
      "loss": 0.012,
      "num_tokens": 291643257.0,
      "reward": 0.208984375,
      "reward_std": 0.13948683440685272,
      "rewards/accuracy_reward/mean": 0.208984375,
      "rewards/accuracy_reward/std": 0.40698084235191345,
      "step": 325
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.68359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2036.0,
      "completions/mean_length": 1799.423828125,
      "completions/mean_terminated_length": 1262.3765869140625,
      "completions/min_length": 247.0,
      "completions/min_terminated_length": 247.0,
      "epoch": 0.1112912861654007,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 0.05759697721423508,
      "learning_rate": 9.99673028906065e-07,
      "loss": 0.0154,
      "num_tokens": 292638962.0,
      "reward": 0.169921875,
      "reward_std": 0.12576617300510406,
      "rewards/accuracy_reward/mean": 0.169921875,
      "rewards/accuracy_reward/std": 0.3759314715862274,
      "step": 326
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.58984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2020.0,
      "completions/mean_length": 1677.7421875,
      "completions/mean_terminated_length": 1145.2762451171875,
      "completions/min_length": 195.0,
      "completions/min_terminated_length": 195.0,
      "epoch": 0.1116326704787915,
      "frac_reward_zero_std": 0.53125,
      "grad_norm": 0.08341942847329462,
      "learning_rate": 9.996522765783488e-07,
      "loss": 0.0307,
      "num_tokens": 293572366.0,
      "reward": 0.28125,
      "reward_std": 0.20809099078178406,
      "rewards/accuracy_reward/mean": 0.28125,
      "rewards/accuracy_reward/std": 0.45004892349243164,
      "step": 327
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.732421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1994.0,
      "completions/mean_length": 1772.068359375,
      "completions/mean_terminated_length": 1016.781005859375,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.1119740547921823,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.03464978667496495,
      "learning_rate": 9.996308860514686e-07,
      "loss": 0.0173,
      "num_tokens": 294554833.0,
      "reward": 0.0859375,
      "reward_std": 0.059578798711299896,
      "rewards/accuracy_reward/mean": 0.0859375,
      "rewards/accuracy_reward/std": 0.28054583072662354,
      "step": 328
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.60546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1665.970703125,
      "completions/mean_terminated_length": 1079.6881103515625,
      "completions/min_length": 82.0,
      "completions/min_terminated_length": 82.0,
      "epoch": 0.1123154391055731,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.05599199410478439,
      "learning_rate": 9.996088573557843e-07,
      "loss": 0.0047,
      "num_tokens": 295480034.0,
      "reward": 0.205078125,
      "reward_std": 0.08958513289690018,
      "rewards/accuracy_reward/mean": 0.205078125,
      "rewards/accuracy_reward/std": 0.4041535556316376,
      "step": 329
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1866.75,
      "completions/mean_terminated_length": 1120.0,
      "completions/min_length": 271.0,
      "completions/min_terminated_length": 271.0,
      "epoch": 0.1126568234189639,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.04849906900597323,
      "learning_rate": 9.995861905225617e-07,
      "loss": -0.0013,
      "num_tokens": 296512706.0,
      "reward": 0.072265625,
      "reward_std": 0.10177553445100784,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 330
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2038.0,
      "completions/mean_length": 1757.1953125,
      "completions/mean_terminated_length": 1081.1688232421875,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.1129982077323547,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.06822229763736717,
      "learning_rate": 9.995628855839721e-07,
      "loss": 0.0124,
      "num_tokens": 297495030.0,
      "reward": 0.123046875,
      "reward_std": 0.14834031462669373,
      "rewards/accuracy_reward/mean": 0.123046875,
      "rewards/accuracy_reward/std": 0.32881227135658264,
      "step": 331
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2028.0,
      "completions/mean_length": 1781.4609375,
      "completions/mean_terminated_length": 1100.3055419921875,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "epoch": 0.1133395920457455,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 0.09081261286430949,
      "learning_rate": 9.995389425730923e-07,
      "loss": 0.0269,
      "num_tokens": 298486482.0,
      "reward": 0.142578125,
      "reward_std": 0.1532699465751648,
      "rewards/accuracy_reward/mean": 0.142578125,
      "rewards/accuracy_reward/std": 0.3499840497970581,
      "step": 332
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.62109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2036.0,
      "completions/mean_length": 1763.40234375,
      "completions/mean_terminated_length": 1296.8968505859375,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.1136809763591363,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.07955616710976351,
      "learning_rate": 9.995143615239056e-07,
      "loss": 0.0092,
      "num_tokens": 299467408.0,
      "reward": 0.130859375,
      "reward_std": 0.1380920112133026,
      "rewards/accuracy_reward/mean": 0.130859375,
      "rewards/accuracy_reward/std": 0.33757632970809937,
      "step": 333
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.603515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1673.837890625,
      "completions/mean_terminated_length": 1104.3004150390625,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.11402236067252709,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.08875806539926845,
      "learning_rate": 9.994891424712998e-07,
      "loss": 0.0109,
      "num_tokens": 300401645.0,
      "reward": 0.14453125,
      "reward_std": 0.1489243358373642,
      "rewards/accuracy_reward/mean": 0.14453125,
      "rewards/accuracy_reward/std": 0.35197147727012634,
      "step": 334
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.603515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 1626.884765625,
      "completions/mean_terminated_length": 985.8768310546875,
      "completions/min_length": 83.0,
      "completions/min_terminated_length": 83.0,
      "epoch": 0.11436374498591789,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 0.10709636875964579,
      "learning_rate": 9.99463285451069e-07,
      "loss": 0.0137,
      "num_tokens": 301311618.0,
      "reward": 0.16796875,
      "reward_std": 0.16807593405246735,
      "rewards/accuracy_reward/mean": 0.16796875,
      "rewards/accuracy_reward/std": 0.374204158782959,
      "step": 335
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2033.0,
      "completions/mean_length": 1716.482421875,
      "completions/mean_terminated_length": 1105.0167236328125,
      "completions/min_length": 185.0,
      "completions/min_terminated_length": 185.0,
      "epoch": 0.11470512929930869,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 0.0704848028686384,
      "learning_rate": 9.994367904999127e-07,
      "loss": 0.008,
      "num_tokens": 302264345.0,
      "reward": 0.216796875,
      "reward_std": 0.1845674067735672,
      "rewards/accuracy_reward/mean": 0.216796875,
      "rewards/accuracy_reward/std": 0.4124660789966583,
      "step": 336
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.67578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1739.603515625,
      "completions/mean_terminated_length": 1096.8011474609375,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.11504651361269949,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 0.09842404712020399,
      "learning_rate": 9.994096576554353e-07,
      "loss": 0.009,
      "num_tokens": 303231742.0,
      "reward": 0.162109375,
      "reward_std": 0.13342025876045227,
      "rewards/accuracy_reward/mean": 0.16733871400356293,
      "rewards/accuracy_reward/std": 0.37365487217903137,
      "step": 337
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.658203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1731.248046875,
      "completions/mean_terminated_length": 1121.5313720703125,
      "completions/min_length": 40.0,
      "completions/min_terminated_length": 40.0,
      "epoch": 0.11538789792609029,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 0.08891326086421894,
      "learning_rate": 9.993818869561467e-07,
      "loss": 0.0155,
      "num_tokens": 304202029.0,
      "reward": 0.080078125,
      "reward_std": 0.11685626953840256,
      "rewards/accuracy_reward/mean": 0.080078125,
      "rewards/accuracy_reward/std": 0.271679550409317,
      "step": 338
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.658203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 1747.025390625,
      "completions/mean_terminated_length": 1167.434326171875,
      "completions/min_length": 183.0,
      "completions/min_terminated_length": 183.0,
      "epoch": 0.11572928223948109,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 0.09529601905784285,
      "learning_rate": 9.99353478441463e-07,
      "loss": 0.0143,
      "num_tokens": 305178042.0,
      "reward": 0.162109375,
      "reward_std": 0.17205162346363068,
      "rewards/accuracy_reward/mean": 0.162109375,
      "rewards/accuracy_reward/std": 0.3689115643501282,
      "step": 339
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1745.998046875,
      "completions/mean_terminated_length": 1138.441162109375,
      "completions/min_length": 50.0,
      "completions/min_terminated_length": 50.0,
      "epoch": 0.11607066655287189,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.07301399070834347,
      "learning_rate": 9.993244321517045e-07,
      "loss": 0.0105,
      "num_tokens": 306143929.0,
      "reward": 0.107421875,
      "reward_std": 0.11646249890327454,
      "rewards/accuracy_reward/mean": 0.11088709533214569,
      "rewards/accuracy_reward/std": 0.3143092691898346,
      "step": 340
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.662109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2042.0,
      "completions/mean_length": 1731.720703125,
      "completions/mean_terminated_length": 1112.24853515625,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "epoch": 0.11641205086626269,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.08293484324941816,
      "learning_rate": 9.99294748128097e-07,
      "loss": 0.0134,
      "num_tokens": 307108522.0,
      "reward": 0.162109375,
      "reward_std": 0.10481776297092438,
      "rewards/accuracy_reward/mean": 0.162109375,
      "rewards/accuracy_reward/std": 0.3689115643501282,
      "step": 341
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1765.857421875,
      "completions/mean_terminated_length": 1145.4625244140625,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.11675343517965349,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 0.13179741631114522,
      "learning_rate": 9.992644264127717e-07,
      "loss": 0.0216,
      "num_tokens": 308086321.0,
      "reward": 0.12109375,
      "reward_std": 0.10998040437698364,
      "rewards/accuracy_reward/mean": 0.125,
      "rewards/accuracy_reward/std": 0.3310528099536896,
      "step": 342
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.724609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1842.978515625,
      "completions/mean_terminated_length": 1303.5247802734375,
      "completions/min_length": 64.0,
      "completions/min_terminated_length": 64.0,
      "epoch": 0.11709481949304429,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 0.07285559281916934,
      "learning_rate": 9.992334670487646e-07,
      "loss": 0.0089,
      "num_tokens": 309098662.0,
      "reward": 0.068359375,
      "reward_std": 0.08750899136066437,
      "rewards/accuracy_reward/mean": 0.068359375,
      "rewards/accuracy_reward/std": 0.25260838866233826,
      "step": 343
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.751953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1880.46484375,
      "completions/mean_terminated_length": 1372.5826416015625,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.11743620380643509,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 0.08848032353536818,
      "learning_rate": 9.99201870080017e-07,
      "loss": 0.006,
      "num_tokens": 310137524.0,
      "reward": 0.12109375,
      "reward_std": 0.09862300753593445,
      "rewards/accuracy_reward/mean": 0.12109375,
      "rewards/accuracy_reward/std": 0.3265552520751953,
      "step": 344
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2038.0,
      "completions/mean_length": 1813.697265625,
      "completions/mean_terminated_length": 1095.9127197265625,
      "completions/min_length": 41.0,
      "completions/min_terminated_length": 41.0,
      "epoch": 0.11777758811982589,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.09133029827215806,
      "learning_rate": 9.99169635551375e-07,
      "loss": -0.0024,
      "num_tokens": 311145289.0,
      "reward": 0.07421875,
      "reward_std": 0.10195636004209518,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "step": 345
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.775390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1896.638671875,
      "completions/mean_terminated_length": 1374.113037109375,
      "completions/min_length": 62.0,
      "completions/min_terminated_length": 62.0,
      "epoch": 0.11811897243321669,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 0.13596286673083904,
      "learning_rate": 9.991367635085897e-07,
      "loss": 0.0078,
      "num_tokens": 312199584.0,
      "reward": 0.103515625,
      "reward_std": 0.07834453880786896,
      "rewards/accuracy_reward/mean": 0.103515625,
      "rewards/accuracy_reward/std": 0.30492907762527466,
      "step": 346
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.67578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1924.0,
      "completions/mean_length": 1679.658203125,
      "completions/mean_terminated_length": 911.9096069335938,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.11846035674660749,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.1534179772580519,
      "learning_rate": 9.991032539983166e-07,
      "loss": 0.0148,
      "num_tokens": 313133617.0,
      "reward": 0.150390625,
      "reward_std": 0.1024552434682846,
      "rewards/accuracy_reward/mean": 0.150390625,
      "rewards/accuracy_reward/std": 0.35780346393585205,
      "step": 347
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1935.0,
      "completions/mean_length": 1832.822265625,
      "completions/mean_terminated_length": 1159.524169921875,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "epoch": 0.11880174105999829,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 0.1596366441630623,
      "learning_rate": 9.990691070681169e-07,
      "loss": 0.0208,
      "num_tokens": 314149462.0,
      "reward": 0.1875,
      "reward_std": 0.15019595623016357,
      "rewards/accuracy_reward/mean": 0.1875,
      "rewards/accuracy_reward/std": 0.39069411158561707,
      "step": 348
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1768.361328125,
      "completions/mean_terminated_length": 1118.292236328125,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "epoch": 0.11914312537338909,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.11695111442667908,
      "learning_rate": 9.990343227664552e-07,
      "loss": 0.0099,
      "num_tokens": 315132079.0,
      "reward": 0.109375,
      "reward_std": 0.10326766967773438,
      "rewards/accuracy_reward/mean": 0.109375,
      "rewards/accuracy_reward/std": 0.31241437792778015,
      "step": 349
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.818359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2028.0,
      "completions/mean_length": 1922.390625,
      "completions/mean_terminated_length": 1356.47314453125,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.11948450968677989,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 0.12473908865500574,
      "learning_rate": 9.98998901142702e-07,
      "loss": 0.0044,
      "num_tokens": 316187159.0,
      "reward": 0.033203125,
      "reward_std": 0.06166848540306091,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 350
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.748046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2042.0,
      "completions/mean_length": 1816.8984375,
      "completions/mean_terminated_length": 1130.7596435546875,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "epoch": 0.1198258940001707,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.11734946077064011,
      "learning_rate": 9.989628422471316e-07,
      "loss": 0.0125,
      "num_tokens": 317196627.0,
      "reward": 0.12890625,
      "reward_std": 0.10067909955978394,
      "rewards/accuracy_reward/mean": 0.13306452333927155,
      "rewards/accuracy_reward/std": 0.3399873375892639,
      "step": 351
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.80078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2042.0,
      "completions/mean_length": 1847.251953125,
      "completions/mean_terminated_length": 1040.3236083984375,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.1201672783135615,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 0.14179644403465022,
      "learning_rate": 9.989261461309232e-07,
      "loss": 0.0019,
      "num_tokens": 318209540.0,
      "reward": 0.041015625,
      "reward_std": 0.048086829483509064,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 352
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.763671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2042.0,
      "completions/mean_length": 1842.19921875,
      "completions/mean_terminated_length": 1177.1734619140625,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.1205086626269523,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 0.2728144471324912,
      "learning_rate": 9.9888881284616e-07,
      "loss": 0.0144,
      "num_tokens": 319235594.0,
      "reward": 0.091796875,
      "reward_std": 0.07421942055225372,
      "rewards/accuracy_reward/mean": 0.091796875,
      "rewards/accuracy_reward/std": 0.289021372795105,
      "step": 353
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.841796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2027.0,
      "completions/mean_length": 1896.412109375,
      "completions/mean_terminated_length": 1089.8148193359375,
      "completions/min_length": 199.0,
      "completions/min_terminated_length": 199.0,
      "epoch": 0.1208500469403431,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 0.0710082246771826,
      "learning_rate": 9.9885084244583e-07,
      "loss": 0.0032,
      "num_tokens": 320288061.0,
      "reward": 0.044921875,
      "reward_std": 0.030584799125790596,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 354
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2021.0,
      "completions/mean_length": 1872.99609375,
      "completions/mean_terminated_length": 1275.5689697265625,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.1211914312537339,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 0.9755373756315711,
      "learning_rate": 9.988122349838247e-07,
      "loss": 0.007,
      "num_tokens": 321321899.0,
      "reward": 0.09765625,
      "reward_std": 0.06233368441462517,
      "rewards/accuracy_reward/mean": 0.10080645233392715,
      "rewards/accuracy_reward/std": 0.30137622356414795,
      "step": 355
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1834.01171875,
      "completions/mean_terminated_length": 994.519287109375,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "epoch": 0.1215328155671247,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 1.0202236434349419,
      "learning_rate": 9.987729905149411e-07,
      "loss": 0.0071,
      "num_tokens": 322341505.0,
      "reward": 0.06640625,
      "reward_std": 0.049843885004520416,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "step": 356
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.732421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2023.0,
      "completions/mean_length": 1780.455078125,
      "completions/mean_terminated_length": 1048.1240234375,
      "completions/min_length": 30.0,
      "completions/min_terminated_length": 30.0,
      "epoch": 0.1218741998805155,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 1.3219660564537818,
      "learning_rate": 9.98733109094879e-07,
      "loss": 0.0052,
      "num_tokens": 323329162.0,
      "reward": 0.09765625,
      "reward_std": 0.09490203112363815,
      "rewards/accuracy_reward/mean": 0.09765625,
      "rewards/accuracy_reward/std": 0.29713961482048035,
      "step": 357
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.724609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2042.0,
      "completions/mean_length": 1752.212890625,
      "completions/mean_terminated_length": 973.9361572265625,
      "completions/min_length": 78.0,
      "completions/min_terminated_length": 78.0,
      "epoch": 0.1222155841939063,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 1.8425893010360381,
      "learning_rate": 9.98692590780243e-07,
      "loss": 0.0093,
      "num_tokens": 324317063.0,
      "reward": 0.09765625,
      "reward_std": 0.13064473867416382,
      "rewards/accuracy_reward/mean": 0.09765625,
      "rewards/accuracy_reward/std": 0.29713961482048035,
      "step": 358
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.845703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2004.0,
      "completions/mean_length": 1910.25,
      "completions/mean_terminated_length": 1155.240478515625,
      "completions/min_length": 37.0,
      "completions/min_terminated_length": 37.0,
      "epoch": 0.1225569685072971,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 2.3807639689777997,
      "learning_rate": 9.986514356285412e-07,
      "loss": 0.0067,
      "num_tokens": 325376983.0,
      "reward": 0.0390625,
      "reward_std": 0.07241567224264145,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 359
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.79296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1847.900390625,
      "completions/mean_terminated_length": 1081.481201171875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.1228983528206879,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 3.4733312131034353,
      "learning_rate": 9.986096436981862e-07,
      "loss": 0.0118,
      "num_tokens": 326405972.0,
      "reward": 0.04296875,
      "reward_std": 0.0635918527841568,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 360
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.740234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2020.0,
      "completions/mean_length": 1812.865234375,
      "completions/mean_terminated_length": 1142.819580078125,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "epoch": 0.1232397371340787,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 3.6548073897780564,
      "learning_rate": 9.985672150484937e-07,
      "loss": 0.0128,
      "num_tokens": 327415983.0,
      "reward": 0.16015625,
      "reward_std": 0.08980467915534973,
      "rewards/accuracy_reward/mean": 0.16015625,
      "rewards/accuracy_reward/std": 0.3671095669269562,
      "step": 361
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1768.724609375,
      "completions/mean_terminated_length": 1119.5,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.1235811214474695,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 3.888454704512162,
      "learning_rate": 9.985241497396835e-07,
      "loss": -0.004,
      "num_tokens": 328403922.0,
      "reward": 0.140625,
      "reward_std": 0.08251741528511047,
      "rewards/accuracy_reward/mean": 0.140625,
      "rewards/accuracy_reward/std": 0.3479743003845215,
      "step": 362
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.630859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2042.0,
      "completions/mean_length": 1630.876953125,
      "completions/mean_terminated_length": 918.3333129882812,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.1239225057608603,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 6.702267300913001,
      "learning_rate": 9.984804478328792e-07,
      "loss": 0.0061,
      "num_tokens": 329321331.0,
      "reward": 0.1328125,
      "reward_std": 0.14254970848560333,
      "rewards/accuracy_reward/mean": 0.1328125,
      "rewards/accuracy_reward/std": 0.33970388770103455,
      "step": 363
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.861328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2016.0,
      "completions/mean_length": 1916.169921875,
      "completions/mean_terminated_length": 1097.3380126953125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.1242638900742511,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 2.1279630623853594,
      "learning_rate": 9.98436109390107e-07,
      "loss": 0.0024,
      "num_tokens": 330369898.0,
      "reward": 0.04296875,
      "reward_std": 0.04175759106874466,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 364
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2010.0,
      "completions/mean_length": 1886.998046875,
      "completions/mean_terminated_length": 1189.322998046875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.12460527438764189,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 4.009799030492814,
      "learning_rate": 9.983911344742979e-07,
      "loss": 0.0181,
      "num_tokens": 331411833.0,
      "reward": 0.08203125,
      "reward_std": 0.12440948933362961,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 365
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.806640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1858.69140625,
      "completions/mean_terminated_length": 1068.949462890625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.12494665870103269,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 4.908216668332833,
      "learning_rate": 9.983455231492852e-07,
      "loss": 0.0051,
      "num_tokens": 332440283.0,
      "reward": 0.078125,
      "reward_std": 0.11812658607959747,
      "rewards/accuracy_reward/mean": 0.078125,
      "rewards/accuracy_reward/std": 0.26863065361976624,
      "step": 366
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2033.0,
      "completions/mean_length": 1769.083984375,
      "completions/mean_terminated_length": 896.3467407226562,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.12528804301442348,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 4.478707134866254,
      "learning_rate": 9.982992754798057e-07,
      "loss": 0.005,
      "num_tokens": 333423414.0,
      "reward": 0.076171875,
      "reward_std": 0.06849660724401474,
      "rewards/accuracy_reward/mean": 0.076171875,
      "rewards/accuracy_reward/std": 0.26553234457969666,
      "step": 367
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.732421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1983.0,
      "completions/mean_length": 1798.90234375,
      "completions/mean_terminated_length": 1117.065673828125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.1256294273278143,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 11.054539392413355,
      "learning_rate": 9.982523915315e-07,
      "loss": 0.0116,
      "num_tokens": 334416772.0,
      "reward": 0.1328125,
      "reward_std": 0.14584887027740479,
      "rewards/accuracy_reward/mean": 0.1328125,
      "rewards/accuracy_reward/std": 0.33970388770103455,
      "step": 368
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.802734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1965.0,
      "completions/mean_length": 1816.171875,
      "completions/mean_terminated_length": 872.7920532226562,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.12597081164120508,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 9.805763736629649,
      "learning_rate": 9.982048713709109e-07,
      "loss": 0.0089,
      "num_tokens": 335419196.0,
      "reward": 0.044921875,
      "reward_std": 0.09032991528511047,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 369
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.787109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2028.0,
      "completions/mean_length": 1882.212890625,
      "completions/mean_terminated_length": 1269.2568359375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.1263121959545959,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 4.655856691146327,
      "learning_rate": 9.981567150654848e-07,
      "loss": 0.0042,
      "num_tokens": 336458841.0,
      "reward": 0.064453125,
      "reward_std": 0.05529290437698364,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 370
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.78515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1858.826171875,
      "completions/mean_terminated_length": 1167.4818115234375,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.12665358026798668,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 12.619375394965113,
      "learning_rate": 9.981079226835704e-07,
      "loss": 0.0109,
      "num_tokens": 337487216.0,
      "reward": 0.037109375,
      "reward_std": 0.07026061415672302,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 371
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.90234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2035.0,
      "completions/mean_length": 1955.298828125,
      "completions/mean_terminated_length": 1098.739990234375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.1269949645813775,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 3.543135339261255,
      "learning_rate": 9.9805849429442e-07,
      "loss": -0.0013,
      "num_tokens": 338579353.0,
      "reward": 0.0234375,
      "reward_std": 0.028608137741684914,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 372
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.888671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1956.0,
      "completions/mean_length": 1904.044921875,
      "completions/mean_terminated_length": 754.9298095703125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.12733634889476828,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 11.115825424624719,
      "learning_rate": 9.980084299681879e-07,
      "loss": 0.01,
      "num_tokens": 339628848.0,
      "reward": 0.04296875,
      "reward_std": 0.050948236137628555,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 373
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1992.0,
      "completions/mean_length": 1923.17578125,
      "completions/mean_terminated_length": 1108.1470947265625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.1276777332081591,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 9.997232530856346,
      "learning_rate": 9.979577297759312e-07,
      "loss": 0.001,
      "num_tokens": 340687722.0,
      "reward": 0.048828125,
      "reward_std": 0.05138834938406944,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 374
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.794921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2006.0,
      "completions/mean_length": 1859.162109375,
      "completions/mean_terminated_length": 1127.1905517578125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.12801911752154987,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 15.97417048651692,
      "learning_rate": 9.979063937896098e-07,
      "loss": 0.0279,
      "num_tokens": 341721853.0,
      "reward": 0.099609375,
      "reward_std": 0.09506139904260635,
      "rewards/accuracy_reward/mean": 0.10282257944345474,
      "rewards/accuracy_reward/std": 0.30403366684913635,
      "step": 375
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.802734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2042.0,
      "completions/mean_length": 1903.326171875,
      "completions/mean_terminated_length": 1315.019775390625,
      "completions/min_length": 7.0,
      "completions/min_terminated_length": 7.0,
      "epoch": 0.1283605018349407,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 6.2069615640212055,
      "learning_rate": 9.978544220820858e-07,
      "loss": 0.004,
      "num_tokens": 342776836.0,
      "reward": 0.095703125,
      "reward_std": 0.0799964889883995,
      "rewards/accuracy_reward/mean": 0.095703125,
      "rewards/accuracy_reward/std": 0.2944713830947876,
      "step": 376
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1986.0,
      "completions/mean_length": 1892.61328125,
      "completions/mean_terminated_length": 943.0277709960938,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.12870188614833147,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 6.869596712335641,
      "learning_rate": 9.978018147271232e-07,
      "loss": 0.0085,
      "num_tokens": 343813502.0,
      "reward": 0.009765625,
      "reward_std": 0.029160313308238983,
      "rewards/accuracy_reward/mean": 0.010080644860863686,
      "rewards/accuracy_reward/std": 0.0999959409236908,
      "step": 377
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 1964.162109375,
      "completions/mean_terminated_length": 1072.431884765625,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.1290432704617223,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 4.789800421844635,
      "learning_rate": 9.977485717993885e-07,
      "loss": 0.0018,
      "num_tokens": 344888625.0,
      "reward": 0.0078125,
      "reward_std": 0.020409777760505676,
      "rewards/accuracy_reward/mean": 0.0078125,
      "rewards/accuracy_reward/std": 0.08812850713729858,
      "step": 378
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.93359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1954.0,
      "completions/mean_length": 1973.662109375,
      "completions/mean_terminated_length": 928.558837890625,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.12938465477511307,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 4.103245270166282,
      "learning_rate": 9.976946933744505e-07,
      "loss": 0.0027,
      "num_tokens": 345974036.0,
      "reward": 0.005859375,
      "reward_std": 0.01848640665411949,
      "rewards/accuracy_reward/mean": 0.005859375,
      "rewards/accuracy_reward/std": 0.07639661431312561,
      "step": 379
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.87109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1924.02734375,
      "completions/mean_terminated_length": 1086.272705078125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.12972603908850389,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 5.777409629720089,
      "learning_rate": 9.9764017952878e-07,
      "loss": 0.005,
      "num_tokens": 347034722.0,
      "reward": 0.033203125,
      "reward_std": 0.04219770431518555,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 380
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.87890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1977.0,
      "completions/mean_length": 1920.111328125,
      "completions/mean_terminated_length": 991.8870849609375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.13006742340189467,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 8.452264719758775,
      "learning_rate": 9.97585030339749e-07,
      "loss": 0.0053,
      "num_tokens": 348095563.0,
      "reward": 0.029296875,
      "reward_std": 0.026572702452540398,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 381
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2000.0,
      "completions/mean_length": 1943.615234375,
      "completions/mean_terminated_length": 1157.25,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.13040880771528549,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 12.257495929369215,
      "learning_rate": 9.97529245885632e-07,
      "loss": 0.0048,
      "num_tokens": 349164470.0,
      "reward": 0.041015625,
      "reward_std": 0.059305962175130844,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 382
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.826171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1895.861328125,
      "completions/mean_terminated_length": 1173.2696533203125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.13075019202867627,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 12.64986476935335,
      "learning_rate": 9.974728262456043e-07,
      "loss": -0.0002,
      "num_tokens": 350214127.0,
      "reward": 0.0546875,
      "reward_std": 0.06062985956668854,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 383
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.88671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2026.0,
      "completions/mean_length": 1943.9765625,
      "completions/mean_terminated_length": 1129.72412109375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.13109157634206708,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 14.543445015035747,
      "learning_rate": 9.974157714997438e-07,
      "loss": 0.003,
      "num_tokens": 351281411.0,
      "reward": 0.01953125,
      "reward_std": 0.04505910724401474,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 384
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.85546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1888.806640625,
      "completions/mean_terminated_length": 947.4054565429688,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.13143296065545787,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 7.967585905732063,
      "learning_rate": 9.973580817290295e-07,
      "loss": 0.0074,
      "num_tokens": 352333008.0,
      "reward": 0.015625,
      "reward_std": 0.04081955552101135,
      "rewards/accuracy_reward/mean": 0.016129031777381897,
      "rewards/accuracy_reward/std": 0.12609896063804626,
      "step": 385
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2023.0,
      "completions/mean_length": 1891.8203125,
      "completions/mean_terminated_length": 937.388916015625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.13177434496884868,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 17.782230088734398,
      "learning_rate": 9.972997570153406e-07,
      "loss": 0.0132,
      "num_tokens": 353372308.0,
      "reward": 0.056640625,
      "reward_std": 0.07521171122789383,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 386
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1967.68359375,
      "completions/mean_terminated_length": 1313.6785888671875,
      "completions/min_length": 421.0,
      "completions/min_terminated_length": 421.0,
      "epoch": 0.13211572928223947,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 9.972407974414592e-07,
      "loss": 0.0,
      "num_tokens": 354461682.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/accuracy_reward/mean": 0.0,
      "rewards/accuracy_reward/std": 0.0,
      "step": 387
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.849609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 1897.298828125,
      "completions/mean_terminated_length": 1046.727294921875,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.13245711359563028,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 14.200278918139587,
      "learning_rate": 9.971812030910671e-07,
      "loss": 0.0,
      "num_tokens": 355521803.0,
      "reward": 0.037109375,
      "reward_std": 0.049345001578330994,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 388
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1897.673828125,
      "completions/mean_terminated_length": 979.013916015625,
      "completions/min_length": 75.0,
      "completions/min_terminated_length": 75.0,
      "epoch": 0.13279849790902107,
      "frac_reward_zero_std": 0.96875,
      "grad_norm": 9.098451304623806,
      "learning_rate": 9.971209740487478e-07,
      "loss": -0.0004,
      "num_tokens": 356573508.0,
      "reward": 0.001953125,
      "reward_std": 0.0078125,
      "rewards/accuracy_reward/mean": 0.001953125,
      "rewards/accuracy_reward/std": 0.04419417306780815,
      "step": 389
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.84765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1998.0,
      "completions/mean_length": 1918.1484375,
      "completions/mean_terminated_length": 1195.6409912109375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.13313988222241188,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 8.719509731807541,
      "learning_rate": 9.970601103999854e-07,
      "loss": 0.002,
      "num_tokens": 357632784.0,
      "reward": 0.052734375,
      "reward_std": 0.04456022381782532,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 390
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.900390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1979.0,
      "completions/mean_length": 1968.681640625,
      "completions/mean_terminated_length": 1251.7059326171875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.13348126653580267,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 8.949437446080502,
      "learning_rate": 9.969986122311648e-07,
      "loss": 0.0117,
      "num_tokens": 358719645.0,
      "reward": 0.02734375,
      "reward_std": 0.06233368441462517,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 391
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.94140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1981.0,
      "completions/mean_length": 1994.478515625,
      "completions/mean_terminated_length": 1134.5667724609375,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.13382265084919348,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 7.397498252199284,
      "learning_rate": 9.969364796295712e-07,
      "loss": 0.008,
      "num_tokens": 359814194.0,
      "reward": 0.01171875,
      "reward_std": 0.02960042469203472,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "step": 392
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1880.0,
      "completions/mean_length": 1957.96484375,
      "completions/mean_terminated_length": 1000.3182373046875,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.13416403516258427,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 14.766023961661254,
      "learning_rate": 9.968737126833905e-07,
      "loss": 0.0124,
      "num_tokens": 360892096.0,
      "reward": 0.056640625,
      "reward_std": 0.07438573241233826,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 393
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.91015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1990.0,
      "completions/mean_length": 1976.1875,
      "completions/mean_terminated_length": 1248.6956787109375,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.13450541947597508,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 2.7653090694364484,
      "learning_rate": 9.96810311481709e-07,
      "loss": 0.0002,
      "num_tokens": 361973632.0,
      "reward": 0.01171875,
      "reward_std": 0.022772299125790596,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "step": 394
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.830078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1978.0,
      "completions/mean_length": 1907.5234375,
      "completions/mean_terminated_length": 1221.287353515625,
      "completions/min_length": 7.0,
      "completions/min_terminated_length": 7.0,
      "epoch": 0.13484680378936587,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 30.105814068991844,
      "learning_rate": 9.967462761145132e-07,
      "loss": 0.0206,
      "num_tokens": 363019852.0,
      "reward": 0.08203125,
      "reward_std": 0.0697084441781044,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 395
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.892578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1894.0,
      "completions/mean_length": 1937.1484375,
      "completions/mean_terminated_length": 1016.0726928710938,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.13518818810275668,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 12.441591806308416,
      "learning_rate": 9.966816066726895e-07,
      "loss": 0.0057,
      "num_tokens": 364092056.0,
      "reward": 0.0234375,
      "reward_std": 0.049843885004520416,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 396
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.873046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2024.0,
      "completions/mean_length": 1932.779296875,
      "completions/mean_terminated_length": 1140.4154052734375,
      "completions/min_length": 461.0,
      "completions/min_terminated_length": 461.0,
      "epoch": 0.13552957241614746,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 8.202296725686962,
      "learning_rate": 9.96616303248024e-07,
      "loss": 0.0059,
      "num_tokens": 365152743.0,
      "reward": 0.046875,
      "reward_std": 0.06751786917448044,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 397
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 1993.341796875,
      "completions/mean_terminated_length": 1348.375,
      "completions/min_length": 474.0,
      "completions/min_terminated_length": 474.0,
      "epoch": 0.13587095672953828,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 5.469273864133791,
      "learning_rate": 9.965503659332037e-07,
      "loss": 0.0093,
      "num_tokens": 366247110.0,
      "reward": 0.005859375,
      "reward_std": 0.01848640665411949,
      "rewards/accuracy_reward/mean": 0.005859375,
      "rewards/accuracy_reward/std": 0.07639661431312561,
      "step": 398
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.892578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1951.3359375,
      "completions/mean_terminated_length": 1148.1453857421875,
      "completions/min_length": 86.0,
      "completions/min_terminated_length": 86.0,
      "epoch": 0.13621234104292906,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 15.228366242907972,
      "learning_rate": 9.964837948218138e-07,
      "loss": 0.0213,
      "num_tokens": 367328210.0,
      "reward": 0.08984375,
      "reward_std": 0.1060623973608017,
      "rewards/accuracy_reward/mean": 0.08984375,
      "rewards/accuracy_reward/std": 0.2862374484539032,
      "step": 399
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.888671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2000.0,
      "completions/mean_length": 1942.00390625,
      "completions/mean_terminated_length": 1095.894775390625,
      "completions/min_length": 357.0,
      "completions/min_terminated_length": 357.0,
      "epoch": 0.13655372535631988,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 10.869894102290171,
      "learning_rate": 9.964165900083402e-07,
      "loss": 0.0082,
      "num_tokens": 368406772.0,
      "reward": 0.0234375,
      "reward_std": 0.036547206342220306,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 400
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.82421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1893.1640625,
      "completions/mean_terminated_length": 1167.7667236328125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.1368951096697107,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 5.98695375449312,
      "learning_rate": 9.963487515881678e-07,
      "loss": 0.0023,
      "num_tokens": 369460120.0,
      "reward": 0.056640625,
      "reward_std": 0.05628518760204315,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 401
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.962890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1976.0,
      "completions/mean_length": 2008.505859375,
      "completions/mean_terminated_length": 983.7368774414062,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.13723649398310148,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 10.270718080635893,
      "learning_rate": 9.962802796575811e-07,
      "loss": 0.0082,
      "num_tokens": 370553259.0,
      "reward": 0.01171875,
      "reward_std": 0.025194555521011353,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "step": 402
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1894.0,
      "completions/mean_length": 1902.876953125,
      "completions/mean_terminated_length": 956.0294189453125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.1375778782964923,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 10.95570444122586,
      "learning_rate": 9.96211174313763e-07,
      "loss": -0.0004,
      "num_tokens": 371611740.0,
      "reward": 0.060546875,
      "reward_std": 0.0870293527841568,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 403
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.857421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1998.0,
      "completions/mean_length": 1904.462890625,
      "completions/mean_terminated_length": 1041.27392578125,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.13791926260988308,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 11.238549811352238,
      "learning_rate": 9.961414356547962e-07,
      "loss": 0.0059,
      "num_tokens": 372663129.0,
      "reward": 0.037109375,
      "reward_std": 0.05919293686747551,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 404
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.83984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2001.0,
      "completions/mean_length": 1898.0078125,
      "completions/mean_terminated_length": 1111.46337890625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.1382606469232739,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 13.62090865913346,
      "learning_rate": 9.960710637796617e-07,
      "loss": 0.0068,
      "num_tokens": 373711373.0,
      "reward": 0.0546875,
      "reward_std": 0.07685433328151703,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 405
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.873046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 1933.3359375,
      "completions/mean_terminated_length": 1144.800048828125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.13860203123666467,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 10.011669243268397,
      "learning_rate": 9.960000587882396e-07,
      "loss": 0.0104,
      "num_tokens": 374775161.0,
      "reward": 0.072265625,
      "reward_std": 0.05452118441462517,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 406
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2024.0,
      "completions/mean_length": 1956.77734375,
      "completions/mean_terminated_length": 1319.9375,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.1389434155500555,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 12.79825251627794,
      "learning_rate": 9.95928420781309e-07,
      "loss": 0.0052,
      "num_tokens": 375863959.0,
      "reward": 0.0546875,
      "reward_std": 0.04517117142677307,
      "rewards/accuracy_reward/mean": 0.05645161122083664,
      "rewards/accuracy_reward/std": 0.23102475702762604,
      "step": 407
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.90234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 1967.76171875,
      "completions/mean_terminated_length": 1227.4599609375,
      "completions/min_length": 7.0,
      "completions/min_terminated_length": 7.0,
      "epoch": 0.13928479986344627,
      "frac_reward_zero_std": 0.96875,
      "grad_norm": 7.133796257019633,
      "learning_rate": 9.95856149860546e-07,
      "loss": 0.0048,
      "num_tokens": 376958429.0,
      "reward": 0.001953125,
      "reward_std": 0.0078125,
      "rewards/accuracy_reward/mean": 0.001953125,
      "rewards/accuracy_reward/std": 0.04419417306780815,
      "step": 408
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.931640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2028.0,
      "completions/mean_length": 1988.0859375,
      "completions/mean_terminated_length": 1171.5428466796875,
      "completions/min_length": 179.0,
      "completions/min_terminated_length": 179.0,
      "epoch": 0.1396261841768371,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 6.549601892863644,
      "learning_rate": 9.957832461285267e-07,
      "loss": 0.0145,
      "num_tokens": 378053097.0,
      "reward": 0.013671875,
      "reward_std": 0.03300705552101135,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "step": 409
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.85546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1991.0,
      "completions/mean_length": 1927.81640625,
      "completions/mean_terminated_length": 1216.45947265625,
      "completions/min_length": 35.0,
      "completions/min_terminated_length": 35.0,
      "epoch": 0.13996756849022787,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 14.15011254657329,
      "learning_rate": 9.957097096887246e-07,
      "loss": 0.0019,
      "num_tokens": 379130923.0,
      "reward": 0.048828125,
      "reward_std": 0.062062256038188934,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 410
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.923828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1977.0,
      "completions/mean_length": 1971.41015625,
      "completions/mean_terminated_length": 1042.5128173828125,
      "completions/min_length": 279.0,
      "completions/min_terminated_length": 279.0,
      "epoch": 0.1403089528036187,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 4.760457766557874,
      "learning_rate": 9.95635540645511e-07,
      "loss": 0.0227,
      "num_tokens": 380217165.0,
      "reward": 0.041015625,
      "reward_std": 0.04777328670024872,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 411
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.900390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1940.763671875,
      "completions/mean_terminated_length": 971.431396484375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.14065033711700947,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 6.7369027415487865,
      "learning_rate": 9.95560739104155e-07,
      "loss": 0.0107,
      "num_tokens": 381288068.0,
      "reward": 0.01171875,
      "reward_std": 0.031083684414625168,
      "rewards/accuracy_reward/mean": 0.012096773833036423,
      "rewards/accuracy_reward/std": 0.10942844301462173,
      "step": 412
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.81640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1889.0,
      "completions/mean_length": 1843.568359375,
      "completions/mean_terminated_length": 934.5,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.14099172143040029,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 13.68441835155953,
      "learning_rate": 9.95485305170824e-07,
      "loss": 0.018,
      "num_tokens": 382300823.0,
      "reward": 0.08984375,
      "reward_std": 0.04742163419723511,
      "rewards/accuracy_reward/mean": 0.08984375,
      "rewards/accuracy_reward/std": 0.2862374484539032,
      "step": 413
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.87890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2020.0,
      "completions/mean_length": 1896.9375,
      "completions/mean_terminated_length": 800.51611328125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.14133310574379107,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 27.493703456695176,
      "learning_rate": 9.95409238952583e-07,
      "loss": 0.0277,
      "num_tokens": 383359575.0,
      "reward": 0.03515625,
      "reward_std": 0.07779236882925034,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 414
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.91015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1971.0,
      "completions/mean_length": 1961.908203125,
      "completions/mean_terminated_length": 1089.7608642578125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.14167449005718188,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 15.649384548835311,
      "learning_rate": 9.953325405573935e-07,
      "loss": 0.014,
      "num_tokens": 384438072.0,
      "reward": 0.017578125,
      "reward_std": 0.04219770431518555,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "step": 415
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.958984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1900.0,
      "completions/mean_length": 1992.30859375,
      "completions/mean_terminated_length": 690.1904907226562,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.14201587437057267,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 6.0540331220285415,
      "learning_rate": 9.952552100941155e-07,
      "loss": 0.0203,
      "num_tokens": 385535862.0,
      "reward": 0.015625,
      "reward_std": 0.04081955552101135,
      "rewards/accuracy_reward/mean": 0.015625,
      "rewards/accuracy_reward/std": 0.12414088100194931,
      "step": 416
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.84765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1983.0,
      "completions/mean_length": 1907.806640625,
      "completions/mean_terminated_length": 1127.7564697265625,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.14235725868396348,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 10.839195975122427,
      "learning_rate": 9.951772476725047e-07,
      "loss": 0.0119,
      "num_tokens": 386584483.0,
      "reward": 0.068359375,
      "reward_std": 0.07669496536254883,
      "rewards/accuracy_reward/mean": 0.068359375,
      "rewards/accuracy_reward/std": 0.25260838866233826,
      "step": 417
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.861328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2025.0,
      "completions/mean_length": 1930.322265625,
      "completions/mean_terminated_length": 1199.394287109375,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.14269864299735427,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 2.138767764865018,
      "learning_rate": 9.950986534032149e-07,
      "loss": 0.0032,
      "num_tokens": 387648616.0,
      "reward": 0.01171875,
      "reward_std": 0.024649331346154213,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "step": 418
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2016.0,
      "completions/mean_length": 1856.57421875,
      "completions/mean_terminated_length": 934.25,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.14304002731074508,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 5.200950273517261,
      "learning_rate": 9.950194273977964e-07,
      "loss": 0.014,
      "num_tokens": 388671246.0,
      "reward": 0.0703125,
      "reward_std": 0.060571081936359406,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 419
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.939453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2011.0,
      "completions/mean_length": 1977.453125,
      "completions/mean_terminated_length": 882.8386840820312,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.14338141162413587,
      "frac_reward_zero_std": 0.96875,
      "grad_norm": 7.384092349387717,
      "learning_rate": 9.949395697686958e-07,
      "loss": 0.0013,
      "num_tokens": 389768150.0,
      "reward": 0.0078125,
      "reward_std": 0.013975424692034721,
      "rewards/accuracy_reward/mean": 0.008064515888690948,
      "rewards/accuracy_reward/std": 0.0895301103591919,
      "step": 420
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.91015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 1957.583984375,
      "completions/mean_terminated_length": 1041.6304931640625,
      "completions/min_length": 70.0,
      "completions/min_terminated_length": 70.0,
      "epoch": 0.14372279593752668,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 8.953814546850847,
      "learning_rate": 9.948590806292565e-07,
      "loss": 0.0167,
      "num_tokens": 390855233.0,
      "reward": 0.041015625,
      "reward_std": 0.051707521080970764,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 421
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.892578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2035.0,
      "completions/mean_length": 1937.830078125,
      "completions/mean_terminated_length": 1022.4181518554688,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.14406418025091747,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 65.62528060669477,
      "learning_rate": 9.947779600937181e-07,
      "loss": 0.0053,
      "num_tokens": 391925498.0,
      "reward": 0.025390625,
      "reward_std": 0.06288585811853409,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 422
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.888671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1990.0,
      "completions/mean_length": 1934.337890625,
      "completions/mean_terminated_length": 1027.03515625,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.14440556456430828,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 13.415628095915988,
      "learning_rate": 9.946962082772163e-07,
      "loss": 0.0087,
      "num_tokens": 392993815.0,
      "reward": 0.03125,
      "reward_std": 0.06563520431518555,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 423
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1907.03125,
      "completions/mean_terminated_length": 1098.3157958984375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.14474694887769907,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 14.328699104696629,
      "learning_rate": 9.946138252957827e-07,
      "loss": 0.0024,
      "num_tokens": 394046343.0,
      "reward": 0.04296875,
      "reward_std": 0.06546889245510101,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 424
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.923828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1941.0,
      "completions/mean_length": 1972.025390625,
      "completions/mean_terminated_length": 1051.974365234375,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.14508833319108988,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 2.0807841019737863,
      "learning_rate": 9.945308112663455e-07,
      "loss": 0.0124,
      "num_tokens": 395140260.0,
      "reward": 0.013671875,
      "reward_std": 0.03889618441462517,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "step": 425
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.82421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2000.0,
      "completions/mean_length": 1833.947265625,
      "completions/mean_terminated_length": 830.2777709960938,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.14542971750448067,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 7.121932787306299,
      "learning_rate": 9.94447166306727e-07,
      "loss": 0.0144,
      "num_tokens": 396154825.0,
      "reward": 0.03125,
      "reward_std": 0.04670868441462517,
      "rewards/accuracy_reward/mean": 0.032258063554763794,
      "rewards/accuracy_reward/std": 0.17686307430267334,
      "step": 426
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.880859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1966.0,
      "completions/mean_length": 1910.654296875,
      "completions/mean_terminated_length": 895.1966552734375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.14577110181787148,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 13.671881201259376,
      "learning_rate": 9.94362890535647e-07,
      "loss": 0.0206,
      "num_tokens": 397211640.0,
      "reward": 0.03125,
      "reward_std": 0.06079617142677307,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 427
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.92578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1898.0,
      "completions/mean_length": 1991.23046875,
      "completions/mean_terminated_length": 1283.105224609375,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.14611248613126226,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 7.131014829230976,
      "learning_rate": 9.942779840727185e-07,
      "loss": 0.0124,
      "num_tokens": 398301950.0,
      "reward": 0.02734375,
      "reward_std": 0.05342378467321396,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 428
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.845703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1879.447265625,
      "completions/mean_terminated_length": 956.088623046875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.14645387044465308,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 13.492673337281245,
      "learning_rate": 9.941924470384515e-07,
      "loss": 0.0021,
      "num_tokens": 399343411.0,
      "reward": 0.005859375,
      "reward_std": 0.01848640665411949,
      "rewards/accuracy_reward/mean": 0.005859375,
      "rewards/accuracy_reward/std": 0.07639661431312561,
      "step": 429
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1789.0,
      "completions/mean_length": 1891.458984375,
      "completions/mean_terminated_length": 796.40625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.14679525475804386,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 6.923054584476633,
      "learning_rate": 9.941062795542496e-07,
      "loss": 0.0041,
      "num_tokens": 400398430.0,
      "reward": 0.033203125,
      "reward_std": 0.07470491528511047,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 430
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.78125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1856.728515625,
      "completions/mean_terminated_length": 1173.6160888671875,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.14713663907143468,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 31.427259022046577,
      "learning_rate": 9.94019481742412e-07,
      "loss": 0.0281,
      "num_tokens": 401431187.0,
      "reward": 0.119140625,
      "reward_std": 0.11819230765104294,
      "rewards/accuracy_reward/mean": 0.119140625,
      "rewards/accuracy_reward/std": 0.32427072525024414,
      "step": 431
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 1783.14453125,
      "completions/mean_terminated_length": 792.8518676757812,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.14747802338482546,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 17.78873551003455,
      "learning_rate": 9.939320537261326e-07,
      "loss": 0.0066,
      "num_tokens": 402429773.0,
      "reward": 0.033203125,
      "reward_std": 0.053357094526290894,
      "rewards/accuracy_reward/mean": 0.03427419438958168,
      "rewards/accuracy_reward/std": 0.18211627006530762,
      "step": 432
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1948.0,
      "completions/mean_length": 1895.732421875,
      "completions/mean_terminated_length": 965.2083129882812,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.14781940769821628,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 15.93705640359269,
      "learning_rate": 9.93843995629499e-07,
      "loss": 0.0125,
      "num_tokens": 403483076.0,
      "reward": 0.08203125,
      "reward_std": 0.0645298957824707,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 433
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.85546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1976.0,
      "completions/mean_length": 1876.96484375,
      "completions/mean_terminated_length": 864.6216430664062,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.14816079201160706,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 10.436705781921948,
      "learning_rate": 9.937553075774938e-07,
      "loss": 0.0026,
      "num_tokens": 404521042.0,
      "reward": 0.013671875,
      "reward_std": 0.043847277760505676,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "step": 434
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.927734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1899.0,
      "completions/mean_length": 1969.419921875,
      "completions/mean_terminated_length": 960.6216430664062,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.14850217632499788,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 2.108804417409502,
      "learning_rate": 9.936659896959935e-07,
      "loss": 0.0077,
      "num_tokens": 405608729.0,
      "reward": 0.0078125,
      "reward_std": 0.02629890665411949,
      "rewards/accuracy_reward/mean": 0.0078125,
      "rewards/accuracy_reward/std": 0.08812850713729858,
      "step": 435
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.771484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1989.0,
      "completions/mean_length": 1822.34765625,
      "completions/mean_terminated_length": 1060.5299072265625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.14884356063838866,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 7.426146056782713,
      "learning_rate": 9.935760421117686e-07,
      "loss": 0.0366,
      "num_tokens": 406619403.0,
      "reward": 0.09375,
      "reward_std": 0.09594857692718506,
      "rewards/accuracy_reward/mean": 0.09375,
      "rewards/accuracy_reward/std": 0.29176566004753113,
      "step": 436
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.78125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1823.0,
      "completions/mean_length": 1780.779296875,
      "completions/mean_terminated_length": 826.419677734375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.14918494495177947,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 6.440456789409149,
      "learning_rate": 9.93485464952483e-07,
      "loss": 0.035,
      "num_tokens": 407613098.0,
      "reward": 0.03515625,
      "reward_std": 0.08157937228679657,
      "rewards/accuracy_reward/mean": 0.03629032149910927,
      "rewards/accuracy_reward/std": 0.1872003972530365,
      "step": 437
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.810546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1939.0,
      "completions/mean_length": 1848.240234375,
      "completions/mean_terminated_length": 994.1030883789062,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.14952632926517026,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 11.368768573442622,
      "learning_rate": 9.933942583466944e-07,
      "loss": 0.014,
      "num_tokens": 408639301.0,
      "reward": 0.033203125,
      "reward_std": 0.06271953880786896,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 438
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.85546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1987.0,
      "completions/mean_length": 1885.396484375,
      "completions/mean_terminated_length": 922.95947265625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.14986771357856107,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 4.837798002478943,
      "learning_rate": 9.93302422423854e-07,
      "loss": 0.0033,
      "num_tokens": 409675728.0,
      "reward": 0.0078125,
      "reward_std": 0.02629890665411949,
      "rewards/accuracy_reward/mean": 0.0078125,
      "rewards/accuracy_reward/std": 0.08812850713729858,
      "step": 439
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.810546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1977.0,
      "completions/mean_length": 1858.60546875,
      "completions/mean_terminated_length": 1048.3092041015625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.15020909789195186,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 25.826564229933172,
      "learning_rate": 9.93209957314306e-07,
      "loss": 0.0324,
      "num_tokens": 410708342.0,
      "reward": 0.0390625,
      "reward_std": 0.09506931155920029,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 440
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.78515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2007.0,
      "completions/mean_length": 1775.900390625,
      "completions/mean_terminated_length": 781.5,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.15055048220534267,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 11.985461888331729,
      "learning_rate": 9.93116863149288e-07,
      "loss": 0.0127,
      "num_tokens": 411685731.0,
      "reward": 0.052734375,
      "reward_std": 0.08121949434280396,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 441
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.876953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1996.0,
      "completions/mean_length": 1922.626953125,
      "completions/mean_terminated_length": 1029.857177734375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.15089186651873346,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 11.83284768215417,
      "learning_rate": 9.9302314006093e-07,
      "loss": 0.0123,
      "num_tokens": 412748388.0,
      "reward": 0.046875,
      "reward_std": 0.06233368441462517,
      "rewards/accuracy_reward/mean": 0.04838709533214569,
      "rewards/accuracy_reward/std": 0.21479946374893188,
      "step": 442
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.791015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2020.0,
      "completions/mean_length": 1812.51953125,
      "completions/mean_terminated_length": 921.5794067382812,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.15123325083212427,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 6.644910885819239,
      "learning_rate": 9.929287881822545e-07,
      "loss": 0.0057,
      "num_tokens": 413758702.0,
      "reward": 0.048828125,
      "reward_std": 0.08149883151054382,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 443
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.791015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1976.0,
      "completions/mean_length": 1829.255859375,
      "completions/mean_terminated_length": 1001.2990112304688,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.15157463514551506,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 12.753194935359298,
      "learning_rate": 9.92833807647177e-07,
      "loss": 0.0228,
      "num_tokens": 414773489.0,
      "reward": 0.056640625,
      "reward_std": 0.07729348540306091,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 444
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.876953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1908.0,
      "completions/mean_length": 1888.41015625,
      "completions/mean_terminated_length": 751.0159301757812,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.15191601945890587,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 1.1789599203284744,
      "learning_rate": 9.927381985905051e-07,
      "loss": 0.0108,
      "num_tokens": 415817363.0,
      "reward": 0.017578125,
      "reward_std": 0.04125870764255524,
      "rewards/accuracy_reward/mean": 0.018145160749554634,
      "rewards/accuracy_reward/std": 0.1336110383272171,
      "step": 445
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.86328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2007.0,
      "completions/mean_length": 1898.666015625,
      "completions/mean_terminated_length": 955.7285766601562,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.15225740377229666,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 1.5471834044962816,
      "learning_rate": 9.92641961147938e-07,
      "loss": 0.0094,
      "num_tokens": 416864504.0,
      "reward": 0.013671875,
      "reward_std": 0.032461829483509064,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "step": 446
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.87109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2023.0,
      "completions/mean_length": 1891.697265625,
      "completions/mean_terminated_length": 835.4697265625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.15259878808568747,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 1.1969890488326105,
      "learning_rate": 9.925450954560676e-07,
      "loss": 0.0117,
      "num_tokens": 417904909.0,
      "reward": 0.0078125,
      "reward_std": 0.020409777760505676,
      "rewards/accuracy_reward/mean": 0.0078125,
      "rewards/accuracy_reward/std": 0.08812850713729858,
      "step": 447
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.87109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1983.0,
      "completions/mean_length": 1923.06640625,
      "completions/mean_terminated_length": 1078.8182373046875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.15294017239907826,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 19.477288233251414,
      "learning_rate": 9.924476016523765e-07,
      "loss": 0.0101,
      "num_tokens": 418967311.0,
      "reward": 0.03125,
      "reward_std": 0.059520021080970764,
      "rewards/accuracy_reward/mean": 0.032258063554763794,
      "rewards/accuracy_reward/std": 0.17686307430267334,
      "step": 448
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.775390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2014.0,
      "completions/mean_length": 1826.66015625,
      "completions/mean_terminated_length": 1062.5565185546875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.15328155671246907,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 9.476519750183675,
      "learning_rate": 9.9234947987524e-07,
      "loss": 0.0077,
      "num_tokens": 419973329.0,
      "reward": 0.025390625,
      "reward_std": 0.048086829483509064,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 449
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.798828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2005.0,
      "completions/mean_length": 1784.16015625,
      "completions/mean_terminated_length": 736.4854736328125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.15362294102585985,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 26.286316535343488,
      "learning_rate": 9.922507302639234e-07,
      "loss": 0.0392,
      "num_tokens": 420961635.0,
      "reward": 0.083984375,
      "reward_std": 0.11329546570777893,
      "rewards/accuracy_reward/mean": 0.083984375,
      "rewards/accuracy_reward/std": 0.2776356339454651,
      "step": 450
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.84765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1975.0,
      "completions/mean_length": 1889.138671875,
      "completions/mean_terminated_length": 1005.2179565429688,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.15396432533925067,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 24.811244659039755,
      "learning_rate": 9.921513529585842e-07,
      "loss": 0.0072,
      "num_tokens": 422010362.0,
      "reward": 0.0390625,
      "reward_std": 0.031083684414625168,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 451
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.80859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2028.0,
      "completions/mean_length": 1799.24609375,
      "completions/mean_terminated_length": 748.89794921875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.15430570965264145,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 8.11324895709627,
      "learning_rate": 9.920513481002698e-07,
      "loss": 0.0045,
      "num_tokens": 423007864.0,
      "reward": 0.013671875,
      "reward_std": 0.03741292655467987,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "step": 452
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1925.0,
      "completions/mean_length": 1718.63671875,
      "completions/mean_terminated_length": 710.1032104492188,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.15464709396603227,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 21.909204175468957,
      "learning_rate": 9.919507158309192e-07,
      "loss": 0.0259,
      "num_tokens": 423973070.0,
      "reward": 0.083984375,
      "reward_std": 0.1175336092710495,
      "rewards/accuracy_reward/mean": 0.083984375,
      "rewards/accuracy_reward/std": 0.2776356339454651,
      "step": 453
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.775390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1754.193359375,
      "completions/mean_terminated_length": 739.9216918945312,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.15498847827942305,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 16.570547002225478,
      "learning_rate": 9.918494562933614e-07,
      "loss": -0.0001,
      "num_tokens": 424944881.0,
      "reward": 0.0234375,
      "reward_std": 0.05920084938406944,
      "rewards/accuracy_reward/mean": 0.024193547666072845,
      "rewards/accuracy_reward/std": 0.15380479395389557,
      "step": 454
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1987.0,
      "completions/mean_length": 1914.55078125,
      "completions/mean_terminated_length": 734.0385131835938,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.15532986259281387,
      "frac_reward_zero_std": 0.96875,
      "grad_norm": 5.071561512225769,
      "learning_rate": 9.917475696313157e-07,
      "loss": 0.0003,
      "num_tokens": 426006523.0,
      "reward": 0.001953125,
      "reward_std": 0.0078125,
      "rewards/accuracy_reward/mean": 0.001953125,
      "rewards/accuracy_reward/std": 0.04419417306780815,
      "step": 455
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.830078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2035.0,
      "completions/mean_length": 1850.921875,
      "completions/mean_terminated_length": 888.712646484375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.15567124690620465,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 23.727008885026166,
      "learning_rate": 9.916450559893917e-07,
      "loss": 0.0038,
      "num_tokens": 427039507.0,
      "reward": 0.05078125,
      "reward_std": 0.0645298883318901,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 456
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.779296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1986.0,
      "completions/mean_length": 1777.681640625,
      "completions/mean_terminated_length": 823.1947021484375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.15601263121959547,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 42.17401086487023,
      "learning_rate": 9.915419155130886e-07,
      "loss": -0.0022,
      "num_tokens": 428020128.0,
      "reward": 0.0234375,
      "reward_std": 0.05479401722550392,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 457
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.78125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1796.60546875,
      "completions/mean_terminated_length": 898.7678833007812,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.15635401553298625,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 10.485090063427458,
      "learning_rate": 9.914381483487957e-07,
      "loss": 0.009,
      "num_tokens": 429017670.0,
      "reward": 0.033203125,
      "reward_std": 0.05688370764255524,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 458
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.787109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1989.0,
      "completions/mean_length": 1834.505859375,
      "completions/mean_terminated_length": 1045.1650390625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.15669539984637706,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 2.2693779814864006,
      "learning_rate": 9.913337546437912e-07,
      "loss": 0.0114,
      "num_tokens": 430032249.0,
      "reward": 0.044921875,
      "reward_std": 0.060957908630371094,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 459
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.896484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1961.0,
      "completions/mean_length": 1922.154296875,
      "completions/mean_terminated_length": 832.2830200195312,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.15703678415976785,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 5.220053865493498,
      "learning_rate": 9.91228734546243e-07,
      "loss": 0.0025,
      "num_tokens": 431096280.0,
      "reward": 0.00390625,
      "reward_std": 0.015625,
      "rewards/accuracy_reward/mean": 0.00390625,
      "rewards/accuracy_reward/std": 0.06243881583213806,
      "step": 460
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.888671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2010.0,
      "completions/mean_length": 1930.361328125,
      "completions/mean_terminated_length": 992.24560546875,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.15737816847315866,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 42.12738165132376,
      "learning_rate": 9.911230882052082e-07,
      "loss": 0.0123,
      "num_tokens": 432162481.0,
      "reward": 0.04296875,
      "reward_std": 0.0580955371260643,
      "rewards/accuracy_reward/mean": 0.04435483738780022,
      "rewards/accuracy_reward/std": 0.2060900777578354,
      "step": 461
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.80859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1960.0,
      "completions/mean_length": 1834.3125,
      "completions/mean_terminated_length": 931.591796875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.15771955278654945,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 17.261598475462286,
      "learning_rate": 9.91016815770632e-07,
      "loss": 0.0098,
      "num_tokens": 433178289.0,
      "reward": 0.046875,
      "reward_std": 0.0737205371260643,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 462
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.818359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1863.0,
      "completions/mean_length": 1850.97265625,
      "completions/mean_terminated_length": 963.8709716796875,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.15806093709994026,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 22.928070970916895,
      "learning_rate": 9.90909917393349e-07,
      "loss": 0.0056,
      "num_tokens": 434214371.0,
      "reward": 0.03515625,
      "reward_std": 0.032021719962358475,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 463
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1922.0,
      "completions/mean_length": 1807.453125,
      "completions/mean_terminated_length": 765.0833740234375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.15840232141333105,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 7.9311594819741185,
      "learning_rate": 9.908023932250816e-07,
      "loss": 0.0131,
      "num_tokens": 435213403.0,
      "reward": 0.05859375,
      "reward_std": 0.06327171623706818,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 464
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.85546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2009.0,
      "completions/mean_length": 1895.07421875,
      "completions/mean_terminated_length": 989.9189453125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.15874370572672186,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 4.263640298655056,
      "learning_rate": 9.906942434184411e-07,
      "loss": 0.0093,
      "num_tokens": 436257777.0,
      "reward": 0.041015625,
      "reward_std": 0.04561128467321396,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 465
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.861328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 1898.78125,
      "completions/mean_terminated_length": 971.9436645507812,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.15908509004011265,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 60.238725047809325,
      "learning_rate": 9.90585468126926e-07,
      "loss": 0.0249,
      "num_tokens": 437305569.0,
      "reward": 0.037109375,
      "reward_std": 0.07669496536254883,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 466
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.705078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1712.150390625,
      "completions/mean_terminated_length": 909.2251586914062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.15942647435350346,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 5.357874352588069,
      "learning_rate": 9.904760675049233e-07,
      "loss": 0.0063,
      "num_tokens": 438259646.0,
      "reward": 0.02734375,
      "reward_std": 0.06492365896701813,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 467
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.86328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2003.0,
      "completions/mean_length": 1872.396484375,
      "completions/mean_terminated_length": 763.585693359375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.15976785866689425,
      "frac_reward_zero_std": 0.96875,
      "grad_norm": 0.07308143364605568,
      "learning_rate": 9.903660417077069e-07,
      "loss": -0.0023,
      "num_tokens": 439288489.0,
      "reward": 0.009765625,
      "reward_std": 0.014959799125790596,
      "rewards/accuracy_reward/mean": 0.009765625,
      "rewards/accuracy_reward/std": 0.09843364357948303,
      "step": 468
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.763671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2015.0,
      "completions/mean_length": 1787.587890625,
      "completions/mean_terminated_length": 946.0908813476562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.16010924298028506,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 9.696046531817968,
      "learning_rate": 9.90255390891438e-07,
      "loss": 0.0087,
      "num_tokens": 440278854.0,
      "reward": 0.025390625,
      "reward_std": 0.03411140665411949,
      "rewards/accuracy_reward/mean": 0.026209676638245583,
      "rewards/accuracy_reward/std": 0.1599196344614029,
      "step": 469
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1862.0,
      "completions/mean_length": 1800.759765625,
      "completions/mean_terminated_length": 830.8173217773438,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.16045062729367585,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 11.917488953327458,
      "learning_rate": 9.90144115213166e-07,
      "loss": 0.0056,
      "num_tokens": 441281563.0,
      "reward": 0.037109375,
      "reward_std": 0.06860867142677307,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 470
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.751953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 1766.3671875,
      "completions/mean_terminated_length": 912.5984497070312,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.16079201160706666,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 17.88570821411593,
      "learning_rate": 9.900322148308256e-07,
      "loss": 0.0279,
      "num_tokens": 442255527.0,
      "reward": 0.048828125,
      "reward_std": 0.07874394953250885,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 471
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.68359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2008.0,
      "completions/mean_length": 1642.8671875,
      "completions/mean_terminated_length": 767.5802612304688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.16113339592045745,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 14.891304098682305,
      "learning_rate": 9.899196899032393e-07,
      "loss": 0.0146,
      "num_tokens": 443162387.0,
      "reward": 0.0625,
      "reward_std": 0.10067769140005112,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 472
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1897.0,
      "completions/mean_length": 1762.927734375,
      "completions/mean_terminated_length": 870.9273681640625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.16147478023384826,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 10.102665470538868,
      "learning_rate": 9.898065405901156e-07,
      "loss": 0.0242,
      "num_tokens": 444143870.0,
      "reward": 0.068359375,
      "reward_std": 0.09078246355056763,
      "rewards/accuracy_reward/mean": 0.068359375,
      "rewards/accuracy_reward/std": 0.25260838866233826,
      "step": 473
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.771484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1775.95703125,
      "completions/mean_terminated_length": 857.5214233398438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.16181616454723904,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 6.937774072950919,
      "learning_rate": 9.896927670520495e-07,
      "loss": 0.032,
      "num_tokens": 445121864.0,
      "reward": 0.072265625,
      "reward_std": 0.0683835819363594,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 474
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.802734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2001.0,
      "completions/mean_length": 1794.95703125,
      "completions/mean_terminated_length": 765.2474975585938,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.16215754886062986,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 4.331967286884289,
      "learning_rate": 9.895783694505212e-07,
      "loss": 0.0085,
      "num_tokens": 446117474.0,
      "reward": 0.01953125,
      "reward_std": 0.04505910724401474,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 475
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1955.0,
      "completions/mean_length": 1722.34765625,
      "completions/mean_terminated_length": 857.0428466796875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.16249893317402064,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 5.351624507032269,
      "learning_rate": 9.894633479478974e-07,
      "loss": 0.0224,
      "num_tokens": 447076516.0,
      "reward": 0.05078125,
      "reward_std": 0.07340699434280396,
      "rewards/accuracy_reward/mean": 0.052419353276491165,
      "rewards/accuracy_reward/std": 0.22309619188308716,
      "step": 476
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.623046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1603.806640625,
      "completions/mean_terminated_length": 869.6217651367188,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.16284031748741146,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 3.8560699741509796,
      "learning_rate": 9.893477027074303e-07,
      "loss": 0.014,
      "num_tokens": 447975009.0,
      "reward": 0.0625,
      "reward_std": 0.0645298883318901,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 477
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2022.0,
      "completions/mean_length": 1776.279296875,
      "completions/mean_terminated_length": 994.0530395507812,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.16318170180080224,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 6.631906826141045,
      "learning_rate": 9.89231433893257e-07,
      "loss": 0.0201,
      "num_tokens": 448961504.0,
      "reward": 0.029296875,
      "reward_std": 0.064970001578331,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 478
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.81640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1843.33203125,
      "completions/mean_terminated_length": 933.2127075195312,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.16352308611419306,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.48690598958937414,
      "learning_rate": 9.891145416703998e-07,
      "loss": 0.0033,
      "num_tokens": 449978746.0,
      "reward": 0.009765625,
      "reward_std": 0.03411140665411949,
      "rewards/accuracy_reward/mean": 0.009765625,
      "rewards/accuracy_reward/std": 0.09843364357948303,
      "step": 479
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.86328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2031.0,
      "completions/mean_length": 1895.099609375,
      "completions/mean_terminated_length": 929.6428833007812,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.16386447042758384,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 0.6989406952855886,
      "learning_rate": 9.889970262047658e-07,
      "loss": 0.0099,
      "num_tokens": 451026973.0,
      "reward": 0.013671875,
      "reward_std": 0.032461829483509064,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "step": 480
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.767578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1775.75390625,
      "completions/mean_terminated_length": 876.655517578125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.16420585474097465,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 19.866435026265687,
      "learning_rate": 9.888788876631467e-07,
      "loss": 0.0061,
      "num_tokens": 452010623.0,
      "reward": 0.033203125,
      "reward_std": 0.0666455551981926,
      "rewards/accuracy_reward/mean": 0.03427419438958168,
      "rewards/accuracy_reward/std": 0.18211629986763,
      "step": 481
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.830078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1846.037109375,
      "completions/mean_terminated_length": 859.436767578125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.16454723905436544,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 0.24443680707577523,
      "learning_rate": 9.887601262132187e-07,
      "loss": 0.0046,
      "num_tokens": 453039778.0,
      "reward": 0.017578125,
      "reward_std": 0.03724660724401474,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "step": 482
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.77734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2018.0,
      "completions/mean_length": 1794.853515625,
      "completions/mean_terminated_length": 911.0614013671875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.16488862336775625,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 0.6697871523979727,
      "learning_rate": 9.88640742023542e-07,
      "loss": 0.0113,
      "num_tokens": 454031239.0,
      "reward": 0.009765625,
      "reward_std": 0.028222277760505676,
      "rewards/accuracy_reward/mean": 0.009765625,
      "rewards/accuracy_reward/std": 0.09843364357948303,
      "step": 483
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1669.65234375,
      "completions/mean_terminated_length": 934.701171875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.16523000768114704,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 7.445078771953974,
      "learning_rate": 9.8852073526356e-07,
      "loss": 0.0118,
      "num_tokens": 454966309.0,
      "reward": 0.05859375,
      "reward_std": 0.0936504602432251,
      "rewards/accuracy_reward/mean": 0.060483869165182114,
      "rewards/accuracy_reward/std": 0.2386218160390854,
      "step": 484
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.728515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2036.0,
      "completions/mean_length": 1756.134765625,
      "completions/mean_terminated_length": 972.9281005859375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.16557139199453785,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 24.138076351078794,
      "learning_rate": 9.884001061036013e-07,
      "loss": -0.0058,
      "num_tokens": 455956650.0,
      "reward": 0.080078125,
      "reward_std": 0.0809411108493805,
      "rewards/accuracy_reward/mean": 0.080078125,
      "rewards/accuracy_reward/std": 0.271679550409317,
      "step": 485
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1825.087890625,
      "completions/mean_terminated_length": 950.5865478515625,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.16591277630792864,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 7.1733387609781065,
      "learning_rate": 9.882788547148764e-07,
      "loss": 0.0154,
      "num_tokens": 456972423.0,
      "reward": 0.01171875,
      "reward_std": 0.04192390665411949,
      "rewards/accuracy_reward/mean": 0.012096773833036423,
      "rewards/accuracy_reward/std": 0.10942844301462173,
      "step": 486
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1923.0,
      "completions/mean_length": 1626.744140625,
      "completions/mean_terminated_length": 924.6510620117188,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.16625416062131945,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 10.91214937309718,
      "learning_rate": 9.881569812694795e-07,
      "loss": 0.0176,
      "num_tokens": 457882036.0,
      "reward": 0.048828125,
      "reward_std": 0.07493096590042114,
      "rewards/accuracy_reward/mean": 0.05040322616696358,
      "rewards/accuracy_reward/std": 0.21899642050266266,
      "step": 487
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 1674.9140625,
      "completions/mean_terminated_length": 854.125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.16659554493471024,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 13.682206703302473,
      "learning_rate": 9.880344859403876e-07,
      "loss": 0.0103,
      "num_tokens": 458813960.0,
      "reward": 0.02734375,
      "reward_std": 0.04907120764255524,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 488
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.693359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2020.0,
      "completions/mean_length": 1673.44140625,
      "completions/mean_terminated_length": 826.5095825195312,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.16693692924810105,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 1.3641632511405746,
      "learning_rate": 9.879113689014606e-07,
      "loss": -0.0001,
      "num_tokens": 459755818.0,
      "reward": 0.041015625,
      "reward_std": 0.05793476477265358,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 489
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.712890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1697.404296875,
      "completions/mean_terminated_length": 826.8775634765625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.16727831356149184,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 32.2314419948305,
      "learning_rate": 9.877876303274404e-07,
      "loss": 0.0127,
      "num_tokens": 460703209.0,
      "reward": 0.05859375,
      "reward_std": 0.1156454086303711,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 490
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.673828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1649.765625,
      "completions/mean_terminated_length": 827.06591796875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.16761969787488265,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 16.270410936155635,
      "learning_rate": 9.876632703939517e-07,
      "loss": 0.0112,
      "num_tokens": 461622097.0,
      "reward": 0.046875,
      "reward_std": 0.09518137574195862,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 491
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.771484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2003.0,
      "completions/mean_length": 1789.486328125,
      "completions/mean_terminated_length": 916.7265625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.16796108218827344,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 2.639480166458279,
      "learning_rate": 9.875382892775e-07,
      "loss": 0.021,
      "num_tokens": 462613258.0,
      "reward": 0.025390625,
      "reward_std": 0.07438573986291885,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 492
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.78125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1836.68359375,
      "completions/mean_terminated_length": 1081.982177734375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.16830246650166425,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 17.060624547414804,
      "learning_rate": 9.874126871554738e-07,
      "loss": 0.0224,
      "num_tokens": 463628632.0,
      "reward": 0.033203125,
      "reward_std": 0.07586899399757385,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 493
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.830078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1957.0,
      "completions/mean_length": 1818.36328125,
      "completions/mean_terminated_length": 696.57470703125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.16864385081505504,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 26.319974956419962,
      "learning_rate": 9.872864642061419e-07,
      "loss": 0.0057,
      "num_tokens": 464628306.0,
      "reward": 0.025390625,
      "reward_std": 0.06849660724401474,
      "rewards/accuracy_reward/mean": 0.026209676638245583,
      "rewards/accuracy_reward/std": 0.1599196493625641,
      "step": 494
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.67578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1614.453125,
      "completions/mean_terminated_length": 710.795166015625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.16898523512844585,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 26.292361686925275,
      "learning_rate": 9.87159620608655e-07,
      "loss": 0.0157,
      "num_tokens": 465536026.0,
      "reward": 0.060546875,
      "reward_std": 0.0840085819363594,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.24230584502220154,
      "step": 495
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.79296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2035.0,
      "completions/mean_length": 1807.380859375,
      "completions/mean_terminated_length": 885.76416015625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.16932661944183663,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 10.90641434067934,
      "learning_rate": 9.87032156543044e-07,
      "loss": 0.0224,
      "num_tokens": 466532221.0,
      "reward": 0.021484375,
      "reward_std": 0.054354868829250336,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 496
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.80078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1999.0,
      "completions/mean_length": 1827.595703125,
      "completions/mean_terminated_length": 941.6569213867188,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.16966800375522745,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 13.087614501863381,
      "learning_rate": 9.869040721902213e-07,
      "loss": 0.0023,
      "num_tokens": 467544654.0,
      "reward": 0.03125,
      "reward_std": 0.06409768760204315,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 497
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.779296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1827.9765625,
      "completions/mean_terminated_length": 1051.07958984375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.17000938806861823,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 27.88031154057963,
      "learning_rate": 9.86775367731979e-07,
      "loss": -0.004,
      "num_tokens": 468561122.0,
      "reward": 0.083984375,
      "reward_std": 0.08094111829996109,
      "rewards/accuracy_reward/mean": 0.083984375,
      "rewards/accuracy_reward/std": 0.2776356339454651,
      "step": 498
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.787109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2004.0,
      "completions/mean_length": 1815.07421875,
      "completions/mean_terminated_length": 953.889892578125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.17035077238200905,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 36.14980895384696,
      "learning_rate": 9.866460433509893e-07,
      "loss": 0.0106,
      "num_tokens": 469566328.0,
      "reward": 0.033203125,
      "reward_std": 0.054241843521595,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 499
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.779296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 1792.544921875,
      "completions/mean_terminated_length": 890.539794921875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.17069215669539983,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 27.01887296968589,
      "learning_rate": 9.865160992308047e-07,
      "loss": 0.0179,
      "num_tokens": 470558703.0,
      "reward": 0.091796875,
      "reward_std": 0.09534074366092682,
      "rewards/accuracy_reward/mean": 0.091796875,
      "rewards/accuracy_reward/std": 0.289021372795105,
      "step": 500
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.80859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2004.0,
      "completions/mean_length": 1877.0546875,
      "completions/mean_terminated_length": 1154.89794921875,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.17103354100879065,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 59.305129030936534,
      "learning_rate": 9.863855355558573e-07,
      "loss": 0.0144,
      "num_tokens": 471599627.0,
      "reward": 0.09765625,
      "reward_std": 0.08499295264482498,
      "rewards/accuracy_reward/mean": 0.09765625,
      "rewards/accuracy_reward/std": 0.29713961482048035,
      "step": 501
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.814453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2024.0,
      "completions/mean_length": 1829.72265625,
      "completions/mean_terminated_length": 871.6000366210938,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.17137492532218146,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 42.46915295445329,
      "learning_rate": 9.862543525114582e-07,
      "loss": 0.0092,
      "num_tokens": 472609997.0,
      "reward": 0.021484375,
      "reward_std": 0.05193261429667473,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 502
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.77734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2038.0,
      "completions/mean_length": 1826.541015625,
      "completions/mean_terminated_length": 1053.377197265625,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.17171630963557225,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 17.236420763000325,
      "learning_rate": 9.861225502837976e-07,
      "loss": 0.0014,
      "num_tokens": 473628754.0,
      "reward": 0.029296875,
      "reward_std": 0.04726085811853409,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 503
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2038.0,
      "completions/mean_length": 1765.7109375,
      "completions/mean_terminated_length": 843.5667114257812,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.17205769394896306,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 11.058591294531599,
      "learning_rate": 9.859901290599448e-07,
      "loss": 0.025,
      "num_tokens": 474611182.0,
      "reward": 0.048828125,
      "reward_std": 0.07302350550889969,
      "rewards/accuracy_reward/mean": 0.05040322616696358,
      "rewards/accuracy_reward/std": 0.21899642050266266,
      "step": 504
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.85546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2002.0,
      "completions/mean_length": 1879.12890625,
      "completions/mean_terminated_length": 879.5946044921875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.17239907826235384,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 12.255340365910076,
      "learning_rate": 9.858570890278475e-07,
      "loss": 0.0121,
      "num_tokens": 475647424.0,
      "reward": 0.03125,
      "reward_std": 0.06837663054466248,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 505
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.865234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1868.5078125,
      "completions/mean_terminated_length": 716.115966796875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.17274046257574466,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 7.772987306282009,
      "learning_rate": 9.857234303763317e-07,
      "loss": 0.0187,
      "num_tokens": 476682724.0,
      "reward": 0.037109375,
      "reward_std": 0.05099457502365112,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 506
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2019.0,
      "completions/mean_length": 1883.412109375,
      "completions/mean_terminated_length": 808.75,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.17308184688913544,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 6.656879661902198,
      "learning_rate": 9.855891532951015e-07,
      "loss": 0.0159,
      "num_tokens": 477727463.0,
      "reward": 0.01171875,
      "reward_std": 0.03697281330823898,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "step": 507
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2031.0,
      "completions/mean_length": 1884.251953125,
      "completions/mean_terminated_length": 883.5694580078125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.17342323120252626,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 10.459997247741349,
      "learning_rate": 9.854542579747383e-07,
      "loss": 0.0051,
      "num_tokens": 478774088.0,
      "reward": 0.0078125,
      "reward_std": 0.02629890665411949,
      "rewards/accuracy_reward/mean": 0.0078125,
      "rewards/accuracy_reward/std": 0.08812850713729858,
      "step": 508
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1943.6875,
      "completions/mean_terminated_length": 1095.071533203125,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.17376461551591704,
      "frac_reward_zero_std": 0.96875,
      "grad_norm": 1.482333780461527,
      "learning_rate": 9.853187446067019e-07,
      "loss": 0.0055,
      "num_tokens": 479856200.0,
      "reward": 0.001953125,
      "reward_std": 0.0078125,
      "rewards/accuracy_reward/mean": 0.001953125,
      "rewards/accuracy_reward/std": 0.04419417306780815,
      "step": 509
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1850.0,
      "completions/mean_length": 1923.134765625,
      "completions/mean_terminated_length": 906.3750610351562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.17410599982930786,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 5.602772611966746,
      "learning_rate": 9.85182613383328e-07,
      "loss": 0.0121,
      "num_tokens": 480914365.0,
      "reward": 0.013671875,
      "reward_std": 0.03889618441462517,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "step": 510
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.876953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 1895.380859375,
      "completions/mean_terminated_length": 807.6666870117188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.17444738414269864,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 1.3264946718671955,
      "learning_rate": 9.850458644978307e-07,
      "loss": 0.0114,
      "num_tokens": 481964096.0,
      "reward": 0.009765625,
      "reward_std": 0.023271184414625168,
      "rewards/accuracy_reward/mean": 0.009765625,
      "rewards/accuracy_reward/std": 0.09843364357948303,
      "step": 511
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.873046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 1900.189453125,
      "completions/mean_terminated_length": 883.7077026367188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.17478876845608946,
      "frac_reward_zero_std": 0.96875,
      "grad_norm": 5.089345314160676,
      "learning_rate": 9.849084981442997e-07,
      "loss": -0.0001,
      "num_tokens": 483012833.0,
      "reward": 0.009765625,
      "reward_std": 0.014959799125790596,
      "rewards/accuracy_reward/mean": 0.009765625,
      "rewards/accuracy_reward/std": 0.09843364357948303,
      "step": 512
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1918.6953125,
      "completions/mean_terminated_length": 1013.5625,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.17513015276948024,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 46.126518427241216,
      "learning_rate": 9.847705145177013e-07,
      "loss": 0.0067,
      "num_tokens": 484079541.0,
      "reward": 0.01953125,
      "reward_std": 0.051493462175130844,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 513
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.955078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1722.0,
      "completions/mean_length": 1989.291015625,
      "completions/mean_terminated_length": 741.0869750976562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.17547153708287105,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 4.306278338996831,
      "learning_rate": 9.84631913813878e-07,
      "loss": 0.0023,
      "num_tokens": 485177050.0,
      "reward": 0.00390625,
      "reward_std": 0.015625,
      "rewards/accuracy_reward/mean": 0.00390625,
      "rewards/accuracy_reward/std": 0.06243881583213806,
      "step": 514
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1825.0,
      "completions/mean_length": 1833.232421875,
      "completions/mean_terminated_length": 521.2916870117188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.17581292139626184,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 17.07934141698569,
      "learning_rate": 9.844926962295487e-07,
      "loss": 0.0172,
      "num_tokens": 486193153.0,
      "reward": 0.046875,
      "reward_std": 0.08345641195774078,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 515
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2031.0,
      "completions/mean_length": 1886.50390625,
      "completions/mean_terminated_length": 1063.642822265625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.17615430570965265,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 25.918379914242184,
      "learning_rate": 9.843528619623068e-07,
      "loss": -0.0068,
      "num_tokens": 487242851.0,
      "reward": 0.048828125,
      "reward_std": 0.055459219962358475,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 516
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.908203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1968.0,
      "completions/mean_length": 1945.52734375,
      "completions/mean_terminated_length": 931.7020874023438,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.17649569002304344,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 6.074489289766609,
      "learning_rate": 9.842124112106214e-07,
      "loss": 0.0136,
      "num_tokens": 488316257.0,
      "reward": 0.0234375,
      "reward_std": 0.03449726849794388,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 517
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.873046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1961.0,
      "completions/mean_length": 1909.5859375,
      "completions/mean_terminated_length": 958.4000244140625,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.17683707433643425,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 7.7535261999195635,
      "learning_rate": 9.84071344173837e-07,
      "loss": 0.0016,
      "num_tokens": 489378333.0,
      "reward": 0.025390625,
      "reward_std": 0.036420635879039764,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 518
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.888671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1794.0,
      "completions/mean_length": 1894.037109375,
      "completions/mean_terminated_length": 665.0350952148438,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.17717845864982504,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 7.51208727068919,
      "learning_rate": 9.839296610521723e-07,
      "loss": 0.0062,
      "num_tokens": 490430016.0,
      "reward": 0.005859375,
      "reward_std": 0.0234375,
      "rewards/accuracy_reward/mean": 0.005859375,
      "rewards/accuracy_reward/std": 0.07639661431312561,
      "step": 519
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.869140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2016.0,
      "completions/mean_length": 1881.80078125,
      "completions/mean_terminated_length": 777.9403076171875,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.17751984296321585,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 9.310692694221732,
      "learning_rate": 9.837873620467203e-07,
      "loss": 0.0215,
      "num_tokens": 491472858.0,
      "reward": 0.03125,
      "reward_std": 0.04738742858171463,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 520
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.84765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2036.0,
      "completions/mean_length": 1865.65625,
      "completions/mean_terminated_length": 851.076904296875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.17786122727660664,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 10.328715596479137,
      "learning_rate": 9.836444473594488e-07,
      "loss": 0.0129,
      "num_tokens": 492511274.0,
      "reward": 0.0234375,
      "reward_std": 0.04670868441462517,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 521
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.896484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1896.0,
      "completions/mean_length": 1898.76171875,
      "completions/mean_terminated_length": 606.3018798828125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.17820261158999745,
      "frac_reward_zero_std": 0.96875,
      "grad_norm": 12.579588330372541,
      "learning_rate": 9.83500917193199e-07,
      "loss": -0.0017,
      "num_tokens": 493560880.0,
      "reward": 0.001953125,
      "reward_std": 0.0078125,
      "rewards/accuracy_reward/mean": 0.001953125,
      "rewards/accuracy_reward/std": 0.04419417306780815,
      "step": 522
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.90625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1811.0,
      "completions/mean_length": 1922.841796875,
      "completions/mean_terminated_length": 712.9791870117188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.17854399590338824,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 1.4255374368916374,
      "learning_rate": 9.833567717516856e-07,
      "loss": 0.0109,
      "num_tokens": 494619263.0,
      "reward": 0.0078125,
      "reward_std": 0.020409777760505676,
      "rewards/accuracy_reward/mean": 0.0078125,
      "rewards/accuracy_reward/std": 0.08812850713729858,
      "step": 523
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.857421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1941.0,
      "completions/mean_length": 1842.72265625,
      "completions/mean_terminated_length": 608.24658203125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.17888538021677905,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 5.922419274012383,
      "learning_rate": 9.832120112394969e-07,
      "loss": 0.0089,
      "num_tokens": 495637329.0,
      "reward": 0.0234375,
      "reward_std": 0.04738742858171463,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 524
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.919921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2022.0,
      "completions/mean_length": 1941.068359375,
      "completions/mean_terminated_length": 712.6585083007812,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.17922676453016984,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 9.525964094930613,
      "learning_rate": 9.830666358620936e-07,
      "loss": 0.019,
      "num_tokens": 496712276.0,
      "reward": 0.0234375,
      "reward_std": 0.03779878467321396,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 525
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1917.82421875,
      "completions/mean_terminated_length": 857.8214721679688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.17956814884356065,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 10.077388356284844,
      "learning_rate": 9.829206458258097e-07,
      "loss": 0.0207,
      "num_tokens": 497772922.0,
      "reward": 0.0234375,
      "reward_std": 0.04670868441462517,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 526
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1938.634765625,
      "completions/mean_terminated_length": 1225.176513671875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.17990953315695143,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 15.035390223215256,
      "learning_rate": 9.827740413378513e-07,
      "loss": 0.0119,
      "num_tokens": 498848239.0,
      "reward": 0.05078125,
      "reward_std": 0.06563520431518555,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 527
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.814453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1797.0,
      "completions/mean_length": 1808.3046875,
      "completions/mean_terminated_length": 756.16845703125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.18025091747034225,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 4.417740476227587,
      "learning_rate": 9.826268226062967e-07,
      "loss": 0.0041,
      "num_tokens": 499845467.0,
      "reward": 0.029296875,
      "reward_std": 0.01848640665411949,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 528
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.92578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1832.0,
      "completions/mean_length": 1923.798828125,
      "completions/mean_terminated_length": 374.5526428222656,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.18059230178373303,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 9.82478989840096e-07,
      "loss": 0.0,
      "num_tokens": 500902916.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/accuracy_reward/mean": 0.0,
      "rewards/accuracy_reward/std": 0.0,
      "step": 529
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.904296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1928.0,
      "completions/mean_length": 1925.734375,
      "completions/mean_terminated_length": 770.448974609375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.18093368609712385,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 7.997138510877159,
      "learning_rate": 9.823305432490705e-07,
      "loss": 0.0153,
      "num_tokens": 501968012.0,
      "reward": 0.021484375,
      "reward_std": 0.03741292655467987,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 530
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2016.0,
      "completions/mean_length": 1776.5546875,
      "completions/mean_terminated_length": 711.6538696289062,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.18127507041051463,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 11.314160850450367,
      "learning_rate": 9.821814830439133e-07,
      "loss": 0.0171,
      "num_tokens": 502958200.0,
      "reward": 0.037109375,
      "reward_std": 0.07273615896701813,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 531
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2042.0,
      "completions/mean_length": 1893.259765625,
      "completions/mean_terminated_length": 947.625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.18161645472390545,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 23.717393891089788,
      "learning_rate": 9.820318094361883e-07,
      "loss": 0.0044,
      "num_tokens": 504011549.0,
      "reward": 0.064453125,
      "reward_std": 0.06244116649031639,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 532
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.81640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1974.0,
      "completions/mean_length": 1809.103515625,
      "completions/mean_terminated_length": 746.7765502929688,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.18195783903729623,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 7.037442383874556,
      "learning_rate": 9.81881522638329e-07,
      "loss": 0.0107,
      "num_tokens": 505012722.0,
      "reward": 0.015625,
      "reward_std": 0.03839729726314545,
      "rewards/accuracy_reward/mean": 0.015625,
      "rewards/accuracy_reward/std": 0.12414088100194931,
      "step": 533
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.884765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2012.0,
      "completions/mean_length": 1889.455078125,
      "completions/mean_terminated_length": 672.1525268554688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.18229922335068705,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 9.081106926428404,
      "learning_rate": 9.817306228636411e-07,
      "loss": 0.0188,
      "num_tokens": 506049611.0,
      "reward": 0.021484375,
      "reward_std": 0.03889618441462517,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 534
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.794921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1871.0,
      "completions/mean_length": 1751.1015625,
      "completions/mean_terminated_length": 600.2666625976562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.18264060766407783,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 10.319436215537658,
      "learning_rate": 9.815791103262981e-07,
      "loss": 0.0008,
      "num_tokens": 507020799.0,
      "reward": 0.03515625,
      "reward_std": 0.05931950360536575,
      "rewards/accuracy_reward/mean": 0.03629032149910927,
      "rewards/accuracy_reward/std": 0.1872003972530365,
      "step": 535
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.775390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2011.0,
      "completions/mean_length": 1775.626953125,
      "completions/mean_terminated_length": 835.7042846679688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.18298199197746864,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 7.975147957281659,
      "learning_rate": 9.814269852413453e-07,
      "loss": 0.0049,
      "num_tokens": 508007712.0,
      "reward": 0.05859375,
      "reward_std": 0.06634815037250519,
      "rewards/accuracy_reward/mean": 0.060483869165182114,
      "rewards/accuracy_reward/std": 0.2386218160390854,
      "step": 536
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.818359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2025.0,
      "completions/mean_length": 1830.619140625,
      "completions/mean_terminated_length": 851.236572265625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.18332337629085943,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 2.2311419512950486,
      "learning_rate": 9.812742478246957e-07,
      "loss": 0.0009,
      "num_tokens": 509016813.0,
      "reward": 0.0234375,
      "reward_std": 0.036547206342220306,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 537
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.818359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1939.0,
      "completions/mean_length": 1797.908203125,
      "completions/mean_terminated_length": 671.1505126953125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.18366476060425024,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 5.654129346242353,
      "learning_rate": 9.811208982931327e-07,
      "loss": 0.0194,
      "num_tokens": 510007390.0,
      "reward": 0.01171875,
      "reward_std": 0.036034777760505676,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "step": 538
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.81640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1973.0,
      "completions/mean_length": 1813.591796875,
      "completions/mean_terminated_length": 771.223388671875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.18400614491764103,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 10.112976580884771,
      "learning_rate": 9.809669368643075e-07,
      "loss": 0.0161,
      "num_tokens": 511012125.0,
      "reward": 0.033203125,
      "reward_std": 0.06844235956668854,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 539
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.806640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1793.0,
      "completions/mean_length": 1803.060546875,
      "completions/mean_terminated_length": 781.242431640625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.18434752923103184,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 4.799750780626729,
      "learning_rate": 9.808123637567406e-07,
      "loss": 0.0201,
      "num_tokens": 512008268.0,
      "reward": 0.02734375,
      "reward_std": 0.03697281330823898,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 540
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.787109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2021.0,
      "completions/mean_length": 1810.48046875,
      "completions/mean_terminated_length": 932.3118896484375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.18468891354442263,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 6.498139135675712,
      "learning_rate": 9.806571791898196e-07,
      "loss": 0.0112,
      "num_tokens": 513015698.0,
      "reward": 0.013671875,
      "reward_std": 0.04973640665411949,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "step": 541
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.84375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1699.0,
      "completions/mean_length": 1825.5078125,
      "completions/mean_terminated_length": 624.0499877929688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.18503029785781344,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 15.349433768378061,
      "learning_rate": 9.805013833838014e-07,
      "loss": 0.015,
      "num_tokens": 514033446.0,
      "reward": 0.0390625,
      "reward_std": 0.060571081936359406,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 542
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1994.0,
      "completions/mean_length": 1857.458984375,
      "completions/mean_terminated_length": 886.607177734375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.18537168217120423,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 8.652111917404474,
      "learning_rate": 9.80344976559809e-07,
      "loss": 0.0238,
      "num_tokens": 515066049.0,
      "reward": 0.052734375,
      "reward_std": 0.06397771090269089,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 543
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.81640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2031.0,
      "completions/mean_length": 1842.611328125,
      "completions/mean_terminated_length": 929.2872314453125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.18571306648459504,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 26.82701428176531,
      "learning_rate": 9.801879589398338e-07,
      "loss": 0.0113,
      "num_tokens": 516081850.0,
      "reward": 0.0546875,
      "reward_std": 0.09782323986291885,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 544
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.73828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2009.0,
      "completions/mean_length": 1738.3828125,
      "completions/mean_terminated_length": 864.9850463867188,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.18605445079798583,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 21.662496615970383,
      "learning_rate": 9.80030330746733e-07,
      "loss": 0.0112,
      "num_tokens": 517054334.0,
      "reward": 0.09765625,
      "reward_std": 0.12384376674890518,
      "rewards/accuracy_reward/mean": 0.09765625,
      "rewards/accuracy_reward/std": 0.29713961482048035,
      "step": 545
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.759765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1708.005859375,
      "completions/mean_terminated_length": 632.7398071289062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.18639583511137664,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 11.361701547499647,
      "learning_rate": 9.798720922042316e-07,
      "loss": 0.0209,
      "num_tokens": 518006945.0,
      "reward": 0.08203125,
      "reward_std": 0.13616865873336792,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 546
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1875.9765625,
      "completions/mean_terminated_length": 752.7647094726562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.18673721942476743,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 3.1463425398936,
      "learning_rate": 9.7971324353692e-07,
      "loss": 0.0041,
      "num_tokens": 519044741.0,
      "reward": 0.01953125,
      "reward_std": 0.03944835811853409,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 547
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1905.0,
      "completions/mean_length": 1769.203125,
      "completions/mean_terminated_length": 838.3051147460938,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.18707860373815824,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 8.516965111440346,
      "learning_rate": 9.795537849702546e-07,
      "loss": 0.0167,
      "num_tokens": 520035997.0,
      "reward": 0.072265625,
      "reward_std": 0.08483455330133438,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 548
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.85546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1972.0,
      "completions/mean_length": 1850.822265625,
      "completions/mean_terminated_length": 683.7432861328125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.18741998805154902,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 23.723546969158985,
      "learning_rate": 9.79393716730558e-07,
      "loss": 0.008,
      "num_tokens": 521052338.0,
      "reward": 0.013671875,
      "reward_std": 0.043847277760505676,
      "rewards/accuracy_reward/mean": 0.01411290280520916,
      "rewards/accuracy_reward/std": 0.11807556450366974,
      "step": 549
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2031.0,
      "completions/mean_length": 1738.033203125,
      "completions/mean_terminated_length": 679.8706665039062,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.18776137236493984,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 5.554461766217026,
      "learning_rate": 9.792330390450179e-07,
      "loss": 0.0264,
      "num_tokens": 522016707.0,
      "reward": 0.05859375,
      "reward_std": 0.08659161627292633,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 550
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1986.0,
      "completions/mean_length": 1748.271484375,
      "completions/mean_terminated_length": 627.0648193359375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.18810275667833062,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 19.822015894176953,
      "learning_rate": 9.790717521416865e-07,
      "loss": 0.0079,
      "num_tokens": 522992606.0,
      "reward": 0.013671875,
      "reward_std": 0.03741292655467987,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "step": 551
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.818359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1983.0,
      "completions/mean_length": 1803.982421875,
      "completions/mean_terminated_length": 704.5913696289062,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.18844414099172144,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 7.063037350430578,
      "learning_rate": 9.789098562494813e-07,
      "loss": 0.0139,
      "num_tokens": 523985925.0,
      "reward": 0.037109375,
      "reward_std": 0.06271954625844955,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 552
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.861328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1869.0,
      "completions/mean_length": 1872.84765625,
      "completions/mean_terminated_length": 784.9295654296875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.18878552530511222,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 12.666618075475336,
      "learning_rate": 9.787473515981837e-07,
      "loss": 0.0037,
      "num_tokens": 525020919.0,
      "reward": 0.05078125,
      "reward_std": 0.06342554092407227,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 553
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.84765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1925.0,
      "completions/mean_length": 1857.02734375,
      "completions/mean_terminated_length": 794.4359130859375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.18912690961850304,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 8.84247693319475,
      "learning_rate": 9.785842384184396e-07,
      "loss": 0.0031,
      "num_tokens": 526051141.0,
      "reward": 0.01171875,
      "reward_std": 0.032021719962358475,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "step": 554
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.83203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1854.0,
      "completions/mean_length": 1805.044921875,
      "completions/mean_terminated_length": 602.1162719726562,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.18946829393189382,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 10.738478543181028,
      "learning_rate": 9.784205169417582e-07,
      "loss": 0.0077,
      "num_tokens": 527051596.0,
      "reward": 0.029296875,
      "reward_std": 0.05782270431518555,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 555
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.896484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1901.0,
      "completions/mean_length": 1913.193359375,
      "completions/mean_terminated_length": 745.7169799804688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.18980967824528464,
      "frac_reward_zero_std": 0.96875,
      "grad_norm": 0.3147091493933924,
      "learning_rate": 9.782561874005121e-07,
      "loss": 0.0044,
      "num_tokens": 528110431.0,
      "reward": 0.00390625,
      "reward_std": 0.010673906654119492,
      "rewards/accuracy_reward/mean": 0.00390625,
      "rewards/accuracy_reward/std": 0.06243881583213806,
      "step": 556
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.869140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1948.0,
      "completions/mean_length": 1893.43359375,
      "completions/mean_terminated_length": 866.8358154296875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.19015106255867542,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 28.941065225280187,
      "learning_rate": 9.78091250027937e-07,
      "loss": 0.0023,
      "num_tokens": 529154205.0,
      "reward": 0.05078125,
      "reward_std": 0.09145425260066986,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 557
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.845703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2020.0,
      "completions/mean_length": 1821.501953125,
      "completions/mean_terminated_length": 580.0632934570312,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.19049244687206623,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 5.7571919857895075,
      "learning_rate": 9.779257050581316e-07,
      "loss": 0.0056,
      "num_tokens": 530163518.0,
      "reward": 0.013671875,
      "reward_std": 0.03741292655467987,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "step": 558
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.91796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1542.0,
      "completions/mean_length": 1920.375,
      "completions/mean_terminated_length": 492.19049072265625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.19083383118545702,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 9.777595527260567e-07,
      "loss": 0.0,
      "num_tokens": 531221182.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/accuracy_reward/mean": 0.0,
      "rewards/accuracy_reward/std": 0.0,
      "step": 559
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.857421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1834.982421875,
      "completions/mean_terminated_length": 553.9589233398438,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.19117521549884783,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 7.177725227789154,
      "learning_rate": 9.77592793267535e-07,
      "loss": 0.0041,
      "num_tokens": 532234117.0,
      "reward": 0.0234375,
      "reward_std": 0.03839729726314545,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 560
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.904296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1903.0,
      "completions/mean_length": 1890.23828125,
      "completions/mean_terminated_length": 399.551025390625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.19151659981223862,
      "frac_reward_zero_std": 0.96875,
      "grad_norm": 0.8671678512352751,
      "learning_rate": 9.774254269192506e-07,
      "loss": 0.0038,
      "num_tokens": 533274975.0,
      "reward": 0.001953125,
      "reward_std": 0.0078125,
      "rewards/accuracy_reward/mean": 0.001953125,
      "rewards/accuracy_reward/std": 0.04419417306780815,
      "step": 561
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.83203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1934.0,
      "completions/mean_length": 1822.31640625,
      "completions/mean_terminated_length": 704.3953247070312,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.19185798412562943,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 7.537147527139063,
      "learning_rate": 9.772574539187503e-07,
      "loss": 0.011,
      "num_tokens": 534289729.0,
      "reward": 0.02734375,
      "reward_std": 0.04192390665411949,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 562
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.826171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 1845.525390625,
      "completions/mean_terminated_length": 883.2022705078125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.19219936843902022,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 4.534077515268373,
      "learning_rate": 9.770888745044405e-07,
      "loss": 0.0118,
      "num_tokens": 535310142.0,
      "reward": 0.021484375,
      "reward_std": 0.03773209825158119,
      "rewards/accuracy_reward/mean": 0.02217741869390011,
      "rewards/accuracy_reward/std": 0.14740893244743347,
      "step": 563
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.81640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1808.1484375,
      "completions/mean_terminated_length": 741.574462890625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.19254075275241103,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 13.679005661288093,
      "learning_rate": 9.769196889155888e-07,
      "loss": 0.0043,
      "num_tokens": 536323082.0,
      "reward": 0.04296875,
      "reward_std": 0.04907120764255524,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 564
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.90625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1792.0,
      "completions/mean_length": 1914.419921875,
      "completions/mean_terminated_length": 623.1458740234375,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.19288213706580182,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 6.152320412280184,
      "learning_rate": 9.767498973923236e-07,
      "loss": 0.0057,
      "num_tokens": 537377937.0,
      "reward": 0.009765625,
      "reward_std": 0.029160313308238983,
      "rewards/accuracy_reward/mean": 0.009765625,
      "rewards/accuracy_reward/std": 0.09843364357948303,
      "step": 565
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1796.42578125,
      "completions/mean_terminated_length": 759.9400024414062,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.19322352137919263,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 12.836133321230195,
      "learning_rate": 9.765795001756326e-07,
      "loss": 0.0119,
      "num_tokens": 538380043.0,
      "reward": 0.044921875,
      "reward_std": 0.05133409798145294,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 566
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.880859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1944.0,
      "completions/mean_length": 1894.626953125,
      "completions/mean_terminated_length": 761.5573120117188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.19356490569258342,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 9.405455281696756,
      "learning_rate": 9.764084975073635e-07,
      "loss": 0.005,
      "num_tokens": 539434492.0,
      "reward": 0.01171875,
      "reward_std": 0.02960042469203472,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "step": 567
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.888671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1885.0,
      "completions/mean_length": 1888.25,
      "completions/mean_terminated_length": 613.0526123046875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.19390629000597423,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 8.965843961012798,
      "learning_rate": 9.762368896302234e-07,
      "loss": 0.0105,
      "num_tokens": 540474572.0,
      "reward": 0.01953125,
      "reward_std": 0.058320626616477966,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 568
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.87109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2004.0,
      "completions/mean_length": 1873.9140625,
      "completions/mean_terminated_length": 697.5151977539062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.19424767431936502,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 4.310290168024493,
      "learning_rate": 9.760646767877784e-07,
      "loss": -0.0049,
      "num_tokens": 541506928.0,
      "reward": 0.01171875,
      "reward_std": 0.031083684414625168,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "step": 569
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.884765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1956.0,
      "completions/mean_length": 1910.544921875,
      "completions/mean_terminated_length": 855.1694946289062,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.19458905863275583,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 4.06847169525495,
      "learning_rate": 9.758918592244528e-07,
      "loss": 0.0128,
      "num_tokens": 542558423.0,
      "reward": 0.017578125,
      "reward_std": 0.03536957502365112,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "step": 570
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.806640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1820.466796875,
      "completions/mean_terminated_length": 871.2626342773438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.19493044294614661,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 12.81160156550978,
      "learning_rate": 9.757184371855298e-07,
      "loss": 0.0073,
      "num_tokens": 543575062.0,
      "reward": 0.04296875,
      "reward_std": 0.07415273785591125,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 571
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.841796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2005.0,
      "completions/mean_length": 1841.806640625,
      "completions/mean_terminated_length": 744.654296875,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.19527182725953743,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 15.483274572528053,
      "learning_rate": 9.7554441091715e-07,
      "loss": 0.0142,
      "num_tokens": 544594691.0,
      "reward": 0.04296875,
      "reward_std": 0.06037056818604469,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 572
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1845.0,
      "completions/mean_length": 1849.029296875,
      "completions/mean_terminated_length": 835.2261962890625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.19561321157292821,
      "frac_reward_zero_std": 0.96875,
      "grad_norm": 1.1304099438143267,
      "learning_rate": 9.753697806663124e-07,
      "loss": 0.0001,
      "num_tokens": 545617922.0,
      "reward": 0.00390625,
      "reward_std": 0.010673906654119492,
      "rewards/accuracy_reward/mean": 0.00390625,
      "rewards/accuracy_reward/std": 0.06243881583213806,
      "step": 573
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.810546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1913.0,
      "completions/mean_length": 1766.611328125,
      "completions/mean_terminated_length": 562.73193359375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.19595459588631903,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 20.944897509612453,
      "learning_rate": 9.75194546680872e-07,
      "loss": -0.0149,
      "num_tokens": 546599259.0,
      "reward": 0.03125,
      "reward_std": 0.05012226849794388,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 574
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.91796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1998.0,
      "completions/mean_length": 1935.37890625,
      "completions/mean_terminated_length": 675.0952758789062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.1962959801997098,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 2.175657373291508,
      "learning_rate": 9.750187092095422e-07,
      "loss": 0.0063,
      "num_tokens": 547664253.0,
      "reward": 0.021484375,
      "reward_std": 0.036420635879039764,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 575
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1939.0,
      "completions/mean_length": 1804.53125,
      "completions/mean_terminated_length": 631.45458984375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.19663736451310063,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 4.947413235457389,
      "learning_rate": 9.748422685018911e-07,
      "loss": 0.0011,
      "num_tokens": 548673133.0,
      "reward": 0.00390625,
      "reward_std": 0.015625,
      "rewards/accuracy_reward/mean": 0.004032257944345474,
      "rewards/accuracy_reward/std": 0.06343588978052139,
      "step": 576
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.853515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1867.91015625,
      "completions/mean_terminated_length": 818.586669921875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.1969787488264914,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 4.8444711746001765,
      "learning_rate": 9.74665224808345e-07,
      "loss": -0.0008,
      "num_tokens": 549709295.0,
      "reward": 0.015625,
      "reward_std": 0.023823359981179237,
      "rewards/accuracy_reward/mean": 0.015625,
      "rewards/accuracy_reward/std": 0.12414088100194931,
      "step": 577
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.759765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 1764.1484375,
      "completions/mean_terminated_length": 866.43896484375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.19732013313988223,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 11.501801099434747,
      "learning_rate": 9.744875783801844e-07,
      "loss": -0.0039,
      "num_tokens": 550688971.0,
      "reward": 0.0390625,
      "reward_std": 0.06299334019422531,
      "rewards/accuracy_reward/mean": 0.04032257944345474,
      "rewards/accuracy_reward/std": 0.19691328704357147,
      "step": 578
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1974.0,
      "completions/mean_length": 1819.109375,
      "completions/mean_terminated_length": 827.25,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.197661517453273,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 4.010604069433794,
      "learning_rate": 9.743093294695461e-07,
      "loss": 0.0123,
      "num_tokens": 551699731.0,
      "reward": 0.01953125,
      "reward_std": 0.036034777760505676,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 579
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.791015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1951.0,
      "completions/mean_length": 1810.64453125,
      "completions/mean_terminated_length": 912.2429809570312,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.19800290176666382,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 7.29025529971875,
      "learning_rate": 9.741304783294218e-07,
      "loss": 0.022,
      "num_tokens": 552705757.0,
      "reward": 0.041015625,
      "reward_std": 0.07339344918727875,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 580
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.806640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2015.0,
      "completions/mean_length": 1795.353515625,
      "completions/mean_terminated_length": 741.8485107421875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.1983442860800546,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 10.37541462003429,
      "learning_rate": 9.739510252136584e-07,
      "loss": 0.0041,
      "num_tokens": 553706850.0,
      "reward": 0.033203125,
      "reward_std": 0.029160313308238983,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 581
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1966.0,
      "completions/mean_length": 1823.11328125,
      "completions/mean_terminated_length": 796.45654296875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.19868567039344542,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 15.049323590650918,
      "learning_rate": 9.737709703769562e-07,
      "loss": 0.0186,
      "num_tokens": 554726348.0,
      "reward": 0.03515625,
      "reward_std": 0.05507335811853409,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 582
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.91796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1946.0,
      "completions/mean_length": 1936.390625,
      "completions/mean_terminated_length": 687.4285888671875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.1990270547068362,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 7.8096486926393,
      "learning_rate": 9.735903140748702e-07,
      "loss": 0.0106,
      "num_tokens": 555789172.0,
      "reward": 0.01171875,
      "reward_std": 0.036034777760505676,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "step": 583
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.89453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1929.0,
      "completions/mean_length": 1915.69921875,
      "completions/mean_terminated_length": 793.5925903320312,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.19936843902022702,
      "frac_reward_zero_std": 0.96875,
      "grad_norm": 3.2684767544500843,
      "learning_rate": 9.734090565638092e-07,
      "loss": -0.0003,
      "num_tokens": 556851962.0,
      "reward": 0.001953125,
      "reward_std": 0.0078125,
      "rewards/accuracy_reward/mean": 0.001953125,
      "rewards/accuracy_reward/std": 0.04419417306780815,
      "step": 584
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.857421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2028.0,
      "completions/mean_length": 1867.755859375,
      "completions/mean_terminated_length": 783.8218994140625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.1997098233336178,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 5.368717374353762,
      "learning_rate": 9.73227198101035e-07,
      "loss": 0.0052,
      "num_tokens": 557877565.0,
      "reward": 0.029296875,
      "reward_std": 0.05298367142677307,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 585
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1894.0,
      "completions/mean_length": 1888.966796875,
      "completions/mean_terminated_length": 775.734375,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.20005120764700862,
      "frac_reward_zero_std": 0.96875,
      "grad_norm": 0.9319445263626492,
      "learning_rate": 9.730447389446623e-07,
      "loss": 0.0007,
      "num_tokens": 558922700.0,
      "reward": 0.001953125,
      "reward_std": 0.0078125,
      "rewards/accuracy_reward/mean": 0.001953125,
      "rewards/accuracy_reward/std": 0.04419417306780815,
      "step": 586
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.87109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1855.0,
      "completions/mean_length": 1903.37109375,
      "completions/mean_terminated_length": 926.8485107421875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2003925919603994,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 3.7739725416482286,
      "learning_rate": 9.728616793536587e-07,
      "loss": 0.0051,
      "num_tokens": 559979466.0,
      "reward": 0.0078125,
      "reward_std": 0.020409777760505676,
      "rewards/accuracy_reward/mean": 0.0078125,
      "rewards/accuracy_reward/std": 0.08812850713729858,
      "step": 587
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.90625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1786.0,
      "completions/mean_length": 1913.91796875,
      "completions/mean_terminated_length": 617.7916870117188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.20073397627379022,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 0.4087371599173673,
      "learning_rate": 9.726780195878438e-07,
      "loss": 0.0125,
      "num_tokens": 561035120.0,
      "reward": 0.009765625,
      "reward_std": 0.02178792469203472,
      "rewards/accuracy_reward/mean": 0.009765625,
      "rewards/accuracy_reward/std": 0.09843364357948303,
      "step": 588
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1880.0,
      "completions/mean_length": 1915.935546875,
      "completions/mean_terminated_length": 747.673095703125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.201075360587181,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 26.011481222330602,
      "learning_rate": 9.724937599078888e-07,
      "loss": 0.0022,
      "num_tokens": 562094127.0,
      "reward": 0.01171875,
      "reward_std": 0.036034777760505676,
      "rewards/accuracy_reward/mean": 0.012096773833036423,
      "rewards/accuracy_reward/std": 0.10942844301462173,
      "step": 589
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1539.0,
      "completions/mean_length": 1928.267578125,
      "completions/mean_terminated_length": 515.4249877929688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.20141674490057182,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 3.126644533668479,
      "learning_rate": 9.72308900575317e-07,
      "loss": 0.0209,
      "num_tokens": 563148872.0,
      "reward": 0.015625,
      "reward_std": 0.027950849384069443,
      "rewards/accuracy_reward/mean": 0.015625,
      "rewards/accuracy_reward/std": 0.12414088100194931,
      "step": 590
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.84375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1844.841796875,
      "completions/mean_terminated_length": 747.7875366210938,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.2017581292139626,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 14.645977451419562,
      "learning_rate": 9.72123441852502e-07,
      "loss": 0.0128,
      "num_tokens": 564175095.0,
      "reward": 0.02734375,
      "reward_std": 0.05311024188995361,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 591
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.904296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1967.0,
      "completions/mean_length": 1915.9296875,
      "completions/mean_terminated_length": 668.0,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.20209951352735342,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 11.592869084790411,
      "learning_rate": 9.719373840026686e-07,
      "loss": 0.0107,
      "num_tokens": 565229731.0,
      "reward": 0.03125,
      "reward_std": 0.04369450733065605,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 592
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.814453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2031.0,
      "completions/mean_length": 1782.19140625,
      "completions/mean_terminated_length": 615.4315795898438,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.2024408978407442,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 21.605193368541222,
      "learning_rate": 9.717507272898922e-07,
      "loss": 0.0149,
      "num_tokens": 566223957.0,
      "reward": 0.029296875,
      "reward_std": 0.06150216981768608,
      "rewards/accuracy_reward/mean": 0.030241934582591057,
      "rewards/accuracy_reward/std": 0.1714252382516861,
      "step": 593
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1968.0,
      "completions/mean_length": 1821.900390625,
      "completions/mean_terminated_length": 440.1805725097656,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.20278228215413502,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 6.0613364526902025,
      "learning_rate": 9.715634719790978e-07,
      "loss": 0.022,
      "num_tokens": 567227362.0,
      "reward": 0.046875,
      "reward_std": 0.08670367300510406,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 594
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.91015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2021.0,
      "completions/mean_length": 1946.060546875,
      "completions/mean_terminated_length": 913.3695678710938,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2031236664675258,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 4.70812589186284,
      "learning_rate": 9.713756183360597e-07,
      "loss": 0.0033,
      "num_tokens": 568298561.0,
      "reward": 0.013671875,
      "reward_std": 0.030584799125790596,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "step": 595
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.861328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2000.0,
      "completions/mean_length": 1868.12890625,
      "completions/mean_terminated_length": 750.9013671875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.20346505078091662,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 14.122825166578655,
      "learning_rate": 9.711871666274021e-07,
      "loss": 0.0003,
      "num_tokens": 569320995.0,
      "reward": 0.03515625,
      "reward_std": 0.04505910724401474,
      "rewards/accuracy_reward/mean": 0.03629032149910927,
      "rewards/accuracy_reward/std": 0.1872003972530365,
      "step": 596
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.87890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1893.0,
      "completions/mean_length": 1896.140625,
      "completions/mean_terminated_length": 793.9354858398438,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2038064350943074,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 11.582825324714353,
      "learning_rate": 9.709981171205977e-07,
      "loss": -0.0005,
      "num_tokens": 570363723.0,
      "reward": 0.01171875,
      "reward_std": 0.022772299125790596,
      "rewards/accuracy_reward/mean": 0.012096773833036423,
      "rewards/accuracy_reward/std": 0.10942844301462173,
      "step": 597
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1984.0,
      "completions/mean_length": 1825.79296875,
      "completions/mean_terminated_length": 755.1591186523438,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.20414781940769822,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 6.968227355568104,
      "learning_rate": 9.708084700839678e-07,
      "loss": 0.0283,
      "num_tokens": 571380961.0,
      "reward": 0.076171875,
      "reward_std": 0.09793634712696075,
      "rewards/accuracy_reward/mean": 0.076171875,
      "rewards/accuracy_reward/std": 0.26553234457969666,
      "step": 598
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.84765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2022.0,
      "completions/mean_length": 1874.10546875,
      "completions/mean_terminated_length": 907.1154174804688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.204489203721089,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 8.123197532296599,
      "learning_rate": 9.706182257866812e-07,
      "loss": 0.011,
      "num_tokens": 572426135.0,
      "reward": 0.0390625,
      "reward_std": 0.046542368829250336,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 599
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.798828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1915.0,
      "completions/mean_length": 1784.28515625,
      "completions/mean_terminated_length": 737.1068115234375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.20483058803447982,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 24.643624675033465,
      "learning_rate": 9.704273844987555e-07,
      "loss": 0.0078,
      "num_tokens": 573419657.0,
      "reward": 0.064453125,
      "reward_std": 0.08780106902122498,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 600
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.78515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 1817.62109375,
      "completions/mean_terminated_length": 975.6908569335938,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.20517197234787063,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 19.13542093834099,
      "learning_rate": 9.702359464910546e-07,
      "loss": 0.0197,
      "num_tokens": 574426407.0,
      "reward": 0.072265625,
      "reward_std": 0.11113909631967545,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 601
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.755859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1928.0,
      "completions/mean_length": 1709.392578125,
      "completions/mean_terminated_length": 661.0640258789062,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.20551335666126141,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 7.179275537858811,
      "learning_rate": 9.700439120352898e-07,
      "loss": 0.0017,
      "num_tokens": 575380544.0,
      "reward": 0.021484375,
      "reward_std": 0.04604348540306091,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 602
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.85546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1944.0,
      "completions/mean_length": 1863.49609375,
      "completions/mean_terminated_length": 771.4324340820312,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.20585474097465223,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 6.2513676266546545,
      "learning_rate": 9.69851281404019e-07,
      "loss": 0.0227,
      "num_tokens": 576409502.0,
      "reward": 0.02734375,
      "reward_std": 0.07135801017284393,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 603
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.78125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2021.0,
      "completions/mean_length": 1762.978515625,
      "completions/mean_terminated_length": 745.044677734375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.20619612528804301,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 4.460169727717185,
      "learning_rate": 9.696580548706462e-07,
      "loss": 0.0157,
      "num_tokens": 577391123.0,
      "reward": 0.029296875,
      "reward_std": 0.06092274188995361,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 604
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2024.0,
      "completions/mean_length": 1777.99609375,
      "completions/mean_terminated_length": 876.4576416015625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.20653750960143383,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 2.8758813445490032,
      "learning_rate": 9.69464232709421e-07,
      "loss": 0.0214,
      "num_tokens": 578374577.0,
      "reward": 0.046875,
      "reward_std": 0.0876617580652237,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 605
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.814453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1995.0,
      "completions/mean_length": 1781.177734375,
      "completions/mean_terminated_length": 609.9684448242188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2068788939148246,
      "frac_reward_zero_std": 0.96875,
      "grad_norm": 1.0850371593350496,
      "learning_rate": 9.692698151954383e-07,
      "loss": -0.0002,
      "num_tokens": 579364316.0,
      "reward": 0.0078125,
      "reward_std": 0.013975424692034721,
      "rewards/accuracy_reward/mean": 0.0078125,
      "rewards/accuracy_reward/std": 0.08812850713729858,
      "step": 606
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.830078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1839.98828125,
      "completions/mean_terminated_length": 823.8390502929688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.20722027822821543,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 10.426151879229034,
      "learning_rate": 9.690748026046386e-07,
      "loss": -0.0004,
      "num_tokens": 580380614.0,
      "reward": 0.005859375,
      "reward_std": 0.01848640665411949,
      "rewards/accuracy_reward/mean": 0.005859375,
      "rewards/accuracy_reward/std": 0.07639661431312561,
      "step": 607
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2017.0,
      "completions/mean_length": 1716.9140625,
      "completions/mean_terminated_length": 658.5245361328125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.2075616625416062,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 2.8669272294840247,
      "learning_rate": 9.688791952138068e-07,
      "loss": -0.0039,
      "num_tokens": 581339354.0,
      "reward": 0.048828125,
      "reward_std": 0.04230976849794388,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 608
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.740234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2038.0,
      "completions/mean_length": 1751.7421875,
      "completions/mean_terminated_length": 907.518798828125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.20790304685499703,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 3.1731800354683215,
      "learning_rate": 9.686829933005709e-07,
      "loss": 0.0035,
      "num_tokens": 582312998.0,
      "reward": 0.005859375,
      "reward_std": 0.01848640665411949,
      "rewards/accuracy_reward/mean": 0.005859375,
      "rewards/accuracy_reward/std": 0.07639661431312561,
      "step": 609
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2000.0,
      "completions/mean_length": 1674.75,
      "completions/mean_terminated_length": 823.2628173828125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.2082444311683878,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 9.574243232286076,
      "learning_rate": 9.684861971434043e-07,
      "loss": 0.0001,
      "num_tokens": 583257558.0,
      "reward": 0.05078125,
      "reward_std": 0.08263043314218521,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 610
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.775390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1938.0,
      "completions/mean_length": 1743.79296875,
      "completions/mean_terminated_length": 693.6173706054688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.20858581548177862,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 22.244181455949896,
      "learning_rate": 9.682888070216231e-07,
      "loss": 0.0003,
      "num_tokens": 584229500.0,
      "reward": 0.037109375,
      "reward_std": 0.06695909798145294,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 611
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1917.0,
      "completions/mean_length": 1804.123046875,
      "completions/mean_terminated_length": 847.8750610351562,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.2089271997951694,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 9.826526998632836,
      "learning_rate": 9.680908232153865e-07,
      "loss": 0.0027,
      "num_tokens": 585237403.0,
      "reward": 0.009765625,
      "reward_std": 0.028222277760505676,
      "rewards/accuracy_reward/mean": 0.009765625,
      "rewards/accuracy_reward/std": 0.09843364357948303,
      "step": 612
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1887.0,
      "completions/mean_length": 1759.572265625,
      "completions/mean_terminated_length": 774.9396362304688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.20926858410856022,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 2.4591367622231552,
      "learning_rate": 9.67892246005696e-07,
      "loss": -0.0071,
      "num_tokens": 586210032.0,
      "reward": 0.029296875,
      "reward_std": 0.04859926179051399,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 613
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1988.0,
      "completions/mean_length": 1724.783203125,
      "completions/mean_terminated_length": 691.5491333007812,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.209609968421951,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 15.078602201675519,
      "learning_rate": 9.67693075674396e-07,
      "loss": 0.028,
      "num_tokens": 587171249.0,
      "reward": 0.080078125,
      "reward_std": 0.08311038464307785,
      "rewards/accuracy_reward/mean": 0.080078125,
      "rewards/accuracy_reward/std": 0.271679550409317,
      "step": 614
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 1770.412109375,
      "completions/mean_terminated_length": 681.9615478515625,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.20995135273534182,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 7.930803448566627,
      "learning_rate": 9.674933125041722e-07,
      "loss": -0.0017,
      "num_tokens": 588159108.0,
      "reward": 0.0234375,
      "reward_std": 0.054249756038188934,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 615
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.669921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2038.0,
      "completions/mean_length": 1642.30078125,
      "completions/mean_terminated_length": 818.8994140625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.2102927370487326,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 5.684390193283198,
      "learning_rate": 9.672929567785517e-07,
      "loss": 0.0016,
      "num_tokens": 589083790.0,
      "reward": 0.0390625,
      "reward_std": 0.04318207502365112,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 616
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.70703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1675.79296875,
      "completions/mean_terminated_length": 777.5333251953125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.21063412136212342,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 14.299613801039989,
      "learning_rate": 9.67092008781903e-07,
      "loss": 0.0074,
      "num_tokens": 590024660.0,
      "reward": 0.04296875,
      "reward_std": 0.10178204625844955,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 617
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.794921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1988.0,
      "completions/mean_length": 1775.001953125,
      "completions/mean_terminated_length": 716.8095703125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2109755056755142,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 4.809029334339867,
      "learning_rate": 9.668904687994351e-07,
      "loss": 0.0202,
      "num_tokens": 591002597.0,
      "reward": 0.037109375,
      "reward_std": 0.06587383151054382,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 618
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.697265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2007.0,
      "completions/mean_length": 1685.38671875,
      "completions/mean_terminated_length": 850.2064208984375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.21131688998890502,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 5.840188475369804,
      "learning_rate": 9.66688337117197e-07,
      "loss": -0.0032,
      "num_tokens": 591943883.0,
      "reward": 0.029296875,
      "reward_std": 0.030584799125790596,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 619
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2026.0,
      "completions/mean_length": 1631.392578125,
      "completions/mean_terminated_length": 714.8562622070312,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2116582743022958,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 5.278747550405541,
      "learning_rate": 9.664856140220778e-07,
      "loss": -0.0132,
      "num_tokens": 592856628.0,
      "reward": 0.033203125,
      "reward_std": 0.04219770431518555,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 620
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.720703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1716.3125,
      "completions/mean_terminated_length": 860.4195556640625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.21199965861568662,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 11.36693791810471,
      "learning_rate": 9.662822998018056e-07,
      "loss": -0.0016,
      "num_tokens": 593813252.0,
      "reward": 0.029296875,
      "reward_std": 0.04456022381782532,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 621
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.794921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1932.0,
      "completions/mean_length": 1812.8203125,
      "completions/mean_terminated_length": 901.2190551757812,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.2123410429290774,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 10.517391880169027,
      "learning_rate": 9.66078394744948e-07,
      "loss": 0.0232,
      "num_tokens": 594819560.0,
      "reward": 0.0546875,
      "reward_std": 0.10519562661647797,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 622
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.802734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1925.0,
      "completions/mean_length": 1793.93359375,
      "completions/mean_terminated_length": 760.0593872070312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.21268242724246822,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 5.878379186252776,
      "learning_rate": 9.65873899140911e-07,
      "loss": 0.017,
      "num_tokens": 595811814.0,
      "reward": 0.046875,
      "reward_std": 0.08675792813301086,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 623
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2022.0,
      "completions/mean_length": 1720.763671875,
      "completions/mean_terminated_length": 718.6111450195312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.213023811555859,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 6.563406905558506,
      "learning_rate": 9.656688132799382e-07,
      "loss": 0.0064,
      "num_tokens": 596782269.0,
      "reward": 0.037109375,
      "reward_std": 0.05335709825158119,
      "rewards/accuracy_reward/mean": 0.038306452333927155,
      "rewards/accuracy_reward/std": 0.19212885200977325,
      "step": 624
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.85546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1875.384765625,
      "completions/mean_terminated_length": 854.4324340820312,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.21336519586924982,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 7.929407269197637,
      "learning_rate": 9.65463137453112e-07,
      "loss": -0.0004,
      "num_tokens": 597821186.0,
      "reward": 0.0234375,
      "reward_std": 0.03779878467321396,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 625
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2015.0,
      "completions/mean_length": 1852.630859375,
      "completions/mean_terminated_length": 857.1785888671875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.2137065801826406,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 3.607552224731443,
      "learning_rate": 9.652568719523516e-07,
      "loss": 0.0235,
      "num_tokens": 598849925.0,
      "reward": 0.0390625,
      "reward_std": 0.08109389245510101,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 626
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1947.0,
      "completions/mean_length": 1803.330078125,
      "completions/mean_terminated_length": 743.09375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.21404796449603142,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 5.434637836004345,
      "learning_rate": 9.650500170704127e-07,
      "loss": 0.0199,
      "num_tokens": 599851790.0,
      "reward": 0.037109375,
      "reward_std": 0.06118202954530716,
      "rewards/accuracy_reward/mean": 0.038306452333927155,
      "rewards/accuracy_reward/std": 0.19212883710861206,
      "step": 627
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.791015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2009.0,
      "completions/mean_length": 1785.666015625,
      "completions/mean_terminated_length": 792.7196044921875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2143893488094222,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 7.556227191545887,
      "learning_rate": 9.648425731008884e-07,
      "loss": 0.014,
      "num_tokens": 600845667.0,
      "reward": 0.0390625,
      "reward_std": 0.06449805945158005,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 628
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.869140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1654.0,
      "completions/mean_length": 1856.419921875,
      "completions/mean_terminated_length": 583.9850463867188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.21473073312281302,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 1.6683977493428692,
      "learning_rate": 9.646345403382073e-07,
      "loss": 0.0144,
      "num_tokens": 601879242.0,
      "reward": 0.01171875,
      "reward_std": 0.032021719962358475,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "step": 629
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.865234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1831.0,
      "completions/mean_length": 1883.33203125,
      "completions/mean_terminated_length": 826.115966796875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2150721174362038,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 10.02712140385809,
      "learning_rate": 9.644259190776339e-07,
      "loss": 0.0041,
      "num_tokens": 602916468.0,
      "reward": 0.01953125,
      "reward_std": 0.028608137741684914,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 630
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2003.0,
      "completions/mean_length": 1929.96875,
      "completions/mean_terminated_length": 885.84619140625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.21541350174959462,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 2.838934611715861,
      "learning_rate": 9.642167096152678e-07,
      "loss": 0.0077,
      "num_tokens": 603979028.0,
      "reward": 0.009765625,
      "reward_std": 0.03411140665411949,
      "rewards/accuracy_reward/mean": 0.009765625,
      "rewards/accuracy_reward/std": 0.09843364357948303,
      "step": 631
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1989.0,
      "completions/mean_length": 1762.919921875,
      "completions/mean_terminated_length": 811.0423583984375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.2157548860629854,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 14.619106996182678,
      "learning_rate": 9.640069122480437e-07,
      "loss": 0.0278,
      "num_tokens": 604958843.0,
      "reward": 0.07421875,
      "reward_std": 0.12224511802196503,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "step": 632
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.779296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2010.0,
      "completions/mean_length": 1786.275390625,
      "completions/mean_terminated_length": 862.1327514648438,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.21609627037637621,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 9.271328246169466,
      "learning_rate": 9.637965272737305e-07,
      "loss": 0.0076,
      "num_tokens": 605946920.0,
      "reward": 0.017578125,
      "reward_std": 0.03724660724401474,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "step": 633
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.783203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1765.341796875,
      "completions/mean_terminated_length": 744.2072143554688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.216437654689767,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 5.266560544683273,
      "learning_rate": 9.635855549909314e-07,
      "loss": 0.0048,
      "num_tokens": 606923639.0,
      "reward": 0.03515625,
      "reward_std": 0.0645298957824707,
      "rewards/accuracy_reward/mean": 0.03629032149910927,
      "rewards/accuracy_reward/std": 0.1872003823518753,
      "step": 634
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.869140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1895.080078125,
      "completions/mean_terminated_length": 879.4179077148438,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.21677903900315781,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 1.9201946016079303,
      "learning_rate": 9.63373995699083e-07,
      "loss": -0.0006,
      "num_tokens": 607973200.0,
      "reward": 0.0078125,
      "reward_std": 0.020409777760505676,
      "rewards/accuracy_reward/mean": 0.0078125,
      "rewards/accuracy_reward/std": 0.08812850713729858,
      "step": 635
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.876953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 1909.587890625,
      "completions/mean_terminated_length": 923.1270141601562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2171204233165486,
      "frac_reward_zero_std": 0.96875,
      "grad_norm": 0.06175537020243458,
      "learning_rate": 9.631618496984546e-07,
      "loss": 0.0054,
      "num_tokens": 609026973.0,
      "reward": 0.001953125,
      "reward_std": 0.0078125,
      "rewards/accuracy_reward/mean": 0.001953125,
      "rewards/accuracy_reward/std": 0.04419417306780815,
      "step": 636
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.837890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1997.0,
      "completions/mean_length": 1863.95703125,
      "completions/mean_terminated_length": 912.6987915039062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.2174618076299394,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 3.7033716440696702,
      "learning_rate": 9.62949117290149e-07,
      "loss": 0.0071,
      "num_tokens": 610061111.0,
      "reward": 0.015625,
      "reward_std": 0.02629890665411949,
      "rewards/accuracy_reward/mean": 0.015625,
      "rewards/accuracy_reward/std": 0.12414088100194931,
      "step": 637
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1986.0,
      "completions/mean_length": 1822.32421875,
      "completions/mean_terminated_length": 936.9807739257812,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2178031919433302,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 7.692842122644058,
      "learning_rate": 9.627357987761007e-07,
      "loss": 0.0192,
      "num_tokens": 611062637.0,
      "reward": 0.0625,
      "reward_std": 0.062220655381679535,
      "rewards/accuracy_reward/mean": 0.06451612710952759,
      "rewards/accuracy_reward/std": 0.2459181249141693,
      "step": 638
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1896.0,
      "completions/mean_length": 1789.890625,
      "completions/mean_terminated_length": 777.3077392578125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.218144576256721,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 4.649960745861791,
      "learning_rate": 9.625218944590763e-07,
      "loss": 0.0114,
      "num_tokens": 612052261.0,
      "reward": 0.029296875,
      "reward_std": 0.07108421623706818,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 639
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.775390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2017.0,
      "completions/mean_length": 1768.314453125,
      "completions/mean_terminated_length": 802.791259765625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2184859605701118,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 10.403190356293088,
      "learning_rate": 9.623074046426744e-07,
      "loss": -0.0029,
      "num_tokens": 613033414.0,
      "reward": 0.033203125,
      "reward_std": 0.048086829483509064,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 640
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.724609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2038.0,
      "completions/mean_length": 1727.623046875,
      "completions/mean_terminated_length": 884.6453857421875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2188273448835026,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 3.370851484601525,
      "learning_rate": 9.620923296313234e-07,
      "loss": 0.0101,
      "num_tokens": 613992597.0,
      "reward": 0.03515625,
      "reward_std": 0.05385598540306091,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 641
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.83984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1947.0,
      "completions/mean_length": 1848.486328125,
      "completions/mean_terminated_length": 802.2560424804688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.2191687291968934,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 1.1999419115278684,
      "learning_rate": 9.618766697302835e-07,
      "loss": 0.0029,
      "num_tokens": 615016590.0,
      "reward": 0.01953125,
      "reward_std": 0.023823359981179237,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 642
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.70703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1992.0,
      "completions/mean_length": 1719.28125,
      "completions/mean_terminated_length": 925.9733276367188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.2195101135102842,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 5.093660580826425,
      "learning_rate": 9.616604252456437e-07,
      "loss": -0.009,
      "num_tokens": 615976590.0,
      "reward": 0.0234375,
      "reward_std": 0.0479668527841568,
      "rewards/accuracy_reward/mean": 0.024193547666072845,
      "rewards/accuracy_reward/std": 0.15380480885505676,
      "step": 643
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.755859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2022.0,
      "completions/mean_length": 1792.875,
      "completions/mean_terminated_length": 1003.008056640625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.219851497823675,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 2.8023919057887317,
      "learning_rate": 9.614435964843245e-07,
      "loss": 0.0228,
      "num_tokens": 616968542.0,
      "reward": 0.03515625,
      "reward_std": 0.08219823986291885,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 644
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.763671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1976.0,
      "completions/mean_length": 1775.263671875,
      "completions/mean_terminated_length": 893.942138671875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2201928821370658,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 5.44072077570558,
      "learning_rate": 9.612261837540738e-07,
      "loss": 0.0067,
      "num_tokens": 617952357.0,
      "reward": 0.033203125,
      "reward_std": 0.05392926186323166,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 645
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2026.0,
      "completions/mean_length": 1701.828125,
      "completions/mean_terminated_length": 817.1666870117188,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.2205342664504566,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 3.1143148954330266,
      "learning_rate": 9.610081873634696e-07,
      "loss": 0.0132,
      "num_tokens": 618912557.0,
      "reward": 0.015625,
      "reward_std": 0.03438520431518555,
      "rewards/accuracy_reward/mean": 0.015625,
      "rewards/accuracy_reward/std": 0.12414088100194931,
      "step": 646
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.783203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1999.0,
      "completions/mean_length": 1811.61328125,
      "completions/mean_terminated_length": 957.6396484375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2208756507638474,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 12.358383795916899,
      "learning_rate": 9.607896076219181e-07,
      "loss": 0.0003,
      "num_tokens": 619921063.0,
      "reward": 0.06640625,
      "reward_std": 0.05558578670024872,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "step": 647
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.728515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2021.0,
      "completions/mean_length": 1747.634765625,
      "completions/mean_terminated_length": 941.6187133789062,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2212170350772382,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 3.6167391162865927,
      "learning_rate": 9.605704448396529e-07,
      "loss": 0.0249,
      "num_tokens": 620887788.0,
      "reward": 0.041015625,
      "reward_std": 0.06849661469459534,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 648
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1963.0,
      "completions/mean_length": 1773.365234375,
      "completions/mean_terminated_length": 695.9519653320312,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.221558419390629,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 2.6445848095832476,
      "learning_rate": 9.603506993277354e-07,
      "loss": 0.0148,
      "num_tokens": 621869399.0,
      "reward": 0.01171875,
      "reward_std": 0.03697281330823898,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "step": 649
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.736328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2033.0,
      "completions/mean_length": 1718.099609375,
      "completions/mean_terminated_length": 796.8222045898438,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.2218998037040198,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 5.39632894233189,
      "learning_rate": 9.601303713980545e-07,
      "loss": 0.0225,
      "num_tokens": 622829802.0,
      "reward": 0.0546875,
      "reward_std": 0.06546888500452042,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 650
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 1676.826171875,
      "completions/mean_terminated_length": 916.8035888671875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.2222411880174106,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 5.049203049976164,
      "learning_rate": 9.599094613633255e-07,
      "loss": 0.0191,
      "num_tokens": 623761537.0,
      "reward": 0.0703125,
      "reward_std": 0.1024949848651886,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 651
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.740234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 1750.84375,
      "completions/mean_terminated_length": 904.0601806640625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2225825723308014,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 5.696573944834077,
      "learning_rate": 9.596879695370894e-07,
      "loss": 0.013,
      "num_tokens": 624738305.0,
      "reward": 0.033203125,
      "reward_std": 0.07493096590042114,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 652
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.72265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1706.591796875,
      "completions/mean_terminated_length": 817.0070190429688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2229239566441922,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 4.418678104304901,
      "learning_rate": 9.594658962337134e-07,
      "loss": 0.0226,
      "num_tokens": 625691632.0,
      "reward": 0.05859375,
      "reward_std": 0.07682153582572937,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 653
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.77734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1999.0,
      "completions/mean_length": 1802.8203125,
      "completions/mean_terminated_length": 946.8421020507812,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.223265340957583,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 8.901602624213846,
      "learning_rate": 9.592432417683903e-07,
      "loss": 0.0109,
      "num_tokens": 626696276.0,
      "reward": 0.044921875,
      "reward_std": 0.07729348540306091,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 654
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.77734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2023.0,
      "completions/mean_length": 1764.595703125,
      "completions/mean_terminated_length": 775.1666870117188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2236067252709738,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 13.61631701851183,
      "learning_rate": 9.59020006457137e-07,
      "loss": 0.0052,
      "num_tokens": 627672325.0,
      "reward": 0.05859375,
      "reward_std": 0.06162214279174805,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 655
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.736328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1732.146484375,
      "completions/mean_terminated_length": 850.0962524414062,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.2239481095843646,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 2.0359898782231824,
      "learning_rate": 9.587961906167952e-07,
      "loss": 0.0466,
      "num_tokens": 628634848.0,
      "reward": 0.076171875,
      "reward_std": 0.09633205831050873,
      "rewards/accuracy_reward/mean": 0.076171875,
      "rewards/accuracy_reward/std": 0.26553234457969666,
      "step": 656
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1797.7734375,
      "completions/mean_terminated_length": 816.1154174804688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2242894938977554,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 3.9369401285106584,
      "learning_rate": 9.585717945650307e-07,
      "loss": 0.0177,
      "num_tokens": 629627708.0,
      "reward": 0.05078125,
      "reward_std": 0.0761418342590332,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 657
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1818.654296875,
      "completions/mean_terminated_length": 960.7315063476562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2246308782111462,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 5.019428379206984,
      "learning_rate": 9.583468186203326e-07,
      "loss": 0.0176,
      "num_tokens": 630627867.0,
      "reward": 0.025390625,
      "reward_std": 0.06849660724401474,
      "rewards/accuracy_reward/mean": 0.026209676638245583,
      "rewards/accuracy_reward/std": 0.1599196493625641,
      "step": 658
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.791015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1921.0,
      "completions/mean_length": 1825.560546875,
      "completions/mean_terminated_length": 983.6168212890625,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.224972262524537,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 9.781776288495715,
      "learning_rate": 9.581212631020132e-07,
      "loss": 0.0185,
      "num_tokens": 631644218.0,
      "reward": 0.05859375,
      "reward_std": 0.09671889245510101,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 659
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.810546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1922.0,
      "completions/mean_length": 1821.67578125,
      "completions/mean_terminated_length": 853.3814086914062,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.2253136468379278,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 6.8209333369744485,
      "learning_rate": 9.578951283302072e-07,
      "loss": 0.0074,
      "num_tokens": 632650564.0,
      "reward": 0.037109375,
      "reward_std": 0.08537881821393967,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 660
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.794921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1998.0,
      "completions/mean_length": 1823.76953125,
      "completions/mean_terminated_length": 954.6095581054688,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.2256550311513186,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 4.765844309007517,
      "learning_rate": 9.576684146258715e-07,
      "loss": 0.0057,
      "num_tokens": 633662398.0,
      "reward": 0.05859375,
      "reward_std": 0.0828578919172287,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 661
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2036.0,
      "completions/mean_length": 1830.109375,
      "completions/mean_terminated_length": 719.90478515625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2259964154647094,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 1.7631497884140344,
      "learning_rate": 9.574411223107849e-07,
      "loss": 0.0097,
      "num_tokens": 634677062.0,
      "reward": 0.021484375,
      "reward_std": 0.04604348540306091,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 662
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2020.0,
      "completions/mean_length": 1876.94921875,
      "completions/mean_terminated_length": 895.6578979492188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.2263377997781002,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 2.07349670476617,
      "learning_rate": 9.572132517075472e-07,
      "loss": 0.0054,
      "num_tokens": 635722588.0,
      "reward": 0.0078125,
      "reward_std": 0.03125,
      "rewards/accuracy_reward/mean": 0.0078125,
      "rewards/accuracy_reward/std": 0.08812850713729858,
      "step": 663
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.818359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1831.556640625,
      "completions/mean_terminated_length": 856.3978271484375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.226679184091491,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 2.1084424092730654,
      "learning_rate": 9.56984803139579e-07,
      "loss": 0.0165,
      "num_tokens": 636733017.0,
      "reward": 0.029296875,
      "reward_std": 0.06425705552101135,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 664
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.865234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1952.0,
      "completions/mean_length": 1871.3203125,
      "completions/mean_terminated_length": 736.9855346679688,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.2270205684048818,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 3.303572483000552,
      "learning_rate": 9.567557769311213e-07,
      "loss": 0.0152,
      "num_tokens": 637772429.0,
      "reward": 0.015625,
      "reward_std": 0.05259781330823898,
      "rewards/accuracy_reward/mean": 0.015625,
      "rewards/accuracy_reward/std": 0.12414088100194931,
      "step": 665
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.865234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1877.0,
      "completions/mean_length": 1875.404296875,
      "completions/mean_terminated_length": 767.2898559570312,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2273619527182726,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.672283061801089,
      "learning_rate": 9.565261734072346e-07,
      "loss": 0.0178,
      "num_tokens": 638811772.0,
      "reward": 0.03515625,
      "reward_std": 0.04742163047194481,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 666
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1831.0,
      "completions/mean_length": 1859.57421875,
      "completions/mean_terminated_length": 778.6052856445312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.2277033370316634,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 9.229973042212883,
      "learning_rate": 9.562959928937999e-07,
      "loss": 0.0061,
      "num_tokens": 639842706.0,
      "reward": 0.046875,
      "reward_std": 0.06304663419723511,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 667
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1923.0,
      "completions/mean_length": 1880.767578125,
      "completions/mean_terminated_length": 620.9500122070312,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.22804472134505419,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 3.147531335995282,
      "learning_rate": 9.560652357175157e-07,
      "loss": 0.0072,
      "num_tokens": 640878971.0,
      "reward": 0.013671875,
      "reward_std": 0.0234375,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "step": 668
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1945.0,
      "completions/mean_length": 1902.29296875,
      "completions/mean_terminated_length": 804.6333618164062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.228386105658445,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 5.107994333623763,
      "learning_rate": 9.558339022058995e-07,
      "loss": 0.0173,
      "num_tokens": 641930625.0,
      "reward": 0.02734375,
      "reward_std": 0.05072315037250519,
      "rewards/accuracy_reward/mean": 0.02822580561041832,
      "rewards/accuracy_reward/std": 0.1657845675945282,
      "step": 669
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1631.0,
      "completions/mean_length": 1943.947265625,
      "completions/mean_terminated_length": 568.138916015625,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.22872748997183578,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 4.249754160057081,
      "learning_rate": 9.556019926872874e-07,
      "loss": 0.0155,
      "num_tokens": 643007782.0,
      "reward": 0.0078125,
      "reward_std": 0.03125,
      "rewards/accuracy_reward/mean": 0.0078125,
      "rewards/accuracy_reward/std": 0.08812850713729858,
      "step": 670
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.91015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1678.0,
      "completions/mean_length": 1931.298828125,
      "completions/mean_terminated_length": 749.0652465820312,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.2290688742852266,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 15.119769835457573,
      "learning_rate": 9.553695074908321e-07,
      "loss": 0.015,
      "num_tokens": 644064607.0,
      "reward": 0.0390625,
      "reward_std": 0.08604402095079422,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 671
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.896484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1978.0,
      "completions/mean_length": 1933.623046875,
      "completions/mean_terminated_length": 943.79248046875,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.22941025859861738,
      "frac_reward_zero_std": 0.96875,
      "grad_norm": 5.792601290112887,
      "learning_rate": 9.55136446946504e-07,
      "loss": 0.0044,
      "num_tokens": 645132494.0,
      "reward": 0.001953125,
      "reward_std": 0.0078125,
      "rewards/accuracy_reward/mean": 0.001953125,
      "rewards/accuracy_reward/std": 0.04419417306780815,
      "step": 672
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2042.0,
      "completions/mean_length": 1920.080078125,
      "completions/mean_terminated_length": 879.357177734375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2297516429120082,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 2.1010597011537926,
      "learning_rate": 9.549028113850903e-07,
      "loss": 0.0082,
      "num_tokens": 646193399.0,
      "reward": 0.009765625,
      "reward_std": 0.03411140665411949,
      "rewards/accuracy_reward/mean": 0.009765625,
      "rewards/accuracy_reward/std": 0.09843364357948303,
      "step": 673
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.900390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1994.0,
      "completions/mean_length": 1926.142578125,
      "completions/mean_terminated_length": 824.6470947265625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.23009302722539898,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 5.111521833901306,
      "learning_rate": 9.54668601138193e-07,
      "loss": 0.0169,
      "num_tokens": 647256416.0,
      "reward": 0.013671875,
      "reward_std": 0.025633705779910088,
      "rewards/accuracy_reward/mean": 0.01411290280520916,
      "rewards/accuracy_reward/std": 0.11807556450366974,
      "step": 674
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.900390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1906.0,
      "completions/mean_length": 1914.828125,
      "completions/mean_terminated_length": 712.0784301757812,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.2304344115387898,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 5.817514981275791,
      "learning_rate": 9.544338165382318e-07,
      "loss": -0.001,
      "num_tokens": 648317544.0,
      "reward": 0.005859375,
      "reward_std": 0.0234375,
      "rewards/accuracy_reward/mean": 0.005859375,
      "rewards/accuracy_reward/std": 0.07639661431312561,
      "step": 675
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.90625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1944.0,
      "completions/mean_length": 1944.029296875,
      "completions/mean_terminated_length": 940.0,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.23077579585218058,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 3.19701188209496,
      "learning_rate": 9.541984579184399e-07,
      "loss": 0.0055,
      "num_tokens": 649391975.0,
      "reward": 0.0078125,
      "reward_std": 0.03125,
      "rewards/accuracy_reward/mean": 0.0078125,
      "rewards/accuracy_reward/std": 0.08812850713729858,
      "step": 676
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.865234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1955.0,
      "completions/mean_length": 1874.87109375,
      "completions/mean_terminated_length": 763.3333740234375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.2311171801655714,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 21.240587694529683,
      "learning_rate": 9.539625256128658e-07,
      "loss": 0.011,
      "num_tokens": 650435413.0,
      "reward": 0.01953125,
      "reward_std": 0.051493462175130844,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 677
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.93359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1627.0,
      "completions/mean_length": 1958.833984375,
      "completions/mean_terminated_length": 705.2647094726562,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.23145856447896218,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 18.57208646125963,
      "learning_rate": 9.537260199563723e-07,
      "loss": 0.0092,
      "num_tokens": 651509984.0,
      "reward": 0.013671875,
      "reward_std": 0.032461829483509064,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "step": 678
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1962.0,
      "completions/mean_length": 1914.94921875,
      "completions/mean_terminated_length": 912.6333618164062,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.231799948792353,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 6.169590602862235,
      "learning_rate": 9.534889412846361e-07,
      "loss": 0.0114,
      "num_tokens": 652569190.0,
      "reward": 0.025390625,
      "reward_std": 0.04230976849794388,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 679
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.90234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 1952.755859375,
      "completions/mean_terminated_length": 1073.5399169921875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.23214133310574378,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 4.807154451297801,
      "learning_rate": 9.532512899341467e-07,
      "loss": 0.0134,
      "num_tokens": 653654393.0,
      "reward": 0.021484375,
      "reward_std": 0.03889618441462517,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 680
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.90625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1900.0,
      "completions/mean_length": 1925.556640625,
      "completions/mean_terminated_length": 741.9375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2324827174191346,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 1.9076291352445545,
      "learning_rate": 9.53013066242207e-07,
      "loss": 0.0049,
      "num_tokens": 654714166.0,
      "reward": 0.00390625,
      "reward_std": 0.015625,
      "rewards/accuracy_reward/mean": 0.00390625,
      "rewards/accuracy_reward/std": 0.06243881583213806,
      "step": 681
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.92578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1753.0,
      "completions/mean_length": 1947.02734375,
      "completions/mean_terminated_length": 687.5263061523438,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.23282410173252538,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 9.421055299085438,
      "learning_rate": 9.527742705469318e-07,
      "loss": 0.0214,
      "num_tokens": 655786836.0,
      "reward": 0.02734375,
      "reward_std": 0.0580955371260643,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 682
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1979.0,
      "completions/mean_length": 1931.18359375,
      "completions/mean_terminated_length": 689.9318237304688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2331654860459162,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 5.449464217623047,
      "learning_rate": 9.525349031872481e-07,
      "loss": 0.0136,
      "num_tokens": 656852226.0,
      "reward": 0.013671875,
      "reward_std": 0.032461829483509064,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "step": 683
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.93359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2033.0,
      "completions/mean_length": 1964.025390625,
      "completions/mean_terminated_length": 783.441162109375,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.23350687035930698,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 19.298173662881958,
      "learning_rate": 9.52294964502894e-07,
      "loss": 0.0173,
      "num_tokens": 657932031.0,
      "reward": 0.01953125,
      "reward_std": 0.046542368829250336,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 684
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.87109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2005.0,
      "completions/mean_length": 1891.552734375,
      "completions/mean_terminated_length": 834.3485107421875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2338482546726978,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 7.337417659861314,
      "learning_rate": 9.520544548344184e-07,
      "loss": -0.0011,
      "num_tokens": 658979626.0,
      "reward": 0.01171875,
      "reward_std": 0.03697281330823898,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "step": 685
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.900390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1731.0,
      "completions/mean_length": 1910.216796875,
      "completions/mean_terminated_length": 664.7647094726562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.23418963898608858,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 7.425422756588813,
      "learning_rate": 9.51813374523181e-07,
      "loss": 0.0066,
      "num_tokens": 660033913.0,
      "reward": 0.01171875,
      "reward_std": 0.03697281330823898,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "step": 686
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2018.0,
      "completions/mean_length": 1895.041015625,
      "completions/mean_terminated_length": 824.328125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2345310232994794,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 8.225695329815126,
      "learning_rate": 9.515717239113511e-07,
      "loss": 0.0094,
      "num_tokens": 661083550.0,
      "reward": 0.041015625,
      "reward_std": 0.06790468841791153,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 687
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.857421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1969.0,
      "completions/mean_length": 1883.64453125,
      "completions/mean_terminated_length": 895.9862670898438,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.23487240761287018,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 4.550178195000361,
      "learning_rate": 9.513295033419077e-07,
      "loss": 0.0085,
      "num_tokens": 662136680.0,
      "reward": 0.029296875,
      "reward_std": 0.05628519505262375,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 688
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.84765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1989.0,
      "completions/mean_length": 1857.345703125,
      "completions/mean_terminated_length": 796.525634765625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.235213791926261,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 0.8550177090278439,
      "learning_rate": 9.510867131586383e-07,
      "loss": 0.0112,
      "num_tokens": 663167625.0,
      "reward": 0.00390625,
      "reward_std": 0.015625,
      "rewards/accuracy_reward/mean": 0.004032257944345474,
      "rewards/accuracy_reward/std": 0.06343588978052139,
      "step": 689
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.822265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1950.0,
      "completions/mean_length": 1828.4375,
      "completions/mean_terminated_length": 812.6593627929688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.23555517623965178,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 9.326333054201168,
      "learning_rate": 9.508433537061394e-07,
      "loss": -0.0012,
      "num_tokens": 664183097.0,
      "reward": 0.009765625,
      "reward_std": 0.028222277760505676,
      "rewards/accuracy_reward/mean": 0.010080644860863686,
      "rewards/accuracy_reward/std": 0.0999959334731102,
      "step": 690
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.884765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1705.0,
      "completions/mean_length": 1912.939453125,
      "completions/mean_terminated_length": 875.9491577148438,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2358965605530426,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 8.407769084194278,
      "learning_rate": 9.505994253298152e-07,
      "loss": 0.0059,
      "num_tokens": 665239114.0,
      "reward": 0.01171875,
      "reward_std": 0.036034777760505676,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "step": 691
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1984.0,
      "completions/mean_length": 1845.53515625,
      "completions/mean_terminated_length": 921.7391357421875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.23623794486643337,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 5.225003508802906,
      "learning_rate": 9.503549283758773e-07,
      "loss": 0.0021,
      "num_tokens": 666264828.0,
      "reward": 0.021484375,
      "reward_std": 0.02998628467321396,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 692
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.837890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1981.0,
      "completions/mean_length": 1840.1484375,
      "completions/mean_terminated_length": 765.831298828125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2365793291798242,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 3.442083675696622,
      "learning_rate": 9.501098631913446e-07,
      "loss": 0.0123,
      "num_tokens": 667276664.0,
      "reward": 0.0234375,
      "reward_std": 0.03944835811853409,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 693
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.884765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2012.0,
      "completions/mean_length": 1911.650390625,
      "completions/mean_terminated_length": 864.7626953125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.23692071349321497,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 7.307947045544329,
      "learning_rate": 9.498642301240422e-07,
      "loss": 0.0045,
      "num_tokens": 668327941.0,
      "reward": 0.013671875,
      "reward_std": 0.025633705779910088,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "step": 694
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.900390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2027.0,
      "completions/mean_length": 1922.025390625,
      "completions/mean_terminated_length": 783.3137817382812,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.2372620978066058,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 1.0203181571697983,
      "learning_rate": 9.496180295226012e-07,
      "loss": 0.0248,
      "num_tokens": 669385618.0,
      "reward": 0.01171875,
      "reward_std": 0.036034777760505676,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "step": 695
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1934.0,
      "completions/mean_length": 1844.013671875,
      "completions/mean_terminated_length": 960.0729370117188,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.23760348211999657,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 13.45281910422109,
      "learning_rate": 9.493712617364585e-07,
      "loss": 0.0207,
      "num_tokens": 670409193.0,
      "reward": 0.037109375,
      "reward_std": 0.058760739862918854,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 696
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.861328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2013.0,
      "completions/mean_length": 1870.7890625,
      "completions/mean_terminated_length": 770.08447265625,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.2379448664333874,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 7.606653637945903,
      "learning_rate": 9.491239271158558e-07,
      "loss": -0.0012,
      "num_tokens": 671443197.0,
      "reward": 0.041015625,
      "reward_std": 0.050508126616477966,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 697
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.85546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1974.0,
      "completions/mean_length": 1871.021484375,
      "completions/mean_terminated_length": 823.5,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.23828625074677817,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 2.834670748941362,
      "learning_rate": 9.488760260118393e-07,
      "loss": 0.0025,
      "num_tokens": 672475544.0,
      "reward": 0.01953125,
      "reward_std": 0.03697281330823898,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 698
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.779296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1847.0,
      "completions/mean_length": 1777.111328125,
      "completions/mean_terminated_length": 820.610595703125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.23862763506016899,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 15.00588616683939,
      "learning_rate": 9.486275587762592e-07,
      "loss": 0.0042,
      "num_tokens": 673457585.0,
      "reward": 0.037109375,
      "reward_std": 0.07917051017284393,
      "rewards/accuracy_reward/mean": 0.038306452333927155,
      "rewards/accuracy_reward/std": 0.19212883710861206,
      "step": 699
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 1821.599609375,
      "completions/mean_terminated_length": 730.7614135742188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.23896901937355977,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 11.245960679230839,
      "learning_rate": 9.483785257617695e-07,
      "loss": 0.005,
      "num_tokens": 674460884.0,
      "reward": 0.06640625,
      "reward_std": 0.09743183851242065,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "step": 700
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.849609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1915.0,
      "completions/mean_length": 1849.265625,
      "completions/mean_terminated_length": 726.5454711914062,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.23931040368695058,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 14.160772576169151,
      "learning_rate": 9.48128927321827e-07,
      "loss": -0.0006,
      "num_tokens": 675486732.0,
      "reward": 0.05078125,
      "reward_std": 0.061169594526290894,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 701
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.865234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1860.0,
      "completions/mean_length": 1860.3203125,
      "completions/mean_terminated_length": 655.3623046875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2396517880003414,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 2.898876186601683,
      "learning_rate": 9.478787638106908e-07,
      "loss": 0.0099,
      "num_tokens": 676522128.0,
      "reward": 0.0390625,
      "reward_std": 0.04243633896112442,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 702
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1962.0,
      "completions/mean_length": 1909.7421875,
      "completions/mean_terminated_length": 686.6923217773438,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.23999317231373218,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 3.3994186384542977,
      "learning_rate": 9.476280355834224e-07,
      "loss": 0.0041,
      "num_tokens": 677568652.0,
      "reward": 0.017578125,
      "reward_std": 0.03536957502365112,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "step": 703
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2006.0,
      "completions/mean_length": 1921.705078125,
      "completions/mean_terminated_length": 804.4807739257812,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.240334556627123,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 1.0858432771960347,
      "learning_rate": 9.473767429958846e-07,
      "loss": 0.0081,
      "num_tokens": 678628245.0,
      "reward": 0.009765625,
      "reward_std": 0.023271184414625168,
      "rewards/accuracy_reward/mean": 0.009765625,
      "rewards/accuracy_reward/std": 0.09843364357948303,
      "step": 704
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.814453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2019.0,
      "completions/mean_length": 1821.080078125,
      "completions/mean_terminated_length": 825.431640625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.24067594094051378,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 11.609559062075409,
      "learning_rate": 9.471248864047415e-07,
      "loss": 0.0312,
      "num_tokens": 679635278.0,
      "reward": 0.0859375,
      "reward_std": 0.11311560124158859,
      "rewards/accuracy_reward/mean": 0.08870967477560043,
      "rewards/accuracy_reward/std": 0.2846112847328186,
      "step": 705
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.830078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2018.0,
      "completions/mean_length": 1832.087890625,
      "completions/mean_terminated_length": 777.3448486328125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2410173252539046,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 8.27326656183113,
      "learning_rate": 9.468724661674571e-07,
      "loss": 0.0126,
      "num_tokens": 680654187.0,
      "reward": 0.041015625,
      "reward_std": 0.046722229570150375,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 706
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.83203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1885.0,
      "completions/mean_length": 1812.83203125,
      "completions/mean_terminated_length": 647.9302368164062,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.24135870956729538,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 5.026506072673076,
      "learning_rate": 9.466194826422961e-07,
      "loss": 0.006,
      "num_tokens": 681664693.0,
      "reward": 0.021484375,
      "reward_std": 0.04698248207569122,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 707
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1884.0,
      "completions/mean_length": 1820.4765625,
      "completions/mean_terminated_length": 834.5416870117188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2417000938806862,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 15.492257018422846,
      "learning_rate": 9.463659361883219e-07,
      "loss": 0.0052,
      "num_tokens": 682677961.0,
      "reward": 0.015625,
      "reward_std": 0.047646719962358475,
      "rewards/accuracy_reward/mean": 0.015625,
      "rewards/accuracy_reward/std": 0.12414088100194931,
      "step": 708
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1926.0,
      "completions/mean_length": 1926.41796875,
      "completions/mean_terminated_length": 936.3928833007812,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.24204147819407698,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 2.369205753473792,
      "learning_rate": 9.46111827165398e-07,
      "loss": 0.0032,
      "num_tokens": 683733519.0,
      "reward": 0.015625,
      "reward_std": 0.03438520431518555,
      "rewards/accuracy_reward/mean": 0.015625,
      "rewards/accuracy_reward/std": 0.12414088100194931,
      "step": 709
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.80078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2021.0,
      "completions/mean_length": 1772.890625,
      "completions/mean_terminated_length": 667.058837890625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2423828625074678,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 4.6176891808664795,
      "learning_rate": 9.458571559341849e-07,
      "loss": 0.0073,
      "num_tokens": 684712423.0,
      "reward": 0.04296875,
      "reward_std": 0.042583562433719635,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 710
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2036.0,
      "completions/mean_length": 1816.73046875,
      "completions/mean_terminated_length": 638.357177734375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.24272424682085858,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 4.10296319651108,
      "learning_rate": 9.456019228561425e-07,
      "loss": 0.0281,
      "num_tokens": 685719165.0,
      "reward": 0.0625,
      "reward_std": 0.0932510495185852,
      "rewards/accuracy_reward/mean": 0.06451612710952759,
      "rewards/accuracy_reward/std": 0.2459181249141693,
      "step": 711
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1974.0,
      "completions/mean_length": 1898.576171875,
      "completions/mean_terminated_length": 922.9264526367188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2430656311342494,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 3.4854106119832857,
      "learning_rate": 9.453461282935271e-07,
      "loss": 0.0063,
      "num_tokens": 686766788.0,
      "reward": 0.02734375,
      "reward_std": 0.04517117142677307,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 712
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.880859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1771.0,
      "completions/mean_length": 1903.732421875,
      "completions/mean_terminated_length": 837.0983276367188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.24340701544764018,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 1.7146508048265992,
      "learning_rate": 9.450897726093924e-07,
      "loss": 0.0048,
      "num_tokens": 687815963.0,
      "reward": 0.01171875,
      "reward_std": 0.022772299125790596,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "step": 713
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.876953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1893.0,
      "completions/mean_length": 1898.29296875,
      "completions/mean_terminated_length": 831.3333740234375,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.243748399761031,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 5.274635279469366,
      "learning_rate": 9.448328561675883e-07,
      "loss": -0.0013,
      "num_tokens": 688864945.0,
      "reward": 0.029296875,
      "reward_std": 0.03741292655467987,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 714
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1889.0,
      "completions/mean_length": 1948.87109375,
      "completions/mean_terminated_length": 779.1500244140625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.24408978407442178,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 1.0808293627719798,
      "learning_rate": 9.44575379332761e-07,
      "loss": 0.0076,
      "num_tokens": 689933231.0,
      "reward": 0.0390625,
      "reward_std": 0.02629890665411949,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 715
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.935546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1976.615234375,
      "completions/mean_terminated_length": 940.45458984375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.2444311683878126,
      "frac_reward_zero_std": 0.96875,
      "grad_norm": 3.257313844473362,
      "learning_rate": 9.443173424703514e-07,
      "loss": 0.0026,
      "num_tokens": 691023322.0,
      "reward": 0.00390625,
      "reward_std": 0.010673906654119492,
      "rewards/accuracy_reward/mean": 0.004032257944345474,
      "rewards/accuracy_reward/std": 0.06343588978052139,
      "step": 716
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1964.0,
      "completions/mean_length": 1841.89453125,
      "completions/mean_terminated_length": 848.8409423828125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.24477255270120338,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 12.556183918575925,
      "learning_rate": 9.440587459465956e-07,
      "loss": 0.0085,
      "num_tokens": 692040212.0,
      "reward": 0.021484375,
      "reward_std": 0.054354868829250336,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 717
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.884765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2007.0,
      "completions/mean_length": 1895.978515625,
      "completions/mean_terminated_length": 728.7626953125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2451139370145942,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 8.641470017684496,
      "learning_rate": 9.437995901285246e-07,
      "loss": 0.0063,
      "num_tokens": 693085721.0,
      "reward": 0.033203125,
      "reward_std": 0.05793476849794388,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 718
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1747.0,
      "completions/mean_length": 1903.697265625,
      "completions/mean_terminated_length": 728.6607666015625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.24545532132798498,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 17.175972875074105,
      "learning_rate": 9.435398753839622e-07,
      "loss": 0.0055,
      "num_tokens": 694137118.0,
      "reward": 0.01171875,
      "reward_std": 0.04192390665411949,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "step": 719
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1774.0,
      "completions/mean_length": 1908.376953125,
      "completions/mean_terminated_length": 423.29547119140625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2457967056413758,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 9.432796020815261e-07,
      "loss": 0.0,
      "num_tokens": 695184159.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/accuracy_reward/mean": 0.0,
      "rewards/accuracy_reward/std": 0.0,
      "step": 720
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.89453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 1900.154296875,
      "completions/mean_terminated_length": 646.2037353515625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.24613808995476658,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 6.07614082677678,
      "learning_rate": 9.430187705906268e-07,
      "loss": -0.005,
      "num_tokens": 696227726.0,
      "reward": 0.01171875,
      "reward_std": 0.02960042469203472,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "step": 721
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.87109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1878.0,
      "completions/mean_length": 1888.22265625,
      "completions/mean_terminated_length": 808.5151977539062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.2464794742681574,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 10.735065726966466,
      "learning_rate": 9.427573812814666e-07,
      "loss": 0.0016,
      "num_tokens": 697261488.0,
      "reward": 0.01171875,
      "reward_std": 0.03697281330823898,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "step": 722
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1952.0,
      "completions/mean_length": 1955.484375,
      "completions/mean_terminated_length": 863.7999877929688,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.24682085858154817,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 33.91462387898152,
      "learning_rate": 9.424954345250401e-07,
      "loss": 0.0015,
      "num_tokens": 698336456.0,
      "reward": 0.005859375,
      "reward_std": 0.0234375,
      "rewards/accuracy_reward/mean": 0.005859375,
      "rewards/accuracy_reward/std": 0.07639661431312561,
      "step": 723
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1912.0,
      "completions/mean_length": 1828.763671875,
      "completions/mean_terminated_length": 711.702392578125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.247162242894939,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 6.5097438308450295,
      "learning_rate": 9.422329306931325e-07,
      "loss": 0.0029,
      "num_tokens": 699354207.0,
      "reward": 0.01171875,
      "reward_std": 0.04192390665411949,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "step": 724
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.888671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2024.0,
      "completions/mean_length": 1896.5625,
      "completions/mean_terminated_length": 687.7192993164062,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.24750362720832977,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 4.253917350886468,
      "learning_rate": 9.419698701583204e-07,
      "loss": 0.0047,
      "num_tokens": 700401263.0,
      "reward": 0.00390625,
      "reward_std": 0.015625,
      "rewards/accuracy_reward/mean": 0.004032257944345474,
      "rewards/accuracy_reward/std": 0.06343588978052139,
      "step": 725
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.873046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1819.0,
      "completions/mean_length": 1875.80078125,
      "completions/mean_terminated_length": 691.5999755859375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2478450115217206,
      "frac_reward_zero_std": 0.96875,
      "grad_norm": 3.266792260067484,
      "learning_rate": 9.417062532939698e-07,
      "loss": -0.0013,
      "num_tokens": 701444025.0,
      "reward": 0.001953125,
      "reward_std": 0.0078125,
      "rewards/accuracy_reward/mean": 0.001953125,
      "rewards/accuracy_reward/std": 0.04419417306780815,
      "step": 726
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.818359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1858.53515625,
      "completions/mean_terminated_length": 1005.3118286132812,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.24818639583511137,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 6.13482927774544,
      "learning_rate": 9.414420804742366e-07,
      "loss": 0.0065,
      "num_tokens": 702477787.0,
      "reward": 0.015625,
      "reward_std": 0.03438520431518555,
      "rewards/accuracy_reward/mean": 0.015625,
      "rewards/accuracy_reward/std": 0.12414088100194931,
      "step": 727
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.865234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2004.0,
      "completions/mean_length": 1878.255859375,
      "completions/mean_terminated_length": 788.4492797851562,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.2485277801485022,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 5.75132103856005,
      "learning_rate": 9.41177352074066e-07,
      "loss": 0.0069,
      "num_tokens": 703516622.0,
      "reward": 0.017578125,
      "reward_std": 0.03576334938406944,
      "rewards/accuracy_reward/mean": 0.018145160749554634,
      "rewards/accuracy_reward/std": 0.1336110234260559,
      "step": 728
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1916.0,
      "completions/mean_length": 1966.896484375,
      "completions/mean_terminated_length": 894.5277709960938,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.24886916446189297,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 11.543322958878461,
      "learning_rate": 9.409120684691915e-07,
      "loss": 0.0023,
      "num_tokens": 704599193.0,
      "reward": 0.03515625,
      "reward_std": 0.05971328169107437,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 729
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.87109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2028.0,
      "completions/mean_length": 1879.03125,
      "completions/mean_terminated_length": 737.212158203125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.24921054877528379,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 9.378120638546573,
      "learning_rate": 9.406462300361345e-07,
      "loss": 0.0222,
      "num_tokens": 705633241.0,
      "reward": 0.03515625,
      "reward_std": 0.062220655381679535,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 730
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.849609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1921.0,
      "completions/mean_length": 1870.765625,
      "completions/mean_terminated_length": 869.5064697265625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.24955193308867457,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 1.8710887732685961,
      "learning_rate": 9.403798371522042e-07,
      "loss": 0.0019,
      "num_tokens": 706669809.0,
      "reward": 0.01171875,
      "reward_std": 0.02960042469203472,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "step": 731
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.916015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1700.0,
      "completions/mean_length": 1922.0078125,
      "completions/mean_terminated_length": 547.81396484375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.24989331740206538,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 4.308225203261697,
      "learning_rate": 9.401128901954964e-07,
      "loss": 0.012,
      "num_tokens": 707725973.0,
      "reward": 0.013671875,
      "reward_std": 0.032461829483509064,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "step": 732
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.794921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 1777.404296875,
      "completions/mean_terminated_length": 728.5238647460938,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2502347017154562,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 6.096172823484754,
      "learning_rate": 9.398453895448936e-07,
      "loss": 0.028,
      "num_tokens": 708721348.0,
      "reward": 0.033203125,
      "reward_std": 0.08890639245510101,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 733
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.736328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2012.0,
      "completions/mean_length": 1750.755859375,
      "completions/mean_terminated_length": 920.674072265625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.25057608602884696,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 6.2339362130962455,
      "learning_rate": 9.395773355800643e-07,
      "loss": 0.0022,
      "num_tokens": 709699703.0,
      "reward": 0.03125,
      "reward_std": 0.05789502337574959,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 734
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2036.0,
      "completions/mean_length": 1846.318359375,
      "completions/mean_terminated_length": 874.57958984375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.25091747034223777,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 4.351020923091093,
      "learning_rate": 9.393087286814616e-07,
      "loss": 0.0235,
      "num_tokens": 710723146.0,
      "reward": 0.037109375,
      "reward_std": 0.06860867142677307,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 735
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.849609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1733.0,
      "completions/mean_length": 1859.740234375,
      "completions/mean_terminated_length": 796.19482421875,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.2512588546556286,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 11.218381978402311,
      "learning_rate": 9.39039569230324e-07,
      "loss": 0.0127,
      "num_tokens": 711753461.0,
      "reward": 0.037109375,
      "reward_std": 0.0723423883318901,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 736
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.787109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2006.0,
      "completions/mean_length": 1780.439453125,
      "completions/mean_terminated_length": 791.2017822265625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.2516002389690194,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 8.405206048368234,
      "learning_rate": 9.387698576086743e-07,
      "loss": -0.0006,
      "num_tokens": 712735094.0,
      "reward": 0.068359375,
      "reward_std": 0.06536377221345901,
      "rewards/accuracy_reward/mean": 0.07056451588869095,
      "rewards/accuracy_reward/std": 0.25635457038879395,
      "step": 737
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.728515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1987.0,
      "completions/mean_length": 1702.953125,
      "completions/mean_terminated_length": 777.0360107421875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.25194162328241015,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 8.361406843910276,
      "learning_rate": 9.384995941993187e-07,
      "loss": 0.0029,
      "num_tokens": 713679950.0,
      "reward": 0.05859375,
      "reward_std": 0.09908141195774078,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 738
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1861.0,
      "completions/mean_length": 1730.751953125,
      "completions/mean_terminated_length": 779.0078125,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.25228300759580097,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 5.992097237969598,
      "learning_rate": 9.382287793858467e-07,
      "loss": 0.0164,
      "num_tokens": 714641135.0,
      "reward": 0.064453125,
      "reward_std": 0.10009271651506424,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 739
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1911.0,
      "completions/mean_length": 1734.02734375,
      "completions/mean_terminated_length": 830.1666870117188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.2526243919091918,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 13.872577742999004,
      "learning_rate": 9.379574135526304e-07,
      "loss": 0.0302,
      "num_tokens": 715606413.0,
      "reward": 0.044921875,
      "reward_std": 0.08235900849103928,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 740
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.787109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1989.0,
      "completions/mean_length": 1817.380859375,
      "completions/mean_terminated_length": 964.7247314453125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.2529657762225826,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 17.771672883756676,
      "learning_rate": 9.376854970848239e-07,
      "loss": 0.0094,
      "num_tokens": 716612304.0,
      "reward": 0.03515625,
      "reward_std": 0.05001020431518555,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 741
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2018.0,
      "completions/mean_length": 1753.06640625,
      "completions/mean_terminated_length": 810.245849609375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.25330716053597335,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 6.652605277169814,
      "learning_rate": 9.374130303683628e-07,
      "loss": 0.0128,
      "num_tokens": 717594754.0,
      "reward": 0.021484375,
      "reward_std": 0.03735867142677307,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 742
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.701171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1956.0,
      "completions/mean_length": 1720.76171875,
      "completions/mean_terminated_length": 952.9281005859375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.25364854484936417,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 4.997219291130501,
      "learning_rate": 9.371400137899642e-07,
      "loss": 0.0029,
      "num_tokens": 718554424.0,
      "reward": 0.0390625,
      "reward_std": 0.05243149772286415,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 743
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.791015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1871.0,
      "completions/mean_length": 1821.517578125,
      "completions/mean_terminated_length": 964.27099609375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.253989929162755,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 1.3196305678553888,
      "learning_rate": 9.368664477371246e-07,
      "loss": 0.0014,
      "num_tokens": 719560033.0,
      "reward": 0.009765625,
      "reward_std": 0.029160313308238983,
      "rewards/accuracy_reward/mean": 0.009765625,
      "rewards/accuracy_reward/std": 0.09843364357948303,
      "step": 744
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1610.068359375,
      "completions/mean_terminated_length": 829.4076538085938,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2543313134761458,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 4.516296029437842,
      "learning_rate": 9.365923325981214e-07,
      "loss": 0.0195,
      "num_tokens": 720459588.0,
      "reward": 0.080078125,
      "reward_std": 0.12215207517147064,
      "rewards/accuracy_reward/mean": 0.080078125,
      "rewards/accuracy_reward/std": 0.271679550409317,
      "step": 745
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.666015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2016.0,
      "completions/mean_length": 1637.5625,
      "completions/mean_terminated_length": 819.0877075195312,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.25467269778953655,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 4.358497047175415,
      "learning_rate": 9.363176687620109e-07,
      "loss": -0.0001,
      "num_tokens": 721375460.0,
      "reward": 0.02734375,
      "reward_std": 0.06233368441462517,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 746
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.732421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2008.0,
      "completions/mean_length": 1719.345703125,
      "completions/mean_terminated_length": 819.7445068359375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.25501408210292736,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 1.059356007926346,
      "learning_rate": 9.360424566186279e-07,
      "loss": 0.0193,
      "num_tokens": 722337493.0,
      "reward": 0.01171875,
      "reward_std": 0.04192390665411949,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "step": 747
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.755859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2014.0,
      "completions/mean_length": 1793.783203125,
      "completions/mean_terminated_length": 1006.72802734375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2553554664163182,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 4.417724225579728,
      "learning_rate": 9.357666965585859e-07,
      "loss": 0.0056,
      "num_tokens": 723329686.0,
      "reward": 0.05078125,
      "reward_std": 0.05237272381782532,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 748
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.646484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1990.0,
      "completions/mean_length": 1638.3203125,
      "completions/mean_terminated_length": 889.1271362304688,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.255696850729709,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 6.572466992615142,
      "learning_rate": 9.354903889732761e-07,
      "loss": 0.0093,
      "num_tokens": 724254314.0,
      "reward": 0.025390625,
      "reward_std": 0.052045635879039764,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 749
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.794921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1952.0,
      "completions/mean_length": 1785.15234375,
      "completions/mean_terminated_length": 766.3048095703125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.25603823504309975,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 5.952264768861366,
      "learning_rate": 9.352135342548659e-07,
      "loss": 0.0111,
      "num_tokens": 725243624.0,
      "reward": 0.037109375,
      "reward_std": 0.07174387574195862,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 750
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.701171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1972.0,
      "completions/mean_length": 1695.29296875,
      "completions/mean_terminated_length": 867.6993408203125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.25637961935649056,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 2.104368814767614,
      "learning_rate": 9.349361327963006e-07,
      "loss": 0.0316,
      "num_tokens": 726191022.0,
      "reward": 0.056640625,
      "reward_std": 0.08149883151054382,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 751
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1998.0,
      "completions/mean_length": 1723.990234375,
      "completions/mean_terminated_length": 911.74658203125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2567210036698814,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.9758544955350608,
      "learning_rate": 9.346581849913004e-07,
      "loss": 0.0091,
      "num_tokens": 727153753.0,
      "reward": 0.01171875,
      "reward_std": 0.036034777760505676,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "step": 752
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1962.0,
      "completions/mean_length": 1654.111328125,
      "completions/mean_terminated_length": 861.7000122070312,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.2570623879832722,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 9.588401547947068,
      "learning_rate": 9.343796912343617e-07,
      "loss": 0.0088,
      "num_tokens": 728077778.0,
      "reward": 0.05859375,
      "reward_std": 0.0792168527841568,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 753
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2000.0,
      "completions/mean_length": 1764.6484375,
      "completions/mean_terminated_length": 878.0322265625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.25740377229666295,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 8.455847970024246,
      "learning_rate": 9.341006519207551e-07,
      "loss": 0.0106,
      "num_tokens": 729061630.0,
      "reward": 0.029296875,
      "reward_std": 0.07179813086986542,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 754
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2035.0,
      "completions/mean_length": 1661.6953125,
      "completions/mean_terminated_length": 870.6904907226562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.25774515661005376,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 5.735508205961146,
      "learning_rate": 9.338210674465263e-07,
      "loss": 0.0217,
      "num_tokens": 729985842.0,
      "reward": 0.060546875,
      "reward_std": 0.080595001578331,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 755
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1966.0,
      "completions/mean_length": 1672.8671875,
      "completions/mean_terminated_length": 800.80517578125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2580865409234446,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 17.12835869836486,
      "learning_rate": 9.335409382084939e-07,
      "loss": 0.0053,
      "num_tokens": 730918398.0,
      "reward": 0.056640625,
      "reward_std": 0.08643084019422531,
      "rewards/accuracy_reward/mean": 0.058467742055654526,
      "rewards/accuracy_reward/std": 0.23486268520355225,
      "step": 756
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.73046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1860.0,
      "completions/mean_length": 1720.318359375,
      "completions/mean_terminated_length": 832.253662109375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2584279252368354,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 1.2749479704223878,
      "learning_rate": 9.332602646042504e-07,
      "loss": 0.007,
      "num_tokens": 731877809.0,
      "reward": 0.044921875,
      "reward_std": 0.09522771835327148,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 757
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1669.61328125,
      "completions/mean_terminated_length": 908.3882446289062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.25876930955022615,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 1.071438862279623,
      "learning_rate": 9.329790470321607e-07,
      "loss": 0.0219,
      "num_tokens": 732812267.0,
      "reward": 0.041015625,
      "reward_std": 0.07379381358623505,
      "rewards/accuracy_reward/mean": 0.04233871027827263,
      "rewards/accuracy_reward/std": 0.2015640139579773,
      "step": 758
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1748.84765625,
      "completions/mean_terminated_length": 887.6515502929688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.25911069386361696,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 10.401037798775814,
      "learning_rate": 9.326972858913613e-07,
      "loss": 0.0065,
      "num_tokens": 733780477.0,
      "reward": 0.0390625,
      "reward_std": 0.058208562433719635,
      "rewards/accuracy_reward/mean": 0.04032257944345474,
      "rewards/accuracy_reward/std": 0.19691328704357147,
      "step": 759
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.771484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2026.0,
      "completions/mean_length": 1767.48046875,
      "completions/mean_terminated_length": 820.4273681640625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.25945207817700777,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 12.927825227835879,
      "learning_rate": 9.324149815817612e-07,
      "loss": 0.0073,
      "num_tokens": 734765827.0,
      "reward": 0.017578125,
      "reward_std": 0.03576334938406944,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "step": 760
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.732421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1737.080078125,
      "completions/mean_terminated_length": 886.0219116210938,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.2597934624903986,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 42.327796975300664,
      "learning_rate": 9.321321345040391e-07,
      "loss": 0.002,
      "num_tokens": 735731308.0,
      "reward": 0.056640625,
      "reward_std": 0.08175812661647797,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 761
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1900.0,
      "completions/mean_length": 1767.5703125,
      "completions/mean_terminated_length": 871.1146850585938,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.26013484680378934,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 2.326397588053107,
      "learning_rate": 9.31848745059645e-07,
      "loss": 0.0057,
      "num_tokens": 736709936.0,
      "reward": 0.017578125,
      "reward_std": 0.04219770431518555,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "step": 762
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.771484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2033.0,
      "completions/mean_length": 1763.228515625,
      "completions/mean_terminated_length": 801.820556640625,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.26047623111718016,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 3.4820721632697342,
      "learning_rate": 9.315648136507987e-07,
      "loss": 0.0114,
      "num_tokens": 737693173.0,
      "reward": 0.037109375,
      "reward_std": 0.058760736137628555,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 763
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1985.0,
      "completions/mean_length": 1755.501953125,
      "completions/mean_terminated_length": 800.0083618164062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.26081761543057097,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 2.966260725492935,
      "learning_rate": 9.312803406804882e-07,
      "loss": 0.0101,
      "num_tokens": 738663350.0,
      "reward": 0.0234375,
      "reward_std": 0.05259781330823898,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 764
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.740234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1962.0,
      "completions/mean_length": 1769.802734375,
      "completions/mean_terminated_length": 977.045166015625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.2611589997439618,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 1.0559549853916272,
      "learning_rate": 9.309953265524714e-07,
      "loss": 0.0137,
      "num_tokens": 739641089.0,
      "reward": 0.033203125,
      "reward_std": 0.06570751965045929,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 765
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1989.0,
      "completions/mean_length": 1789.72265625,
      "completions/mean_terminated_length": 725.9400024414062,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.26150038405735254,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 11.857766669809386,
      "learning_rate": 9.307097716712735e-07,
      "loss": 0.0173,
      "num_tokens": 740641379.0,
      "reward": 0.037109375,
      "reward_std": 0.06508206576108932,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 766
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.779296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2011.0,
      "completions/mean_length": 1796.47265625,
      "completions/mean_terminated_length": 908.3363037109375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.26184176837074336,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 19.91026183073299,
      "learning_rate": 9.304236764421876e-07,
      "loss": 0.0056,
      "num_tokens": 741638261.0,
      "reward": 0.0625,
      "reward_std": 0.06657323241233826,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 767
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.876953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1868.0,
      "completions/mean_length": 1890.76953125,
      "completions/mean_terminated_length": 770.1904907226562,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.26218315268413417,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 0.36754778467259164,
      "learning_rate": 9.301370412712733e-07,
      "loss": 0.014,
      "num_tokens": 742681839.0,
      "reward": 0.01171875,
      "reward_std": 0.032021719962358475,
      "rewards/accuracy_reward/mean": 0.012096773833036423,
      "rewards/accuracy_reward/std": 0.10942844301462173,
      "step": 768
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.787109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2027.0,
      "completions/mean_length": 1794.798828125,
      "completions/mean_terminated_length": 858.6513671875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.262524536997525,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 0.8231903772645666,
      "learning_rate": 9.29849866565357e-07,
      "loss": 0.0018,
      "num_tokens": 743670616.0,
      "reward": 0.013671875,
      "reward_std": 0.04478531330823898,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "step": 769
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.748046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1963.0,
      "completions/mean_length": 1745.361328125,
      "completions/mean_terminated_length": 846.8294677734375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.26286592131091574,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 1.719328430753915,
      "learning_rate": 9.295621527320305e-07,
      "loss": 0.0205,
      "num_tokens": 744645841.0,
      "reward": 0.05078125,
      "reward_std": 0.06441686302423477,
      "rewards/accuracy_reward/mean": 0.052419353276491165,
      "rewards/accuracy_reward/std": 0.22309619188308716,
      "step": 770
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1963.0,
      "completions/mean_length": 1834.23046875,
      "completions/mean_terminated_length": 858.8043823242188,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.26320730562430655,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 13.02017420406366,
      "learning_rate": 9.292739001796513e-07,
      "loss": 0.0147,
      "num_tokens": 745669111.0,
      "reward": 0.044921875,
      "reward_std": 0.07960271090269089,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 771
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.791015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1918.0,
      "completions/mean_length": 1824.91015625,
      "completions/mean_terminated_length": 980.504638671875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.26354868993769737,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 1.4715250050149322,
      "learning_rate": 9.289851093173408e-07,
      "loss": 0.0063,
      "num_tokens": 746676265.0,
      "reward": 0.01171875,
      "reward_std": 0.024649331346154213,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "step": 772
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.830078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1932.0,
      "completions/mean_length": 1866.5625,
      "completions/mean_terminated_length": 980.2298583984375,
      "completions/min_length": 7.0,
      "completions/min_terminated_length": 7.0,
      "epoch": 0.2638900742510882,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 2.021038073079869,
      "learning_rate": 9.286957805549849e-07,
      "loss": 0.0079,
      "num_tokens": 747706553.0,
      "reward": 0.009765625,
      "reward_std": 0.028222277760505676,
      "rewards/accuracy_reward/mean": 0.009765625,
      "rewards/accuracy_reward/std": 0.09843364357948303,
      "step": 773
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2025.0,
      "completions/mean_length": 1819.5390625,
      "completions/mean_terminated_length": 878.2799682617188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.26423145856447894,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 11.118142475390233,
      "learning_rate": 9.284059143032329e-07,
      "loss": 0.0153,
      "num_tokens": 748716125.0,
      "reward": 0.056640625,
      "reward_std": 0.0906703919172287,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 774
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.84765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1955.0,
      "completions/mean_length": 1870.498046875,
      "completions/mean_terminated_length": 882.8590087890625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.26457284287786975,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 4.084974035197056,
      "learning_rate": 9.281155109734971e-07,
      "loss": 0.0132,
      "num_tokens": 749745164.0,
      "reward": 0.029296875,
      "reward_std": 0.03889618441462517,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 775
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.818359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1981.0,
      "completions/mean_length": 1850.0546875,
      "completions/mean_terminated_length": 958.236572265625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.26491422719126057,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 3.6615705296670886,
      "learning_rate": 9.278245709779515e-07,
      "loss": 0.011,
      "num_tokens": 750767096.0,
      "reward": 0.021484375,
      "reward_std": 0.059305962175130844,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 776
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.837890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2021.0,
      "completions/mean_length": 1838.583984375,
      "completions/mean_terminated_length": 756.1807250976562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.2652556115046514,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 0.4985521336770881,
      "learning_rate": 9.275330947295326e-07,
      "loss": 0.0024,
      "num_tokens": 751787939.0,
      "reward": 0.0078125,
      "reward_std": 0.02629890665411949,
      "rewards/accuracy_reward/mean": 0.0078125,
      "rewards/accuracy_reward/std": 0.08812850713729858,
      "step": 777
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.888671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1848.0,
      "completions/mean_length": 1932.65625,
      "completions/mean_terminated_length": 1011.9298095703125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.26559699581804214,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 2.796997054538657,
      "learning_rate": 9.272410826419374e-07,
      "loss": 0.0079,
      "num_tokens": 752855379.0,
      "reward": 0.021484375,
      "reward_std": 0.04478531330823898,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 778
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.837890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2038.0,
      "completions/mean_length": 1851.685546875,
      "completions/mean_terminated_length": 836.9999389648438,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.26593838013143295,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 23.433475064428848,
      "learning_rate": 9.269485351296239e-07,
      "loss": 0.0043,
      "num_tokens": 753882178.0,
      "reward": 0.041015625,
      "reward_std": 0.05492059141397476,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 779
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.833984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1885.0,
      "completions/mean_length": 1845.201171875,
      "completions/mean_terminated_length": 826.435302734375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.26627976444482376,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 5.196287934551307,
      "learning_rate": 9.266554526078095e-07,
      "loss": 0.0167,
      "num_tokens": 754899689.0,
      "reward": 0.03125,
      "reward_std": 0.07795868813991547,
      "rewards/accuracy_reward/mean": 0.032258063554763794,
      "rewards/accuracy_reward/std": 0.17686307430267334,
      "step": 780
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1773.0,
      "completions/mean_length": 1850.798828125,
      "completions/mean_terminated_length": 719.4868774414062,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.2666211487582146,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 4.389852331718428,
      "learning_rate": 9.263618354924714e-07,
      "loss": 0.0121,
      "num_tokens": 755920866.0,
      "reward": 0.015625,
      "reward_std": 0.03839729726314545,
      "rewards/accuracy_reward/mean": 0.015625,
      "rewards/accuracy_reward/std": 0.12414088100194931,
      "step": 781
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.916015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1944.060546875,
      "completions/mean_terminated_length": 810.3953247070312,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.26696253307160533,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 1.9108395569078425,
      "learning_rate": 9.260676842003453e-07,
      "loss": 0.0166,
      "num_tokens": 756991185.0,
      "reward": 0.0234375,
      "reward_std": 0.04522542655467987,
      "rewards/accuracy_reward/mean": 0.024193547666072845,
      "rewards/accuracy_reward/std": 0.15380479395389557,
      "step": 782
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.826171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1835.373046875,
      "completions/mean_terminated_length": 824.7977905273438,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.26730391738499615,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 7.340623069927433,
      "learning_rate": 9.257729991489252e-07,
      "loss": -0.0007,
      "num_tokens": 758005200.0,
      "reward": 0.0234375,
      "reward_std": 0.03748524188995361,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 783
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.90625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2014.0,
      "completions/mean_length": 1921.240234375,
      "completions/mean_terminated_length": 695.8958740234375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.26764530169838696,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 3.7644029241124284,
      "learning_rate": 9.254777807564626e-07,
      "loss": 0.0022,
      "num_tokens": 759063275.0,
      "reward": 0.01171875,
      "reward_std": 0.03697281330823898,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "step": 784
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.87109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2002.0,
      "completions/mean_length": 1885.595703125,
      "completions/mean_terminated_length": 788.1364135742188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2679866860117778,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 10.522009974403513,
      "learning_rate": 9.251820294419661e-07,
      "loss": 0.0121,
      "num_tokens": 760101820.0,
      "reward": 0.03515625,
      "reward_std": 0.04318207502365112,
      "rewards/accuracy_reward/mean": 0.03629032149910927,
      "rewards/accuracy_reward/std": 0.1872003972530365,
      "step": 785
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.77734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1984.0,
      "completions/mean_length": 1768.8671875,
      "completions/mean_terminated_length": 794.3508911132812,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.26832807032516853,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 15.921066859733127,
      "learning_rate": 9.248857456252005e-07,
      "loss": 0.0223,
      "num_tokens": 761080776.0,
      "reward": 0.046875,
      "reward_std": 0.08334338665008545,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 786
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1872.0,
      "completions/mean_length": 1849.865234375,
      "completions/mean_terminated_length": 840.3214721679688,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.26866945463855935,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 3.27380743481921,
      "learning_rate": 9.245889297266866e-07,
      "loss": 0.016,
      "num_tokens": 762102323.0,
      "reward": 0.017578125,
      "reward_std": 0.03411140665411949,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "step": 787
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.943359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2005.0,
      "completions/mean_length": 1967.6796875,
      "completions/mean_terminated_length": 629.9310302734375,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.26901083895195016,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 5.748645983839906,
      "learning_rate": 9.242915821677001e-07,
      "loss": 0.0108,
      "num_tokens": 763185279.0,
      "reward": 0.02734375,
      "reward_std": 0.039282046258449554,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 788
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1646.0,
      "completions/mean_length": 1885.955078125,
      "completions/mean_terminated_length": 827.8970336914062,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.269352223265341,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 11.697268262704851,
      "learning_rate": 9.239937033702717e-07,
      "loss": 0.0047,
      "num_tokens": 764227176.0,
      "reward": 0.029296875,
      "reward_std": 0.043847277760505676,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 789
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.935546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1402.0,
      "completions/mean_length": 1957.75,
      "completions/mean_terminated_length": 647.757568359375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.26969360757873173,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 1.0305707473273247,
      "learning_rate": 9.236952937571856e-07,
      "loss": 0.0086,
      "num_tokens": 765304120.0,
      "reward": 0.013671875,
      "reward_std": 0.026572702452540398,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "step": 790
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.87890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1982.0,
      "completions/mean_length": 1908.49609375,
      "completions/mean_terminated_length": 895.9677124023438,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.27003499189212254,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 14.307911798958086,
      "learning_rate": 9.233963537519799e-07,
      "loss": 0.0114,
      "num_tokens": 766346502.0,
      "reward": 0.02734375,
      "reward_std": 0.04847268760204315,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 791
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.791015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1809.228515625,
      "completions/mean_terminated_length": 905.46728515625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.27037637620551336,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 2.3232443725288308,
      "learning_rate": 9.230968837789451e-07,
      "loss": 0.0047,
      "num_tokens": 767349083.0,
      "reward": 0.009765625,
      "reward_std": 0.029160313308238983,
      "rewards/accuracy_reward/mean": 0.009765625,
      "rewards/accuracy_reward/std": 0.09843364357948303,
      "step": 792
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.84375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1908.0,
      "completions/mean_length": 1878.4375,
      "completions/mean_terminated_length": 962.7999877929688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.27071776051890417,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 7.687784549686607,
      "learning_rate": 9.227968842631243e-07,
      "loss": 0.0235,
      "num_tokens": 768390571.0,
      "reward": 0.021484375,
      "reward_std": 0.059305962175130844,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 793
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.822265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2009.0,
      "completions/mean_length": 1836.49609375,
      "completions/mean_terminated_length": 858.6154174804688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.27105914483229493,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 5.744451020002693,
      "learning_rate": 9.224963556303116e-07,
      "loss": 0.0053,
      "num_tokens": 769420953.0,
      "reward": 0.029296875,
      "reward_std": 0.04230976849794388,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 794
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.845703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1931.0,
      "completions/mean_length": 1871.5390625,
      "completions/mean_terminated_length": 904.3544311523438,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.27140052914568574,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 5.346002400716161,
      "learning_rate": 9.221952983070526e-07,
      "loss": 0.0255,
      "num_tokens": 770447341.0,
      "reward": 0.0390625,
      "reward_std": 0.07961063086986542,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 795
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.85546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1964.0,
      "completions/mean_length": 1885.09765625,
      "completions/mean_terminated_length": 920.8919067382812,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.27174191345907656,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 13.155085180387175,
      "learning_rate": 9.218937127206432e-07,
      "loss": 0.0058,
      "num_tokens": 771489167.0,
      "reward": 0.021484375,
      "reward_std": 0.06024399772286415,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 796
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.787109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1843.0,
      "completions/mean_length": 1764.76953125,
      "completions/mean_terminated_length": 717.5963134765625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.27208329777246737,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 8.939817684934455,
      "learning_rate": 9.215915992991289e-07,
      "loss": 0.0225,
      "num_tokens": 772471321.0,
      "reward": 0.076171875,
      "reward_std": 0.08642987906932831,
      "rewards/accuracy_reward/mean": 0.076171875,
      "rewards/accuracy_reward/std": 0.26553234457969666,
      "step": 797
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2013.0,
      "completions/mean_length": 1884.89453125,
      "completions/mean_terminated_length": 949.7894897460938,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2724246820858581,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 12.27218311020411,
      "learning_rate": 9.212889584713044e-07,
      "loss": 0.0014,
      "num_tokens": 773513667.0,
      "reward": 0.044921875,
      "reward_std": 0.07057978957891464,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 798
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.927734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1965.400390625,
      "completions/mean_terminated_length": 905.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.27276606639924894,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 2.6174194742930634,
      "learning_rate": 9.20985790666713e-07,
      "loss": 0.0077,
      "num_tokens": 774590736.0,
      "reward": 0.0234375,
      "reward_std": 0.044233135879039764,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 799
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1932.0,
      "completions/mean_length": 1862.845703125,
      "completions/mean_terminated_length": 800.6447143554688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.27310745071263975,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 0.06604128369871849,
      "learning_rate": 9.20682096315646e-07,
      "loss": 0.007,
      "num_tokens": 775620417.0,
      "reward": 0.00390625,
      "reward_std": 0.015625,
      "rewards/accuracy_reward/mean": 0.004032257944345474,
      "rewards/accuracy_reward/std": 0.06343588978052139,
      "step": 800
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.77734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1951.0,
      "completions/mean_length": 1792.537109375,
      "completions/mean_terminated_length": 900.6578979492188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.27344883502603057,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 5.199969987735104,
      "learning_rate": 9.20377875849142e-07,
      "loss": 0.004,
      "num_tokens": 776615252.0,
      "reward": 0.03515625,
      "reward_std": 0.04175759106874466,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 801
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.85546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1979.0,
      "completions/mean_length": 1880.59765625,
      "completions/mean_terminated_length": 889.7567749023438,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.2737902193394214,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 1.2533829936741272,
      "learning_rate": 9.200731296989862e-07,
      "loss": 0.0003,
      "num_tokens": 777654214.0,
      "reward": 0.0078125,
      "reward_std": 0.021347813308238983,
      "rewards/accuracy_reward/mean": 0.0078125,
      "rewards/accuracy_reward/std": 0.08812850713729858,
      "step": 802
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.841796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1952.0,
      "completions/mean_length": 1856.60546875,
      "completions/mean_terminated_length": 838.197509765625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.27413160365281214,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 6.031500405055657,
      "learning_rate": 9.1976785829771e-07,
      "loss": 0.0173,
      "num_tokens": 778679308.0,
      "reward": 0.0390625,
      "reward_std": 0.051493462175130844,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 803
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.837890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1772.0,
      "completions/mean_length": 1837.734375,
      "completions/mean_terminated_length": 750.9397583007812,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.27447298796620295,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 8.547172002821858,
      "learning_rate": 9.194620620785905e-07,
      "loss": 0.0149,
      "num_tokens": 779692308.0,
      "reward": 0.025390625,
      "reward_std": 0.04230976849794388,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 804
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1632.0,
      "completions/mean_length": 1807.783203125,
      "completions/mean_terminated_length": 909.1944580078125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.27481437227959377,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 7.357585515972072,
      "learning_rate": 9.191557414756495e-07,
      "loss": 0.0222,
      "num_tokens": 780690357.0,
      "reward": 0.06640625,
      "reward_std": 0.09353180229663849,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "step": 805
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.85546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1968.0,
      "completions/mean_length": 1880.994140625,
      "completions/mean_terminated_length": 892.5,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.2751557565929846,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 34.122581171970694,
      "learning_rate": 9.188488969236531e-07,
      "loss": -0.0032,
      "num_tokens": 781736514.0,
      "reward": 0.03515625,
      "reward_std": 0.040593504905700684,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 806
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.81640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1841.494140625,
      "completions/mean_terminated_length": 923.7340087890625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.27549714090637534,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 25.69693549648914,
      "learning_rate": 9.185415288581105e-07,
      "loss": -0.0018,
      "num_tokens": 782762143.0,
      "reward": 0.03125,
      "reward_std": 0.08257714658975601,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 807
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.822265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1970.0,
      "completions/mean_length": 1846.7421875,
      "completions/mean_terminated_length": 915.6483764648438,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.27583852521976615,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 8.414492313316417,
      "learning_rate": 9.182336377152753e-07,
      "loss": 0.0004,
      "num_tokens": 783782891.0,
      "reward": 0.0703125,
      "reward_std": 0.0937047079205513,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 808
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.732421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2036.0,
      "completions/mean_length": 1761.166015625,
      "completions/mean_terminated_length": 976.0364990234375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.27617990953315696,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 4.193371531586765,
      "learning_rate": 9.179252239321419e-07,
      "loss": 0.0082,
      "num_tokens": 784769984.0,
      "reward": 0.09375,
      "reward_std": 0.09039659798145294,
      "rewards/accuracy_reward/mean": 0.09375,
      "rewards/accuracy_reward/std": 0.29176566004753113,
      "step": 809
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.80078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2008.0,
      "completions/mean_length": 1808.0625,
      "completions/mean_terminated_length": 843.6078491210938,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.2765212938465478,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 6.400832735733385,
      "learning_rate": 9.176162879464477e-07,
      "loss": 0.0034,
      "num_tokens": 785769088.0,
      "reward": 0.03125,
      "reward_std": 0.03996079042553902,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 810
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.86328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1880.482421875,
      "completions/mean_terminated_length": 822.7285766601562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.27686267815993854,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 1.1016606703853062,
      "learning_rate": 9.173068301966707e-07,
      "loss": 0.0115,
      "num_tokens": 786808935.0,
      "reward": 0.0234375,
      "reward_std": 0.03944835811853409,
      "rewards/accuracy_reward/mean": 0.024193547666072845,
      "rewards/accuracy_reward/std": 0.15380479395389557,
      "step": 811
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.720703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1974.0,
      "completions/mean_length": 1691.0078125,
      "completions/mean_terminated_length": 769.8181762695312,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.27720406247332935,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 5.374550915557742,
      "learning_rate": 9.169968511220296e-07,
      "loss": 0.0093,
      "num_tokens": 787754555.0,
      "reward": 0.0625,
      "reward_std": 0.10909565538167953,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 812
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.767578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1935.0,
      "completions/mean_length": 1773.072265625,
      "completions/mean_terminated_length": 865.11767578125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.27754544678672016,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 6.333591138045338,
      "learning_rate": 9.166863511624828e-07,
      "loss": 0.0038,
      "num_tokens": 788739920.0,
      "reward": 0.01953125,
      "reward_std": 0.02681133709847927,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 813
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1992.0,
      "completions/mean_length": 1762.4609375,
      "completions/mean_terminated_length": 829.7000732421875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.277886831100111,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 1.7085857978749721,
      "learning_rate": 9.163753307587285e-07,
      "loss": 0.0251,
      "num_tokens": 789715884.0,
      "reward": 0.029296875,
      "reward_std": 0.07729348540306091,
      "rewards/accuracy_reward/mean": 0.030241934582591057,
      "rewards/accuracy_reward/std": 0.1714252382516861,
      "step": 814
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.771484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 1802.287109375,
      "completions/mean_terminated_length": 972.74365234375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.27822821541350173,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 14.597076921488581,
      "learning_rate": 9.160637903522031e-07,
      "loss": 0.0082,
      "num_tokens": 790720943.0,
      "reward": 0.044921875,
      "reward_std": 0.06684703379869461,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 815
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1927.0,
      "completions/mean_length": 1742.84375,
      "completions/mean_terminated_length": 932.0,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.27856959972689255,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 4.281868494887167,
      "learning_rate": 9.157517303850814e-07,
      "loss": 0.0058,
      "num_tokens": 791696863.0,
      "reward": 0.0390625,
      "reward_std": 0.07289456576108932,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 816
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1767.712890625,
      "completions/mean_terminated_length": 852.1083984375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.27891098404028336,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 3.919495329929373,
      "learning_rate": 9.154391513002754e-07,
      "loss": 0.0052,
      "num_tokens": 792677932.0,
      "reward": 0.0078125,
      "reward_std": 0.021347813308238983,
      "rewards/accuracy_reward/mean": 0.0078125,
      "rewards/accuracy_reward/std": 0.08812850713729858,
      "step": 817
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.830078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1949.0,
      "completions/mean_length": 1842.521484375,
      "completions/mean_terminated_length": 838.7471313476562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2792523683536742,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 6.57673311161995,
      "learning_rate": 9.151260535414336e-07,
      "loss": 0.015,
      "num_tokens": 793694375.0,
      "reward": 0.037109375,
      "reward_std": 0.08133251965045929,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 818
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.806640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 1821.9453125,
      "completions/mean_terminated_length": 878.9091186523438,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.27959375266706493,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 10.636962021839514,
      "learning_rate": 9.148124375529414e-07,
      "loss": 0.0219,
      "num_tokens": 794708299.0,
      "reward": 0.046875,
      "reward_std": 0.09228022396564484,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 819
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.771484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2012.0,
      "completions/mean_length": 1774.3828125,
      "completions/mean_terminated_length": 850.6325073242188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.27993513698045575,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 4.16909340269041,
      "learning_rate": 9.144983037799192e-07,
      "loss": 0.0134,
      "num_tokens": 795689279.0,
      "reward": 0.02734375,
      "reward_std": 0.0635918527841568,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 820
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.724609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1761.0,
      "completions/mean_length": 1698.392578125,
      "completions/mean_terminated_length": 778.5035400390625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.28027652129384656,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 6.886377386385229,
      "learning_rate": 9.141836526682226e-07,
      "loss": 0.0175,
      "num_tokens": 796633400.0,
      "reward": 0.05859375,
      "reward_std": 0.055899329483509064,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 821
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.767578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1901.0,
      "completions/mean_length": 1802.443359375,
      "completions/mean_terminated_length": 991.4874267578125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.2806179056072374,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 10.070686773266614,
      "learning_rate": 9.138684846644408e-07,
      "loss": 0.0091,
      "num_tokens": 797637691.0,
      "reward": 0.03125,
      "reward_std": 0.05259781330823898,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 822
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.740234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1741.353515625,
      "completions/mean_terminated_length": 867.5263671875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.28095928992062813,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 10.327527105571596,
      "learning_rate": 9.135528002158977e-07,
      "loss": -0.0003,
      "num_tokens": 798608704.0,
      "reward": 0.083984375,
      "reward_std": 0.0775209367275238,
      "rewards/accuracy_reward/mean": 0.083984375,
      "rewards/accuracy_reward/std": 0.2776356339454651,
      "step": 823
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.740234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2024.0,
      "completions/mean_length": 1723.576171875,
      "completions/mean_terminated_length": 799.0902709960938,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.28130067423401894,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 1.5841398569998713,
      "learning_rate": 9.132365997706493e-07,
      "loss": 0.0238,
      "num_tokens": 799573847.0,
      "reward": 0.03515625,
      "reward_std": 0.056218504905700684,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 824
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.830078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1994.0,
      "completions/mean_length": 1835.443359375,
      "completions/mean_terminated_length": 797.0919799804688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.28164205854740976,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 1.6252711039686722,
      "learning_rate": 9.129198837774846e-07,
      "loss": 0.0149,
      "num_tokens": 800590362.0,
      "reward": 0.0234375,
      "reward_std": 0.03125,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 825
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.763671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2021.0,
      "completions/mean_length": 1760.51953125,
      "completions/mean_terminated_length": 831.5537109375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.28198344286080057,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 2.255931144584967,
      "learning_rate": 9.126026526859236e-07,
      "loss": 0.0088,
      "num_tokens": 801566676.0,
      "reward": 0.01953125,
      "reward_std": 0.03674772381782532,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 826
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.751953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 1764.47265625,
      "completions/mean_terminated_length": 905.3228149414062,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.28232482717419133,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 6.973680788850989,
      "learning_rate": 9.122849069462181e-07,
      "loss": 0.0094,
      "num_tokens": 802562966.0,
      "reward": 0.12890625,
      "reward_std": 0.06794347614049911,
      "rewards/accuracy_reward/mean": 0.12890625,
      "rewards/accuracy_reward/std": 0.33542385697364807,
      "step": 827
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.791015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1983.0,
      "completions/mean_length": 1803.73046875,
      "completions/mean_terminated_length": 879.1588745117188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.28266621148758214,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 0.7959328963289285,
      "learning_rate": 9.119666470093501e-07,
      "loss": 0.0198,
      "num_tokens": 803566636.0,
      "reward": 0.03515625,
      "reward_std": 0.035868462175130844,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 828
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.771484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1765.677734375,
      "completions/mean_terminated_length": 812.5385131835938,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.28300759580097296,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 4.502840958414974,
      "learning_rate": 9.116478733270312e-07,
      "loss": 0.0072,
      "num_tokens": 804550391.0,
      "reward": 0.01953125,
      "reward_std": 0.05402229726314545,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 829
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.86328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2019.0,
      "completions/mean_length": 1896.322265625,
      "completions/mean_terminated_length": 938.585693359375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.28334898011436377,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 12.189984021375238,
      "learning_rate": 9.113285863517024e-07,
      "loss": 0.0199,
      "num_tokens": 805603068.0,
      "reward": 0.03125,
      "reward_std": 0.05259781330823898,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 830
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.876953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1976.0,
      "completions/mean_length": 1907.384765625,
      "completions/mean_terminated_length": 905.2222900390625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2836903644277545,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 3.395821068244509,
      "learning_rate": 9.110087865365333e-07,
      "loss": 0.0049,
      "num_tokens": 806654353.0,
      "reward": 0.0078125,
      "reward_std": 0.020409777760505676,
      "rewards/accuracy_reward/mean": 0.0078125,
      "rewards/accuracy_reward/std": 0.08812850713729858,
      "step": 831
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.85546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1791.0,
      "completions/mean_length": 1863.318359375,
      "completions/mean_terminated_length": 770.9459838867188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.28403174874114534,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 20.836372449782818,
      "learning_rate": 9.10688474335421e-07,
      "loss": 0.0026,
      "num_tokens": 807689348.0,
      "reward": 0.060546875,
      "reward_std": 0.08780106902122498,
      "rewards/accuracy_reward/mean": 0.06458333134651184,
      "rewards/accuracy_reward/std": 0.24604564905166626,
      "step": 832
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1843.39453125,
      "completions/mean_terminated_length": 957.3021240234375,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.28437313305453615,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 17.01190114557995,
      "learning_rate": 9.103676502029901e-07,
      "loss": -0.0058,
      "num_tokens": 808725454.0,
      "reward": 0.0625,
      "reward_std": 0.10064490139484406,
      "rewards/accuracy_reward/mean": 0.06451612710952759,
      "rewards/accuracy_reward/std": 0.2459181249141693,
      "step": 833
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.837890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1955.0,
      "completions/mean_length": 1839.3046875,
      "completions/mean_terminated_length": 760.62646484375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.28471451736792697,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 16.371139320984373,
      "learning_rate": 9.100463145945921e-07,
      "loss": 0.0225,
      "num_tokens": 809750762.0,
      "reward": 0.0390625,
      "reward_std": 0.062054343521595,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 834
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.865234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1839.0,
      "completions/mean_length": 1884.580078125,
      "completions/mean_terminated_length": 835.3768310546875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.2850559016813177,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 4.270470032206402,
      "learning_rate": 9.097244679663037e-07,
      "loss": 0.0174,
      "num_tokens": 810792291.0,
      "reward": 0.0390625,
      "reward_std": 0.08544550836086273,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 835
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.865234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1904.75,
      "completions/mean_terminated_length": 985.0435180664062,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.28539728599470854,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 4.193138502196356,
      "learning_rate": 9.094021107749277e-07,
      "loss": 0.0265,
      "num_tokens": 811846467.0,
      "reward": 0.03125,
      "reward_std": 0.07426576316356659,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 836
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.880859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1912.0,
      "completions/mean_length": 1888.322265625,
      "completions/mean_terminated_length": 707.7540283203125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.28573867030809935,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 8.403438460241839,
      "learning_rate": 9.090792434779911e-07,
      "loss": 0.0234,
      "num_tokens": 812888600.0,
      "reward": 0.04296875,
      "reward_std": 0.07443207502365112,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 837
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.822265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1823.0,
      "completions/mean_length": 1829.146484375,
      "completions/mean_terminated_length": 816.6483764648438,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.28608005462149017,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 11.672956930438959,
      "learning_rate": 9.087558665337447e-07,
      "loss": 0.0038,
      "num_tokens": 813903699.0,
      "reward": 0.01171875,
      "reward_std": 0.04192390665411949,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "step": 838
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.822265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1997.0,
      "completions/mean_length": 1836.85546875,
      "completions/mean_terminated_length": 860.02197265625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.2864214389348809,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 3.2238850401099515,
      "learning_rate": 9.084319804011631e-07,
      "loss": 0.0234,
      "num_tokens": 814915257.0,
      "reward": 0.05859375,
      "reward_std": 0.0756574496626854,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 839
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.87109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1878.810546875,
      "completions/mean_terminated_length": 735.5,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.28676282324827174,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 4.127142651197428,
      "learning_rate": 9.081075855399434e-07,
      "loss": 0.0043,
      "num_tokens": 815950984.0,
      "reward": 0.005859375,
      "reward_std": 0.01848640665411949,
      "rewards/accuracy_reward/mean": 0.005859375,
      "rewards/accuracy_reward/std": 0.07639661431312561,
      "step": 840
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1771.0,
      "completions/mean_length": 1939.169921875,
      "completions/mean_terminated_length": 976.4423217773438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.28710420756166255,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 2.683663873918988,
      "learning_rate": 9.077826824105049e-07,
      "loss": 0.0153,
      "num_tokens": 817018511.0,
      "reward": 0.0078125,
      "reward_std": 0.03125,
      "rewards/accuracy_reward/mean": 0.0078125,
      "rewards/accuracy_reward/std": 0.08812850713729858,
      "step": 841
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1683.0,
      "completions/mean_length": 1942.5234375,
      "completions/mean_terminated_length": 820.6364135742188,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.28744559187505336,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 1.423387792507175,
      "learning_rate": 9.074572714739881e-07,
      "loss": 0.023,
      "num_tokens": 818080715.0,
      "reward": 0.01953125,
      "reward_std": 0.05644455552101135,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 842
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.83984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2025.0,
      "completions/mean_length": 1834.3828125,
      "completions/mean_terminated_length": 714.195068359375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2877869761884441,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 3.914633061898735,
      "learning_rate": 9.071313531922541e-07,
      "loss": 0.0119,
      "num_tokens": 819100495.0,
      "reward": 0.03125,
      "reward_std": 0.04847268760204315,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 843
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.861328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2026.0,
      "completions/mean_length": 1890.33984375,
      "completions/mean_terminated_length": 911.0704345703125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.28812836050183493,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 18.375183140345467,
      "learning_rate": 9.068049280278846e-07,
      "loss": -0.0024,
      "num_tokens": 820145677.0,
      "reward": 0.064453125,
      "reward_std": 0.09178133308887482,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 844
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.853515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1857.0,
      "completions/mean_length": 1858.984375,
      "completions/mean_terminated_length": 757.6533813476562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.28846974481522575,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 4.691095628905756,
      "learning_rate": 9.064779964441802e-07,
      "loss": 0.0059,
      "num_tokens": 821172613.0,
      "reward": 0.021484375,
      "reward_std": 0.036420635879039764,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 845
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.79296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1943.0,
      "completions/mean_length": 1781.109375,
      "completions/mean_terminated_length": 758.867919921875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.28881112912861656,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 6.227706853654657,
      "learning_rate": 9.061505589051606e-07,
      "loss": 0.0261,
      "num_tokens": 822157597.0,
      "reward": 0.06640625,
      "reward_std": 0.0843944400548935,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "step": 846
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.873046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1661.0,
      "completions/mean_length": 1893.7109375,
      "completions/mean_terminated_length": 832.6769409179688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.2891525134420073,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 13.660740416830857,
      "learning_rate": 9.058226158755634e-07,
      "loss": 0.0207,
      "num_tokens": 823202553.0,
      "reward": 0.0234375,
      "reward_std": 0.06805649399757385,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 847
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.83984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1983.0,
      "completions/mean_length": 1854.662109375,
      "completions/mean_terminated_length": 840.8170166015625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.28949389775539813,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 12.978697093397896,
      "learning_rate": 9.05494167820844e-07,
      "loss": 0.0011,
      "num_tokens": 824236076.0,
      "reward": 0.041015625,
      "reward_std": 0.069866843521595,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 848
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.845703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1964.0,
      "completions/mean_length": 1848.302734375,
      "completions/mean_terminated_length": 753.759521484375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.28983528206878895,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 2.294131623082693,
      "learning_rate": 9.05165215207174e-07,
      "loss": 0.009,
      "num_tokens": 825258343.0,
      "reward": 0.0234375,
      "reward_std": 0.053855981677770615,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 849
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.849609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1835.0,
      "completions/mean_length": 1860.3359375,
      "completions/mean_terminated_length": 800.1558227539062,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.29017666638217976,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 14.250455706685674,
      "learning_rate": 9.048357585014417e-07,
      "loss": 0.0123,
      "num_tokens": 826290099.0,
      "reward": 0.048828125,
      "reward_std": 0.06075643002986908,
      "rewards/accuracy_reward/mean": 0.05040322616696358,
      "rewards/accuracy_reward/std": 0.21899642050266266,
      "step": 850
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.837890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1983.0,
      "completions/mean_length": 1866.08203125,
      "completions/mean_terminated_length": 925.8071899414062,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2905180506955705,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 4.8172943673488975,
      "learning_rate": 9.045057981712504e-07,
      "loss": 0.0015,
      "num_tokens": 827324605.0,
      "reward": 0.017578125,
      "reward_std": 0.0390625,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "step": 851
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1721.0,
      "completions/mean_length": 1782.380859375,
      "completions/mean_terminated_length": 740.3365478515625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.29085943500896133,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 12.221566492598393,
      "learning_rate": 9.041753346849187e-07,
      "loss": 0.0073,
      "num_tokens": 828319232.0,
      "reward": 0.05078125,
      "reward_std": 0.08874702453613281,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 852
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1754.94140625,
      "completions/mean_terminated_length": 797.61669921875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.29120081932235214,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 18.8383758396224,
      "learning_rate": 9.038443685114791e-07,
      "loss": 0.0212,
      "num_tokens": 829292546.0,
      "reward": 0.0625,
      "reward_std": 0.07708083093166351,
      "rewards/accuracy_reward/mean": 0.06451612710952759,
      "rewards/accuracy_reward/std": 0.2459181249141693,
      "step": 853
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.84765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1960.0,
      "completions/mean_length": 1877.115234375,
      "completions/mean_terminated_length": 926.2948608398438,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.29154220363574296,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 4.906759911723979,
      "learning_rate": 9.035129001206771e-07,
      "loss": -0.0017,
      "num_tokens": 830320605.0,
      "reward": 0.021484375,
      "reward_std": 0.04604348540306091,
      "rewards/accuracy_reward/mean": 0.02217741869390011,
      "rewards/accuracy_reward/std": 0.14740893244743347,
      "step": 854
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1935.0,
      "completions/mean_length": 1752.626953125,
      "completions/mean_terminated_length": 828.3951416015625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.2918835879491337,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 17.75494572292462,
      "learning_rate": 9.03180929982972e-07,
      "loss": -0.0093,
      "num_tokens": 831292446.0,
      "reward": 0.033203125,
      "reward_std": 0.06661957502365112,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 855
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.767578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1990.0,
      "completions/mean_length": 1738.087890625,
      "completions/mean_terminated_length": 714.5966796875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.29222497226252453,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 7.0996767404272845,
      "learning_rate": 9.028484585695345e-07,
      "loss": 0.0104,
      "num_tokens": 832261019.0,
      "reward": 0.044921875,
      "reward_std": 0.048086829483509064,
      "rewards/accuracy_reward/mean": 0.0463709682226181,
      "rewards/accuracy_reward/std": 0.21049949526786804,
      "step": 856
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.791015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1903.0,
      "completions/mean_length": 1798.583984375,
      "completions/mean_terminated_length": 854.8971557617188,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.29256635657591534,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 12.859577799381787,
      "learning_rate": 9.025154863522467e-07,
      "loss": 0.0035,
      "num_tokens": 833266534.0,
      "reward": 0.05859375,
      "reward_std": 0.07944335043430328,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 857
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.748046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1884.0,
      "completions/mean_length": 1762.826171875,
      "completions/mean_terminated_length": 916.1472778320312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.29290774088930616,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 6.829967668780411,
      "learning_rate": 9.021820138037022e-07,
      "loss": 0.0053,
      "num_tokens": 834241837.0,
      "reward": 0.0078125,
      "reward_std": 0.02629890665411949,
      "rewards/accuracy_reward/mean": 0.008064515888690948,
      "rewards/accuracy_reward/std": 0.0895301103591919,
      "step": 858
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.755859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1946.0,
      "completions/mean_length": 1745.716796875,
      "completions/mean_terminated_length": 809.8480224609375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2932491252026969,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 14.248401421965156,
      "learning_rate": 9.01848041397204e-07,
      "loss": 0.01,
      "num_tokens": 835214332.0,
      "reward": 0.041015625,
      "reward_std": 0.04125870764255524,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 859
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1846.0,
      "completions/mean_length": 1802.109375,
      "completions/mean_terminated_length": 837.4615478515625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.2935905095160877,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 8.343051079539533,
      "learning_rate": 9.015135696067649e-07,
      "loss": 0.0016,
      "num_tokens": 836209524.0,
      "reward": 0.060546875,
      "reward_std": 0.06849660724401474,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.24230584502220154,
      "step": 860
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.806640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2027.0,
      "completions/mean_length": 1837.31640625,
      "completions/mean_terminated_length": 958.404052734375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.29393189382947854,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 12.252634290375203,
      "learning_rate": 9.011785989071066e-07,
      "loss": 0.0281,
      "num_tokens": 837220630.0,
      "reward": 0.060546875,
      "reward_std": 0.09016455709934235,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 861
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.84765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1985.0,
      "completions/mean_length": 1874.43359375,
      "completions/mean_terminated_length": 908.6923217773438,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.29427327814286935,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 11.947210803084694,
      "learning_rate": 9.008431297736585e-07,
      "loss": 0.0112,
      "num_tokens": 838256372.0,
      "reward": 0.04296875,
      "reward_std": 0.040274329483509064,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 862
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.798828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1985.0,
      "completions/mean_length": 1803.681640625,
      "completions/mean_terminated_length": 833.5242919921875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.2946146624562601,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 11.315930331018635,
      "learning_rate": 9.005071626825577e-07,
      "loss": 0.0137,
      "num_tokens": 839252321.0,
      "reward": 0.017578125,
      "reward_std": 0.04125870764255524,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "step": 863
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2035.0,
      "completions/mean_length": 1866.99609375,
      "completions/mean_terminated_length": 760.8611450195312,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.2949560467696509,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 3.6635401294550776,
      "learning_rate": 9.001706981106482e-07,
      "loss": 0.0048,
      "num_tokens": 840288111.0,
      "reward": 0.015625,
      "reward_std": 0.040274329483509064,
      "rewards/accuracy_reward/mean": 0.015625,
      "rewards/accuracy_reward/std": 0.12414088100194931,
      "step": 864
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.767578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1771.62109375,
      "completions/mean_terminated_length": 858.8740234375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.29529743108304174,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 21.12347243598158,
      "learning_rate": 8.998337365354798e-07,
      "loss": 0.0147,
      "num_tokens": 841273037.0,
      "reward": 0.02734375,
      "reward_std": 0.05806133523583412,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 865
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1983.0,
      "completions/mean_length": 1755.48046875,
      "completions/mean_terminated_length": 877.921875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.29563881539643255,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 5.274812844600458,
      "learning_rate": 8.994962784353079e-07,
      "loss": 0.0226,
      "num_tokens": 842246931.0,
      "reward": 0.04296875,
      "reward_std": 0.08157937228679657,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 866
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.787109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1955.0,
      "completions/mean_length": 1800.8828125,
      "completions/mean_terminated_length": 887.2293090820312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.2959801997098233,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 2.2190727334438605,
      "learning_rate": 8.991583242890924e-07,
      "loss": 0.0025,
      "num_tokens": 843253927.0,
      "reward": 0.017578125,
      "reward_std": 0.03536957502365112,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "step": 867
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.79296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1988.0,
      "completions/mean_length": 1804.8125,
      "completions/mean_terminated_length": 873.3585205078125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.2963215840232141,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 6.815472197270014,
      "learning_rate": 8.988198745764976e-07,
      "loss": 0.0065,
      "num_tokens": 844263143.0,
      "reward": 0.021484375,
      "reward_std": 0.036420635879039764,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 868
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.837890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1814.0,
      "completions/mean_length": 1864.06640625,
      "completions/mean_terminated_length": 913.3734741210938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.29666296833660494,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 2.3164676650613054,
      "learning_rate": 8.984809297778908e-07,
      "loss": 0.0075,
      "num_tokens": 845294409.0,
      "reward": 0.03125,
      "reward_std": 0.015625,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 869
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.841796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2020.0,
      "completions/mean_length": 1878.345703125,
      "completions/mean_terminated_length": 975.6173095703125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.29700435264999575,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 10.956686543784919,
      "learning_rate": 8.981414903743423e-07,
      "loss": 0.0294,
      "num_tokens": 846325562.0,
      "reward": 0.0625,
      "reward_std": 0.09671888500452042,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 870
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1979.0,
      "completions/mean_length": 1888.41796875,
      "completions/mean_terminated_length": 972.9210815429688,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.2973457369633865,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 5.498690871532542,
      "learning_rate": 8.97801556847624e-07,
      "loss": 0.0097,
      "num_tokens": 847370768.0,
      "reward": 0.021484375,
      "reward_std": 0.03163585811853409,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 871
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2024.0,
      "completions/mean_length": 1848.6328125,
      "completions/mean_terminated_length": 938.478271484375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.2976871212767773,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 5.730046789814379,
      "learning_rate": 8.974611296802096e-07,
      "loss": 0.0024,
      "num_tokens": 848394580.0,
      "reward": 0.046875,
      "reward_std": 0.044233135879039764,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 872
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.857421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1894.0,
      "completions/mean_length": 1881.431640625,
      "completions/mean_terminated_length": 879.73974609375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.29802850559016814,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 5.7773680058847106,
      "learning_rate": 8.971202093552731e-07,
      "loss": 0.021,
      "num_tokens": 849434705.0,
      "reward": 0.03515625,
      "reward_std": 0.05138043686747551,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 873
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1808.109375,
      "completions/mean_terminated_length": 867.0000610351562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.29836988990355895,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 1.7376704146901976,
      "learning_rate": 8.967787963566887e-07,
      "loss": 0.0124,
      "num_tokens": 850433001.0,
      "reward": 0.025390625,
      "reward_std": 0.04620979726314545,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 874
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1800.59375,
      "completions/mean_terminated_length": 875.1111450195312,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.2987112742169497,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 14.896390218321304,
      "learning_rate": 8.964368911690296e-07,
      "loss": 0.0097,
      "num_tokens": 851447129.0,
      "reward": 0.04296875,
      "reward_std": 0.07317391037940979,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 875
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.78125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1904.0,
      "completions/mean_length": 1783.162109375,
      "completions/mean_terminated_length": 837.3125610351562,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.2990526585303405,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 12.428323802598927,
      "learning_rate": 8.960944942775675e-07,
      "loss": 0.027,
      "num_tokens": 852433180.0,
      "reward": 0.078125,
      "reward_std": 0.09060370922088623,
      "rewards/accuracy_reward/mean": 0.078125,
      "rewards/accuracy_reward/std": 0.26863065361976624,
      "step": 876
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.79296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1909.0,
      "completions/mean_length": 1785.802734375,
      "completions/mean_terminated_length": 781.5377197265625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.29939404284373133,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 5.528397245482416,
      "learning_rate": 8.957516061682724e-07,
      "loss": 0.016,
      "num_tokens": 853427783.0,
      "reward": 0.029296875,
      "reward_std": 0.049457065761089325,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 877
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.794921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 1812.375,
      "completions/mean_terminated_length": 899.0476684570312,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.29973542715712215,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 10.188880628697577,
      "learning_rate": 8.954082273278112e-07,
      "loss": 0.0094,
      "num_tokens": 854430631.0,
      "reward": 0.029296875,
      "reward_std": 0.049457065761089325,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 878
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.744140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2002.0,
      "completions/mean_length": 1742.98828125,
      "completions/mean_terminated_length": 856.572509765625,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.3000768114705129,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 11.391181433833179,
      "learning_rate": 8.950643582435474e-07,
      "loss": 0.0102,
      "num_tokens": 855410689.0,
      "reward": 0.05859375,
      "reward_std": 0.10986974090337753,
      "rewards/accuracy_reward/mean": 0.060483869165182114,
      "rewards/accuracy_reward/std": 0.2386218160390854,
      "step": 879
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.748046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 1748.1953125,
      "completions/mean_terminated_length": 858.465087890625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.3004181957839037,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 17.016757797584134,
      "learning_rate": 8.9471999940354e-07,
      "loss": 0.0125,
      "num_tokens": 856402005.0,
      "reward": 0.033203125,
      "reward_std": 0.06018522381782532,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 880
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1779.482421875,
      "completions/mean_terminated_length": 882.9067993164062,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.30075958009729453,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 13.073247218031963,
      "learning_rate": 8.943751512965437e-07,
      "loss": 0.0415,
      "num_tokens": 857384876.0,
      "reward": 0.060546875,
      "reward_std": 0.10088353604078293,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 881
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.814453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1923.0,
      "completions/mean_length": 1826.751953125,
      "completions/mean_terminated_length": 855.5895385742188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.30110096441068535,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 9.278548574437648,
      "learning_rate": 8.940298144120074e-07,
      "loss": 0.0236,
      "num_tokens": 858399885.0,
      "reward": 0.05859375,
      "reward_std": 0.10272903740406036,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 882
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.810546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1952.0,
      "completions/mean_length": 1852.19140625,
      "completions/mean_terminated_length": 1014.4535522460938,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.3014423487240761,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 3.58069200787596,
      "learning_rate": 8.936839892400732e-07,
      "loss": 0.0297,
      "num_tokens": 859427023.0,
      "reward": 0.0546875,
      "reward_std": 0.11459337919950485,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 883
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2042.0,
      "completions/mean_length": 1802.20703125,
      "completions/mean_terminated_length": 882.75927734375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3017837330374669,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 6.840636193515991,
      "learning_rate": 8.93337676271577e-07,
      "loss": -0.006,
      "num_tokens": 860428121.0,
      "reward": 0.041015625,
      "reward_std": 0.06750431656837463,
      "rewards/accuracy_reward/mean": 0.04233871027827263,
      "rewards/accuracy_reward/std": 0.2015640139579773,
      "step": 884
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.716796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2008.0,
      "completions/mean_length": 1699.19921875,
      "completions/mean_terminated_length": 816.3724365234375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.30212511735085773,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 7.254061258358082,
      "learning_rate": 8.929908759980467e-07,
      "loss": 0.017,
      "num_tokens": 861372335.0,
      "reward": 0.041015625,
      "reward_std": 0.0854918509721756,
      "rewards/accuracy_reward/mean": 0.04233871027827263,
      "rewards/accuracy_reward/std": 0.2015640139579773,
      "step": 885
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.806640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2008.0,
      "completions/mean_length": 1808.37890625,
      "completions/mean_terminated_length": 808.7474975585938,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.30246650166424854,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 13.18841138526206,
      "learning_rate": 8.926435889117019e-07,
      "loss": 0.0145,
      "num_tokens": 862373649.0,
      "reward": 0.037109375,
      "reward_std": 0.07800643146038055,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 886
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.83203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1970.0,
      "completions/mean_length": 1848.21484375,
      "completions/mean_terminated_length": 859.0348510742188,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.3028078859776393,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 13.893986923322798,
      "learning_rate": 8.922958155054527e-07,
      "loss": 0.0089,
      "num_tokens": 863406831.0,
      "reward": 0.041015625,
      "reward_std": 0.05814187228679657,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 887
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.751953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2007.0,
      "completions/mean_length": 1737.130859375,
      "completions/mean_terminated_length": 794.7322998046875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3031492702910301,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 10.029818981802405,
      "learning_rate": 8.919475562729004e-07,
      "loss": 0.0114,
      "num_tokens": 864381922.0,
      "reward": 0.02734375,
      "reward_std": 0.06987475603818893,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 888
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.814453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 1820.1171875,
      "completions/mean_terminated_length": 819.8316040039062,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.30349065460442093,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 16.276006276442953,
      "learning_rate": 8.915988117083351e-07,
      "loss": -0.0054,
      "num_tokens": 865388206.0,
      "reward": 0.060546875,
      "reward_std": 0.06218091398477554,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.24230584502220154,
      "step": 889
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1824.611328125,
      "completions/mean_terminated_length": 804.7935180664062,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.30383203891781174,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 2.8889294067874864,
      "learning_rate": 8.912495823067356e-07,
      "loss": 0.0112,
      "num_tokens": 866393655.0,
      "reward": 0.044921875,
      "reward_std": 0.04396171122789383,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 890
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.775390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1958.0,
      "completions/mean_length": 1794.994140625,
      "completions/mean_terminated_length": 921.5738525390625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.3041734232312025,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 6.512924424167205,
      "learning_rate": 8.908998685637696e-07,
      "loss": 0.0157,
      "num_tokens": 867389764.0,
      "reward": 0.0546875,
      "reward_std": 0.08394189178943634,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 891
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.83203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1932.0,
      "completions/mean_length": 1872.921875,
      "completions/mean_terminated_length": 1005.6744384765625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.3045148075445933,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 3.7624399158122857,
      "learning_rate": 8.905496709757917e-07,
      "loss": 0.0058,
      "num_tokens": 868415836.0,
      "reward": 0.025390625,
      "reward_std": 0.03878315910696983,
      "rewards/accuracy_reward/mean": 0.026209676638245583,
      "rewards/accuracy_reward/std": 0.1599196344614029,
      "step": 892
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.845703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1938.0,
      "completions/mean_length": 1869.724609375,
      "completions/mean_terminated_length": 892.594970703125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.3048561918579841,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 10.170331309747356,
      "learning_rate": 8.90198990039843e-07,
      "loss": 0.0048,
      "num_tokens": 869450527.0,
      "reward": 0.013671875,
      "reward_std": 0.03394509106874466,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "step": 893
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.880859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1998.0,
      "completions/mean_length": 1910.716796875,
      "completions/mean_terminated_length": 895.7212524414062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.30519757617137494,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 2.842252475556774,
      "learning_rate": 8.898478262536513e-07,
      "loss": 0.0053,
      "num_tokens": 870510542.0,
      "reward": 0.01171875,
      "reward_std": 0.03697281330823898,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "step": 894
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.814453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1987.0,
      "completions/mean_length": 1816.4140625,
      "completions/mean_terminated_length": 799.8737182617188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3055389604847657,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 7.014274401936735,
      "learning_rate": 8.894961801156292e-07,
      "loss": 0.0218,
      "num_tokens": 871509202.0,
      "reward": 0.103515625,
      "reward_std": 0.10843831300735474,
      "rewards/accuracy_reward/mean": 0.103515625,
      "rewards/accuracy_reward/std": 0.30492907762527466,
      "step": 895
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1896.0,
      "completions/mean_length": 1836.443359375,
      "completions/mean_terminated_length": 817.125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3058803447981565,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 3.434047514521299,
      "learning_rate": 8.891440521248742e-07,
      "loss": 0.0115,
      "num_tokens": 872522325.0,
      "reward": 0.017578125,
      "reward_std": 0.03163585811853409,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "step": 896
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.783203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1963.0,
      "completions/mean_length": 1794.603515625,
      "completions/mean_terminated_length": 879.18017578125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.3062217291115473,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 17.531396339008143,
      "learning_rate": 8.887914427811676e-07,
      "loss": 0.018,
      "num_tokens": 873530586.0,
      "reward": 0.048828125,
      "reward_std": 0.08159180730581284,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 897
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1901.0,
      "completions/mean_length": 1874.28125,
      "completions/mean_terminated_length": 812.6666870117188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.30656311342493814,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 5.359007267099055,
      "learning_rate": 8.884383525849736e-07,
      "loss": 0.0161,
      "num_tokens": 874562506.0,
      "reward": 0.029296875,
      "reward_std": 0.05688370764255524,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17417415976524353,
      "step": 898
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.705078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2027.0,
      "completions/mean_length": 1678.982421875,
      "completions/mean_terminated_length": 796.7615966796875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3069044977383289,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 15.517779518403353,
      "learning_rate": 8.880847820374395e-07,
      "loss": 0.0053,
      "num_tokens": 875497953.0,
      "reward": 0.0234375,
      "reward_std": 0.06805649399757385,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 899
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.794921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1967.0,
      "completions/mean_length": 1805.34765625,
      "completions/mean_terminated_length": 864.781005859375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3072458820517197,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 6.104829379012291,
      "learning_rate": 8.877307316403936e-07,
      "loss": 0.0017,
      "num_tokens": 876492867.0,
      "reward": 0.013671875,
      "reward_std": 0.032461829483509064,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "step": 900
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1794.666015625,
      "completions/mean_terminated_length": 929.836181640625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3075872663651105,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 6.41204495246442,
      "learning_rate": 8.87376201896346e-07,
      "loss": 0.0083,
      "num_tokens": 877497752.0,
      "reward": 0.01171875,
      "reward_std": 0.031083684414625168,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "step": 901
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2026.0,
      "completions/mean_length": 1851.3359375,
      "completions/mean_terminated_length": 904.4091186523438,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.30792865067850134,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 8.238588126920247,
      "learning_rate": 8.870211933084868e-07,
      "loss": 0.0037,
      "num_tokens": 878528244.0,
      "reward": 0.03515625,
      "reward_std": 0.06327171623706818,
      "rewards/accuracy_reward/mean": 0.03629032149910927,
      "rewards/accuracy_reward/std": 0.1872003972530365,
      "step": 902
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1957.0,
      "completions/mean_length": 1747.658203125,
      "completions/mean_terminated_length": 846.6328125,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.30827003499189215,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 23.585498526051897,
      "learning_rate": 8.866657063806859e-07,
      "loss": 0.0022,
      "num_tokens": 879502949.0,
      "reward": 0.080078125,
      "reward_std": 0.08765621483325958,
      "rewards/accuracy_reward/mean": 0.080078125,
      "rewards/accuracy_reward/std": 0.271679550409317,
      "step": 903
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1953.0,
      "completions/mean_length": 1727.962890625,
      "completions/mean_terminated_length": 877.5785522460938,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3086114193052829,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 3.3473270874570704,
      "learning_rate": 8.863097416174916e-07,
      "loss": 0.0121,
      "num_tokens": 880469634.0,
      "reward": 0.037109375,
      "reward_std": 0.07817822694778442,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 904
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2027.0,
      "completions/mean_length": 1721.701171875,
      "completions/mean_terminated_length": 819.5808715820312,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.3089528036186737,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 11.022643754273442,
      "learning_rate": 8.859532995241309e-07,
      "loss": -0.0157,
      "num_tokens": 881426041.0,
      "reward": 0.052734375,
      "reward_std": 0.05628519132733345,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 905
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.755859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1976.0,
      "completions/mean_length": 1750.8125,
      "completions/mean_terminated_length": 830.7200317382812,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.30929418793206453,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 6.801666732287964,
      "learning_rate": 8.855963806065085e-07,
      "loss": 0.0016,
      "num_tokens": 882403161.0,
      "reward": 0.041015625,
      "reward_std": 0.07339344918727875,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 906
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.88671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1993.0,
      "completions/mean_length": 1922.271484375,
      "completions/mean_terminated_length": 938.1206665039062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.30963557224545535,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 0.5249261196057777,
      "learning_rate": 8.85238985371205e-07,
      "loss": 0.014,
      "num_tokens": 883465156.0,
      "reward": 0.0234375,
      "reward_std": 0.03779878467321396,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 907
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.77734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1777.671875,
      "completions/mean_terminated_length": 834.2982788085938,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.3099769565588461,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 8.082493635451137,
      "learning_rate": 8.848811143254779e-07,
      "loss": 0.0202,
      "num_tokens": 884452364.0,
      "reward": 0.103515625,
      "reward_std": 0.11873330175876617,
      "rewards/accuracy_reward/mean": 0.103515625,
      "rewards/accuracy_reward/std": 0.30492907762527466,
      "step": 908
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1883.0,
      "completions/mean_length": 1813.486328125,
      "completions/mean_terminated_length": 893.47119140625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.3103183408722369,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 9.792449496332944,
      "learning_rate": 8.845227679772596e-07,
      "loss": 0.0075,
      "num_tokens": 885454453.0,
      "reward": 0.03515625,
      "reward_std": 0.05931950733065605,
      "rewards/accuracy_reward/mean": 0.03629032149910927,
      "rewards/accuracy_reward/std": 0.1872004121541977,
      "step": 909
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.779296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1936.0,
      "completions/mean_length": 1776.69921875,
      "completions/mean_terminated_length": 818.7433471679688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.31065972518562773,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 6.140876558577721,
      "learning_rate": 8.841639468351571e-07,
      "loss": 0.0313,
      "num_tokens": 886437819.0,
      "reward": 0.044921875,
      "reward_std": 0.09027662128210068,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 910
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.787109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1982.0,
      "completions/mean_length": 1811.90625,
      "completions/mean_terminated_length": 939.5320434570312,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.31100110949901855,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 5.39844096889801,
      "learning_rate": 8.838046514084516e-07,
      "loss": 0.0124,
      "num_tokens": 887446059.0,
      "reward": 0.01953125,
      "reward_std": 0.051493462175130844,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 911
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.693359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1657.48046875,
      "completions/mean_terminated_length": 774.4586181640625,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.3113424938124093,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 13.251234577297387,
      "learning_rate": 8.834448822070971e-07,
      "loss": -0.0051,
      "num_tokens": 888370161.0,
      "reward": 0.05078125,
      "reward_std": 0.06085042655467987,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 912
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.751953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1965.0,
      "completions/mean_length": 1753.416015625,
      "completions/mean_terminated_length": 860.3858032226562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3116838781258001,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 5.982841117113028,
      "learning_rate": 8.830846397417202e-07,
      "loss": 0.0175,
      "num_tokens": 889340806.0,
      "reward": 0.037109375,
      "reward_std": 0.06311991065740585,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 913
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.79296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1822.853515625,
      "completions/mean_terminated_length": 960.5,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.31202526243919093,
      "frac_reward_zero_std": 0.96875,
      "grad_norm": 0.06309684439459966,
      "learning_rate": 8.827239245236194e-07,
      "loss": 0.0038,
      "num_tokens": 890356859.0,
      "reward": 0.001953125,
      "reward_std": 0.0078125,
      "rewards/accuracy_reward/mean": 0.001953125,
      "rewards/accuracy_reward/std": 0.04419417306780815,
      "step": 914
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2033.0,
      "completions/mean_length": 1714.5,
      "completions/mean_terminated_length": 878.4657592773438,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.31236664675258174,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 5.777618922363556,
      "learning_rate": 8.823627370647634e-07,
      "loss": -0.0043,
      "num_tokens": 891316251.0,
      "reward": 0.029296875,
      "reward_std": 0.059305962175130844,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 915
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.771484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1960.0,
      "completions/mean_length": 1802.12890625,
      "completions/mean_terminated_length": 972.4530639648438,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.3127080310659725,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 8.363956877118625,
      "learning_rate": 8.820010778777925e-07,
      "loss": 0.0044,
      "num_tokens": 892330269.0,
      "reward": 0.046875,
      "reward_std": 0.07696875929832458,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 916
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.810546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1959.0,
      "completions/mean_length": 1813.068359375,
      "completions/mean_terminated_length": 807.9484252929688,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.3130494153793633,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 2.280004532631335,
      "learning_rate": 8.816389474760151e-07,
      "loss": -0.0011,
      "num_tokens": 893335760.0,
      "reward": 0.021484375,
      "reward_std": 0.03462383896112442,
      "rewards/accuracy_reward/mean": 0.02217741869390011,
      "rewards/accuracy_reward/std": 0.14740893244743347,
      "step": 917
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.783203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1798.962890625,
      "completions/mean_terminated_length": 899.288330078125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.31339079969275413,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 14.886307869057184,
      "learning_rate": 8.812763463734095e-07,
      "loss": 0.0107,
      "num_tokens": 894330621.0,
      "reward": 0.05859375,
      "reward_std": 0.08804067224264145,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 918
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1942.0,
      "completions/mean_length": 1771.46484375,
      "completions/mean_terminated_length": 868.11669921875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.31373218400614494,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 5.980282174358649,
      "learning_rate": 8.809132750846214e-07,
      "loss": 0.0139,
      "num_tokens": 895314587.0,
      "reward": 0.029296875,
      "reward_std": 0.04125870764255524,
      "rewards/accuracy_reward/mean": 0.030241934582591057,
      "rewards/accuracy_reward/std": 0.1714252382516861,
      "step": 919
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.736328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1751.1015625,
      "completions/mean_terminated_length": 921.9851684570312,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.3140735683195357,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 8.921443965171692,
      "learning_rate": 8.805497341249642e-07,
      "loss": 0.0119,
      "num_tokens": 896286559.0,
      "reward": 0.060546875,
      "reward_std": 0.11014118045568466,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.24230584502220154,
      "step": 920
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.771484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1853.0,
      "completions/mean_length": 1773.103515625,
      "completions/mean_terminated_length": 845.0342407226562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3144149526329265,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 9.384515976146444,
      "learning_rate": 8.801857240104179e-07,
      "loss": 0.0227,
      "num_tokens": 897275828.0,
      "reward": 0.064453125,
      "reward_std": 0.11093293875455856,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 921
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2014.0,
      "completions/mean_length": 1734.880859375,
      "completions/mean_terminated_length": 964.7770385742188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3147563369463173,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 9.67873326234176,
      "learning_rate": 8.798212452576282e-07,
      "loss": 0.0149,
      "num_tokens": 898248791.0,
      "reward": 0.06640625,
      "reward_std": 0.10594936460256577,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "step": 922
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1965.0,
      "completions/mean_length": 1752.859375,
      "completions/mean_terminated_length": 788.7333984375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.31509772125970814,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 7.923067328173586,
      "learning_rate": 8.794562983839058e-07,
      "loss": 0.0081,
      "num_tokens": 899224479.0,
      "reward": 0.03515625,
      "reward_std": 0.04599714279174805,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 923
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.787109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2012.0,
      "completions/mean_length": 1804.013671875,
      "completions/mean_terminated_length": 901.9357299804688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.3154391055730989,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 3.6045981638347033,
      "learning_rate": 8.790908839072262e-07,
      "loss": 0.0029,
      "num_tokens": 900220550.0,
      "reward": 0.01953125,
      "reward_std": 0.03674772381782532,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 924
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1888.0,
      "completions/mean_length": 1758.609375,
      "completions/mean_terminated_length": 872.0635375976562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3157804898864897,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 2.5325988196949685,
      "learning_rate": 8.787250023462286e-07,
      "loss": -0.0003,
      "num_tokens": 901193886.0,
      "reward": 0.01171875,
      "reward_std": 0.036034777760505676,
      "rewards/accuracy_reward/mean": 0.012096773833036423,
      "rewards/accuracy_reward/std": 0.10942844301462173,
      "step": 925
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.716796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2024.0,
      "completions/mean_length": 1728.060546875,
      "completions/mean_terminated_length": 918.6344604492188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.3161218741998805,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 5.270839587951115,
      "learning_rate": 8.783586542202148e-07,
      "loss": 0.0228,
      "num_tokens": 902160157.0,
      "reward": 0.048828125,
      "reward_std": 0.06975477933883667,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 926
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.81640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1864.259765625,
      "completions/mean_terminated_length": 1047.7872314453125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.31646325851327134,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 14.396773222267212,
      "learning_rate": 8.779918400491488e-07,
      "loss": 0.0045,
      "num_tokens": 903200258.0,
      "reward": 0.03515625,
      "reward_std": 0.0868166983127594,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 927
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.779296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2000.0,
      "completions/mean_length": 1793.53515625,
      "completions/mean_terminated_length": 895.0265502929688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3168046428266621,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 4.274668512212987,
      "learning_rate": 8.776245603536565e-07,
      "loss": 0.0219,
      "num_tokens": 904200644.0,
      "reward": 0.0625,
      "reward_std": 0.09777653962373734,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 928
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.73046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1743.919921875,
      "completions/mean_terminated_length": 919.81884765625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3171460271400529,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 1.3660017585141946,
      "learning_rate": 8.772568156550241e-07,
      "loss": 0.0127,
      "num_tokens": 905166411.0,
      "reward": 0.009765625,
      "reward_std": 0.03411140665411949,
      "rewards/accuracy_reward/mean": 0.009765625,
      "rewards/accuracy_reward/std": 0.09843364357948303,
      "step": 929
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.74609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1929.0,
      "completions/mean_length": 1759.935546875,
      "completions/mean_terminated_length": 913.46923828125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.3174874114534437,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 8.23438429773587,
      "learning_rate": 8.76888606475198e-07,
      "loss": -0.0024,
      "num_tokens": 906143338.0,
      "reward": 0.041015625,
      "reward_std": 0.04698248207569122,
      "rewards/accuracy_reward/mean": 0.04233871027827263,
      "rewards/accuracy_reward/std": 0.2015640139579773,
      "step": 930
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.736328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1934.0,
      "completions/mean_length": 1732.560546875,
      "completions/mean_terminated_length": 851.6666259765625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.31782879576683454,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 19.068153609397328,
      "learning_rate": 8.765199333367837e-07,
      "loss": 0.0207,
      "num_tokens": 907115481.0,
      "reward": 0.08203125,
      "reward_std": 0.08851956576108932,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 931
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.68359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2033.0,
      "completions/mean_length": 1677.314453125,
      "completions/mean_terminated_length": 876.4506225585938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.3181701800802253,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 5.399880525501455,
      "learning_rate": 8.761507967630453e-07,
      "loss": 0.0114,
      "num_tokens": 908049290.0,
      "reward": 0.01953125,
      "reward_std": 0.06085042655467987,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 932
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.79296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1879.0,
      "completions/mean_length": 1788.294921875,
      "completions/mean_terminated_length": 793.5755004882812,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.3185115643936161,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 2.8021536527336526,
      "learning_rate": 8.757811972779048e-07,
      "loss": 0.0176,
      "num_tokens": 909040145.0,
      "reward": 0.041015625,
      "reward_std": 0.06519509106874466,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 933
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.78125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1951.0,
      "completions/mean_length": 1797.64453125,
      "completions/mean_terminated_length": 903.5178833007812,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.3188529487070069,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 19.25262244671632,
      "learning_rate": 8.754111354059409e-07,
      "loss": 0.006,
      "num_tokens": 910037563.0,
      "reward": 0.03515625,
      "reward_std": 0.057269565761089325,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 934
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1927.0,
      "completions/mean_length": 1749.306640625,
      "completions/mean_terminated_length": 773.933349609375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.31919433302039774,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 18.80430393862873,
      "learning_rate": 8.750406116723889e-07,
      "loss": 0.0031,
      "num_tokens": 911019272.0,
      "reward": 0.083984375,
      "reward_std": 0.08378186821937561,
      "rewards/accuracy_reward/mean": 0.083984375,
      "rewards/accuracy_reward/std": 0.2776356339454651,
      "step": 935
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1696.8203125,
      "completions/mean_terminated_length": 865.0789794921875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3195357173337885,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 13.62397766449746,
      "learning_rate": 8.746696266031392e-07,
      "loss": 0.0066,
      "num_tokens": 911970396.0,
      "reward": 0.03125,
      "reward_std": 0.08752824366092682,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 936
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.791015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2008.0,
      "completions/mean_length": 1767.849609375,
      "completions/mean_terminated_length": 707.46728515625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3198771016471793,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 14.309142070701617,
      "learning_rate": 8.742981807247374e-07,
      "loss": 0.0038,
      "num_tokens": 912953887.0,
      "reward": 0.04296875,
      "reward_std": 0.08191889524459839,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 937
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.78515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1799.318359375,
      "completions/mean_terminated_length": 890.5,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3202184859605701,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 7.624426304842892,
      "learning_rate": 8.739262745643832e-07,
      "loss": 0.0219,
      "num_tokens": 913949666.0,
      "reward": 0.041015625,
      "reward_std": 0.08961933106184006,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 938
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2000.0,
      "completions/mean_length": 1819.65234375,
      "completions/mean_terminated_length": 965.4629516601562,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.32055987027396093,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 6.455680689246552,
      "learning_rate": 8.735539086499291e-07,
      "loss": 0.0139,
      "num_tokens": 914955888.0,
      "reward": 0.021484375,
      "reward_std": 0.04230976849794388,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 939
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.72265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1933.0,
      "completions/mean_length": 1721.037109375,
      "completions/mean_terminated_length": 869.091552734375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3209012545873517,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.8868910171060832,
      "learning_rate": 8.731810835098805e-07,
      "loss": 0.0125,
      "num_tokens": 915911107.0,
      "reward": 0.015625,
      "reward_std": 0.04175759106874466,
      "rewards/accuracy_reward/mean": 0.015625,
      "rewards/accuracy_reward/std": 0.12414088100194931,
      "step": 940
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.779296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1989.0,
      "completions/mean_length": 1800.572265625,
      "completions/mean_terminated_length": 926.9114990234375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.3212426389007425,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 9.161647714948735,
      "learning_rate": 8.728077996733945e-07,
      "loss": 0.0234,
      "num_tokens": 916908232.0,
      "reward": 0.0234375,
      "reward_std": 0.06899453699588776,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 941
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.751953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1763.673828125,
      "completions/mean_terminated_length": 902.1810913085938,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.3215840232141333,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 16.70916260713759,
      "learning_rate": 8.724340576702791e-07,
      "loss": 0.0039,
      "num_tokens": 917908225.0,
      "reward": 0.021484375,
      "reward_std": 0.058760736137628555,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 942
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.732421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 1689.02734375,
      "completions/mean_terminated_length": 706.4379272460938,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.32192540752752413,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 22.54949352326784,
      "learning_rate": 8.72059858030993e-07,
      "loss": 0.0151,
      "num_tokens": 918848207.0,
      "reward": 0.068359375,
      "reward_std": 0.10530310869216919,
      "rewards/accuracy_reward/mean": 0.068359375,
      "rewards/accuracy_reward/std": 0.25260838866233826,
      "step": 943
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.82421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1951.0,
      "completions/mean_length": 1836.236328125,
      "completions/mean_terminated_length": 843.300048828125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.3222667918409149,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 1.4544983464932375,
      "learning_rate": 8.716852012866438e-07,
      "loss": 0.0207,
      "num_tokens": 919864616.0,
      "reward": 0.03515625,
      "reward_std": 0.03674772381782532,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 944
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.779296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2042.0,
      "completions/mean_length": 1793.953125,
      "completions/mean_terminated_length": 896.9203491210938,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3226081761543057,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 7.848257483305533,
      "learning_rate": 8.713100879689886e-07,
      "loss": -0.0011,
      "num_tokens": 920858048.0,
      "reward": 0.037109375,
      "reward_std": 0.06311991065740585,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 945
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.771484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2018.0,
      "completions/mean_length": 1779.28515625,
      "completions/mean_terminated_length": 872.4701538085938,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.3229495604676965,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 0.2865516108615734,
      "learning_rate": 8.709345186104319e-07,
      "loss": 0.0114,
      "num_tokens": 921855746.0,
      "reward": 0.00390625,
      "reward_std": 0.015625,
      "rewards/accuracy_reward/mean": 0.00390625,
      "rewards/accuracy_reward/std": 0.06243881583213806,
      "step": 946
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1991.0,
      "completions/mean_length": 1811.861328125,
      "completions/mean_terminated_length": 928.9629516601562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.32329094478108733,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 7.67541637201727,
      "learning_rate": 8.705584937440257e-07,
      "loss": -0.0009,
      "num_tokens": 922861595.0,
      "reward": 0.025390625,
      "reward_std": 0.03411140665411949,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 947
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1821.953125,
      "completions/mean_terminated_length": 890.6399536132812,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3236323290944781,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 5.609864691782311,
      "learning_rate": 8.701820139034686e-07,
      "loss": 0.0005,
      "num_tokens": 923863651.0,
      "reward": 0.0390625,
      "reward_std": 0.04659565910696983,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 948
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.818359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1814.0,
      "completions/mean_length": 1832.80859375,
      "completions/mean_terminated_length": 863.2903442382812,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3239737134078689,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 5.2184090147302316,
      "learning_rate": 8.698050796231049e-07,
      "loss": 0.0025,
      "num_tokens": 924878417.0,
      "reward": 0.03515625,
      "reward_std": 0.05738259106874466,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 949
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2023.0,
      "completions/mean_length": 1778.203125,
      "completions/mean_terminated_length": 877.6610107421875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3243150977212597,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 16.050515792077412,
      "learning_rate": 8.694276914379237e-07,
      "loss": 0.0159,
      "num_tokens": 925875129.0,
      "reward": 0.03125,
      "reward_std": 0.06194227933883667,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 950
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.84765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1640.0,
      "completions/mean_length": 1849.541015625,
      "completions/mean_terminated_length": 745.2948608398438,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.32465648203465053,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 21.705077755615644,
      "learning_rate": 8.690498498835586e-07,
      "loss": 0.0035,
      "num_tokens": 926906014.0,
      "reward": 0.044921875,
      "reward_std": 0.07806520164012909,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 951
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.822265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2036.0,
      "completions/mean_length": 1834.107421875,
      "completions/mean_terminated_length": 844.5604858398438,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.3249978663480413,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 2.828066311661244,
      "learning_rate": 8.686715554962869e-07,
      "loss": 0.003,
      "num_tokens": 927924613.0,
      "reward": 0.02734375,
      "reward_std": 0.036034777760505676,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 952
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.833984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1832.306640625,
      "completions/mean_terminated_length": 748.7647094726562,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.3253392506614321,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 4.047753289758267,
      "learning_rate": 8.682928088130278e-07,
      "loss": 0.0286,
      "num_tokens": 928942610.0,
      "reward": 0.04296875,
      "reward_std": 0.07135801762342453,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 953
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.751953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2021.0,
      "completions/mean_length": 1769.802734375,
      "completions/mean_terminated_length": 926.4487915039062,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3256806349748229,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 12.661853735117854,
      "learning_rate": 8.679136103713431e-07,
      "loss": 0.0046,
      "num_tokens": 929922669.0,
      "reward": 0.037109375,
      "reward_std": 0.06716620922088623,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 954
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.759765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1737.828125,
      "completions/mean_terminated_length": 756.8779907226562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3260220192882137,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 34.318837463023065,
      "learning_rate": 8.675339607094356e-07,
      "loss": 0.0051,
      "num_tokens": 930885333.0,
      "reward": 0.06640625,
      "reward_std": 0.1088760495185852,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "step": 955
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.822265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1855.0,
      "completions/mean_length": 1853.853515625,
      "completions/mean_terminated_length": 955.6593627929688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.3263634036016045,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 11.75063572732186,
      "learning_rate": 8.671538603661489e-07,
      "loss": 0.0059,
      "num_tokens": 931907882.0,
      "reward": 0.03125,
      "reward_std": 0.06100328266620636,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 956
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2022.0,
      "completions/mean_length": 1748.326171875,
      "completions/mean_terminated_length": 885.6287841796875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.3267047879149953,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 9.20335540580923,
      "learning_rate": 8.667733098809655e-07,
      "loss": 0.0337,
      "num_tokens": 932880241.0,
      "reward": 0.060546875,
      "reward_std": 0.09821805357933044,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.24230584502220154,
      "step": 957
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.802734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2025.0,
      "completions/mean_length": 1825.615234375,
      "completions/mean_terminated_length": 920.663330078125,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.3270461722283861,
      "frac_reward_zero_std": 0.96875,
      "grad_norm": 20.869769792517626,
      "learning_rate": 8.663923097940072e-07,
      "loss": -0.0014,
      "num_tokens": 933890684.0,
      "reward": 0.001953125,
      "reward_std": 0.0078125,
      "rewards/accuracy_reward/mean": 0.002016128972172737,
      "rewards/accuracy_reward/std": 0.044901326298713684,
      "step": 958
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.806640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1974.0,
      "completions/mean_length": 1805.70703125,
      "completions/mean_terminated_length": 794.9293212890625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3273875565417769,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 10.414152253488377,
      "learning_rate": 8.660108606460343e-07,
      "loss": 0.009,
      "num_tokens": 934887862.0,
      "reward": 0.048828125,
      "reward_std": 0.05190078169107437,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 959
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.78515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1977.0,
      "completions/mean_length": 1791.150390625,
      "completions/mean_terminated_length": 852.4818115234375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3277289408551677,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 16.301711215221964,
      "learning_rate": 8.656289629784439e-07,
      "loss": 0.0187,
      "num_tokens": 935882051.0,
      "reward": 0.0546875,
      "reward_std": 0.10064489394426346,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 960
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.775390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2027.0,
      "completions/mean_length": 1795.9609375,
      "completions/mean_terminated_length": 925.8782348632812,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.3280703251685585,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 21.74255026428469,
      "learning_rate": 8.652466173332698e-07,
      "loss": 0.0104,
      "num_tokens": 936880191.0,
      "reward": 0.044921875,
      "reward_std": 0.07316835969686508,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 961
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2028.0,
      "completions/mean_length": 1845.66796875,
      "completions/mean_terminated_length": 1051.9039306640625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.3284117094819493,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 7.642718965075587,
      "learning_rate": 8.648638242531817e-07,
      "loss": 0.0086,
      "num_tokens": 937907445.0,
      "reward": 0.013671875,
      "reward_std": 0.04478531330823898,
      "rewards/accuracy_reward/mean": 0.01411290280520916,
      "rewards/accuracy_reward/std": 0.11807556450366974,
      "step": 962
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.767578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1789.095703125,
      "completions/mean_terminated_length": 934.0588989257812,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3287530937953401,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 9.661024603204702,
      "learning_rate": 8.644805842814846e-07,
      "loss": 0.0121,
      "num_tokens": 938906054.0,
      "reward": 0.029296875,
      "reward_std": 0.064970001578331,
      "rewards/accuracy_reward/mean": 0.030241934582591057,
      "rewards/accuracy_reward/std": 0.1714252382516861,
      "step": 963
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.80078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1962.0,
      "completions/mean_length": 1837.978515625,
      "completions/mean_terminated_length": 993.7745361328125,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.3290944781087309,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 3.5101179323687752,
      "learning_rate": 8.640968979621174e-07,
      "loss": 0.0197,
      "num_tokens": 939924475.0,
      "reward": 0.037109375,
      "reward_std": 0.07638143002986908,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 964
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2018.0,
      "completions/mean_length": 1751.3671875,
      "completions/mean_terminated_length": 861.46875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.3294358624221217,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 0.29810092832554086,
      "learning_rate": 8.637127658396526e-07,
      "loss": 0.0042,
      "num_tokens": 940899591.0,
      "reward": 0.013671875,
      "reward_std": 0.0234375,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "step": 965
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.791015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2015.0,
      "completions/mean_length": 1801.28125,
      "completions/mean_terminated_length": 867.9345703125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3297772467355125,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 10.962705122686955,
      "learning_rate": 8.633281884592957e-07,
      "loss": 0.0161,
      "num_tokens": 941912167.0,
      "reward": 0.025390625,
      "reward_std": 0.07438573986291885,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 966
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.80859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1917.0,
      "completions/mean_length": 1819.890625,
      "completions/mean_terminated_length": 856.244873046875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.3301186310489033,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 2.8134596153091294,
      "learning_rate": 8.629431663668834e-07,
      "loss": 0.0041,
      "num_tokens": 942921999.0,
      "reward": 0.01953125,
      "reward_std": 0.04907120764255524,
      "rewards/accuracy_reward/mean": 0.02016128972172737,
      "rewards/accuracy_reward/std": 0.14069372415542603,
      "step": 967
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.79296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2003.0,
      "completions/mean_length": 1803.453125,
      "completions/mean_terminated_length": 866.79248046875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3304600153622941,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 2.4968572175033015,
      "learning_rate": 8.625577001088848e-07,
      "loss": 0.0121,
      "num_tokens": 943919511.0,
      "reward": 0.056640625,
      "reward_std": 0.06041031330823898,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 968
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.708984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1978.0,
      "completions/mean_length": 1701.3984375,
      "completions/mean_terminated_length": 856.9932861328125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3308013996756849,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 9.808227081900512,
      "learning_rate": 8.621717902323987e-07,
      "loss": 0.0105,
      "num_tokens": 944865619.0,
      "reward": 0.03125,
      "reward_std": 0.06657323986291885,
      "rewards/accuracy_reward/mean": 0.032258063554763794,
      "rewards/accuracy_reward/std": 0.17686307430267334,
      "step": 969
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1964.0,
      "completions/mean_length": 1769.908203125,
      "completions/mean_terminated_length": 841.3643798828125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3311427839890757,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 3.4634339860374457,
      "learning_rate": 8.61785437285153e-07,
      "loss": 0.0229,
      "num_tokens": 945853156.0,
      "reward": 0.037109375,
      "reward_std": 0.08444078266620636,
      "rewards/accuracy_reward/mean": 0.038306452333927155,
      "rewards/accuracy_reward/std": 0.19212883710861206,
      "step": 970
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.755859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1970.0,
      "completions/mean_length": 1761.810546875,
      "completions/mean_terminated_length": 876.4400634765625,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.3314841683024665,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 21.771348751685164,
      "learning_rate": 8.613986418155055e-07,
      "loss": 0.005,
      "num_tokens": 946846355.0,
      "reward": 0.0625,
      "reward_std": 0.0737205371260643,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 971
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.70703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1992.0,
      "completions/mean_length": 1749.56640625,
      "completions/mean_terminated_length": 1029.3466796875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3318255526158573,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 7.6395987660404225,
      "learning_rate": 8.610114043724416e-07,
      "loss": 0.0137,
      "num_tokens": 947817189.0,
      "reward": 0.0390625,
      "reward_std": 0.0876617580652237,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 972
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1797.380859375,
      "completions/mean_terminated_length": 764.8299560546875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.3321669369292481,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 15.719642914726535,
      "learning_rate": 8.606237255055738e-07,
      "loss": 0.0073,
      "num_tokens": 948815688.0,
      "reward": 0.05859375,
      "reward_std": 0.10542967915534973,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 973
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.783203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1813.359375,
      "completions/mean_terminated_length": 965.6937255859375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.3325083212426389,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 4.4611792070603915,
      "learning_rate": 8.602356057651416e-07,
      "loss": -0.0008,
      "num_tokens": 949827776.0,
      "reward": 0.01171875,
      "reward_std": 0.024649331346154213,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "step": 974
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.798828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2025.0,
      "completions/mean_length": 1823.427734375,
      "completions/mean_terminated_length": 931.6796264648438,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3328497055560297,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 7.819143888207121,
      "learning_rate": 8.598470457020101e-07,
      "loss": 0.0062,
      "num_tokens": 950834955.0,
      "reward": 0.03125,
      "reward_std": 0.05259781330823898,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 975
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.771484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1968.0,
      "completions/mean_length": 1766.220703125,
      "completions/mean_terminated_length": 814.91455078125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3331910898694205,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 10.26958225534278,
      "learning_rate": 8.594580458676688e-07,
      "loss": 0.0188,
      "num_tokens": 951819724.0,
      "reward": 0.048828125,
      "reward_std": 0.07476464658975601,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 976
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1957.0,
      "completions/mean_length": 1739.337890625,
      "completions/mean_terminated_length": 850.7651977539062,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3335324741828113,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 6.037389837638769,
      "learning_rate": 8.59068606814232e-07,
      "loss": 0.0077,
      "num_tokens": 952782473.0,
      "reward": 0.095703125,
      "reward_std": 0.10712699592113495,
      "rewards/accuracy_reward/mean": 0.095703125,
      "rewards/accuracy_reward/std": 0.2944713830947876,
      "step": 977
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.73046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2012.0,
      "completions/mean_length": 1718.36328125,
      "completions/mean_terminated_length": 825.0,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3338738584962021,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 3.647603936830429,
      "learning_rate": 8.586787290944373e-07,
      "loss": -0.0059,
      "num_tokens": 953742419.0,
      "reward": 0.048828125,
      "reward_std": 0.0659080371260643,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 978
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.669921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1980.0,
      "completions/mean_length": 1683.373046875,
      "completions/mean_terminated_length": 943.3313598632812,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3342152428095929,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 7.306868722803917,
      "learning_rate": 8.582884132616448e-07,
      "loss": 0.0175,
      "num_tokens": 954675378.0,
      "reward": 0.0703125,
      "reward_std": 0.10821780562400818,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 979
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.748046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1902.0,
      "completions/mean_length": 1760.595703125,
      "completions/mean_terminated_length": 907.2945556640625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3345566271229837,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 4.699939905809723,
      "learning_rate": 8.578976598698364e-07,
      "loss": 0.0112,
      "num_tokens": 955648659.0,
      "reward": 0.037109375,
      "reward_std": 0.03957492858171463,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 980
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.728515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1987.0,
      "completions/mean_length": 1726.98828125,
      "completions/mean_terminated_length": 865.568359375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3348980114363745,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 5.5182520467823,
      "learning_rate": 8.57506469473615e-07,
      "loss": 0.0248,
      "num_tokens": 956606957.0,
      "reward": 0.060546875,
      "reward_std": 0.0967666283249855,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 981
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.712890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1816.0,
      "completions/mean_length": 1697.81640625,
      "completions/mean_terminated_length": 828.3129272460938,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3352393957497653,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 3.042144755118716,
      "learning_rate": 8.57114842628204e-07,
      "loss": 0.0035,
      "num_tokens": 957553983.0,
      "reward": 0.03515625,
      "reward_std": 0.04505911096930504,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 982
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2007.0,
      "completions/mean_length": 1669.677734375,
      "completions/mean_terminated_length": 934.77587890625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.3355807800631561,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 4.275034946582434,
      "learning_rate": 8.567227798894458e-07,
      "loss": 0.0237,
      "num_tokens": 958495482.0,
      "reward": 0.078125,
      "reward_std": 0.1371595412492752,
      "rewards/accuracy_reward/mean": 0.078125,
      "rewards/accuracy_reward/std": 0.26863065361976624,
      "step": 983
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2000.0,
      "completions/mean_length": 1717.796875,
      "completions/mean_terminated_length": 727.1875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.3359221643765469,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 14.239763764517782,
      "learning_rate": 8.56330281813802e-07,
      "loss": 0.0289,
      "num_tokens": 959451954.0,
      "reward": 0.1015625,
      "reward_std": 0.10707925260066986,
      "rewards/accuracy_reward/mean": 0.1015625,
      "rewards/accuracy_reward/std": 0.30236753821372986,
      "step": 984
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1988.0,
      "completions/mean_length": 1777.353515625,
      "completions/mean_terminated_length": 873.6694946289062,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.3362635486899377,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 5.742336370284916,
      "learning_rate": 8.559373489583518e-07,
      "loss": 0.0023,
      "num_tokens": 960450791.0,
      "reward": 0.052734375,
      "reward_std": 0.09303291887044907,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 985
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2031.0,
      "completions/mean_length": 1759.09375,
      "completions/mean_terminated_length": 892.375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.3366049330033285,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 4.5892656054758385,
      "learning_rate": 8.555439818807914e-07,
      "loss": 0.0131,
      "num_tokens": 961432663.0,
      "reward": 0.017578125,
      "reward_std": 0.048086829483509064,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "step": 986
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.740234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2008.0,
      "completions/mean_length": 1761.66015625,
      "completions/mean_terminated_length": 945.6992797851562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3369463173167193,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 5.542126449219264,
      "learning_rate": 8.551501811394335e-07,
      "loss": 0.001,
      "num_tokens": 962414057.0,
      "reward": 0.02734375,
      "reward_std": 0.03839729726314545,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 987
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.748046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2027.0,
      "completions/mean_length": 1771.013671875,
      "completions/mean_terminated_length": 948.6434326171875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.33728770163011007,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 1.0654922763405843,
      "learning_rate": 8.547559472932062e-07,
      "loss": 0.0158,
      "num_tokens": 963397616.0,
      "reward": 0.033203125,
      "reward_std": 0.06508206576108932,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 988
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.728515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1984.0,
      "completions/mean_length": 1739.828125,
      "completions/mean_terminated_length": 912.8633422851562,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.3376290859435009,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 9.357957290227729,
      "learning_rate": 8.543612809016524e-07,
      "loss": 0.0249,
      "num_tokens": 964369928.0,
      "reward": 0.0546875,
      "reward_std": 0.09640534222126007,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 989
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1897.0,
      "completions/mean_length": 1761.2734375,
      "completions/mean_terminated_length": 803.8983154296875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.3379704702568917,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 6.825673397274351,
      "learning_rate": 8.539661825249287e-07,
      "loss": 0.0092,
      "num_tokens": 965353892.0,
      "reward": 0.0234375,
      "reward_std": 0.04670868441462517,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 990
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.705078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1690.29296875,
      "completions/mean_terminated_length": 835.1126098632812,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.3383118545702825,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 4.0345676063068305,
      "learning_rate": 8.535706527238051e-07,
      "loss": 0.0206,
      "num_tokens": 966302026.0,
      "reward": 0.033203125,
      "reward_std": 0.06997986882925034,
      "rewards/accuracy_reward/mean": 0.03427419438958168,
      "rewards/accuracy_reward/std": 0.18211629986763,
      "step": 991
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.755859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1995.0,
      "completions/mean_length": 1737.9296875,
      "completions/mean_terminated_length": 777.9520263671875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.33865323888367327,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 9.934603611047853,
      "learning_rate": 8.531746920596639e-07,
      "loss": 0.0026,
      "num_tokens": 967274150.0,
      "reward": 0.029296875,
      "reward_std": 0.03741292655467987,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 992
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.853515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2012.0,
      "completions/mean_length": 1887.3203125,
      "completions/mean_terminated_length": 951.0933837890625,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.3389946231970641,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 1.0186849558101678,
      "learning_rate": 8.527783010944986e-07,
      "loss": 0.007,
      "num_tokens": 968318346.0,
      "reward": 0.00390625,
      "reward_std": 0.015625,
      "rewards/accuracy_reward/mean": 0.00390625,
      "rewards/accuracy_reward/std": 0.06243881583213806,
      "step": 993
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1963.0,
      "completions/mean_length": 1744.912109375,
      "completions/mean_terminated_length": 970.3541870117188,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.3393360075104549,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 12.36077386710483,
      "learning_rate": 8.523814803909137e-07,
      "loss": 0.0063,
      "num_tokens": 969288285.0,
      "reward": 0.046875,
      "reward_std": 0.08351518213748932,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 994
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.775390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1902.0,
      "completions/mean_length": 1774.712890625,
      "completions/mean_terminated_length": 831.2781982421875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.3396773918238457,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 9.363849066087976,
      "learning_rate": 8.51984230512124e-07,
      "loss": 0.0113,
      "num_tokens": 970273754.0,
      "reward": 0.048828125,
      "reward_std": 0.1028466522693634,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 995
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1926.0,
      "completions/mean_length": 1806.115234375,
      "completions/mean_terminated_length": 998.4661254882812,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.34001877613723647,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 2.830418117651566,
      "learning_rate": 8.515865520219526e-07,
      "loss": 0.0191,
      "num_tokens": 971273349.0,
      "reward": 0.03515625,
      "reward_std": 0.0788990929722786,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 996
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.818359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 1851.8515625,
      "completions/mean_terminated_length": 968.1290283203125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3403601604506273,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 12.016076237145231,
      "learning_rate": 8.511884454848315e-07,
      "loss": 0.0175,
      "num_tokens": 972300217.0,
      "reward": 0.041015625,
      "reward_std": 0.07527707517147064,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 997
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.779296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1776.607421875,
      "completions/mean_terminated_length": 818.3274536132812,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.3407015447640181,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 6.768527857697114,
      "learning_rate": 8.507899114658003e-07,
      "loss": 0.0138,
      "num_tokens": 973289824.0,
      "reward": 0.05078125,
      "reward_std": 0.077679343521595,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 998
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.83203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1958.0,
      "completions/mean_length": 1868.08984375,
      "completions/mean_terminated_length": 976.906982421875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.3410429290774089,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 4.054774148068096,
      "learning_rate": 8.503909505305048e-07,
      "loss": 0.024,
      "num_tokens": 974318286.0,
      "reward": 0.025390625,
      "reward_std": 0.06041031330823898,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 999
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.791015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1964.0,
      "completions/mean_length": 1803.578125,
      "completions/mean_terminated_length": 878.4298706054688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.34138431339079967,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 1.3049787000461057,
      "learning_rate": 8.499915632451975e-07,
      "loss": 0.0093,
      "num_tokens": 975320662.0,
      "reward": 0.052734375,
      "reward_std": 0.0234375,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 1000
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.740234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1849.0,
      "completions/mean_length": 1757.51171875,
      "completions/mean_terminated_length": 930.0300903320312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.3417256977041905,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 13.930003618726023,
      "learning_rate": 8.495917501767352e-07,
      "loss": 0.0377,
      "num_tokens": 976304732.0,
      "reward": 0.08203125,
      "reward_std": 0.15245094895362854,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 1001
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1962.0,
      "completions/mean_length": 1848.224609375,
      "completions/mean_terminated_length": 982.53125,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.3420670820175813,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 2.9165513512615133,
      "learning_rate": 8.491915118925798e-07,
      "loss": 0.0176,
      "num_tokens": 977332943.0,
      "reward": 0.021484375,
      "reward_std": 0.043847277760505676,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 1002
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 1691.412109375,
      "completions/mean_terminated_length": 892.7405395507812,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3424084663309721,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 5.944478263852318,
      "learning_rate": 8.48790848960796e-07,
      "loss": 0.0041,
      "num_tokens": 978278930.0,
      "reward": 0.017578125,
      "reward_std": 0.048086829483509064,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "step": 1003
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 1812.060546875,
      "completions/mean_terminated_length": 1006.612060546875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.3427498506443629,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 4.585752433978029,
      "learning_rate": 8.483897619500517e-07,
      "loss": 0.008,
      "num_tokens": 979283553.0,
      "reward": 0.013671875,
      "reward_std": 0.03741292655467987,
      "rewards/accuracy_reward/mean": 0.01411290280520916,
      "rewards/accuracy_reward/std": 0.11807556450366974,
      "step": 1004
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.736328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1963.0,
      "completions/mean_length": 1737.5,
      "completions/mean_terminated_length": 870.3999633789062,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3430912349577537,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 12.393198351613488,
      "learning_rate": 8.479882514296165e-07,
      "loss": 0.0177,
      "num_tokens": 980253185.0,
      "reward": 0.078125,
      "reward_std": 0.1258384883403778,
      "rewards/accuracy_reward/mean": 0.078125,
      "rewards/accuracy_reward/std": 0.26863065361976624,
      "step": 1005
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 1846.626953125,
      "completions/mean_terminated_length": 927.3152465820312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.3434326192711445,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 9.630363797380602,
      "learning_rate": 8.475863179693613e-07,
      "loss": 0.0184,
      "num_tokens": 981269922.0,
      "reward": 0.048828125,
      "reward_std": 0.09271937608718872,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 1006
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2042.0,
      "completions/mean_length": 1860.923828125,
      "completions/mean_terminated_length": 959.5568237304688,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.3437740035845353,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 9.914601936552197,
      "learning_rate": 8.471839621397569e-07,
      "loss": 0.0116,
      "num_tokens": 982296203.0,
      "reward": 0.025390625,
      "reward_std": 0.06354551762342453,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 1007
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.87109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1886.0,
      "completions/mean_length": 1888.416015625,
      "completions/mean_terminated_length": 810.0151977539062,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3441153878979261,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 5.491343950432109,
      "learning_rate": 8.467811845118741e-07,
      "loss": 0.0007,
      "num_tokens": 983342224.0,
      "reward": 0.01171875,
      "reward_std": 0.031083684414625168,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "step": 1008
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.775390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2018.0,
      "completions/mean_length": 1782.765625,
      "completions/mean_terminated_length": 867.13037109375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.3444567722113169,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 28.083402465945653,
      "learning_rate": 8.463779856573819e-07,
      "loss": 0.0088,
      "num_tokens": 984341304.0,
      "reward": 0.037109375,
      "reward_std": 0.07481793314218521,
      "rewards/accuracy_reward/mean": 0.038306452333927155,
      "rewards/accuracy_reward/std": 0.19212885200977325,
      "step": 1009
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.771484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1835.0,
      "completions/mean_length": 1794.86328125,
      "completions/mean_terminated_length": 940.2564697265625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3447981565247077,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 4.020067901551045,
      "learning_rate": 8.459743661485472e-07,
      "loss": 0.0145,
      "num_tokens": 985336386.0,
      "reward": 0.060546875,
      "reward_std": 0.0765829086303711,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 1010
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.833984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2000.0,
      "completions/mean_length": 1870.310546875,
      "completions/mean_terminated_length": 977.682373046875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.3451395408380985,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 9.43197460117714,
      "learning_rate": 8.455703265582342e-07,
      "loss": 0.0065,
      "num_tokens": 986369409.0,
      "reward": 0.03125,
      "reward_std": 0.04659565910696983,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 1011
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.81640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1810.0,
      "completions/mean_length": 1820.18359375,
      "completions/mean_terminated_length": 807.1276245117188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3454809251514893,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 8.249044291063075,
      "learning_rate": 8.451658674599032e-07,
      "loss": 0.0067,
      "num_tokens": 987374831.0,
      "reward": 0.029296875,
      "reward_std": 0.039834219962358475,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 1012
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.78125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2042.0,
      "completions/mean_length": 1815.021484375,
      "completions/mean_terminated_length": 983.294677734375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.3458223094648801,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 17.579004426330865,
      "learning_rate": 8.447609894276102e-07,
      "loss": 0.015,
      "num_tokens": 988386522.0,
      "reward": 0.060546875,
      "reward_std": 0.10210913419723511,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 1013
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1885.0,
      "completions/mean_length": 1877.556640625,
      "completions/mean_terminated_length": 835.9583129882812,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.3461636937782709,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 26.053417745139626,
      "learning_rate": 8.443556930360048e-07,
      "loss": 0.0027,
      "num_tokens": 989417863.0,
      "reward": 0.060546875,
      "reward_std": 0.07489816844463348,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 1014
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.81640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2015.0,
      "completions/mean_length": 1831.9296875,
      "completions/mean_terminated_length": 871.1063842773438,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.3465050780916617,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 4.403770461980555,
      "learning_rate": 8.439499788603318e-07,
      "loss": -0.0021,
      "num_tokens": 990432211.0,
      "reward": 0.02734375,
      "reward_std": 0.024649331346154213,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 1015
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1958.0,
      "completions/mean_length": 1851.427734375,
      "completions/mean_terminated_length": 904.3068237304688,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.3468464624050525,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 8.093197422519866,
      "learning_rate": 8.435438474764281e-07,
      "loss": 0.0165,
      "num_tokens": 991453326.0,
      "reward": 0.0546875,
      "reward_std": 0.06535585969686508,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 1016
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.77734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1962.0,
      "completions/mean_length": 1796.919921875,
      "completions/mean_terminated_length": 920.8157958984375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.34718784671844327,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 8.512560936995271,
      "learning_rate": 8.431372994607225e-07,
      "loss": 0.0056,
      "num_tokens": 992453989.0,
      "reward": 0.029296875,
      "reward_std": 0.060957908630371094,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 1017
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1922.0,
      "completions/mean_length": 1725.11328125,
      "completions/mean_terminated_length": 867.1571655273438,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3475292310318341,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 1.717356752182565,
      "learning_rate": 8.427303353902359e-07,
      "loss": 0.0178,
      "num_tokens": 993420751.0,
      "reward": 0.037109375,
      "reward_std": 0.05782270431518555,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 1018
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.779296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1933.0,
      "completions/mean_length": 1805.65625,
      "completions/mean_terminated_length": 949.9468994140625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.3478706153452249,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 21.660366778558416,
      "learning_rate": 8.423229558425796e-07,
      "loss": 0.0154,
      "num_tokens": 994416431.0,
      "reward": 0.021484375,
      "reward_std": 0.03462383896112442,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 1019
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2027.0,
      "completions/mean_length": 1827.24609375,
      "completions/mean_terminated_length": 917.739990234375,
      "completions/min_length": 216.0,
      "completions/min_terminated_length": 216.0,
      "epoch": 0.3482119996586157,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 9.878856782670251,
      "learning_rate": 8.419151613959539e-07,
      "loss": 0.0228,
      "num_tokens": 995426941.0,
      "reward": 0.07421875,
      "reward_std": 0.1165291890501976,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "step": 1020
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.826171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2024.0,
      "completions/mean_length": 1829.515625,
      "completions/mean_terminated_length": 791.1011352539062,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.34855338397200647,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 2.016803461282072,
      "learning_rate": 8.415069526291486e-07,
      "loss": 0.0057,
      "num_tokens": 996432741.0,
      "reward": 0.041015625,
      "reward_std": 0.029160313308238983,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 1021
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2007.0,
      "completions/mean_length": 1799.314453125,
      "completions/mean_terminated_length": 1021.1693115234375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.3488947682853973,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 4.893572373632423,
      "learning_rate": 8.410983301215415e-07,
      "loss": 0.0014,
      "num_tokens": 997426758.0,
      "reward": 0.017578125,
      "reward_std": 0.043680962175130844,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "step": 1022
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1935.0,
      "completions/mean_length": 1788.064453125,
      "completions/mean_terminated_length": 957.1229248046875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.3492361525987881,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 4.75299649504567,
      "learning_rate": 8.406892944530973e-07,
      "loss": 0.0143,
      "num_tokens": 998417559.0,
      "reward": 0.0625,
      "reward_std": 0.07104447484016418,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 1023
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.783203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1984.0,
      "completions/mean_length": 1806.373046875,
      "completions/mean_terminated_length": 933.468505859375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3495775369121789,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 12.606571273794385,
      "learning_rate": 8.402798462043673e-07,
      "loss": 0.0225,
      "num_tokens": 999407926.0,
      "reward": 0.0703125,
      "reward_std": 0.08738242089748383,
      "rewards/accuracy_reward/mean": 0.07258064299821854,
      "rewards/accuracy_reward/std": 0.25970885157585144,
      "step": 1024
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1881.0,
      "completions/mean_length": 1814.607421875,
      "completions/mean_terminated_length": 941.5463256835938,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.34991892122556967,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 31.33963611162889,
      "learning_rate": 8.398699859564887e-07,
      "loss": 0.0258,
      "num_tokens": 1000417709.0,
      "reward": 0.0625,
      "reward_std": 0.0904373973608017,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 1025
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.794921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 1797.708984375,
      "completions/mean_terminated_length": 827.5333862304688,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.3502603055389605,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 12.226804481190621,
      "learning_rate": 8.39459714291183e-07,
      "loss": 0.0172,
      "num_tokens": 1001413608.0,
      "reward": 0.044921875,
      "reward_std": 0.060412678867578506,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 1026
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.830078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2031.0,
      "completions/mean_length": 1864.234375,
      "completions/mean_terminated_length": 966.5287475585938,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.3506016898523513,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 15.229421019989417,
      "learning_rate": 8.390490317907557e-07,
      "loss": 0.0298,
      "num_tokens": 1002439824.0,
      "reward": 0.048828125,
      "reward_std": 0.10139855742454529,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 1027
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.783203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1963.0,
      "completions/mean_length": 1824.28125,
      "completions/mean_terminated_length": 1016.0720825195312,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.3509430741657421,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 13.031020100907192,
      "learning_rate": 8.386379390380956e-07,
      "loss": 0.0271,
      "num_tokens": 1003446032.0,
      "reward": 0.072265625,
      "reward_std": 0.13020557165145874,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 1028
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1924.0,
      "completions/mean_length": 1807.98828125,
      "completions/mean_terminated_length": 819.1399536132812,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.35128445847913287,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 6.46081562829866,
      "learning_rate": 8.382264366166736e-07,
      "loss": 0.0041,
      "num_tokens": 1004445258.0,
      "reward": 0.01953125,
      "reward_std": 0.04192390665411949,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 1029
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.712890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1965.0,
      "completions/mean_length": 1719.58984375,
      "completions/mean_terminated_length": 904.4557495117188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3516258427925237,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 15.058953496918804,
      "learning_rate": 8.378145251105423e-07,
      "loss": 0.0129,
      "num_tokens": 1005410072.0,
      "reward": 0.064453125,
      "reward_std": 0.08648413419723511,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 1030
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1751.86328125,
      "completions/mean_terminated_length": 1223.9674072265625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3519672271059145,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 12.985636098147594,
      "learning_rate": 8.374022051043344e-07,
      "loss": 0.0212,
      "num_tokens": 1006385202.0,
      "reward": 0.064453125,
      "reward_std": 0.06602106243371964,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 1031
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.794921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1815.671875,
      "completions/mean_terminated_length": 915.1238403320312,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3523086114193053,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 8.392169320136071,
      "learning_rate": 8.36989477183263e-07,
      "loss": 0.005,
      "num_tokens": 1007390538.0,
      "reward": 0.01171875,
      "reward_std": 0.02960042469203472,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "step": 1032
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2016.0,
      "completions/mean_length": 1734.767578125,
      "completions/mean_terminated_length": 949.5410766601562,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.35264999573269606,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 16.945360606966492,
      "learning_rate": 8.365763419331199e-07,
      "loss": 0.0091,
      "num_tokens": 1008354675.0,
      "reward": 0.041015625,
      "reward_std": 0.0759630799293518,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 1033
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.794921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2027.0,
      "completions/mean_length": 1838.7421875,
      "completions/mean_terminated_length": 1027.619140625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.3529913800460869,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 6.82911314401029,
      "learning_rate": 8.361627999402748e-07,
      "loss": 0.0088,
      "num_tokens": 1009367663.0,
      "reward": 0.05859375,
      "reward_std": 0.07179021835327148,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 1034
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.79296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2042.0,
      "completions/mean_length": 1829.51953125,
      "completions/mean_terminated_length": 992.6981201171875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.3533327643594777,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 19.775242794877773,
      "learning_rate": 8.357488517916752e-07,
      "loss": 0.012,
      "num_tokens": 1010382905.0,
      "reward": 0.02734375,
      "reward_std": 0.05012226849794388,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 1035
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.720703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1988.0,
      "completions/mean_length": 1718.548828125,
      "completions/mean_terminated_length": 868.4265747070312,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3536741486728685,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 10.056353428423002,
      "learning_rate": 8.353344980748446e-07,
      "loss": 0.0154,
      "num_tokens": 1011344354.0,
      "reward": 0.04296875,
      "reward_std": 0.07383356243371964,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 1036
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.837890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1854.0,
      "completions/mean_length": 1857.001953125,
      "completions/mean_terminated_length": 869.795166015625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.35401553298625926,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 13.944942695040773,
      "learning_rate": 8.349197393778825e-07,
      "loss": 0.0075,
      "num_tokens": 1012369811.0,
      "reward": 0.025390625,
      "reward_std": 0.0557793527841568,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 1037
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.806640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1994.0,
      "completions/mean_length": 1817.22265625,
      "completions/mean_terminated_length": 854.48486328125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.3543569172996501,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 1.519135208801546,
      "learning_rate": 8.345045762894628e-07,
      "loss": 0.0154,
      "num_tokens": 1013371349.0,
      "reward": 0.01953125,
      "reward_std": 0.023823359981179237,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 1038
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.814453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1889.0,
      "completions/mean_length": 1811.8671875,
      "completions/mean_terminated_length": 775.3684692382812,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.3546983016130409,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 4.120897787669056,
      "learning_rate": 8.340890093988336e-07,
      "loss": 0.0232,
      "num_tokens": 1014371665.0,
      "reward": 0.04296875,
      "reward_std": 0.0794091522693634,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 1039
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2020.0,
      "completions/mean_length": 1852.8515625,
      "completions/mean_terminated_length": 858.5238037109375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3550396859264317,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 15.888927381380547,
      "learning_rate": 8.336730392958163e-07,
      "loss": -0.0007,
      "num_tokens": 1015392965.0,
      "reward": 0.021484375,
      "reward_std": 0.04973640665411949,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 1040
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.87890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1965.0,
      "completions/mean_length": 1904.0546875,
      "completions/mean_terminated_length": 859.290283203125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.35538107023982246,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 24.276854607553346,
      "learning_rate": 8.332566665708041e-07,
      "loss": 0.0042,
      "num_tokens": 1016450305.0,
      "reward": 0.025390625,
      "reward_std": 0.030584799125790596,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 1041
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.716796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1969.0,
      "completions/mean_length": 1709.044921875,
      "completions/mean_terminated_length": 851.137939453125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3557224545532133,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 18.19504078478282,
      "learning_rate": 8.328398918147622e-07,
      "loss": 0.0139,
      "num_tokens": 1017399624.0,
      "reward": 0.0625,
      "reward_std": 0.08675792813301086,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 1042
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.78125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1987.0,
      "completions/mean_length": 1791.02734375,
      "completions/mean_terminated_length": 873.2678833007812,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3560638388666041,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 11.866374901890465,
      "learning_rate": 8.32422715619226e-07,
      "loss": 0.0021,
      "num_tokens": 1018403350.0,
      "reward": 0.013671875,
      "reward_std": 0.03889618441462517,
      "rewards/accuracy_reward/mean": 0.01411290280520916,
      "rewards/accuracy_reward/std": 0.11807556450366974,
      "step": 1043
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.755859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1744.076171875,
      "completions/mean_terminated_length": 803.1280517578125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.3564052231799949,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 28.050169617909198,
      "learning_rate": 8.320051385763005e-07,
      "loss": 0.02,
      "num_tokens": 1019382029.0,
      "reward": 0.08203125,
      "reward_std": 0.09486784040927887,
      "rewards/accuracy_reward/mean": 0.08467742055654526,
      "rewards/accuracy_reward/std": 0.278682142496109,
      "step": 1044
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.794921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1883.0,
      "completions/mean_length": 1806.453125,
      "completions/mean_terminated_length": 870.1714477539062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.35674660749338566,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 12.298429803880369,
      "learning_rate": 8.315871612786604e-07,
      "loss": 0.0208,
      "num_tokens": 1020390677.0,
      "reward": 0.044921875,
      "reward_std": 0.0921536535024643,
      "rewards/accuracy_reward/mean": 0.0463709682226181,
      "rewards/accuracy_reward/std": 0.21049949526786804,
      "step": 1045
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1987.0,
      "completions/mean_length": 1785.47265625,
      "completions/mean_terminated_length": 964.01611328125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.3570879918067765,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 13.35327019742105,
      "learning_rate": 8.311687843195481e-07,
      "loss": 0.0038,
      "num_tokens": 1021382791.0,
      "reward": 0.02734375,
      "reward_std": 0.051493462175130844,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 1046
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.810546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1871.0,
      "completions/mean_length": 1823.455078125,
      "completions/mean_terminated_length": 862.773193359375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.3574293761201673,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 11.142855779593305,
      "learning_rate": 8.307500082927726e-07,
      "loss": 0.0099,
      "num_tokens": 1022398016.0,
      "reward": 0.03125,
      "reward_std": 0.07300759106874466,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 1047
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1951.0,
      "completions/mean_length": 1761.30078125,
      "completions/mean_terminated_length": 804.0169677734375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3577707604335581,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 24.867748822848235,
      "learning_rate": 8.303308337927103e-07,
      "loss": -0.002,
      "num_tokens": 1023384186.0,
      "reward": 0.041015625,
      "reward_std": 0.05099457502365112,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 1048
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.78125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1797.24609375,
      "completions/mean_terminated_length": 902.169677734375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.35811214474694886,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 25.31652117155038,
      "learning_rate": 8.299112614143028e-07,
      "loss": 0.0187,
      "num_tokens": 1024385368.0,
      "reward": 0.087890625,
      "reward_std": 0.11250369995832443,
      "rewards/accuracy_reward/mean": 0.087890625,
      "rewards/accuracy_reward/std": 0.2834126651287079,
      "step": 1049
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.80859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2005.0,
      "completions/mean_length": 1822.185546875,
      "completions/mean_terminated_length": 868.2346801757812,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.35845352906033967,
      "frac_reward_zero_std": 0.96875,
      "grad_norm": 2.1197805731457176,
      "learning_rate": 8.29491291753056e-07,
      "loss": -0.0002,
      "num_tokens": 1025400103.0,
      "reward": 0.00390625,
      "reward_std": 0.010673906654119492,
      "rewards/accuracy_reward/mean": 0.004032257944345474,
      "rewards/accuracy_reward/std": 0.06343589723110199,
      "step": 1050
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1954.0,
      "completions/mean_length": 1650.38671875,
      "completions/mean_terminated_length": 653.630126953125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3587949133737305,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 13.60842730212261,
      "learning_rate": 8.290709254050403e-07,
      "loss": 0.0175,
      "num_tokens": 1026319629.0,
      "reward": 0.072265625,
      "reward_std": 0.08699656277894974,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 1051
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1999.0,
      "completions/mean_length": 1785.998046875,
      "completions/mean_terminated_length": 758.144287109375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.3591362976871213,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 6.993621953210613,
      "learning_rate": 8.286501629668887e-07,
      "loss": -0.0032,
      "num_tokens": 1027307276.0,
      "reward": 0.005859375,
      "reward_std": 0.01848640665411949,
      "rewards/accuracy_reward/mean": 0.005859375,
      "rewards/accuracy_reward/std": 0.07639661431312561,
      "step": 1052
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.744140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1950.0,
      "completions/mean_length": 1756.396484375,
      "completions/mean_terminated_length": 908.2977294921875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.35947768200051206,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 7.217672045595186,
      "learning_rate": 8.282290050357966e-07,
      "loss": 0.0209,
      "num_tokens": 1028287191.0,
      "reward": 0.048828125,
      "reward_std": 0.07407219707965851,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 1053
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.80859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2022.0,
      "completions/mean_length": 1831.001953125,
      "completions/mean_terminated_length": 914.744873046875,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.35981906631390287,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 6.4682384353089875,
      "learning_rate": 8.278074522095207e-07,
      "loss": 0.0016,
      "num_tokens": 1029310568.0,
      "reward": 0.013671875,
      "reward_std": 0.025633705779910088,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "step": 1054
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1949.0,
      "completions/mean_length": 1761.427734375,
      "completions/mean_terminated_length": 845.3359985351562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3601604506272937,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 4.251167829416981,
      "learning_rate": 8.273855050863779e-07,
      "loss": 0.0094,
      "num_tokens": 1030286883.0,
      "reward": 0.060546875,
      "reward_std": 0.06602106243371964,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 1055
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1955.0,
      "completions/mean_length": 1745.0078125,
      "completions/mean_terminated_length": 836.03125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.3605018349406845,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 3.5187319712984806,
      "learning_rate": 8.269631642652454e-07,
      "loss": 0.0128,
      "num_tokens": 1031255591.0,
      "reward": 0.029296875,
      "reward_std": 0.04973640665411949,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 1056
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.79296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1765.0,
      "completions/mean_length": 1819.654296875,
      "completions/mean_terminated_length": 945.0471801757812,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.36084321925407525,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 8.074165024012252,
      "learning_rate": 8.265404303455583e-07,
      "loss": 0.0062,
      "num_tokens": 1032257798.0,
      "reward": 0.037109375,
      "reward_std": 0.07273616641759872,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 1057
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.748046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1995.0,
      "completions/mean_length": 1734.48046875,
      "completions/mean_terminated_length": 803.6434326171875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.36118460356746607,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 17.15321905079241,
      "learning_rate": 8.261173039273103e-07,
      "loss": 0.007,
      "num_tokens": 1033226108.0,
      "reward": 0.044921875,
      "reward_std": 0.03741292655467987,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 1058
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1915.0,
      "completions/mean_length": 1673.26171875,
      "completions/mean_terminated_length": 833.6582641601562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3615259878808569,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 13.817990981812805,
      "learning_rate": 8.25693785611052e-07,
      "loss": 0.0142,
      "num_tokens": 1034149106.0,
      "reward": 0.0703125,
      "reward_std": 0.10986737906932831,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 1059
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1901.0,
      "completions/mean_length": 1739.341796875,
      "completions/mean_terminated_length": 885.99267578125,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.3618673721942477,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 10.766935989511598,
      "learning_rate": 8.2526987599789e-07,
      "loss": 0.0023,
      "num_tokens": 1035114897.0,
      "reward": 0.0546875,
      "reward_std": 0.08832141757011414,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 1060
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1741.466796875,
      "completions/mean_terminated_length": 859.0227661132812,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.36220875650763845,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 8.440053427541136,
      "learning_rate": 8.248455756894865e-07,
      "loss": 0.02,
      "num_tokens": 1036097392.0,
      "reward": 0.033203125,
      "reward_std": 0.0879673957824707,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 1061
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.689453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1885.0,
      "completions/mean_length": 1649.724609375,
      "completions/mean_terminated_length": 765.5031127929688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.36255014082102927,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 1.2529459478913803,
      "learning_rate": 8.244208852880583e-07,
      "loss": -0.0022,
      "num_tokens": 1037015843.0,
      "reward": 0.044921875,
      "reward_std": 0.057656385004520416,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 1062
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.751953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1951.0,
      "completions/mean_length": 1736.06640625,
      "completions/mean_terminated_length": 790.44091796875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.3628915251344201,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 12.817836090554461,
      "learning_rate": 8.239958053963758e-07,
      "loss": 0.0283,
      "num_tokens": 1037973045.0,
      "reward": 0.072265625,
      "reward_std": 0.09781768918037415,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 1063
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2018.0,
      "completions/mean_length": 1678.244140625,
      "completions/mean_terminated_length": 768.8446044921875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3632329094478109,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 36.807616073665045,
      "learning_rate": 8.23570336617762e-07,
      "loss": -0.0078,
      "num_tokens": 1038911970.0,
      "reward": 0.060546875,
      "reward_std": 0.04957009106874466,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 1064
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.712890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1918.0,
      "completions/mean_length": 1714.072265625,
      "completions/mean_terminated_length": 884.9319458007812,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.36357429376120165,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 23.60677450689423,
      "learning_rate": 8.23144479556092e-07,
      "loss": 0.0112,
      "num_tokens": 1039863703.0,
      "reward": 0.05859375,
      "reward_std": 0.09671889245510101,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 1065
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.654296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1626.23046875,
      "completions/mean_terminated_length": 827.9661254882812,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.36391567807459246,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 11.071367208311841,
      "learning_rate": 8.227182348157923e-07,
      "loss": 0.0297,
      "num_tokens": 1040768013.0,
      "reward": 0.111328125,
      "reward_std": 0.1078319400548935,
      "rewards/accuracy_reward/mean": 0.111328125,
      "rewards/accuracy_reward/std": 0.31484565138816833,
      "step": 1066
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.72265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1996.0,
      "completions/mean_length": 1709.728515625,
      "completions/mean_terminated_length": 828.31689453125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.3642570623879833,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 8.703691427984685,
      "learning_rate": 8.222916030018389e-07,
      "loss": 0.0132,
      "num_tokens": 1041723602.0,
      "reward": 0.078125,
      "reward_std": 0.1201498806476593,
      "rewards/accuracy_reward/mean": 0.078125,
      "rewards/accuracy_reward/std": 0.26863065361976624,
      "step": 1067
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.650390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1955.0,
      "completions/mean_length": 1648.720703125,
      "completions/mean_terminated_length": 905.9273681640625,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.3645984467013741,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 11.497292087012905,
      "learning_rate": 8.21864584719758e-07,
      "loss": 0.0199,
      "num_tokens": 1042644643.0,
      "reward": 0.078125,
      "reward_std": 0.1150255799293518,
      "rewards/accuracy_reward/mean": 0.08064515888690948,
      "rewards/accuracy_reward/std": 0.2725643217563629,
      "step": 1068
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.708984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2000.0,
      "completions/mean_length": 1751.888671875,
      "completions/mean_terminated_length": 1030.489990234375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.36493983101476485,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 16.736426792450537,
      "learning_rate": 8.214371805756238e-07,
      "loss": 0.0015,
      "num_tokens": 1043622794.0,
      "reward": 0.0234375,
      "reward_std": 0.05259781330823898,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 1069
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.615234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2010.0,
      "completions/mean_length": 1566.40625,
      "completions/mean_terminated_length": 796.3451538085938,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.36528121532815566,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 4.550799332810863,
      "learning_rate": 8.210093911760582e-07,
      "loss": 0.007,
      "num_tokens": 1044503290.0,
      "reward": 0.048828125,
      "reward_std": 0.064970001578331,
      "rewards/accuracy_reward/mean": 0.05040322616696358,
      "rewards/accuracy_reward/std": 0.21899642050266266,
      "step": 1070
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2035.0,
      "completions/mean_length": 1616.333984375,
      "completions/mean_terminated_length": 792.2443237304688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3656225996415465,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 4.151064253424261,
      "learning_rate": 8.205812171282302e-07,
      "loss": 0.0096,
      "num_tokens": 1045415365.0,
      "reward": 0.0703125,
      "reward_std": 0.10916339606046677,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 1071
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.646484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2006.0,
      "completions/mean_length": 1675.703125,
      "completions/mean_terminated_length": 994.8729858398438,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.3659639839549373,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 18.23929482315597,
      "learning_rate": 8.201526590398543e-07,
      "loss": 0.0068,
      "num_tokens": 1046351053.0,
      "reward": 0.08203125,
      "reward_std": 0.10189647972583771,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 1072
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.572265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2000.0,
      "completions/mean_length": 1530.111328125,
      "completions/mean_terminated_length": 837.3515625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.36630536826832805,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 10.377436425086909,
      "learning_rate": 8.197237175191907e-07,
      "loss": 0.0174,
      "num_tokens": 1047215494.0,
      "reward": 0.076171875,
      "reward_std": 0.08384227007627487,
      "rewards/accuracy_reward/mean": 0.0786290317773819,
      "rewards/accuracy_reward/std": 0.26943066716194153,
      "step": 1073
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.626953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1899.0,
      "completions/mean_length": 1581.9609375,
      "completions/mean_terminated_length": 798.7225341796875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.36664675258171886,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 11.366920620380323,
      "learning_rate": 8.192943931750431e-07,
      "loss": 0.0421,
      "num_tokens": 1048096626.0,
      "reward": 0.111328125,
      "reward_std": 0.1449257880449295,
      "rewards/accuracy_reward/mean": 0.11491935700178146,
      "rewards/accuracy_reward/std": 0.3192465901374817,
      "step": 1074
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.62890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 1604.96875,
      "completions/mean_terminated_length": 854.1473999023438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.3669881368951097,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 19.296695107904313,
      "learning_rate": 8.188646866167591e-07,
      "loss": 0.0101,
      "num_tokens": 1048995330.0,
      "reward": 0.076171875,
      "reward_std": 0.14179153740406036,
      "rewards/accuracy_reward/mean": 0.076171875,
      "rewards/accuracy_reward/std": 0.26553234457969666,
      "step": 1075
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.623046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1976.0,
      "completions/mean_length": 1573.33984375,
      "completions/mean_terminated_length": 789.0673217773438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.3673295212085005,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 16.395138066910892,
      "learning_rate": 8.184345984542283e-07,
      "loss": 0.0156,
      "num_tokens": 1049879248.0,
      "reward": 0.072265625,
      "reward_std": 0.10074453055858612,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 1076
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1890.0,
      "completions/mean_length": 1510.125,
      "completions/mean_terminated_length": 840.140380859375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.36767090552189124,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 8.247313697619292,
      "learning_rate": 8.180041292978826e-07,
      "loss": 0.0148,
      "num_tokens": 1050719936.0,
      "reward": 0.052734375,
      "reward_std": 0.07917051762342453,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 1077
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.61328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1851.0,
      "completions/mean_length": 1615.302734375,
      "completions/mean_terminated_length": 929.1060791015625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.36801228983528206,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 9.369241000743516,
      "learning_rate": 8.175732797586939e-07,
      "loss": 0.0086,
      "num_tokens": 1051614299.0,
      "reward": 0.021484375,
      "reward_std": 0.07014618813991547,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 1078
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.58203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2004.0,
      "completions/mean_length": 1543.2109375,
      "completions/mean_terminated_length": 840.4625854492188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.36835367414867287,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 4.94878356587409,
      "learning_rate": 8.171420504481743e-07,
      "loss": 0.0191,
      "num_tokens": 1052487991.0,
      "reward": 0.03515625,
      "reward_std": 0.07352001965045929,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 1079
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 1680.541015625,
      "completions/mean_terminated_length": 966.7413940429688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3686950584620637,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 12.238720437917998,
      "learning_rate": 8.167104419783753e-07,
      "loss": 0.0037,
      "num_tokens": 1053430012.0,
      "reward": 0.0703125,
      "reward_std": 0.07955637574195862,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 1080
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.630859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2017.0,
      "completions/mean_length": 1613.716796875,
      "completions/mean_terminated_length": 871.529052734375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.36903644277545444,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 6.95260561579441,
      "learning_rate": 8.162784549618855e-07,
      "loss": 0.0315,
      "num_tokens": 1054333723.0,
      "reward": 0.03125,
      "reward_std": 0.08698301017284393,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 1081
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.58984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2038.0,
      "completions/mean_length": 1625.12890625,
      "completions/mean_terminated_length": 1017.0000610351562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.36937782708884526,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 8.096928061043071,
      "learning_rate": 8.158460900118321e-07,
      "loss": 0.007,
      "num_tokens": 1055236573.0,
      "reward": 0.029296875,
      "reward_std": 0.06684703379869461,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 1082
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.638671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2024.0,
      "completions/mean_length": 1652.919921875,
      "completions/mean_terminated_length": 954.5892333984375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.36971921140223607,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 11.00954503058334,
      "learning_rate": 8.15413347741878e-07,
      "loss": 0.0173,
      "num_tokens": 1056161524.0,
      "reward": 0.037109375,
      "reward_std": 0.08560487627983093,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 1083
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.53125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1959.0,
      "completions/mean_length": 1511.349609375,
      "completions/mean_terminated_length": 903.1458740234375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.3700605957156269,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 18.22106998469532,
      "learning_rate": 8.149802287662214e-07,
      "loss": 0.0226,
      "num_tokens": 1057006775.0,
      "reward": 0.091796875,
      "reward_std": 0.1285618543624878,
      "rewards/accuracy_reward/mean": 0.091796875,
      "rewards/accuracy_reward/std": 0.289021372795105,
      "step": 1084
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.58203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1907.0,
      "completions/mean_length": 1553.48828125,
      "completions/mean_terminated_length": 864.869140625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.37040198002901764,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 5.76414460599428,
      "learning_rate": 8.145467336995954e-07,
      "loss": -0.0133,
      "num_tokens": 1057880225.0,
      "reward": 0.052734375,
      "reward_std": 0.04973640665411949,
      "rewards/accuracy_reward/mean": 0.05624999850988388,
      "rewards/accuracy_reward/std": 0.23064424097537994,
      "step": 1085
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.61328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 1625.099609375,
      "completions/mean_terminated_length": 954.4393920898438,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.37074336434240845,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 2.0411881148299713,
      "learning_rate": 8.141128631572676e-07,
      "loss": 0.008,
      "num_tokens": 1058790436.0,
      "reward": 0.064453125,
      "reward_std": 0.09055596590042114,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 1086
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.49609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1829.0,
      "completions/mean_length": 1437.201171875,
      "completions/mean_terminated_length": 835.8720703125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.37108474865579927,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 22.999905614479072,
      "learning_rate": 8.136786177550373e-07,
      "loss": 0.0133,
      "num_tokens": 1059606427.0,
      "reward": 0.08203125,
      "reward_std": 0.10418535768985748,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 1087
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1902.0,
      "completions/mean_length": 1672.20703125,
      "completions/mean_terminated_length": 902.7261962890625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.3714261329691901,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 6.359560024218893,
      "learning_rate": 8.132439981092364e-07,
      "loss": 0.0113,
      "num_tokens": 1060537765.0,
      "reward": 0.033203125,
      "reward_std": 0.07493096590042114,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 1088
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1974.0,
      "completions/mean_length": 1543.43359375,
      "completions/mean_terminated_length": 829.7028198242188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.37176751728258084,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 6.374684591557838,
      "learning_rate": 8.128090048367283e-07,
      "loss": 0.0082,
      "num_tokens": 1061404211.0,
      "reward": 0.072265625,
      "reward_std": 0.07885697484016418,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 1089
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.58984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1576.244140625,
      "completions/mean_terminated_length": 897.8143310546875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.37210890159597165,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 4.398562849048429,
      "learning_rate": 8.123736385549063e-07,
      "loss": 0.0319,
      "num_tokens": 1062285552.0,
      "reward": 0.078125,
      "reward_std": 0.12488233298063278,
      "rewards/accuracy_reward/mean": 0.078125,
      "rewards/accuracy_reward/std": 0.26863065361976624,
      "step": 1090
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1670.970703125,
      "completions/mean_terminated_length": 925.6802368164062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.37245028590936247,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 8.052033287637672,
      "learning_rate": 8.119378998816932e-07,
      "loss": 0.0187,
      "num_tokens": 1063222513.0,
      "reward": 0.048828125,
      "reward_std": 0.08406735956668854,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 1091
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1693.908203125,
      "completions/mean_terminated_length": 789.201416015625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.3727916702227533,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 8.729028892685118,
      "learning_rate": 8.115017894355401e-07,
      "loss": 0.0122,
      "num_tokens": 1064173650.0,
      "reward": 0.056640625,
      "reward_std": 0.05833513289690018,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 1092
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.650390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1979.0,
      "completions/mean_length": 1629.107421875,
      "completions/mean_terminated_length": 849.8267822265625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.37313305453614404,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 2.8854574359168157,
      "learning_rate": 8.110653078354264e-07,
      "loss": 0.0123,
      "num_tokens": 1065081177.0,
      "reward": 0.041015625,
      "reward_std": 0.07085913419723511,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 1093
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.70703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1959.0,
      "completions/mean_length": 1704.26171875,
      "completions/mean_terminated_length": 874.7066650390625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.37347443884953485,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 5.242001679256016,
      "learning_rate": 8.106284557008577e-07,
      "loss": 0.0033,
      "num_tokens": 1066031247.0,
      "reward": 0.072265625,
      "reward_std": 0.064970001578331,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 1094
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1932.0,
      "completions/mean_length": 1841.87109375,
      "completions/mean_terminated_length": 791.5952758789062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.37381582316292566,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 4.040588492249451,
      "learning_rate": 8.101912336518656e-07,
      "loss": 0.0277,
      "num_tokens": 1067052205.0,
      "reward": 0.0234375,
      "reward_std": 0.06068410724401474,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 1095
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.712890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2033.0,
      "completions/mean_length": 1712.34765625,
      "completions/mean_terminated_length": 878.9251708984375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.3741572074763165,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 16.9268085170734,
      "learning_rate": 8.097536423090072e-07,
      "loss": 0.0057,
      "num_tokens": 1068006719.0,
      "reward": 0.041015625,
      "reward_std": 0.059305962175130844,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 1096
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.740234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1973.0,
      "completions/mean_length": 1705.68359375,
      "completions/mean_terminated_length": 730.2105712890625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.37449859178970724,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 15.510482943250402,
      "learning_rate": 8.09315682293363e-07,
      "loss": 0.0086,
      "num_tokens": 1068947501.0,
      "reward": 0.05859375,
      "reward_std": 0.0921671986579895,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 1097
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.822265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1909.0,
      "completions/mean_length": 1839.494140625,
      "completions/mean_terminated_length": 874.8681640625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.37483997610309805,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 18.31251210113687,
      "learning_rate": 8.088773542265372e-07,
      "loss": 0.0029,
      "num_tokens": 1069966170.0,
      "reward": 0.029296875,
      "reward_std": 0.04672222584486008,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 1098
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.84375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1763.0,
      "completions/mean_length": 1848.9453125,
      "completions/mean_terminated_length": 774.0499877929688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.37518136041648886,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 4.000336071300132,
      "learning_rate": 8.084386587306566e-07,
      "loss": 0.004,
      "num_tokens": 1070994686.0,
      "reward": 0.02734375,
      "reward_std": 0.02960042469203472,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 1099
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.80078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1823.26171875,
      "completions/mean_terminated_length": 920.421630859375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.3755227447298797,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 6.038236609404691,
      "learning_rate": 8.079995964283688e-07,
      "loss": 0.0159,
      "num_tokens": 1072005508.0,
      "reward": 0.0234375,
      "reward_std": 0.05880707502365112,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 1100
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.771484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1962.0,
      "completions/mean_length": 1767.470703125,
      "completions/mean_terminated_length": 820.3846435546875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.37586412904327043,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 5.828429751976278,
      "learning_rate": 8.075601679428427e-07,
      "loss": 0.031,
      "num_tokens": 1072985013.0,
      "reward": 0.029296875,
      "reward_std": 0.07085912674665451,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17417415976524353,
      "step": 1101
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2026.0,
      "completions/mean_length": 1836.1953125,
      "completions/mean_terminated_length": 869.2608642578125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.37620551335666125,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 14.382093211937953,
      "learning_rate": 8.071203738977667e-07,
      "loss": 0.0205,
      "num_tokens": 1074005241.0,
      "reward": 0.021484375,
      "reward_std": 0.04792051762342453,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 1102
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.849609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1997.0,
      "completions/mean_length": 1866.09765625,
      "completions/mean_terminated_length": 838.467529296875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.37654689767005206,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 4.101733005141735,
      "learning_rate": 8.066802149173479e-07,
      "loss": 0.008,
      "num_tokens": 1075033451.0,
      "reward": 0.029296875,
      "reward_std": 0.05683041363954544,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 1103
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.814453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1840.23046875,
      "completions/mean_terminated_length": 928.2316284179688,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.3768882819834429,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 6.716512237858535,
      "learning_rate": 8.062396916263112e-07,
      "loss": 0.0136,
      "num_tokens": 1076055233.0,
      "reward": 0.025390625,
      "reward_std": 0.05452118441462517,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 1104
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2020.0,
      "completions/mean_length": 1859.087890625,
      "completions/mean_terminated_length": 775.3289794921875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.3772296662968337,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 4.115055927360768,
      "learning_rate": 8.057988046498993e-07,
      "loss": 0.0134,
      "num_tokens": 1077078606.0,
      "reward": 0.009765625,
      "reward_std": 0.029160313308238983,
      "rewards/accuracy_reward/mean": 0.010080644860863686,
      "rewards/accuracy_reward/std": 0.0999959334731102,
      "step": 1105
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.80078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1948.0,
      "completions/mean_length": 1781.9765625,
      "completions/mean_terminated_length": 712.6666870117188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.37757105061022445,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 7.372664871952672,
      "learning_rate": 8.053575546138706e-07,
      "loss": 0.0007,
      "num_tokens": 1078061810.0,
      "reward": 0.005859375,
      "reward_std": 0.0234375,
      "rewards/accuracy_reward/mean": 0.005859375,
      "rewards/accuracy_reward/std": 0.07639661431312561,
      "step": 1106
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.78515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2005.0,
      "completions/mean_length": 1809.986328125,
      "completions/mean_terminated_length": 940.5363159179688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.37791243492361526,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 22.947873390003068,
      "learning_rate": 8.049159421444986e-07,
      "loss": 0.0058,
      "num_tokens": 1079076699.0,
      "reward": 0.029296875,
      "reward_std": 0.04177113622426987,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 1107
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.775390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1875.0,
      "completions/mean_length": 1721.556640625,
      "completions/mean_terminated_length": 594.6173706054688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3782538192370061,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 7.1075845706332785,
      "learning_rate": 8.044739678685713e-07,
      "loss": 0.0162,
      "num_tokens": 1080033288.0,
      "reward": 0.046875,
      "reward_std": 0.07961063086986542,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 1108
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1946.0,
      "completions/mean_length": 1788.685546875,
      "completions/mean_terminated_length": 903.4396362304688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.3785952035503969,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 8.756116931166353,
      "learning_rate": 8.040316324133907e-07,
      "loss": 0.0297,
      "num_tokens": 1081024999.0,
      "reward": 0.0859375,
      "reward_std": 0.13534268736839294,
      "rewards/accuracy_reward/mean": 0.0859375,
      "rewards/accuracy_reward/std": 0.28054583072662354,
      "step": 1109
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.81640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1989.0,
      "completions/mean_length": 1818.587890625,
      "completions/mean_terminated_length": 798.4361572265625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.37893658786378764,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 3.859190621081395,
      "learning_rate": 8.035889364067709e-07,
      "loss": -0.0043,
      "num_tokens": 1082036228.0,
      "reward": 0.033203125,
      "reward_std": 0.06661957502365112,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 1110
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.78125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1836.0,
      "completions/mean_length": 1774.75,
      "completions/mean_terminated_length": 798.857177734375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.37927797217717846,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 8.301070333254545,
      "learning_rate": 8.031458804770379e-07,
      "loss": 0.012,
      "num_tokens": 1083022692.0,
      "reward": 0.044921875,
      "reward_std": 0.08259069174528122,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 1111
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1973.0,
      "completions/mean_length": 1704.130859375,
      "completions/mean_terminated_length": 650.6904907226562,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.37961935649056927,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 4.613399055731976,
      "learning_rate": 8.027024652530285e-07,
      "loss": 0.0069,
      "num_tokens": 1083971479.0,
      "reward": 0.056640625,
      "reward_std": 0.0830162987112999,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 1112
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1952.0,
      "completions/mean_length": 1849.041015625,
      "completions/mean_terminated_length": 835.297607421875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.3799607408039601,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 2.520319595351757,
      "learning_rate": 8.022586913640896e-07,
      "loss": 0.0265,
      "num_tokens": 1084995180.0,
      "reward": 0.025390625,
      "reward_std": 0.06536141037940979,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 1113
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.77734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1809.962890625,
      "completions/mean_terminated_length": 978.9210815429688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.38030212511735084,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 7.777566763334661,
      "learning_rate": 8.018145594400772e-07,
      "loss": 0.0257,
      "num_tokens": 1085989129.0,
      "reward": 0.072265625,
      "reward_std": 0.08699656277894974,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 1114
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.728515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1691.48046875,
      "completions/mean_terminated_length": 734.7769775390625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.38064350943074166,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 16.463706565937134,
      "learning_rate": 8.01370070111355e-07,
      "loss": 0.0163,
      "num_tokens": 1086931791.0,
      "reward": 0.029296875,
      "reward_std": 0.055033616721630096,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 1115
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.712890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1924.0,
      "completions/mean_length": 1646.90234375,
      "completions/mean_terminated_length": 650.9795532226562,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.38098489374413247,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 1.6077925663347319,
      "learning_rate": 8.009252240087947e-07,
      "loss": 0.0086,
      "num_tokens": 1087849421.0,
      "reward": 0.021484375,
      "reward_std": 0.04230976849794388,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 1116
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.744140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2025.0,
      "completions/mean_length": 1739.02734375,
      "completions/mean_terminated_length": 840.801513671875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3813262780575233,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 36.13097715464304,
      "learning_rate": 8.004800217637736e-07,
      "loss": 0.0193,
      "num_tokens": 1088827403.0,
      "reward": 0.052734375,
      "reward_std": 0.06755761802196503,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 1117
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1995.0,
      "completions/mean_length": 1716.001953125,
      "completions/mean_terminated_length": 720.0078125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.38166766237091404,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 12.039312582270002,
      "learning_rate": 8.000344640081752e-07,
      "loss": 0.0111,
      "num_tokens": 1089779164.0,
      "reward": 0.064453125,
      "reward_std": 0.08654290437698364,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 1118
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1977.0,
      "completions/mean_length": 1743.798828125,
      "completions/mean_terminated_length": 831.1953125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.38200904668430485,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 9.104278326846828,
      "learning_rate": 7.99588551374387e-07,
      "loss": 0.0074,
      "num_tokens": 1090746357.0,
      "reward": 0.03125,
      "reward_std": 0.06469620764255524,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 1119
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.708984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2026.0,
      "completions/mean_length": 1657.306640625,
      "completions/mean_terminated_length": 705.4832153320312,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.38235043099769567,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 17.49240402776864,
      "learning_rate": 7.991422844953004e-07,
      "loss": 0.0412,
      "num_tokens": 1091671378.0,
      "reward": 0.10546875,
      "reward_std": 0.1713758409023285,
      "rewards/accuracy_reward/mean": 0.10546875,
      "rewards/accuracy_reward/std": 0.3074568510055542,
      "step": 1120
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.720703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1724.23828125,
      "completions/mean_terminated_length": 888.7971801757812,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.3826918153110865,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 6.098260673817675,
      "learning_rate": 7.986956640043096e-07,
      "loss": 0.0292,
      "num_tokens": 1092635964.0,
      "reward": 0.029296875,
      "reward_std": 0.07328042387962341,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 1121
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.73046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2018.0,
      "completions/mean_length": 1707.16796875,
      "completions/mean_terminated_length": 783.4638061523438,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.38303319962447724,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 4.022968640160011,
      "learning_rate": 7.98248690535311e-07,
      "loss": 0.0079,
      "num_tokens": 1093581250.0,
      "reward": 0.005859375,
      "reward_std": 0.0234375,
      "rewards/accuracy_reward/mean": 0.006048386916518211,
      "rewards/accuracy_reward/std": 0.07761410623788834,
      "step": 1122
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2035.0,
      "completions/mean_length": 1673.734375,
      "completions/mean_terminated_length": 879.5609130859375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.38337458393786805,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 10.632064039184149,
      "learning_rate": 7.978013647227015e-07,
      "loss": 0.0185,
      "num_tokens": 1094517114.0,
      "reward": 0.044921875,
      "reward_std": 0.0930916965007782,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 1123
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.708984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1906.0,
      "completions/mean_length": 1657.404296875,
      "completions/mean_terminated_length": 705.8187866210938,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.38371596825125887,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 5.0890692605384205,
      "learning_rate": 7.973536872013783e-07,
      "loss": 0.0059,
      "num_tokens": 1095435881.0,
      "reward": 0.04296875,
      "reward_std": 0.057157501578330994,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 1124
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1996.0,
      "completions/mean_length": 1699.830078125,
      "completions/mean_terminated_length": 961.0304565429688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.3840573525646497,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 15.258839064005763,
      "learning_rate": 7.969056586067376e-07,
      "loss": -0.0113,
      "num_tokens": 1096383778.0,
      "reward": 0.02734375,
      "reward_std": 0.05806133896112442,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 1125
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1929.0,
      "completions/mean_length": 1649.353515625,
      "completions/mean_terminated_length": 772.3312377929688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.38439873687804044,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 11.698117812687881,
      "learning_rate": 7.964572795746741e-07,
      "loss": 0.0118,
      "num_tokens": 1097297655.0,
      "reward": 0.037109375,
      "reward_std": 0.08560486882925034,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 1126
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.740234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1990.0,
      "completions/mean_length": 1703.35546875,
      "completions/mean_terminated_length": 721.2481689453125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.38474012119143125,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 12.440638392115558,
      "learning_rate": 7.960085507415802e-07,
      "loss": 0.0209,
      "num_tokens": 1098251661.0,
      "reward": 0.0859375,
      "reward_std": 0.08482664078474045,
      "rewards/accuracy_reward/mean": 0.0859375,
      "rewards/accuracy_reward/std": 0.28054583072662354,
      "step": 1127
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1707.841796875,
      "completions/mean_terminated_length": 767.4044189453125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.38508150550482206,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 17.644220379694776,
      "learning_rate": 7.955594727443439e-07,
      "loss": 0.0175,
      "num_tokens": 1099202876.0,
      "reward": 0.046875,
      "reward_std": 0.08419393002986908,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 1128
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.74609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2010.0,
      "completions/mean_length": 1707.509765625,
      "completions/mean_terminated_length": 706.9923095703125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.3854228898182129,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 13.601851376862413,
      "learning_rate": 7.951100462203494e-07,
      "loss": -0.0001,
      "num_tokens": 1100146257.0,
      "reward": 0.03515625,
      "reward_std": 0.05880707502365112,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 1129
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.68359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1930.0,
      "completions/mean_length": 1623.9140625,
      "completions/mean_terminated_length": 707.6790161132812,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.38576427413160363,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 7.154470532031593,
      "learning_rate": 7.946602718074756e-07,
      "loss": 0.0221,
      "num_tokens": 1101065013.0,
      "reward": 0.056640625,
      "reward_std": 0.08686304092407227,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 1130
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.73046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1723.4609375,
      "completions/mean_terminated_length": 844.3115844726562,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.38610565844499445,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 21.538006287101485,
      "learning_rate": 7.94210150144095e-07,
      "loss": 0.0034,
      "num_tokens": 1102023649.0,
      "reward": 0.064453125,
      "reward_std": 0.08074785768985748,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 1131
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.666015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1961.0,
      "completions/mean_length": 1616.080078125,
      "completions/mean_terminated_length": 754.76611328125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.38644704275838526,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 1.008773652447448,
      "learning_rate": 7.937596818690729e-07,
      "loss": 0.0063,
      "num_tokens": 1102927226.0,
      "reward": 0.025390625,
      "reward_std": 0.05024883896112442,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 1132
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.751953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1745.6171875,
      "completions/mean_terminated_length": 828.9448852539062,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3867884270717761,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 3.3852533880674023,
      "learning_rate": 7.933088676217667e-07,
      "loss": 0.0056,
      "num_tokens": 1103907910.0,
      "reward": 0.013671875,
      "reward_std": 0.03889618441462517,
      "rewards/accuracy_reward/mean": 0.01411290280520916,
      "rewards/accuracy_reward/std": 0.11807556450366974,
      "step": 1133
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.732421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1937.0,
      "completions/mean_length": 1696.43359375,
      "completions/mean_terminated_length": 734.1167602539062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.38712981138516683,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 10.653070292176036,
      "learning_rate": 7.928577080420247e-07,
      "loss": 0.001,
      "num_tokens": 1104855892.0,
      "reward": 0.0703125,
      "reward_std": 0.08235205709934235,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 1134
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.77734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1986.0,
      "completions/mean_length": 1772.447265625,
      "completions/mean_terminated_length": 810.4298095703125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.38747119569855765,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 22.529968173272234,
      "learning_rate": 7.924062037701853e-07,
      "loss": 0.0035,
      "num_tokens": 1105848041.0,
      "reward": 0.044921875,
      "reward_std": 0.08928529918193817,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 1135
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.697265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2028.0,
      "completions/mean_length": 1685.462890625,
      "completions/mean_terminated_length": 850.4580688476562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.38781258001194846,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 7.81079984928334,
      "learning_rate": 7.919543554470763e-07,
      "loss": 0.0122,
      "num_tokens": 1106794550.0,
      "reward": 0.03125,
      "reward_std": 0.08549975603818893,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 1136
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1948.0,
      "completions/mean_length": 1610.267578125,
      "completions/mean_terminated_length": 759.9597778320312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.3881539643253393,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 15.103790470330274,
      "learning_rate": 7.91502163714014e-07,
      "loss": 0.0129,
      "num_tokens": 1107698831.0,
      "reward": 0.0859375,
      "reward_std": 0.09739763289690018,
      "rewards/accuracy_reward/mean": 0.0859375,
      "rewards/accuracy_reward/std": 0.28054583072662354,
      "step": 1137
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.583984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 1492.013671875,
      "completions/mean_terminated_length": 711.5446166992188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.38849534863873003,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 3.6574998971869004,
      "learning_rate": 7.910496292128015e-07,
      "loss": 0.0177,
      "num_tokens": 1108538678.0,
      "reward": 0.07421875,
      "reward_std": 0.07619608938694,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "step": 1138
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.693359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1679.5703125,
      "completions/mean_terminated_length": 846.496826171875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.38883673295212084,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 17.651934307100472,
      "learning_rate": 7.905967525857291e-07,
      "loss": -0.0023,
      "num_tokens": 1109469162.0,
      "reward": 0.03125,
      "reward_std": 0.0637841522693634,
      "rewards/accuracy_reward/mean": 0.032258063554763794,
      "rewards/accuracy_reward/std": 0.17686307430267334,
      "step": 1139
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1943.0,
      "completions/mean_length": 1748.384765625,
      "completions/mean_terminated_length": 849.5390625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.38917811726551166,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 5.882594443243401,
      "learning_rate": 7.901435344755721e-07,
      "loss": 0.0087,
      "num_tokens": 1110442271.0,
      "reward": 0.015625,
      "reward_std": 0.040274329483509064,
      "rewards/accuracy_reward/mean": 0.015625,
      "rewards/accuracy_reward/std": 0.12414088100194931,
      "step": 1140
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2007.0,
      "completions/mean_length": 1598.85546875,
      "completions/mean_terminated_length": 756.0786743164062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.38951950157890247,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 8.576852939157652,
      "learning_rate": 7.896899755255906e-07,
      "loss": 0.0012,
      "num_tokens": 1111336773.0,
      "reward": 0.044921875,
      "reward_std": 0.05193261057138443,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 1141
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.59765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1552.900390625,
      "completions/mean_terminated_length": 817.461181640625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.38986088589229323,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 26.516191919548543,
      "learning_rate": 7.892360763795291e-07,
      "loss": 0.015,
      "num_tokens": 1112203474.0,
      "reward": 0.095703125,
      "reward_std": 0.10746511071920395,
      "rewards/accuracy_reward/mean": 0.095703125,
      "rewards/accuracy_reward/std": 0.2944713830947876,
      "step": 1142
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.626953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1994.0,
      "completions/mean_length": 1582.880859375,
      "completions/mean_terminated_length": 801.481689453125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.39020227020568404,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 10.336358793659462,
      "learning_rate": 7.887818376816136e-07,
      "loss": 0.0025,
      "num_tokens": 1113096757.0,
      "reward": 0.12109375,
      "reward_std": 0.16460810601711273,
      "rewards/accuracy_reward/mean": 0.12109375,
      "rewards/accuracy_reward/std": 0.3265552520751953,
      "step": 1143
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.62890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2025.0,
      "completions/mean_length": 1548.65625,
      "completions/mean_terminated_length": 702.678955078125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.39054365451907486,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 16.005698554614572,
      "learning_rate": 7.883272600765535e-07,
      "loss": 0.0077,
      "num_tokens": 1113979045.0,
      "reward": 0.041015625,
      "reward_std": 0.08070705831050873,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 1144
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.642578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 1610.380859375,
      "completions/mean_terminated_length": 823.6229248046875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.39088503883246567,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 1.206436976535436,
      "learning_rate": 7.878723442095384e-07,
      "loss": 0.0011,
      "num_tokens": 1114881880.0,
      "reward": 0.029296875,
      "reward_std": 0.04125870764255524,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 1145
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.658203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1622.7109375,
      "completions/mean_terminated_length": 804.0171508789062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.39122642314585643,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 8.636780994474929,
      "learning_rate": 7.87417090726238e-07,
      "loss": -0.005,
      "num_tokens": 1115795124.0,
      "reward": 0.083984375,
      "reward_std": 0.10530310869216919,
      "rewards/accuracy_reward/mean": 0.083984375,
      "rewards/accuracy_reward/std": 0.2776356339454651,
      "step": 1146
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1603.263671875,
      "completions/mean_terminated_length": 836.8031616210938,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.39156780745924724,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 7.3195402867773485,
      "learning_rate": 7.869615002728016e-07,
      "loss": 0.0106,
      "num_tokens": 1116688171.0,
      "reward": 0.05859375,
      "reward_std": 0.06079617515206337,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 1147
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.62890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1874.0,
      "completions/mean_length": 1587.91015625,
      "completions/mean_terminated_length": 808.47900390625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.39190919177263805,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 6.080562278427425,
      "learning_rate": 7.865055734958566e-07,
      "loss": 0.015,
      "num_tokens": 1117581021.0,
      "reward": 0.041015625,
      "reward_std": 0.09479551017284393,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 1148
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.630859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1945.0,
      "completions/mean_length": 1554.697265625,
      "completions/mean_terminated_length": 711.6455078125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.39225057608602887,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 10.614844030817949,
      "learning_rate": 7.860493110425073e-07,
      "loss": 0.0413,
      "num_tokens": 1118454418.0,
      "reward": 0.130859375,
      "reward_std": 0.14998990297317505,
      "rewards/accuracy_reward/mean": 0.130859375,
      "rewards/accuracy_reward/std": 0.33757632970809937,
      "step": 1149
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.611328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2026.0,
      "completions/mean_length": 1547.953125,
      "completions/mean_terminated_length": 761.4472045898438,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.3925919603994196,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 41.61745241700161,
      "learning_rate": 7.855927135603348e-07,
      "loss": 0.0022,
      "num_tokens": 1119319866.0,
      "reward": 0.0625,
      "reward_std": 0.11923094093799591,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 1150
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.552734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1944.0,
      "completions/mean_length": 1529.962890625,
      "completions/mean_terminated_length": 889.7685546875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.39293334471281044,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 28.764453255424353,
      "learning_rate": 7.851357816973962e-07,
      "loss": 0.0198,
      "num_tokens": 1120174695.0,
      "reward": 0.111328125,
      "reward_std": 0.16661281883716583,
      "rewards/accuracy_reward/mean": 0.11491935700178146,
      "rewards/accuracy_reward/std": 0.3192465901374817,
      "step": 1151
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1965.0,
      "completions/mean_length": 1523.212890625,
      "completions/mean_terminated_length": 730.8872680664062,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.39327472902620125,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 9.035769089590765,
      "learning_rate": 7.846785161022223e-07,
      "loss": 0.0244,
      "num_tokens": 1121038036.0,
      "reward": 0.044921875,
      "reward_std": 0.09126891195774078,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 1152
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.646484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1935.0,
      "completions/mean_length": 1624.607421875,
      "completions/mean_terminated_length": 850.3370361328125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.39361611333959207,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 18.997149227823183,
      "learning_rate": 7.842209174238181e-07,
      "loss": -0.0066,
      "num_tokens": 1121952331.0,
      "reward": 0.041015625,
      "reward_std": 0.06354551017284393,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 1153
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.63671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 1607.54296875,
      "completions/mean_terminated_length": 835.5591430664062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.3939574976529828,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 10.485292639226472,
      "learning_rate": 7.837629863116612e-07,
      "loss": 0.0056,
      "num_tokens": 1122843681.0,
      "reward": 0.041015625,
      "reward_std": 0.06018522381782532,
      "rewards/accuracy_reward/mean": 0.04233871027827263,
      "rewards/accuracy_reward/std": 0.2015640139579773,
      "step": 1154
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1983.0,
      "completions/mean_length": 1661.201171875,
      "completions/mean_terminated_length": 778.5064086914062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.39429888196637364,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 10.591285810002844,
      "learning_rate": 7.833047234157012e-07,
      "loss": 0.0203,
      "num_tokens": 1123770280.0,
      "reward": 0.060546875,
      "reward_std": 0.06733252108097076,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 1155
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1963.0,
      "completions/mean_length": 1641.947265625,
      "completions/mean_terminated_length": 748.6312866210938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.39464026627976445,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 3.008939650255724,
      "learning_rate": 7.828461293863581e-07,
      "loss": 0.0079,
      "num_tokens": 1124691533.0,
      "reward": 0.017578125,
      "reward_std": 0.04125870764255524,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "step": 1156
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.666015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1943.0,
      "completions/mean_length": 1631.20703125,
      "completions/mean_terminated_length": 800.3099365234375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.39498165059315526,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 15.803212358734505,
      "learning_rate": 7.823872048745223e-07,
      "loss": 0.0136,
      "num_tokens": 1125612999.0,
      "reward": 0.060546875,
      "reward_std": 0.10740634053945541,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 1157
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.669921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1991.0,
      "completions/mean_length": 1627.06640625,
      "completions/mean_terminated_length": 772.7455444335938,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.395323034906546,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 10.240800015547556,
      "learning_rate": 7.819279505315538e-07,
      "loss": 0.0217,
      "num_tokens": 1126521497.0,
      "reward": 0.0390625,
      "reward_std": 0.06355906277894974,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 1158
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2018.0,
      "completions/mean_length": 1677.330078125,
      "completions/mean_terminated_length": 692.4071655273438,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.39566441921993684,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 1.2449334635798142,
      "learning_rate": 7.814683670092795e-07,
      "loss": 0.0109,
      "num_tokens": 1127453522.0,
      "reward": 0.029296875,
      "reward_std": 0.059984706342220306,
      "rewards/accuracy_reward/mean": 0.030241934582591057,
      "rewards/accuracy_reward/std": 0.1714252382516861,
      "step": 1159
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.728515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1970.0,
      "completions/mean_length": 1713.470703125,
      "completions/mean_terminated_length": 816.1582641601562,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.39600580353332765,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 9.58854134045238,
      "learning_rate": 7.810084549599944e-07,
      "loss": 0.0131,
      "num_tokens": 1128414339.0,
      "reward": 0.025390625,
      "reward_std": 0.06260748207569122,
      "rewards/accuracy_reward/mean": 0.026209676638245583,
      "rewards/accuracy_reward/std": 0.1599196344614029,
      "step": 1160
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1724.216796875,
      "completions/mean_terminated_length": 689.172119140625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.39634718784671846,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 6.310055790491502,
      "learning_rate": 7.805482150364598e-07,
      "loss": 0.0092,
      "num_tokens": 1129369154.0,
      "reward": 0.052734375,
      "reward_std": 0.0659080371260643,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 1161
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.70703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1951.0,
      "completions/mean_length": 1642.552734375,
      "completions/mean_terminated_length": 664.0733642578125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.3966885721601092,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 21.13245130994666,
      "learning_rate": 7.800876478919014e-07,
      "loss": 0.0135,
      "num_tokens": 1130292237.0,
      "reward": 0.083984375,
      "reward_std": 0.10387972742319107,
      "rewards/accuracy_reward/mean": 0.083984375,
      "rewards/accuracy_reward/std": 0.2776356339454651,
      "step": 1162
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.705078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2005.0,
      "completions/mean_length": 1691.845703125,
      "completions/mean_terminated_length": 840.3775024414062,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.39702995647350003,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 11.272049032269972,
      "learning_rate": 7.796267541800106e-07,
      "loss": 0.0169,
      "num_tokens": 1131233982.0,
      "reward": 0.044921875,
      "reward_std": 0.09644509106874466,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 1163
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.705078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1946.0,
      "completions/mean_length": 1690.845703125,
      "completions/mean_terminated_length": 836.9867553710938,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.39737134078689085,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 18.326459236882656,
      "learning_rate": 7.791655345549416e-07,
      "loss": 0.0052,
      "num_tokens": 1132177183.0,
      "reward": 0.046875,
      "reward_std": 0.07041902095079422,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 1164
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.693359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1884.0,
      "completions/mean_length": 1666.48046875,
      "completions/mean_terminated_length": 803.8088989257812,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.39771272510028166,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 3.804194544750429,
      "learning_rate": 7.78703989671311e-07,
      "loss": 0.0034,
      "num_tokens": 1133099317.0,
      "reward": 0.037109375,
      "reward_std": 0.05574656277894974,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 1165
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1940.0,
      "completions/mean_length": 1733.892578125,
      "completions/mean_terminated_length": 751.040283203125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.3980541094136724,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 11.97469355275299,
      "learning_rate": 7.782421201841978e-07,
      "loss": 0.0197,
      "num_tokens": 1134062958.0,
      "reward": 0.033203125,
      "reward_std": 0.07712717354297638,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 1166
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1971.0,
      "completions/mean_length": 1727.140625,
      "completions/mean_terminated_length": 803.45458984375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.39839549372706323,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 7.930897395036539,
      "learning_rate": 7.77779926749141e-07,
      "loss": -0.0013,
      "num_tokens": 1135025222.0,
      "reward": 0.060546875,
      "reward_std": 0.08444078266620636,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 1167
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.669921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1839.0,
      "completions/mean_length": 1643.603515625,
      "completions/mean_terminated_length": 822.84619140625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.39873687804045405,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 19.743556297672896,
      "learning_rate": 7.773174100221398e-07,
      "loss": 0.0043,
      "num_tokens": 1135941467.0,
      "reward": 0.06640625,
      "reward_std": 0.12108887732028961,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "step": 1168
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2033.0,
      "completions/mean_length": 1630.64453125,
      "completions/mean_terminated_length": 745.0365600585938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.39907826235384486,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 7.1051099731647325,
      "learning_rate": 7.768545706596519e-07,
      "loss": 0.0124,
      "num_tokens": 1136850405.0,
      "reward": 0.1015625,
      "reward_std": 0.11063909530639648,
      "rewards/accuracy_reward/mean": 0.1015625,
      "rewards/accuracy_reward/std": 0.30236753821372986,
      "step": 1169
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.724609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2020.0,
      "completions/mean_length": 1690.173828125,
      "completions/mean_terminated_length": 748.6595458984375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.3994196466672356,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 4.318624409156783,
      "learning_rate": 7.763914093185932e-07,
      "loss": 0.01,
      "num_tokens": 1137790430.0,
      "reward": 0.033203125,
      "reward_std": 0.06849660724401474,
      "rewards/accuracy_reward/mean": 0.03427419438958168,
      "rewards/accuracy_reward/std": 0.18211629986763,
      "step": 1170
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.736328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1982.0,
      "completions/mean_length": 1718.1640625,
      "completions/mean_terminated_length": 797.066650390625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.39976103098062643,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 16.22759870386561,
      "learning_rate": 7.759279266563365e-07,
      "loss": 0.024,
      "num_tokens": 1138750658.0,
      "reward": 0.044921875,
      "reward_std": 0.09877577424049377,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 1171
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.751953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1941.0,
      "completions/mean_length": 1696.5703125,
      "completions/mean_terminated_length": 631.2125854492188,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.40010241529401724,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 3.1614684579522003,
      "learning_rate": 7.754641233307109e-07,
      "loss": 0.008,
      "num_tokens": 1139702598.0,
      "reward": 0.017578125,
      "reward_std": 0.04957009106874466,
      "rewards/accuracy_reward/mean": 0.018145160749554634,
      "rewards/accuracy_reward/std": 0.1336110234260559,
      "step": 1172
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2016.0,
      "completions/mean_length": 1658.375,
      "completions/mean_terminated_length": 888.18603515625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.40044379960740806,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 9.293519585649774,
      "learning_rate": 7.75e-07,
      "loss": 0.0182,
      "num_tokens": 1140632998.0,
      "reward": 0.037109375,
      "reward_std": 0.07174387574195862,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 1173
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.642578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1899.0,
      "completions/mean_length": 1617.62109375,
      "completions/mean_terminated_length": 843.8797607421875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.4007851839207988,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 20.616121201423113,
      "learning_rate": 7.745355573229422e-07,
      "loss": 0.0203,
      "num_tokens": 1141533828.0,
      "reward": 0.0625,
      "reward_std": 0.11075212061405182,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 1174
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.677734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1998.0,
      "completions/mean_length": 1619.31640625,
      "completions/mean_terminated_length": 717.7817993164062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.40112656823418963,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 3.249735378161522,
      "learning_rate": 7.740707959587289e-07,
      "loss": 0.0222,
      "num_tokens": 1142437238.0,
      "reward": 0.03125,
      "reward_std": 0.07426576316356659,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 1175
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.654296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2031.0,
      "completions/mean_length": 1614.1484375,
      "completions/mean_terminated_length": 793.0169677734375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.40146795254758044,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 7.441774705816416,
      "learning_rate": 7.736057165670038e-07,
      "loss": 0.0173,
      "num_tokens": 1143337890.0,
      "reward": 0.0625,
      "reward_std": 0.09749060869216919,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 1176
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.68359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1891.0,
      "completions/mean_length": 1663.08984375,
      "completions/mean_terminated_length": 831.4938354492188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.40180933686097126,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 15.209668395812065,
      "learning_rate": 7.731403198078622e-07,
      "loss": 0.0284,
      "num_tokens": 1144263808.0,
      "reward": 0.060546875,
      "reward_std": 0.10194281488656998,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 1177
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1866.0,
      "completions/mean_length": 1722.638671875,
      "completions/mean_terminated_length": 785.992431640625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.402150721174362,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 7.136172237557808,
      "learning_rate": 7.726746063418493e-07,
      "loss": 0.013,
      "num_tokens": 1145225975.0,
      "reward": 0.07421875,
      "reward_std": 0.06409768760204315,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "step": 1178
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.701171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1980.0,
      "completions/mean_length": 1645.662109375,
      "completions/mean_terminated_length": 701.6143798828125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4024921054877528,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 3.537834559232754,
      "learning_rate": 7.722085768299608e-07,
      "loss": 0.0198,
      "num_tokens": 1146144058.0,
      "reward": 0.013671875,
      "reward_std": 0.03889618441462517,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "step": 1179
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.775390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2018.0,
      "completions/mean_length": 1777.16796875,
      "completions/mean_terminated_length": 842.2086791992188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.40283348980114364,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 14.960035798261194,
      "learning_rate": 7.717422319336398e-07,
      "loss": 0.0185,
      "num_tokens": 1147132240.0,
      "reward": 0.046875,
      "reward_std": 0.09825544059276581,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 1180
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.724609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1988.0,
      "completions/mean_length": 1726.03515625,
      "completions/mean_terminated_length": 878.87939453125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.40317487411453445,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 5.938458838567134,
      "learning_rate": 7.712755723147777e-07,
      "loss": 0.0015,
      "num_tokens": 1148085234.0,
      "reward": 0.0390625,
      "reward_std": 0.042695626616477966,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 1181
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1989.0,
      "completions/mean_length": 1684.74609375,
      "completions/mean_terminated_length": 774.123291015625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.4035162584279252,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 9.398862745165575,
      "learning_rate": 7.708085986357127e-07,
      "loss": 0.0047,
      "num_tokens": 1149024624.0,
      "reward": 0.037109375,
      "reward_std": 0.09055596590042114,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 1182
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2009.0,
      "completions/mean_length": 1759.21484375,
      "completions/mean_terminated_length": 794.9661254882812,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.403857642741316,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 1.8903023179523921,
      "learning_rate": 7.703413115592282e-07,
      "loss": 0.0093,
      "num_tokens": 1150004734.0,
      "reward": 0.04296875,
      "reward_std": 0.0801548957824707,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 1183
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.748046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1875.0,
      "completions/mean_length": 1735.970703125,
      "completions/mean_terminated_length": 809.5581665039062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.40419902705470684,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 14.877807766225922,
      "learning_rate": 7.69873711748553e-07,
      "loss": 0.0089,
      "num_tokens": 1150972911.0,
      "reward": 0.05859375,
      "reward_std": 0.1154462918639183,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 1184
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.80078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1984.0,
      "completions/mean_length": 1794.025390625,
      "completions/mean_terminated_length": 773.7059326171875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.40454041136809765,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 32.604778904381504,
      "learning_rate": 7.694057998673597e-07,
      "loss": 0.0031,
      "num_tokens": 1151975052.0,
      "reward": 0.0390625,
      "reward_std": 0.072782501578331,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 1185
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.798828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1990.0,
      "completions/mean_length": 1805.548828125,
      "completions/mean_terminated_length": 842.8058471679688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4048817956814884,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 4.141016035438558,
      "learning_rate": 7.68937576579763e-07,
      "loss": 0.0168,
      "num_tokens": 1152978485.0,
      "reward": 0.029296875,
      "reward_std": 0.05193261057138443,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 1186
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1709.0,
      "completions/mean_length": 1694.248046875,
      "completions/mean_terminated_length": 675.8712158203125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4052231799948792,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 13.324399305135795,
      "learning_rate": 7.684690425503208e-07,
      "loss": 0.0106,
      "num_tokens": 1153925444.0,
      "reward": 0.08984375,
      "reward_std": 0.1293414831161499,
      "rewards/accuracy_reward/mean": 0.08984375,
      "rewards/accuracy_reward/std": 0.2862374484539032,
      "step": 1187
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.82421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1966.0,
      "completions/mean_length": 1824.5625,
      "completions/mean_terminated_length": 776.888916015625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.40556456430827004,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 18.18535227366748,
      "learning_rate": 7.680001984440312e-07,
      "loss": 0.0125,
      "num_tokens": 1154932276.0,
      "reward": 0.025390625,
      "reward_std": 0.06354551762342453,
      "rewards/accuracy_reward/mean": 0.02708333358168602,
      "rewards/accuracy_reward/std": 0.16249564290046692,
      "step": 1188
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1817.0,
      "completions/mean_length": 1858.6015625,
      "completions/mean_terminated_length": 621.941162109375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.40590594862166085,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 8.118619357088836,
      "learning_rate": 7.675310449263326e-07,
      "loss": 0.0221,
      "num_tokens": 1155963352.0,
      "reward": 0.046875,
      "reward_std": 0.06794346868991852,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 1189
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.87109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2036.0,
      "completions/mean_length": 1896.361328125,
      "completions/mean_terminated_length": 871.6515502929688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4062473329350516,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 12.603861080594282,
      "learning_rate": 7.670615826631027e-07,
      "loss": 0.0186,
      "num_tokens": 1157003121.0,
      "reward": 0.05078125,
      "reward_std": 0.054249756038188934,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 1190
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.83984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2004.0,
      "completions/mean_length": 1841.23828125,
      "completions/mean_terminated_length": 757.0,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4065887172484424,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 10.359905530122068,
      "learning_rate": 7.665918123206572e-07,
      "loss": 0.0193,
      "num_tokens": 1158016075.0,
      "reward": 0.03515625,
      "reward_std": 0.06233368441462517,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 1191
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1837.0,
      "completions/mean_length": 1881.796875,
      "completions/mean_terminated_length": 629.7333374023438,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.40693010156183324,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 11.872453086380027,
      "learning_rate": 7.661217345657495e-07,
      "loss": 0.0091,
      "num_tokens": 1159053523.0,
      "reward": 0.025390625,
      "reward_std": 0.06661957502365112,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 1192
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1919.0,
      "completions/mean_length": 1769.171875,
      "completions/mean_terminated_length": 620.3999633789062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.40727148587522405,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 7.550368853248177,
      "learning_rate": 7.656513500655688e-07,
      "loss": 0.0157,
      "num_tokens": 1160033131.0,
      "reward": 0.03515625,
      "reward_std": 0.06233368441462517,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 1193
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.87890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1891.80078125,
      "completions/mean_terminated_length": 758.0967407226562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4076128701886148,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 13.799740948957593,
      "learning_rate": 7.651806594877397e-07,
      "loss": 0.0097,
      "num_tokens": 1161074389.0,
      "reward": 0.04296875,
      "reward_std": 0.07014063745737076,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 1194
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.837890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2011.0,
      "completions/mean_length": 1838.001953125,
      "completions/mean_terminated_length": 752.59033203125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4079542545020056,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 6.285437921261953,
      "learning_rate": 7.647096635003216e-07,
      "loss": 0.0225,
      "num_tokens": 1162098710.0,
      "reward": 0.021484375,
      "reward_std": 0.06519509106874466,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 1195
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1877.72265625,
      "completions/mean_terminated_length": 765.9117431640625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.40829563881539643,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 11.30567212016571,
      "learning_rate": 7.642383627718072e-07,
      "loss": 0.0024,
      "num_tokens": 1163137304.0,
      "reward": 0.02734375,
      "reward_std": 0.06640692055225372,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 1196
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.81640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1845.0,
      "completions/mean_length": 1798.380859375,
      "completions/mean_terminated_length": 688.372314453125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.40863702312878725,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 17.882567851948014,
      "learning_rate": 7.637667579711215e-07,
      "loss": 0.0017,
      "num_tokens": 1164143275.0,
      "reward": 0.048828125,
      "reward_std": 0.09448197484016418,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 1197
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1957.0,
      "completions/mean_length": 1824.7734375,
      "completions/mean_terminated_length": 749.227294921875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.408978407442178,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 21.30616430554781,
      "learning_rate": 7.632948497676213e-07,
      "loss": -0.0036,
      "num_tokens": 1165153223.0,
      "reward": 0.04296875,
      "reward_std": 0.050948236137628555,
      "rewards/accuracy_reward/mean": 0.04435483738780022,
      "rewards/accuracy_reward/std": 0.2060900777578354,
      "step": 1198
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.865234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1735.0,
      "completions/mean_length": 1867.1796875,
      "completions/mean_terminated_length": 706.2608642578125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4093197917555688,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 10.651026897248558,
      "learning_rate": 7.62822638831094e-07,
      "loss": 0.0139,
      "num_tokens": 1166178675.0,
      "reward": 0.03515625,
      "reward_std": 0.08345640450716019,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 1199
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.826171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2007.0,
      "completions/mean_length": 1781.787109375,
      "completions/mean_terminated_length": 516.528076171875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.40966117606895963,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 4.70211317942562,
      "learning_rate": 7.623501258317567e-07,
      "loss": 0.009,
      "num_tokens": 1167171110.0,
      "reward": 0.025390625,
      "reward_std": 0.04863205552101135,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 1200
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.841796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1875.0,
      "completions/mean_length": 1849.70703125,
      "completions/mean_terminated_length": 794.5925903320312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.41000256038235044,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 20.545322889086556,
      "learning_rate": 7.618773114402554e-07,
      "loss": 0.0146,
      "num_tokens": 1168192336.0,
      "reward": 0.056640625,
      "reward_std": 0.08483455330133438,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 1201
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.896484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1974.0,
      "completions/mean_length": 1916.453125,
      "completions/mean_terminated_length": 777.2075805664062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.41034394469574126,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 2.702355764053578,
      "learning_rate": 7.61404196327663e-07,
      "loss": 0.0067,
      "num_tokens": 1169249848.0,
      "reward": 0.009765625,
      "reward_std": 0.029160313308238983,
      "rewards/accuracy_reward/mean": 0.009765625,
      "rewards/accuracy_reward/std": 0.09843364357948303,
      "step": 1202
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.88671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1936.0,
      "completions/mean_length": 1914.017578125,
      "completions/mean_terminated_length": 865.2586059570312,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.410685329009132,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 3.4252063974196933,
      "learning_rate": 7.609307811654804e-07,
      "loss": 0.0067,
      "num_tokens": 1170311105.0,
      "reward": 0.013671875,
      "reward_std": 0.03889618441462517,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "step": 1203
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.85546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1704.0,
      "completions/mean_length": 1833.22265625,
      "completions/mean_terminated_length": 561.9729614257812,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.41102671332252283,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 13.684863938742346,
      "learning_rate": 7.604570666256339e-07,
      "loss": 0.0065,
      "num_tokens": 1171328355.0,
      "reward": 0.025390625,
      "reward_std": 0.06409074366092682,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 1204
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.798828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 1783.40625,
      "completions/mean_terminated_length": 732.7378540039062,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.41136809763591364,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 37.810948958138965,
      "learning_rate": 7.599830533804741e-07,
      "loss": 0.0129,
      "num_tokens": 1172319763.0,
      "reward": 0.0703125,
      "reward_std": 0.08126020431518555,
      "rewards/accuracy_reward/mean": 0.07258064299821854,
      "rewards/accuracy_reward/std": 0.25970885157585144,
      "step": 1205
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.849609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 1840.693359375,
      "completions/mean_terminated_length": 669.5454711914062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.41170948194930446,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 7.747585638411618,
      "learning_rate": 7.595087421027767e-07,
      "loss": 0.0052,
      "num_tokens": 1173338854.0,
      "reward": 0.021484375,
      "reward_std": 0.06519509106874466,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 1206
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.826171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1948.0,
      "completions/mean_length": 1808.953125,
      "completions/mean_terminated_length": 672.8090209960938,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4120508662626952,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 8.741608849899395,
      "learning_rate": 7.590341334657395e-07,
      "loss": 0.0041,
      "num_tokens": 1174343806.0,
      "reward": 0.015625,
      "reward_std": 0.04670868441462517,
      "rewards/accuracy_reward/mean": 0.015625,
      "rewards/accuracy_reward/std": 0.12414088100194931,
      "step": 1207
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.818359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1785.0,
      "completions/mean_length": 1798.60546875,
      "completions/mean_terminated_length": 674.9892578125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.41239225057608603,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 45.23357192343699,
      "learning_rate": 7.585592281429828e-07,
      "loss": 0.0115,
      "num_tokens": 1175344868.0,
      "reward": 0.080078125,
      "reward_std": 0.08830691128969193,
      "rewards/accuracy_reward/mean": 0.080078125,
      "rewards/accuracy_reward/std": 0.271679550409317,
      "step": 1208
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.826171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2022.0,
      "completions/mean_length": 1834.201171875,
      "completions/mean_terminated_length": 818.0562133789062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.41273363488947684,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 14.144580254330286,
      "learning_rate": 7.580840268085477e-07,
      "loss": 0.0082,
      "num_tokens": 1176360635.0,
      "reward": 0.046875,
      "reward_std": 0.08461953699588776,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 1209
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.861328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1986.0,
      "completions/mean_length": 1869.185546875,
      "completions/mean_terminated_length": 758.5211181640625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.41307501920286765,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 6.035850002135212,
      "learning_rate": 7.576085301368955e-07,
      "loss": 0.0021,
      "num_tokens": 1177387706.0,
      "reward": 0.01953125,
      "reward_std": 0.04318207502365112,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 1210
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.810546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1948.0,
      "completions/mean_length": 1800.435546875,
      "completions/mean_terminated_length": 741.7937622070312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4134164035162584,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 27.616297749082623,
      "learning_rate": 7.571327388029071e-07,
      "loss": 0.0109,
      "num_tokens": 1178396137.0,
      "reward": 0.0546875,
      "reward_std": 0.05507335811853409,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 1211
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.865234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1960.0,
      "completions/mean_length": 1854.47265625,
      "completions/mean_terminated_length": 611.9710083007812,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4137577878296492,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 15.147621101863834,
      "learning_rate": 7.566566534818809e-07,
      "loss": 0.0155,
      "num_tokens": 1179423995.0,
      "reward": 0.03515625,
      "reward_std": 0.0720367580652237,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 1212
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.841796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1960.0,
      "completions/mean_length": 1846.283203125,
      "completions/mean_terminated_length": 773.6049194335938,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.41409917214304004,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 8.862783353806183,
      "learning_rate": 7.561802748495332e-07,
      "loss": 0.0094,
      "num_tokens": 1180446972.0,
      "reward": 0.017578125,
      "reward_std": 0.02394993044435978,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "step": 1213
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.861328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1891.0,
      "completions/mean_length": 1863.234375,
      "completions/mean_terminated_length": 715.6055908203125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.41444055645643085,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 12.062186271184311,
      "learning_rate": 7.557036035819963e-07,
      "loss": 0.0064,
      "num_tokens": 1181476308.0,
      "reward": 0.017578125,
      "reward_std": 0.04461899772286415,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "step": 1214
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1867.73828125,
      "completions/mean_terminated_length": 766.138916015625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.4147819407698216,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 4.686802188307055,
      "learning_rate": 7.552266403558176e-07,
      "loss": 0.0223,
      "num_tokens": 1182504462.0,
      "reward": 0.03125,
      "reward_std": 0.0628461092710495,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 1215
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.90625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1837.0,
      "completions/mean_length": 1901.052734375,
      "completions/mean_terminated_length": 480.5625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4151233250832124,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 11.723084811874012,
      "learning_rate": 7.547493858479595e-07,
      "loss": 0.0134,
      "num_tokens": 1183556329.0,
      "reward": 0.021484375,
      "reward_std": 0.05287160724401474,
      "rewards/accuracy_reward/mean": 0.02217741869390011,
      "rewards/accuracy_reward/std": 0.14740891754627228,
      "step": 1216
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.83984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1961.0,
      "completions/mean_length": 1848.107421875,
      "completions/mean_terminated_length": 799.8901977539062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.41546470939660324,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 19.674664914925824,
      "learning_rate": 7.542718407357973e-07,
      "loss": 0.0059,
      "num_tokens": 1184576592.0,
      "reward": 0.033203125,
      "reward_std": 0.048086829483509064,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 1217
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.84375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1828.0,
      "completions/mean_length": 1822.568359375,
      "completions/mean_terminated_length": 605.2374877929688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.41580609370999405,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 16.289511590722295,
      "learning_rate": 7.537940056971192e-07,
      "loss": 0.0078,
      "num_tokens": 1185594819.0,
      "reward": 0.044921875,
      "reward_std": 0.07344770431518555,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 1218
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.845703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1948.0,
      "completions/mean_length": 1841.654296875,
      "completions/mean_terminated_length": 710.6708984375,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.4161474780233848,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 3.947440805747222,
      "learning_rate": 7.533158814101242e-07,
      "loss": 0.011,
      "num_tokens": 1186623634.0,
      "reward": 0.01171875,
      "reward_std": 0.04192390665411949,
      "rewards/accuracy_reward/mean": 0.012096773833036423,
      "rewards/accuracy_reward/std": 0.10942844301462173,
      "step": 1219
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.83203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1993.0,
      "completions/mean_length": 1836.693359375,
      "completions/mean_terminated_length": 789.9883422851562,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4164888623367756,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 13.528352408750054,
      "learning_rate": 7.528374685534227e-07,
      "loss": 0.0024,
      "num_tokens": 1187642677.0,
      "reward": 0.01953125,
      "reward_std": 0.04907120764255524,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 1220
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2009.0,
      "completions/mean_length": 1737.8671875,
      "completions/mean_terminated_length": 724.7667236328125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.41683024665016644,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 23.163101537814786,
      "learning_rate": 7.52358767806034e-07,
      "loss": 0.0037,
      "num_tokens": 1188609505.0,
      "reward": 0.044921875,
      "reward_std": 0.08505964279174805,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 1221
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2005.0,
      "completions/mean_length": 1792.892578125,
      "completions/mean_terminated_length": 741.8499755859375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.41717163096355725,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 77.34105892107726,
      "learning_rate": 7.518797798473865e-07,
      "loss": 0.0164,
      "num_tokens": 1189602250.0,
      "reward": 0.056640625,
      "reward_std": 0.07763300091028214,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 1222
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.783203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1986.0,
      "completions/mean_length": 1770.810546875,
      "completions/mean_terminated_length": 769.909912109375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.417513015276948,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 55.796782244880035,
      "learning_rate": 7.514005053573156e-07,
      "loss": 0.0149,
      "num_tokens": 1190588585.0,
      "reward": 0.041015625,
      "reward_std": 0.06150216609239578,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 1223
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.794921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2033.0,
      "completions/mean_length": 1753.103515625,
      "completions/mean_terminated_length": 610.028564453125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.4178543995903388,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 22.08431494552066,
      "learning_rate": 7.509209450160639e-07,
      "loss": 0.0133,
      "num_tokens": 1191558974.0,
      "reward": 0.0234375,
      "reward_std": 0.040274329483509064,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 1224
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.740234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1975.0,
      "completions/mean_length": 1704.06640625,
      "completions/mean_terminated_length": 723.9849853515625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.41819578390372963,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 5.392557113231142,
      "learning_rate": 7.5044109950428e-07,
      "loss": 0.0298,
      "num_tokens": 1192506496.0,
      "reward": 0.0390625,
      "reward_std": 0.08792105317115784,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 1225
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.74609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2010.0,
      "completions/mean_length": 1712.748046875,
      "completions/mean_terminated_length": 727.6231079101562,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.41853716821712045,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 16.80782432403552,
      "learning_rate": 7.499609695030163e-07,
      "loss": 0.0158,
      "num_tokens": 1193461759.0,
      "reward": 0.02734375,
      "reward_std": 0.0580955371260643,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 1226
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.732421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1997.0,
      "completions/mean_length": 1707.0234375,
      "completions/mean_terminated_length": 773.6934204101562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4188785525305112,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 7.149140957121258,
      "learning_rate": 7.494805556937299e-07,
      "loss": 0.019,
      "num_tokens": 1194413115.0,
      "reward": 0.0390625,
      "reward_std": 0.05864076316356659,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 1227
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.72265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 1695.64453125,
      "completions/mean_terminated_length": 777.5352172851562,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.419219936843902,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 4.5494958734166975,
      "learning_rate": 7.4899985875828e-07,
      "loss": 0.0328,
      "num_tokens": 1195363493.0,
      "reward": 0.0625,
      "reward_std": 0.1155911535024643,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 1228
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2020.0,
      "completions/mean_length": 1670.998046875,
      "completions/mean_terminated_length": 778.0986938476562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.41956132115729283,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 14.959410518513367,
      "learning_rate": 7.485188793789284e-07,
      "loss": 0.0277,
      "num_tokens": 1196298580.0,
      "reward": 0.076171875,
      "reward_std": 0.12836134433746338,
      "rewards/accuracy_reward/mean": 0.076171875,
      "rewards/accuracy_reward/std": 0.26553234457969666,
      "step": 1229
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.701171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1953.0,
      "completions/mean_length": 1670.828125,
      "completions/mean_terminated_length": 785.830078125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.41990270547068365,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 11.215384823305884,
      "learning_rate": 7.480376182383371e-07,
      "loss": 0.0387,
      "num_tokens": 1197235148.0,
      "reward": 0.0390625,
      "reward_std": 0.103924959897995,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 1230
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2009.0,
      "completions/mean_length": 1716.232421875,
      "completions/mean_terminated_length": 884.5410766601562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4202440897840744,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 22.159552435547813,
      "learning_rate": 7.47556076019568e-07,
      "loss": 0.0341,
      "num_tokens": 1198189891.0,
      "reward": 0.099609375,
      "reward_std": 0.12160685658454895,
      "rewards/accuracy_reward/mean": 0.099609375,
      "rewards/accuracy_reward/std": 0.29977133870124817,
      "step": 1231
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.705078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1975.0,
      "completions/mean_length": 1648.89453125,
      "completions/mean_terminated_length": 694.74169921875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4205854740974652,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 17.960560402574,
      "learning_rate": 7.470742534060827e-07,
      "loss": 0.0076,
      "num_tokens": 1199124349.0,
      "reward": 0.099609375,
      "reward_std": 0.11199785768985748,
      "rewards/accuracy_reward/mean": 0.10282257944345474,
      "rewards/accuracy_reward/std": 0.30403366684913635,
      "step": 1232
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2016.0,
      "completions/mean_length": 1719.783203125,
      "completions/mean_terminated_length": 647.6083374023438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.42092685841085603,
      "frac_reward_zero_std": 0.96875,
      "grad_norm": 27.99823521103077,
      "learning_rate": 7.465921510817401e-07,
      "loss": -0.0038,
      "num_tokens": 1200081262.0,
      "reward": 0.005859375,
      "reward_std": 0.012597277760505676,
      "rewards/accuracy_reward/mean": 0.005859375,
      "rewards/accuracy_reward/std": 0.07639661431312561,
      "step": 1233
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1964.0,
      "completions/mean_length": 1695.138671875,
      "completions/mean_terminated_length": 827.2905883789062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.42126824272424684,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 12.443622654828356,
      "learning_rate": 7.461097697307962e-07,
      "loss": 0.0346,
      "num_tokens": 1201022805.0,
      "reward": 0.052734375,
      "reward_std": 0.08890638500452042,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 1234
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.79296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2015.0,
      "completions/mean_length": 1779.7890625,
      "completions/mean_terminated_length": 752.4906005859375,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.4216096270376376,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 9.760850151810333,
      "learning_rate": 7.456271100379031e-07,
      "loss": 0.0191,
      "num_tokens": 1202018265.0,
      "reward": 0.033203125,
      "reward_std": 0.062347229570150375,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 1235
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.751953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1988.0,
      "completions/mean_length": 1735.333984375,
      "completions/mean_terminated_length": 787.93701171875,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.4219510113510284,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 26.717921857520043,
      "learning_rate": 7.451441726881082e-07,
      "loss": 0.0011,
      "num_tokens": 1203000644.0,
      "reward": 0.109375,
      "reward_std": 0.15267950296401978,
      "rewards/accuracy_reward/mean": 0.109375,
      "rewards/accuracy_reward/std": 0.31241437792778015,
      "step": 1236
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1898.0,
      "completions/mean_length": 1654.96484375,
      "completions/mean_terminated_length": 758.0384521484375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.42229239566441923,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 8.933408814328024,
      "learning_rate": 7.446609583668522e-07,
      "loss": 0.0249,
      "num_tokens": 1203924962.0,
      "reward": 0.04296875,
      "reward_std": 0.09302500635385513,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 1237
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.654296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1807.0,
      "completions/mean_length": 1587.53515625,
      "completions/mean_terminated_length": 716.0338745117188,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.42263377997781004,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 16.089814264083646,
      "learning_rate": 7.441774677599699e-07,
      "loss": 0.0158,
      "num_tokens": 1204816356.0,
      "reward": 0.0625,
      "reward_std": 0.11647041887044907,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 1238
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.724609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1956.0,
      "completions/mean_length": 1680.64453125,
      "completions/mean_terminated_length": 714.0567016601562,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4229751642912008,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 9.241428258729558,
      "learning_rate": 7.436937015536876e-07,
      "loss": 0.01,
      "num_tokens": 1205752974.0,
      "reward": 0.04296875,
      "reward_std": 0.06640692055225372,
      "rewards/accuracy_reward/mean": 0.04435483738780022,
      "rewards/accuracy_reward/std": 0.2060900777578354,
      "step": 1239
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2033.0,
      "completions/mean_length": 1664.947265625,
      "completions/mean_terminated_length": 806.7152099609375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4233165486045916,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 18.900943510274026,
      "learning_rate": 7.432096604346231e-07,
      "loss": 0.0024,
      "num_tokens": 1206685235.0,
      "reward": 0.03125,
      "reward_std": 0.04847269132733345,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 1240
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.712890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1813.0,
      "completions/mean_length": 1680.84375,
      "completions/mean_terminated_length": 769.197265625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4236579329179824,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 17.89006391086748,
      "learning_rate": 7.427253450897844e-07,
      "loss": 0.0217,
      "num_tokens": 1207622259.0,
      "reward": 0.064453125,
      "reward_std": 0.10678637027740479,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 1241
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.685546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2020.0,
      "completions/mean_length": 1678.734375,
      "completions/mean_terminated_length": 873.689453125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.42399931723137324,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 13.237277116594331,
      "learning_rate": 7.422407562065678e-07,
      "loss": 0.0136,
      "num_tokens": 1208569067.0,
      "reward": 0.06640625,
      "reward_std": 0.10260801017284393,
      "rewards/accuracy_reward/mean": 0.06854838877916336,
      "rewards/accuracy_reward/std": 0.25293970108032227,
      "step": 1242
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.728515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1838.0,
      "completions/mean_length": 1696.89453125,
      "completions/mean_terminated_length": 754.7194213867188,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.424340701544764,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 15.79416028076382,
      "learning_rate": 7.417558944727593e-07,
      "loss": 0.0072,
      "num_tokens": 1209514645.0,
      "reward": 0.025390625,
      "reward_std": 0.06849661469459534,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 1243
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1991.0,
      "completions/mean_length": 1733.1796875,
      "completions/mean_terminated_length": 826.8787841796875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4246820858581548,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 10.403694749853384,
      "learning_rate": 7.412707605765313e-07,
      "loss": 0.0142,
      "num_tokens": 1210481121.0,
      "reward": 0.07421875,
      "reward_std": 0.13016510009765625,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "step": 1244
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1979.0,
      "completions/mean_length": 1673.142578125,
      "completions/mean_terminated_length": 817.69873046875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4250234701715456,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 19.931124308886883,
      "learning_rate": 7.407853552064425e-07,
      "loss": 0.0137,
      "num_tokens": 1211414586.0,
      "reward": 0.08984375,
      "reward_std": 0.12862853705883026,
      "rewards/accuracy_reward/mean": 0.08984375,
      "rewards/accuracy_reward/std": 0.2862374484539032,
      "step": 1245
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.755859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1987.0,
      "completions/mean_length": 1724.74609375,
      "completions/mean_terminated_length": 723.9520263671875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.42536485448493644,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 1.4162949704939196,
      "learning_rate": 7.40299679051437e-07,
      "loss": 0.0058,
      "num_tokens": 1212385672.0,
      "reward": 0.033203125,
      "reward_std": 0.026572702452540398,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 1246
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.74609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1960.0,
      "completions/mean_length": 1743.173828125,
      "completions/mean_terminated_length": 847.8692016601562,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4257062387983272,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 8.906044583163013,
      "learning_rate": 7.398137328008435e-07,
      "loss": 0.0106,
      "num_tokens": 1213358033.0,
      "reward": 0.048828125,
      "reward_std": 0.06645326316356659,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 1247
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.759765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1726.0,
      "completions/mean_length": 1738.42578125,
      "completions/mean_terminated_length": 759.3658447265625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.426047623111718,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 20.936963334428164,
      "learning_rate": 7.393275171443737e-07,
      "loss": 0.0019,
      "num_tokens": 1214330475.0,
      "reward": 0.078125,
      "reward_std": 0.08989076316356659,
      "rewards/accuracy_reward/mean": 0.078125,
      "rewards/accuracy_reward/std": 0.26863065361976624,
      "step": 1248
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1741.0,
      "completions/mean_length": 1724.72265625,
      "completions/mean_terminated_length": 734.3651123046875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4263890074251088,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 10.97586093646319,
      "learning_rate": 7.388410327721218e-07,
      "loss": 0.0041,
      "num_tokens": 1215285149.0,
      "reward": 0.037109375,
      "reward_std": 0.08082245290279388,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 1249
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1924.0,
      "completions/mean_length": 1684.994140625,
      "completions/mean_terminated_length": 841.1233520507812,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.42673039173849964,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 1.0239765281408597,
      "learning_rate": 7.383542803745632e-07,
      "loss": 0.0205,
      "num_tokens": 1216220250.0,
      "reward": 0.048828125,
      "reward_std": 0.07481793314218521,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 1250
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1994.0,
      "completions/mean_length": 1739.615234375,
      "completions/mean_terminated_length": 774.6693115234375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4270717760518904,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 16.13745280223482,
      "learning_rate": 7.378672606425542e-07,
      "loss": 0.0187,
      "num_tokens": 1217182597.0,
      "reward": 0.05078125,
      "reward_std": 0.10178203880786896,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 1251
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 1722.560546875,
      "completions/mean_terminated_length": 822.816162109375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4274131603652812,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 4.791048449799167,
      "learning_rate": 7.373799742673301e-07,
      "loss": 0.0253,
      "num_tokens": 1218136180.0,
      "reward": 0.044921875,
      "reward_std": 0.08110743016004562,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 1252
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1991.0,
      "completions/mean_length": 1700.58203125,
      "completions/mean_terminated_length": 566.0750122070312,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.427754544678672,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 33.165501688459926,
      "learning_rate": 7.36892421940505e-07,
      "loss": 0.0133,
      "num_tokens": 1219088462.0,
      "reward": 0.021484375,
      "reward_std": 0.05287160724401474,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 1253
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.779296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1809.0,
      "completions/mean_length": 1767.888671875,
      "completions/mean_terminated_length": 778.822998046875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.42809592899206284,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 16.834294690629708,
      "learning_rate": 7.364046043540699e-07,
      "loss": -0.0089,
      "num_tokens": 1220072645.0,
      "reward": 0.044921875,
      "reward_std": 0.09380322694778442,
      "rewards/accuracy_reward/mean": 0.0463709682226181,
      "rewards/accuracy_reward/std": 0.21049949526786804,
      "step": 1254
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 1720.9765625,
      "completions/mean_terminated_length": 629.0508422851562,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4284373133054536,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 8.686193658310177,
      "learning_rate": 7.359165222003927e-07,
      "loss": 0.0221,
      "num_tokens": 1221031993.0,
      "reward": 0.037109375,
      "reward_std": 0.06508206576108932,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 1255
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1529.0,
      "completions/mean_length": 1723.244140625,
      "completions/mean_terminated_length": 685.0901489257812,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4287786976188444,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 19.95496810582337,
      "learning_rate": 7.354281761722168e-07,
      "loss": 0.026,
      "num_tokens": 1221988486.0,
      "reward": 0.068359375,
      "reward_std": 0.10282756388187408,
      "rewards/accuracy_reward/mean": 0.068359375,
      "rewards/accuracy_reward/std": 0.25260838866233826,
      "step": 1256
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1997.0,
      "completions/mean_length": 1651.451171875,
      "completions/mean_terminated_length": 729.6038818359375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.4291200819322352,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 2.6827198132044954,
      "learning_rate": 7.349395669626601e-07,
      "loss": 0.0257,
      "num_tokens": 1222908349.0,
      "reward": 0.03515625,
      "reward_std": 0.07053203880786896,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 1257
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.732421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1921.0,
      "completions/mean_length": 1728.71484375,
      "completions/mean_terminated_length": 854.7590942382812,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.42946146624562603,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 14.063289619289382,
      "learning_rate": 7.344506952652141e-07,
      "loss": 0.0034,
      "num_tokens": 1223869643.0,
      "reward": 0.02734375,
      "reward_std": 0.0580955371260643,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 1258
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.794921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1791.0,
      "completions/mean_length": 1761.693359375,
      "completions/mean_terminated_length": 651.914306640625,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.4298028505590168,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 7.025525419403623,
      "learning_rate": 7.339615617737427e-07,
      "loss": 0.0194,
      "num_tokens": 1224847694.0,
      "reward": 0.037109375,
      "reward_std": 0.07355976104736328,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 1259
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2042.0,
      "completions/mean_length": 1653.447265625,
      "completions/mean_terminated_length": 816.2255859375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4301442348724076,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 16.153004133961364,
      "learning_rate": 7.334721671824814e-07,
      "loss": 0.0042,
      "num_tokens": 1225775683.0,
      "reward": 0.041015625,
      "reward_std": 0.08648413419723511,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 1260
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.716796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1973.0,
      "completions/mean_length": 1673.494140625,
      "completions/mean_terminated_length": 725.6068725585938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4304856191857984,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 6.497636929099315,
      "learning_rate": 7.329825121860363e-07,
      "loss": 0.013,
      "num_tokens": 1226706960.0,
      "reward": 0.025390625,
      "reward_std": 0.030584799125790596,
      "rewards/accuracy_reward/mean": 0.026209676638245583,
      "rewards/accuracy_reward/std": 0.1599196344614029,
      "step": 1261
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.751953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1930.0,
      "completions/mean_length": 1702.712890625,
      "completions/mean_terminated_length": 655.9763793945312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.43082700349918923,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 9.476483911043664,
      "learning_rate": 7.32492597479383e-07,
      "loss": 0.0055,
      "num_tokens": 1227656477.0,
      "reward": 0.037109375,
      "reward_std": 0.06860867142677307,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 1262
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.701171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1986.0,
      "completions/mean_length": 1664.287109375,
      "completions/mean_terminated_length": 763.941162109375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.43116838781258,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 8.427171632744047,
      "learning_rate": 7.32002423757866e-07,
      "loss": -0.0113,
      "num_tokens": 1228581536.0,
      "reward": 0.05078125,
      "reward_std": 0.06888246536254883,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 1263
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.728515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2021.0,
      "completions/mean_length": 1708.8203125,
      "completions/mean_terminated_length": 798.6475219726562,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4315097721259708,
      "frac_reward_zero_std": 0.96875,
      "grad_norm": 1.1673621389787057,
      "learning_rate": 7.315119917171972e-07,
      "loss": -0.003,
      "num_tokens": 1229543076.0,
      "reward": 0.001953125,
      "reward_std": 0.0078125,
      "rewards/accuracy_reward/mean": 0.001953125,
      "rewards/accuracy_reward/std": 0.04419417306780815,
      "step": 1264
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.755859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1888.0,
      "completions/mean_length": 1700.34765625,
      "completions/mean_terminated_length": 624.0160522460938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4318511564393616,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 5.252421314944644,
      "learning_rate": 7.310213020534549e-07,
      "loss": 0.0118,
      "num_tokens": 1230493686.0,
      "reward": 0.07421875,
      "reward_std": 0.08709507435560226,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "step": 1265
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.740234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 1708.05078125,
      "completions/mean_terminated_length": 739.3233032226562,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.43219254075275243,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 7.853572477672882,
      "learning_rate": 7.30530355463084e-07,
      "loss": 0.0117,
      "num_tokens": 1231449872.0,
      "reward": 0.02734375,
      "reward_std": 0.06327171623706818,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 1266
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2006.0,
      "completions/mean_length": 1758.078125,
      "completions/mean_terminated_length": 831.2786254882812,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4325339250661432,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 11.497914350827873,
      "learning_rate": 7.300391526428928e-07,
      "loss": 0.0148,
      "num_tokens": 1232435912.0,
      "reward": 0.064453125,
      "reward_std": 0.08341006934642792,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 1267
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1835.0,
      "completions/mean_length": 1661.224609375,
      "completions/mean_terminated_length": 633.5071411132812,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.432875309379534,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 11.65346283302957,
      "learning_rate": 7.295476942900539e-07,
      "loss": 0.0108,
      "num_tokens": 1233364491.0,
      "reward": 0.1171875,
      "reward_std": 0.11482647061347961,
      "rewards/accuracy_reward/mean": 0.1171875,
      "rewards/accuracy_reward/std": 0.32195815443992615,
      "step": 1268
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1861.0,
      "completions/mean_length": 1731.416015625,
      "completions/mean_terminated_length": 781.6640625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.4332166936929248,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 43.932433158645814,
      "learning_rate": 7.290559811021029e-07,
      "loss": 0.0187,
      "num_tokens": 1234323680.0,
      "reward": 0.08984375,
      "reward_std": 0.09908141195774078,
      "rewards/accuracy_reward/mean": 0.08984375,
      "rewards/accuracy_reward/std": 0.2862374484539032,
      "step": 1269
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.837890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2025.0,
      "completions/mean_length": 1838.9375,
      "completions/mean_terminated_length": 758.3613891601562,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.43355807800631563,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 11.863612608984905,
      "learning_rate": 7.285640137769363e-07,
      "loss": 0.0032,
      "num_tokens": 1235336672.0,
      "reward": 0.068359375,
      "reward_std": 0.09912915527820587,
      "rewards/accuracy_reward/mean": 0.068359375,
      "rewards/accuracy_reward/std": 0.25260838866233826,
      "step": 1270
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.798828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2001.0,
      "completions/mean_length": 1738.66015625,
      "completions/mean_terminated_length": 510.3106994628906,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.4338994623197064,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 8.047452249912592,
      "learning_rate": 7.280717930128119e-07,
      "loss": 0.0192,
      "num_tokens": 1236304130.0,
      "reward": 0.03515625,
      "reward_std": 0.06409768760204315,
      "rewards/accuracy_reward/mean": 0.03629032149910927,
      "rewards/accuracy_reward/std": 0.1872003972530365,
      "step": 1271
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.72265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2033.0,
      "completions/mean_length": 1688.8984375,
      "completions/mean_terminated_length": 753.2112426757812,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4342408466330972,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 6.659324568222564,
      "learning_rate": 7.275793195083474e-07,
      "loss": -0.0073,
      "num_tokens": 1237247678.0,
      "reward": 0.0390625,
      "reward_std": 0.06469620764255524,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 1272
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.802734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1904.0,
      "completions/mean_length": 1781.5390625,
      "completions/mean_terminated_length": 697.2277221679688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.434582230946488,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 13.547418837313424,
      "learning_rate": 7.270865939625183e-07,
      "loss": 0.0221,
      "num_tokens": 1238231730.0,
      "reward": 0.041015625,
      "reward_std": 0.08463403582572937,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 1273
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.716796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2022.0,
      "completions/mean_length": 1683.37109375,
      "completions/mean_terminated_length": 760.4827270507812,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.4349236152598788,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 19.91744369390336,
      "learning_rate": 7.265936170746588e-07,
      "loss": 0.0158,
      "num_tokens": 1239165424.0,
      "reward": 0.072265625,
      "reward_std": 0.10982763767242432,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 1274
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.79296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2033.0,
      "completions/mean_length": 1791.04296875,
      "completions/mean_terminated_length": 806.8490600585938,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4352649995732696,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 8.707594556465457,
      "learning_rate": 7.261003895444593e-07,
      "loss": 0.008,
      "num_tokens": 1240150374.0,
      "reward": 0.0390625,
      "reward_std": 0.06299237906932831,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 1275
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2028.0,
      "completions/mean_length": 1651.208984375,
      "completions/mean_terminated_length": 656.513671875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4356063838866604,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 8.7644118097855,
      "learning_rate": 7.256069120719661e-07,
      "loss": 0.0136,
      "num_tokens": 1241071825.0,
      "reward": 0.033203125,
      "reward_std": 0.0715966522693634,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 1276
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.775390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1924.0,
      "completions/mean_length": 1749.248046875,
      "completions/mean_terminated_length": 717.904296875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4359477682000512,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 55.99714848488093,
      "learning_rate": 7.2511318535758e-07,
      "loss": 0.009,
      "num_tokens": 1242036880.0,
      "reward": 0.03125,
      "reward_std": 0.0752037987112999,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 1277
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.767578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1884.0,
      "completions/mean_length": 1742.1484375,
      "completions/mean_terminated_length": 732.0672607421875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.436289152513442,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 31.14684390893561,
      "learning_rate": 7.246192101020559e-07,
      "loss": -0.0057,
      "num_tokens": 1243006636.0,
      "reward": 0.0234375,
      "reward_std": 0.054249756038188934,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 1278
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.791015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1823.0,
      "completions/mean_length": 1774.8046875,
      "completions/mean_terminated_length": 740.7476196289062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4366305368268328,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 3.803038812629198,
      "learning_rate": 7.241249870065014e-07,
      "loss": 0.0116,
      "num_tokens": 1243994904.0,
      "reward": 0.0390625,
      "reward_std": 0.07477159798145294,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 1279
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.775390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1779.265625,
      "completions/mean_terminated_length": 851.5477905273438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4369719211402236,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 1.2772127652322518,
      "learning_rate": 7.236305167723758e-07,
      "loss": 0.0086,
      "num_tokens": 1244976624.0,
      "reward": 0.015625,
      "reward_std": 0.03125,
      "rewards/accuracy_reward/mean": 0.015625,
      "rewards/accuracy_reward/std": 0.12414088100194931,
      "step": 1280
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.724609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2035.0,
      "completions/mean_length": 1697.666015625,
      "completions/mean_terminated_length": 776.2269287109375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4373133054536144,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 24.919131172778428,
      "learning_rate": 7.231358001014891e-07,
      "loss": 0.0054,
      "num_tokens": 1245933365.0,
      "reward": 0.046875,
      "reward_std": 0.10409364104270935,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 1281
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.755859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1970.0,
      "completions/mean_length": 1702.552734375,
      "completions/mean_terminated_length": 633.0480346679688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4376546897670052,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 2.613020452400665,
      "learning_rate": 7.22640837696001e-07,
      "loss": 0.0115,
      "num_tokens": 1246879088.0,
      "reward": 0.0390625,
      "reward_std": 0.036034777760505676,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 1282
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.759765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2036.0,
      "completions/mean_length": 1734.173828125,
      "completions/mean_terminated_length": 741.6666259765625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.437996074080396,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 5.3903618508876585,
      "learning_rate": 7.221456302584202e-07,
      "loss": -0.0086,
      "num_tokens": 1247841753.0,
      "reward": 0.02734375,
      "reward_std": 0.02960042469203472,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 1283
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.775390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2007.0,
      "completions/mean_length": 1743.3984375,
      "completions/mean_terminated_length": 692.3043212890625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4383374583937868,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 11.52315698998046,
      "learning_rate": 7.216501784916032e-07,
      "loss": 0.0037,
      "num_tokens": 1248824757.0,
      "reward": 0.056640625,
      "reward_std": 0.09582037478685379,
      "rewards/accuracy_reward/mean": 0.058467742055654526,
      "rewards/accuracy_reward/std": 0.23486268520355225,
      "step": 1284
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.783203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1857.0,
      "completions/mean_length": 1774.2578125,
      "completions/mean_terminated_length": 785.3333740234375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.4386788427071776,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 7.615453911609256,
      "learning_rate": 7.211544830987533e-07,
      "loss": 0.0264,
      "num_tokens": 1249804857.0,
      "reward": 0.080078125,
      "reward_std": 0.06900903582572937,
      "rewards/accuracy_reward/mean": 0.08266129344701767,
      "rewards/accuracy_reward/std": 0.2756475806236267,
      "step": 1285
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.744140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2026.0,
      "completions/mean_length": 1718.25390625,
      "completions/mean_terminated_length": 759.2213745117188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4390202270205684,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 2.6303740504701167,
      "learning_rate": 7.206585447834188e-07,
      "loss": 0.015,
      "num_tokens": 1250763819.0,
      "reward": 0.015625,
      "reward_std": 0.03344620764255524,
      "rewards/accuracy_reward/mean": 0.015625,
      "rewards/accuracy_reward/std": 0.12414088100194931,
      "step": 1286
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.74609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1979.0,
      "completions/mean_length": 1716.93359375,
      "completions/mean_terminated_length": 744.107666015625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.4393616113339592,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 27.32345565839729,
      "learning_rate": 7.201623642494943e-07,
      "loss": 0.0142,
      "num_tokens": 1251718889.0,
      "reward": 0.03125,
      "reward_std": 0.06464291363954544,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 1287
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.68359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1887.0,
      "completions/mean_length": 1624.458984375,
      "completions/mean_terminated_length": 709.4012451171875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.43970299564735,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 10.467390542335078,
      "learning_rate": 7.196659422012166e-07,
      "loss": 0.0139,
      "num_tokens": 1252629892.0,
      "reward": 0.041015625,
      "reward_std": 0.06949453055858612,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 1288
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.755859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1799.0,
      "completions/mean_length": 1715.859375,
      "completions/mean_terminated_length": 687.5520629882812,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4400443799607408,
      "frac_reward_zero_std": 0.96875,
      "grad_norm": 4.223577776104194,
      "learning_rate": 7.191692793431663e-07,
      "loss": -0.0037,
      "num_tokens": 1253584268.0,
      "reward": 0.0078125,
      "reward_std": 0.013975424692034721,
      "rewards/accuracy_reward/mean": 0.0078125,
      "rewards/accuracy_reward/std": 0.08812850713729858,
      "step": 1289
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1901.0,
      "completions/mean_length": 1726.337890625,
      "completions/mean_terminated_length": 698.07373046875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4403857642741316,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 18.847524940092285,
      "learning_rate": 7.186723763802654e-07,
      "loss": -0.0003,
      "num_tokens": 1254543929.0,
      "reward": 0.03515625,
      "reward_std": 0.07317391037940979,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 1290
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1936.0,
      "completions/mean_length": 1777.169921875,
      "completions/mean_terminated_length": 764.5925903320312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4407271485875224,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 18.98281924602311,
      "learning_rate": 7.181752340177769e-07,
      "loss": 0.0236,
      "num_tokens": 1255540240.0,
      "reward": 0.037109375,
      "reward_std": 0.07669497281312943,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 1291
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.716796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1986.0,
      "completions/mean_length": 1676.091796875,
      "completions/mean_terminated_length": 734.779296875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4410685329009132,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 25.14971518233946,
      "learning_rate": 7.176778529613035e-07,
      "loss": 0.0015,
      "num_tokens": 1256475679.0,
      "reward": 0.060546875,
      "reward_std": 0.11916647106409073,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 1292
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.751953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1929.0,
      "completions/mean_length": 1717.27734375,
      "completions/mean_terminated_length": 714.6929321289062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.441409917214304,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 3.058746303821399,
      "learning_rate": 7.171802339167864e-07,
      "loss": 0.0171,
      "num_tokens": 1257431677.0,
      "reward": 0.03125,
      "reward_std": 0.07300759106874466,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 1293
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.810546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1823.0,
      "completions/mean_length": 1787.72265625,
      "completions/mean_terminated_length": 674.1649169921875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.4417513015276948,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 8.208561661090513,
      "learning_rate": 7.16682377590505e-07,
      "loss": 0.0212,
      "num_tokens": 1258421727.0,
      "reward": 0.0390625,
      "reward_std": 0.05507335811853409,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 1294
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1972.0,
      "completions/mean_length": 1705.705078125,
      "completions/mean_terminated_length": 720.3106079101562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4420926858410856,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 11.184418638972135,
      "learning_rate": 7.161842846890751e-07,
      "loss": 0.0257,
      "num_tokens": 1259371464.0,
      "reward": 0.046875,
      "reward_std": 0.06657323241233826,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 1295
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.759765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1916.0,
      "completions/mean_length": 1738.138671875,
      "completions/mean_terminated_length": 758.5284423828125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4424340701544764,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 12.205306428292676,
      "learning_rate": 7.156859559194488e-07,
      "loss": 0.0112,
      "num_tokens": 1260339487.0,
      "reward": 0.037109375,
      "reward_std": 0.049345001578330994,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 1296
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1991.0,
      "completions/mean_length": 1735.060546875,
      "completions/mean_terminated_length": 690.1610107421875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4427754544678672,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 16.701905559492975,
      "learning_rate": 7.151873919889122e-07,
      "loss": 0.0259,
      "num_tokens": 1261303566.0,
      "reward": 0.037109375,
      "reward_std": 0.07085912674665451,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 1297
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.783203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2003.0,
      "completions/mean_length": 1759.666015625,
      "completions/mean_terminated_length": 718.0270385742188,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.443116838781258,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 8.70065445300811,
      "learning_rate": 7.146885936050861e-07,
      "loss": 0.0185,
      "num_tokens": 1262282035.0,
      "reward": 0.052734375,
      "reward_std": 0.07586899399757385,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 1298
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.732421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 1690.265625,
      "completions/mean_terminated_length": 711.065673828125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4434582230946488,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 7.899263490880784,
      "learning_rate": 7.141895614759232e-07,
      "loss": 0.0042,
      "num_tokens": 1263219835.0,
      "reward": 0.0390625,
      "reward_std": 0.07093241065740585,
      "rewards/accuracy_reward/mean": 0.04032257944345474,
      "rewards/accuracy_reward/std": 0.19691328704357147,
      "step": 1299
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1965.0,
      "completions/mean_length": 1700.80078125,
      "completions/mean_terminated_length": 740.8970336914062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4437996074080396,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 11.908078900546084,
      "learning_rate": 7.136902963097085e-07,
      "loss": 0.0285,
      "num_tokens": 1264169509.0,
      "reward": 0.03515625,
      "reward_std": 0.08587770164012909,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 1300
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.755859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2001.0,
      "completions/mean_length": 1739.671875,
      "completions/mean_terminated_length": 785.0880126953125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4441409917214304,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 25.492125440100207,
      "learning_rate": 7.131907988150575e-07,
      "loss": 0.0083,
      "num_tokens": 1265146541.0,
      "reward": 0.046875,
      "reward_std": 0.09319227933883667,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 1301
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.724609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1992.0,
      "completions/mean_length": 1703.83984375,
      "completions/mean_terminated_length": 798.28369140625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4444823760348212,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 48.39820377490189,
      "learning_rate": 7.126910697009156e-07,
      "loss": 0.0238,
      "num_tokens": 1266100603.0,
      "reward": 0.0546875,
      "reward_std": 0.09859496355056763,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 1302
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.697265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2009.0,
      "completions/mean_length": 1646.345703125,
      "completions/mean_terminated_length": 721.2451782226562,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.444823760348212,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 2.6046642307956733,
      "learning_rate": 7.121911096765571e-07,
      "loss": 0.0131,
      "num_tokens": 1267019276.0,
      "reward": 0.02734375,
      "reward_std": 0.056218504905700684,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 1303
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.68359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1976.0,
      "completions/mean_length": 1645.13671875,
      "completions/mean_terminated_length": 774.7531127929688,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.4451651446616028,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 16.14362860214143,
      "learning_rate": 7.116909194515831e-07,
      "loss": -0.007,
      "num_tokens": 1267937458.0,
      "reward": 0.0625,
      "reward_std": 0.077679343521595,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 1304
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2015.0,
      "completions/mean_length": 1700.390625,
      "completions/mean_terminated_length": 812.0555419921875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4455065289749936,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 7.464543098024644,
      "learning_rate": 7.111904997359229e-07,
      "loss": 0.0329,
      "num_tokens": 1268881210.0,
      "reward": 0.083984375,
      "reward_std": 0.13156922161579132,
      "rewards/accuracy_reward/mean": 0.083984375,
      "rewards/accuracy_reward/std": 0.2776356339454651,
      "step": 1305
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1902.0,
      "completions/mean_length": 1708.720703125,
      "completions/mean_terminated_length": 669.34130859375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4458479132883844,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 16.470060939648388,
      "learning_rate": 7.106898512398305e-07,
      "loss": 0.0142,
      "num_tokens": 1269831115.0,
      "reward": 0.03125,
      "reward_std": 0.0792168527841568,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 1306
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2026.0,
      "completions/mean_length": 1679.484375,
      "completions/mean_terminated_length": 660.6470336914062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4461892976017752,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 21.563141507416802,
      "learning_rate": 7.101889746738848e-07,
      "loss": 0.0104,
      "num_tokens": 1270771715.0,
      "reward": 0.03515625,
      "reward_std": 0.07477159798145294,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 1307
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.732421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1713.544921875,
      "completions/mean_terminated_length": 798.065673828125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.446530681915166,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 15.133919563267934,
      "learning_rate": 7.096878707489885e-07,
      "loss": 0.0068,
      "num_tokens": 1271724682.0,
      "reward": 0.044921875,
      "reward_std": 0.09841620922088623,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 1308
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1942.0,
      "completions/mean_length": 1631.69921875,
      "completions/mean_terminated_length": 748.3292236328125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4468720662285568,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 20.20640398866373,
      "learning_rate": 7.091865401763671e-07,
      "loss": 0.0157,
      "num_tokens": 1272634848.0,
      "reward": 0.0546875,
      "reward_std": 0.06783140450716019,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 1309
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.68359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2018.0,
      "completions/mean_length": 1628.115234375,
      "completions/mean_terminated_length": 721.2777709960938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4472134505419476,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 18.52241559846939,
      "learning_rate": 7.086849836675675e-07,
      "loss": 0.0002,
      "num_tokens": 1273557403.0,
      "reward": 0.041015625,
      "reward_std": 0.05303792655467987,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 1310
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1965.0,
      "completions/mean_length": 1593.05078125,
      "completions/mean_terminated_length": 693.7325439453125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4475548348553384,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 4.82997478769421,
      "learning_rate": 7.081832019344573e-07,
      "loss": 0.0249,
      "num_tokens": 1274459061.0,
      "reward": 0.052734375,
      "reward_std": 0.10020478814840317,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 1311
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.70703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1844.0,
      "completions/mean_length": 1653.830078125,
      "completions/mean_terminated_length": 702.5667114257812,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4478962191687292,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 9.618872630089466,
      "learning_rate": 7.076811956892241e-07,
      "loss": 0.0337,
      "num_tokens": 1275388926.0,
      "reward": 0.05859375,
      "reward_std": 0.09765692055225372,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 1312
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.646484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1999.0,
      "completions/mean_length": 1535.37890625,
      "completions/mean_terminated_length": 597.9337158203125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.44823760348212,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 11.404140867041663,
      "learning_rate": 7.07178965644374e-07,
      "loss": 0.0353,
      "num_tokens": 1276247008.0,
      "reward": 0.03125,
      "reward_std": 0.05789501965045929,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 1313
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.716796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1656.814453125,
      "completions/mean_terminated_length": 666.7103271484375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4485789877955108,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 0.9436619798384245,
      "learning_rate": 7.066765125127305e-07,
      "loss": 0.0321,
      "num_tokens": 1277175169.0,
      "reward": 0.041015625,
      "reward_std": 0.07669496536254883,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 1314
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.732421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1686.98828125,
      "completions/mean_terminated_length": 698.8175048828125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4489203721089016,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 22.719546786519388,
      "learning_rate": 7.061738370074342e-07,
      "loss": 0.0078,
      "num_tokens": 1278128395.0,
      "reward": 0.013671875,
      "reward_std": 0.043847277760505676,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "step": 1315
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.646484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1971.0,
      "completions/mean_length": 1573.080078125,
      "completions/mean_terminated_length": 704.5801391601562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.4492617564222924,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 11.504743793403346,
      "learning_rate": 7.056709398419407e-07,
      "loss": 0.0207,
      "num_tokens": 1279016676.0,
      "reward": 0.06640625,
      "reward_std": 0.09338457882404327,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "step": 1316
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.666015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2018.0,
      "completions/mean_length": 1611.3515625,
      "completions/mean_terminated_length": 740.6082153320312,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.4496031407356832,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 10.058086495513347,
      "learning_rate": 7.051678217300207e-07,
      "loss": 0.0247,
      "num_tokens": 1279914088.0,
      "reward": 0.052734375,
      "reward_std": 0.07680703699588776,
      "rewards/accuracy_reward/mean": 0.05443548411130905,
      "rewards/accuracy_reward/std": 0.227104052901268,
      "step": 1317
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.72265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1701.88671875,
      "completions/mean_terminated_length": 800.042236328125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.449944525049074,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 10.49812497251133,
      "learning_rate": 7.046644833857583e-07,
      "loss": 0.0027,
      "num_tokens": 1280858366.0,
      "reward": 0.03515625,
      "reward_std": 0.08120594918727875,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 1318
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2007.0,
      "completions/mean_length": 1650.828125,
      "completions/mean_terminated_length": 744.4615478515625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.4502859093624648,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 16.85734239284047,
      "learning_rate": 7.041609255235503e-07,
      "loss": 0.0239,
      "num_tokens": 1281783174.0,
      "reward": 0.0546875,
      "reward_std": 0.09429662674665451,
      "rewards/accuracy_reward/mean": 0.05645161122083664,
      "rewards/accuracy_reward/std": 0.23102475702762604,
      "step": 1319
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.716796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2036.0,
      "completions/mean_length": 1664.6015625,
      "completions/mean_terminated_length": 694.2069091796875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4506272936758556,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 26.857474296486128,
      "learning_rate": 7.036571488581049e-07,
      "loss": 0.031,
      "num_tokens": 1282703578.0,
      "reward": 0.072265625,
      "reward_std": 0.11286985874176025,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 1320
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1607.98828125,
      "completions/mean_terminated_length": 525.7973022460938,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.4509686779892464,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 18.14181889256335,
      "learning_rate": 7.031531541044411e-07,
      "loss": 0.0265,
      "num_tokens": 1283608340.0,
      "reward": 0.0546875,
      "reward_std": 0.11857913434505463,
      "rewards/accuracy_reward/mean": 0.05645161122083664,
      "rewards/accuracy_reward/std": 0.23102475702762604,
      "step": 1321
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.720703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1967.0,
      "completions/mean_length": 1636.759765625,
      "completions/mean_terminated_length": 575.58740234375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4513100623026372,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 3.0751683436281185,
      "learning_rate": 7.026489419778871e-07,
      "loss": 0.0016,
      "num_tokens": 1284518873.0,
      "reward": 0.03515625,
      "reward_std": 0.05012226477265358,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 1322
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1760.0,
      "completions/mean_length": 1737.1484375,
      "completions/mean_terminated_length": 699.2203369140625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.451651446616028,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 53.06411160893595,
      "learning_rate": 7.021445131940797e-07,
      "loss": 0.0,
      "num_tokens": 1285494389.0,
      "reward": 0.029296875,
      "reward_std": 0.06403100490570068,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 1323
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.70703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1814.0,
      "completions/mean_length": 1619.349609375,
      "completions/mean_terminated_length": 584.8733520507812,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4519928309294188,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 28.868883303465935,
      "learning_rate": 7.016398684689636e-07,
      "loss": 0.017,
      "num_tokens": 1286394472.0,
      "reward": 0.087890625,
      "reward_std": 0.12906768918037415,
      "rewards/accuracy_reward/mean": 0.087890625,
      "rewards/accuracy_reward/std": 0.2834126651287079,
      "step": 1324
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.736328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1998.0,
      "completions/mean_length": 1682.6015625,
      "completions/mean_terminated_length": 662.1925659179688,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.4523342152428096,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 5.302861087033077,
      "learning_rate": 7.011350085187895e-07,
      "loss": -0.0014,
      "num_tokens": 1287336220.0,
      "reward": 0.017578125,
      "reward_std": 0.04219770431518555,
      "rewards/accuracy_reward/mean": 0.018145160749554634,
      "rewards/accuracy_reward/std": 0.1336110234260559,
      "step": 1325
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.619140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1850.0,
      "completions/mean_length": 1486.162109375,
      "completions/mean_terminated_length": 572.8154296875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.4526755995562004,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 37.381653646933536,
      "learning_rate": 7.006299340601136e-07,
      "loss": 0.0037,
      "num_tokens": 1288173855.0,
      "reward": 0.0859375,
      "reward_std": 0.12120094895362854,
      "rewards/accuracy_reward/mean": 0.08870967477560043,
      "rewards/accuracy_reward/std": 0.2846112847328186,
      "step": 1326
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.712890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1781.0,
      "completions/mean_length": 1634.74609375,
      "completions/mean_terminated_length": 608.6394653320312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4530169838695912,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 11.249685721107372,
      "learning_rate": 7.001246458097972e-07,
      "loss": -0.0004,
      "num_tokens": 1289088509.0,
      "reward": 0.044921875,
      "reward_std": 0.0879673957824707,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 1327
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1928.0,
      "completions/mean_length": 1592.251953125,
      "completions/mean_terminated_length": 659.3511962890625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.453358368182982,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 21.61853202556118,
      "learning_rate": 6.99619144485004e-07,
      "loss": 0.0037,
      "num_tokens": 1289987838.0,
      "reward": 0.048828125,
      "reward_std": 0.09363143146038055,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 1328
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2031.0,
      "completions/mean_length": 1649.35546875,
      "completions/mean_terminated_length": 668.9053955078125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4536997524963728,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 12.049022847974536,
      "learning_rate": 6.99113430803201e-07,
      "loss": 0.0048,
      "num_tokens": 1290909444.0,
      "reward": 0.04296875,
      "reward_std": 0.06783141195774078,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 1329
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.701171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1881.0,
      "completions/mean_length": 1626.986328125,
      "completions/mean_terminated_length": 639.11767578125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4540411368097636,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 17.939744580731258,
      "learning_rate": 6.986075054821561e-07,
      "loss": 0.0274,
      "num_tokens": 1291806573.0,
      "reward": 0.046875,
      "reward_std": 0.09228022396564484,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 1330
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.716796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2031.0,
      "completions/mean_length": 1673.810546875,
      "completions/mean_terminated_length": 726.72412109375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4543825211231544,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 25.076488254669922,
      "learning_rate": 6.98101369239938e-07,
      "loss": 0.0161,
      "num_tokens": 1292745724.0,
      "reward": 0.076171875,
      "reward_std": 0.09263914823532104,
      "rewards/accuracy_reward/mean": 0.076171875,
      "rewards/accuracy_reward/std": 0.26553234457969666,
      "step": 1331
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.74609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1675.0,
      "completions/mean_length": 1660.619140625,
      "completions/mean_terminated_length": 522.3153686523438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4547239054365452,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 8.475907272977782,
      "learning_rate": 6.975950227949143e-07,
      "loss": 0.0121,
      "num_tokens": 1293678937.0,
      "reward": 0.013671875,
      "reward_std": 0.026572702452540398,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "step": 1332
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 1602.296875,
      "completions/mean_terminated_length": 603.6962280273438,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.455065289749936,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 32.3955670322673,
      "learning_rate": 6.970884668657512e-07,
      "loss": -0.0106,
      "num_tokens": 1294582545.0,
      "reward": 0.033203125,
      "reward_std": 0.059472277760505676,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 1333
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.755859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1996.0,
      "completions/mean_length": 1698.1953125,
      "completions/mean_terminated_length": 615.2000122070312,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.4554066740633268,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 9.480896150343732,
      "learning_rate": 6.965817021714124e-07,
      "loss": -0.0002,
      "num_tokens": 1295527477.0,
      "reward": 0.021484375,
      "reward_std": 0.04456022381782532,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 1334
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1702.421875,
      "completions/mean_terminated_length": 665.6875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4557480583767176,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 11.429878493826138,
      "learning_rate": 6.960747294311575e-07,
      "loss": -0.0045,
      "num_tokens": 1296471421.0,
      "reward": 0.0625,
      "reward_std": 0.10045164078474045,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 1335
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.73046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1883.0,
      "completions/mean_length": 1659.87890625,
      "completions/mean_terminated_length": 608.0145263671875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.45608944269010837,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 58.035079576206755,
      "learning_rate": 6.955675493645415e-07,
      "loss": 0.0144,
      "num_tokens": 1297403887.0,
      "reward": 0.041015625,
      "reward_std": 0.08427447080612183,
      "rewards/accuracy_reward/mean": 0.04233871027827263,
      "rewards/accuracy_reward/std": 0.2015640139579773,
      "step": 1336
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.728515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2006.0,
      "completions/mean_length": 1678.25,
      "completions/mean_terminated_length": 686.0431518554688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4564308270034992,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 7.882906078572776,
      "learning_rate": 6.950601626914139e-07,
      "loss": 0.0005,
      "num_tokens": 1298342527.0,
      "reward": 0.03125,
      "reward_std": 0.04995594918727875,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 1337
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.654296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1993.0,
      "completions/mean_length": 1556.140625,
      "completions/mean_terminated_length": 625.2203369140625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.45677221131689,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 20.07230014819777,
      "learning_rate": 6.94552570131917e-07,
      "loss": -0.0104,
      "num_tokens": 1299215991.0,
      "reward": 0.07421875,
      "reward_std": 0.09319227933883667,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "step": 1338
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1855.0,
      "completions/mean_length": 1705.6015625,
      "completions/mean_terminated_length": 634.2257690429688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4571135956302808,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 16.120989508695928,
      "learning_rate": 6.940447724064861e-07,
      "loss": 0.0116,
      "num_tokens": 1300157403.0,
      "reward": 0.068359375,
      "reward_std": 0.12741811573505402,
      "rewards/accuracy_reward/mean": 0.068359375,
      "rewards/accuracy_reward/std": 0.25260838866233826,
      "step": 1339
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1871.0,
      "completions/mean_length": 1659.267578125,
      "completions/mean_terminated_length": 540.1893920898438,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.45745497994367157,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 8.133899713613234,
      "learning_rate": 6.935367702358469e-07,
      "loss": 0.0127,
      "num_tokens": 1301089060.0,
      "reward": 0.0390625,
      "reward_std": 0.0736672431230545,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 1340
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1893.0,
      "completions/mean_length": 1621.078125,
      "completions/mean_terminated_length": 646.8204956054688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4577963642570624,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 5.193322477757516,
      "learning_rate": 6.930285643410154e-07,
      "loss": 0.0033,
      "num_tokens": 1301999420.0,
      "reward": 0.037109375,
      "reward_std": 0.04251687601208687,
      "rewards/accuracy_reward/mean": 0.03958333283662796,
      "rewards/accuracy_reward/std": 0.19518160820007324,
      "step": 1341
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.72265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1841.0,
      "completions/mean_length": 1655.626953125,
      "completions/mean_terminated_length": 633.2464599609375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4581377485704532,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 26.415554485636136,
      "learning_rate": 6.925201554432972e-07,
      "loss": 0.023,
      "num_tokens": 1302932205.0,
      "reward": 0.080078125,
      "reward_std": 0.11682207137346268,
      "rewards/accuracy_reward/mean": 0.080078125,
      "rewards/accuracy_reward/std": 0.271679550409317,
      "step": 1342
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.693359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1999.0,
      "completions/mean_length": 1638.11328125,
      "completions/mean_terminated_length": 711.2993774414062,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.458479132883844,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 21.56890586359392,
      "learning_rate": 6.920115442642858e-07,
      "loss": 0.0177,
      "num_tokens": 1303844007.0,
      "reward": 0.03125,
      "reward_std": 0.0761418342590332,
      "rewards/accuracy_reward/mean": 0.032258063554763794,
      "rewards/accuracy_reward/std": 0.17686307430267334,
      "step": 1343
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1587.7421875,
      "completions/mean_terminated_length": 709.0681762695312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.45882051719723477,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 14.346002765247002,
      "learning_rate": 6.915027315258614e-07,
      "loss": 0.0167,
      "num_tokens": 1304733843.0,
      "reward": 0.056640625,
      "reward_std": 0.09118282794952393,
      "rewards/accuracy_reward/mean": 0.058467742055654526,
      "rewards/accuracy_reward/std": 0.23486268520355225,
      "step": 1344
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.666015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 1583.513671875,
      "completions/mean_terminated_length": 657.25732421875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4591619015106256,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 5.59197392701996,
      "learning_rate": 6.909937179501908e-07,
      "loss": 0.0097,
      "num_tokens": 1305615290.0,
      "reward": 0.072265625,
      "reward_std": 0.06733252108097076,
      "rewards/accuracy_reward/mean": 0.07459677755832672,
      "rewards/accuracy_reward/std": 0.263004869222641,
      "step": 1345
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1924.0,
      "completions/mean_length": 1540.7578125,
      "completions/mean_terminated_length": 636.5435180664062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4595032858240164,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 10.76918297692418,
      "learning_rate": 6.904845042597258e-07,
      "loss": 0.008,
      "num_tokens": 1306476302.0,
      "reward": 0.02734375,
      "reward_std": 0.04847269132733345,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 1346
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 1648.04296875,
      "completions/mean_terminated_length": 645.8013916015625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.4598446701374072,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 18.55915123733681,
      "learning_rate": 6.899750911772019e-07,
      "loss": 0.0299,
      "num_tokens": 1307407716.0,
      "reward": 0.0625,
      "reward_std": 0.10355263948440552,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 1347
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.630859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1996.0,
      "completions/mean_length": 1508.560546875,
      "completions/mean_terminated_length": 586.661376953125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.46018605445079797,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 26.12937549356265,
      "learning_rate": 6.894654794256378e-07,
      "loss": -0.0002,
      "num_tokens": 1308253811.0,
      "reward": 0.037109375,
      "reward_std": 0.06166848540306091,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 1348
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.732421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 1654.640625,
      "completions/mean_terminated_length": 577.927001953125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4605274387641888,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 20.454096070937826,
      "learning_rate": 6.889556697283344e-07,
      "loss": 0.0083,
      "num_tokens": 1309182027.0,
      "reward": 0.041015625,
      "reward_std": 0.09792976081371307,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 1349
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.755859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1779.0,
      "completions/mean_length": 1706.724609375,
      "completions/mean_terminated_length": 650.6160278320312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4608688230775796,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 7.550627450987065,
      "learning_rate": 6.88445662808873e-07,
      "loss": 0.0072,
      "num_tokens": 1310135918.0,
      "reward": 0.013671875,
      "reward_std": 0.04478531330823898,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "step": 1350
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2008.0,
      "completions/mean_length": 1543.162109375,
      "completions/mean_terminated_length": 643.2337036132812,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.4612102073909704,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 6.1865046834519575,
      "learning_rate": 6.879354593911154e-07,
      "loss": 0.005,
      "num_tokens": 1311001233.0,
      "reward": 0.037109375,
      "reward_std": 0.06602106243371964,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 1351
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1703.0,
      "completions/mean_length": 1669.40625,
      "completions/mean_terminated_length": 622.7058715820312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.46155159170436116,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 10.561579161379623,
      "learning_rate": 6.874250601992019e-07,
      "loss": 0.0119,
      "num_tokens": 1311933457.0,
      "reward": 0.056640625,
      "reward_std": 0.09778349101543427,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 1352
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.583984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1936.0,
      "completions/mean_length": 1464.619140625,
      "completions/mean_terminated_length": 645.69482421875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.461892976017752,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 60.39423560879323,
      "learning_rate": 6.869144659575507e-07,
      "loss": -0.0078,
      "num_tokens": 1312769870.0,
      "reward": 0.03515625,
      "reward_std": 0.052318472415208817,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 1353
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1986.0,
      "completions/mean_length": 1634.63671875,
      "completions/mean_terminated_length": 655.618408203125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4622343603311428,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 43.96253666820117,
      "learning_rate": 6.864036773908572e-07,
      "loss": 0.0219,
      "num_tokens": 1313689620.0,
      "reward": 0.05078125,
      "reward_std": 0.06956121325492859,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 1354
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.654296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1962.0,
      "completions/mean_length": 1558.298828125,
      "completions/mean_terminated_length": 631.4632568359375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.4625757446445336,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 11.085300261523463,
      "learning_rate": 6.858926952240925e-07,
      "loss": 0.0185,
      "num_tokens": 1314561437.0,
      "reward": 0.0625,
      "reward_std": 0.06464291363954544,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 1355
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.619140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1941.0,
      "completions/mean_length": 1494.4921875,
      "completions/mean_terminated_length": 594.6871948242188,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.46291712895792436,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 8.37901856552808,
      "learning_rate": 6.853815201825016e-07,
      "loss": 0.0105,
      "num_tokens": 1315407065.0,
      "reward": 0.015625,
      "reward_std": 0.04081955552101135,
      "rewards/accuracy_reward/mean": 0.015625,
      "rewards/accuracy_reward/std": 0.12414088100194931,
      "step": 1356
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.564453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1970.0,
      "completions/mean_length": 1451.478515625,
      "completions/mean_terminated_length": 678.4080810546875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4632585132713152,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 17.749692656207962,
      "learning_rate": 6.848701529916047e-07,
      "loss": -0.0162,
      "num_tokens": 1316222974.0,
      "reward": 0.0390625,
      "reward_std": 0.07807311415672302,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 1357
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.626953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1974.0,
      "completions/mean_length": 1516.451171875,
      "completions/mean_terminated_length": 623.1151733398438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.463599897584706,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 11.6380802932008,
      "learning_rate": 6.843585943771935e-07,
      "loss": 0.013,
      "num_tokens": 1317083957.0,
      "reward": 0.064453125,
      "reward_std": 0.08973139524459839,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 1358
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1978.0,
      "completions/mean_length": 1579.03125,
      "completions/mean_terminated_length": 619.0654907226562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4639412818980968,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 15.0739240447066,
      "learning_rate": 6.838468450653322e-07,
      "loss": 0.0082,
      "num_tokens": 1317977877.0,
      "reward": 0.041015625,
      "reward_std": 0.10805703699588776,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 1359
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.64453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1548.1171875,
      "completions/mean_terminated_length": 641.7362670898438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.46428266621148756,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 13.239727925845521,
      "learning_rate": 6.833349057823553e-07,
      "loss": -0.0137,
      "num_tokens": 1318871617.0,
      "reward": 0.02734375,
      "reward_std": 0.07779236882925034,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 1360
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.568359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 1439.478515625,
      "completions/mean_terminated_length": 638.4072875976562,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4646240505248784,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 20.280329262494572,
      "learning_rate": 6.828227772548669e-07,
      "loss": 0.0145,
      "num_tokens": 1319689558.0,
      "reward": 0.07421875,
      "reward_std": 0.12922608852386475,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "step": 1361
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2017.0,
      "completions/mean_length": 1573.74609375,
      "completions/mean_terminated_length": 652.4942626953125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4649654348382692,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 9.62762146773623,
      "learning_rate": 6.823104602097398e-07,
      "loss": 0.0225,
      "num_tokens": 1320579236.0,
      "reward": 0.041015625,
      "reward_std": 0.08611729741096497,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 1362
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.646484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1961.0,
      "completions/mean_length": 1576.84375,
      "completions/mean_terminated_length": 715.4364624023438,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.46530681915166,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 3.6505834996527624,
      "learning_rate": 6.817979553741143e-07,
      "loss": 0.0104,
      "num_tokens": 1321467924.0,
      "reward": 0.0234375,
      "reward_std": 0.036547206342220306,
      "rewards/accuracy_reward/mean": 0.024193547666072845,
      "rewards/accuracy_reward/std": 0.15380479395389557,
      "step": 1363
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.62890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1538.275390625,
      "completions/mean_terminated_length": 674.4263305664062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.46564820346505076,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 13.193890897779049,
      "learning_rate": 6.812852634753974e-07,
      "loss": 0.01,
      "num_tokens": 1322335809.0,
      "reward": 0.03125,
      "reward_std": 0.05914659798145294,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 1364
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.751953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1933.0,
      "completions/mean_length": 1724.220703125,
      "completions/mean_terminated_length": 742.68505859375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.46598958777844157,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 9.945393750644513,
      "learning_rate": 6.807723852412613e-07,
      "loss": 0.0037,
      "num_tokens": 1323301506.0,
      "reward": 0.04296875,
      "reward_std": 0.07289456576108932,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 1365
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 1579.826171875,
      "completions/mean_terminated_length": 670.3850708007812,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4663309720918324,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 7.553615652351151,
      "learning_rate": 6.802593213996431e-07,
      "loss": 0.0101,
      "num_tokens": 1324185017.0,
      "reward": 0.064453125,
      "reward_std": 0.09488959610462189,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 1366
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.736328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2019.0,
      "completions/mean_length": 1680.423828125,
      "completions/mean_terminated_length": 653.933349609375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.4666723564052232,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 9.653907535189768,
      "learning_rate": 6.797460726787427e-07,
      "loss": 0.0119,
      "num_tokens": 1325115810.0,
      "reward": 0.0546875,
      "reward_std": 0.08464758098125458,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 1367
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.68359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1979.0,
      "completions/mean_length": 1637.015625,
      "completions/mean_terminated_length": 749.08642578125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.46701374071861396,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 2.9808912382522084,
      "learning_rate": 6.792326398070233e-07,
      "loss": 0.0087,
      "num_tokens": 1326026442.0,
      "reward": 0.021484375,
      "reward_std": 0.036420635879039764,
      "rewards/accuracy_reward/mean": 0.02217741869390011,
      "rewards/accuracy_reward/std": 0.14740893244743347,
      "step": 1368
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1988.0,
      "completions/mean_length": 1617.9140625,
      "completions/mean_terminated_length": 752.682373046875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.46735512503200477,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 17.93567664268745,
      "learning_rate": 6.787190235132085e-07,
      "loss": 0.0152,
      "num_tokens": 1326932206.0,
      "reward": 0.029296875,
      "reward_std": 0.09055596590042114,
      "rewards/accuracy_reward/mean": 0.030241934582591057,
      "rewards/accuracy_reward/std": 0.1714252382516861,
      "step": 1369
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.689453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1985.0,
      "completions/mean_length": 1601.900390625,
      "completions/mean_terminated_length": 611.5031127929688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4676965093453956,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 12.076609858561431,
      "learning_rate": 6.782052245262829e-07,
      "loss": 0.0111,
      "num_tokens": 1327837003.0,
      "reward": 0.080078125,
      "reward_std": 0.12356996536254883,
      "rewards/accuracy_reward/mean": 0.080078125,
      "rewards/accuracy_reward/std": 0.271679550409317,
      "step": 1370
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1783.0,
      "completions/mean_length": 1588.802734375,
      "completions/mean_terminated_length": 578.5687866210938,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.4680378936587864,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 4.336793567182062,
      "learning_rate": 6.7769124357549e-07,
      "loss": -0.0082,
      "num_tokens": 1328720662.0,
      "reward": 0.05078125,
      "reward_std": 0.09028453379869461,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 1371
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1898.0,
      "completions/mean_length": 1754.19140625,
      "completions/mean_terminated_length": 834.8547973632812,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.46837927797217715,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 19.450924548088405,
      "learning_rate": 6.77177081390332e-07,
      "loss": 0.0155,
      "num_tokens": 1329686808.0,
      "reward": 0.056640625,
      "reward_std": 0.08039448410272598,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 1372
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.740234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1843.0,
      "completions/mean_length": 1676.798828125,
      "completions/mean_terminated_length": 619.0150756835938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.46872066228556797,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 10.423625513619431,
      "learning_rate": 6.76662738700568e-07,
      "loss": -0.0157,
      "num_tokens": 1330616897.0,
      "reward": 0.03515625,
      "reward_std": 0.04554459825158119,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 1373
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.759765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1698.12890625,
      "completions/mean_terminated_length": 591.6259765625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4690620465989588,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.9431566818651097,
      "learning_rate": 6.761482162362134e-07,
      "loss": 0.015,
      "num_tokens": 1331561891.0,
      "reward": 0.03515625,
      "reward_std": 0.04505910724401474,
      "rewards/accuracy_reward/mean": 0.03629032149910927,
      "rewards/accuracy_reward/std": 0.1872003972530365,
      "step": 1374
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1668.2734375,
      "completions/mean_terminated_length": 659.2857055664062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4694034309123496,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 13.829856259882128,
      "learning_rate": 6.756335147275387e-07,
      "loss": 0.0074,
      "num_tokens": 1332497279.0,
      "reward": 0.052734375,
      "reward_std": 0.06092274561524391,
      "rewards/accuracy_reward/mean": 0.05443548411130905,
      "rewards/accuracy_reward/std": 0.2271040380001068,
      "step": 1375
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.77734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1973.0,
      "completions/mean_length": 1717.93359375,
      "completions/mean_terminated_length": 565.5964965820312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.46974481522574035,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 11.582070368057904,
      "learning_rate": 6.751186349050683e-07,
      "loss": -0.0008,
      "num_tokens": 1333454317.0,
      "reward": 0.03125,
      "reward_std": 0.05012226477265358,
      "rewards/accuracy_reward/mean": 0.032258063554763794,
      "rewards/accuracy_reward/std": 0.17686307430267334,
      "step": 1376
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1863.0,
      "completions/mean_length": 1746.841796875,
      "completions/mean_terminated_length": 506.0699768066406,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.47008619953913117,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 20.412395649159794,
      "learning_rate": 6.746035774995805e-07,
      "loss": 0.0144,
      "num_tokens": 1334422444.0,
      "reward": 0.056640625,
      "reward_std": 0.061668481677770615,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 1377
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.837890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1923.0,
      "completions/mean_length": 1836.814453125,
      "completions/mean_terminated_length": 745.2650146484375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.470427583852522,
      "frac_reward_zero_std": 0.96875,
      "grad_norm": 1.6495096115336052,
      "learning_rate": 6.740883432421044e-07,
      "loss": -0.0012,
      "num_tokens": 1335443693.0,
      "reward": 0.001953125,
      "reward_std": 0.0078125,
      "rewards/accuracy_reward/mean": 0.002016128972172737,
      "rewards/accuracy_reward/std": 0.044901326298713684,
      "step": 1378
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2008.0,
      "completions/mean_length": 1672.365234375,
      "completions/mean_terminated_length": 730.7054443359375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4707689681659128,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 8.260373586386889,
      "learning_rate": 6.735729328639213e-07,
      "loss": -0.0028,
      "num_tokens": 1336374472.0,
      "reward": 0.048828125,
      "reward_std": 0.05782270431518555,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 1379
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.84765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1898.0,
      "completions/mean_length": 1839.603515625,
      "completions/mean_terminated_length": 680.0640869140625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.47111035247930355,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 2.829318815390029,
      "learning_rate": 6.730573470965618e-07,
      "loss": -0.0047,
      "num_tokens": 1337406621.0,
      "reward": 0.009765625,
      "reward_std": 0.029160313308238983,
      "rewards/accuracy_reward/mean": 0.009765625,
      "rewards/accuracy_reward/std": 0.09843364357948303,
      "step": 1380
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1820.0,
      "completions/mean_length": 1780.373046875,
      "completions/mean_terminated_length": 779.6018676757812,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.47145173679269436,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 17.86707798643111,
      "learning_rate": 6.725415866718055e-07,
      "loss": 0.011,
      "num_tokens": 1338402748.0,
      "reward": 0.0546875,
      "reward_std": 0.06441686302423477,
      "rewards/accuracy_reward/mean": 0.05645161122083664,
      "rewards/accuracy_reward/std": 0.23102475702762604,
      "step": 1381
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.775390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1891.0,
      "completions/mean_length": 1742.970703125,
      "completions/mean_terminated_length": 689.9564819335938,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4717931211060852,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 6.206031883431773,
      "learning_rate": 6.720256523216802e-07,
      "loss": -0.0027,
      "num_tokens": 1339369949.0,
      "reward": 0.00390625,
      "reward_std": 0.015625,
      "rewards/accuracy_reward/mean": 0.00390625,
      "rewards/accuracy_reward/std": 0.06243881583213806,
      "step": 1382
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.82421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1887.0,
      "completions/mean_length": 1792.486328125,
      "completions/mean_terminated_length": 594.4111328125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.472134505419476,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 2.040164009486912,
      "learning_rate": 6.715095447784602e-07,
      "loss": 0.0111,
      "num_tokens": 1340358086.0,
      "reward": 0.017578125,
      "reward_std": 0.05452118441462517,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "step": 1383
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2042.0,
      "completions/mean_length": 1858.412109375,
      "completions/mean_terminated_length": 699.8194580078125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.47247588973286675,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 6.846622189400305,
      "learning_rate": 6.709932647746659e-07,
      "loss": 0.0299,
      "num_tokens": 1341384425.0,
      "reward": 0.064453125,
      "reward_std": 0.07296784222126007,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 1384
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.837890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2031.0,
      "completions/mean_length": 1820.94921875,
      "completions/mean_terminated_length": 647.3975830078125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.47281727404625756,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 3.3114748835495895,
      "learning_rate": 6.704768130430619e-07,
      "loss": 0.016,
      "num_tokens": 1342401855.0,
      "reward": 0.013671875,
      "reward_std": 0.04478531330823898,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "step": 1385
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.791015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1886.0,
      "completions/mean_length": 1749.419921875,
      "completions/mean_terminated_length": 619.2803344726562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4731586583596484,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 7.1082993943323824,
      "learning_rate": 6.699601903166575e-07,
      "loss": 0.0215,
      "num_tokens": 1343372294.0,
      "reward": 0.041015625,
      "reward_std": 0.06508206576108932,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 1386
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.798828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1929.0,
      "completions/mean_length": 1768.927734375,
      "completions/mean_terminated_length": 660.7669677734375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4735000426730392,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 3.2609648743078488,
      "learning_rate": 6.69443397328704e-07,
      "loss": 0.0163,
      "num_tokens": 1344356241.0,
      "reward": 0.029296875,
      "reward_std": 0.060957904905080795,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 1387
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1800.0,
      "completions/mean_length": 1699.99609375,
      "completions/mean_terminated_length": 563.183349609375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.47384142698642995,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 14.579897856451089,
      "learning_rate": 6.689264348126944e-07,
      "loss": 0.0368,
      "num_tokens": 1345300383.0,
      "reward": 0.0625,
      "reward_std": 0.0892922431230545,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 1388
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1945.0,
      "completions/mean_length": 1686.861328125,
      "completions/mean_terminated_length": 580.5159301757812,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.47418281129982076,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 5.267659316297761,
      "learning_rate": 6.684093035023626e-07,
      "loss": 0.0113,
      "num_tokens": 1346246504.0,
      "reward": 0.05859375,
      "reward_std": 0.06640692055225372,
      "rewards/accuracy_reward/mean": 0.060483869165182114,
      "rewards/accuracy_reward/std": 0.2386218160390854,
      "step": 1389
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.79296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1910.0,
      "completions/mean_length": 1768.349609375,
      "completions/mean_terminated_length": 697.23583984375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.4745241956132116,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 25.18048385464495,
      "learning_rate": 6.678920041316818e-07,
      "loss": 0.0114,
      "num_tokens": 1347223643.0,
      "reward": 0.0625,
      "reward_std": 0.07284127175807953,
      "rewards/accuracy_reward/mean": 0.06451612710952759,
      "rewards/accuracy_reward/std": 0.24591811001300812,
      "step": 1390
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.826171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1963.0,
      "completions/mean_length": 1809.603515625,
      "completions/mean_terminated_length": 676.5505981445312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4748655799266024,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 21.528063056077638,
      "learning_rate": 6.67374537434864e-07,
      "loss": -0.0036,
      "num_tokens": 1348233552.0,
      "reward": 0.013671875,
      "reward_std": 0.03889618441462517,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "step": 1391
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2001.0,
      "completions/mean_length": 1756.994140625,
      "completions/mean_terminated_length": 785.3305053710938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.47520696423999315,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 14.381690105779146,
      "learning_rate": 6.668569041463582e-07,
      "loss": 0.0021,
      "num_tokens": 1349212429.0,
      "reward": 0.017578125,
      "reward_std": 0.03630761057138443,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "step": 1392
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.79296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1925.0,
      "completions/mean_length": 1771.447265625,
      "completions/mean_terminated_length": 712.1981201171875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.47554834855338396,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 49.66768913374407,
      "learning_rate": 6.663391050008505e-07,
      "loss": 0.0036,
      "num_tokens": 1350201090.0,
      "reward": 0.06640625,
      "reward_std": 0.09891509264707565,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "step": 1393
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.77734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1942.0,
      "completions/mean_length": 1766.05859375,
      "completions/mean_terminated_length": 781.7368774414062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4758897328667748,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 22.757631971569335,
      "learning_rate": 6.658211407332619e-07,
      "loss": 0.0051,
      "num_tokens": 1351178896.0,
      "reward": 0.09765625,
      "reward_std": 0.10668125003576279,
      "rewards/accuracy_reward/mean": 0.09765625,
      "rewards/accuracy_reward/std": 0.29713961482048035,
      "step": 1394
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.81640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 1798.927734375,
      "completions/mean_terminated_length": 691.3510131835938,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.4762311171801656,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 6.678833770275776,
      "learning_rate": 6.65303012078748e-07,
      "loss": 0.0157,
      "num_tokens": 1352179195.0,
      "reward": 0.015625,
      "reward_std": 0.03839729726314545,
      "rewards/accuracy_reward/mean": 0.015625,
      "rewards/accuracy_reward/std": 0.12414088100194931,
      "step": 1395
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.83984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1940.0,
      "completions/mean_length": 1825.376953125,
      "completions/mean_terminated_length": 658.695068359375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.47657250149355634,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 6.523591245166773,
      "learning_rate": 6.647847197726978e-07,
      "loss": 0.0158,
      "num_tokens": 1353190380.0,
      "reward": 0.01953125,
      "reward_std": 0.055899329483509064,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 1396
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.751953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1910.0,
      "completions/mean_length": 1704.1171875,
      "completions/mean_terminated_length": 661.6378173828125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.47691388580694716,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 1.41450783451255,
      "learning_rate": 6.642662645507322e-07,
      "loss": 0.0066,
      "num_tokens": 1354137128.0,
      "reward": 0.005859375,
      "reward_std": 0.0234375,
      "rewards/accuracy_reward/mean": 0.006048386916518211,
      "rewards/accuracy_reward/std": 0.07761410623788834,
      "step": 1397
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.802734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1908.0,
      "completions/mean_length": 1774.408203125,
      "completions/mean_terminated_length": 661.0792236328125,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.47725527012033797,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 12.09313715087493,
      "learning_rate": 6.637476471487036e-07,
      "loss": 0.0152,
      "num_tokens": 1355117145.0,
      "reward": 0.021484375,
      "reward_std": 0.07108421623706818,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 1398
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.77734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1966.0,
      "completions/mean_length": 1766.380859375,
      "completions/mean_terminated_length": 783.5526123046875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4775966544337288,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 22.956806215144805,
      "learning_rate": 6.632288683026946e-07,
      "loss": 0.0087,
      "num_tokens": 1356108524.0,
      "reward": 0.02734375,
      "reward_std": 0.04517117515206337,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 1399
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2010.0,
      "completions/mean_length": 1682.267578125,
      "completions/mean_terminated_length": 747.6180419921875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.47793803874711954,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 11.14464243875765,
      "learning_rate": 6.62709928749017e-07,
      "loss": 0.0275,
      "num_tokens": 1357046837.0,
      "reward": 0.048828125,
      "reward_std": 0.09852826595306396,
      "rewards/accuracy_reward/mean": 0.05040322616696358,
      "rewards/accuracy_reward/std": 0.21899642050266266,
      "step": 1400
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.73828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2005.0,
      "completions/mean_length": 1719.34375,
      "completions/mean_terminated_length": 792.23876953125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.47827942306051036,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 7.635424386954305,
      "learning_rate": 6.621908292242104e-07,
      "loss": 0.0018,
      "num_tokens": 1358002965.0,
      "reward": 0.044921875,
      "reward_std": 0.06292665004730225,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 1401
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1732.890625,
      "completions/mean_terminated_length": 703.9166870117188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.47862080737390117,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 0.091214312445108,
      "learning_rate": 6.616715704650418e-07,
      "loss": 0.0052,
      "num_tokens": 1358973341.0,
      "reward": 0.01953125,
      "reward_std": 0.023823359981179237,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 1402
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.775390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2004.0,
      "completions/mean_length": 1744.53515625,
      "completions/mean_terminated_length": 696.9216918945312,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.478962191687292,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 66.85219103796454,
      "learning_rate": 6.611521532085038e-07,
      "loss": 0.0054,
      "num_tokens": 1359946463.0,
      "reward": 0.005859375,
      "reward_std": 0.0234375,
      "rewards/accuracy_reward/mean": 0.006048386916518211,
      "rewards/accuracy_reward/std": 0.07761410623788834,
      "step": 1403
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1742.2734375,
      "completions/mean_terminated_length": 785.6451416015625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4793035760006828,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 12.18161866080606,
      "learning_rate": 6.606325781918144e-07,
      "loss": 0.0173,
      "num_tokens": 1360907803.0,
      "reward": 0.029296875,
      "reward_std": 0.06645326316356659,
      "rewards/accuracy_reward/mean": 0.030241934582591057,
      "rewards/accuracy_reward/std": 0.1714252382516861,
      "step": 1404
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.822265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2038.0,
      "completions/mean_length": 1841.951171875,
      "completions/mean_terminated_length": 888.6923217773438,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.47964496031407355,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 2.0392390766528123,
      "learning_rate": 6.601128461524152e-07,
      "loss": 0.0215,
      "num_tokens": 1361930402.0,
      "reward": 0.01953125,
      "reward_std": 0.04505910724401474,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 1405
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.794921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1926.0,
      "completions/mean_length": 1764.896484375,
      "completions/mean_terminated_length": 667.5333862304688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.47998634462746437,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 26.706955843157182,
      "learning_rate": 6.595929578279708e-07,
      "loss": 0.0276,
      "num_tokens": 1362912893.0,
      "reward": 0.07421875,
      "reward_std": 0.11462032794952393,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "step": 1406
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.78515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2022.0,
      "completions/mean_length": 1769.736328125,
      "completions/mean_terminated_length": 752.80908203125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.4803277289408552,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 23.363204228157326,
      "learning_rate": 6.590729139563675e-07,
      "loss": 0.0121,
      "num_tokens": 1363894006.0,
      "reward": 0.02734375,
      "reward_std": 0.06546888500452042,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 1407
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.783203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1789.51953125,
      "completions/mean_terminated_length": 855.729736328125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.480669113254246,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 3.9388193938095077,
      "learning_rate": 6.585527152757128e-07,
      "loss": 0.0133,
      "num_tokens": 1364886080.0,
      "reward": 0.02734375,
      "reward_std": 0.05864076316356659,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 1408
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1786.0,
      "completions/mean_length": 1820.185546875,
      "completions/mean_terminated_length": 659.4166870117188,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.48101049756763675,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 9.61702704796192,
      "learning_rate": 6.580323625243332e-07,
      "loss": 0.0068,
      "num_tokens": 1365897167.0,
      "reward": 0.04296875,
      "reward_std": 0.07135801762342453,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 1409
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2028.0,
      "completions/mean_length": 1661.72265625,
      "completions/mean_terminated_length": 712.0811157226562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.48135188188102757,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 53.40134156704168,
      "learning_rate": 6.575118564407742e-07,
      "loss": 0.0003,
      "num_tokens": 1366830945.0,
      "reward": 0.056640625,
      "reward_std": 0.0906703919172287,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 1410
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.83203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1984.0,
      "completions/mean_length": 1830.974609375,
      "completions/mean_terminated_length": 755.9418334960938,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4816932661944184,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 28.88819429655891,
      "learning_rate": 6.569911977637994e-07,
      "loss": 0.0215,
      "num_tokens": 1367851524.0,
      "reward": 0.0234375,
      "reward_std": 0.05974511429667473,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 1411
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1978.0,
      "completions/mean_length": 1809.4375,
      "completions/mean_terminated_length": 720.3478393554688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4820346505078092,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 10.988399397620244,
      "learning_rate": 6.564703872323883e-07,
      "loss": 0.0124,
      "num_tokens": 1368857700.0,
      "reward": 0.01953125,
      "reward_std": 0.04318207502365112,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 1412
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1733.482421875,
      "completions/mean_terminated_length": 659.7844848632812,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.48237603482119995,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 9.623516995992913,
      "learning_rate": 6.559494255857362e-07,
      "loss": 0.0106,
      "num_tokens": 1369818187.0,
      "reward": 0.037109375,
      "reward_std": 0.06613312661647797,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 1413
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.779296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1904.0,
      "completions/mean_length": 1742.388671875,
      "completions/mean_terminated_length": 663.283203125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.48271741913459076,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 14.566109757620035,
      "learning_rate": 6.554283135632529e-07,
      "loss": 0.0047,
      "num_tokens": 1370787186.0,
      "reward": 0.01171875,
      "reward_std": 0.03697281330823898,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "step": 1414
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1870.0,
      "completions/mean_length": 1730.39453125,
      "completions/mean_terminated_length": 646.1551513671875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4830588034479816,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 40.17480950569346,
      "learning_rate": 6.549070519045615e-07,
      "loss": 0.0198,
      "num_tokens": 1371747788.0,
      "reward": 0.056640625,
      "reward_std": 0.09761717915534973,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 1415
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1794.0,
      "completions/mean_length": 1803.20703125,
      "completions/mean_terminated_length": 685.6739501953125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.4834001877613724,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 63.31516703249113,
      "learning_rate": 6.543856413494979e-07,
      "loss": 0.0217,
      "num_tokens": 1372750182.0,
      "reward": 0.0703125,
      "reward_std": 0.10540273785591125,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 1416
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.830078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2035.0,
      "completions/mean_length": 1832.626953125,
      "completions/mean_terminated_length": 781.0919799804688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.48374157207476315,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 21.988251705941856,
      "learning_rate": 6.538640826381086e-07,
      "loss": 0.0253,
      "num_tokens": 1373763191.0,
      "reward": 0.04296875,
      "reward_std": 0.08004186302423477,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 1417
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.841796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1922.0,
      "completions/mean_length": 1834.294921875,
      "completions/mean_terminated_length": 697.1728515625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.48408295638815396,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 18.109590450691698,
      "learning_rate": 6.533423765106509e-07,
      "loss": 0.0196,
      "num_tokens": 1374783614.0,
      "reward": 0.03515625,
      "reward_std": 0.06244811415672302,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 1418
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.763671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1759.0,
      "completions/mean_length": 1700.357421875,
      "completions/mean_terminated_length": 576.9834594726562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.4844243407015448,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 19.07382527230103,
      "learning_rate": 6.528205237075916e-07,
      "loss": 0.0049,
      "num_tokens": 1375727893.0,
      "reward": 0.0390625,
      "reward_std": 0.07775957137346268,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 1419
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.810546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2014.0,
      "completions/mean_length": 1793.203125,
      "completions/mean_terminated_length": 703.0927734375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4847657250149356,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 15.816900049044747,
      "learning_rate": 6.522985249696049e-07,
      "loss": 0.0253,
      "num_tokens": 1376722285.0,
      "reward": 0.03515625,
      "reward_std": 0.0744580626487732,
      "rewards/accuracy_reward/mean": 0.03629032149910927,
      "rewards/accuracy_reward/std": 0.1872003972530365,
      "step": 1420
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.798828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1717.0,
      "completions/mean_length": 1768.677734375,
      "completions/mean_terminated_length": 659.5242919921875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.48510710932832635,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 24.35814307083946,
      "learning_rate": 6.517763810375727e-07,
      "loss": 0.0059,
      "num_tokens": 1377694152.0,
      "reward": 0.064453125,
      "reward_std": 0.07564390450716019,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 1421
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.822265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1936.0,
      "completions/mean_length": 1831.720703125,
      "completions/mean_terminated_length": 831.1318969726562,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.48544849364171716,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 12.769176162152664,
      "learning_rate": 6.512540926525828e-07,
      "loss": 0.0276,
      "num_tokens": 1378708777.0,
      "reward": 0.01953125,
      "reward_std": 0.06233368441462517,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 1422
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1725.0,
      "completions/mean_length": 1858.42578125,
      "completions/mean_terminated_length": 770.868408203125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.485789877955108,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 33.136952907079994,
      "learning_rate": 6.507316605559281e-07,
      "loss": 0.0055,
      "num_tokens": 1379732259.0,
      "reward": 0.03515625,
      "reward_std": 0.06327171623706818,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 1423
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.826171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1961.0,
      "completions/mean_length": 1825.28515625,
      "completions/mean_terminated_length": 766.7640380859375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4861312622684988,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 21.755343588681697,
      "learning_rate": 6.502090854891051e-07,
      "loss": 0.0104,
      "num_tokens": 1380742181.0,
      "reward": 0.044921875,
      "reward_std": 0.08907270431518555,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 1424
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1880.0,
      "completions/mean_length": 1809.44140625,
      "completions/mean_terminated_length": 660.0227661132812,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.48647264658188955,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 9.755341671623205,
      "learning_rate": 6.496863681938138e-07,
      "loss": 0.0183,
      "num_tokens": 1381742087.0,
      "reward": 0.056640625,
      "reward_std": 0.055459219962358475,
      "rewards/accuracy_reward/mean": 0.058467742055654526,
      "rewards/accuracy_reward/std": 0.23486268520355225,
      "step": 1425
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.771484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1738.595703125,
      "completions/mean_terminated_length": 694.0256958007812,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.48681403089528036,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 29.705652542540914,
      "learning_rate": 6.491635094119558e-07,
      "loss": 0.003,
      "num_tokens": 1382706872.0,
      "reward": 0.03125,
      "reward_std": 0.05880707502365112,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 1426
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.80078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 1774.232421875,
      "completions/mean_terminated_length": 673.7941284179688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.48715541520867117,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 74.6742698270493,
      "learning_rate": 6.486405098856333e-07,
      "loss": 0.0113,
      "num_tokens": 1383695087.0,
      "reward": 0.017578125,
      "reward_std": 0.05452118441462517,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "step": 1427
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1796.181640625,
      "completions/mean_terminated_length": 704.96875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.487496799522062,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 20.94875629518616,
      "learning_rate": 6.481173703571487e-07,
      "loss": 0.0127,
      "num_tokens": 1384691820.0,
      "reward": 0.017578125,
      "reward_std": 0.03411140665411949,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "step": 1428
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.84765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2013.0,
      "completions/mean_length": 1844.0859375,
      "completions/mean_terminated_length": 709.4871826171875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.48783818383545274,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 19.996064845265877,
      "learning_rate": 6.475940915690028e-07,
      "loss": 0.0073,
      "num_tokens": 1385717512.0,
      "reward": 0.0390625,
      "reward_std": 0.06162214279174805,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 1429
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.783203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2020.0,
      "completions/mean_length": 1752.017578125,
      "completions/mean_terminated_length": 682.7477416992188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.48817956814884356,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 22.784105088430692,
      "learning_rate": 6.470706742638942e-07,
      "loss": 0.0056,
      "num_tokens": 1386694689.0,
      "reward": 0.037109375,
      "reward_std": 0.08136672526597977,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 1430
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.744140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2008.0,
      "completions/mean_length": 1678.615234375,
      "completions/mean_terminated_length": 604.2977294921875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.48852095246223437,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 42.606477879428354,
      "learning_rate": 6.465471191847177e-07,
      "loss": 0.0419,
      "num_tokens": 1387629020.0,
      "reward": 0.080078125,
      "reward_std": 0.12713736295700073,
      "rewards/accuracy_reward/mean": 0.080078125,
      "rewards/accuracy_reward/std": 0.271679550409317,
      "step": 1431
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1972.0,
      "completions/mean_length": 1793.626953125,
      "completions/mean_terminated_length": 795.7019653320312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4888623367756252,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 32.05358850380085,
      "learning_rate": 6.460234270745645e-07,
      "loss": -0.0004,
      "num_tokens": 1388632333.0,
      "reward": 0.01953125,
      "reward_std": 0.03176242858171463,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 1432
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1776.0,
      "completions/mean_length": 1640.390625,
      "completions/mean_terminated_length": 727.1392822265625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.48920372108901594,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 17.148932696841907,
      "learning_rate": 6.454995986767193e-07,
      "loss": 0.018,
      "num_tokens": 1389546533.0,
      "reward": 0.060546875,
      "reward_std": 0.09793071448802948,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 1433
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1918.0,
      "completions/mean_length": 1721.84765625,
      "completions/mean_terminated_length": 743.390625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.48954510540240675,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 17.835921685013446,
      "learning_rate": 6.44975634734661e-07,
      "loss": 0.0178,
      "num_tokens": 1390504583.0,
      "reward": 0.046875,
      "reward_std": 0.08698301762342453,
      "rewards/accuracy_reward/mean": 0.04838709533214569,
      "rewards/accuracy_reward/std": 0.21479946374893188,
      "step": 1434
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.732421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1912.0,
      "completions/mean_length": 1708.755859375,
      "completions/mean_terminated_length": 780.1678466796875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.48988648971579757,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 42.92018877470073,
      "learning_rate": 6.444515359920605e-07,
      "loss": 0.0059,
      "num_tokens": 1391455370.0,
      "reward": 0.078125,
      "reward_std": 0.11834745109081268,
      "rewards/accuracy_reward/mean": 0.078125,
      "rewards/accuracy_reward/std": 0.26863065361976624,
      "step": 1435
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1893.0,
      "completions/mean_length": 1779.349609375,
      "completions/mean_terminated_length": 774.8981323242188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4902278740291884,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 108.56796798241987,
      "learning_rate": 6.439273031927801e-07,
      "loss": 0.0044,
      "num_tokens": 1392463069.0,
      "reward": 0.064453125,
      "reward_std": 0.09138096868991852,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 1436
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2014.0,
      "completions/mean_length": 1674.857421875,
      "completions/mean_terminated_length": 683.3643188476562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.49056925834257914,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 6.512244363605728,
      "learning_rate": 6.434029370808722e-07,
      "loss": 0.014,
      "num_tokens": 1393400100.0,
      "reward": 0.033203125,
      "reward_std": 0.044359706342220306,
      "rewards/accuracy_reward/mean": 0.03427419438958168,
      "rewards/accuracy_reward/std": 0.18211629986763,
      "step": 1437
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1987.0,
      "completions/mean_length": 1735.0703125,
      "completions/mean_terminated_length": 690.203369140625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.49091064265596995,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 9.479888934581439,
      "learning_rate": 6.428784384005789e-07,
      "loss": 0.0111,
      "num_tokens": 1394365256.0,
      "reward": 0.05859375,
      "reward_std": 0.09381677210330963,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 1438
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1725.0,
      "completions/mean_length": 1743.16796875,
      "completions/mean_terminated_length": 702.5344848632812,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.49125202696936077,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 10.494916467439083,
      "learning_rate": 6.423538078963299e-07,
      "loss": 0.0197,
      "num_tokens": 1395340846.0,
      "reward": 0.1015625,
      "reward_std": 0.09666463732719421,
      "rewards/accuracy_reward/mean": 0.1015625,
      "rewards/accuracy_reward/std": 0.30236753821372986,
      "step": 1439
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2004.0,
      "completions/mean_length": 1679.01171875,
      "completions/mean_terminated_length": 754.013671875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4915934112827516,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 36.713779456463264,
      "learning_rate": 6.418290463127423e-07,
      "loss": -0.0023,
      "num_tokens": 1396283268.0,
      "reward": 0.021484375,
      "reward_std": 0.036420635879039764,
      "rewards/accuracy_reward/mean": 0.02217741869390011,
      "rewards/accuracy_reward/std": 0.14740893244743347,
      "step": 1440
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.705078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1993.0,
      "completions/mean_length": 1657.201171875,
      "completions/mean_terminated_length": 723.6224975585938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.49193479559614234,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 108.78665438146012,
      "learning_rate": 6.413041543946192e-07,
      "loss": 0.0168,
      "num_tokens": 1397213515.0,
      "reward": 0.068359375,
      "reward_std": 0.1284080445766449,
      "rewards/accuracy_reward/mean": 0.068359375,
      "rewards/accuracy_reward/std": 0.25260838866233826,
      "step": 1441
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.73828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1719.85546875,
      "completions/mean_terminated_length": 794.1940307617188,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.49227617990953315,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 55.40885216284095,
      "learning_rate": 6.407791328869488e-07,
      "loss": 0.0203,
      "num_tokens": 1398172673.0,
      "reward": 0.048828125,
      "reward_std": 0.07251107692718506,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 1442
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.724609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1960.0,
      "completions/mean_length": 1672.443359375,
      "completions/mean_terminated_length": 684.2765502929688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.49261756422292396,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 3.2712755058134024,
      "learning_rate": 6.402539825349032e-07,
      "loss": 0.0227,
      "num_tokens": 1399101764.0,
      "reward": 0.099609375,
      "reward_std": 0.10931616276502609,
      "rewards/accuracy_reward/mean": 0.099609375,
      "rewards/accuracy_reward/std": 0.29977133870124817,
      "step": 1443
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.681640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2001.0,
      "completions/mean_length": 1612.3515625,
      "completions/mean_terminated_length": 679.5828247070312,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.4929589485363148,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 15.020980516220831,
      "learning_rate": 6.397287040838367e-07,
      "loss": 0.008,
      "num_tokens": 1399999704.0,
      "reward": 0.052734375,
      "reward_std": 0.06260748207569122,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 1444
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2007.0,
      "completions/mean_length": 1636.884765625,
      "completions/mean_terminated_length": 732.4312744140625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.49330033284970554,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 15.35873829387781,
      "learning_rate": 6.392032982792865e-07,
      "loss": 0.0268,
      "num_tokens": 1400922413.0,
      "reward": 0.0859375,
      "reward_std": 0.09876786172389984,
      "rewards/accuracy_reward/mean": 0.0859375,
      "rewards/accuracy_reward/std": 0.28054583072662354,
      "step": 1445
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.759765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1997.0,
      "completions/mean_length": 1739.453125,
      "completions/mean_terminated_length": 763.6422729492188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.49364171716309635,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 13.10752064545794,
      "learning_rate": 6.386777658669698e-07,
      "loss": 0.0165,
      "num_tokens": 1401887189.0,
      "reward": 0.072265625,
      "reward_std": 0.1141214370727539,
      "rewards/accuracy_reward/mean": 0.07459677755832672,
      "rewards/accuracy_reward/std": 0.263004869222641,
      "step": 1446
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.701171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1984.0,
      "completions/mean_length": 1684.115234375,
      "completions/mean_terminated_length": 830.2941284179688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.49398310147648716,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 9.225184352557372,
      "learning_rate": 6.38152107592784e-07,
      "loss": 0.0172,
      "num_tokens": 1402824896.0,
      "reward": 0.03515625,
      "reward_std": 0.08120594918727875,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 1447
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.822265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1610.0,
      "completions/mean_length": 1773.232421875,
      "completions/mean_terminated_length": 502.0549621582031,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.494324485789878,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 1.0264458006639359,
      "learning_rate": 6.376263242028048e-07,
      "loss": 0.0236,
      "num_tokens": 1403816023.0,
      "reward": 0.009765625,
      "reward_std": 0.028222277760505676,
      "rewards/accuracy_reward/mean": 0.009765625,
      "rewards/accuracy_reward/std": 0.09843364357948303,
      "step": 1448
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.744140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1717.0,
      "completions/mean_length": 1688.185546875,
      "completions/mean_terminated_length": 641.7022705078125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.49466587010326873,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 2.203802785136163,
      "learning_rate": 6.371004164432853e-07,
      "loss": 0.031,
      "num_tokens": 1404751926.0,
      "reward": 0.033203125,
      "reward_std": 0.08082009106874466,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 1449
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1728.521484375,
      "completions/mean_terminated_length": 749.8016357421875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.49500725441665955,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 20.35667558109254,
      "learning_rate": 6.365743850606555e-07,
      "loss": 0.0044,
      "num_tokens": 1405715121.0,
      "reward": 0.05859375,
      "reward_std": 0.08302421122789383,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 1450
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.767578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1726.2265625,
      "completions/mean_terminated_length": 663.5630493164062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.49534863873005036,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 21.13413887002305,
      "learning_rate": 6.360482308015209e-07,
      "loss": 0.0173,
      "num_tokens": 1406685861.0,
      "reward": 0.03125,
      "reward_std": 0.07053204625844955,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 1451
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.818359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2008.0,
      "completions/mean_length": 1806.939453125,
      "completions/mean_terminated_length": 720.8709716796875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.4956900230434412,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 7.551419800574755,
      "learning_rate": 6.35521954412661e-07,
      "loss": -0.0036,
      "num_tokens": 1407691030.0,
      "reward": 0.041015625,
      "reward_std": 0.06183479726314545,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 1452
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.775390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2019.0,
      "completions/mean_length": 1770.41015625,
      "completions/mean_terminated_length": 812.5303955078125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.49603140735683193,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 14.268487379277527,
      "learning_rate": 6.34995556641029e-07,
      "loss": 0.0111,
      "num_tokens": 1408676872.0,
      "reward": 0.080078125,
      "reward_std": 0.09841620922088623,
      "rewards/accuracy_reward/mean": 0.080078125,
      "rewards/accuracy_reward/std": 0.271679550409317,
      "step": 1453
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.826171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1492.0,
      "completions/mean_length": 1790.708984375,
      "completions/mean_terminated_length": 567.8539428710938,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.49637279167022275,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 1.394134018707034,
      "learning_rate": 6.344690382337503e-07,
      "loss": 0.0137,
      "num_tokens": 1409670659.0,
      "reward": 0.0234375,
      "reward_std": 0.053855981677770615,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 1454
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.767578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1736.0,
      "completions/mean_length": 1723.455078125,
      "completions/mean_terminated_length": 651.638671875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.49671417598361356,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 16.69859558472023,
      "learning_rate": 6.339423999381216e-07,
      "loss": -0.0152,
      "num_tokens": 1410628156.0,
      "reward": 0.08203125,
      "reward_std": 0.09347065538167953,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 1455
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1952.0,
      "completions/mean_length": 1789.16796875,
      "completions/mean_terminated_length": 820.9444580078125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.4970555602970044,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 5.931127258141842,
      "learning_rate": 6.334156425016091e-07,
      "loss": 0.0168,
      "num_tokens": 1411620306.0,
      "reward": 0.044921875,
      "reward_std": 0.07481793314218521,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 1456
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.81640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1753.0,
      "completions/mean_length": 1799.6171875,
      "completions/mean_terminated_length": 695.1063842773438,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.49739694461039513,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 9.026343701837094,
      "learning_rate": 6.328887666718493e-07,
      "loss": 0.0252,
      "num_tokens": 1412629998.0,
      "reward": 0.08984375,
      "reward_std": 0.10332095623016357,
      "rewards/accuracy_reward/mean": 0.08984375,
      "rewards/accuracy_reward/std": 0.2862374484539032,
      "step": 1457
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.81640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1888.0,
      "completions/mean_length": 1795.142578125,
      "completions/mean_terminated_length": 670.7340087890625,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.49773832892378594,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 5.745747347960597,
      "learning_rate": 6.323617731966456e-07,
      "loss": 0.0267,
      "num_tokens": 1413638855.0,
      "reward": 0.0390625,
      "reward_std": 0.09347065538167953,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 1458
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1855.0,
      "completions/mean_length": 1746.087890625,
      "completions/mean_terminated_length": 715.4224243164062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.49807971323717676,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 30.858639232910065,
      "learning_rate": 6.318346628239691e-07,
      "loss": -0.001,
      "num_tokens": 1414616996.0,
      "reward": 0.041015625,
      "reward_std": 0.06502877175807953,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 1459
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.802734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 1785.61328125,
      "completions/mean_terminated_length": 717.8811645507812,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.49842109755056757,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 17.100907583510008,
      "learning_rate": 6.313074363019565e-07,
      "loss": 0.0109,
      "num_tokens": 1415603918.0,
      "reward": 0.029296875,
      "reward_std": 0.0714043527841568,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 1460
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.787109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1963.0,
      "completions/mean_length": 1756.369140625,
      "completions/mean_terminated_length": 678.1375732421875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.49876248186395833,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 1.96407771801503,
      "learning_rate": 6.307800943789093e-07,
      "loss": 0.0021,
      "num_tokens": 1416577371.0,
      "reward": 0.0234375,
      "reward_std": 0.03839729726314545,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 1461
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1751.78515625,
      "completions/mean_terminated_length": 762.7288208007812,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.49910386617734914,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 6.895456310998595,
      "learning_rate": 6.302526378032931e-07,
      "loss": 0.0151,
      "num_tokens": 1417560893.0,
      "reward": 0.0703125,
      "reward_std": 0.06310540437698364,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 1462
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.806640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1963.0,
      "completions/mean_length": 1773.31640625,
      "completions/mean_terminated_length": 627.4141235351562,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.49944525049073996,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 3.6583850309125374,
      "learning_rate": 6.29725067323736e-07,
      "loss": 0.0258,
      "num_tokens": 1418546671.0,
      "reward": 0.0390625,
      "reward_std": 0.07588253915309906,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 1463
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1850.0,
      "completions/mean_length": 1803.166015625,
      "completions/mean_terminated_length": 623.5113525390625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.49978663480413077,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 2.2066377516828233,
      "learning_rate": 6.291973836890276e-07,
      "loss": 0.0207,
      "num_tokens": 1419548292.0,
      "reward": 0.01953125,
      "reward_std": 0.046875,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 1464
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1910.0,
      "completions/mean_length": 1623.7890625,
      "completions/mean_terminated_length": 539.6944580078125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5001280191175216,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 9.023797796566203,
      "learning_rate": 6.286695876481185e-07,
      "loss": 0.0093,
      "num_tokens": 1420460632.0,
      "reward": 0.044921875,
      "reward_std": 0.049345001578330994,
      "rewards/accuracy_reward/mean": 0.0463709682226181,
      "rewards/accuracy_reward/std": 0.21049949526786804,
      "step": 1465
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1935.0,
      "completions/mean_length": 1836.0078125,
      "completions/mean_terminated_length": 540.5,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5004694034309124,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 1.3077878022213638,
      "learning_rate": 6.281416799501187e-07,
      "loss": 0.0046,
      "num_tokens": 1421476972.0,
      "reward": 0.021484375,
      "reward_std": 0.04604348540306091,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 1466
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.80859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1992.0,
      "completions/mean_length": 1769.732421875,
      "completions/mean_terminated_length": 594.69384765625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5008107877443031,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 1.0235193741171722,
      "learning_rate": 6.276136613442964e-07,
      "loss": 0.0187,
      "num_tokens": 1422461539.0,
      "reward": 0.01953125,
      "reward_std": 0.04412011057138443,
      "rewards/accuracy_reward/mean": 0.02016128972172737,
      "rewards/accuracy_reward/std": 0.14069372415542603,
      "step": 1467
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.779296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1972.0,
      "completions/mean_length": 1747.501953125,
      "completions/mean_terminated_length": 686.4513549804688,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.5011521720576939,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 1.9995252362628586,
      "learning_rate": 6.270855325800775e-07,
      "loss": 0.0238,
      "num_tokens": 1423428404.0,
      "reward": 0.03515625,
      "reward_std": 0.06574726849794388,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 1468
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.794921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2022.0,
      "completions/mean_length": 1763.59375,
      "completions/mean_terminated_length": 661.1809692382812,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5014935563710847,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 3.6900170609047698,
      "learning_rate": 6.265572944070444e-07,
      "loss": 0.025,
      "num_tokens": 1424400212.0,
      "reward": 0.04296875,
      "reward_std": 0.06964729726314545,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 1469
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1792.0,
      "completions/mean_length": 1720.244140625,
      "completions/mean_terminated_length": 625.8728637695312,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5018349406844755,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 19.56069146226166,
      "learning_rate": 6.260289475749344e-07,
      "loss": 0.0393,
      "num_tokens": 1425351121.0,
      "reward": 0.0625,
      "reward_std": 0.12290477007627487,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 1470
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1946.0,
      "completions/mean_length": 1803.080078125,
      "completions/mean_terminated_length": 623.0113525390625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5021763249978664,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 24.907489701801232,
      "learning_rate": 6.255004928336391e-07,
      "loss": 0.026,
      "num_tokens": 1426356042.0,
      "reward": 0.048828125,
      "reward_std": 0.08207826316356659,
      "rewards/accuracy_reward/mean": 0.05040322616696358,
      "rewards/accuracy_reward/std": 0.21899642050266266,
      "step": 1471
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1877.0,
      "completions/mean_length": 1815.953125,
      "completions/mean_terminated_length": 697.9091186523438,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5025177093112572,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 24.951696580953566,
      "learning_rate": 6.249719309332036e-07,
      "loss": 0.0192,
      "num_tokens": 1427361810.0,
      "reward": 0.03515625,
      "reward_std": 0.06708566844463348,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 1472
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.77734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1978.0,
      "completions/mean_length": 1741.904296875,
      "completions/mean_terminated_length": 673.25439453125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.502859093624648,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 28.344764999645086,
      "learning_rate": 6.244432626238245e-07,
      "loss": 0.019,
      "num_tokens": 1428327009.0,
      "reward": 0.052734375,
      "reward_std": 0.08890542387962341,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 1473
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.81640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1980.0,
      "completions/mean_length": 1814.224609375,
      "completions/mean_terminated_length": 775.223388671875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5032004779380388,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 4.979403647978972,
      "learning_rate": 6.239144886558501e-07,
      "loss": 0.0169,
      "num_tokens": 1429339860.0,
      "reward": 0.021484375,
      "reward_std": 0.04478531330823898,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 1474
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.83984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1968.0,
      "completions/mean_length": 1825.65625,
      "completions/mean_terminated_length": 660.3658447265625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.5035418622514295,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 20.798090517873316,
      "learning_rate": 6.23385609779778e-07,
      "loss": 0.006,
      "num_tokens": 1430362260.0,
      "reward": 0.021484375,
      "reward_std": 0.03462383896112442,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 1475
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1889.0,
      "completions/mean_length": 1734.498046875,
      "completions/mean_terminated_length": 664.2672119140625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5038832465648203,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 2.7091619444099373,
      "learning_rate": 6.228566267462555e-07,
      "loss": 0.0233,
      "num_tokens": 1431330339.0,
      "reward": 0.01953125,
      "reward_std": 0.05001020431518555,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 1476
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1641.0,
      "completions/mean_length": 1712.169921875,
      "completions/mean_terminated_length": 455.9166564941406,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5042246308782111,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 4.908469758973546,
      "learning_rate": 6.22327540306077e-07,
      "loss": 0.0259,
      "num_tokens": 1432283674.0,
      "reward": 0.041015625,
      "reward_std": 0.07250870764255524,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 1477
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1959.0,
      "completions/mean_length": 1747.681640625,
      "completions/mean_terminated_length": 624.2684936523438,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5045660151916019,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 45.99432201155855,
      "learning_rate": 6.217983512101838e-07,
      "loss": 0.0097,
      "num_tokens": 1433254551.0,
      "reward": 0.05859375,
      "reward_std": 0.07172322273254395,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 1478
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1636.0,
      "completions/mean_length": 1772.494140625,
      "completions/mean_terminated_length": 578.6354370117188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5049073995049927,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 11.924320122773592,
      "learning_rate": 6.212690602096631e-07,
      "loss": 0.012,
      "num_tokens": 1434240052.0,
      "reward": 0.009765625,
      "reward_std": 0.03411140665411949,
      "rewards/accuracy_reward/mean": 0.009765625,
      "rewards/accuracy_reward/std": 0.09843364357948303,
      "step": 1479
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.84765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1824.0,
      "completions/mean_length": 1810.361328125,
      "completions/mean_terminated_length": 488.1153869628906,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5052487838183836,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 23.956286241026543,
      "learning_rate": 6.207396680557468e-07,
      "loss": 0.0097,
      "num_tokens": 1435245213.0,
      "reward": 0.05078125,
      "reward_std": 0.0908287987112999,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 1480
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.814453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1982.0,
      "completions/mean_length": 1790.21875,
      "completions/mean_terminated_length": 658.6947631835938,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.5055901681317744,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 6.9881575822923265,
      "learning_rate": 6.202101754998101e-07,
      "loss": 0.0029,
      "num_tokens": 1436236381.0,
      "reward": 0.01953125,
      "reward_std": 0.04318207502365112,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 1481
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.85546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1959.0,
      "completions/mean_length": 1857.58984375,
      "completions/mean_terminated_length": 731.3513793945312,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.5059315524451652,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 10.524346440754835,
      "learning_rate": 6.196805832933709e-07,
      "loss": 0.0088,
      "num_tokens": 1437260347.0,
      "reward": 0.025390625,
      "reward_std": 0.05024883896112442,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 1482
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.775390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1958.0,
      "completions/mean_length": 1720.111328125,
      "completions/mean_terminated_length": 588.1825561523438,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5062729367585559,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 13.445431303278404,
      "learning_rate": 6.191508921880886e-07,
      "loss": 0.0165,
      "num_tokens": 1438223188.0,
      "reward": 0.052734375,
      "reward_std": 0.07242262363433838,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 1483
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.810546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1996.0,
      "completions/mean_length": 1773.427734375,
      "completions/mean_terminated_length": 598.7113037109375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5066143210719467,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 4.712775876981828,
      "learning_rate": 6.186211029357625e-07,
      "loss": 0.0081,
      "num_tokens": 1439205807.0,
      "reward": 0.021484375,
      "reward_std": 0.059305962175130844,
      "rewards/accuracy_reward/mean": 0.02217741869390011,
      "rewards/accuracy_reward/std": 0.14740893244743347,
      "step": 1484
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.80078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1979.0,
      "completions/mean_length": 1750.32421875,
      "completions/mean_terminated_length": 553.7843627929688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5069557053853375,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 9.468026306558327,
      "learning_rate": 6.180912162883318e-07,
      "loss": 0.0103,
      "num_tokens": 1440181397.0,
      "reward": 0.013671875,
      "reward_std": 0.03889618441462517,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "step": 1485
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.826171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1894.0,
      "completions/mean_length": 1803.833984375,
      "completions/mean_terminated_length": 643.8651733398438,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5072970896987283,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 7.155338504590691,
      "learning_rate": 6.175612329978737e-07,
      "loss": 0.0031,
      "num_tokens": 1441193168.0,
      "reward": 0.0078125,
      "reward_std": 0.02629890665411949,
      "rewards/accuracy_reward/mean": 0.0078125,
      "rewards/accuracy_reward/std": 0.08812850713729858,
      "step": 1486
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.84375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1974.0,
      "completions/mean_length": 1845.048828125,
      "completions/mean_terminated_length": 749.1124877929688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5076384740121191,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 10.896039848112984,
      "learning_rate": 6.170311538166026e-07,
      "loss": 0.0109,
      "num_tokens": 1442212505.0,
      "reward": 0.0078125,
      "reward_std": 0.02629890665411949,
      "rewards/accuracy_reward/mean": 0.008064515888690948,
      "rewards/accuracy_reward/std": 0.0895301103591919,
      "step": 1487
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.830078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1791.0,
      "completions/mean_length": 1805.283203125,
      "completions/mean_terminated_length": 619.5977172851562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.50797985832551,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 1.7314020114068187,
      "learning_rate": 6.165009794968687e-07,
      "loss": 0.02,
      "num_tokens": 1443215466.0,
      "reward": 0.029296875,
      "reward_std": 0.055033616721630096,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 1488
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1828.0,
      "completions/mean_length": 1690.1484375,
      "completions/mean_terminated_length": 495.28814697265625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5083212426389008,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 25.36217204271973,
      "learning_rate": 6.159707107911575e-07,
      "loss": 0.0358,
      "num_tokens": 1444159238.0,
      "reward": 0.072265625,
      "reward_std": 0.08885213732719421,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 1489
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.814453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1928.0,
      "completions/mean_length": 1778.5078125,
      "completions/mean_terminated_length": 595.5789794921875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5086626269522916,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 72.49000634756864,
      "learning_rate": 6.154403484520887e-07,
      "loss": 0.0332,
      "num_tokens": 1445145994.0,
      "reward": 0.0625,
      "reward_std": 0.10548296570777893,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 1490
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 1791.0,
      "completions/mean_terminated_length": 617.7391357421875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5090040112656823,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 1.7702183944556926,
      "learning_rate": 6.149098932324145e-07,
      "loss": 0.0085,
      "num_tokens": 1446135658.0,
      "reward": 0.037109375,
      "reward_std": 0.05440815910696983,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 1491
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.794921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1857.0,
      "completions/mean_length": 1746.35546875,
      "completions/mean_terminated_length": 577.1238403320312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5093453955790731,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 9.448918934115191,
      "learning_rate": 6.143793458850188e-07,
      "loss": 0.019,
      "num_tokens": 1447101248.0,
      "reward": 0.05078125,
      "reward_std": 0.07419876754283905,
      "rewards/accuracy_reward/mean": 0.052419353276491165,
      "rewards/accuracy_reward/std": 0.22309619188308716,
      "step": 1492
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.779296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1778.0,
      "completions/mean_length": 1721.865234375,
      "completions/mean_terminated_length": 570.2920532226562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5096867798924639,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 11.820345441438167,
      "learning_rate": 6.13848707162917e-07,
      "loss": -0.0043,
      "num_tokens": 1448057403.0,
      "reward": 0.021484375,
      "reward_std": 0.05287160724401474,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 1493
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1696.0,
      "completions/mean_length": 1750.171875,
      "completions/mean_terminated_length": 636.0740966796875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5100281642058547,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.21406744333176747,
      "learning_rate": 6.133179778192533e-07,
      "loss": 0.0213,
      "num_tokens": 1449020163.0,
      "reward": 0.017578125,
      "reward_std": 0.043135739862918854,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "step": 1494
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.78515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2025.0,
      "completions/mean_length": 1745.009765625,
      "completions/mean_terminated_length": 637.7181396484375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5103695485192455,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 24.213039170025468,
      "learning_rate": 6.127871586073012e-07,
      "loss": 0.0192,
      "num_tokens": 1449994952.0,
      "reward": 0.048828125,
      "reward_std": 0.09331225603818893,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 1495
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.841796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1712.0,
      "completions/mean_length": 1810.13671875,
      "completions/mean_terminated_length": 544.4691162109375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5107109328326364,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 18.005339121716144,
      "learning_rate": 6.122562502804614e-07,
      "loss": -0.0027,
      "num_tokens": 1450989902.0,
      "reward": 0.03515625,
      "reward_std": 0.03944835811853409,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 1496
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.810546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1776.0,
      "completions/mean_length": 1784.84765625,
      "completions/mean_terminated_length": 658.9896850585938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5110523171460272,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 23.81298234538911,
      "learning_rate": 6.117252535922611e-07,
      "loss": -0.0056,
      "num_tokens": 1451980448.0,
      "reward": 0.05859375,
      "reward_std": 0.08544550836086273,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 1497
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.78515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1994.0,
      "completions/mean_length": 1731.97265625,
      "completions/mean_terminated_length": 577.0363159179688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.511393701459418,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 10.913167025446416,
      "learning_rate": 6.111941692963531e-07,
      "loss": 0.0008,
      "num_tokens": 1452951922.0,
      "reward": 0.046875,
      "reward_std": 0.10073335468769073,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 1498
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.80078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1758.0,
      "completions/mean_length": 1758.140625,
      "completions/mean_terminated_length": 593.0196533203125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5117350857728087,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 10.485520888574328,
      "learning_rate": 6.106629981465142e-07,
      "loss": 0.0067,
      "num_tokens": 1453929930.0,
      "reward": 0.033203125,
      "reward_std": 0.07686128467321396,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 1499
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1928.0,
      "completions/mean_length": 1707.71484375,
      "completions/mean_terminated_length": 546.0516967773438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5120764700861995,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 11.499326610780647,
      "learning_rate": 6.101317408966451e-07,
      "loss": 0.0276,
      "num_tokens": 1454880344.0,
      "reward": 0.04296875,
      "reward_std": 0.08160631358623505,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 1500
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.814453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1624.0,
      "completions/mean_length": 1742.603515625,
      "completions/mean_terminated_length": 402.0736999511719,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5124178543995903,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 25.202970290657618,
      "learning_rate": 6.096003983007679e-07,
      "loss": 0.0021,
      "num_tokens": 1455846397.0,
      "reward": 0.0234375,
      "reward_std": 0.05259781330823898,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 1501
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.77734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2001.0,
      "completions/mean_length": 1725.25,
      "completions/mean_terminated_length": 598.4561157226562,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5127592387129811,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 5.899147768730763,
      "learning_rate": 6.090689711130263e-07,
      "loss": 0.013,
      "num_tokens": 1456804029.0,
      "reward": 0.033203125,
      "reward_std": 0.062062256038188934,
      "rewards/accuracy_reward/mean": 0.03427419438958168,
      "rewards/accuracy_reward/std": 0.18211629986763,
      "step": 1502
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.802734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1970.0,
      "completions/mean_length": 1783.8203125,
      "completions/mean_terminated_length": 708.7920532226562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5131006230263719,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 6.990933728300156,
      "learning_rate": 6.085374600876842e-07,
      "loss": 0.024,
      "num_tokens": 1457791553.0,
      "reward": 0.0390625,
      "reward_std": 0.0792168527841568,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 1503
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1988.0,
      "completions/mean_length": 1796.939453125,
      "completions/mean_terminated_length": 650.7935180664062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5134420073397628,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 4.335415013515333,
      "learning_rate": 6.080058659791241e-07,
      "loss": 0.0101,
      "num_tokens": 1458787106.0,
      "reward": 0.0234375,
      "reward_std": 0.05259781330823898,
      "rewards/accuracy_reward/mean": 0.024193547666072845,
      "rewards/accuracy_reward/std": 0.15380479395389557,
      "step": 1504
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.80859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2012.0,
      "completions/mean_length": 1761.2890625,
      "completions/mean_terminated_length": 550.0816040039062,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.5137833916531536,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 15.05040773508169,
      "learning_rate": 6.074741895418466e-07,
      "loss": -0.0007,
      "num_tokens": 1459769302.0,
      "reward": 0.025390625,
      "reward_std": 0.05859442427754402,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 1505
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.78125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2025.0,
      "completions/mean_length": 1758.458984375,
      "completions/mean_terminated_length": 724.3839721679688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5141247759665444,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 2.6576230895865556,
      "learning_rate": 6.069424315304693e-07,
      "loss": 0.0206,
      "num_tokens": 1460744817.0,
      "reward": 0.025390625,
      "reward_std": 0.04726085811853409,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 1506
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.806640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1967.0,
      "completions/mean_length": 1791.673828125,
      "completions/mean_terminated_length": 722.353515625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5144661602799352,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 6.69011436276421,
      "learning_rate": 6.064105926997251e-07,
      "loss": 0.042,
      "num_tokens": 1461730874.0,
      "reward": 0.03125,
      "reward_std": 0.09341736882925034,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 1507
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2023.0,
      "completions/mean_length": 1728.17578125,
      "completions/mean_terminated_length": 636.362060546875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5148075445933259,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 19.958448611887732,
      "learning_rate": 6.058786738044626e-07,
      "loss": 0.0099,
      "num_tokens": 1462694788.0,
      "reward": 0.064453125,
      "reward_std": 0.08452101051807404,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 1508
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1925.0,
      "completions/mean_length": 1713.025390625,
      "completions/mean_terminated_length": 708.1015625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5151489289067167,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 4.7108511684195555,
      "learning_rate": 6.053466755996427e-07,
      "loss": 0.0127,
      "num_tokens": 1463644881.0,
      "reward": 0.01953125,
      "reward_std": 0.04505910724401474,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 1509
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1797.0,
      "completions/mean_length": 1731.78515625,
      "completions/mean_terminated_length": 742.3386840820312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5154903132201075,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 6.699401477668564,
      "learning_rate": 6.0481459884034e-07,
      "loss": 0.0338,
      "num_tokens": 1464602947.0,
      "reward": 0.05078125,
      "reward_std": 0.08214398473501205,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 1510
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.771484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2028.0,
      "completions/mean_length": 1737.35546875,
      "completions/mean_terminated_length": 689.0684204101562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5158316975334983,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 2.1213390779529826,
      "learning_rate": 6.042824442817399e-07,
      "loss": 0.022,
      "num_tokens": 1465566377.0,
      "reward": 0.0234375,
      "reward_std": 0.03839729726314545,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 1511
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.732421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1924.0,
      "completions/mean_length": 1666.064453125,
      "completions/mean_terminated_length": 620.6204223632812,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5161730818468891,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 7.951404915232107,
      "learning_rate": 6.037502126791386e-07,
      "loss": 0.0044,
      "num_tokens": 1466493706.0,
      "reward": 0.029296875,
      "reward_std": 0.06403100490570068,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 1512
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.783203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1904.0,
      "completions/mean_length": 1731.083984375,
      "completions/mean_terminated_length": 586.189208984375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.51651446616028,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 7.2153715880723635,
      "learning_rate": 6.032179047879413e-07,
      "loss": 0.0289,
      "num_tokens": 1467460949.0,
      "reward": 0.072265625,
      "reward_std": 0.07603531330823898,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 1513
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.736328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2023.0,
      "completions/mean_length": 1671.61328125,
      "completions/mean_terminated_length": 620.5184936523438,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.5168558504736708,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 5.489564859370835,
      "learning_rate": 6.026855213636619e-07,
      "loss": 0.0211,
      "num_tokens": 1468388479.0,
      "reward": 0.0546875,
      "reward_std": 0.08471457660198212,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 1514
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1893.0,
      "completions/mean_length": 1728.66796875,
      "completions/mean_terminated_length": 809.3787841796875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5171972347870616,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 12.939105171186295,
      "learning_rate": 6.021530631619213e-07,
      "loss": 0.0116,
      "num_tokens": 1469348149.0,
      "reward": 0.06640625,
      "reward_std": 0.11206454783678055,
      "rewards/accuracy_reward/mean": 0.06854838877916336,
      "rewards/accuracy_reward/std": 0.25293973088264465,
      "step": 1515
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.775390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1934.0,
      "completions/mean_length": 1725.140625,
      "completions/mean_terminated_length": 610.5739135742188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5175386191004523,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 6.6046260277771465,
      "learning_rate": 6.016205309384466e-07,
      "loss": -0.0042,
      "num_tokens": 1470311725.0,
      "reward": 0.06640625,
      "reward_std": 0.12256665527820587,
      "rewards/accuracy_reward/mean": 0.06854838877916336,
      "rewards/accuracy_reward/std": 0.25293970108032227,
      "step": 1516
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1925.0,
      "completions/mean_length": 1565.03125,
      "completions/mean_terminated_length": 658.7864990234375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5178800034138431,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 3.9368554451970024,
      "learning_rate": 6.010879254490695e-07,
      "loss": 0.0047,
      "num_tokens": 1471189677.0,
      "reward": 0.041015625,
      "reward_std": 0.09078246355056763,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 1517
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1884.0,
      "completions/mean_length": 1664.09375,
      "completions/mean_terminated_length": 683.0,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5182213877272339,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 1.8215671984381185,
      "learning_rate": 6.005552474497264e-07,
      "loss": 0.0111,
      "num_tokens": 1472119613.0,
      "reward": 0.03515625,
      "reward_std": 0.05914659798145294,
      "rewards/accuracy_reward/mean": 0.03629032149910927,
      "rewards/accuracy_reward/std": 0.1872003972530365,
      "step": 1518
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.73046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1863.0,
      "completions/mean_length": 1675.1640625,
      "completions/mean_terminated_length": 664.7246704101562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5185627720406247,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 10.423043368875431,
      "learning_rate": 6.000224976964563e-07,
      "loss": 0.0334,
      "num_tokens": 1473059905.0,
      "reward": 0.048828125,
      "reward_std": 0.10188952833414078,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 1519
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1952.0,
      "completions/mean_length": 1701.271484375,
      "completions/mean_terminated_length": 703.1136474609375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5189041563540155,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 5.092466921206046,
      "learning_rate": 5.994896769453999e-07,
      "loss": 0.0015,
      "num_tokens": 1474008780.0,
      "reward": 0.03515625,
      "reward_std": 0.0479668527841568,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 1520
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.708984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1939.0,
      "completions/mean_length": 1649.09765625,
      "completions/mean_terminated_length": 677.275146484375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.5192455406674064,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 2.9219695259100713,
      "learning_rate": 5.989567859527988e-07,
      "loss": 0.0188,
      "num_tokens": 1474930094.0,
      "reward": 0.01953125,
      "reward_std": 0.051493462175130844,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 1521
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.759765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1952.0,
      "completions/mean_length": 1717.294921875,
      "completions/mean_terminated_length": 671.406494140625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5195869249807972,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 3.668714818918523,
      "learning_rate": 5.984238254749946e-07,
      "loss": 0.0375,
      "num_tokens": 1475880965.0,
      "reward": 0.037109375,
      "reward_std": 0.09149399399757385,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 1522
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.744140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1674.765625,
      "completions/mean_terminated_length": 589.2518920898438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.519928309294188,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 3.191692323343434,
      "learning_rate": 5.978907962684267e-07,
      "loss": 0.0355,
      "num_tokens": 1476817229.0,
      "reward": 0.087890625,
      "reward_std": 0.09809703379869461,
      "rewards/accuracy_reward/mean": 0.087890625,
      "rewards/accuracy_reward/std": 0.2834126651287079,
      "step": 1523
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1983.0,
      "completions/mean_length": 1688.689453125,
      "completions/mean_terminated_length": 695.3014526367188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5202696936075787,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 16.394918768435236,
      "learning_rate": 5.973576990896331e-07,
      "loss": 0.0098,
      "num_tokens": 1477751838.0,
      "reward": 0.0234375,
      "reward_std": 0.05667105317115784,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 1524
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1942.0,
      "completions/mean_length": 1738.240234375,
      "completions/mean_terminated_length": 579.50927734375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5206110779209695,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 20.843726607976432,
      "learning_rate": 5.968245346952473e-07,
      "loss": 0.0041,
      "num_tokens": 1478720553.0,
      "reward": 0.037109375,
      "reward_std": 0.07355976104736328,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 1525
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1924.0,
      "completions/mean_length": 1711.24609375,
      "completions/mean_terminated_length": 587.2626953125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5209524622343603,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 17.61273419512016,
      "learning_rate": 5.962913038419988e-07,
      "loss": -0.001,
      "num_tokens": 1479676855.0,
      "reward": 0.0625,
      "reward_std": 0.10161572694778442,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 1526
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.673828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1793.0,
      "completions/mean_length": 1603.716796875,
      "completions/mean_terminated_length": 685.88623046875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5212938465477511,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 5.049159518715146,
      "learning_rate": 5.957580072867113e-07,
      "loss": 0.0215,
      "num_tokens": 1480572582.0,
      "reward": 0.052734375,
      "reward_std": 0.08298446238040924,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 1527
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.685546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1824.0,
      "completions/mean_length": 1620.80859375,
      "completions/mean_terminated_length": 689.478271484375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.5216352308611419,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 5.5573006106581655,
      "learning_rate": 5.952246457863019e-07,
      "loss": 0.0206,
      "num_tokens": 1481475028.0,
      "reward": 0.041015625,
      "reward_std": 0.07285481691360474,
      "rewards/accuracy_reward/mean": 0.04233871027827263,
      "rewards/accuracy_reward/std": 0.2015640139579773,
      "step": 1528
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.728515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1820.0,
      "completions/mean_length": 1657.556640625,
      "completions/mean_terminated_length": 609.8201293945312,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5219766151745328,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 8.563085377122176,
      "learning_rate": 5.946912200977794e-07,
      "loss": 0.0133,
      "num_tokens": 1482406065.0,
      "reward": 0.060546875,
      "reward_std": 0.10453042387962341,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 1529
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.732421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1976.0,
      "completions/mean_length": 1669.744140625,
      "completions/mean_terminated_length": 634.3722534179688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5223179994879236,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 6.101719864674231,
      "learning_rate": 5.941577309782441e-07,
      "loss": 0.0118,
      "num_tokens": 1483341774.0,
      "reward": 0.0546875,
      "reward_std": 0.08493966609239578,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 1530
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.732421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1910.0,
      "completions/mean_length": 1682.529296875,
      "completions/mean_terminated_length": 682.1532592773438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5226593838013144,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 1.2187278258013974,
      "learning_rate": 5.936241791848863e-07,
      "loss": 0.0497,
      "num_tokens": 1484276509.0,
      "reward": 0.06640625,
      "reward_std": 0.12846125662326813,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "step": 1531
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1952.0,
      "completions/mean_length": 1661.78125,
      "completions/mean_terminated_length": 635.5428466796875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5230007681147051,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 11.92125611976677,
      "learning_rate": 5.930905654749848e-07,
      "loss": 0.0103,
      "num_tokens": 1485204893.0,
      "reward": 0.0390625,
      "reward_std": 0.08109388500452042,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 1532
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2014.0,
      "completions/mean_length": 1701.076171875,
      "completions/mean_terminated_length": 741.933837890625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5233421524280959,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 6.014843141083682,
      "learning_rate": 5.925568906059073e-07,
      "loss": 0.0373,
      "num_tokens": 1486150404.0,
      "reward": 0.0703125,
      "reward_std": 0.12145408242940903,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 1533
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.708984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1658.205078125,
      "completions/mean_terminated_length": 708.9395751953125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5236835367414867,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 10.044092412160285,
      "learning_rate": 5.920231553351073e-07,
      "loss": 0.0304,
      "num_tokens": 1487083309.0,
      "reward": 0.056640625,
      "reward_std": 0.12636469304561615,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 1534
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.74609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1968.0,
      "completions/mean_length": 1681.025390625,
      "completions/mean_terminated_length": 603.0076904296875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5240249210548775,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 21.288342537773524,
      "learning_rate": 5.914893604201244e-07,
      "loss": 0.0406,
      "num_tokens": 1488033226.0,
      "reward": 0.07421875,
      "reward_std": 0.11917342245578766,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "step": 1535
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.728515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1866.0,
      "completions/mean_length": 1669.74609375,
      "completions/mean_terminated_length": 654.7194213867188,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5243663053682683,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 2.162597788905748,
      "learning_rate": 5.909555066185829e-07,
      "loss": 0.0094,
      "num_tokens": 1488966712.0,
      "reward": 0.021484375,
      "reward_std": 0.07108421623706818,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 1536
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.666015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1591.837890625,
      "completions/mean_terminated_length": 682.1812744140625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5247076896816592,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 6.067570829059677,
      "learning_rate": 5.904215946881907e-07,
      "loss": 0.012,
      "num_tokens": 1489861173.0,
      "reward": 0.056640625,
      "reward_std": 0.08070706576108932,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 1537
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.724609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1989.0,
      "completions/mean_length": 1672.943359375,
      "completions/mean_terminated_length": 686.0921630859375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.52504907399505,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 6.2500102276916465,
      "learning_rate": 5.898876253867379e-07,
      "loss": 0.014,
      "num_tokens": 1490793224.0,
      "reward": 0.080078125,
      "reward_std": 0.12055579572916031,
      "rewards/accuracy_reward/mean": 0.080078125,
      "rewards/accuracy_reward/std": 0.271679550409317,
      "step": 1538
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1864.0,
      "completions/mean_length": 1692.986328125,
      "completions/mean_terminated_length": 749.664306640625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5253904583084408,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 6.400671788590852,
      "learning_rate": 5.893535994720965e-07,
      "loss": 0.0097,
      "num_tokens": 1491736145.0,
      "reward": 0.09375,
      "reward_std": 0.10563018918037415,
      "rewards/accuracy_reward/mean": 0.09375,
      "rewards/accuracy_reward/std": 0.29176566004753113,
      "step": 1539
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.73046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1817.0,
      "completions/mean_length": 1678.689453125,
      "completions/mean_terminated_length": 677.8043823242188,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.5257318426218315,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 9.811172391020396,
      "learning_rate": 5.888195177022185e-07,
      "loss": 0.0273,
      "num_tokens": 1492673282.0,
      "reward": 0.0546875,
      "reward_std": 0.10139063745737076,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 1540
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.72265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1945.0,
      "completions/mean_length": 1673.94140625,
      "completions/mean_terminated_length": 699.2816772460938,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5260732269352223,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 23.88244549271511,
      "learning_rate": 5.882853808351354e-07,
      "loss": 0.0326,
      "num_tokens": 1493603300.0,
      "reward": 0.0703125,
      "reward_std": 0.12538030743598938,
      "rewards/accuracy_reward/mean": 0.07258064299821854,
      "rewards/accuracy_reward/std": 0.25970885157585144,
      "step": 1541
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.748046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1815.0,
      "completions/mean_length": 1701.55859375,
      "completions/mean_terminated_length": 672.9767456054688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5264146112486131,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 10.203258176141997,
      "learning_rate": 5.877511896289566e-07,
      "loss": 0.02,
      "num_tokens": 1494548610.0,
      "reward": 0.025390625,
      "reward_std": 0.06354551017284393,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 1542
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.806640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2000.0,
      "completions/mean_length": 1795.78515625,
      "completions/mean_terminated_length": 744.0504760742188,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.5267559955620039,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 28.33020389871299,
      "learning_rate": 5.872169448418688e-07,
      "loss": 0.0267,
      "num_tokens": 1495555540.0,
      "reward": 0.05859375,
      "reward_std": 0.09056951105594635,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 1543
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.740234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1801.0,
      "completions/mean_length": 1699.875,
      "completions/mean_terminated_length": 707.8496704101562,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5270973798753947,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 67.1877398251019,
      "learning_rate": 5.866826472321351e-07,
      "loss": 0.0186,
      "num_tokens": 1496504276.0,
      "reward": 0.037109375,
      "reward_std": 0.07768725603818893,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 1544
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.736328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1968.0,
      "completions/mean_length": 1681.78515625,
      "completions/mean_terminated_length": 659.0963134765625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5274387641887855,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 6.846714051394489,
      "learning_rate": 5.861482975580928e-07,
      "loss": 0.0124,
      "num_tokens": 1497450102.0,
      "reward": 0.021484375,
      "reward_std": 0.06425705552101135,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 1545
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.72265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1918.0,
      "completions/mean_length": 1690.87890625,
      "completions/mean_terminated_length": 760.3521118164062,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.5277801485021764,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 7.06535709332671,
      "learning_rate": 5.856138965781538e-07,
      "loss": 0.0249,
      "num_tokens": 1498392120.0,
      "reward": 0.05859375,
      "reward_std": 0.09586012363433838,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.24231401085853577,
      "step": 1546
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.763671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 1720.404296875,
      "completions/mean_terminated_length": 661.8098754882812,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5281215328155672,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 6.72553105609815,
      "learning_rate": 5.850794450508026e-07,
      "loss": 0.0086,
      "num_tokens": 1499353399.0,
      "reward": 0.03125,
      "reward_std": 0.055114150047302246,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 1547
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.77734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1865.0,
      "completions/mean_length": 1741.125,
      "completions/mean_terminated_length": 669.75439453125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5284629171289579,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 8.088981743521808,
      "learning_rate": 5.845449437345952e-07,
      "loss": 0.0142,
      "num_tokens": 1500327399.0,
      "reward": 0.025390625,
      "reward_std": 0.05298367142677307,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 1548
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.779296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1814.0,
      "completions/mean_length": 1718.646484375,
      "completions/mean_terminated_length": 555.7079467773438,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5288043014423487,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 21.943762028457954,
      "learning_rate": 5.840103933881584e-07,
      "loss": 0.0187,
      "num_tokens": 1501284866.0,
      "reward": 0.06640625,
      "reward_std": 0.1165834367275238,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "step": 1549
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.80078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1912.0,
      "completions/mean_length": 1776.9453125,
      "completions/mean_terminated_length": 687.4118041992188,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5291456857557395,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 6.719228970002369,
      "learning_rate": 5.834757947701889e-07,
      "loss": 0.0175,
      "num_tokens": 1502278950.0,
      "reward": 0.04296875,
      "reward_std": 0.051659777760505676,
      "rewards/accuracy_reward/mean": 0.04435483738780022,
      "rewards/accuracy_reward/std": 0.2060900777578354,
      "step": 1550
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.814453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1920.0,
      "completions/mean_length": 1794.490234375,
      "completions/mean_terminated_length": 682.252685546875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5294870700691303,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 12.281057050240783,
      "learning_rate": 5.829411486394516e-07,
      "loss": 0.0119,
      "num_tokens": 1503280385.0,
      "reward": 0.03125,
      "reward_std": 0.05789502337574959,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 1551
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.810546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2004.0,
      "completions/mean_length": 1776.78515625,
      "completions/mean_terminated_length": 616.4329833984375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.5298284543825211,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 7.894476504088849,
      "learning_rate": 5.824064557547785e-07,
      "loss": 0.0144,
      "num_tokens": 1504259875.0,
      "reward": 0.064453125,
      "reward_std": 0.08505964279174805,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 1552
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2038.0,
      "completions/mean_length": 1682.591796875,
      "completions/mean_terminated_length": 586.3671875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5301698386959119,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 6.709904486595488,
      "learning_rate": 5.81871716875069e-07,
      "loss": 0.0142,
      "num_tokens": 1505205138.0,
      "reward": 0.03125,
      "reward_std": 0.06079617142677307,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 1553
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.77734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1938.0,
      "completions/mean_length": 1749.833984375,
      "completions/mean_terminated_length": 708.868408203125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5305112230093028,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 11.595723159705539,
      "learning_rate": 5.813369327592867e-07,
      "loss": 0.0075,
      "num_tokens": 1506182157.0,
      "reward": 0.02734375,
      "reward_std": 0.03779878467321396,
      "rewards/accuracy_reward/mean": 0.02822580561041832,
      "rewards/accuracy_reward/std": 0.1657845675945282,
      "step": 1554
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1790.0,
      "completions/mean_length": 1786.193359375,
      "completions/mean_terminated_length": 651.6979370117188,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.5308526073226936,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 7.7419450565362995,
      "learning_rate": 5.808021041664599e-07,
      "loss": 0.0306,
      "num_tokens": 1507173136.0,
      "reward": 0.03125,
      "reward_std": 0.07300759106874466,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 1555
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.72265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1992.0,
      "completions/mean_length": 1654.302734375,
      "completions/mean_terminated_length": 628.4718017578125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5311939916360843,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 30.72497500795339,
      "learning_rate": 5.802672318556802e-07,
      "loss": 0.0153,
      "num_tokens": 1508093307.0,
      "reward": 0.0703125,
      "reward_std": 0.11740703880786896,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 1556
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.806640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1685.0,
      "completions/mean_length": 1750.765625,
      "completions/mean_terminated_length": 510.7878723144531,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5315353759494751,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 6.90278174268782,
      "learning_rate": 5.797323165861007e-07,
      "loss": 0.0075,
      "num_tokens": 1509070195.0,
      "reward": 0.041015625,
      "reward_std": 0.07729348540306091,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 1557
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.798828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1989.0,
      "completions/mean_length": 1750.853515625,
      "completions/mean_terminated_length": 570.92236328125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5318767602628659,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 18.750295588036277,
      "learning_rate": 5.791973591169359e-07,
      "loss": 0.0242,
      "num_tokens": 1510047112.0,
      "reward": 0.068359375,
      "reward_std": 0.09903506934642792,
      "rewards/accuracy_reward/mean": 0.068359375,
      "rewards/accuracy_reward/std": 0.25260838866233826,
      "step": 1558
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.826171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2025.0,
      "completions/mean_length": 1814.37890625,
      "completions/mean_terminated_length": 704.0224609375,
      "completions/min_length": 7.0,
      "completions/min_terminated_length": 7.0,
      "epoch": 0.5322181445762567,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 4.311757674099231,
      "learning_rate": 5.786623602074602e-07,
      "loss": 0.0366,
      "num_tokens": 1511045946.0,
      "reward": 0.046875,
      "reward_std": 0.09045085310935974,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 1559
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.79296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1903.0,
      "completions/mean_length": 1751.97265625,
      "completions/mean_terminated_length": 618.132080078125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5325595288896475,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 10.408238486982613,
      "learning_rate": 5.781273206170065e-07,
      "loss": 0.0172,
      "num_tokens": 1512025756.0,
      "reward": 0.044921875,
      "reward_std": 0.06118203327059746,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 1560
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.78515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1794.0,
      "completions/mean_length": 1757.056640625,
      "completions/mean_terminated_length": 693.7908935546875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5329009132030383,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 13.732166503274463,
      "learning_rate": 5.775922411049657e-07,
      "loss": -0.0018,
      "num_tokens": 1513014281.0,
      "reward": 0.0234375,
      "reward_std": 0.05754890665411949,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 1561
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.70703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1824.0,
      "completions/mean_length": 1625.11328125,
      "completions/mean_terminated_length": 604.5466918945312,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.5332422975164292,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 9.69746032400128,
      "learning_rate": 5.770571224307855e-07,
      "loss": 0.0396,
      "num_tokens": 1513919123.0,
      "reward": 0.0859375,
      "reward_std": 0.11326298117637634,
      "rewards/accuracy_reward/mean": 0.08870967477560043,
      "rewards/accuracy_reward/std": 0.284611314535141,
      "step": 1562
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.787109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1788.0,
      "completions/mean_length": 1758.107421875,
      "completions/mean_terminated_length": 686.302734375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.53358368182982,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 11.219515853370563,
      "learning_rate": 5.765219653539687e-07,
      "loss": 0.0158,
      "num_tokens": 1514894714.0,
      "reward": 0.021484375,
      "reward_std": 0.06425705552101135,
      "rewards/accuracy_reward/mean": 0.02217741869390011,
      "rewards/accuracy_reward/std": 0.14740893244743347,
      "step": 1563
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.79296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1600.0,
      "completions/mean_length": 1732.203125,
      "completions/mean_terminated_length": 522.6415405273438,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5339250661432107,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 3.81842005871031,
      "learning_rate": 5.759867706340731e-07,
      "loss": 0.0181,
      "num_tokens": 1515865394.0,
      "reward": 0.029296875,
      "reward_std": 0.04726085811853409,
      "rewards/accuracy_reward/mean": 0.030241934582591057,
      "rewards/accuracy_reward/std": 0.1714252382516861,
      "step": 1564
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.837890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1835.0,
      "completions/mean_length": 1809.412109375,
      "completions/mean_terminated_length": 576.2288818359375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5342664504566015,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 11.857555351983311,
      "learning_rate": 5.754515390307095e-07,
      "loss": 0.0131,
      "num_tokens": 1516872469.0,
      "reward": 0.033203125,
      "reward_std": 0.06508206576108932,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 1565
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1838.0,
      "completions/mean_length": 1766.927734375,
      "completions/mean_terminated_length": 548.9479370117188,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5346078347699923,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 6.325629083931258,
      "learning_rate": 5.749162713035415e-07,
      "loss": -0.003,
      "num_tokens": 1517853936.0,
      "reward": 0.0234375,
      "reward_std": 0.04670868441462517,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 1566
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.837890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1959.0,
      "completions/mean_length": 1824.177734375,
      "completions/mean_terminated_length": 667.9879150390625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5349492190833831,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 46.115858137580624,
      "learning_rate": 5.743809682122836e-07,
      "loss": 0.0153,
      "num_tokens": 1518869947.0,
      "reward": 0.03515625,
      "reward_std": 0.08709508180618286,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 1567
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.77734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1920.0,
      "completions/mean_length": 1730.818359375,
      "completions/mean_terminated_length": 623.4649047851562,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5352906033967739,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 22.45428717285173,
      "learning_rate": 5.738456305167007e-07,
      "loss": 0.0208,
      "num_tokens": 1519831118.0,
      "reward": 0.05078125,
      "reward_std": 0.1049705445766449,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 1568
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1829.0,
      "completions/mean_length": 1682.189453125,
      "completions/mean_terminated_length": 561.5317993164062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5356319877101647,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 14.612831349622036,
      "learning_rate": 5.733102589766068e-07,
      "loss": -0.0029,
      "num_tokens": 1520761647.0,
      "reward": 0.0390625,
      "reward_std": 0.07795868813991547,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 1569
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1759.0,
      "completions/mean_length": 1704.345703125,
      "completions/mean_terminated_length": 651.5635375976562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5359733720235555,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 42.354342888795586,
      "learning_rate": 5.727748543518637e-07,
      "loss": 0.0238,
      "num_tokens": 1521711024.0,
      "reward": 0.052734375,
      "reward_std": 0.10834437608718872,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 1570
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.83203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1966.0,
      "completions/mean_length": 1802.828125,
      "completions/mean_terminated_length": 588.3720703125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5363147563369464,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 10.871412453464968,
      "learning_rate": 5.722394174023805e-07,
      "loss": 0.0168,
      "num_tokens": 1522713000.0,
      "reward": 0.052734375,
      "reward_std": 0.10854348540306091,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 1571
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.779296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1958.0,
      "completions/mean_length": 1722.22265625,
      "completions/mean_terminated_length": 571.9114990234375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.5366561406503371,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 98.84162473141893,
      "learning_rate": 5.717039488881118e-07,
      "loss": 0.0079,
      "num_tokens": 1523670970.0,
      "reward": 0.052734375,
      "reward_std": 0.10453139245510101,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 1572
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1799.0,
      "completions/mean_length": 1753.447265625,
      "completions/mean_terminated_length": 597.8942260742188,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.5369975249637279,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 29.69815838452254,
      "learning_rate": 5.711684495690573e-07,
      "loss": 0.0119,
      "num_tokens": 1524638671.0,
      "reward": 0.0390625,
      "reward_std": 0.08263043314218521,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 1573
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.78515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1759.0,
      "completions/mean_length": 1745.953125,
      "completions/mean_terminated_length": 642.1090698242188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5373389092771187,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 84.9726769521578,
      "learning_rate": 5.706329202052605e-07,
      "loss": 0.006,
      "num_tokens": 1525610199.0,
      "reward": 0.03515625,
      "reward_std": 0.07531681656837463,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 1574
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2017.0,
      "completions/mean_length": 1609.865234375,
      "completions/mean_terminated_length": 628.2215576171875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5376802935905095,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 3.304391370281301,
      "learning_rate": 5.700973615568072e-07,
      "loss": 0.0342,
      "num_tokens": 1526509426.0,
      "reward": 0.029296875,
      "reward_std": 0.06587383896112442,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 1575
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.775390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2007.0,
      "completions/mean_length": 1726.68359375,
      "completions/mean_terminated_length": 617.8869018554688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5380216779039003,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 17.33273339545191,
      "learning_rate": 5.695617743838252e-07,
      "loss": 0.0017,
      "num_tokens": 1527475680.0,
      "reward": 0.017578125,
      "reward_std": 0.05452118441462517,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "step": 1576
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.826171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2012.0,
      "completions/mean_length": 1799.552734375,
      "completions/mean_terminated_length": 618.7303466796875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5383630622172911,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 21.48197827741597,
      "learning_rate": 5.690261594464824e-07,
      "loss": 0.0108,
      "num_tokens": 1528467179.0,
      "reward": 0.044921875,
      "reward_std": 0.07729348540306091,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 1577
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.783203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1996.0,
      "completions/mean_length": 1747.06640625,
      "completions/mean_terminated_length": 659.909912109375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.538704446530682,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 39.89398079801477,
      "learning_rate": 5.68490517504986e-07,
      "loss": -0.0026,
      "num_tokens": 1529439005.0,
      "reward": 0.029296875,
      "reward_std": 0.03214829042553902,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 1578
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1781.0,
      "completions/mean_length": 1632.5390625,
      "completions/mean_terminated_length": 648.5526123046875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5390458308440728,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 8.495759615266493,
      "learning_rate": 5.67954849319582e-07,
      "loss": 0.0231,
      "num_tokens": 1530355585.0,
      "reward": 0.029296875,
      "reward_std": 0.05298367142677307,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 1579
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.775390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1877.0,
      "completions/mean_length": 1730.712890625,
      "completions/mean_terminated_length": 635.382568359375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5393872151574635,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 8.934966614858316,
      "learning_rate": 5.674191556505533e-07,
      "loss": 0.0126,
      "num_tokens": 1531329934.0,
      "reward": 0.01953125,
      "reward_std": 0.046875,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 1580
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.791015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1861.0,
      "completions/mean_length": 1763.912109375,
      "completions/mean_terminated_length": 689.168212890625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5397285994708543,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 7.372383976573982,
      "learning_rate": 5.668834372582195e-07,
      "loss": 0.0043,
      "num_tokens": 1532318017.0,
      "reward": 0.0390625,
      "reward_std": 0.06888246536254883,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 1581
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.806640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1944.0,
      "completions/mean_length": 1747.025390625,
      "completions/mean_terminated_length": 492.0303039550781,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5400699837842451,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 27.415553284280957,
      "learning_rate": 5.663476949029342e-07,
      "loss": -0.0069,
      "num_tokens": 1533296318.0,
      "reward": 0.06640625,
      "reward_std": 0.12230625748634338,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "step": 1582
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1562.0,
      "completions/mean_length": 1764.556640625,
      "completions/mean_terminated_length": 596.7699584960938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5404113680976359,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 16.497228299495884,
      "learning_rate": 5.65811929345086e-07,
      "loss": 0.0076,
      "num_tokens": 1534266027.0,
      "reward": 0.02734375,
      "reward_std": 0.04907120391726494,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 1583
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2016.0,
      "completions/mean_length": 1667.283203125,
      "completions/mean_terminated_length": 571.2803344726562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5407527524110267,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 11.331687761976324,
      "learning_rate": 5.652761413450965e-07,
      "loss": 0.0078,
      "num_tokens": 1535194556.0,
      "reward": 0.05078125,
      "reward_std": 0.08345641195774078,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 1584
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.740234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1894.0,
      "completions/mean_length": 1662.025390625,
      "completions/mean_terminated_length": 562.1428833007812,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5410941367244175,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 55.04145002974786,
      "learning_rate": 5.647403316634181e-07,
      "loss": 0.0081,
      "num_tokens": 1536132665.0,
      "reward": 0.0390625,
      "reward_std": 0.06657323986291885,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 1585
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2022.0,
      "completions/mean_length": 1625.16015625,
      "completions/mean_terminated_length": 544.5694580078125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.5414355210378083,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 13.457311670723753,
      "learning_rate": 5.64204501060535e-07,
      "loss": -0.006,
      "num_tokens": 1537043931.0,
      "reward": 0.03515625,
      "reward_std": 0.07537011057138443,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 1586
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.763671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1856.0,
      "completions/mean_length": 1701.34765625,
      "completions/mean_terminated_length": 581.6445922851562,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5417769053511992,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 22.038002968898347,
      "learning_rate": 5.636686502969606e-07,
      "loss": 0.0125,
      "num_tokens": 1537998781.0,
      "reward": 0.068359375,
      "reward_std": 0.10313399136066437,
      "rewards/accuracy_reward/mean": 0.068359375,
      "rewards/accuracy_reward/std": 0.25260838866233826,
      "step": 1587
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1931.0,
      "completions/mean_length": 1700.55859375,
      "completions/mean_terminated_length": 565.5833740234375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5421182896645899,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 16.900784099906076,
      "learning_rate": 5.631327801332373e-07,
      "loss": 0.0103,
      "num_tokens": 1538949067.0,
      "reward": 0.04296875,
      "reward_std": 0.09150753915309906,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 1588
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1580.884765625,
      "completions/mean_terminated_length": 553.2312622070312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5424596739779807,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 0.8248263849383559,
      "learning_rate": 5.625968913299344e-07,
      "loss": 0.0309,
      "num_tokens": 1539833936.0,
      "reward": 0.056640625,
      "reward_std": 0.05519992858171463,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 1589
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.783203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1995.0,
      "completions/mean_length": 1768.27734375,
      "completions/mean_terminated_length": 757.7477416992188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5428010582913715,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 9.589589780614224,
      "learning_rate": 5.620609846476486e-07,
      "loss": 0.0122,
      "num_tokens": 1540814574.0,
      "reward": 0.078125,
      "reward_std": 0.09951501339673996,
      "rewards/accuracy_reward/mean": 0.078125,
      "rewards/accuracy_reward/std": 0.26863065361976624,
      "step": 1590
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1952.0,
      "completions/mean_length": 1703.974609375,
      "completions/mean_terminated_length": 627.508056640625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.5431424426047623,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 11.670179818111938,
      "learning_rate": 5.615250608470009e-07,
      "loss": 0.0096,
      "num_tokens": 1541764561.0,
      "reward": 0.041015625,
      "reward_std": 0.07586899399757385,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 1591
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.67578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2001.0,
      "completions/mean_length": 1579.947265625,
      "completions/mean_terminated_length": 604.367431640625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5434838269181531,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 56.247460021244855,
      "learning_rate": 5.609891206886373e-07,
      "loss": 0.0217,
      "num_tokens": 1542656710.0,
      "reward": 0.02734375,
      "reward_std": 0.05864075943827629,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 1592
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.681640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1923.0,
      "completions/mean_length": 1609.7421875,
      "completions/mean_terminated_length": 671.386474609375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5438252112315439,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 40.629658863616285,
      "learning_rate": 5.604531649332267e-07,
      "loss": 0.0226,
      "num_tokens": 1543558114.0,
      "reward": 0.0703125,
      "reward_std": 0.0969996377825737,
      "rewards/accuracy_reward/mean": 0.07258064299821854,
      "rewards/accuracy_reward/std": 0.25970885157585144,
      "step": 1593
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.759765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1919.0,
      "completions/mean_length": 1722.44140625,
      "completions/mean_terminated_length": 693.3251953125,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.5441665955449347,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 2.9854454469198055,
      "learning_rate": 5.599171943414605e-07,
      "loss": 0.0049,
      "num_tokens": 1544520500.0,
      "reward": 0.029296875,
      "reward_std": 0.03878315910696983,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 1594
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.837890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1909.0,
      "completions/mean_length": 1819.775390625,
      "completions/mean_terminated_length": 640.1566162109375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5445079798583256,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 17.56288186385348,
      "learning_rate": 5.593812096740507e-07,
      "loss": 0.0126,
      "num_tokens": 1545532625.0,
      "reward": 0.02734375,
      "reward_std": 0.06116959825158119,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 1595
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.794921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1695.0,
      "completions/mean_length": 1748.48828125,
      "completions/mean_terminated_length": 587.5238037109375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5448493641717163,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 2.1949074880247674,
      "learning_rate": 5.588452116917299e-07,
      "loss": 0.0339,
      "num_tokens": 1546513803.0,
      "reward": 0.044921875,
      "reward_std": 0.07581022381782532,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 1596
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.791015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1783.0,
      "completions/mean_length": 1742.146484375,
      "completions/mean_terminated_length": 584.4766235351562,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5451907484851071,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 19.177997886752237,
      "learning_rate": 5.583092011552487e-07,
      "loss": 0.0193,
      "num_tokens": 1547483542.0,
      "reward": 0.05859375,
      "reward_std": 0.08912044763565063,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 1597
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.837890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1959.0,
      "completions/mean_length": 1815.357421875,
      "completions/mean_terminated_length": 612.903564453125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5455321327984979,
      "frac_reward_zero_std": 0.96875,
      "grad_norm": 10.206085340401652,
      "learning_rate": 5.577731788253762e-07,
      "loss": -0.001,
      "num_tokens": 1548494941.0,
      "reward": 0.001953125,
      "reward_std": 0.0078125,
      "rewards/accuracy_reward/mean": 0.001953125,
      "rewards/accuracy_reward/std": 0.04419417306780815,
      "step": 1598
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.826171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2014.0,
      "completions/mean_length": 1805.357421875,
      "completions/mean_terminated_length": 652.1235961914062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5458735171118887,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 59.47086710723055,
      "learning_rate": 5.572371454628981e-07,
      "loss": 0.0299,
      "num_tokens": 1549495204.0,
      "reward": 0.052734375,
      "reward_std": 0.09903506934642792,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 1599
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.77734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1608.0,
      "completions/mean_length": 1730.212890625,
      "completions/mean_terminated_length": 620.74560546875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5462149014252795,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 24.80823724552991,
      "learning_rate": 5.567011018286159e-07,
      "loss": -0.004,
      "num_tokens": 1550452097.0,
      "reward": 0.03125,
      "reward_std": 0.06469620764255524,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 1600
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.78515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2002.0,
      "completions/mean_length": 1767.408203125,
      "completions/mean_terminated_length": 741.9727172851562,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.5465562857386703,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 0.29984998692511533,
      "learning_rate": 5.56165048683345e-07,
      "loss": 0.0038,
      "num_tokens": 1551434146.0,
      "reward": 0.00390625,
      "reward_std": 0.015625,
      "rewards/accuracy_reward/mean": 0.004032257944345474,
      "rewards/accuracy_reward/std": 0.06343589723110199,
      "step": 1601
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.814453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1761.0,
      "completions/mean_length": 1779.69921875,
      "completions/mean_terminated_length": 602.0,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5468976700520611,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 48.14006881714492,
      "learning_rate": 5.556289867879155e-07,
      "loss": -0.0051,
      "num_tokens": 1552420440.0,
      "reward": 0.0234375,
      "reward_std": 0.06068411096930504,
      "rewards/accuracy_reward/mean": 0.024193547666072845,
      "rewards/accuracy_reward/std": 0.15380479395389557,
      "step": 1602
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.771484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2018.0,
      "completions/mean_length": 1731.076171875,
      "completions/mean_terminated_length": 661.1196899414062,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.547239054365452,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 109.44794937858974,
      "learning_rate": 5.550929169031685e-07,
      "loss": 0.0179,
      "num_tokens": 1553385519.0,
      "reward": 0.0703125,
      "reward_std": 0.10007822513580322,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 1603
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1879.0,
      "completions/mean_length": 1781.2890625,
      "completions/mean_terminated_length": 625.5416870117188,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.5475804386788428,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 46.90944229688171,
      "learning_rate": 5.545568397899575e-07,
      "loss": -0.0068,
      "num_tokens": 1554380051.0,
      "reward": 0.044921875,
      "reward_std": 0.05452118441462517,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 1604
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.82421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1713.0,
      "completions/mean_length": 1799.666015625,
      "completions/mean_terminated_length": 635.2555541992188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5479218229922335,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 3.121954078803187,
      "learning_rate": 5.540207562091459e-07,
      "loss": 0.0218,
      "num_tokens": 1555373320.0,
      "reward": 0.01953125,
      "reward_std": 0.06233368441462517,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 1605
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.787109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1905.0,
      "completions/mean_length": 1731.94140625,
      "completions/mean_terminated_length": 563.3944702148438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5482632073056243,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 27.86973018059758,
      "learning_rate": 5.534846669216062e-07,
      "loss": 0.0098,
      "num_tokens": 1556330762.0,
      "reward": 0.0703125,
      "reward_std": 0.08698301017284393,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 1606
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.783203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1961.0,
      "completions/mean_length": 1770.404296875,
      "completions/mean_terminated_length": 767.55859375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5486045916190151,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 15.462838239081623,
      "learning_rate": 5.529485726882193e-07,
      "loss": 0.0112,
      "num_tokens": 1557307769.0,
      "reward": 0.015625,
      "reward_std": 0.040274329483509064,
      "rewards/accuracy_reward/mean": 0.015625,
      "rewards/accuracy_reward/std": 0.12414088100194931,
      "step": 1607
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.685546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2033.0,
      "completions/mean_length": 1595.517578125,
      "completions/mean_terminated_length": 609.0496826171875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.5489459759324059,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 43.23379096921182,
      "learning_rate": 5.524124742698728e-07,
      "loss": 0.0377,
      "num_tokens": 1558197474.0,
      "reward": 0.072265625,
      "reward_std": 0.08215849101543427,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 1608
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.837890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1863.0,
      "completions/mean_length": 1834.18359375,
      "completions/mean_terminated_length": 729.6867065429688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5492873602457967,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 16.26356002323211,
      "learning_rate": 5.518763724274602e-07,
      "loss": 0.0135,
      "num_tokens": 1559214480.0,
      "reward": 0.01953125,
      "reward_std": 0.05001020431518555,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 1609
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.77734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2004.0,
      "completions/mean_length": 1747.224609375,
      "completions/mean_terminated_length": 697.1491088867188,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.5496287445591875,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 5.884596128847715,
      "learning_rate": 5.513402679218801e-07,
      "loss": 0.0099,
      "num_tokens": 1560183795.0,
      "reward": 0.01171875,
      "reward_std": 0.03697281330823898,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "step": 1610
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.791015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1989.0,
      "completions/mean_length": 1764.470703125,
      "completions/mean_terminated_length": 691.299072265625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.5499701288725783,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 75.08828403766782,
      "learning_rate": 5.50804161514035e-07,
      "loss": 0.0036,
      "num_tokens": 1561163604.0,
      "reward": 0.017578125,
      "reward_std": 0.03411140665411949,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "step": 1611
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1945.0,
      "completions/mean_length": 1718.046875,
      "completions/mean_terminated_length": 685.6128540039062,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5503115131859692,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 37.928757065416605,
      "learning_rate": 5.502680539648296e-07,
      "loss": 0.008,
      "num_tokens": 1562116748.0,
      "reward": 0.056640625,
      "reward_std": 0.09930095076560974,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 1612
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1874.0,
      "completions/mean_length": 1721.806640625,
      "completions/mean_terminated_length": 632.6525268554688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5506528974993599,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 10.95754696095407,
      "learning_rate": 5.497319460351706e-07,
      "loss": 0.0315,
      "num_tokens": 1563072985.0,
      "reward": 0.021484375,
      "reward_std": 0.05782270431518555,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 1613
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.70703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 1664.958984375,
      "completions/mean_terminated_length": 740.5533447265625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5509942818127507,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 60.86552428937514,
      "learning_rate": 5.491958384859652e-07,
      "loss": 0.0105,
      "num_tokens": 1564001844.0,
      "reward": 0.0625,
      "reward_std": 0.10628747940063477,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 1614
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.748046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1708.05078125,
      "completions/mean_terminated_length": 698.7442016601562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5513356661261415,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 8.801982869561556,
      "learning_rate": 5.486597320781199e-07,
      "loss": 0.015,
      "num_tokens": 1564950782.0,
      "reward": 0.01953125,
      "reward_std": 0.04412011057138443,
      "rewards/accuracy_reward/mean": 0.02016128972172737,
      "rewards/accuracy_reward/std": 0.14069372415542603,
      "step": 1615
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.685546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1839.0,
      "completions/mean_length": 1617.49609375,
      "completions/mean_terminated_length": 678.944091796875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5516770504395323,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 16.429400105017464,
      "learning_rate": 5.481236275725398e-07,
      "loss": 0.0102,
      "num_tokens": 1565853468.0,
      "reward": 0.056640625,
      "reward_std": 0.07780590653419495,
      "rewards/accuracy_reward/mean": 0.058467742055654526,
      "rewards/accuracy_reward/std": 0.23486268520355225,
      "step": 1616
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1771.0,
      "completions/mean_length": 1607.841796875,
      "completions/mean_terminated_length": 584.6168823242188,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.5520184347529231,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 14.172403768891147,
      "learning_rate": 5.475875257301274e-07,
      "loss": 0.0107,
      "num_tokens": 1566756187.0,
      "reward": 0.02734375,
      "reward_std": 0.07284127175807953,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 1617
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1896.0,
      "completions/mean_length": 1700.220703125,
      "completions/mean_terminated_length": 738.7132568359375,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.5523598190663139,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 15.030460260998021,
      "learning_rate": 5.470514273117807e-07,
      "loss": 0.0153,
      "num_tokens": 1567707676.0,
      "reward": 0.015625,
      "reward_std": 0.047646719962358475,
      "rewards/accuracy_reward/mean": 0.015625,
      "rewards/accuracy_reward/std": 0.12414088100194931,
      "step": 1618
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.712890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1654.228515625,
      "completions/mean_terminated_length": 676.49658203125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5527012033797047,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 26.41145164261852,
      "learning_rate": 5.465153330783939e-07,
      "loss": 0.0205,
      "num_tokens": 1568627313.0,
      "reward": 0.068359375,
      "reward_std": 0.08864613622426987,
      "rewards/accuracy_reward/mean": 0.068359375,
      "rewards/accuracy_reward/std": 0.25260838866233826,
      "step": 1619
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1748.0703125,
      "completions/mean_terminated_length": 626.1111450195312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5530425876930956,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 15.904342611748271,
      "learning_rate": 5.459792437908542e-07,
      "loss": 0.0003,
      "num_tokens": 1569596293.0,
      "reward": 0.056640625,
      "reward_std": 0.0891871303319931,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 1620
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.74609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1816.0,
      "completions/mean_length": 1706.689453125,
      "completions/mean_terminated_length": 703.7615356445312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5533839720064863,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 9.920335216339595,
      "learning_rate": 5.454431602100425e-07,
      "loss": 0.0223,
      "num_tokens": 1570543638.0,
      "reward": 0.0390625,
      "reward_std": 0.09005707502365112,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 1621
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.72265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1970.0,
      "completions/mean_length": 1643.3203125,
      "completions/mean_terminated_length": 588.8732299804688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5537253563198771,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 21.087527788184534,
      "learning_rate": 5.449070830968316e-07,
      "loss": 0.0275,
      "num_tokens": 1571461706.0,
      "reward": 0.041015625,
      "reward_std": 0.069866843521595,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 1622
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.751953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1726.0,
      "completions/mean_length": 1709.193359375,
      "completions/mean_terminated_length": 682.1023559570312,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.5540667406332679,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 18.13806863823379,
      "learning_rate": 5.443710132120846e-07,
      "loss": 0.0134,
      "num_tokens": 1572413085.0,
      "reward": 0.03125,
      "reward_std": 0.05754890665411949,
      "rewards/accuracy_reward/mean": 0.032258063554763794,
      "rewards/accuracy_reward/std": 0.17686307430267334,
      "step": 1623
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1857.0,
      "completions/mean_length": 1656.2734375,
      "completions/mean_terminated_length": 692.8378295898438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5544081249466587,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 28.706270164835576,
      "learning_rate": 5.438349513166549e-07,
      "loss": -0.0039,
      "num_tokens": 1573333689.0,
      "reward": 0.021484375,
      "reward_std": 0.04230976849794388,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 1624
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.693359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1954.0,
      "completions/mean_length": 1643.267578125,
      "completions/mean_terminated_length": 728.1082763671875,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.5547495092600495,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 33.42744635694024,
      "learning_rate": 5.432988981713842e-07,
      "loss": 0.0383,
      "num_tokens": 1574246178.0,
      "reward": 0.087890625,
      "reward_std": 0.1285536289215088,
      "rewards/accuracy_reward/mean": 0.087890625,
      "rewards/accuracy_reward/std": 0.2834126651287079,
      "step": 1625
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1712.0,
      "completions/mean_length": 1713.986328125,
      "completions/mean_terminated_length": 622.8750610351562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5550908935734403,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 37.37660652247251,
      "learning_rate": 5.42762854537102e-07,
      "loss": 0.0296,
      "num_tokens": 1575198587.0,
      "reward": 0.048828125,
      "reward_std": 0.11779527366161346,
      "rewards/accuracy_reward/mean": 0.05040322616696358,
      "rewards/accuracy_reward/std": 0.21899642050266266,
      "step": 1626
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.677734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1621.921875,
      "completions/mean_terminated_length": 726.13330078125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5554322778868311,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 61.239165904831594,
      "learning_rate": 5.422268211746239e-07,
      "loss": 0.0057,
      "num_tokens": 1576111059.0,
      "reward": 0.056640625,
      "reward_std": 0.07447256147861481,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 1627
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.685546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 1593.0859375,
      "completions/mean_terminated_length": 601.3167724609375,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.555773662200222,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 18.74271606092731,
      "learning_rate": 5.416907988447514e-07,
      "loss": 0.0039,
      "num_tokens": 1577012991.0,
      "reward": 0.0703125,
      "reward_std": 0.1190669909119606,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 1628
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.70703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1977.0,
      "completions/mean_length": 1613.962890625,
      "completions/mean_terminated_length": 566.4866943359375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5561150465136127,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 36.865221079727164,
      "learning_rate": 5.411547883082701e-07,
      "loss": 0.0189,
      "num_tokens": 1577913356.0,
      "reward": 0.056640625,
      "reward_std": 0.10854585468769073,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 1629
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1834.0,
      "completions/mean_length": 1653.994140625,
      "completions/mean_terminated_length": 647.0902709960938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5564564308270035,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 7.796400320146339,
      "learning_rate": 5.406187903259491e-07,
      "loss": 0.01,
      "num_tokens": 1578831897.0,
      "reward": 0.046875,
      "reward_std": 0.07053204625844955,
      "rewards/accuracy_reward/mean": 0.04838709533214569,
      "rewards/accuracy_reward/std": 0.21479946374893188,
      "step": 1630
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.708984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1964.0,
      "completions/mean_length": 1661.341796875,
      "completions/mean_terminated_length": 719.3489990234375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5567978151403943,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 7.989295862922379,
      "learning_rate": 5.400828056585394e-07,
      "loss": 0.0047,
      "num_tokens": 1579770696.0,
      "reward": 0.013671875,
      "reward_std": 0.04478531330823898,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "step": 1631
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.728515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1953.0,
      "completions/mean_length": 1693.296875,
      "completions/mean_terminated_length": 741.4676513671875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5571391994537851,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 75.67640237032624,
      "learning_rate": 5.395468350667732e-07,
      "loss": -0.0002,
      "num_tokens": 1580717920.0,
      "reward": 0.025390625,
      "reward_std": 0.05335709825158119,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 1632
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1867.0,
      "completions/mean_length": 1624.634765625,
      "completions/mean_terminated_length": 658.4935913085938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5574805837671759,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 40.33398414718872,
      "learning_rate": 5.390108793113628e-07,
      "loss": -0.0025,
      "num_tokens": 1581620837.0,
      "reward": 0.044921875,
      "reward_std": 0.08537977933883667,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 1633
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.720703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1912.0,
      "completions/mean_length": 1661.6796875,
      "completions/mean_terminated_length": 664.8111572265625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5578219680805667,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 23.260771731586303,
      "learning_rate": 5.384749391529991e-07,
      "loss": 0.0143,
      "num_tokens": 1582551809.0,
      "reward": 0.103515625,
      "reward_std": 0.1405133157968521,
      "rewards/accuracy_reward/mean": 0.103515625,
      "rewards/accuracy_reward/std": 0.30492907762527466,
      "step": 1634
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1941.0,
      "completions/mean_length": 1664.76171875,
      "completions/mean_terminated_length": 685.375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.5581633523939575,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 10.739587039328736,
      "learning_rate": 5.379390153523515e-07,
      "loss": 0.0126,
      "num_tokens": 1583481159.0,
      "reward": 0.041015625,
      "reward_std": 0.0879673957824707,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 1635
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2003.0,
      "completions/mean_length": 1729.625,
      "completions/mean_terminated_length": 754.698486328125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.5585047367073483,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 30.268747247832582,
      "learning_rate": 5.374031086700654e-07,
      "loss": 0.0046,
      "num_tokens": 1584448023.0,
      "reward": 0.037109375,
      "reward_std": 0.07575597614049911,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 1636
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1967.0,
      "completions/mean_length": 1612.603515625,
      "completions/mean_terminated_length": 736.941162109375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.558846121020739,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 2.6913769660511266,
      "learning_rate": 5.368672198667627e-07,
      "loss": 0.0044,
      "num_tokens": 1585360428.0,
      "reward": 0.025390625,
      "reward_std": 0.04230976849794388,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 1637
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1939.0,
      "completions/mean_length": 1655.478515625,
      "completions/mean_terminated_length": 759.724365234375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.5591875053341299,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 1.6864334362397269,
      "learning_rate": 5.363313497030395e-07,
      "loss": 0.0054,
      "num_tokens": 1586289313.0,
      "reward": 0.03515625,
      "reward_std": 0.04505911096930504,
      "rewards/accuracy_reward/mean": 0.03629032149910927,
      "rewards/accuracy_reward/std": 0.1872003972530365,
      "step": 1638
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.650390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1947.0,
      "completions/mean_length": 1543.3359375,
      "completions/mean_terminated_length": 604.4915771484375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5595288896475207,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 28.75318906139253,
      "learning_rate": 5.357954989394651e-07,
      "loss": -0.0048,
      "num_tokens": 1587148029.0,
      "reward": 0.064453125,
      "reward_std": 0.08648413419723511,
      "rewards/accuracy_reward/mean": 0.06653226166963577,
      "rewards/accuracy_reward/std": 0.24946178495883942,
      "step": 1639
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.654296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1583.01953125,
      "completions/mean_terminated_length": 702.9717407226562,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5598702739609115,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 5.467112168998574,
      "learning_rate": 5.35259668336582e-07,
      "loss": 0.009,
      "num_tokens": 1588032471.0,
      "reward": 0.037109375,
      "reward_std": 0.07763300836086273,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 1640
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1958.0,
      "completions/mean_length": 1599.435546875,
      "completions/mean_terminated_length": 681.2738037109375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.5602116582743023,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 18.105598438490166,
      "learning_rate": 5.347238586549036e-07,
      "loss": 0.0256,
      "num_tokens": 1588930006.0,
      "reward": 0.083984375,
      "reward_std": 0.12064876407384872,
      "rewards/accuracy_reward/mean": 0.083984375,
      "rewards/accuracy_reward/std": 0.2776356339454651,
      "step": 1641
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2004.0,
      "completions/mean_length": 1687.611328125,
      "completions/mean_terminated_length": 606.4453125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.5605530425876931,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 10.322341414725255,
      "learning_rate": 5.341880706549138e-07,
      "loss": 0.043,
      "num_tokens": 1589871519.0,
      "reward": 0.03515625,
      "reward_std": 0.06822281330823898,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 1642
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1877.0,
      "completions/mean_length": 1605.12890625,
      "completions/mean_terminated_length": 556.2236938476562,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5608944269010839,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 58.135449586735845,
      "learning_rate": 5.336523050970657e-07,
      "loss": 0.0217,
      "num_tokens": 1590773553.0,
      "reward": 0.052734375,
      "reward_std": 0.10057821124792099,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 1643
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2021.0,
      "completions/mean_length": 1556.451171875,
      "completions/mean_terminated_length": 618.039794921875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5612358112144747,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 4.517399172075493,
      "learning_rate": 5.331165627417807e-07,
      "loss": 0.0098,
      "num_tokens": 1591653128.0,
      "reward": 0.021484375,
      "reward_std": 0.0546875,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 1644
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.689453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2031.0,
      "completions/mean_length": 1622.578125,
      "completions/mean_terminated_length": 678.0880126953125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5615771955278654,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 29.488694737350862,
      "learning_rate": 5.325808443494467e-07,
      "loss": 0.007,
      "num_tokens": 1592568304.0,
      "reward": 0.1015625,
      "reward_std": 0.12225200235843658,
      "rewards/accuracy_reward/mean": 0.1015625,
      "rewards/accuracy_reward/std": 0.30236753821372986,
      "step": 1645
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.673828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1937.0,
      "completions/mean_length": 1551.8671875,
      "completions/mean_terminated_length": 526.9221801757812,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5619185798412563,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 19.09939734373746,
      "learning_rate": 5.32045150680418e-07,
      "loss": 0.0195,
      "num_tokens": 1593433500.0,
      "reward": 0.037109375,
      "reward_std": 0.07003316283226013,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 1646
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.583984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1985.0,
      "completions/mean_length": 1536.521484375,
      "completions/mean_terminated_length": 818.530517578125,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.5622599641546471,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 23.450097969172788,
      "learning_rate": 5.31509482495014e-07,
      "loss": 0.0199,
      "num_tokens": 1594292695.0,
      "reward": 0.08203125,
      "reward_std": 0.1348828822374344,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 1647
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.630859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1539.884765625,
      "completions/mean_terminated_length": 671.74072265625,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.5626013484680379,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 18.61825255663601,
      "learning_rate": 5.309738405535177e-07,
      "loss": 0.0293,
      "num_tokens": 1595159292.0,
      "reward": 0.10546875,
      "reward_std": 0.12310365587472916,
      "rewards/accuracy_reward/mean": 0.10546875,
      "rewards/accuracy_reward/std": 0.3074568510055542,
      "step": 1648
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.705078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1826.0,
      "completions/mean_length": 1637.900390625,
      "completions/mean_terminated_length": 657.4635620117188,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5629427327814287,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 24.422153094050394,
      "learning_rate": 5.304382256161746e-07,
      "loss": 0.0262,
      "num_tokens": 1596071241.0,
      "reward": 0.0703125,
      "reward_std": 0.10351984947919846,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 1649
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.708984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1963.0,
      "completions/mean_length": 1639.8203125,
      "completions/mean_terminated_length": 645.39599609375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5632841170948195,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 15.547314072867204,
      "learning_rate": 5.299026384431929e-07,
      "loss": 0.0296,
      "num_tokens": 1596984077.0,
      "reward": 0.07421875,
      "reward_std": 0.08482664078474045,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "step": 1650
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1979.0,
      "completions/mean_length": 1463.548828125,
      "completions/mean_terminated_length": 636.4953002929688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5636255014082103,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 32.958100487220385,
      "learning_rate": 5.293670797947396e-07,
      "loss": 0.0314,
      "num_tokens": 1597803286.0,
      "reward": 0.076171875,
      "reward_std": 0.12246465682983398,
      "rewards/accuracy_reward/mean": 0.076171875,
      "rewards/accuracy_reward/std": 0.26553234457969666,
      "step": 1651
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.740234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2011.0,
      "completions/mean_length": 1692.24609375,
      "completions/mean_terminated_length": 678.481201171875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5639668857216011,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 46.8267009918828,
      "learning_rate": 5.288315504309429e-07,
      "loss": -0.0072,
      "num_tokens": 1598750116.0,
      "reward": 0.029296875,
      "reward_std": 0.07768725603818893,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 1652
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.701171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1647.994140625,
      "completions/mean_terminated_length": 709.4183349609375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5643082700349918,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 36.00313758741006,
      "learning_rate": 5.282960511118882e-07,
      "loss": 0.0127,
      "num_tokens": 1599670721.0,
      "reward": 0.041015625,
      "reward_std": 0.0814797431230545,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 1653
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.64453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1911.0,
      "completions/mean_length": 1527.19921875,
      "completions/mean_terminated_length": 582.89013671875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.5646496543483827,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 51.006675241895266,
      "learning_rate": 5.277605825976195e-07,
      "loss": 0.0115,
      "num_tokens": 1600531223.0,
      "reward": 0.03515625,
      "reward_std": 0.06672705709934235,
      "rewards/accuracy_reward/mean": 0.03629032149910927,
      "rewards/accuracy_reward/std": 0.1872003823518753,
      "step": 1654
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1898.0,
      "completions/mean_length": 1736.5,
      "completions/mean_terminated_length": 782.2222900390625,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.5649910386617735,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 110.9152983650968,
      "learning_rate": 5.272251456481363e-07,
      "loss": -0.0005,
      "num_tokens": 1601493335.0,
      "reward": 0.037109375,
      "reward_std": 0.07575597614049911,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 1655
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1900.0,
      "completions/mean_length": 1440.62109375,
      "completions/mean_terminated_length": 684.0614013671875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5653324229751643,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 4.6806848587855985,
      "learning_rate": 5.266897410233934e-07,
      "loss": 0.0266,
      "num_tokens": 1602306357.0,
      "reward": 0.029296875,
      "reward_std": 0.0723423957824707,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 1656
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1852.0,
      "completions/mean_length": 1685.63671875,
      "completions/mean_terminated_length": 684.1102905273438,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5656738072885551,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 79.23697229736925,
      "learning_rate": 5.261543694832994e-07,
      "loss": 0.0149,
      "num_tokens": 1603242123.0,
      "reward": 0.04296875,
      "reward_std": 0.08655644953250885,
      "rewards/accuracy_reward/mean": 0.04435483738780022,
      "rewards/accuracy_reward/std": 0.2060900777578354,
      "step": 1657
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.744140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1984.0,
      "completions/mean_length": 1705.181640625,
      "completions/mean_terminated_length": 708.1297607421875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5660151916019459,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 22.73665048768005,
      "learning_rate": 5.256190317877164e-07,
      "loss": 0.0114,
      "num_tokens": 1604197016.0,
      "reward": 0.044921875,
      "reward_std": 0.0862836092710495,
      "rewards/accuracy_reward/mean": 0.0463709682226181,
      "rewards/accuracy_reward/std": 0.21049949526786804,
      "step": 1658
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.705078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1893.0,
      "completions/mean_length": 1647.029296875,
      "completions/mean_terminated_length": 688.417236328125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5663565759153367,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 10.532580196132601,
      "learning_rate": 5.250837286964585e-07,
      "loss": 0.0146,
      "num_tokens": 1605120727.0,
      "reward": 0.0234375,
      "reward_std": 0.062167368829250336,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 1659
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.619140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1860.0,
      "completions/mean_length": 1530.748046875,
      "completions/mean_terminated_length": 689.882080078125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5666979602287275,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 6.574757140261458,
      "learning_rate": 5.245484609692906e-07,
      "loss": 0.0037,
      "num_tokens": 1605977958.0,
      "reward": 0.015625,
      "reward_std": 0.03125,
      "rewards/accuracy_reward/mean": 0.015625,
      "rewards/accuracy_reward/std": 0.12414088100194931,
      "step": 1660
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1622.96875,
      "completions/mean_terminated_length": 634.9091186523438,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.5670393445421182,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 42.688717085118896,
      "learning_rate": 5.240132293659268e-07,
      "loss": 0.0357,
      "num_tokens": 1606890838.0,
      "reward": 0.10546875,
      "reward_std": 0.13172858953475952,
      "rewards/accuracy_reward/mean": 0.10546875,
      "rewards/accuracy_reward/std": 0.3074568510055542,
      "step": 1661
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1916.0,
      "completions/mean_length": 1550.03515625,
      "completions/mean_terminated_length": 662.3587036132812,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.567380728855509,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 8.403924701857363,
      "learning_rate": 5.234780346460314e-07,
      "loss": 0.0094,
      "num_tokens": 1607759256.0,
      "reward": 0.052734375,
      "reward_std": 0.069866843521595,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 1662
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.705078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1834.0,
      "completions/mean_length": 1643.822265625,
      "completions/mean_terminated_length": 677.5430297851562,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5677221131688999,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 53.38768495225109,
      "learning_rate": 5.229428775692146e-07,
      "loss": 0.025,
      "num_tokens": 1608682717.0,
      "reward": 0.052734375,
      "reward_std": 0.07003316283226013,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 1663
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.689453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1861.0,
      "completions/mean_length": 1600.130859375,
      "completions/mean_terminated_length": 606.0880126953125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5680634974822907,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 45.39491460847951,
      "learning_rate": 5.224077588950342e-07,
      "loss": 0.0422,
      "num_tokens": 1609588800.0,
      "reward": 0.078125,
      "reward_std": 0.12353022396564484,
      "rewards/accuracy_reward/mean": 0.078125,
      "rewards/accuracy_reward/std": 0.26863065361976624,
      "step": 1664
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.607421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 1524.009765625,
      "completions/mean_terminated_length": 713.2586669921875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5684048817956815,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 195.70172751033917,
      "learning_rate": 5.218726793829936e-07,
      "loss": 0.0108,
      "num_tokens": 1610435173.0,
      "reward": 0.10546875,
      "reward_std": 0.13682594895362854,
      "rewards/accuracy_reward/mean": 0.10546875,
      "rewards/accuracy_reward/std": 0.3074568510055542,
      "step": 1665
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.599609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1998.0,
      "completions/mean_length": 1507.548828125,
      "completions/mean_terminated_length": 698.1902465820312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5687462661090723,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 0.935269412343542,
      "learning_rate": 5.213376397925399e-07,
      "loss": 0.0265,
      "num_tokens": 1611281678.0,
      "reward": 0.03125,
      "reward_std": 0.07708083093166351,
      "rewards/accuracy_reward/mean": 0.032258063554763794,
      "rewards/accuracy_reward/std": 0.17686307430267334,
      "step": 1666
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.642578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2015.0,
      "completions/mean_length": 1558.541015625,
      "completions/mean_terminated_length": 678.5846557617188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5690876504224631,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 48.18749015750313,
      "learning_rate": 5.208026408830641e-07,
      "loss": 0.0147,
      "num_tokens": 1612154035.0,
      "reward": 0.05859375,
      "reward_std": 0.10915680229663849,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 1667
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1838.0,
      "completions/mean_length": 1548.416015625,
      "completions/mean_terminated_length": 594.8806762695312,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5694290347358539,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 33.075900760294196,
      "learning_rate": 5.202676834138993e-07,
      "loss": 0.0007,
      "num_tokens": 1613033112.0,
      "reward": 0.03125,
      "reward_std": 0.05695698410272598,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 1668
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.630859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1868.0,
      "completions/mean_length": 1565.953125,
      "completions/mean_terminated_length": 742.1375732421875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5697704190492446,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 43.26088067331689,
      "learning_rate": 5.1973276814432e-07,
      "loss": 0.0195,
      "num_tokens": 1613906848.0,
      "reward": 0.091796875,
      "reward_std": 0.12340601533651352,
      "rewards/accuracy_reward/mean": 0.091796875,
      "rewards/accuracy_reward/std": 0.289021372795105,
      "step": 1669
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.630859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1548.76953125,
      "completions/mean_terminated_length": 695.5872802734375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5701118033626354,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 5.572669498956679,
      "learning_rate": 5.191978958335402e-07,
      "loss": 0.0227,
      "num_tokens": 1614779066.0,
      "reward": 0.08203125,
      "reward_std": 0.11268904060125351,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 1670
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.673828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1835.0,
      "completions/mean_length": 1599.775390625,
      "completions/mean_terminated_length": 673.8024291992188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5704531876760263,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 19.083995537021174,
      "learning_rate": 5.186630672407133e-07,
      "loss": 0.0186,
      "num_tokens": 1615675767.0,
      "reward": 0.064453125,
      "reward_std": 0.12303133308887482,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 1671
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.62890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1872.0,
      "completions/mean_length": 1549.361328125,
      "completions/mean_terminated_length": 704.300048828125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5707945719894171,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 12.362695622488507,
      "learning_rate": 5.181282831249311e-07,
      "loss": 0.0005,
      "num_tokens": 1616549216.0,
      "reward": 0.029296875,
      "reward_std": 0.06519509106874466,
      "rewards/accuracy_reward/mean": 0.030241934582591057,
      "rewards/accuracy_reward/std": 0.1714252382516861,
      "step": 1672
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.705078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1774.0,
      "completions/mean_length": 1638.623046875,
      "completions/mean_terminated_length": 659.9138793945312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5711359563028079,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 27.431925772868905,
      "learning_rate": 5.175935442452213e-07,
      "loss": -0.0097,
      "num_tokens": 1617471103.0,
      "reward": 0.056640625,
      "reward_std": 0.11036626994609833,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 1673
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1973.0,
      "completions/mean_length": 1580.474609375,
      "completions/mean_terminated_length": 747.059814453125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.5714773406161987,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 16.107035256130246,
      "learning_rate": 5.170588513605485e-07,
      "loss": 0.027,
      "num_tokens": 1618355714.0,
      "reward": 0.04296875,
      "reward_std": 0.09930886328220367,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 1674
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1983.0,
      "completions/mean_length": 1576.763671875,
      "completions/mean_terminated_length": 692.53369140625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5718187249295895,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 21.902487619837963,
      "learning_rate": 5.165242052298112e-07,
      "loss": 0.046,
      "num_tokens": 1619242505.0,
      "reward": 0.091796875,
      "reward_std": 0.12451037019491196,
      "rewards/accuracy_reward/mean": 0.09791667014360428,
      "rewards/accuracy_reward/std": 0.2975119948387146,
      "step": 1675
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2013.0,
      "completions/mean_length": 1532.61328125,
      "completions/mean_terminated_length": 701.9387817382812,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.5721601092429803,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 29.906182728225684,
      "learning_rate": 5.159896066118417e-07,
      "loss": -0.0075,
      "num_tokens": 1620108595.0,
      "reward": 0.044921875,
      "reward_std": 0.064970001578331,
      "rewards/accuracy_reward/mean": 0.0463709682226181,
      "rewards/accuracy_reward/std": 0.21049949526786804,
      "step": 1676
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.70703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1696.0,
      "completions/mean_length": 1648.66015625,
      "completions/mean_terminated_length": 684.9200439453125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.572501493556371,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 2.319766164938501,
      "learning_rate": 5.15455056265405e-07,
      "loss": 0.0226,
      "num_tokens": 1621027445.0,
      "reward": 0.05078125,
      "reward_std": 0.07289456576108932,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 1677
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.638671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1977.0,
      "completions/mean_length": 1557.611328125,
      "completions/mean_terminated_length": 690.8162231445312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5728428778697618,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 7.976963634444491,
      "learning_rate": 5.149205549491975e-07,
      "loss": 0.0261,
      "num_tokens": 1621899198.0,
      "reward": 0.03515625,
      "reward_std": 0.08219823986291885,
      "rewards/accuracy_reward/mean": 0.03629032149910927,
      "rewards/accuracy_reward/std": 0.1872003823518753,
      "step": 1678
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.732421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 1702.068359375,
      "completions/mean_terminated_length": 755.934326171875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.5731842621831527,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 6.006209757059818,
      "learning_rate": 5.143861034218462e-07,
      "loss": 0.0027,
      "num_tokens": 1622855745.0,
      "reward": 0.01953125,
      "reward_std": 0.036034777760505676,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 1679
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1996.0,
      "completions/mean_length": 1531.52734375,
      "completions/mean_terminated_length": 670.7396240234375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5735256464965435,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 20.07107750501197,
      "learning_rate": 5.138517024419071e-07,
      "loss": 0.0318,
      "num_tokens": 1623714319.0,
      "reward": 0.048828125,
      "reward_std": 0.10194281488656998,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 1680
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1992.0,
      "completions/mean_length": 1528.04296875,
      "completions/mean_terminated_length": 601.1630249023438,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5738670308099343,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 14.94345538810307,
      "learning_rate": 5.13317352767865e-07,
      "loss": 0.0168,
      "num_tokens": 1624580997.0,
      "reward": 0.044921875,
      "reward_std": 0.07302113622426987,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 1681
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.685546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2013.0,
      "completions/mean_length": 1615.00390625,
      "completions/mean_terminated_length": 671.0186157226562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5742084151233251,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 12.158901872846986,
      "learning_rate": 5.127830551581311e-07,
      "loss": 0.031,
      "num_tokens": 1625482647.0,
      "reward": 0.0625,
      "reward_std": 0.09671792387962341,
      "rewards/accuracy_reward/mean": 0.06451612710952759,
      "rewards/accuracy_reward/std": 0.2459181249141693,
      "step": 1682
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.732421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2024.0,
      "completions/mean_length": 1698.96875,
      "completions/mean_terminated_length": 743.9780883789062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5745497994367159,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 37.86543485284138,
      "learning_rate": 5.122488103710435e-07,
      "loss": 0.0128,
      "num_tokens": 1626428183.0,
      "reward": 0.03125,
      "reward_std": 0.07069835811853409,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 1683
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.783203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1989.0,
      "completions/mean_length": 1747.787109375,
      "completions/mean_terminated_length": 663.2342529296875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5748911837501067,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 40.71383223966807,
      "learning_rate": 5.117146191648647e-07,
      "loss": 0.0347,
      "num_tokens": 1627394986.0,
      "reward": 0.087890625,
      "reward_std": 0.1312296986579895,
      "rewards/accuracy_reward/mean": 0.087890625,
      "rewards/accuracy_reward/std": 0.2834126651287079,
      "step": 1684
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.677734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2013.0,
      "completions/mean_length": 1567.5625,
      "completions/mean_terminated_length": 557.1878662109375,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.5752325680634974,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 20.638246866731627,
      "learning_rate": 5.111804822977814e-07,
      "loss": 0.0017,
      "num_tokens": 1628271290.0,
      "reward": 0.0703125,
      "reward_std": 0.08044223487377167,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 1685
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.67578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1848.0,
      "completions/mean_length": 1568.5859375,
      "completions/mean_terminated_length": 569.3252563476562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5755739523768882,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 39.55417597841334,
      "learning_rate": 5.106464005279034e-07,
      "loss": 0.0184,
      "num_tokens": 1629150246.0,
      "reward": 0.068359375,
      "reward_std": 0.09126891195774078,
      "rewards/accuracy_reward/mean": 0.068359375,
      "rewards/accuracy_reward/std": 0.25260838866233826,
      "step": 1686
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.720703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2031.0,
      "completions/mean_length": 1648.943359375,
      "completions/mean_terminated_length": 619.2097778320312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5759153366902791,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 10.018973187369816,
      "learning_rate": 5.101123746132622e-07,
      "loss": -0.0046,
      "num_tokens": 1630073945.0,
      "reward": 0.03125,
      "reward_std": 0.05641176179051399,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 1687
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.775390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1950.0,
      "completions/mean_length": 1745.142578125,
      "completions/mean_terminated_length": 700.5477905273438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5762567210036699,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 86.75565488956244,
      "learning_rate": 5.095784053118094e-07,
      "loss": 0.0092,
      "num_tokens": 1631060002.0,
      "reward": 0.02734375,
      "reward_std": 0.06233368441462517,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 1688
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.732421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2001.0,
      "completions/mean_length": 1671.2421875,
      "completions/mean_terminated_length": 639.9708251953125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5765981053170607,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 15.014602420932516,
      "learning_rate": 5.090444933814171e-07,
      "loss": 0.026,
      "num_tokens": 1631997006.0,
      "reward": 0.0546875,
      "reward_std": 0.11135159432888031,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 1689
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1848.0,
      "completions/mean_length": 1688.90234375,
      "completions/mean_terminated_length": 588.8095703125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5769394896304515,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 24.722525212805873,
      "learning_rate": 5.085106395798756e-07,
      "loss": -0.0004,
      "num_tokens": 1632934188.0,
      "reward": 0.044921875,
      "reward_std": 0.052045635879039764,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 1690
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.74609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1694.869140625,
      "completions/mean_terminated_length": 657.2077026367188,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5772808739438423,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 43.27838630705788,
      "learning_rate": 5.079768446648926e-07,
      "loss": 0.0032,
      "num_tokens": 1633877993.0,
      "reward": 0.052734375,
      "reward_std": 0.0742793083190918,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 1691
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.767578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1890.0,
      "completions/mean_length": 1743.4296875,
      "completions/mean_terminated_length": 737.5798950195312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5776222582572331,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 15.678326575986901,
      "learning_rate": 5.074431093940927e-07,
      "loss": 0.0424,
      "num_tokens": 1634841285.0,
      "reward": 0.05078125,
      "reward_std": 0.102440744638443,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 1692
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1876.0,
      "completions/mean_length": 1717.419921875,
      "completions/mean_terminated_length": 588.887939453125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5779636425706238,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 14.946153532651053,
      "learning_rate": 5.069094345250152e-07,
      "loss": 0.0217,
      "num_tokens": 1635793964.0,
      "reward": 0.072265625,
      "reward_std": 0.0868302434682846,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 1693
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1992.0,
      "completions/mean_length": 1741.654296875,
      "completions/mean_terminated_length": 762.3524169921875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.5783050268840146,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 21.58139157783209,
      "learning_rate": 5.063758208151139e-07,
      "loss": 0.0269,
      "num_tokens": 1636770795.0,
      "reward": 0.048828125,
      "reward_std": 0.08082009106874466,
      "rewards/accuracy_reward/mean": 0.05040322616696358,
      "rewards/accuracy_reward/std": 0.21899642050266266,
      "step": 1694
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.751953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2036.0,
      "completions/mean_length": 1736.56640625,
      "completions/mean_terminated_length": 792.4566650390625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5786464111974055,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 18.95436493028647,
      "learning_rate": 5.058422690217559e-07,
      "loss": 0.0135,
      "num_tokens": 1637738013.0,
      "reward": 0.015625,
      "reward_std": 0.04670868441462517,
      "rewards/accuracy_reward/mean": 0.015625,
      "rewards/accuracy_reward/std": 0.12414088100194931,
      "step": 1695
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.767578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1966.0,
      "completions/mean_length": 1716.841796875,
      "completions/mean_terminated_length": 623.1848754882812,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5789877955107963,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 68.46485994060792,
      "learning_rate": 5.053087799022207e-07,
      "loss": 0.014,
      "num_tokens": 1638687692.0,
      "reward": 0.029296875,
      "reward_std": 0.0659080371260643,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 1696
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1889.0,
      "completions/mean_length": 1778.5,
      "completions/mean_terminated_length": 610.6666870117188,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.5793291798241871,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 39.97449024934502,
      "learning_rate": 5.047753542136981e-07,
      "loss": 0.0034,
      "num_tokens": 1639682988.0,
      "reward": 0.021484375,
      "reward_std": 0.04973640665411949,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 1697
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.791015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1831.0,
      "completions/mean_length": 1752.87890625,
      "completions/mean_terminated_length": 636.3644409179688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5796705641375779,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 31.590705883442613,
      "learning_rate": 5.042419927132886e-07,
      "loss": 0.0192,
      "num_tokens": 1640660126.0,
      "reward": 0.041015625,
      "reward_std": 0.1038198471069336,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 1698
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.669921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1782.0,
      "completions/mean_length": 1576.466796875,
      "completions/mean_terminated_length": 619.44970703125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5800119484509687,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 11.647857467325819,
      "learning_rate": 5.037086961580012e-07,
      "loss": 0.0588,
      "num_tokens": 1641535565.0,
      "reward": 0.0625,
      "reward_std": 0.1287938952445984,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 1699
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1991.0,
      "completions/mean_length": 1681.18359375,
      "completions/mean_terminated_length": 625.1969604492188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5803533327643595,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 23.58696955256426,
      "learning_rate": 5.031754653047528e-07,
      "loss": 0.0083,
      "num_tokens": 1642484939.0,
      "reward": 0.060546875,
      "reward_std": 0.10182979702949524,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 1700
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.736328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2027.0,
      "completions/mean_length": 1675.81640625,
      "completions/mean_terminated_length": 636.459228515625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5806947170777502,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 53.179475515775344,
      "learning_rate": 5.02642300910367e-07,
      "loss": 0.0187,
      "num_tokens": 1643418653.0,
      "reward": 0.03515625,
      "reward_std": 0.06856893002986908,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 1701
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.728515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2020.0,
      "completions/mean_length": 1684.12109375,
      "completions/mean_terminated_length": 707.6690673828125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.581036101391141,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 16.73481878408291,
      "learning_rate": 5.021092037315733e-07,
      "loss": -0.0108,
      "num_tokens": 1644351307.0,
      "reward": 0.04296875,
      "reward_std": 0.05738259106874466,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 1702
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1896.0,
      "completions/mean_length": 1707.24609375,
      "completions/mean_terminated_length": 726.2879028320312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5813774857045318,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 47.33199262944589,
      "learning_rate": 5.015761745250055e-07,
      "loss": 0.0014,
      "num_tokens": 1645301897.0,
      "reward": 0.064453125,
      "reward_std": 0.08130794763565063,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 1703
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.712890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1947.0,
      "completions/mean_length": 1648.5,
      "completions/mean_terminated_length": 656.544189453125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5817188700179227,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 13.356393611068537,
      "learning_rate": 5.01043214047201e-07,
      "loss": 0.0135,
      "num_tokens": 1646218857.0,
      "reward": 0.046875,
      "reward_std": 0.06546888500452042,
      "rewards/accuracy_reward/mean": 0.04838709533214569,
      "rewards/accuracy_reward/std": 0.21479946374893188,
      "step": 1704
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.748046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1996.0,
      "completions/mean_length": 1691.810546875,
      "completions/mean_terminated_length": 634.2868041992188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5820602543313135,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 26.975248150539183,
      "learning_rate": 5.005103230546e-07,
      "loss": 0.031,
      "num_tokens": 1647160648.0,
      "reward": 0.083984375,
      "reward_std": 0.13190844655036926,
      "rewards/accuracy_reward/mean": 0.083984375,
      "rewards/accuracy_reward/std": 0.2776356339454651,
      "step": 1705
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.744140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1900.0,
      "completions/mean_length": 1677.244140625,
      "completions/mean_terminated_length": 598.9389038085938,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.5824016386447043,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 41.0284858154736,
      "learning_rate": 4.999775023035438e-07,
      "loss": -0.0001,
      "num_tokens": 1648099109.0,
      "reward": 0.052734375,
      "reward_std": 0.04219770431518555,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 1706
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.685546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1992.0,
      "completions/mean_length": 1606.44921875,
      "completions/mean_terminated_length": 643.8136596679688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5827430229580951,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 65.84761392871681,
      "learning_rate": 4.994447525502735e-07,
      "loss": 0.0129,
      "num_tokens": 1649000811.0,
      "reward": 0.08203125,
      "reward_std": 0.11043406277894974,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 1707
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.626953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2020.0,
      "completions/mean_length": 1534.900390625,
      "completions/mean_terminated_length": 672.5706787109375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5830844072714859,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 80.67921401924828,
      "learning_rate": 4.989120745509305e-07,
      "loss": 0.0219,
      "num_tokens": 1649868904.0,
      "reward": 0.09375,
      "reward_std": 0.13682594895362854,
      "rewards/accuracy_reward/mean": 0.09375,
      "rewards/accuracy_reward/std": 0.29176566004753113,
      "step": 1708
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2025.0,
      "completions/mean_length": 1647.4609375,
      "completions/mean_terminated_length": 583.1714477539062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5834257915848767,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 15.803462917662138,
      "learning_rate": 4.983794690615535e-07,
      "loss": 0.0041,
      "num_tokens": 1650795796.0,
      "reward": 0.033203125,
      "reward_std": 0.07250870764255524,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 1709
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2002.0,
      "completions/mean_length": 1745.796875,
      "completions/mean_terminated_length": 714.137939453125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5837671758982674,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 25.44127424171362,
      "learning_rate": 4.978469368380787e-07,
      "loss": 0.0102,
      "num_tokens": 1651766396.0,
      "reward": 0.04296875,
      "reward_std": 0.09028454124927521,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 1710
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1621.0,
      "completions/mean_length": 1511.326171875,
      "completions/mean_terminated_length": 554.8912963867188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5841085602116582,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 38.64146044353377,
      "learning_rate": 4.97314478636338e-07,
      "loss": 0.0161,
      "num_tokens": 1652617187.0,
      "reward": 0.03515625,
      "reward_std": 0.06888246536254883,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 1711
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2011.0,
      "completions/mean_length": 1688.845703125,
      "completions/mean_terminated_length": 771.0069580078125,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.5844499445250491,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 49.88085028182138,
      "learning_rate": 4.967820952120588e-07,
      "loss": 0.0268,
      "num_tokens": 1653552484.0,
      "reward": 0.0703125,
      "reward_std": 0.10414530336856842,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 1712
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1895.0,
      "completions/mean_length": 1721.369140625,
      "completions/mean_terminated_length": 677.6392822265625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5847913288384399,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 73.99758334615439,
      "learning_rate": 4.962497873208616e-07,
      "loss": 0.023,
      "num_tokens": 1654518129.0,
      "reward": 0.0625,
      "reward_std": 0.12329617142677307,
      "rewards/accuracy_reward/mean": 0.06451612710952759,
      "rewards/accuracy_reward/std": 0.2459181249141693,
      "step": 1713
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.615234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1736.0,
      "completions/mean_length": 1485.966796875,
      "completions/mean_terminated_length": 587.2842407226562,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5851327131518307,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 20.456552504913056,
      "learning_rate": 4.957175557182601e-07,
      "loss": 0.0491,
      "num_tokens": 1655357456.0,
      "reward": 0.123046875,
      "reward_std": 0.15058745443820953,
      "rewards/accuracy_reward/mean": 0.123046875,
      "rewards/accuracy_reward/std": 0.32881227135658264,
      "step": 1714
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.755859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1678.0,
      "completions/mean_length": 1692.564453125,
      "completions/mean_terminated_length": 592.1360473632812,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5854740974652215,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 5.477576678118939,
      "learning_rate": 4.9518540115966e-07,
      "loss": 0.0007,
      "num_tokens": 1656296433.0,
      "reward": 0.005859375,
      "reward_std": 0.01848640665411949,
      "rewards/accuracy_reward/mean": 0.005859375,
      "rewards/accuracy_reward/std": 0.07639661431312561,
      "step": 1715
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.697265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1949.0,
      "completions/mean_length": 1645.04296875,
      "completions/mean_terminated_length": 716.94189453125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.5858154817786123,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 25.021323043434897,
      "learning_rate": 4.946533244003572e-07,
      "loss": 0.0115,
      "num_tokens": 1657214279.0,
      "reward": 0.0703125,
      "reward_std": 0.08271066844463348,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 1716
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.681640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1968.0,
      "completions/mean_length": 1609.017578125,
      "completions/mean_terminated_length": 669.1104125976562,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.5861568660920031,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 17.027668599567413,
      "learning_rate": 4.941213261955374e-07,
      "loss": 0.0325,
      "num_tokens": 1658116064.0,
      "reward": 0.08203125,
      "reward_std": 0.1186666265130043,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 1717
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2017.0,
      "completions/mean_length": 1703.955078125,
      "completions/mean_terminated_length": 604.1392822265625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5864982504053938,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 30.20596131003759,
      "learning_rate": 4.935894073002749e-07,
      "loss": 0.0103,
      "num_tokens": 1659063033.0,
      "reward": 0.078125,
      "reward_std": 0.09319131076335907,
      "rewards/accuracy_reward/mean": 0.078125,
      "rewards/accuracy_reward/std": 0.26863065361976624,
      "step": 1718
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.623046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1997.0,
      "completions/mean_length": 1495.46875,
      "completions/mean_terminated_length": 582.2175903320312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5868396347187846,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 15.271407438282715,
      "learning_rate": 4.930575684695309e-07,
      "loss": 0.0098,
      "num_tokens": 1659903209.0,
      "reward": 0.072265625,
      "reward_std": 0.09106838703155518,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 1719
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.681640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1986.0,
      "completions/mean_length": 1569.900390625,
      "completions/mean_terminated_length": 546.2392578125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5871810190321755,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 29.110865906675496,
      "learning_rate": 4.925258104581534e-07,
      "loss": 0.023,
      "num_tokens": 1660779878.0,
      "reward": 0.05859375,
      "reward_std": 0.08450747281312943,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 1720
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1849.0,
      "completions/mean_length": 1576.158203125,
      "completions/mean_terminated_length": 626.9235229492188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5875224033455663,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 67.27003016349549,
      "learning_rate": 4.91994134020876e-07,
      "loss": -0.0038,
      "num_tokens": 1661659063.0,
      "reward": 0.10546875,
      "reward_std": 0.08188469707965851,
      "rewards/accuracy_reward/mean": 0.1088709682226181,
      "rewards/accuracy_reward/std": 0.31179171800613403,
      "step": 1721
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.712890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1661.4609375,
      "completions/mean_terminated_length": 702.0679931640625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5878637876589571,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 72.42316134213137,
      "learning_rate": 4.914625399123159e-07,
      "loss": 0.014,
      "num_tokens": 1662588931.0,
      "reward": 0.064453125,
      "reward_std": 0.08864850550889969,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 1722
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.80078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1925.0,
      "completions/mean_length": 1798.005859375,
      "completions/mean_terminated_length": 793.1275024414062,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.5882051719723479,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 97.4499616600289,
      "learning_rate": 4.909310288869737e-07,
      "loss": 0.0019,
      "num_tokens": 1663596950.0,
      "reward": 0.0703125,
      "reward_std": 0.10893170535564423,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 1723
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.677734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2024.0,
      "completions/mean_length": 1603.669921875,
      "completions/mean_terminated_length": 669.8120727539062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5885465562857387,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 5.696893799314998,
      "learning_rate": 4.903996016992323e-07,
      "loss": 0.0034,
      "num_tokens": 1664501405.0,
      "reward": 0.01171875,
      "reward_std": 0.04192390665411949,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "step": 1724
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2021.0,
      "completions/mean_length": 1629.69921875,
      "completions/mean_terminated_length": 675.1154174804688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5888879405991295,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 19.573690330793777,
      "learning_rate": 4.898682591033551e-07,
      "loss": 0.0066,
      "num_tokens": 1665406355.0,
      "reward": 0.017578125,
      "reward_std": 0.04125870764255524,
      "rewards/accuracy_reward/mean": 0.018145160749554634,
      "rewards/accuracy_reward/std": 0.1336110383272171,
      "step": 1725
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2003.0,
      "completions/mean_length": 1646.193359375,
      "completions/mean_terminated_length": 657.9662475585938,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5892293249125202,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 21.092068912595174,
      "learning_rate": 4.893370018534858e-07,
      "loss": 0.0162,
      "num_tokens": 1666326678.0,
      "reward": 0.08984375,
      "reward_std": 0.13041163980960846,
      "rewards/accuracy_reward/mean": 0.0927419364452362,
      "rewards/accuracy_reward/std": 0.2903633117675781,
      "step": 1726
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2005.0,
      "completions/mean_length": 1659.6484375,
      "completions/mean_terminated_length": 686.1095581054688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.589570709225911,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 6.006908596141031,
      "learning_rate": 4.888058307036468e-07,
      "loss": 0.0086,
      "num_tokens": 1667256066.0,
      "reward": 0.03125,
      "reward_std": 0.0677972063422203,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 1727
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.63671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1750.0,
      "completions/mean_length": 1550.193359375,
      "completions/mean_terminated_length": 677.6935424804688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5899120935393019,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 19.056446426509225,
      "learning_rate": 4.882747464077388e-07,
      "loss": 0.013,
      "num_tokens": 1668118901.0,
      "reward": 0.06640625,
      "reward_std": 0.09739763289690018,
      "rewards/accuracy_reward/mean": 0.06854838877916336,
      "rewards/accuracy_reward/std": 0.25293970108032227,
      "step": 1728
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2008.0,
      "completions/mean_length": 1719.982421875,
      "completions/mean_terminated_length": 694.0564575195312,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.5902534778526927,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 19.029504921133565,
      "learning_rate": 4.877437497195385e-07,
      "loss": 0.0122,
      "num_tokens": 1669088524.0,
      "reward": 0.0703125,
      "reward_std": 0.0944955125451088,
      "rewards/accuracy_reward/mean": 0.07258064299821854,
      "rewards/accuracy_reward/std": 0.25970885157585144,
      "step": 1729
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.67578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1899.0,
      "completions/mean_length": 1620.0546875,
      "completions/mean_terminated_length": 728.072265625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5905948621660835,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 14.60317749699828,
      "learning_rate": 4.872128413926989e-07,
      "loss": 0.0284,
      "num_tokens": 1669996504.0,
      "reward": 0.048828125,
      "reward_std": 0.0681830644607544,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 1730
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1981.0,
      "completions/mean_length": 1702.923828125,
      "completions/mean_terminated_length": 709.5227661132812,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.5909362464794743,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 17.68389753572875,
      "learning_rate": 4.866820221807467e-07,
      "loss": 0.0227,
      "num_tokens": 1670944145.0,
      "reward": 0.044921875,
      "reward_std": 0.07603531330823898,
      "rewards/accuracy_reward/mean": 0.0463709682226181,
      "rewards/accuracy_reward/std": 0.21049949526786804,
      "step": 1731
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2014.0,
      "completions/mean_length": 1599.501953125,
      "completions/mean_terminated_length": 772.272216796875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5912776307928651,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 40.621538157366196,
      "learning_rate": 4.861512928370831e-07,
      "loss": 0.0153,
      "num_tokens": 1671842850.0,
      "reward": 0.087890625,
      "reward_std": 0.1111922413110733,
      "rewards/accuracy_reward/mean": 0.087890625,
      "rewards/accuracy_reward/std": 0.2834126651287079,
      "step": 1732
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.689453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1794.0,
      "completions/mean_length": 1594.494140625,
      "completions/mean_terminated_length": 587.654052734375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5916190151062559,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 23.679976144048887,
      "learning_rate": 4.856206541149812e-07,
      "loss": 0.0267,
      "num_tokens": 1672734639.0,
      "reward": 0.046875,
      "reward_std": 0.07509076595306396,
      "rewards/accuracy_reward/mean": 0.04838709533214569,
      "rewards/accuracy_reward/std": 0.2147994488477707,
      "step": 1733
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.669921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1960.0,
      "completions/mean_length": 1581.107421875,
      "completions/mean_terminated_length": 633.5089111328125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.5919603994196466,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 11.622606752148648,
      "learning_rate": 4.850901067675855e-07,
      "loss": -0.0064,
      "num_tokens": 1673623270.0,
      "reward": 0.06640625,
      "reward_std": 0.07477159798145294,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "step": 1734
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.662109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1891.0,
      "completions/mean_length": 1578.59765625,
      "completions/mean_terminated_length": 658.7861328125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5923017837330374,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 14.530176875759707,
      "learning_rate": 4.845596515479113e-07,
      "loss": 0.0081,
      "num_tokens": 1674502760.0,
      "reward": 0.013671875,
      "reward_std": 0.04478531330823898,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "step": 1735
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2007.0,
      "completions/mean_length": 1658.45703125,
      "completions/mean_terminated_length": 785.68359375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5926431680464282,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 13.508141020196877,
      "learning_rate": 4.840292892088423e-07,
      "loss": -0.0045,
      "num_tokens": 1675425826.0,
      "reward": 0.04296875,
      "reward_std": 0.09671792387962341,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 1736
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.623046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1734.0,
      "completions/mean_length": 1516.37109375,
      "completions/mean_terminated_length": 637.6683959960938,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5929845523598191,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 12.035458062105782,
      "learning_rate": 4.834990205031314e-07,
      "loss": 0.035,
      "num_tokens": 1676272624.0,
      "reward": 0.0625,
      "reward_std": 0.12199270725250244,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 1737
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.611328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1995.0,
      "completions/mean_length": 1506.310546875,
      "completions/mean_terminated_length": 654.3065185546875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5933259366732099,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 1.3646619089911172,
      "learning_rate": 4.829688461833975e-07,
      "loss": 0.014,
      "num_tokens": 1677118623.0,
      "reward": 0.0390625,
      "reward_std": 0.08752824366092682,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 1738
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.728515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2031.0,
      "completions/mean_length": 1686.89453125,
      "completions/mean_terminated_length": 717.8848876953125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5936673209866007,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 23.749728402062377,
      "learning_rate": 4.824387670021263e-07,
      "loss": 0.001,
      "num_tokens": 1678056921.0,
      "reward": 0.0546875,
      "reward_std": 0.11308281868696213,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 1739
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.646484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1932.0,
      "completions/mean_length": 1551.169921875,
      "completions/mean_terminated_length": 642.6022338867188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5940087052999915,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 60.574503881013854,
      "learning_rate": 4.819087837116682e-07,
      "loss": 0.0181,
      "num_tokens": 1678929120.0,
      "reward": 0.087890625,
      "reward_std": 0.11090600490570068,
      "rewards/accuracy_reward/mean": 0.09072580933570862,
      "rewards/accuracy_reward/std": 0.2875087857246399,
      "step": 1740
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.591796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2012.0,
      "completions/mean_length": 1486.51953125,
      "completions/mean_terminated_length": 672.5071411132812,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5943500896133823,
      "frac_reward_zero_std": 0.53125,
      "grad_norm": 23.30272542789789,
      "learning_rate": 4.813788970642373e-07,
      "loss": 0.0205,
      "num_tokens": 1679767066.0,
      "reward": 0.10546875,
      "reward_std": 0.16966770589351654,
      "rewards/accuracy_reward/mean": 0.10546875,
      "rewards/accuracy_reward/std": 0.3074568510055542,
      "step": 1741
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.68359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1865.0,
      "completions/mean_length": 1629.72265625,
      "completions/mean_terminated_length": 726.0370483398438,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.594691473926773,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 30.568699189168832,
      "learning_rate": 4.808491078119114e-07,
      "loss": 0.0068,
      "num_tokens": 1680680572.0,
      "reward": 0.029296875,
      "reward_std": 0.0723423957824707,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 1742
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.623046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1930.0,
      "completions/mean_length": 1504.654296875,
      "completions/mean_terminated_length": 606.58544921875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5950328582401638,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 27.284667207552566,
      "learning_rate": 4.803194167066292e-07,
      "loss": 0.0157,
      "num_tokens": 1681517835.0,
      "reward": 0.12109375,
      "reward_std": 0.14497211575508118,
      "rewards/accuracy_reward/mean": 0.12109375,
      "rewards/accuracy_reward/std": 0.3265552520751953,
      "step": 1743
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.63671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2025.0,
      "completions/mean_length": 1529.216796875,
      "completions/mean_terminated_length": 620.2150268554688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5953742425535546,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 51.263658397355904,
      "learning_rate": 4.7978982450019e-07,
      "loss": 0.0112,
      "num_tokens": 1682377770.0,
      "reward": 0.01953125,
      "reward_std": 0.046875,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 1744
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.638671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2031.0,
      "completions/mean_length": 1558.05078125,
      "completions/mean_terminated_length": 692.3135375976562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5957156268669455,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 39.81239885443345,
      "learning_rate": 4.792603319442533e-07,
      "loss": 0.0048,
      "num_tokens": 1683255380.0,
      "reward": 0.064453125,
      "reward_std": 0.11833953857421875,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 1745
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1983.0,
      "completions/mean_length": 1676.74609375,
      "completions/mean_terminated_length": 797.4605102539062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5960570111803363,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 2.8214386899226493,
      "learning_rate": 4.78730939790337e-07,
      "loss": 0.0092,
      "num_tokens": 1684189986.0,
      "reward": 0.044921875,
      "reward_std": 0.0628596618771553,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 1746
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1997.0,
      "completions/mean_length": 1638.396484375,
      "completions/mean_terminated_length": 686.2012939453125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5963983954937271,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 16.821216602725464,
      "learning_rate": 4.782016487898163e-07,
      "loss": 0.0244,
      "num_tokens": 1685108109.0,
      "reward": 0.068359375,
      "reward_std": 0.11218452453613281,
      "rewards/accuracy_reward/mean": 0.07056451588869095,
      "rewards/accuracy_reward/std": 0.25635457038879395,
      "step": 1747
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.693359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1851.0,
      "completions/mean_length": 1623.1171875,
      "completions/mean_terminated_length": 662.3948974609375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5967397798071179,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 25.722893442582166,
      "learning_rate": 4.77672459693923e-07,
      "loss": 0.0186,
      "num_tokens": 1686015097.0,
      "reward": 0.06640625,
      "reward_std": 0.10475093126296997,
      "rewards/accuracy_reward/mean": 0.06854838877916336,
      "rewards/accuracy_reward/std": 0.25293970108032227,
      "step": 1748
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.662109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1950.0,
      "completions/mean_length": 1577.7109375,
      "completions/mean_terminated_length": 656.161865234375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5970811641205087,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 16.360535070185918,
      "learning_rate": 4.771433732537446e-07,
      "loss": 0.0521,
      "num_tokens": 1686896021.0,
      "reward": 0.03125,
      "reward_std": 0.09193411469459534,
      "rewards/accuracy_reward/mean": 0.032258063554763794,
      "rewards/accuracy_reward/std": 0.17686307430267334,
      "step": 1749
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2028.0,
      "completions/mean_length": 1702.796875,
      "completions/mean_terminated_length": 667.1875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5974225484338994,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 19.35386551251597,
      "learning_rate": 4.7661439022022186e-07,
      "loss": 0.0212,
      "num_tokens": 1687840749.0,
      "reward": 0.0390625,
      "reward_std": 0.09005707502365112,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 1750
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.666015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1956.0,
      "completions/mean_length": 1565.927734375,
      "completions/mean_terminated_length": 604.6023559570312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5977639327472902,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 6.463149345825972,
      "learning_rate": 4.7608551134415e-07,
      "loss": 0.0169,
      "num_tokens": 1688715112.0,
      "reward": 0.08203125,
      "reward_std": 0.11454009264707565,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 1751
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1983.0,
      "completions/mean_length": 1681.71484375,
      "completions/mean_terminated_length": 669.0441284179688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.598105317060681,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 16.468646235492148,
      "learning_rate": 4.755567373761755e-07,
      "loss": 0.0345,
      "num_tokens": 1689647686.0,
      "reward": 0.103515625,
      "reward_std": 0.11740150302648544,
      "rewards/accuracy_reward/mean": 0.103515625,
      "rewards/accuracy_reward/std": 0.30492907762527466,
      "step": 1752
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.716796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1951.0,
      "completions/mean_length": 1646.951171875,
      "completions/mean_terminated_length": 632.2138061523438,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.5984467013740719,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 1.5852383603687683,
      "learning_rate": 4.750280690667965e-07,
      "loss": 0.0039,
      "num_tokens": 1690572013.0,
      "reward": 0.01953125,
      "reward_std": 0.03449726849794388,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 1753
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.662109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1997.0,
      "completions/mean_length": 1584.78515625,
      "completions/mean_terminated_length": 677.0982666015625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5987880856874627,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 90.1284583937729,
      "learning_rate": 4.744995071663609e-07,
      "loss": 0.0294,
      "num_tokens": 1691454831.0,
      "reward": 0.09765625,
      "reward_std": 0.13793784379959106,
      "rewards/accuracy_reward/mean": 0.09765625,
      "rewards/accuracy_reward/std": 0.29713961482048035,
      "step": 1754
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.736328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1800.0,
      "completions/mean_length": 1675.76953125,
      "completions/mean_terminated_length": 636.6592407226562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.5991294700008535,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 19.892408285078744,
      "learning_rate": 4.7397105242506576e-07,
      "loss": 0.0681,
      "num_tokens": 1692395657.0,
      "reward": 0.12890625,
      "reward_std": 0.1824507713317871,
      "rewards/accuracy_reward/mean": 0.12890625,
      "rewards/accuracy_reward/std": 0.33542385697364807,
      "step": 1755
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.697265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 1622.421875,
      "completions/mean_terminated_length": 642.2193603515625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5994708543142443,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 28.438715682904697,
      "learning_rate": 4.734427055929556e-07,
      "loss": -0.0038,
      "num_tokens": 1693307121.0,
      "reward": 0.052734375,
      "reward_std": 0.11756782233715057,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 1756
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.744140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2020.0,
      "completions/mean_length": 1700.12109375,
      "completions/mean_terminated_length": 688.3511352539062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.5998122386276351,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 34.76343818835714,
      "learning_rate": 4.729144674199225e-07,
      "loss": -0.011,
      "num_tokens": 1694261855.0,
      "reward": 0.06640625,
      "reward_std": 0.077679343521595,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "step": 1757
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.669921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1977.0,
      "completions/mean_length": 1578.748046875,
      "completions/mean_terminated_length": 626.3609619140625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6001536229410258,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 90.20808201620162,
      "learning_rate": 4.723863386557037e-07,
      "loss": 0.0228,
      "num_tokens": 1695148302.0,
      "reward": 0.041015625,
      "reward_std": 0.10743913054466248,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 1758
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.736328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1857.0,
      "completions/mean_length": 1675.423828125,
      "completions/mean_terminated_length": 634.9703369140625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6004950072544166,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 16.024118039907094,
      "learning_rate": 4.7185832004988133e-07,
      "loss": 0.0116,
      "num_tokens": 1696075559.0,
      "reward": 0.060546875,
      "reward_std": 0.06733252108097076,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 1759
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.693359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2008.0,
      "completions/mean_length": 1607.8671875,
      "completions/mean_terminated_length": 612.6624145507812,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6008363915678074,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 23.10944354813371,
      "learning_rate": 4.713304123518814e-07,
      "loss": 0.0247,
      "num_tokens": 1696979219.0,
      "reward": 0.083984375,
      "reward_std": 0.13180816173553467,
      "rewards/accuracy_reward/mean": 0.083984375,
      "rewards/accuracy_reward/std": 0.2776356339454651,
      "step": 1760
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.60546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1885.0,
      "completions/mean_length": 1509.361328125,
      "completions/mean_terminated_length": 682.7376098632812,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6011777758811983,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 32.272756881966046,
      "learning_rate": 4.708026163109725e-07,
      "loss": 0.0028,
      "num_tokens": 1697827932.0,
      "reward": 0.060546875,
      "reward_std": 0.0994330644607544,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 1761
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.748046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1713.361328125,
      "completions/mean_terminated_length": 719.8217163085938,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6015191601945891,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 10.465911730942768,
      "learning_rate": 4.7027493267626405e-07,
      "loss": 0.0048,
      "num_tokens": 1698780165.0,
      "reward": 0.048828125,
      "reward_std": 0.06425705552101135,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 1762
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.642578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1679.0,
      "completions/mean_length": 1530.82421875,
      "completions/mean_terminated_length": 601.0382080078125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6018605445079799,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 56.417287247243806,
      "learning_rate": 4.69747362196707e-07,
      "loss": 0.0365,
      "num_tokens": 1699634619.0,
      "reward": 0.07421875,
      "reward_std": 0.12517979741096497,
      "rewards/accuracy_reward/mean": 0.07661290466785431,
      "rewards/accuracy_reward/std": 0.2662447690963745,
      "step": 1763
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.654296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2026.0,
      "completions/mean_length": 1579.13671875,
      "completions/mean_terminated_length": 691.7401123046875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6022019288213707,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 12.507491630024033,
      "learning_rate": 4.692199056210907e-07,
      "loss": 0.0246,
      "num_tokens": 1700524865.0,
      "reward": 0.0859375,
      "reward_std": 0.12774282693862915,
      "rewards/accuracy_reward/mean": 0.0859375,
      "rewards/accuracy_reward/std": 0.28054583072662354,
      "step": 1764
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.673828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2011.0,
      "completions/mean_length": 1574.771484375,
      "completions/mean_terminated_length": 597.1437377929688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6025433131347615,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 7.428112265761851,
      "learning_rate": 4.6869256369804353e-07,
      "loss": 0.0144,
      "num_tokens": 1701408076.0,
      "reward": 0.04296875,
      "reward_std": 0.06233368441462517,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 1765
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.70703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1962.0,
      "completions/mean_length": 1626.927734375,
      "completions/mean_terminated_length": 610.739990234375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6028846974481522,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 64.10763037630339,
      "learning_rate": 4.6816533717603093e-07,
      "loss": 0.018,
      "num_tokens": 1702315735.0,
      "reward": 0.064453125,
      "reward_std": 0.11804604530334473,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 1766
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1995.0,
      "completions/mean_length": 1748.69921875,
      "completions/mean_terminated_length": 831.793701171875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.603226081761543,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 24.72551911186027,
      "learning_rate": 4.676382268033544e-07,
      "loss": 0.0061,
      "num_tokens": 1703289853.0,
      "reward": 0.0546875,
      "reward_std": 0.09797609597444534,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 1767
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.669921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1889.0,
      "completions/mean_length": 1561.298828125,
      "completions/mean_terminated_length": 573.7692260742188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6035674660749338,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 58.65711889295814,
      "learning_rate": 4.671112333281508e-07,
      "loss": 0.0119,
      "num_tokens": 1704167302.0,
      "reward": 0.0546875,
      "reward_std": 0.10150270164012909,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 1768
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.650390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1535.90625,
      "completions/mean_terminated_length": 583.240234375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6039088503883246,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 38.48290516411763,
      "learning_rate": 4.6658435749839087e-07,
      "loss": 0.019,
      "num_tokens": 1705025670.0,
      "reward": 0.0703125,
      "reward_std": 0.07104447484016418,
      "rewards/accuracy_reward/mean": 0.07258064299821854,
      "rewards/accuracy_reward/std": 0.25970885157585144,
      "step": 1769
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2026.0,
      "completions/mean_length": 1620.8984375,
      "completions/mean_terminated_length": 570.45947265625,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.6042502347017155,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 8.91925749806277,
      "learning_rate": 4.6605760006187857e-07,
      "loss": 0.0058,
      "num_tokens": 1705928674.0,
      "reward": 0.029296875,
      "reward_std": 0.05628519132733345,
      "rewards/accuracy_reward/mean": 0.030241934582591057,
      "rewards/accuracy_reward/std": 0.1714252382516861,
      "step": 1770
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.712890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2012.0,
      "completions/mean_length": 1630.6875,
      "completions/mean_terminated_length": 594.9115600585938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6045916190151063,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 10.939123723452132,
      "learning_rate": 4.655309617662496e-07,
      "loss": 0.0048,
      "num_tokens": 1706845346.0,
      "reward": 0.03515625,
      "reward_std": 0.0794091522693634,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 1771
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.673828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1904.0,
      "completions/mean_length": 1566.890625,
      "completions/mean_terminated_length": 572.9820556640625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6049330033284971,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 25.378032399529374,
      "learning_rate": 4.6500444335897094e-07,
      "loss": 0.0233,
      "num_tokens": 1707723786.0,
      "reward": 0.06640625,
      "reward_std": 0.15393419563770294,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "step": 1772
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.666015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1585.423828125,
      "completions/mean_terminated_length": 662.9766235351562,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6052743876418879,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 16.59547981771342,
      "learning_rate": 4.6447804558733894e-07,
      "loss": -0.0009,
      "num_tokens": 1708616275.0,
      "reward": 0.046875,
      "reward_std": 0.09723131358623505,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 1773
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.701171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1874.0,
      "completions/mean_length": 1644.56640625,
      "completions/mean_terminated_length": 697.9476928710938,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6056157719552786,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 56.24683300225862,
      "learning_rate": 4.6395176919847923e-07,
      "loss": 0.0259,
      "num_tokens": 1709536645.0,
      "reward": 0.0390625,
      "reward_std": 0.12560540437698364,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 1774
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.751953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1938.0,
      "completions/mean_length": 1673.552734375,
      "completions/mean_terminated_length": 538.4172973632812,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6059571562686694,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 26.316897250435495,
      "learning_rate": 4.634256149393445e-07,
      "loss": 0.027,
      "num_tokens": 1710473600.0,
      "reward": 0.021484375,
      "reward_std": 0.06519509106874466,
      "rewards/accuracy_reward/mean": 0.02217741869390011,
      "rewards/accuracy_reward/std": 0.14740893244743347,
      "step": 1775
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1895.0,
      "completions/mean_length": 1596.939453125,
      "completions/mean_terminated_length": 528.63818359375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6062985405820602,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 3.192807479981179,
      "learning_rate": 4.6289958355671475e-07,
      "loss": 0.0163,
      "num_tokens": 1711368401.0,
      "reward": 0.013671875,
      "reward_std": 0.043847277760505676,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "step": 1776
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.779296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1849.0,
      "completions/mean_length": 1754.98828125,
      "completions/mean_terminated_length": 720.3717041015625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.606639924895451,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 6.693809255990296,
      "learning_rate": 4.6237367579719535e-07,
      "loss": 0.0245,
      "num_tokens": 1712339067.0,
      "reward": 0.025390625,
      "reward_std": 0.05452118441462517,
      "rewards/accuracy_reward/mean": 0.026209676638245583,
      "rewards/accuracy_reward/std": 0.1599196493625641,
      "step": 1777
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.689453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1951.0,
      "completions/mean_length": 1576.892578125,
      "completions/mean_terminated_length": 530.974853515625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6069813092088419,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 42.58992168796201,
      "learning_rate": 4.618478924072159e-07,
      "loss": 0.0121,
      "num_tokens": 1713222884.0,
      "reward": 0.060546875,
      "reward_std": 0.07014855742454529,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 1778
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.74609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1719.0,
      "completions/mean_length": 1682.626953125,
      "completions/mean_terminated_length": 608.9923095703125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6073226935222327,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 14.794848268205975,
      "learning_rate": 4.6132223413303e-07,
      "loss": -0.0017,
      "num_tokens": 1714158869.0,
      "reward": 0.037109375,
      "reward_std": 0.07834454625844955,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 1779
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.73828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1962.0,
      "completions/mean_length": 1677.1953125,
      "completions/mean_terminated_length": 631.1940307617188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6076640778356235,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 8.92263605217602,
      "learning_rate": 4.607967017207135e-07,
      "loss": 0.0078,
      "num_tokens": 1715099913.0,
      "reward": 0.021484375,
      "reward_std": 0.06024399772286415,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 1780
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1771.0,
      "completions/mean_length": 1705.587890625,
      "completions/mean_terminated_length": 537.1293334960938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6080054621490143,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 8.83278625575297,
      "learning_rate": 4.602712959161633e-07,
      "loss": -0.0003,
      "num_tokens": 1716048758.0,
      "reward": 0.04296875,
      "reward_std": 0.050948236137628555,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 1781
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.763671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1944.0,
      "completions/mean_length": 1706.197265625,
      "completions/mean_terminated_length": 601.6942138671875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.608346846462405,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 79.287740337927,
      "learning_rate": 4.5974601746509687e-07,
      "loss": 0.0118,
      "num_tokens": 1716992907.0,
      "reward": 0.064453125,
      "reward_std": 0.0975041538476944,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 1782
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.736328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1900.0,
      "completions/mean_length": 1664.3046875,
      "completions/mean_terminated_length": 592.7999877929688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6086882307757958,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 43.95360653340727,
      "learning_rate": 4.592208671130511e-07,
      "loss": 0.0079,
      "num_tokens": 1717923111.0,
      "reward": 0.064453125,
      "reward_std": 0.08258409798145294,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 1783
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.78125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1904.0,
      "completions/mean_length": 1740.263671875,
      "completions/mean_terminated_length": 641.2053833007812,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6090296150891866,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.22052354947723599,
      "learning_rate": 4.5869584560538065e-07,
      "loss": 0.0169,
      "num_tokens": 1718901134.0,
      "reward": 0.03515625,
      "reward_std": 0.04505910724401474,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 1784
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.748046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1971.0,
      "completions/mean_length": 1669.23046875,
      "completions/mean_terminated_length": 544.6666870117188,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6093709994025774,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 53.25865397701371,
      "learning_rate": 4.5817095368725754e-07,
      "loss": 0.0266,
      "num_tokens": 1719833444.0,
      "reward": 0.078125,
      "reward_std": 0.10658828169107437,
      "rewards/accuracy_reward/mean": 0.078125,
      "rewards/accuracy_reward/std": 0.26863065361976624,
      "step": 1785
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1820.0,
      "completions/mean_length": 1720.30078125,
      "completions/mean_terminated_length": 601.6034545898438,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6097123837159683,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 15.336664911287983,
      "learning_rate": 4.576461921036702e-07,
      "loss": 0.0072,
      "num_tokens": 1720792910.0,
      "reward": 0.009765625,
      "reward_std": 0.03411140665411949,
      "rewards/accuracy_reward/mean": 0.009765625,
      "rewards/accuracy_reward/std": 0.09843364357948303,
      "step": 1786
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.74609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1849.0,
      "completions/mean_length": 1674.361328125,
      "completions/mean_terminated_length": 576.4384765625,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.6100537680293591,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 31.4563161853599,
      "learning_rate": 4.571215615994212e-07,
      "loss": 0.0189,
      "num_tokens": 1721730679.0,
      "reward": 0.060546875,
      "reward_std": 0.06750432401895523,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 1787
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1858.0,
      "completions/mean_length": 1621.626953125,
      "completions/mean_terminated_length": 552.7739868164062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6103951523427499,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 165.11636596109886,
      "learning_rate": 4.565970629191278e-07,
      "loss": 0.0045,
      "num_tokens": 1722639448.0,
      "reward": 0.080078125,
      "reward_std": 0.13820409774780273,
      "rewards/accuracy_reward/mean": 0.08266129344701767,
      "rewards/accuracy_reward/std": 0.2756475806236267,
      "step": 1788
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1919.0,
      "completions/mean_length": 1659.427734375,
      "completions/mean_terminated_length": 626.9357299804688,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.6107365366561407,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 14.591164161902098,
      "learning_rate": 4.5607269680721993e-07,
      "loss": 0.0163,
      "num_tokens": 1723565251.0,
      "reward": 0.037109375,
      "reward_std": 0.04461899772286415,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 1789
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.77734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1742.0,
      "completions/mean_length": 1750.087890625,
      "completions/mean_terminated_length": 710.0087890625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6110779209695314,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 15.830102008838816,
      "learning_rate": 4.5554846400793946e-07,
      "loss": -0.002,
      "num_tokens": 1724545328.0,
      "reward": 0.052734375,
      "reward_std": 0.07499225437641144,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 1790
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1899.0,
      "completions/mean_length": 1629.111328125,
      "completions/mean_terminated_length": 579.0205688476562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6114193052829222,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 8.346945159707694,
      "learning_rate": 4.5502436526533896e-07,
      "loss": 0.0215,
      "num_tokens": 1725455497.0,
      "reward": 0.052734375,
      "reward_std": 0.08318498730659485,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 1791
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.685546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1943.0,
      "completions/mean_length": 1619.904296875,
      "completions/mean_terminated_length": 686.6024780273438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.611760689596313,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 22.76036886807291,
      "learning_rate": 4.5450040132328074e-07,
      "loss": 0.017,
      "num_tokens": 1726353624.0,
      "reward": 0.080078125,
      "reward_std": 0.10925643146038055,
      "rewards/accuracy_reward/mean": 0.080078125,
      "rewards/accuracy_reward/std": 0.271679550409317,
      "step": 1792
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.70703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2042.0,
      "completions/mean_length": 1657.447265625,
      "completions/mean_terminated_length": 715.260009765625,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.6121020739097038,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 12.811707480112897,
      "learning_rate": 4.539765729254356e-07,
      "loss": 0.0018,
      "num_tokens": 1727281341.0,
      "reward": 0.0390625,
      "reward_std": 0.08698301017284393,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 1793
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2018.0,
      "completions/mean_length": 1694.591796875,
      "completions/mean_terminated_length": 634.3671875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.6124434582230946,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 19.720413781595724,
      "learning_rate": 4.5345288081528223e-07,
      "loss": 0.0218,
      "num_tokens": 1728229596.0,
      "reward": 0.046875,
      "reward_std": 0.09225328266620636,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 1794
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.67578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1989.0,
      "completions/mean_length": 1570.12109375,
      "completions/mean_terminated_length": 574.0602416992188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6127848425364855,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 19.631576705001798,
      "learning_rate": 4.529293257361059e-07,
      "loss": 0.0234,
      "num_tokens": 1729122106.0,
      "reward": 0.11328125,
      "reward_std": 0.14618882536888123,
      "rewards/accuracy_reward/mean": 0.11328125,
      "rewards/accuracy_reward/std": 0.3172462284564972,
      "step": 1795
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.716796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1931.0,
      "completions/mean_length": 1660.6171875,
      "completions/mean_terminated_length": 680.137939453125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6131262268498763,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 35.70968879959113,
      "learning_rate": 4.5240590843099725e-07,
      "loss": -0.009,
      "num_tokens": 1730054486.0,
      "reward": 0.04296875,
      "reward_std": 0.09743183106184006,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 1796
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1996.0,
      "completions/mean_length": 1576.140625,
      "completions/mean_terminated_length": 643.3953247070312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6134676111632671,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 50.68452388034922,
      "learning_rate": 4.5188262964285126e-07,
      "loss": -0.0022,
      "num_tokens": 1730938686.0,
      "reward": 0.03125,
      "reward_std": 0.047646716237068176,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 1797
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2016.0,
      "completions/mean_length": 1687.685546875,
      "completions/mean_terminated_length": 650.4166870117188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6138089954766578,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 6.231496164555943,
      "learning_rate": 4.513594901143668e-07,
      "loss": 0.0109,
      "num_tokens": 1731893613.0,
      "reward": 0.015625,
      "reward_std": 0.05259781330823898,
      "rewards/accuracy_reward/mean": 0.015625,
      "rewards/accuracy_reward/std": 0.12414088100194931,
      "step": 1798
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.78125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1887.0,
      "completions/mean_length": 1735.412109375,
      "completions/mean_terminated_length": 619.0267944335938,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6141503797900486,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 15.186711260204905,
      "learning_rate": 4.508364905880444e-07,
      "loss": 0.0093,
      "num_tokens": 1732858896.0,
      "reward": 0.029296875,
      "reward_std": 0.07570268213748932,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 1799
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.666015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2026.0,
      "completions/mean_length": 1580.34765625,
      "completions/mean_terminated_length": 647.7777709960938,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.6144917641034394,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 0.8823079623799638,
      "learning_rate": 4.503136318061863e-07,
      "loss": -0.0047,
      "num_tokens": 1733748946.0,
      "reward": 0.037109375,
      "reward_std": 0.06568294763565063,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 1800
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.61328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1800.0,
      "completions/mean_length": 1478.064453125,
      "completions/mean_terminated_length": 574.227294921875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6148331484168302,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 3.440715382871405,
      "learning_rate": 4.497909145108949e-07,
      "loss": 0.0266,
      "num_tokens": 1734574963.0,
      "reward": 0.060546875,
      "reward_std": 0.10618237406015396,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 1801
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.73828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1981.0,
      "completions/mean_length": 1663.755859375,
      "completions/mean_terminated_length": 579.84326171875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.615174532730221,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 57.573321466204106,
      "learning_rate": 4.4926833944407207e-07,
      "loss": 0.0155,
      "num_tokens": 1735500566.0,
      "reward": 0.107421875,
      "reward_std": 0.1546606868505478,
      "rewards/accuracy_reward/mean": 0.107421875,
      "rewards/accuracy_reward/std": 0.30995169281959534,
      "step": 1802
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.677734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1951.0,
      "completions/mean_length": 1637.591796875,
      "completions/mean_terminated_length": 774.4909057617188,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6155159170436119,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 14.267590194011289,
      "learning_rate": 4.4874590734741715e-07,
      "loss": 0.0149,
      "num_tokens": 1736409557.0,
      "reward": 0.04296875,
      "reward_std": 0.08126020431518555,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 1803
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.728515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2023.0,
      "completions/mean_length": 1663.822265625,
      "completions/mean_terminated_length": 632.8992919921875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6158573013570027,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 14.425682702275317,
      "learning_rate": 4.4822361896242734e-07,
      "loss": 0.0178,
      "num_tokens": 1737344122.0,
      "reward": 0.072265625,
      "reward_std": 0.07823152095079422,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 1804
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.634765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2022.0,
      "completions/mean_length": 1511.30859375,
      "completions/mean_terminated_length": 578.55615234375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6161986856703935,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 19.422064369614954,
      "learning_rate": 4.477014750303951e-07,
      "loss": -0.0012,
      "num_tokens": 1738191320.0,
      "reward": 0.041015625,
      "reward_std": 0.10689391195774078,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 1805
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.669921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1557.4296875,
      "completions/mean_terminated_length": 561.775146484375,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.6165400699837843,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 14.784035150807018,
      "learning_rate": 4.471794762924084e-07,
      "loss": 0.0152,
      "num_tokens": 1739070228.0,
      "reward": 0.060546875,
      "reward_std": 0.0815330445766449,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 1806
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1954.0,
      "completions/mean_length": 1532.873046875,
      "completions/mean_terminated_length": 566.5786743164062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.616881454297175,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 58.40928013189856,
      "learning_rate": 4.46657623489349e-07,
      "loss": 0.0341,
      "num_tokens": 1739939779.0,
      "reward": 0.091796875,
      "reward_std": 0.13667413592338562,
      "rewards/accuracy_reward/mean": 0.091796875,
      "rewards/accuracy_reward/std": 0.289021372795105,
      "step": 1807
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1956.0,
      "completions/mean_length": 1577.46875,
      "completions/mean_terminated_length": 630.87060546875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6172228386105658,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 21.973966482179144,
      "learning_rate": 4.461359173618914e-07,
      "loss": 0.0056,
      "num_tokens": 1740830259.0,
      "reward": 0.060546875,
      "reward_std": 0.11384069174528122,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 1808
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.630859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1984.0,
      "completions/mean_length": 1534.32421875,
      "completions/mean_terminated_length": 656.4550170898438,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6175642229239566,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 24.589240238210678,
      "learning_rate": 4.456143586505021e-07,
      "loss": 0.0017,
      "num_tokens": 1741688201.0,
      "reward": 0.046875,
      "reward_std": 0.08078034222126007,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 1809
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.697265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1928.0,
      "completions/mean_length": 1596.892578125,
      "completions/mean_terminated_length": 557.8903198242188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6179056072373474,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 31.959621530188844,
      "learning_rate": 4.450929480954383e-07,
      "loss": 0.0131,
      "num_tokens": 1742583650.0,
      "reward": 0.044921875,
      "reward_std": 0.09286519140005112,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 1810
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.67578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1933.0,
      "completions/mean_length": 1594.419921875,
      "completions/mean_terminated_length": 649.0059814453125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.6182469915507383,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 21.597695409867235,
      "learning_rate": 4.445716864367472e-07,
      "loss": 0.0001,
      "num_tokens": 1743476953.0,
      "reward": 0.03125,
      "reward_std": 0.06837663054466248,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 1811
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1905.0,
      "completions/mean_length": 1629.60546875,
      "completions/mean_terminated_length": 580.75341796875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6185883758641291,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 9.561996206174552,
      "learning_rate": 4.440505744142639e-07,
      "loss": 0.0104,
      "num_tokens": 1744382607.0,
      "reward": 0.0234375,
      "reward_std": 0.062167368829250336,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 1812
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2014.0,
      "completions/mean_length": 1653.640625,
      "completions/mean_terminated_length": 665.0410766601562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6189297601775199,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 4.187602275271422,
      "learning_rate": 4.4352961276761183e-07,
      "loss": 0.0111,
      "num_tokens": 1745301959.0,
      "reward": 0.029296875,
      "reward_std": 0.046722229570150375,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 1813
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.732421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1993.0,
      "completions/mean_length": 1677.26953125,
      "completions/mean_terminated_length": 662.496337890625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.6192711444909107,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 15.76920017639966,
      "learning_rate": 4.4300880223620063e-07,
      "loss": 0.0168,
      "num_tokens": 1746237233.0,
      "reward": 0.0546875,
      "reward_std": 0.0868699848651886,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 1814
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.759765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1980.0,
      "completions/mean_length": 1697.10546875,
      "completions/mean_terminated_length": 587.3658447265625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6196125288043014,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 9.48118460668332,
      "learning_rate": 4.424881435592256e-07,
      "loss": 0.0085,
      "num_tokens": 1747185191.0,
      "reward": 0.0546875,
      "reward_std": 0.09518137574195862,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 1815
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.638671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1720.0,
      "completions/mean_length": 1520.03125,
      "completions/mean_terminated_length": 586.8108520507812,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6199539131176922,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 34.61352425813081,
      "learning_rate": 4.419676374756668e-07,
      "loss": 0.0332,
      "num_tokens": 1748037863.0,
      "reward": 0.1171875,
      "reward_std": 0.14920367300510406,
      "rewards/accuracy_reward/mean": 0.1171875,
      "rewards/accuracy_reward/std": 0.32195815443992615,
      "step": 1816
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1761.0,
      "completions/mean_length": 1673.232421875,
      "completions/mean_terminated_length": 500.9193420410156,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.620295297431083,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 10.577681196747255,
      "learning_rate": 4.4144728472428725e-07,
      "loss": 0.0081,
      "num_tokens": 1748979198.0,
      "reward": 0.0078125,
      "reward_std": 0.02629890665411949,
      "rewards/accuracy_reward/mean": 0.0078125,
      "rewards/accuracy_reward/std": 0.08812850713729858,
      "step": 1817
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.701171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1809.0,
      "completions/mean_length": 1586.27734375,
      "completions/mean_terminated_length": 502.8888854980469,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6206366817444738,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 26.38431695122219,
      "learning_rate": 4.409270860436325e-07,
      "loss": 0.0028,
      "num_tokens": 1749875692.0,
      "reward": 0.09765625,
      "reward_std": 0.09060370922088623,
      "rewards/accuracy_reward/mean": 0.09765625,
      "rewards/accuracy_reward/std": 0.29713961482048035,
      "step": 1818
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.751953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1696.716796875,
      "completions/mean_terminated_length": 631.8031616210938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6209780660578647,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 50.101126611449025,
      "learning_rate": 4.404070421720293e-07,
      "loss": 0.0046,
      "num_tokens": 1750826187.0,
      "reward": 0.029296875,
      "reward_std": 0.05688370764255524,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 1819
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.68359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1858.0,
      "completions/mean_length": 1579.259765625,
      "completions/mean_terminated_length": 566.5493774414062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6213194503712555,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 25.579078851494202,
      "learning_rate": 4.398871538475849e-07,
      "loss": 0.02,
      "num_tokens": 1751710992.0,
      "reward": 0.087890625,
      "reward_std": 0.11630964279174805,
      "rewards/accuracy_reward/mean": 0.087890625,
      "rewards/accuracy_reward/std": 0.2834126651287079,
      "step": 1820
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.740234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1740.0,
      "completions/mean_length": 1690.267578125,
      "completions/mean_terminated_length": 670.8646850585938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6216608346846463,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 7.1482583364896275,
      "learning_rate": 4.3936742180818565e-07,
      "loss": 0.0069,
      "num_tokens": 1752662265.0,
      "reward": 0.03125,
      "reward_std": 0.047646716237068176,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 1821
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1703.916015625,
      "completions/mean_terminated_length": 555.0254516601562,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6220022189980371,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 11.334885314753166,
      "learning_rate": 4.3884784679149613e-07,
      "loss": 0.0194,
      "num_tokens": 1753615118.0,
      "reward": 0.02734375,
      "reward_std": 0.0645298957824707,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 1822
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.798828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1940.0,
      "completions/mean_length": 1748.908203125,
      "completions/mean_terminated_length": 561.25244140625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6223436033114278,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 60.002725650760134,
      "learning_rate": 4.383284295349583e-07,
      "loss": 0.017,
      "num_tokens": 1754590031.0,
      "reward": 0.044921875,
      "reward_std": 0.10336729884147644,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 1823
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1547.0,
      "completions/mean_length": 1697.1640625,
      "completions/mean_terminated_length": 551.1000366210938,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.6226849876248186,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 65.86490336253512,
      "learning_rate": 4.378091707757896e-07,
      "loss": 0.0029,
      "num_tokens": 1755533731.0,
      "reward": 0.052734375,
      "reward_std": 0.10145636647939682,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 1824
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.775390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1761.0,
      "completions/mean_length": 1721.30078125,
      "completions/mean_terminated_length": 593.4782104492188,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6230263719382094,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 7.726497648827249,
      "learning_rate": 4.372900712509831e-07,
      "loss": 0.0321,
      "num_tokens": 1756485885.0,
      "reward": 0.041015625,
      "reward_std": 0.0830162987112999,
      "rewards/accuracy_reward/mean": 0.04233871027827263,
      "rewards/accuracy_reward/std": 0.2015640139579773,
      "step": 1825
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1883.0,
      "completions/mean_length": 1600.880859375,
      "completions/mean_terminated_length": 617.21875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6233677562516002,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 21.824288767814878,
      "learning_rate": 4.367711316973054e-07,
      "loss": 0.0196,
      "num_tokens": 1757382000.0,
      "reward": 0.08984375,
      "reward_std": 0.13093778491020203,
      "rewards/accuracy_reward/mean": 0.08984375,
      "rewards/accuracy_reward/std": 0.2862374484539032,
      "step": 1826
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.720703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1749.0,
      "completions/mean_length": 1634.39453125,
      "completions/mean_terminated_length": 567.5174560546875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.623709140564991,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 27.525159576689205,
      "learning_rate": 4.3625235285129634e-07,
      "loss": 0.0059,
      "num_tokens": 1758293562.0,
      "reward": 0.029296875,
      "reward_std": 0.08010855317115784,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 1827
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.712890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2026.0,
      "completions/mean_length": 1642.173828125,
      "completions/mean_terminated_length": 634.5101928710938,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.6240505248783819,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 10.182693050655088,
      "learning_rate": 4.3573373544926786e-07,
      "loss": 0.0114,
      "num_tokens": 1759215939.0,
      "reward": 0.052734375,
      "reward_std": 0.07712717354297638,
      "rewards/accuracy_reward/mean": 0.05443548411130905,
      "rewards/accuracy_reward/std": 0.227104052901268,
      "step": 1828
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.728515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1779.0,
      "completions/mean_length": 1658.236328125,
      "completions/mean_terminated_length": 612.32373046875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6243919091917727,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 1.3930119646294938,
      "learning_rate": 4.352152802273024e-07,
      "loss": 0.0169,
      "num_tokens": 1760140572.0,
      "reward": 0.015625,
      "reward_std": 0.023823359981179237,
      "rewards/accuracy_reward/mean": 0.015625,
      "rewards/accuracy_reward/std": 0.12414088100194931,
      "step": 1829
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.70703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1690.0,
      "completions/mean_length": 1610.041015625,
      "completions/mean_terminated_length": 553.1000366210938,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6247332935051635,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 28.483093153318027,
      "learning_rate": 4.3469698792125196e-07,
      "loss": 0.0237,
      "num_tokens": 1761041649.0,
      "reward": 0.033203125,
      "reward_std": 0.07686128467321396,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 1830
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.744140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1947.0,
      "completions/mean_length": 1673.59375,
      "completions/mean_terminated_length": 584.6717529296875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6250746778185542,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 86.36712319459413,
      "learning_rate": 4.341788592667381e-07,
      "loss": 0.031,
      "num_tokens": 1761981665.0,
      "reward": 0.09765625,
      "reward_std": 0.12797029316425323,
      "rewards/accuracy_reward/mean": 0.09765625,
      "rewards/accuracy_reward/std": 0.29713961482048035,
      "step": 1831
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.724609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1939.0,
      "completions/mean_length": 1648.947265625,
      "completions/mean_terminated_length": 598.9573974609375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.625416062131945,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 9.152933770072398,
      "learning_rate": 4.336608949991496e-07,
      "loss": 0.0188,
      "num_tokens": 1762908902.0,
      "reward": 0.03515625,
      "reward_std": 0.08493966609239578,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 1832
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.732421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2023.0,
      "completions/mean_length": 1661.40625,
      "completions/mean_terminated_length": 603.211669921875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.6257574464453358,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 17.800753345782766,
      "learning_rate": 4.3314309585364185e-07,
      "loss": 0.0237,
      "num_tokens": 1763839094.0,
      "reward": 0.04296875,
      "reward_std": 0.09338457882404327,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 1833
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1955.0,
      "completions/mean_length": 1619.619140625,
      "completions/mean_terminated_length": 605.0328979492188,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.6260988307587266,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 7.519855647768585,
      "learning_rate": 4.3262546256513613e-07,
      "loss": 0.0101,
      "num_tokens": 1764746163.0,
      "reward": 0.046875,
      "reward_std": 0.07898354530334473,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 1834
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1878.0,
      "completions/mean_length": 1635.24609375,
      "completions/mean_terminated_length": 620.0946044921875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6264402150721174,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 119.7640821713971,
      "learning_rate": 4.3210799586831825e-07,
      "loss": 0.0052,
      "num_tokens": 1765660849.0,
      "reward": 0.0390625,
      "reward_std": 0.0712449923157692,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 1835
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.751953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1944.0,
      "completions/mean_length": 1693.01171875,
      "completions/mean_terminated_length": 616.8661499023438,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6267815993855083,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 10.961395164986127,
      "learning_rate": 4.3159069649763747e-07,
      "loss": 0.0076,
      "num_tokens": 1766613319.0,
      "reward": 0.02734375,
      "reward_std": 0.054907046258449554,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 1836
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.689453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1871.0,
      "completions/mean_length": 1596.236328125,
      "completions/mean_terminated_length": 593.26416015625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6271229836988991,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 27.529488416793896,
      "learning_rate": 4.3107356518730564e-07,
      "loss": 0.0217,
      "num_tokens": 1767503200.0,
      "reward": 0.0234375,
      "reward_std": 0.051659777760505676,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 1837
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.755859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1536.0,
      "completions/mean_length": 1691.791015625,
      "completions/mean_terminated_length": 588.968017578125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6274643680122899,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 28.010663518769846,
      "learning_rate": 4.30556602671296e-07,
      "loss": 0.0028,
      "num_tokens": 1768444325.0,
      "reward": 0.03125,
      "reward_std": 0.05880707502365112,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 1838
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2012.0,
      "completions/mean_length": 1699.998046875,
      "completions/mean_terminated_length": 611.0886840820312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6278057523256806,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 5.85948075817655,
      "learning_rate": 4.300398096833424e-07,
      "loss": 0.0053,
      "num_tokens": 1769391332.0,
      "reward": 0.064453125,
      "reward_std": 0.09280546009540558,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 1839
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1697.91796875,
      "completions/mean_terminated_length": 647.671875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.6281471366390714,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 107.37095180902176,
      "learning_rate": 4.2952318695693803e-07,
      "loss": -0.0041,
      "num_tokens": 1770330138.0,
      "reward": 0.0625,
      "reward_std": 0.10244170576334,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 1840
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.724609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2033.0,
      "completions/mean_length": 1654.841796875,
      "completions/mean_terminated_length": 620.723388671875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6284885209524622,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 28.36705516977205,
      "learning_rate": 4.290067352253344e-07,
      "loss": 0.0166,
      "num_tokens": 1771258457.0,
      "reward": 0.068359375,
      "reward_std": 0.10353910177946091,
      "rewards/accuracy_reward/mean": 0.068359375,
      "rewards/accuracy_reward/std": 0.25260838866233826,
      "step": 1841
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.716796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1598.0,
      "completions/mean_length": 1633.423828125,
      "completions/mean_terminated_length": 584.1172485351562,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.628829905265853,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 36.13622735992118,
      "learning_rate": 4.2849045522153994e-07,
      "loss": 0.0079,
      "num_tokens": 1772171618.0,
      "reward": 0.05078125,
      "reward_std": 0.07807311415672302,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 1842
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1962.0,
      "completions/mean_length": 1601.009765625,
      "completions/mean_terminated_length": 776.5611572265625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6291712895792438,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 1.2017222626306845,
      "learning_rate": 4.2797434767831985e-07,
      "loss": 0.014,
      "num_tokens": 1773059063.0,
      "reward": 0.0390625,
      "reward_std": 0.08177263289690018,
      "rewards/accuracy_reward/mean": 0.04032257944345474,
      "rewards/accuracy_reward/std": 0.19691328704357147,
      "step": 1843
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.693359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1989.0,
      "completions/mean_length": 1623.66796875,
      "completions/mean_terminated_length": 664.1911010742188,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.6295126738926347,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 19.6054492603802,
      "learning_rate": 4.274584133281944e-07,
      "loss": -0.006,
      "num_tokens": 1773966365.0,
      "reward": 0.0625,
      "reward_std": 0.10904473811388016,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 1844
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1974.0,
      "completions/mean_length": 1489.626953125,
      "completions/mean_terminated_length": 527.324462890625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.6298540582060255,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 2.5058880491406486,
      "learning_rate": 4.269426529034382e-07,
      "loss": 0.0262,
      "num_tokens": 1774800990.0,
      "reward": 0.025390625,
      "reward_std": 0.05303792655467987,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 1845
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.64453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1948.0,
      "completions/mean_length": 1535.265625,
      "completions/mean_terminated_length": 605.5824584960938,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.6301954425194163,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 30.826239809888694,
      "learning_rate": 4.264270671360787e-07,
      "loss": 0.0048,
      "num_tokens": 1775668006.0,
      "reward": 0.048828125,
      "reward_std": 0.09231997281312943,
      "rewards/accuracy_reward/mean": 0.05040322616696358,
      "rewards/accuracy_reward/std": 0.21899642050266266,
      "step": 1846
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.685546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1966.0,
      "completions/mean_length": 1610.08984375,
      "completions/mean_terminated_length": 655.3912963867188,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.630536826832807,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 117.83752239192344,
      "learning_rate": 4.2591165675789555e-07,
      "loss": 0.0386,
      "num_tokens": 1776564260.0,
      "reward": 0.0703125,
      "reward_std": 0.1254933476448059,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 1847
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1764.0,
      "completions/mean_length": 1571.625,
      "completions/mean_terminated_length": 523.6000366210938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6308782111461978,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 30.961108684547437,
      "learning_rate": 4.2539642250041973e-07,
      "loss": 0.0148,
      "num_tokens": 1777442372.0,
      "reward": 0.048828125,
      "reward_std": 0.07438573986291885,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 1848
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2028.0,
      "completions/mean_length": 1691.486328125,
      "completions/mean_terminated_length": 665.1591186523438,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6312195954595886,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 24.969633047847786,
      "learning_rate": 4.2488136509493165e-07,
      "loss": 0.006,
      "num_tokens": 1778387517.0,
      "reward": 0.064453125,
      "reward_std": 0.11509227007627487,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 1849
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.716796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1980.0,
      "completions/mean_length": 1672.365234375,
      "completions/mean_terminated_length": 722.027587890625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.6315609797729794,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 11.342091751656254,
      "learning_rate": 4.243664852724614e-07,
      "loss": 0.011,
      "num_tokens": 1779339384.0,
      "reward": 0.0546875,
      "reward_std": 0.088299959897995,
      "rewards/accuracy_reward/mean": 0.05645161122083664,
      "rewards/accuracy_reward/std": 0.23102475702762604,
      "step": 1850
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 1514.173828125,
      "completions/mean_terminated_length": 681.6699829101562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6319023640863702,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 45.96276231829346,
      "learning_rate": 4.238517837637866e-07,
      "loss": 0.0385,
      "num_tokens": 1780192625.0,
      "reward": 0.107421875,
      "reward_std": 0.16086195409297943,
      "rewards/accuracy_reward/mean": 0.107421875,
      "rewards/accuracy_reward/std": 0.30995169281959534,
      "step": 1851
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.68359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2012.0,
      "completions/mean_length": 1635.52734375,
      "completions/mean_terminated_length": 744.3827514648438,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.632243748399761,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 10.33648519828817,
      "learning_rate": 4.233372612994319e-07,
      "loss": 0.037,
      "num_tokens": 1781105791.0,
      "reward": 0.060546875,
      "reward_std": 0.1282098889350891,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 1852
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1976.0,
      "completions/mean_length": 1689.939453125,
      "completions/mean_terminated_length": 615.7578125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6325851327131519,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 7.116431464790528,
      "learning_rate": 4.228229186096679e-07,
      "loss": 0.0028,
      "num_tokens": 1782061056.0,
      "reward": 0.013671875,
      "reward_std": 0.026572702452540398,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "step": 1853
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1917.0,
      "completions/mean_length": 1740.44140625,
      "completions/mean_terminated_length": 798.2381591796875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6329265170265427,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 3.756186788779025,
      "learning_rate": 4.223087564245099e-07,
      "loss": 0.0174,
      "num_tokens": 1783026546.0,
      "reward": 0.021484375,
      "reward_std": 0.05287161096930504,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 1854
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1839.0,
      "completions/mean_length": 1572.73046875,
      "completions/mean_terminated_length": 616.6000366210938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6332679013399334,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 18.840068506715816,
      "learning_rate": 4.2179477547371713e-07,
      "loss": 0.008,
      "num_tokens": 1783909000.0,
      "reward": 0.025390625,
      "reward_std": 0.06536141037940979,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 1855
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.67578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1920.0,
      "completions/mean_length": 1575.8671875,
      "completions/mean_terminated_length": 591.7830810546875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6336092856533242,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 4.193641523439476,
      "learning_rate": 4.212809764867915e-07,
      "loss": 0.0102,
      "num_tokens": 1784790500.0,
      "reward": 0.0234375,
      "reward_std": 0.07300759106874466,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 1856
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1933.0,
      "completions/mean_length": 1642.703125,
      "completions/mean_terminated_length": 522.1764526367188,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.633950669966715,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 23.448024839000897,
      "learning_rate": 4.2076736019297674e-07,
      "loss": 0.0218,
      "num_tokens": 1785712588.0,
      "reward": 0.060546875,
      "reward_std": 0.11355571448802948,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 1857
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.677734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2023.0,
      "completions/mean_length": 1588.302734375,
      "completions/mean_terminated_length": 621.8484497070312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6342920542801058,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 27.131050244956064,
      "learning_rate": 4.202539273212572e-07,
      "loss": 0.0066,
      "num_tokens": 1786602151.0,
      "reward": 0.0703125,
      "reward_std": 0.09529343992471695,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 1858
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1991.0,
      "completions/mean_length": 1636.376953125,
      "completions/mean_terminated_length": 661.4802856445312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6346334385934966,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 16.303459305289422,
      "learning_rate": 4.197406786003569e-07,
      "loss": 0.0138,
      "num_tokens": 1787515336.0,
      "reward": 0.02734375,
      "reward_std": 0.06304662674665451,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 1859
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.630859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1506.029296875,
      "completions/mean_terminated_length": 579.80419921875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6349748229068874,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 24.651070077108315,
      "learning_rate": 4.192276147587387e-07,
      "loss": 0.0298,
      "num_tokens": 1788367799.0,
      "reward": 0.03515625,
      "reward_std": 0.0917677953839302,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 1860
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1971.0,
      "completions/mean_length": 1666.62890625,
      "completions/mean_terminated_length": 653.2714233398438,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6353162072202783,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 11.958992180749624,
      "learning_rate": 4.1871473652460265e-07,
      "loss": 0.0117,
      "num_tokens": 1789308969.0,
      "reward": 0.05078125,
      "reward_std": 0.054534729570150375,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 1861
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.62890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1924.0,
      "completions/mean_length": 1533.990234375,
      "completions/mean_terminated_length": 662.8789672851562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6356575915336691,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 13.218555622655918,
      "learning_rate": 4.182020446258857e-07,
      "loss": 0.0193,
      "num_tokens": 1790169508.0,
      "reward": 0.060546875,
      "reward_std": 0.09742391854524612,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 1862
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.755859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1893.0,
      "completions/mean_length": 1701.494140625,
      "completions/mean_terminated_length": 628.7120361328125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6359989758470598,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 17.92061487289862,
      "learning_rate": 4.1768953979026024e-07,
      "loss": 0.0278,
      "num_tokens": 1791117665.0,
      "reward": 0.0859375,
      "reward_std": 0.1312175691127777,
      "rewards/accuracy_reward/mean": 0.0859375,
      "rewards/accuracy_reward/std": 0.28054583072662354,
      "step": 1863
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.759765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1869.0,
      "completions/mean_length": 1714.470703125,
      "completions/mean_terminated_length": 659.650390625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6363403601604506,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 22.221371870284877,
      "learning_rate": 4.171772227451331e-07,
      "loss": 0.0075,
      "num_tokens": 1792072786.0,
      "reward": 0.046875,
      "reward_std": 0.09413031488656998,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 1864
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1957.0,
      "completions/mean_length": 1783.560546875,
      "completions/mean_terminated_length": 694.6099853515625,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.6366817444738414,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 25.433143644358598,
      "learning_rate": 4.166650942176447e-07,
      "loss": 0.01,
      "num_tokens": 1793072257.0,
      "reward": 0.064453125,
      "reward_std": 0.09016359597444534,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 1865
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1817.0,
      "completions/mean_length": 1700.888671875,
      "completions/mean_terminated_length": 541.8898315429688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6370231287872322,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 85.78492015725827,
      "learning_rate": 4.1615315493466797e-07,
      "loss": 0.0126,
      "num_tokens": 1794017816.0,
      "reward": 0.03125,
      "reward_std": 0.08290977776050568,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 1866
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.74609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1971.0,
      "completions/mean_length": 1686.474609375,
      "completions/mean_terminated_length": 624.1461791992188,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.637364513100623,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 54.130651710185056,
      "learning_rate": 4.156414056228065e-07,
      "loss": 0.0062,
      "num_tokens": 1794957307.0,
      "reward": 0.041015625,
      "reward_std": 0.06288585811853409,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 1867
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.701171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2027.0,
      "completions/mean_length": 1615.66796875,
      "completions/mean_terminated_length": 601.2418212890625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6377058974140138,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 17.881941645674697,
      "learning_rate": 4.151298470083954e-07,
      "loss": 0.041,
      "num_tokens": 1795859041.0,
      "reward": 0.107421875,
      "reward_std": 0.13792334496974945,
      "rewards/accuracy_reward/mean": 0.107421875,
      "rewards/accuracy_reward/std": 0.30995169281959534,
      "step": 1868
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1983.0,
      "completions/mean_length": 1700.154296875,
      "completions/mean_terminated_length": 611.7338256835938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6380472817274047,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 24.588248207098324,
      "learning_rate": 4.146184798174983e-07,
      "loss": 0.0099,
      "num_tokens": 1796807584.0,
      "reward": 0.05078125,
      "reward_std": 0.09028453379869461,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 1869
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.759765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1939.0,
      "completions/mean_length": 1704.033203125,
      "completions/mean_terminated_length": 617.081298828125,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.6383886660407955,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 45.12812514210717,
      "learning_rate": 4.141073047759076e-07,
      "loss": 0.011,
      "num_tokens": 1797771777.0,
      "reward": 0.025390625,
      "reward_std": 0.048406004905700684,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 1870
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.73046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1662.0,
      "completions/mean_length": 1635.787109375,
      "completions/mean_terminated_length": 518.6304321289062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6387300503541862,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 25.654035521992082,
      "learning_rate": 4.135963226091426e-07,
      "loss": 0.0101,
      "num_tokens": 1798690260.0,
      "reward": 0.107421875,
      "reward_std": 0.1643633246421814,
      "rewards/accuracy_reward/mean": 0.107421875,
      "rewards/accuracy_reward/std": 0.30995169281959534,
      "step": 1871
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1953.0,
      "completions/mean_length": 1610.197265625,
      "completions/mean_terminated_length": 647.03125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.639071434667577,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 20.45110421108804,
      "learning_rate": 4.1308553404244927e-07,
      "loss": 0.0148,
      "num_tokens": 1799589305.0,
      "reward": 0.064453125,
      "reward_std": 0.0829630047082901,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 1872
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1779.0,
      "completions/mean_length": 1676.05078125,
      "completions/mean_terminated_length": 605.2879028320312,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6394128189809678,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 34.66645930680124,
      "learning_rate": 4.1257493980079825e-07,
      "loss": 0.0273,
      "num_tokens": 1800517619.0,
      "reward": 0.08984375,
      "reward_std": 0.10656822472810745,
      "rewards/accuracy_reward/mean": 0.08984375,
      "rewards/accuracy_reward/std": 0.2862374484539032,
      "step": 1873
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1847.0,
      "completions/mean_length": 1752.974609375,
      "completions/mean_terminated_length": 596.0769653320312,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.6397542032943586,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 2.2474519057455,
      "learning_rate": 4.120645406088846e-07,
      "loss": -0.0009,
      "num_tokens": 1801494662.0,
      "reward": 0.029296875,
      "reward_std": 0.028222277760505676,
      "rewards/accuracy_reward/mean": 0.030241934582591057,
      "rewards/accuracy_reward/std": 0.1714252382516861,
      "step": 1874
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.70703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1980.0,
      "completions/mean_length": 1606.20703125,
      "completions/mean_terminated_length": 540.0133666992188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6400955876077494,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 10.664033533125084,
      "learning_rate": 4.1155433719112696e-07,
      "loss": 0.0149,
      "num_tokens": 1802403056.0,
      "reward": 0.02734375,
      "reward_std": 0.06987475603818893,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 1875
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1944.0,
      "completions/mean_length": 1650.171875,
      "completions/mean_terminated_length": 593.085693359375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6404369719211402,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 17.07520178585204,
      "learning_rate": 4.1104433027166564e-07,
      "loss": 0.0374,
      "num_tokens": 1803318472.0,
      "reward": 0.056640625,
      "reward_std": 0.10837717354297638,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 1876
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.708984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1710.0,
      "completions/mean_length": 1637.01953125,
      "completions/mean_terminated_length": 635.7717895507812,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.640778356234531,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 53.899380724028475,
      "learning_rate": 4.1053452057436213e-07,
      "loss": 0.0118,
      "num_tokens": 1804233074.0,
      "reward": 0.033203125,
      "reward_std": 0.06920956075191498,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 1877
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1975.0,
      "completions/mean_length": 1555.181640625,
      "completions/mean_terminated_length": 630.455078125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6411197405479219,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 11.48270970018041,
      "learning_rate": 4.1002490882279804e-07,
      "loss": 0.0071,
      "num_tokens": 1805105887.0,
      "reward": 0.037109375,
      "reward_std": 0.064970001578331,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 1878
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.771484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2026.0,
      "completions/mean_length": 1716.28515625,
      "completions/mean_terminated_length": 596.3931884765625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.6414611248613126,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 18.68429825475646,
      "learning_rate": 4.0951549574027434e-07,
      "loss": 0.0088,
      "num_tokens": 1806052625.0,
      "reward": 0.025390625,
      "reward_std": 0.057656385004520416,
      "rewards/accuracy_reward/mean": 0.026209676638245583,
      "rewards/accuracy_reward/std": 0.1599196344614029,
      "step": 1879
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1701.0,
      "completions/mean_length": 1684.283203125,
      "completions/mean_terminated_length": 593.1328125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6418025091747034,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 49.4379803828236,
      "learning_rate": 4.0900628204980924e-07,
      "loss": 0.0165,
      "num_tokens": 1807000402.0,
      "reward": 0.072265625,
      "reward_std": 0.08961933851242065,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 1880
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1928.0,
      "completions/mean_length": 1665.091796875,
      "completions/mean_terminated_length": 705.1986083984375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6421438934880942,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 27.902747443619923,
      "learning_rate": 4.084972684741386e-07,
      "loss": 0.0192,
      "num_tokens": 1807929857.0,
      "reward": 0.0546875,
      "reward_std": 0.09880206733942032,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 1881
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1899.0,
      "completions/mean_length": 1559.21484375,
      "completions/mean_terminated_length": 575.8941040039062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.642485277801485,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 51.563566395950225,
      "learning_rate": 4.079884557357142e-07,
      "loss": 0.009,
      "num_tokens": 1808809215.0,
      "reward": 0.041015625,
      "reward_std": 0.10100477933883667,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 1882
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.68359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1618.8984375,
      "completions/mean_terminated_length": 691.8271484375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6428266621148758,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 18.205882448412563,
      "learning_rate": 4.0747984455670257e-07,
      "loss": 0.0082,
      "num_tokens": 1809719483.0,
      "reward": 0.06640625,
      "reward_std": 0.11052703112363815,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "step": 1883
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1874.0,
      "completions/mean_length": 1590.919921875,
      "completions/mean_terminated_length": 547.8397216796875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6431680464282666,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 18.997819879991024,
      "learning_rate": 4.069714356589844e-07,
      "loss": 0.0026,
      "num_tokens": 1810611906.0,
      "reward": 0.064453125,
      "reward_std": 0.08644992858171463,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 1884
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1986.0,
      "completions/mean_length": 1558.3671875,
      "completions/mean_terminated_length": 655.2666625976562,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6435094307416575,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 5.680236661237201,
      "learning_rate": 4.064632297641533e-07,
      "loss": 0.0091,
      "num_tokens": 1811491502.0,
      "reward": 0.037109375,
      "reward_std": 0.0557793527841568,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 1885
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2028.0,
      "completions/mean_length": 1693.7265625,
      "completions/mean_terminated_length": 714.2647094726562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6438508150550483,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 23.44396970075306,
      "learning_rate": 4.05955227593514e-07,
      "loss": 0.0029,
      "num_tokens": 1812434962.0,
      "reward": 0.0859375,
      "reward_std": 0.1332087218761444,
      "rewards/accuracy_reward/mean": 0.0859375,
      "rewards/accuracy_reward/std": 0.28054583072662354,
      "step": 1886
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2028.0,
      "completions/mean_length": 1624.328125,
      "completions/mean_terminated_length": 620.8947143554688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.644192199368439,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 21.949686178829488,
      "learning_rate": 4.05447429868083e-07,
      "loss": 0.0092,
      "num_tokens": 1813350810.0,
      "reward": 0.06640625,
      "reward_std": 0.09247933328151703,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "step": 1887
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1992.0,
      "completions/mean_length": 1635.7421875,
      "completions/mean_terminated_length": 659.3421020507812,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6445335836818298,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 64.40457654317638,
      "learning_rate": 4.049398373085862e-07,
      "loss": 0.01,
      "num_tokens": 1814262134.0,
      "reward": 0.033203125,
      "reward_std": 0.04219770431518555,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 1888
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.67578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1899.0,
      "completions/mean_length": 1607.421875,
      "completions/mean_terminated_length": 689.1083984375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6448749679952206,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 33.52260018606143,
      "learning_rate": 4.044324506354585e-07,
      "loss": 0.0172,
      "num_tokens": 1815162846.0,
      "reward": 0.03125,
      "reward_std": 0.06068410724401474,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 1889
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.62109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2013.0,
      "completions/mean_length": 1494.90234375,
      "completions/mean_terminated_length": 588.2783203125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6452163523086114,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 14.791239591765395,
      "learning_rate": 4.0392527056884254e-07,
      "loss": 0.0232,
      "num_tokens": 1815999036.0,
      "reward": 0.044921875,
      "reward_std": 0.10453138500452042,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 1890
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1806.0,
      "completions/mean_length": 1620.541015625,
      "completions/mean_terminated_length": 645.0576782226562,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6455577366220022,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 39.13802754860844,
      "learning_rate": 4.034182978285877e-07,
      "loss": 0.0249,
      "num_tokens": 1816906577.0,
      "reward": 0.087890625,
      "reward_std": 0.13353893160820007,
      "rewards/accuracy_reward/mean": 0.087890625,
      "rewards/accuracy_reward/std": 0.2834126651287079,
      "step": 1891
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.658203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1926.0,
      "completions/mean_length": 1547.8671875,
      "completions/mean_terminated_length": 585.0171508789062,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.645899120935393,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 7.715313124520083,
      "learning_rate": 4.0291153313424874e-07,
      "loss": -0.005,
      "num_tokens": 1817777373.0,
      "reward": 0.025390625,
      "reward_std": 0.047094546258449554,
      "rewards/accuracy_reward/mean": 0.026209676638245583,
      "rewards/accuracy_reward/std": 0.1599196344614029,
      "step": 1892
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1914.0,
      "completions/mean_length": 1626.55078125,
      "completions/mean_terminated_length": 646.8181762695312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6462405052487838,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 21.383252866922586,
      "learning_rate": 4.024049772050857e-07,
      "loss": 0.0105,
      "num_tokens": 1818693415.0,
      "reward": 0.052734375,
      "reward_std": 0.10854348540306091,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 1893
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2008.0,
      "completions/mean_length": 1603.267578125,
      "completions/mean_terminated_length": 606.841796875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6465818895621747,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 12.602934390656614,
      "learning_rate": 4.01898630760062e-07,
      "loss": 0.0218,
      "num_tokens": 1819587568.0,
      "reward": 0.052734375,
      "reward_std": 0.06260652095079422,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 1894
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.716796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2018.0,
      "completions/mean_length": 1662.603515625,
      "completions/mean_terminated_length": 687.1517333984375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6469232738755654,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 75.83647106595872,
      "learning_rate": 4.0139249451784383e-07,
      "loss": 0.0135,
      "num_tokens": 1820519669.0,
      "reward": 0.0625,
      "reward_std": 0.11294380575418472,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 1895
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.759765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1742.587890625,
      "completions/mean_terminated_length": 777.6422729492188,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.6472646581889562,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 38.37533702448189,
      "learning_rate": 4.00886569196799e-07,
      "loss": 0.0223,
      "num_tokens": 1821497842.0,
      "reward": 0.0703125,
      "reward_std": 0.13247431814670563,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 1896
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.654296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2002.0,
      "completions/mean_length": 1578.185546875,
      "completions/mean_terminated_length": 688.9887084960938,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.647606042502347,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 28.96005356889781,
      "learning_rate": 4.003808555149961e-07,
      "loss": 0.0146,
      "num_tokens": 1822383697.0,
      "reward": 0.08203125,
      "reward_std": 0.12285147607326508,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 1897
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1965.0,
      "completions/mean_length": 1606.001953125,
      "completions/mean_terminated_length": 700.9583740234375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6479474268157378,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 18.487891776182312,
      "learning_rate": 3.9987535419020303e-07,
      "loss": 0.0269,
      "num_tokens": 1823281090.0,
      "reward": 0.0546875,
      "reward_std": 0.059858135879039764,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 1898
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1956.0,
      "completions/mean_length": 1641.009765625,
      "completions/mean_terminated_length": 694.8895874023438,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6482888111291286,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 70.17150869471854,
      "learning_rate": 3.993700659398863e-07,
      "loss": -0.0089,
      "num_tokens": 1824197991.0,
      "reward": 0.046875,
      "reward_std": 0.057157501578330994,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 1899
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.677734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2007.0,
      "completions/mean_length": 1602.35546875,
      "completions/mean_terminated_length": 665.1514892578125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6486301954425194,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 56.92204271812513,
      "learning_rate": 3.9886499148121055e-07,
      "loss": 0.0284,
      "num_tokens": 1825098925.0,
      "reward": 0.107421875,
      "reward_std": 0.1464010328054428,
      "rewards/accuracy_reward/mean": 0.107421875,
      "rewards/accuracy_reward/std": 0.30995169281959534,
      "step": 1900
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1967.0,
      "completions/mean_length": 1608.669921875,
      "completions/mean_terminated_length": 568.1513061523438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6489715797559102,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 28.794096151075603,
      "learning_rate": 3.9836013153103643e-07,
      "loss": 0.029,
      "num_tokens": 1825998420.0,
      "reward": 0.0546875,
      "reward_std": 0.07861737906932831,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 1901
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.658203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2005.0,
      "completions/mean_length": 1571.388671875,
      "completions/mean_terminated_length": 653.8856811523438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6493129640693011,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 40.29705204654067,
      "learning_rate": 3.9785548680592027e-07,
      "loss": 0.0086,
      "num_tokens": 1826879147.0,
      "reward": 0.08984375,
      "reward_std": 0.1296786367893219,
      "rewards/accuracy_reward/mean": 0.08984375,
      "rewards/accuracy_reward/std": 0.2862374484539032,
      "step": 1902
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.685546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 1604.296875,
      "completions/mean_terminated_length": 636.9689331054688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6496543483826919,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 18.014936877204462,
      "learning_rate": 3.97351058022113e-07,
      "loss": 0.0019,
      "num_tokens": 1827780131.0,
      "reward": 0.0625,
      "reward_std": 0.09435540437698364,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 1903
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.72265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1881.0,
      "completions/mean_length": 1678.91796875,
      "completions/mean_terminated_length": 717.225341796875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6499957326960826,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 21.39527370249335,
      "learning_rate": 3.9684684589555894e-07,
      "loss": 0.0282,
      "num_tokens": 1828719065.0,
      "reward": 0.03515625,
      "reward_std": 0.08345641195774078,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 1904
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.673828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1605.140625,
      "completions/mean_terminated_length": 690.2515258789062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6503371170094734,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 22.325634809261118,
      "learning_rate": 3.9634285114189505e-07,
      "loss": -0.0177,
      "num_tokens": 1829619265.0,
      "reward": 0.041015625,
      "reward_std": 0.062062256038188934,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 1905
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2019.0,
      "completions/mean_length": 1625.32421875,
      "completions/mean_terminated_length": 660.7564086914062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6506785013228642,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 61.240552491319065,
      "learning_rate": 3.958390744764497e-07,
      "loss": 0.0201,
      "num_tokens": 1830529255.0,
      "reward": 0.05859375,
      "reward_std": 0.10150270164012909,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 1906
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.72265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1874.0,
      "completions/mean_length": 1626.369140625,
      "completions/mean_terminated_length": 527.7535400390625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.651019885636255,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 10.628270906804987,
      "learning_rate": 3.953355166142417e-07,
      "loss": -0.0007,
      "num_tokens": 1831440260.0,
      "reward": 0.01953125,
      "reward_std": 0.06233368441462517,
      "rewards/accuracy_reward/mean": 0.02083333395421505,
      "rewards/accuracy_reward/std": 0.14297515153884888,
      "step": 1907
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2023.0,
      "completions/mean_length": 1572.7890625,
      "completions/mean_terminated_length": 649.6781616210938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6513612699496458,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 15.521275011062448,
      "learning_rate": 3.9483217826997927e-07,
      "loss": 0.0316,
      "num_tokens": 1832317544.0,
      "reward": 0.0859375,
      "reward_std": 0.12324429303407669,
      "rewards/accuracy_reward/mean": 0.0859375,
      "rewards/accuracy_reward/std": 0.28054583072662354,
      "step": 1908
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.689453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 1608.9921875,
      "completions/mean_terminated_length": 634.5848999023438,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.6517026542630366,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 5.360901834856371,
      "learning_rate": 3.9432906015805946e-07,
      "loss": 0.01,
      "num_tokens": 1833225940.0,
      "reward": 0.0234375,
      "reward_std": 0.062167368829250336,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 1909
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.611328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1926.0,
      "completions/mean_length": 1465.458984375,
      "completions/mean_terminated_length": 549.2009887695312,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.6520440385764275,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 27.16075614590475,
      "learning_rate": 3.93826162992566e-07,
      "loss": -0.0027,
      "num_tokens": 1834055327.0,
      "reward": 0.046875,
      "reward_std": 0.059858135879039764,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 1910
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.646484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2031.0,
      "completions/mean_length": 1554.236328125,
      "completions/mean_terminated_length": 651.2762451171875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6523854228898183,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 11.83903793745618,
      "learning_rate": 3.933234874872695e-07,
      "loss": 0.0198,
      "num_tokens": 1834927048.0,
      "reward": 0.03515625,
      "reward_std": 0.07796105742454529,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 1911
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.576171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1501.0,
      "completions/mean_length": 1427.958984375,
      "completions/mean_terminated_length": 585.0460815429688,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.652726807203209,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 2.8231449167928226,
      "learning_rate": 3.92821034355626e-07,
      "loss": 0.0282,
      "num_tokens": 1835731171.0,
      "reward": 0.076171875,
      "reward_std": 0.11377984285354614,
      "rewards/accuracy_reward/mean": 0.076171875,
      "rewards/accuracy_reward/std": 0.26553234457969666,
      "step": 1912
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.62890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2003.0,
      "completions/mean_length": 1531.041015625,
      "completions/mean_terminated_length": 654.9315795898438,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.6530681915165998,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 12.60456839287929,
      "learning_rate": 3.923188043107758e-07,
      "loss": 0.0473,
      "num_tokens": 1836590040.0,
      "reward": 0.048828125,
      "reward_std": 0.09601948410272598,
      "rewards/accuracy_reward/mean": 0.05040322616696358,
      "rewards/accuracy_reward/std": 0.21899642050266266,
      "step": 1913
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.708984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1959.0,
      "completions/mean_length": 1647.740234375,
      "completions/mean_terminated_length": 672.6107177734375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.6534095758299906,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 41.09799395784116,
      "learning_rate": 3.9181679806554267e-07,
      "loss": 0.0209,
      "num_tokens": 1837509219.0,
      "reward": 0.080078125,
      "reward_std": 0.09534214437007904,
      "rewards/accuracy_reward/mean": 0.08266129344701767,
      "rewards/accuracy_reward/std": 0.2756475806236267,
      "step": 1914
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1828.0,
      "completions/mean_length": 1625.287109375,
      "completions/mean_terminated_length": 642.6168823242188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6537509601433814,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 37.618429167559775,
      "learning_rate": 3.913150163324326e-07,
      "loss": 0.017,
      "num_tokens": 1838416566.0,
      "reward": 0.0546875,
      "reward_std": 0.0767945945262909,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 1915
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1950.0,
      "completions/mean_length": 1529.771484375,
      "completions/mean_terminated_length": 573.9277954101562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6540923444567722,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 0.21945231270445678,
      "learning_rate": 3.90813459823633e-07,
      "loss": 0.0099,
      "num_tokens": 1839278673.0,
      "reward": 0.06640625,
      "reward_std": 0.04847269132733345,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "step": 1916
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.623046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1890.0,
      "completions/mean_length": 1557.873046875,
      "completions/mean_terminated_length": 747.766845703125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.654433728770163,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 1.610379892229922,
      "learning_rate": 3.9031212925101144e-07,
      "loss": 0.0204,
      "num_tokens": 1840152768.0,
      "reward": 0.04296875,
      "reward_std": 0.09671889245510101,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 1917
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.677734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1949.0,
      "completions/mean_length": 1597.68359375,
      "completions/mean_terminated_length": 650.654541015625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6547751130835539,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 27.946039753780898,
      "learning_rate": 3.898110253261151e-07,
      "loss": 0.0129,
      "num_tokens": 1841048654.0,
      "reward": 0.06640625,
      "reward_std": 0.09580586850643158,
      "rewards/accuracy_reward/mean": 0.06854838877916336,
      "rewards/accuracy_reward/std": 0.25293970108032227,
      "step": 1918
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.673828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1863.0,
      "completions/mean_length": 1577.87109375,
      "completions/mean_terminated_length": 606.646728515625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.6551164973969447,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 10.168538975497906,
      "learning_rate": 3.8931014876016944e-07,
      "loss": 0.0135,
      "num_tokens": 1841936076.0,
      "reward": 0.0703125,
      "reward_std": 0.12538030743598938,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 1919
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.63671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1996.0,
      "completions/mean_length": 1566.767578125,
      "completions/mean_terminated_length": 723.3171997070312,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6554578817103354,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 12.76728289799827,
      "learning_rate": 3.88809500264077e-07,
      "loss": 0.0194,
      "num_tokens": 1842813733.0,
      "reward": 0.060546875,
      "reward_std": 0.10441835969686508,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.24230584502220154,
      "step": 1920
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.650390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1977.0,
      "completions/mean_length": 1570.7421875,
      "completions/mean_terminated_length": 682.8826904296875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6557992660237262,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 12.947854748541362,
      "learning_rate": 3.8830908054841673e-07,
      "loss": -0.0077,
      "num_tokens": 1843694481.0,
      "reward": 0.02734375,
      "reward_std": 0.05806133896112442,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 1921
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.724609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1707.0,
      "completions/mean_length": 1662.904296875,
      "completions/mean_terminated_length": 649.6383056640625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.656140650337117,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 8.282081646655259,
      "learning_rate": 3.878088903234431e-07,
      "loss": 0.0317,
      "num_tokens": 1844626656.0,
      "reward": 0.04296875,
      "reward_std": 0.09248073399066925,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 1922
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2012.0,
      "completions/mean_length": 1629.91015625,
      "completions/mean_terminated_length": 803.4534912109375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6564820346505078,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 20.364429226048358,
      "learning_rate": 3.873089302990844e-07,
      "loss": 0.005,
      "num_tokens": 1845553026.0,
      "reward": 0.025390625,
      "reward_std": 0.04957009106874466,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 1923
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.67578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1887.0,
      "completions/mean_length": 1578.521484375,
      "completions/mean_terminated_length": 599.9698486328125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6568234189638986,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 114.6146550085507,
      "learning_rate": 3.868092011849425e-07,
      "loss": 0.0113,
      "num_tokens": 1846431917.0,
      "reward": 0.041015625,
      "reward_std": 0.0814797505736351,
      "rewards/accuracy_reward/mean": 0.04233871027827263,
      "rewards/accuracy_reward/std": 0.2015640139579773,
      "step": 1924
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.626953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2042.0,
      "completions/mean_length": 1538.7109375,
      "completions/mean_terminated_length": 682.7853393554688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6571648032772894,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 44.44046880427549,
      "learning_rate": 3.8630970369029146e-07,
      "loss": 0.0267,
      "num_tokens": 1847295129.0,
      "reward": 0.0625,
      "reward_std": 0.09959384053945541,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 1925
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1927.0,
      "completions/mean_length": 1677.900390625,
      "completions/mean_terminated_length": 654.683837890625,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.6575061875906802,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 129.48153553409983,
      "learning_rate": 3.858104385240768e-07,
      "loss": 0.0164,
      "num_tokens": 1848230118.0,
      "reward": 0.029296875,
      "reward_std": 0.08654290437698364,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 1926
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1573.306640625,
      "completions/mean_terminated_length": 618.3353271484375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6578475719040711,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 16.877465650830253,
      "learning_rate": 3.85311406394914e-07,
      "loss": 0.0014,
      "num_tokens": 1849109955.0,
      "reward": 0.078125,
      "reward_std": 0.0921671986579895,
      "rewards/accuracy_reward/mean": 0.078125,
      "rewards/accuracy_reward/std": 0.26863065361976624,
      "step": 1927
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.67578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2008.0,
      "completions/mean_length": 1601.75390625,
      "completions/mean_terminated_length": 671.62646484375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6581889562174618,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 26.7497106550931,
      "learning_rate": 3.848126080110878e-07,
      "loss": 0.0149,
      "num_tokens": 1850018917.0,
      "reward": 0.0546875,
      "reward_std": 0.10381289571523666,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 1928
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.72265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1948.0,
      "completions/mean_length": 1643.60546875,
      "completions/mean_terminated_length": 589.9013671875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.6585303405308526,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 17.43784689102482,
      "learning_rate": 3.8431404408055133e-07,
      "loss": 0.0153,
      "num_tokens": 1850943947.0,
      "reward": 0.048828125,
      "reward_std": 0.08395528793334961,
      "rewards/accuracy_reward/mean": 0.05040322616696358,
      "rewards/accuracy_reward/std": 0.21899640560150146,
      "step": 1929
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.759765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1841.0,
      "completions/mean_length": 1705.958984375,
      "completions/mean_terminated_length": 624.219482421875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6588717248442434,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 13.649750323093194,
      "learning_rate": 3.8381571531092496e-07,
      "loss": 0.0111,
      "num_tokens": 1851896070.0,
      "reward": 0.029296875,
      "reward_std": 0.059984706342220306,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 1930
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.681640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1974.0,
      "completions/mean_length": 1604.974609375,
      "completions/mean_terminated_length": 656.4110107421875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.6592131091576342,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 12.276748754782338,
      "learning_rate": 3.8331762240949503e-07,
      "loss": 0.0119,
      "num_tokens": 1852797305.0,
      "reward": 0.01953125,
      "reward_std": 0.04907120764255524,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 1931
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1817.0,
      "completions/mean_length": 1666.044921875,
      "completions/mean_terminated_length": 689.9375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.659554493471025,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 38.474988449109986,
      "learning_rate": 3.8281976608321366e-07,
      "loss": 0.0273,
      "num_tokens": 1853723456.0,
      "reward": 0.07421875,
      "reward_std": 0.11305920034646988,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "step": 1932
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.70703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2018.0,
      "completions/mean_length": 1644.75,
      "completions/mean_terminated_length": 671.5733642578125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6598958777844158,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 29.958146312958622,
      "learning_rate": 3.823221470386965e-07,
      "loss": 0.0123,
      "num_tokens": 1854641200.0,
      "reward": 0.021484375,
      "reward_std": 0.05193261057138443,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 1933
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.693359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1720.0,
      "completions/mean_length": 1613.302734375,
      "completions/mean_terminated_length": 630.3885498046875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6602372620978066,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 134.38455745394924,
      "learning_rate": 3.81824765982223e-07,
      "loss": 0.0131,
      "num_tokens": 1855542315.0,
      "reward": 0.0625,
      "reward_std": 0.11360110342502594,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 1934
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.77734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1831.0,
      "completions/mean_length": 1734.376953125,
      "completions/mean_terminated_length": 639.4473876953125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6605786464111975,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 12.146875587259746,
      "learning_rate": 3.8132762361973456e-07,
      "loss": -0.001,
      "num_tokens": 1856506956.0,
      "reward": 0.021484375,
      "reward_std": 0.0546875,
      "rewards/accuracy_reward/mean": 0.02217741869390011,
      "rewards/accuracy_reward/std": 0.14740893244743347,
      "step": 1935
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1964.0,
      "completions/mean_length": 1624.94140625,
      "completions/mean_terminated_length": 677.0759887695312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6609200307245882,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 20.820176880599945,
      "learning_rate": 3.8083072065683373e-07,
      "loss": 0.031,
      "num_tokens": 1857415870.0,
      "reward": 0.068359375,
      "reward_std": 0.12542080879211426,
      "rewards/accuracy_reward/mean": 0.068359375,
      "rewards/accuracy_reward/std": 0.25260838866233826,
      "step": 1936
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1702.0,
      "completions/mean_length": 1646.38671875,
      "completions/mean_terminated_length": 639.6027221679688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.661261415037979,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 1.842423646244592,
      "learning_rate": 3.803340577987834e-07,
      "loss": 0.0034,
      "num_tokens": 1858335076.0,
      "reward": 0.041015625,
      "reward_std": 0.09291848540306091,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 1937
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.70703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2023.0,
      "completions/mean_length": 1596.2734375,
      "completions/mean_terminated_length": 506.106689453125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6616027993513698,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 129.0727114858703,
      "learning_rate": 3.7983763575050575e-07,
      "loss": -0.0052,
      "num_tokens": 1859236000.0,
      "reward": 0.06640625,
      "reward_std": 0.11382715404033661,
      "rewards/accuracy_reward/mean": 0.06854838877916336,
      "rewards/accuracy_reward/std": 0.25293973088264465,
      "step": 1938
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2022.0,
      "completions/mean_length": 1600.294921875,
      "completions/mean_terminated_length": 650.2865600585938,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6619441836647606,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 15.79687851534851,
      "learning_rate": 3.79341455216581e-07,
      "loss": 0.0119,
      "num_tokens": 1860129351.0,
      "reward": 0.04296875,
      "reward_std": 0.0801548883318901,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 1939
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1844.0,
      "completions/mean_length": 1594.65625,
      "completions/mean_terminated_length": 540.7792358398438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6622855679781514,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 21.302880074106984,
      "learning_rate": 3.788455169012469e-07,
      "loss": 0.008,
      "num_tokens": 1861015591.0,
      "reward": 0.03515625,
      "reward_std": 0.06888246536254883,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 1940
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.67578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 1591.091796875,
      "completions/mean_terminated_length": 638.740966796875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6626269522915422,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 27.62542811601489,
      "learning_rate": 3.783498215083967e-07,
      "loss": 0.0229,
      "num_tokens": 1861903622.0,
      "reward": 0.0625,
      "reward_std": 0.11179865896701813,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 1941
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1996.0,
      "completions/mean_length": 1573.845703125,
      "completions/mean_terminated_length": 728.6141357421875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.662968336604933,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 113.42485994310648,
      "learning_rate": 3.778543697415797e-07,
      "loss": 0.0354,
      "num_tokens": 1862778471.0,
      "reward": 0.10546875,
      "reward_std": 0.13864420354366302,
      "rewards/accuracy_reward/mean": 0.10546875,
      "rewards/accuracy_reward/std": 0.3074568510055542,
      "step": 1942
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.708984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1670.068359375,
      "completions/mean_terminated_length": 749.671142578125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6633097209183239,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 25.13782941878759,
      "learning_rate": 3.77359162303999e-07,
      "loss": 0.0042,
      "num_tokens": 1863716378.0,
      "reward": 0.068359375,
      "reward_std": 0.09731185436248779,
      "rewards/accuracy_reward/mean": 0.068359375,
      "rewards/accuracy_reward/std": 0.25260838866233826,
      "step": 1943
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.68359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1824.0,
      "completions/mean_length": 1615.54296875,
      "completions/mean_terminated_length": 681.2222290039062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6636511052317146,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 42.91499210315436,
      "learning_rate": 3.7686419989851104e-07,
      "loss": 0.0292,
      "num_tokens": 1864613968.0,
      "reward": 0.095703125,
      "reward_std": 0.13276061415672302,
      "rewards/accuracy_reward/mean": 0.095703125,
      "rewards/accuracy_reward/std": 0.2944713830947876,
      "step": 1944
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.716796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1662.95703125,
      "completions/mean_terminated_length": 688.4000244140625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6639924895451054,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 16.606609122648123,
      "learning_rate": 3.763694832276242e-07,
      "loss": -0.0075,
      "num_tokens": 1865541178.0,
      "reward": 0.03515625,
      "reward_std": 0.05644455552101135,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 1945
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.720703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1919.0,
      "completions/mean_length": 1673.373046875,
      "completions/mean_terminated_length": 706.6782836914062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6643338738584962,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 10.341940137915266,
      "learning_rate": 3.758750129934988e-07,
      "loss": 0.0226,
      "num_tokens": 1866479097.0,
      "reward": 0.0234375,
      "reward_std": 0.05880707502365112,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 1946
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1990.0,
      "completions/mean_length": 1722.248046875,
      "completions/mean_terminated_length": 610.6034545898438,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.664675258171887,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 75.71014409972499,
      "learning_rate": 3.753807898979442e-07,
      "loss": -0.0091,
      "num_tokens": 1867441000.0,
      "reward": 0.03125,
      "reward_std": 0.06711846590042114,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 1947
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.67578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1999.0,
      "completions/mean_length": 1612.03515625,
      "completions/mean_terminated_length": 703.3373413085938,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6650166424852778,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 91.87204973098542,
      "learning_rate": 3.748868146424201e-07,
      "loss": -0.0058,
      "num_tokens": 1868343882.0,
      "reward": 0.060546875,
      "reward_std": 0.1333048790693283,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 1948
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.73046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1948.0,
      "completions/mean_length": 1678.384765625,
      "completions/mean_terminated_length": 676.6739501953125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6653580267986686,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 46.45154082282707,
      "learning_rate": 3.7439308792803405e-07,
      "loss": 0.0092,
      "num_tokens": 1869281375.0,
      "reward": 0.099609375,
      "reward_std": 0.13248127698898315,
      "rewards/accuracy_reward/mean": 0.099609375,
      "rewards/accuracy_reward/std": 0.29977133870124817,
      "step": 1949
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.595703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 1494.88671875,
      "completions/mean_terminated_length": 679.9130249023438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6656994111120594,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 48.06504360074654,
      "learning_rate": 3.738996104555406e-07,
      "loss": 0.0355,
      "num_tokens": 1870129173.0,
      "reward": 0.03515625,
      "reward_std": 0.08615703880786896,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 1950
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1940.0,
      "completions/mean_length": 1682.36328125,
      "completions/mean_terminated_length": 630.23486328125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6660407954254502,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 13.669517583765767,
      "learning_rate": 3.734063829253411e-07,
      "loss": 0.0034,
      "num_tokens": 1871066831.0,
      "reward": 0.029296875,
      "reward_std": 0.07328139245510101,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 1951
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.73828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2010.0,
      "completions/mean_length": 1677.826171875,
      "completions/mean_terminated_length": 633.962646484375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.666382179738841,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 58.895488730141004,
      "learning_rate": 3.7291340603748146e-07,
      "loss": 0.0088,
      "num_tokens": 1872003782.0,
      "reward": 0.08984375,
      "reward_std": 0.12793609499931335,
      "rewards/accuracy_reward/mean": 0.08984375,
      "rewards/accuracy_reward/std": 0.2862374484539032,
      "step": 1952
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.767578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1963.0,
      "completions/mean_length": 1717.095703125,
      "completions/mean_terminated_length": 624.27734375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6667235640522318,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 56.18263103415097,
      "learning_rate": 3.724206804916526e-07,
      "loss": 0.0106,
      "num_tokens": 1872961031.0,
      "reward": 0.06640625,
      "reward_std": 0.08627147972583771,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "step": 1953
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.685546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1940.0,
      "completions/mean_length": 1616.974609375,
      "completions/mean_terminated_length": 677.2857055664062,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.6670649483656226,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 16.378567912438644,
      "learning_rate": 3.7192820698718797e-07,
      "loss": 0.0085,
      "num_tokens": 1873875210.0,
      "reward": 0.021484375,
      "reward_std": 0.058760736137628555,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 1954
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.646484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1856.0,
      "completions/mean_length": 1535.353515625,
      "completions/mean_terminated_length": 597.8618774414062,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6674063326790134,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 25.0177318452696,
      "learning_rate": 3.7143598622306374e-07,
      "loss": 0.0066,
      "num_tokens": 1874739039.0,
      "reward": 0.044921875,
      "reward_std": 0.07803337275981903,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 1955
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.681640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1868.0,
      "completions/mean_length": 1590.146484375,
      "completions/mean_terminated_length": 609.8343505859375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6677477169924042,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 40.91140456636508,
      "learning_rate": 3.7094401889789715e-07,
      "loss": 0.0101,
      "num_tokens": 1875630522.0,
      "reward": 0.07421875,
      "reward_std": 0.09367191791534424,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "step": 1956
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1931.0,
      "completions/mean_length": 1636.171875,
      "completions/mean_terminated_length": 696.3590087890625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.668089101305795,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 52.58977840071835,
      "learning_rate": 3.70452305709946e-07,
      "loss": 0.0128,
      "num_tokens": 1876545618.0,
      "reward": 0.037109375,
      "reward_std": 0.07581022381782532,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 1957
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.716796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1944.0,
      "completions/mean_length": 1634.177734375,
      "completions/mean_terminated_length": 586.779296875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6684304856191858,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 27.987811161614058,
      "learning_rate": 3.699608473571072e-07,
      "loss": -0.0051,
      "num_tokens": 1877455101.0,
      "reward": 0.0859375,
      "reward_std": 0.09811057895421982,
      "rewards/accuracy_reward/mean": 0.0859375,
      "rewards/accuracy_reward/std": 0.28054583072662354,
      "step": 1958
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2011.0,
      "completions/mean_length": 1639.21484375,
      "completions/mean_terminated_length": 671.0394897460938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6687718699325766,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 15.322672873283942,
      "learning_rate": 3.694696445369161e-07,
      "loss": 0.0082,
      "num_tokens": 1878367179.0,
      "reward": 0.029296875,
      "reward_std": 0.06271953880786896,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 1959
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.771484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1996.0,
      "completions/mean_length": 1724.814453125,
      "completions/mean_terminated_length": 633.7179565429688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6691132542459673,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 52.968759319113815,
      "learning_rate": 3.68978697946545e-07,
      "loss": -0.0021,
      "num_tokens": 1879338364.0,
      "reward": 0.0390625,
      "reward_std": 0.10056467354297638,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 1960
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2020.0,
      "completions/mean_length": 1531.40234375,
      "completions/mean_terminated_length": 527.8965454101562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6694546385593582,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 20.93845553695336,
      "learning_rate": 3.6848800828280303e-07,
      "loss": 0.0184,
      "num_tokens": 1880201642.0,
      "reward": 0.017578125,
      "reward_std": 0.04125870764255524,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "step": 1961
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.779296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1986.0,
      "completions/mean_length": 1715.7109375,
      "completions/mean_terminated_length": 542.4071044921875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.669796022872749,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 62.756249492685946,
      "learning_rate": 3.67997576242134e-07,
      "loss": 0.0076,
      "num_tokens": 1881154006.0,
      "reward": 0.076171875,
      "reward_std": 0.12758280336856842,
      "rewards/accuracy_reward/mean": 0.0786290317773819,
      "rewards/accuracy_reward/std": 0.26943063735961914,
      "step": 1962
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1891.0,
      "completions/mean_length": 1635.578125,
      "completions/mean_terminated_length": 601.6986083984375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6701374071861398,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 38.742672223165776,
      "learning_rate": 3.67507402520617e-07,
      "loss": -0.003,
      "num_tokens": 1882066158.0,
      "reward": 0.05078125,
      "reward_std": 0.08746850490570068,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 1963
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1976.0,
      "completions/mean_length": 1483.611328125,
      "completions/mean_terminated_length": 631.4951171875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6704787914995306,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 30.07118084137147,
      "learning_rate": 3.6701748781396367e-07,
      "loss": 0.0196,
      "num_tokens": 1882903863.0,
      "reward": 0.0390625,
      "reward_std": 0.07477159798145294,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 1964
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.712890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2002.0,
      "completions/mean_length": 1649.021484375,
      "completions/mean_terminated_length": 658.3605346679688,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.6708201758129214,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 20.828769637008634,
      "learning_rate": 3.6652783281751873e-07,
      "loss": 0.004,
      "num_tokens": 1883833170.0,
      "reward": 0.064453125,
      "reward_std": 0.08736887574195862,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 1965
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2024.0,
      "completions/mean_length": 1582.546875,
      "completions/mean_terminated_length": 709.1685180664062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6711615601263122,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 15.844985662898475,
      "learning_rate": 3.6603843822625734e-07,
      "loss": 0.0017,
      "num_tokens": 1884714458.0,
      "reward": 0.03125,
      "reward_std": 0.07147008180618286,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 1966
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.662109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2007.0,
      "completions/mean_length": 1587.951171875,
      "completions/mean_terminated_length": 686.4682006835938,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.671502944439703,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 29.01153872742641,
      "learning_rate": 3.6554930473478595e-07,
      "loss": 0.0079,
      "num_tokens": 1885606705.0,
      "reward": 0.076171875,
      "reward_std": 0.10592307895421982,
      "rewards/accuracy_reward/mean": 0.076171875,
      "rewards/accuracy_reward/std": 0.26553234457969666,
      "step": 1967
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.705078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1915.0,
      "completions/mean_length": 1640.8828125,
      "completions/mean_terminated_length": 667.576171875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6718443287530937,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 146.75221062076855,
      "learning_rate": 3.650604330373398e-07,
      "loss": 0.0227,
      "num_tokens": 1886521445.0,
      "reward": 0.080078125,
      "reward_std": 0.10586882382631302,
      "rewards/accuracy_reward/mean": 0.080078125,
      "rewards/accuracy_reward/std": 0.271679550409317,
      "step": 1968
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.630859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1888.0,
      "completions/mean_length": 1540.52734375,
      "completions/mean_terminated_length": 673.2592163085938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6721857130664846,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 55.559349819124726,
      "learning_rate": 3.6457182382778315e-07,
      "loss": 0.0091,
      "num_tokens": 1887388739.0,
      "reward": 0.04296875,
      "reward_std": 0.08966471254825592,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 1969
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.673828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2022.0,
      "completions/mean_length": 1617.51953125,
      "completions/mean_terminated_length": 728.20361328125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6725270973798754,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 129.9211284555465,
      "learning_rate": 3.6408347779960734e-07,
      "loss": -0.0075,
      "num_tokens": 1888302141.0,
      "reward": 0.05859375,
      "reward_std": 0.12026290595531464,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 1970
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.599609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1949.0,
      "completions/mean_length": 1481.630859375,
      "completions/mean_terminated_length": 633.4585571289062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6728684816932662,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 15.519506172147983,
      "learning_rate": 3.6359539564593036e-07,
      "loss": 0.0032,
      "num_tokens": 1889130528.0,
      "reward": 0.0234375,
      "reward_std": 0.06068410724401474,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 1971
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1535.412109375,
      "completions/mean_terminated_length": 681.0989990234375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.673209866006657,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 14.521777875029994,
      "learning_rate": 3.631075780594952e-07,
      "loss": 0.0346,
      "num_tokens": 1889998867.0,
      "reward": 0.08203125,
      "reward_std": 0.14277496933937073,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 1972
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.669921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1867.0,
      "completions/mean_length": 1576.8359375,
      "completions/mean_terminated_length": 620.5680541992188,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6735512503200478,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 15.804738422435477,
      "learning_rate": 3.626200257326697e-07,
      "loss": 0.0366,
      "num_tokens": 1890885487.0,
      "reward": 0.07421875,
      "reward_std": 0.1401873528957367,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "step": 1973
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1911.0,
      "completions/mean_length": 1619.83984375,
      "completions/mean_terminated_length": 773.4767456054688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6738926346334386,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 27.112510943008317,
      "learning_rate": 3.621327393574458e-07,
      "loss": -0.0022,
      "num_tokens": 1891801165.0,
      "reward": 0.025390625,
      "reward_std": 0.031097229570150375,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 1974
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.72265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1662.388671875,
      "completions/mean_terminated_length": 657.6267700195312,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6742340189468294,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 4.470861371573744,
      "learning_rate": 3.616457196254367e-07,
      "loss": 0.0007,
      "num_tokens": 1892735172.0,
      "reward": 0.046875,
      "reward_std": 0.042583562433719635,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 1975
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2003.0,
      "completions/mean_length": 1582.046875,
      "completions/mean_terminated_length": 628.2678833007812,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6745754032602201,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 22.935143982029984,
      "learning_rate": 3.6115896722787833e-07,
      "loss": 0.0102,
      "num_tokens": 1893629500.0,
      "reward": 0.04296875,
      "reward_std": 0.07944431155920029,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 1976
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.634765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1994.0,
      "completions/mean_length": 1529.396484375,
      "completions/mean_terminated_length": 628.0802001953125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.674916787573611,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 64.98743825334853,
      "learning_rate": 3.606724828556265e-07,
      "loss": 0.0063,
      "num_tokens": 1894489015.0,
      "reward": 0.06640625,
      "reward_std": 0.09739763289690018,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "step": 1977
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1963.0,
      "completions/mean_length": 1602.3828125,
      "completions/mean_terminated_length": 721.5115966796875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.6752581718870018,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 55.22697989556693,
      "learning_rate": 3.6018626719915646e-07,
      "loss": 0.021,
      "num_tokens": 1895383963.0,
      "reward": 0.08984375,
      "reward_std": 0.12478742748498917,
      "rewards/accuracy_reward/mean": 0.08984375,
      "rewards/accuracy_reward/std": 0.2862374484539032,
      "step": 1978
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.62890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1926.0,
      "completions/mean_length": 1562.806640625,
      "completions/mean_terminated_length": 740.5316162109375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6755995562003926,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 29.070425650999866,
      "learning_rate": 3.5970032094856305e-07,
      "loss": 0.0118,
      "num_tokens": 1896253576.0,
      "reward": 0.05078125,
      "reward_std": 0.0999661535024643,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 1979
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.64453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1926.0,
      "completions/mean_length": 1531.51953125,
      "completions/mean_terminated_length": 595.0439453125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6759409405137834,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 3.2688991150374105,
      "learning_rate": 3.5921464479355744e-07,
      "loss": 0.0079,
      "num_tokens": 1897112402.0,
      "reward": 0.041015625,
      "reward_std": 0.07654774188995361,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 1980
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.72265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 1678.470703125,
      "completions/mean_terminated_length": 716.0211181640625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.6762823248271742,
      "frac_reward_zero_std": 0.53125,
      "grad_norm": 64.57568753126111,
      "learning_rate": 3.5872923942346875e-07,
      "loss": 0.0121,
      "num_tokens": 1898062099.0,
      "reward": 0.099609375,
      "reward_std": 0.17435188591480255,
      "rewards/accuracy_reward/mean": 0.099609375,
      "rewards/accuracy_reward/std": 0.29977133870124817,
      "step": 1981
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.697265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1901.0,
      "completions/mean_length": 1627.890625,
      "completions/mean_terminated_length": 660.2838745117188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.676623709140565,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 20.200998787146453,
      "learning_rate": 3.582441055272406e-07,
      "loss": 0.0165,
      "num_tokens": 1898968203.0,
      "reward": 0.103515625,
      "reward_std": 0.14486464858055115,
      "rewards/accuracy_reward/mean": 0.103515625,
      "rewards/accuracy_reward/std": 0.30492907762527466,
      "step": 1982
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.619140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1950.0,
      "completions/mean_length": 1532.29296875,
      "completions/mean_terminated_length": 693.9384765625,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.6769650934539558,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 13.775472228414001,
      "learning_rate": 3.577592437934321e-07,
      "loss": -0.0007,
      "num_tokens": 1899837425.0,
      "reward": 0.044921875,
      "reward_std": 0.05913964658975601,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 1983
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.642578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2018.0,
      "completions/mean_length": 1550.4765625,
      "completions/mean_terminated_length": 656.0218505859375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6773064777673465,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 10.017215052974752,
      "learning_rate": 3.572746549102159e-07,
      "loss": 0.0143,
      "num_tokens": 1900705989.0,
      "reward": 0.03515625,
      "reward_std": 0.0893455371260643,
      "rewards/accuracy_reward/mean": 0.03629032149910927,
      "rewards/accuracy_reward/std": 0.1872003972530365,
      "step": 1984
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1498.275390625,
      "completions/mean_terminated_length": 668.299072265625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6776478620807374,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 51.00286265154916,
      "learning_rate": 3.567903395653767e-07,
      "loss": 0.0157,
      "num_tokens": 1901548962.0,
      "reward": 0.083984375,
      "reward_std": 0.15112704038619995,
      "rewards/accuracy_reward/mean": 0.083984375,
      "rewards/accuracy_reward/std": 0.2776356339454651,
      "step": 1985
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.67578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1970.0,
      "completions/mean_length": 1620.279296875,
      "completions/mean_terminated_length": 729.0963745117188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6779892463941282,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 13.174357043836892,
      "learning_rate": 3.563062984463123e-07,
      "loss": 0.0074,
      "num_tokens": 1902459681.0,
      "reward": 0.05078125,
      "reward_std": 0.06794346868991852,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 1986
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.654296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1771.0,
      "completions/mean_length": 1522.228515625,
      "completions/mean_terminated_length": 527.124267578125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.678330630707519,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 2.9424894820761374,
      "learning_rate": 3.5582253224003e-07,
      "loss": 0.0014,
      "num_tokens": 1903320342.0,
      "reward": 0.0390625,
      "reward_std": 0.05864075943827629,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 1987
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.716796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2015.0,
      "completions/mean_length": 1648.923828125,
      "completions/mean_terminated_length": 638.8482666015625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6786720150209098,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 17.174071424605156,
      "learning_rate": 3.553390416331478e-07,
      "loss": 0.0098,
      "num_tokens": 1904241327.0,
      "reward": 0.056640625,
      "reward_std": 0.03889618441462517,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 1988
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.63671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1998.0,
      "completions/mean_length": 1570.732421875,
      "completions/mean_terminated_length": 734.231201171875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.6790133993343006,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 24.953005660323637,
      "learning_rate": 3.5485582731189176e-07,
      "loss": 0.017,
      "num_tokens": 1905123446.0,
      "reward": 0.109375,
      "reward_std": 0.1497160941362381,
      "rewards/accuracy_reward/mean": 0.109375,
      "rewards/accuracy_reward/std": 0.31241437792778015,
      "step": 1989
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.662109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1942.0,
      "completions/mean_length": 1568.1015625,
      "completions/mean_terminated_length": 627.7225341796875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6793547836476914,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 30.543565226885953,
      "learning_rate": 3.5437288996209704e-07,
      "loss": 0.0324,
      "num_tokens": 1906009642.0,
      "reward": 0.064453125,
      "reward_std": 0.11309494823217392,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 1990
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1963.0,
      "completions/mean_length": 1604.623046875,
      "completions/mean_terminated_length": 712.6529541015625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.6796961679610822,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 2.3511192851781164,
      "learning_rate": 3.5389023026920384e-07,
      "loss": 0.0102,
      "num_tokens": 1906911961.0,
      "reward": 0.015625,
      "reward_std": 0.040274329483509064,
      "rewards/accuracy_reward/mean": 0.015625,
      "rewards/accuracy_reward/std": 0.12414088100194931,
      "step": 1991
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.658203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2033.0,
      "completions/mean_length": 1588.296875,
      "completions/mean_terminated_length": 703.0399780273438,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.6800375522744729,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 34.97873395477791,
      "learning_rate": 3.534078489182598e-07,
      "loss": 0.0227,
      "num_tokens": 1907795793.0,
      "reward": 0.0546875,
      "reward_std": 0.08698301017284393,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 1992
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.666015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1821.0,
      "completions/mean_length": 1587.224609375,
      "completions/mean_terminated_length": 668.368408203125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.6803789365878637,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 37.86623879337408,
      "learning_rate": 3.5292574659391716e-07,
      "loss": 0.0416,
      "num_tokens": 1908688276.0,
      "reward": 0.09375,
      "reward_std": 0.1290597766637802,
      "rewards/accuracy_reward/mean": 0.09375,
      "rewards/accuracy_reward/std": 0.29176566004753113,
      "step": 1993
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1908.0,
      "completions/mean_length": 1562.111328125,
      "completions/mean_terminated_length": 665.9166870117188,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6807203209012546,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 49.96421973042858,
      "learning_rate": 3.524439239804318e-07,
      "loss": 0.0169,
      "num_tokens": 1909570397.0,
      "reward": 0.07421875,
      "reward_std": 0.10958899557590485,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "step": 1994
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.62890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1984.0,
      "completions/mean_length": 1536.361328125,
      "completions/mean_terminated_length": 669.2684326171875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6810617052146454,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 6.824549636955396,
      "learning_rate": 3.519623817616629e-07,
      "loss": 0.0186,
      "num_tokens": 1910431174.0,
      "reward": 0.037109375,
      "reward_std": 0.07014618813991547,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 1995
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.611328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1977.0,
      "completions/mean_length": 1531.34375,
      "completions/mean_terminated_length": 718.7135620117188,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6814030895280362,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 14.873700612979453,
      "learning_rate": 3.514811206210716e-07,
      "loss": 0.006,
      "num_tokens": 1911296166.0,
      "reward": 0.1015625,
      "reward_std": 0.13380376994609833,
      "rewards/accuracy_reward/mean": 0.1015625,
      "rewards/accuracy_reward/std": 0.30236753821372986,
      "step": 1996
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.615234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1926.0,
      "completions/mean_length": 1512.935546875,
      "completions/mean_terminated_length": 657.3756103515625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.681744473841427,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 27.926973330135663,
      "learning_rate": 3.5100014124171995e-07,
      "loss": 0.0165,
      "num_tokens": 1912146453.0,
      "reward": 0.08203125,
      "reward_std": 0.11630409955978394,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 1997
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.615234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1978.0,
      "completions/mean_length": 1537.765625,
      "completions/mean_terminated_length": 721.9086303710938,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6820858581548178,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 31.5191943322163,
      "learning_rate": 3.505194443062701e-07,
      "loss": 0.0202,
      "num_tokens": 1913002589.0,
      "reward": 0.0859375,
      "reward_std": 0.10394641757011414,
      "rewards/accuracy_reward/mean": 0.0859375,
      "rewards/accuracy_reward/std": 0.28054583072662354,
      "step": 1998
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.650390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1994.0,
      "completions/mean_length": 1558.443359375,
      "completions/mean_terminated_length": 647.7039184570312,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6824272424682086,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 34.526846391398045,
      "learning_rate": 3.5003903049698356e-07,
      "loss": -0.0004,
      "num_tokens": 1913880752.0,
      "reward": 0.078125,
      "reward_std": 0.12146230041980743,
      "rewards/accuracy_reward/mean": 0.078125,
      "rewards/accuracy_reward/std": 0.26863065361976624,
      "step": 1999
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.630859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2020.0,
      "completions/mean_length": 1549.30078125,
      "completions/mean_terminated_length": 697.3174438476562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6827686267815993,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 29.881643525349762,
      "learning_rate": 3.4955890049572e-07,
      "loss": 0.0055,
      "num_tokens": 1914757514.0,
      "reward": 0.05078125,
      "reward_std": 0.09186187386512756,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 2000
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.677734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2009.0,
      "completions/mean_length": 1586.501953125,
      "completions/mean_terminated_length": 615.9575805664062,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6831100110949901,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 24.18606316733675,
      "learning_rate": 3.490790549839359e-07,
      "loss": 0.0248,
      "num_tokens": 1915653979.0,
      "reward": 0.052734375,
      "reward_std": 0.09820909798145294,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 2001
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1800.0,
      "completions/mean_length": 1625.578125,
      "completions/mean_terminated_length": 625.1052856445312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.683451395408381,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 6.494348631377462,
      "learning_rate": 3.4859949464268456e-07,
      "loss": 0.028,
      "num_tokens": 1916566595.0,
      "reward": 0.04296875,
      "reward_std": 0.08026791363954544,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 2002
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.603515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2031.0,
      "completions/mean_length": 1519.470703125,
      "completions/mean_terminated_length": 714.9605712890625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6837927797217718,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 89.97144049459428,
      "learning_rate": 3.481202201526136e-07,
      "loss": -0.0001,
      "num_tokens": 1917424100.0,
      "reward": 0.107421875,
      "reward_std": 0.15409593284130096,
      "rewards/accuracy_reward/mean": 0.107421875,
      "rewards/accuracy_reward/std": 0.30995169281959534,
      "step": 2003
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.658203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1996.0,
      "completions/mean_length": 1587.55859375,
      "completions/mean_terminated_length": 700.8800048828125,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.6841341640351626,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 48.80334493609286,
      "learning_rate": 3.4764123219396613e-07,
      "loss": -0.0004,
      "num_tokens": 1918316898.0,
      "reward": 0.033203125,
      "reward_std": 0.06041031330823898,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 2004
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.728515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1864.0,
      "completions/mean_length": 1670.064453125,
      "completions/mean_terminated_length": 656.3093872070312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6844755483485534,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 32.940967126170634,
      "learning_rate": 3.471625314465773e-07,
      "loss": 0.0161,
      "num_tokens": 1919263267.0,
      "reward": 0.05859375,
      "reward_std": 0.0801548957824707,
      "rewards/accuracy_reward/mean": 0.060483869165182114,
      "rewards/accuracy_reward/std": 0.2386218160390854,
      "step": 2005
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.677734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1596.724609375,
      "completions/mean_terminated_length": 647.6787719726562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6848169326619442,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 48.946077928328464,
      "learning_rate": 3.466841185898756e-07,
      "loss": -0.0058,
      "num_tokens": 1920157862.0,
      "reward": 0.08203125,
      "reward_std": 0.13401183485984802,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 2006
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2023.0,
      "completions/mean_length": 1599.759765625,
      "completions/mean_terminated_length": 681.9345092773438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.685158316975335,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 29.165821932788443,
      "learning_rate": 3.4620599430288077e-07,
      "loss": 0.0291,
      "num_tokens": 1921060331.0,
      "reward": 0.056640625,
      "reward_std": 0.11093294620513916,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 2007
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.591796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2023.0,
      "completions/mean_length": 1464.46875,
      "completions/mean_terminated_length": 618.488037109375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6854997012887258,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 17.291498444943254,
      "learning_rate": 3.457281592642026e-07,
      "loss": 0.0237,
      "num_tokens": 1921886155.0,
      "reward": 0.0546875,
      "reward_std": 0.11206455528736115,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 2008
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1887.0,
      "completions/mean_length": 1624.48046875,
      "completions/mean_terminated_length": 621.76318359375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.6858410856021165,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 7.507376339519271,
      "learning_rate": 3.452506141520406e-07,
      "loss": 0.0118,
      "num_tokens": 1922802657.0,
      "reward": 0.025390625,
      "reward_std": 0.06755761802196503,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 2009
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1577.201171875,
      "completions/mean_terminated_length": 678.4034423828125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6861824699155074,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 33.88163295969299,
      "learning_rate": 3.4477335964418237e-07,
      "loss": 0.0148,
      "num_tokens": 1923690936.0,
      "reward": 0.064453125,
      "reward_std": 0.1340966522693634,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 2010
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2015.0,
      "completions/mean_length": 1609.0859375,
      "completions/mean_terminated_length": 741.465087890625,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.6865238542288982,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 11.860884419984826,
      "learning_rate": 3.442963964180039e-07,
      "loss": 0.0159,
      "num_tokens": 1924595060.0,
      "reward": 0.05078125,
      "reward_std": 0.10354605317115784,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 2011
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.607421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1507.072265625,
      "completions/mean_terminated_length": 670.1144409179688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.686865238542289,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 15.888842520814444,
      "learning_rate": 3.4381972515046675e-07,
      "loss": 0.0236,
      "num_tokens": 1925439961.0,
      "reward": 0.0234375,
      "reward_std": 0.06068410724401474,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 2012
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1983.0,
      "completions/mean_length": 1661.232421875,
      "completions/mean_terminated_length": 710.3311157226562,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.6872066228556798,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 178.3392350435034,
      "learning_rate": 3.4334334651811895e-07,
      "loss": 0.012,
      "num_tokens": 1926376144.0,
      "reward": 0.046875,
      "reward_std": 0.10739278793334961,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 2013
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.658203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1911.0,
      "completions/mean_length": 1596.5546875,
      "completions/mean_terminated_length": 727.451416015625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6875480071690706,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 65.18044455919652,
      "learning_rate": 3.428672611970931e-07,
      "loss": 0.0073,
      "num_tokens": 1927272012.0,
      "reward": 0.072265625,
      "reward_std": 0.10530310869216919,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 2014
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.60546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2016.0,
      "completions/mean_length": 1495.142578125,
      "completions/mean_terminated_length": 646.697998046875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6878893914824614,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 17.67961907172786,
      "learning_rate": 3.423914698631044e-07,
      "loss": 0.0114,
      "num_tokens": 1928107317.0,
      "reward": 0.060546875,
      "reward_std": 0.10139618813991547,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.24230584502220154,
      "step": 2015
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.591796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1415.25,
      "completions/mean_terminated_length": 497.9138488769531,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6882307757958522,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 20.53518517947413,
      "learning_rate": 3.4191597319145246e-07,
      "loss": 0.0242,
      "num_tokens": 1928913653.0,
      "reward": 0.0546875,
      "reward_std": 0.1024416983127594,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 2016
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.623046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2033.0,
      "completions/mean_length": 1531.533203125,
      "completions/mean_terminated_length": 677.8911743164062,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.6885721601092429,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 31.31642597136823,
      "learning_rate": 3.414407718570172e-07,
      "loss": 0.0224,
      "num_tokens": 1929776550.0,
      "reward": 0.078125,
      "reward_std": 0.12185370922088623,
      "rewards/accuracy_reward/mean": 0.078125,
      "rewards/accuracy_reward/std": 0.26863065361976624,
      "step": 2017
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.580078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2031.0,
      "completions/mean_length": 1452.70703125,
      "completions/mean_terminated_length": 630.3720703125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.6889135444226338,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 83.63178004188556,
      "learning_rate": 3.4096586653426053e-07,
      "loss": 0.0256,
      "num_tokens": 1930594768.0,
      "reward": 0.0703125,
      "reward_std": 0.12042921781539917,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 2018
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1610.703125,
      "completions/mean_terminated_length": 630.9367065429688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6892549287360246,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 2.770057470535581,
      "learning_rate": 3.404912578972232e-07,
      "loss": 0.0109,
      "num_tokens": 1931506168.0,
      "reward": 0.056640625,
      "reward_std": 0.05629178136587143,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 2019
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2001.0,
      "completions/mean_length": 1529.65234375,
      "completions/mean_terminated_length": 636.3297729492188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6895963130494154,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 13.243987160739314,
      "learning_rate": 3.400169466195256e-07,
      "loss": -0.0135,
      "num_tokens": 1932357990.0,
      "reward": 0.041015625,
      "reward_std": 0.08279120922088623,
      "rewards/accuracy_reward/mean": 0.04233871027827263,
      "rewards/accuracy_reward/std": 0.2015640139579773,
      "step": 2020
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.587890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1987.0,
      "completions/mean_length": 1455.865234375,
      "completions/mean_terminated_length": 611.4265747070312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6899376973628062,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 46.3221544949549,
      "learning_rate": 3.395429333743663e-07,
      "loss": -0.0008,
      "num_tokens": 1933184945.0,
      "reward": 0.080078125,
      "reward_std": 0.13341790437698364,
      "rewards/accuracy_reward/mean": 0.080078125,
      "rewards/accuracy_reward/std": 0.271679550409317,
      "step": 2021
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.669921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1907.0,
      "completions/mean_length": 1583.625,
      "completions/mean_terminated_length": 641.1361083984375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.690279081676197,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 25.388885630312487,
      "learning_rate": 3.3906921883451957e-07,
      "loss": -0.0025,
      "num_tokens": 1934067089.0,
      "reward": 0.044921875,
      "reward_std": 0.080595001578331,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 2022
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.62890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2001.0,
      "completions/mean_length": 1545.099609375,
      "completions/mean_terminated_length": 692.8157958984375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6906204659895878,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 6.585441697110707,
      "learning_rate": 3.3859580367233695e-07,
      "loss": 0.0133,
      "num_tokens": 1934924996.0,
      "reward": 0.0390625,
      "reward_std": 0.07465953379869461,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 2023
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.576171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1437.66015625,
      "completions/mean_terminated_length": 607.9354858398438,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6909618503029786,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 149.44881492523203,
      "learning_rate": 3.3812268855974475e-07,
      "loss": 0.0265,
      "num_tokens": 1935739894.0,
      "reward": 0.0546875,
      "reward_std": 0.10767117142677307,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 2024
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1824.0,
      "completions/mean_length": 1563.283203125,
      "completions/mean_terminated_length": 588.1470947265625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.6913032346163693,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 40.85489343336783,
      "learning_rate": 3.376498741682433e-07,
      "loss": -0.0124,
      "num_tokens": 1936622983.0,
      "reward": 0.05078125,
      "reward_std": 0.09594620764255524,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 2025
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.673828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1962.0,
      "completions/mean_length": 1583.158203125,
      "completions/mean_terminated_length": 622.8563232421875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.6916446189297601,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 43.54316556914517,
      "learning_rate": 3.3717736116890585e-07,
      "loss": 0.0196,
      "num_tokens": 1937508840.0,
      "reward": 0.068359375,
      "reward_std": 0.12263333797454834,
      "rewards/accuracy_reward/mean": 0.068359375,
      "rewards/accuracy_reward/std": 0.25260838866233826,
      "step": 2026
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2042.0,
      "completions/mean_length": 1568.451171875,
      "completions/mean_terminated_length": 683.9500122070312,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.691986003243151,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 21.1005545821204,
      "learning_rate": 3.3670515023237866e-07,
      "loss": 0.0021,
      "num_tokens": 1938383375.0,
      "reward": 0.099609375,
      "reward_std": 0.12997153401374817,
      "rewards/accuracy_reward/mean": 0.099609375,
      "rewards/accuracy_reward/std": 0.29977133870124817,
      "step": 2027
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1962.0,
      "completions/mean_length": 1596.982421875,
      "completions/mean_terminated_length": 586.4746704101562,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6923273875565418,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 23.851763842090207,
      "learning_rate": 3.362332420288786e-07,
      "loss": 0.0077,
      "num_tokens": 1939287910.0,
      "reward": 0.04296875,
      "reward_std": 0.08263044059276581,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 2028
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1992.0,
      "completions/mean_length": 1582.94140625,
      "completions/mean_terminated_length": 679.9022827148438,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6926687718699326,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 83.74372184694154,
      "learning_rate": 3.3576163722819273e-07,
      "loss": 0.0214,
      "num_tokens": 1940188472.0,
      "reward": 0.080078125,
      "reward_std": 0.11623015999794006,
      "rewards/accuracy_reward/mean": 0.080078125,
      "rewards/accuracy_reward/std": 0.271679550409317,
      "step": 2029
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1554.3671875,
      "completions/mean_terminated_length": 611.977294921875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6930101561833234,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 44.28437423992409,
      "learning_rate": 3.3529033649967843e-07,
      "loss": 0.0419,
      "num_tokens": 1941059412.0,
      "reward": 0.09765625,
      "reward_std": 0.14422638714313507,
      "rewards/accuracy_reward/mean": 0.09765625,
      "rewards/accuracy_reward/std": 0.29713961482048035,
      "step": 2030
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.587890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1937.0,
      "completions/mean_length": 1461.978515625,
      "completions/mean_terminated_length": 625.9953002929688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6933515404967142,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 38.05531967898502,
      "learning_rate": 3.3481934051226024e-07,
      "loss": -0.0007,
      "num_tokens": 1941891193.0,
      "reward": 0.03515625,
      "reward_std": 0.08274346590042114,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 2031
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.59375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1823.0,
      "completions/mean_length": 1467.142578125,
      "completions/mean_terminated_length": 618.1971435546875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.693692924810105,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 11.89901083510781,
      "learning_rate": 3.3434864993443123e-07,
      "loss": 0.0379,
      "num_tokens": 1942707986.0,
      "reward": 0.0859375,
      "reward_std": 0.10056467354297638,
      "rewards/accuracy_reward/mean": 0.0859375,
      "rewards/accuracy_reward/std": 0.28054583072662354,
      "step": 2032
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.67578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2008.0,
      "completions/mean_length": 1607.7578125,
      "completions/mean_terminated_length": 690.4156494140625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6940343091234957,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 19.748402295301776,
      "learning_rate": 3.338782654342506e-07,
      "loss": 0.008,
      "num_tokens": 1943618646.0,
      "reward": 0.041015625,
      "reward_std": 0.08406735956668854,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 2033
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.599609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1997.0,
      "completions/mean_length": 1510.283203125,
      "completions/mean_terminated_length": 705.01953125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6943756934368865,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 21.367762265059742,
      "learning_rate": 3.334081876793427e-07,
      "loss": 0.0031,
      "num_tokens": 1944464039.0,
      "reward": 0.08203125,
      "reward_std": 0.08670367300510406,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 2034
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.630859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1974.0,
      "completions/mean_length": 1514.12109375,
      "completions/mean_terminated_length": 601.724853515625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6947170777502774,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 30.023164434329964,
      "learning_rate": 3.3293841733689745e-07,
      "loss": 0.0167,
      "num_tokens": 1945317813.0,
      "reward": 0.111328125,
      "reward_std": 0.15493440628051758,
      "rewards/accuracy_reward/mean": 0.111328125,
      "rewards/accuracy_reward/std": 0.31484565138816833,
      "step": 2035
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.638671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1956.0,
      "completions/mean_length": 1545.2421875,
      "completions/mean_terminated_length": 656.5838012695312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6950584620636682,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 20.21608847885543,
      "learning_rate": 3.324689550736674e-07,
      "loss": -0.0085,
      "num_tokens": 1946183697.0,
      "reward": 0.029296875,
      "reward_std": 0.06288585811853409,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 2036
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.634765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1997.0,
      "completions/mean_length": 1536.029296875,
      "completions/mean_terminated_length": 646.2406616210938,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.695399846377059,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 77.23223590948912,
      "learning_rate": 3.3199980155596895e-07,
      "loss": -0.0007,
      "num_tokens": 1947054112.0,
      "reward": 0.0390625,
      "reward_std": 0.060571081936359406,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 2037
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.603515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1947.0,
      "completions/mean_length": 1528.05078125,
      "completions/mean_terminated_length": 736.6009521484375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6957412306904498,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 25.969533425263116,
      "learning_rate": 3.315309574496792e-07,
      "loss": 0.006,
      "num_tokens": 1947912778.0,
      "reward": 0.060546875,
      "reward_std": 0.08537881821393967,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.24230584502220154,
      "step": 2038
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1624.8046875,
      "completions/mean_terminated_length": 676.9683837890625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6960826150038406,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 64.35573317486414,
      "learning_rate": 3.310624234202369e-07,
      "loss": 0.0179,
      "num_tokens": 1948825462.0,
      "reward": 0.07421875,
      "reward_std": 0.11824101209640503,
      "rewards/accuracy_reward/mean": 0.07661290466785431,
      "rewards/accuracy_reward/std": 0.2662447690963745,
      "step": 2039
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.712890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2000.0,
      "completions/mean_length": 1667.25390625,
      "completions/mean_terminated_length": 721.8639526367188,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.6964239993172314,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 122.23248740945724,
      "learning_rate": 3.305942001326404e-07,
      "loss": 0.0037,
      "num_tokens": 1949756584.0,
      "reward": 0.0546875,
      "reward_std": 0.08604401350021362,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 2040
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.634765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1938.0,
      "completions/mean_length": 1511.685546875,
      "completions/mean_terminated_length": 579.5882568359375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.6967653836306221,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 92.51768673437242,
      "learning_rate": 3.3012628825144685e-07,
      "loss": 0.0043,
      "num_tokens": 1950612055.0,
      "reward": 0.0546875,
      "reward_std": 0.11214477568864822,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 2041
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.701171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1958.0,
      "completions/mean_length": 1650.369140625,
      "completions/mean_terminated_length": 717.3660278320312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6971067679440129,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 32.92817516635831,
      "learning_rate": 3.296586884407717e-07,
      "loss": 0.0117,
      "num_tokens": 1951538308.0,
      "reward": 0.08203125,
      "reward_std": 0.08963288366794586,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 2042
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.689453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2006.0,
      "completions/mean_length": 1628.591796875,
      "completions/mean_terminated_length": 697.4528198242188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.6974481522574038,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 43.31814393751971,
      "learning_rate": 3.2919140136428727e-07,
      "loss": 0.0172,
      "num_tokens": 1952445811.0,
      "reward": 0.048828125,
      "reward_std": 0.05919293314218521,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 2043
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1914.0,
      "completions/mean_length": 1565.32421875,
      "completions/mean_terminated_length": 627.712646484375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6977895365707946,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 143.95312631651663,
      "learning_rate": 3.287244276852223e-07,
      "loss": 0.0188,
      "num_tokens": 1953321673.0,
      "reward": 0.03125,
      "reward_std": 0.07152433693408966,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 2044
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2021.0,
      "completions/mean_length": 1559.3046875,
      "completions/mean_terminated_length": 657.933349609375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6981309208841854,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 55.58020166957072,
      "learning_rate": 3.282577680663604e-07,
      "loss": 0.0021,
      "num_tokens": 1954201573.0,
      "reward": 0.068359375,
      "reward_std": 0.09020963311195374,
      "rewards/accuracy_reward/mean": 0.068359375,
      "rewards/accuracy_reward/std": 0.25260838866233826,
      "step": 2045
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.62890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1836.0,
      "completions/mean_length": 1533.001953125,
      "completions/mean_terminated_length": 660.2158203125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6984723051975762,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 43.689061727235156,
      "learning_rate": 3.277914231700393e-07,
      "loss": 0.0113,
      "num_tokens": 1955066854.0,
      "reward": 0.04296875,
      "reward_std": 0.06964729726314545,
      "rewards/accuracy_reward/mean": 0.04435483738780022,
      "rewards/accuracy_reward/std": 0.2060900777578354,
      "step": 2046
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2001.0,
      "completions/mean_length": 1533.478515625,
      "completions/mean_terminated_length": 675.9427490234375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.698813689510967,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 41.02430527797403,
      "learning_rate": 3.273253936581506e-07,
      "loss": 0.0009,
      "num_tokens": 1955925179.0,
      "reward": 0.0390625,
      "reward_std": 0.08422812819480896,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 2047
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.623046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1516.904296875,
      "completions/mean_terminated_length": 639.0828857421875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.6991550738243578,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 9.82404224891299,
      "learning_rate": 3.2685968019213784e-07,
      "loss": 0.0174,
      "num_tokens": 1956776074.0,
      "reward": 0.03515625,
      "reward_std": 0.07443207502365112,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 2048
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1983.0,
      "completions/mean_length": 1634.53515625,
      "completions/mean_terminated_length": 708.16455078125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6994964581377485,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 16.615725340369927,
      "learning_rate": 3.2639428343299623e-07,
      "loss": 0.004,
      "num_tokens": 1957687068.0,
      "reward": 0.015625,
      "reward_std": 0.03344620764255524,
      "rewards/accuracy_reward/mean": 0.015625,
      "rewards/accuracy_reward/std": 0.12414088100194931,
      "step": 2049
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.673828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2036.0,
      "completions/mean_length": 1597.525390625,
      "completions/mean_terminated_length": 666.9042358398438,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.6998378424511393,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 92.29997608715432,
      "learning_rate": 3.259292040412711e-07,
      "loss": 0.0364,
      "num_tokens": 1958581017.0,
      "reward": 0.115234375,
      "reward_std": 0.16516265273094177,
      "rewards/accuracy_reward/mean": 0.115234375,
      "rewards/accuracy_reward/std": 0.3196168541908264,
      "step": 2050
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.705078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2007.0,
      "completions/mean_length": 1634.677734375,
      "completions/mean_terminated_length": 646.5364379882812,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7001792267645301,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 41.52746029897798,
      "learning_rate": 3.2546444267705786e-07,
      "loss": -0.0008,
      "num_tokens": 1959491588.0,
      "reward": 0.03515625,
      "reward_std": 0.08214399218559265,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 2051
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.685546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1991.0,
      "completions/mean_length": 1603.701171875,
      "completions/mean_terminated_length": 635.3975219726562,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.700520611077921,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 32.7995522967481,
      "learning_rate": 3.250000000000001e-07,
      "loss": 0.0142,
      "num_tokens": 1960388971.0,
      "reward": 0.048828125,
      "reward_std": 0.08184495568275452,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 2052
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.685546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1946.0,
      "completions/mean_length": 1625.08984375,
      "completions/mean_terminated_length": 703.0932006835938,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7008619953913118,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 13.156427950796258,
      "learning_rate": 3.245358766692891e-07,
      "loss": 0.0152,
      "num_tokens": 1961297337.0,
      "reward": 0.060546875,
      "reward_std": 0.07069281488656998,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 2053
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.685546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1939.0,
      "completions/mean_length": 1620.775390625,
      "completions/mean_terminated_length": 689.3726806640625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7012033797047026,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 22.684303239790186,
      "learning_rate": 3.2407207334366347e-07,
      "loss": 0.0053,
      "num_tokens": 1962204070.0,
      "reward": 0.021484375,
      "reward_std": 0.054354868829250336,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 2054
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.63671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1993.0,
      "completions/mean_length": 1535.599609375,
      "completions/mean_terminated_length": 637.521484375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7015447640180934,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 16.571154698087064,
      "learning_rate": 3.2360859068140666e-07,
      "loss": 0.0289,
      "num_tokens": 1963069001.0,
      "reward": 0.048828125,
      "reward_std": 0.09363143146038055,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 2055
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1539.08203125,
      "completions/mean_terminated_length": 631.8804321289062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7018861483314842,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 35.260464535352945,
      "learning_rate": 3.2314542934034813e-07,
      "loss": 0.01,
      "num_tokens": 1963928771.0,
      "reward": 0.0703125,
      "reward_std": 0.09970590472221375,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 2056
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1954.0,
      "completions/mean_length": 1594.025390625,
      "completions/mean_terminated_length": 664.4583740234375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7022275326448749,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 12.448077060752237,
      "learning_rate": 3.2268258997786015e-07,
      "loss": 0.0045,
      "num_tokens": 1964820656.0,
      "reward": 0.02734375,
      "reward_std": 0.07284127175807953,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 2057
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.701171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1855.0,
      "completions/mean_length": 1622.609375,
      "completions/mean_terminated_length": 624.7777709960938,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7025689169582657,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 46.664486883601015,
      "learning_rate": 3.2222007325085885e-07,
      "loss": 0.0144,
      "num_tokens": 1965731752.0,
      "reward": 0.103515625,
      "reward_std": 0.1336519569158554,
      "rewards/accuracy_reward/mean": 0.10685484111309052,
      "rewards/accuracy_reward/std": 0.3092404901981354,
      "step": 2058
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.638671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1992.0,
      "completions/mean_length": 1548.703125,
      "completions/mean_terminated_length": 666.1621704101562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.7029103012716565,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 130.9995498795912,
      "learning_rate": 3.217578798158022e-07,
      "loss": 0.0167,
      "num_tokens": 1966597760.0,
      "reward": 0.10546875,
      "reward_std": 0.13027715682983398,
      "rewards/accuracy_reward/mean": 0.10546875,
      "rewards/accuracy_reward/std": 0.3074568510055542,
      "step": 2059
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1572.05859375,
      "completions/mean_terminated_length": 679.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7032516855850474,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 121.6637216076443,
      "learning_rate": 3.2129601032868884e-07,
      "loss": 0.0046,
      "num_tokens": 1967482206.0,
      "reward": 0.1015625,
      "reward_std": 0.11228963732719421,
      "rewards/accuracy_reward/mean": 0.1015625,
      "rewards/accuracy_reward/std": 0.30236753821372986,
      "step": 2060
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.767578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1748.810546875,
      "completions/mean_terminated_length": 761.2521362304688,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7035930698984382,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 45.72616905030464,
      "learning_rate": 3.2083446544505847e-07,
      "loss": 0.0007,
      "num_tokens": 1968467661.0,
      "reward": 0.07421875,
      "reward_std": 0.11989058554172516,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "step": 2061
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.70703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2019.0,
      "completions/mean_length": 1657.28515625,
      "completions/mean_terminated_length": 714.3600463867188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.703934454211829,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 123.5932995574792,
      "learning_rate": 3.203732458199893e-07,
      "loss": -0.0061,
      "num_tokens": 1969389807.0,
      "reward": 0.1015625,
      "reward_std": 0.10079116374254227,
      "rewards/accuracy_reward/mean": 0.1015625,
      "rewards/accuracy_reward/std": 0.30236753821372986,
      "step": 2062
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.693359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1946.0,
      "completions/mean_length": 1629.236328125,
      "completions/mean_terminated_length": 682.350341796875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7042758385252198,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 45.09570648828714,
      "learning_rate": 3.199123521080985e-07,
      "loss": 0.0048,
      "num_tokens": 1970304040.0,
      "reward": 0.091796875,
      "reward_std": 0.13095036149024963,
      "rewards/accuracy_reward/mean": 0.091796875,
      "rewards/accuracy_reward/std": 0.289021372795105,
      "step": 2063
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.64453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1542.720703125,
      "completions/mean_terminated_length": 626.8516845703125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7046172228386106,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 19.692904742401137,
      "learning_rate": 3.194517849635404e-07,
      "loss": 0.0114,
      "num_tokens": 1971179097.0,
      "reward": 0.046875,
      "reward_std": 0.08780898153781891,
      "rewards/accuracy_reward/mean": 0.04838709533214569,
      "rewards/accuracy_reward/std": 0.21479946374893188,
      "step": 2064
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2010.0,
      "completions/mean_length": 1546.52734375,
      "completions/mean_terminated_length": 605.5618286132812,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7049586071520013,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 3.801723405728734,
      "learning_rate": 3.1899154504000544e-07,
      "loss": 0.0257,
      "num_tokens": 1972046631.0,
      "reward": 0.064453125,
      "reward_std": 0.0971047431230545,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 2065
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.646484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2027.0,
      "completions/mean_length": 1573.609375,
      "completions/mean_terminated_length": 706.077392578125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.7052999914653921,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 40.66839924548039,
      "learning_rate": 3.185316329907204e-07,
      "loss": -0.0019,
      "num_tokens": 1972931407.0,
      "reward": 0.048828125,
      "reward_std": 0.07493096590042114,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 2066
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.662109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1541.583984375,
      "completions/mean_terminated_length": 549.2427368164062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7056413757787829,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 7.918111794638964,
      "learning_rate": 3.1807204946844613e-07,
      "loss": 0.0027,
      "num_tokens": 1973797530.0,
      "reward": 0.03515625,
      "reward_std": 0.05530741065740585,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 2067
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.693359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1965.0,
      "completions/mean_length": 1648.8359375,
      "completions/mean_terminated_length": 746.5732421875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7059827600921738,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 8.01388166767705,
      "learning_rate": 3.176127951254775e-07,
      "loss": 0.0405,
      "num_tokens": 1974723142.0,
      "reward": 0.064453125,
      "reward_std": 0.08879335969686508,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 2068
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.650390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1973.0,
      "completions/mean_length": 1560.298828125,
      "completions/mean_terminated_length": 653.0111694335938,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7063241444055646,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 11.955420253573317,
      "learning_rate": 3.1715387061364187e-07,
      "loss": 0.0168,
      "num_tokens": 1975603567.0,
      "reward": 0.033203125,
      "reward_std": 0.06570751965045929,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 2069
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.669921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1988.0,
      "completions/mean_length": 1580.921875,
      "completions/mean_terminated_length": 632.94677734375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7066655287189554,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 29.018306524252484,
      "learning_rate": 3.1669527658429914e-07,
      "loss": 0.0129,
      "num_tokens": 1976486311.0,
      "reward": 0.0703125,
      "reward_std": 0.13132822513580322,
      "rewards/accuracy_reward/mean": 0.07258064299821854,
      "rewards/accuracy_reward/std": 0.25970885157585144,
      "step": 2070
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.603515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2017.0,
      "completions/mean_length": 1482.923828125,
      "completions/mean_terminated_length": 622.7832641601562,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7070069130323462,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 33.92418337490979,
      "learning_rate": 3.162370136883389e-07,
      "loss": 0.0008,
      "num_tokens": 1977326848.0,
      "reward": 0.056640625,
      "reward_std": 0.11383409798145294,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 2071
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1961.0,
      "completions/mean_length": 1578.40625,
      "completions/mean_terminated_length": 650.1395263671875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.707348297345737,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 15.304359804523731,
      "learning_rate": 3.157790825761818e-07,
      "loss": -0.006,
      "num_tokens": 1978210096.0,
      "reward": 0.021484375,
      "reward_std": 0.06519509106874466,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 2072
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.67578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1693.0,
      "completions/mean_length": 1582.2734375,
      "completions/mean_terminated_length": 611.5421752929688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7076896816591277,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 21.017632317170996,
      "learning_rate": 3.1532148389777766e-07,
      "loss": 0.0082,
      "num_tokens": 1979090476.0,
      "reward": 0.08203125,
      "reward_std": 0.11835404485464096,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 2073
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1980.0,
      "completions/mean_length": 1604.4609375,
      "completions/mean_terminated_length": 663.2926635742188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7080310659725185,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 112.35664168327071,
      "learning_rate": 3.148642183026037e-07,
      "loss": 0.019,
      "num_tokens": 1979990728.0,
      "reward": 0.044921875,
      "reward_std": 0.0830162987112999,
      "rewards/accuracy_reward/mean": 0.0463709682226181,
      "rewards/accuracy_reward/std": 0.21049949526786804,
      "step": 2074
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.638671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1969.0,
      "completions/mean_length": 1541.19140625,
      "completions/mean_terminated_length": 645.3729858398438,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7083724502859093,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 36.25662558441624,
      "learning_rate": 3.14407286439665e-07,
      "loss": 0.0175,
      "num_tokens": 1980857546.0,
      "reward": 0.09765625,
      "reward_std": 0.14319565892219543,
      "rewards/accuracy_reward/mean": 0.09765625,
      "rewards/accuracy_reward/std": 0.29713961482048035,
      "step": 2075
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.62109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1908.0,
      "completions/mean_length": 1499.466796875,
      "completions/mean_terminated_length": 600.32470703125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7087138345993002,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 39.591695047263805,
      "learning_rate": 3.1395068895749275e-07,
      "loss": -0.0001,
      "num_tokens": 1981707641.0,
      "reward": 0.033203125,
      "reward_std": 0.06695909798145294,
      "rewards/accuracy_reward/mean": 0.03427419438958168,
      "rewards/accuracy_reward/std": 0.18211629986763,
      "step": 2076
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.68359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1880.0,
      "completions/mean_length": 1615.04296875,
      "completions/mean_terminated_length": 679.6419677734375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.709055218912691,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 10.45142760747069,
      "learning_rate": 3.134944265041436e-07,
      "loss": 0.0277,
      "num_tokens": 1982611551.0,
      "reward": 0.07421875,
      "reward_std": 0.09948080778121948,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "step": 2077
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.642578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2028.0,
      "completions/mean_length": 1547.646484375,
      "completions/mean_terminated_length": 648.1038208007812,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.7093966032260818,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 35.088840288631104,
      "learning_rate": 3.1303849972719834e-07,
      "loss": 0.0087,
      "num_tokens": 1983474970.0,
      "reward": 0.0625,
      "reward_std": 0.13896197080612183,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 2078
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2019.0,
      "completions/mean_length": 1632.234375,
      "completions/mean_terminated_length": 665.7142944335938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7097379875394726,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 94.36120064906882,
      "learning_rate": 3.1258290927376187e-07,
      "loss": -0.0052,
      "num_tokens": 1984390338.0,
      "reward": 0.060546875,
      "reward_std": 0.04726085811853409,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 2079
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.62890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1512.21484375,
      "completions/mean_terminated_length": 604.2000122070312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7100793718528634,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 28.74341934206451,
      "learning_rate": 3.121276557904616e-07,
      "loss": 0.007,
      "num_tokens": 1985241840.0,
      "reward": 0.02734375,
      "reward_std": 0.05402229726314545,
      "rewards/accuracy_reward/mean": 0.02822580561041832,
      "rewards/accuracy_reward/std": 0.1657845675945282,
      "step": 2080
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.638671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1786.0,
      "completions/mean_length": 1529.40625,
      "completions/mean_terminated_length": 612.7567749023438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7104207561662541,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 108.9552564520055,
      "learning_rate": 3.1167273992344646e-07,
      "loss": 0.0045,
      "num_tokens": 1986103360.0,
      "reward": 0.04296875,
      "reward_std": 0.10668125748634338,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 2081
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.705078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1601.0,
      "completions/mean_length": 1608.630859375,
      "completions/mean_terminated_length": 558.2185668945312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7107621404796449,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 62.19652651673116,
      "learning_rate": 3.112181623183866e-07,
      "loss": 0.0124,
      "num_tokens": 1987003587.0,
      "reward": 0.025390625,
      "reward_std": 0.04620979726314545,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 2082
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.572265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1883.0,
      "completions/mean_length": 1425.36328125,
      "completions/mean_terminated_length": 592.337890625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7111035247930357,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 11.071100894793616,
      "learning_rate": 3.1076392362047117e-07,
      "loss": -0.0032,
      "num_tokens": 1987806013.0,
      "reward": 0.07421875,
      "reward_std": 0.07662828266620636,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "step": 2083
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.63671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 1555.46484375,
      "completions/mean_terminated_length": 692.2042846679688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7114449091064265,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 56.09849120397067,
      "learning_rate": 3.1031002447440945e-07,
      "loss": 0.0071,
      "num_tokens": 1988671899.0,
      "reward": 0.083984375,
      "reward_std": 0.13721179962158203,
      "rewards/accuracy_reward/mean": 0.083984375,
      "rewards/accuracy_reward/std": 0.2776356339454651,
      "step": 2084
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.62109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2031.0,
      "completions/mean_length": 1529.162109375,
      "completions/mean_terminated_length": 678.6958618164062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7117862934198174,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 43.497110095292925,
      "learning_rate": 3.0985646552442794e-07,
      "loss": 0.0071,
      "num_tokens": 1989531278.0,
      "reward": 0.0703125,
      "reward_std": 0.10056467354297638,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 2085
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.626953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1983.0,
      "completions/mean_length": 1516.572265625,
      "completions/mean_terminated_length": 623.4398193359375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7121276777332082,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 64.54069808962709,
      "learning_rate": 3.0940324741427103e-07,
      "loss": 0.0091,
      "num_tokens": 1990382259.0,
      "reward": 0.025390625,
      "reward_std": 0.06755761057138443,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 2086
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.67578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1960.0,
      "completions/mean_length": 1597.0703125,
      "completions/mean_terminated_length": 657.1807250976562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.712469062046599,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 54.81338224920416,
      "learning_rate": 3.089503707871983e-07,
      "loss": 0.0008,
      "num_tokens": 1991281719.0,
      "reward": 0.072265625,
      "reward_std": 0.13717760145664215,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 2087
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2031.0,
      "completions/mean_length": 1611.068359375,
      "completions/mean_terminated_length": 683.9207153320312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7128104463599898,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 13.602775862880675,
      "learning_rate": 3.084978362859859e-07,
      "loss": 0.0178,
      "num_tokens": 1992189898.0,
      "reward": 0.060546875,
      "reward_std": 0.1127297431230545,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 2088
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.669921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1939.0,
      "completions/mean_length": 1595.0546875,
      "completions/mean_terminated_length": 675.7633056640625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7131518306733805,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 22.659904632380997,
      "learning_rate": 3.080456445529237e-07,
      "loss": 0.0169,
      "num_tokens": 1993083910.0,
      "reward": 0.0625,
      "reward_std": 0.08141306042671204,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 2089
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.697265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1827.0,
      "completions/mean_length": 1598.7734375,
      "completions/mean_terminated_length": 564.1032104492188,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7134932149867713,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 24.79468584601153,
      "learning_rate": 3.075937962298147e-07,
      "loss": 0.0227,
      "num_tokens": 1993975330.0,
      "reward": 0.056640625,
      "reward_std": 0.1127830445766449,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 2090
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1883.0,
      "completions/mean_length": 1560.078125,
      "completions/mean_terminated_length": 578.494140625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7138345993001621,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 43.48332757833183,
      "learning_rate": 3.0714229195797545e-07,
      "loss": 0.0052,
      "num_tokens": 1994849098.0,
      "reward": 0.146484375,
      "reward_std": 0.14131072163581848,
      "rewards/accuracy_reward/mean": 0.15120968222618103,
      "rewards/accuracy_reward/std": 0.35861483216285706,
      "step": 2091
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.673828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1807.0,
      "completions/mean_length": 1594.40234375,
      "completions/mean_terminated_length": 657.329345703125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.714175983613553,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 27.578527355911092,
      "learning_rate": 3.066911323782333e-07,
      "loss": 0.0042,
      "num_tokens": 1995745496.0,
      "reward": 0.046875,
      "reward_std": 0.08584587275981903,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 2092
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.705078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2009.0,
      "completions/mean_length": 1644.96875,
      "completions/mean_terminated_length": 681.4304809570312,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.7145173679269438,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 68.83237109358673,
      "learning_rate": 3.062403181309271e-07,
      "loss": -0.0,
      "num_tokens": 1996660504.0,
      "reward": 0.068359375,
      "reward_std": 0.11270354688167572,
      "rewards/accuracy_reward/mean": 0.068359375,
      "rewards/accuracy_reward/std": 0.25260838866233826,
      "step": 2093
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1922.0,
      "completions/mean_length": 1532.369140625,
      "completions/mean_terminated_length": 564.8370971679688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7148587522403346,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 5.666846989593071,
      "learning_rate": 3.057898498559049e-07,
      "loss": 0.0153,
      "num_tokens": 1997519349.0,
      "reward": 0.0546875,
      "reward_std": 0.08698301017284393,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 2094
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2018.0,
      "completions/mean_length": 1642.39453125,
      "completions/mean_terminated_length": 644.8243408203125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7152001365537254,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 25.381641768321064,
      "learning_rate": 3.053397281925244e-07,
      "loss": 0.017,
      "num_tokens": 1998431839.0,
      "reward": 0.029296875,
      "reward_std": 0.07328138500452042,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 2095
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.662109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1961.0,
      "completions/mean_length": 1567.37109375,
      "completions/mean_terminated_length": 625.5606689453125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7155415208671162,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 127.87539587907906,
      "learning_rate": 3.0488995377965064e-07,
      "loss": -0.0152,
      "num_tokens": 1999310413.0,
      "reward": 0.06640625,
      "reward_std": 0.10053187608718872,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "step": 2096
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.732421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1958.0,
      "completions/mean_length": 1679.142578125,
      "completions/mean_terminated_length": 669.496337890625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7158829051805069,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 49.93796341915726,
      "learning_rate": 3.0444052725565614e-07,
      "loss": 0.0054,
      "num_tokens": 2000241174.0,
      "reward": 0.0703125,
      "reward_std": 0.11086063086986542,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 2097
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1839.0,
      "completions/mean_length": 1534.58203125,
      "completions/mean_terminated_length": 537.2528686523438,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7162242894938977,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 51.8223202132066,
      "learning_rate": 3.0399144925841993e-07,
      "loss": 0.0161,
      "num_tokens": 2001110800.0,
      "reward": 0.044921875,
      "reward_std": 0.08423367142677307,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 2098
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.626953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1991.0,
      "completions/mean_length": 1518.9453125,
      "completions/mean_terminated_length": 630.0576171875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7165656738072885,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 33.55706020272989,
      "learning_rate": 3.0354272042532573e-07,
      "loss": -0.0083,
      "num_tokens": 2001961508.0,
      "reward": 0.064453125,
      "reward_std": 0.09500166773796082,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 2099
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1859.0,
      "completions/mean_length": 1484.490234375,
      "completions/mean_terminated_length": 633.7009887695312,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7169070581206793,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 25.69656031424839,
      "learning_rate": 3.030943413932624e-07,
      "loss": 0.0466,
      "num_tokens": 2002790287.0,
      "reward": 0.09765625,
      "reward_std": 0.1418900489807129,
      "rewards/accuracy_reward/mean": 0.10080645233392715,
      "rewards/accuracy_reward/std": 0.30137622356414795,
      "step": 2100
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.68359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1935.0,
      "completions/mean_length": 1584.056640625,
      "completions/mean_terminated_length": 582.0493774414062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7172484424340702,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 61.92127933297947,
      "learning_rate": 3.0264631279862183e-07,
      "loss": 0.0166,
      "num_tokens": 2003682508.0,
      "reward": 0.091796875,
      "reward_std": 0.15636999905109406,
      "rewards/accuracy_reward/mean": 0.091796875,
      "rewards/accuracy_reward/std": 0.289021372795105,
      "step": 2101
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1997.0,
      "completions/mean_length": 1417.962890625,
      "completions/mean_terminated_length": 633.1798095703125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.717589826747461,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 23.53358520748768,
      "learning_rate": 3.021986352772985e-07,
      "loss": 0.0105,
      "num_tokens": 2004492057.0,
      "reward": 0.0859375,
      "reward_std": 0.12737178802490234,
      "rewards/accuracy_reward/mean": 0.0859375,
      "rewards/accuracy_reward/std": 0.28054583072662354,
      "step": 2102
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.634765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2007.0,
      "completions/mean_length": 1517.890625,
      "completions/mean_terminated_length": 596.5775756835938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7179312110608518,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 26.339287537232327,
      "learning_rate": 3.0175130946468894e-07,
      "loss": 0.0332,
      "num_tokens": 2005347009.0,
      "reward": 0.0625,
      "reward_std": 0.12478180229663849,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 2103
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 1594.30078125,
      "completions/mean_terminated_length": 665.297607421875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7182725953742426,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 70.86283084805004,
      "learning_rate": 3.013043359956903e-07,
      "loss": 0.0103,
      "num_tokens": 2006240123.0,
      "reward": 0.064453125,
      "reward_std": 0.10288181900978088,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 2104
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.67578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1973.0,
      "completions/mean_length": 1588.08984375,
      "completions/mean_terminated_length": 629.48193359375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7186139796876334,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 112.06093086486976,
      "learning_rate": 3.008577155046997e-07,
      "loss": 0.0232,
      "num_tokens": 2007133433.0,
      "reward": 0.044921875,
      "reward_std": 0.08204546570777893,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 2105
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.638671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1884.0,
      "completions/mean_length": 1558.294921875,
      "completions/mean_terminated_length": 692.7081298828125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7189553640010241,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 2.2251411968431123,
      "learning_rate": 3.0041144862561307e-07,
      "loss": 0.0019,
      "num_tokens": 2008006944.0,
      "reward": 0.041015625,
      "reward_std": 0.05907990783452988,
      "rewards/accuracy_reward/mean": 0.04233871027827263,
      "rewards/accuracy_reward/std": 0.2015640139579773,
      "step": 2106
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.587890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1992.0,
      "completions/mean_length": 1472.7109375,
      "completions/mean_terminated_length": 652.0379638671875,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.7192967483144149,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 11.372619449927114,
      "learning_rate": 2.9996553599182487e-07,
      "loss": -0.0022,
      "num_tokens": 2008841164.0,
      "reward": 0.072265625,
      "reward_std": 0.11669091880321503,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 2107
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2031.0,
      "completions/mean_length": 1400.58203125,
      "completions/mean_terminated_length": 643.4321899414062,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7196381326278057,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 17.94512173951747,
      "learning_rate": 2.995199782362264e-07,
      "loss": -0.0018,
      "num_tokens": 2009633462.0,
      "reward": 0.05859375,
      "reward_std": 0.10013246536254883,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 2108
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1925.0,
      "completions/mean_length": 1502.3671875,
      "completions/mean_terminated_length": 622.6734619140625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7199795169411966,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 25.795837495466298,
      "learning_rate": 2.9907477599120537e-07,
      "loss": 0.0188,
      "num_tokens": 2010475474.0,
      "reward": 0.083984375,
      "reward_std": 0.12340269237756729,
      "rewards/accuracy_reward/mean": 0.083984375,
      "rewards/accuracy_reward/std": 0.2776356339454651,
      "step": 2109
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.626953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1999.0,
      "completions/mean_length": 1535.6015625,
      "completions/mean_terminated_length": 674.4502563476562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7203209012545874,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 1.9266647590356358,
      "learning_rate": 2.98629929888645e-07,
      "loss": 0.0114,
      "num_tokens": 2011344630.0,
      "reward": 0.025390625,
      "reward_std": 0.044359706342220306,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 2110
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1953.0,
      "completions/mean_length": 1552.669921875,
      "completions/mean_terminated_length": 639.3611450195312,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7206622855679782,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 0.3908546761184745,
      "learning_rate": 2.981854405599228e-07,
      "loss": 0.0024,
      "num_tokens": 2012221277.0,
      "reward": 0.0078125,
      "reward_std": 0.02629890665411949,
      "rewards/accuracy_reward/mean": 0.0078125,
      "rewards/accuracy_reward/std": 0.08812850713729858,
      "step": 2111
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.55078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1952.0,
      "completions/mean_length": 1423.806640625,
      "completions/mean_terminated_length": 658.4912719726562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.721003669881369,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 116.44842924076096,
      "learning_rate": 2.9774130863591035e-07,
      "loss": 0.0453,
      "num_tokens": 2013026202.0,
      "reward": 0.1015625,
      "reward_std": 0.12988576292991638,
      "rewards/accuracy_reward/mean": 0.1015625,
      "rewards/accuracy_reward/std": 0.30236753821372986,
      "step": 2112
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1547.572265625,
      "completions/mean_terminated_length": 624.5610961914062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7213450541947598,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 31.148788561009,
      "learning_rate": 2.9729753474697157e-07,
      "loss": 0.0333,
      "num_tokens": 2013892783.0,
      "reward": 0.1015625,
      "reward_std": 0.1591646373271942,
      "rewards/accuracy_reward/mean": 0.1015625,
      "rewards/accuracy_reward/std": 0.30236753821372986,
      "step": 2113
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.59375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1931.0,
      "completions/mean_length": 1509.60546875,
      "completions/mean_terminated_length": 722.72119140625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7216864385081505,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 34.16179392993526,
      "learning_rate": 2.9685411952296214e-07,
      "loss": 0.0261,
      "num_tokens": 2014736101.0,
      "reward": 0.046875,
      "reward_std": 0.11322767287492752,
      "rewards/accuracy_reward/mean": 0.04838709533214569,
      "rewards/accuracy_reward/std": 0.21479946374893188,
      "step": 2114
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.603515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2033.0,
      "completions/mean_length": 1452.013671875,
      "completions/mean_terminated_length": 545.1083374023438,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7220278228215413,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 6.371800954336332,
      "learning_rate": 2.964110635932292e-07,
      "loss": 0.0532,
      "num_tokens": 2015559372.0,
      "reward": 0.064453125,
      "reward_std": 0.13831710815429688,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 2115
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.697265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1930.0,
      "completions/mean_length": 1620.830078125,
      "completions/mean_terminated_length": 636.9613037109375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7223692071349321,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 20.994038027963132,
      "learning_rate": 2.959683675866093e-07,
      "loss": 0.0047,
      "num_tokens": 2016461797.0,
      "reward": 0.03125,
      "reward_std": 0.08846627175807953,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 2116
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.701171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1637.13671875,
      "completions/mean_terminated_length": 673.0849609375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.722710591448323,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 77.60443361460818,
      "learning_rate": 2.955260321314287e-07,
      "loss": 0.0211,
      "num_tokens": 2017382955.0,
      "reward": 0.09375,
      "reward_std": 0.13056449592113495,
      "rewards/accuracy_reward/mean": 0.09677419066429138,
      "rewards/accuracy_reward/std": 0.2959485352039337,
      "step": 2117
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.740234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1756.0,
      "completions/mean_length": 1686.826171875,
      "completions/mean_terminated_length": 657.9849853515625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7230519757617138,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 106.6455793241189,
      "learning_rate": 2.9508405785550144e-07,
      "loss": 0.0225,
      "num_tokens": 2018325874.0,
      "reward": 0.08984375,
      "reward_std": 0.15241675078868866,
      "rewards/accuracy_reward/mean": 0.08984375,
      "rewards/accuracy_reward/std": 0.2862374484539032,
      "step": 2118
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.68359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 1622.09765625,
      "completions/mean_terminated_length": 702.2098999023438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7233933600751046,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 32.25114948899705,
      "learning_rate": 2.946424453861294e-07,
      "loss": -0.0033,
      "num_tokens": 2019242916.0,
      "reward": 0.025390625,
      "reward_std": 0.06849660724401474,
      "rewards/accuracy_reward/mean": 0.026209676638245583,
      "rewards/accuracy_reward/std": 0.1599196344614029,
      "step": 2119
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1946.0,
      "completions/mean_length": 1709.6953125,
      "completions/mean_terminated_length": 695.203125,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.7237347443884954,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 15.140927571795668,
      "learning_rate": 2.942011953501007e-07,
      "loss": 0.015,
      "num_tokens": 2020193608.0,
      "reward": 0.05859375,
      "reward_std": 0.08235109597444534,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 2120
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1635.654296875,
      "completions/mean_terminated_length": 659.4144897460938,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7240761287018862,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 78.35080327197711,
      "learning_rate": 2.937603083736887e-07,
      "loss": 0.0125,
      "num_tokens": 2021119735.0,
      "reward": 0.09375,
      "reward_std": 0.1529373973608017,
      "rewards/accuracy_reward/mean": 0.09375,
      "rewards/accuracy_reward/std": 0.29176566004753113,
      "step": 2121
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1915.0,
      "completions/mean_length": 1633.21484375,
      "completions/mean_terminated_length": 593.4109497070312,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7244175130152769,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 87.49090776121162,
      "learning_rate": 2.9331978508265225e-07,
      "loss": -0.0034,
      "num_tokens": 2022034181.0,
      "reward": 0.08203125,
      "reward_std": 0.12917280197143555,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 2122
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1845.0,
      "completions/mean_length": 1579.50390625,
      "completions/mean_terminated_length": 585.719482421875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7247588973286677,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 1.1251685996495475,
      "learning_rate": 2.9287962610223326e-07,
      "loss": 0.0159,
      "num_tokens": 2022921863.0,
      "reward": 0.029296875,
      "reward_std": 0.057768452912569046,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17417415976524353,
      "step": 2123
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1878.0,
      "completions/mean_length": 1631.65625,
      "completions/mean_terminated_length": 607.6757202148438,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7251002816420585,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 18.247549999673293,
      "learning_rate": 2.924398320571573e-07,
      "loss": -0.0052,
      "num_tokens": 2023833671.0,
      "reward": 0.048828125,
      "reward_std": 0.08275700360536575,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 2124
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.673828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2013.0,
      "completions/mean_length": 1576.05859375,
      "completions/mean_terminated_length": 601.08984375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7254416659554493,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 80.06744771309165,
      "learning_rate": 2.9200040357163114e-07,
      "loss": 0.0201,
      "num_tokens": 2024718389.0,
      "reward": 0.099609375,
      "reward_std": 0.11494503915309906,
      "rewards/accuracy_reward/mean": 0.099609375,
      "rewards/accuracy_reward/std": 0.29977133870124817,
      "step": 2125
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.615234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2007.0,
      "completions/mean_length": 1512.138671875,
      "completions/mean_terminated_length": 655.3045654296875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.7257830502688402,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 10.939497160811365,
      "learning_rate": 2.915613412693435e-07,
      "loss": 0.0099,
      "num_tokens": 2025569260.0,
      "reward": 0.0859375,
      "reward_std": 0.1209416463971138,
      "rewards/accuracy_reward/mean": 0.0859375,
      "rewards/accuracy_reward/std": 0.28054583072662354,
      "step": 2126
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1864.0,
      "completions/mean_length": 1687.203125,
      "completions/mean_terminated_length": 582.3651123046875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.726124434582231,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 10.493811249008756,
      "learning_rate": 2.911226457734628e-07,
      "loss": 0.0181,
      "num_tokens": 2026520916.0,
      "reward": 0.041015625,
      "reward_std": 0.09352485090494156,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 2127
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1547.080078125,
      "completions/mean_terminated_length": 556.8895263671875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7264658188956218,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 32.6843699245717,
      "learning_rate": 2.90684317706637e-07,
      "loss": 0.018,
      "num_tokens": 2027395677.0,
      "reward": 0.0390625,
      "reward_std": 0.08422909677028656,
      "rewards/accuracy_reward/mean": 0.04032257944345474,
      "rewards/accuracy_reward/std": 0.19691328704357147,
      "step": 2128
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1981.0,
      "completions/mean_length": 1513.82421875,
      "completions/mean_terminated_length": 593.223388671875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7268072032090126,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 11.907946947362637,
      "learning_rate": 2.9024635769099287e-07,
      "loss": 0.0107,
      "num_tokens": 2028245523.0,
      "reward": 0.05078125,
      "reward_std": 0.12279270589351654,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 2129
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1811.0,
      "completions/mean_length": 1525.5234375,
      "completions/mean_terminated_length": 561.844482421875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7271485875224033,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 23.061260393837998,
      "learning_rate": 2.8980876634813424e-07,
      "loss": -0.0119,
      "num_tokens": 2029105615.0,
      "reward": 0.0703125,
      "reward_std": 0.1009584367275238,
      "rewards/accuracy_reward/mean": 0.07258064299821854,
      "rewards/accuracy_reward/std": 0.25970885157585144,
      "step": 2130
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.591796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1709.0,
      "completions/mean_length": 1473.966796875,
      "completions/mean_terminated_length": 641.7559814453125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7274899718357941,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 93.75600436920634,
      "learning_rate": 2.8937154429914233e-07,
      "loss": 0.016,
      "num_tokens": 2029933166.0,
      "reward": 0.08203125,
      "reward_std": 0.12538030743598938,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 2131
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1977.0,
      "completions/mean_length": 1583.759765625,
      "completions/mean_terminated_length": 712.6572875976562,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7278313561491849,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 27.260480554920285,
      "learning_rate": 2.889346921645737e-07,
      "loss": 0.0091,
      "num_tokens": 2030823507.0,
      "reward": 0.04296875,
      "reward_std": 0.0957798957824707,
      "rewards/accuracy_reward/mean": 0.04435483738780022,
      "rewards/accuracy_reward/std": 0.2060900777578354,
      "step": 2132
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.646484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2005.0,
      "completions/mean_length": 1561.701171875,
      "completions/mean_terminated_length": 672.3922729492188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7281727404625757,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 21.598119000094954,
      "learning_rate": 2.8849821056445983e-07,
      "loss": 0.0095,
      "num_tokens": 2031697674.0,
      "reward": 0.037109375,
      "reward_std": 0.09001073241233826,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 2133
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2016.0,
      "completions/mean_length": 1561.9296875,
      "completions/mean_terminated_length": 665.9166870117188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7285141247759666,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 30.932063536092194,
      "learning_rate": 2.880621001183069e-07,
      "loss": 0.0129,
      "num_tokens": 2032577494.0,
      "reward": 0.033203125,
      "reward_std": 0.04726085811853409,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 2134
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.626953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2018.0,
      "completions/mean_length": 1543.669921875,
      "completions/mean_terminated_length": 696.0785522460938,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.7288555090893574,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 24.87981394803063,
      "learning_rate": 2.8762636144509366e-07,
      "loss": 0.0294,
      "num_tokens": 2033440189.0,
      "reward": 0.123046875,
      "reward_std": 0.12631139159202576,
      "rewards/accuracy_reward/mean": 0.123046875,
      "rewards/accuracy_reward/std": 0.32881227135658264,
      "step": 2135
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.748046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2026.0,
      "completions/mean_length": 1689.712890625,
      "completions/mean_terminated_length": 625.9612426757812,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7291968934027482,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 4.826984093652752,
      "learning_rate": 2.871909951632716e-07,
      "loss": 0.0018,
      "num_tokens": 2034387706.0,
      "reward": 0.02734375,
      "reward_std": 0.03944835811853409,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 2136
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1607.451171875,
      "completions/mean_terminated_length": 721.1705932617188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.729538277716139,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 162.6916226319031,
      "learning_rate": 2.867560018907634e-07,
      "loss": 0.0144,
      "num_tokens": 2035295457.0,
      "reward": 0.03515625,
      "reward_std": 0.09452171623706818,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 2137
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.701171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1639.021484375,
      "completions/mean_terminated_length": 679.3921508789062,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7298796620295297,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 6.375931532454759,
      "learning_rate": 2.863213822449629e-07,
      "loss": 0.0109,
      "num_tokens": 2036213036.0,
      "reward": 0.0625,
      "reward_std": 0.11454009264707565,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 2138
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1994.0,
      "completions/mean_length": 1677.818359375,
      "completions/mean_terminated_length": 654.375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7302210463429205,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 24.615095473629758,
      "learning_rate": 2.8588713684273247e-07,
      "loss": 0.0112,
      "num_tokens": 2037148047.0,
      "reward": 0.017578125,
      "reward_std": 0.04125870764255524,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "step": 2139
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1918.0,
      "completions/mean_length": 1597.84765625,
      "completions/mean_terminated_length": 642.6463012695312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7305624306563113,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 16.6170912042429,
      "learning_rate": 2.8545326630040436e-07,
      "loss": 0.0051,
      "num_tokens": 2038041505.0,
      "reward": 0.0859375,
      "reward_std": 0.0876617580652237,
      "rewards/accuracy_reward/mean": 0.0859375,
      "rewards/accuracy_reward/std": 0.28054583072662354,
      "step": 2140
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.740234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1991.0,
      "completions/mean_length": 1684.169921875,
      "completions/mean_terminated_length": 647.3909912109375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7309038149697021,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 123.20078432377602,
      "learning_rate": 2.850197712337786e-07,
      "loss": -0.008,
      "num_tokens": 2038982616.0,
      "reward": 0.029296875,
      "reward_std": 0.06288585811853409,
      "rewards/accuracy_reward/mean": 0.030241934582591057,
      "rewards/accuracy_reward/std": 0.1714252382516861,
      "step": 2141
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.712890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1912.0,
      "completions/mean_length": 1606.841796875,
      "completions/mean_terminated_length": 511.448974609375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.731245199283093,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 24.665294381701703,
      "learning_rate": 2.84586652258122e-07,
      "loss": 0.02,
      "num_tokens": 2039876487.0,
      "reward": 0.0625,
      "reward_std": 0.11926513910293579,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 2142
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.681640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1920.0,
      "completions/mean_length": 1611.685546875,
      "completions/mean_terminated_length": 677.4907836914062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7315865835964838,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 33.53266727016599,
      "learning_rate": 2.841539099881678e-07,
      "loss": -0.0006,
      "num_tokens": 2040777190.0,
      "reward": 0.05078125,
      "reward_std": 0.11493149399757385,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 2143
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.654296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1888.0,
      "completions/mean_length": 1565.595703125,
      "completions/mean_terminated_length": 652.5706176757812,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7319279679098746,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 23.921330018514727,
      "learning_rate": 2.837215450381144e-07,
      "loss": 0.0125,
      "num_tokens": 2041655671.0,
      "reward": 0.046875,
      "reward_std": 0.08490687608718872,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 2144
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.63671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1944.0,
      "completions/mean_length": 1508.40625,
      "completions/mean_terminated_length": 562.6666870117188,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7322693522232654,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 408.78243737744873,
      "learning_rate": 2.832895580216249e-07,
      "loss": -0.01,
      "num_tokens": 2042504375.0,
      "reward": 0.0234375,
      "reward_std": 0.05385598540306091,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 2145
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.669921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1968.0,
      "completions/mean_length": 1591.685546875,
      "completions/mean_terminated_length": 665.5562133789062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7326107365366561,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 33.833339106538666,
      "learning_rate": 2.828579495518256e-07,
      "loss": 0.0197,
      "num_tokens": 2043400342.0,
      "reward": 0.087890625,
      "reward_std": 0.09886875748634338,
      "rewards/accuracy_reward/mean": 0.087890625,
      "rewards/accuracy_reward/std": 0.2834126651287079,
      "step": 2146
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.666015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1943.0,
      "completions/mean_length": 1597.1171875,
      "completions/mean_terminated_length": 698.2924194335938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7329521208500469,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 18.74986535595416,
      "learning_rate": 2.824267202413061e-07,
      "loss": 0.0041,
      "num_tokens": 2044300146.0,
      "reward": 0.0390625,
      "reward_std": 0.09655256569385529,
      "rewards/accuracy_reward/mean": 0.04032257944345474,
      "rewards/accuracy_reward/std": 0.19691328704357147,
      "step": 2147
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.73046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2015.0,
      "completions/mean_length": 1661.197265625,
      "completions/mean_terminated_length": 612.9058227539062,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7332935051634377,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 54.985629609165294,
      "learning_rate": 2.8199587070211737e-07,
      "loss": -0.0134,
      "num_tokens": 2045238807.0,
      "reward": 0.068359375,
      "reward_std": 0.13329237699508667,
      "rewards/accuracy_reward/mean": 0.068359375,
      "rewards/accuracy_reward/std": 0.25260838866233826,
      "step": 2148
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1922.0,
      "completions/mean_length": 1609.6953125,
      "completions/mean_terminated_length": 627.9746704101562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7336348894768285,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 22.758863577655593,
      "learning_rate": 2.815654015457715e-07,
      "loss": 0.0003,
      "num_tokens": 2046142971.0,
      "reward": 0.0625,
      "reward_std": 0.11399487406015396,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 2149
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.60546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2012.0,
      "completions/mean_length": 1447.439453125,
      "completions/mean_terminated_length": 525.787109375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7339762737902193,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 5.33310318371966,
      "learning_rate": 2.8113531338324104e-07,
      "loss": 0.0456,
      "num_tokens": 2046956908.0,
      "reward": 0.083984375,
      "reward_std": 0.09721681475639343,
      "rewards/accuracy_reward/mean": 0.083984375,
      "rewards/accuracy_reward/std": 0.2776356339454651,
      "step": 2150
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.59765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1981.0,
      "completions/mean_length": 1468.966796875,
      "completions/mean_terminated_length": 608.8495483398438,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.7343176581036102,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 45.62410379278095,
      "learning_rate": 2.807056068249569e-07,
      "loss": -0.0047,
      "num_tokens": 2047781355.0,
      "reward": 0.107421875,
      "reward_std": 0.14394694566726685,
      "rewards/accuracy_reward/mean": 0.107421875,
      "rewards/accuracy_reward/std": 0.30995169281959534,
      "step": 2151
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1566.19921875,
      "completions/mean_terminated_length": 630.287353515625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.734659042417001,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 4.828628081196631,
      "learning_rate": 2.8027628248080944e-07,
      "loss": 0.0532,
      "num_tokens": 2048657921.0,
      "reward": 0.1171875,
      "reward_std": 0.1552657186985016,
      "rewards/accuracy_reward/mean": 0.1171875,
      "rewards/accuracy_reward/std": 0.32195815443992615,
      "step": 2152
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.64453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2012.0,
      "completions/mean_length": 1564.083984375,
      "completions/mean_terminated_length": 686.6538696289062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7350004267303918,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 129.09812042631734,
      "learning_rate": 2.7984734096014567e-07,
      "loss": -0.0013,
      "num_tokens": 2049530508.0,
      "reward": 0.064453125,
      "reward_std": 0.10791876912117004,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 2153
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1775.0,
      "completions/mean_length": 1630.28515625,
      "completions/mean_terminated_length": 677.0384521484375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7353418110437825,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 15.892844275676405,
      "learning_rate": 2.794187828717698e-07,
      "loss": 0.0005,
      "num_tokens": 2050438734.0,
      "reward": 0.044921875,
      "reward_std": 0.08412160724401474,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 2154
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.716796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2019.0,
      "completions/mean_length": 1659.203125,
      "completions/mean_terminated_length": 675.5379028320312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7356831953571733,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 19.962784308527507,
      "learning_rate": 2.789906088239419e-07,
      "loss": -0.0031,
      "num_tokens": 2051373174.0,
      "reward": 0.078125,
      "reward_std": 0.09811057895421982,
      "rewards/accuracy_reward/mean": 0.078125,
      "rewards/accuracy_reward/std": 0.26863065361976624,
      "step": 2155
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.685546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1899.0,
      "completions/mean_length": 1632.462890625,
      "completions/mean_terminated_length": 726.5404052734375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7360245796705641,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 21.156284615498176,
      "learning_rate": 2.7856281942437635e-07,
      "loss": 0.0033,
      "num_tokens": 2052285187.0,
      "reward": 0.0703125,
      "reward_std": 0.11658906936645508,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 2156
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.67578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2025.0,
      "completions/mean_length": 1616.939453125,
      "completions/mean_terminated_length": 718.4638061523438,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7363659639839549,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 12.253528561246215,
      "learning_rate": 2.781354152802422e-07,
      "loss": 0.0192,
      "num_tokens": 2053182788.0,
      "reward": 0.033203125,
      "reward_std": 0.06750432401895523,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 2157
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.728515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1683.3671875,
      "completions/mean_terminated_length": 704.89208984375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7367073482973457,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 50.28575735998116,
      "learning_rate": 2.777083969981611e-07,
      "loss": -0.0033,
      "num_tokens": 2054131792.0,
      "reward": 0.037109375,
      "reward_std": 0.08537977933883667,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 2158
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.619140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2022.0,
      "completions/mean_length": 1519.478515625,
      "completions/mean_terminated_length": 660.2923583984375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7370487326107366,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 35.30051746518248,
      "learning_rate": 2.7728176518420786e-07,
      "loss": 0.0037,
      "num_tokens": 2054984917.0,
      "reward": 0.0390625,
      "reward_std": 0.07465953379869461,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 2159
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.623046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1958.0,
      "completions/mean_length": 1559.234375,
      "completions/mean_terminated_length": 751.3782348632812,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7373901169241274,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 2.4318451217488195,
      "learning_rate": 2.768555204439079e-07,
      "loss": 0.0214,
      "num_tokens": 2055861325.0,
      "reward": 0.03515625,
      "reward_std": 0.07779236882925034,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 2160
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2024.0,
      "completions/mean_length": 1574.849609375,
      "completions/mean_terminated_length": 655.7413940429688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7377315012375182,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 33.94684259827365,
      "learning_rate": 2.764296633822379e-07,
      "loss": 0.0434,
      "num_tokens": 2056748160.0,
      "reward": 0.078125,
      "reward_std": 0.11186403036117554,
      "rewards/accuracy_reward/mean": 0.078125,
      "rewards/accuracy_reward/std": 0.26863065361976624,
      "step": 2161
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2010.0,
      "completions/mean_length": 1558.080078125,
      "completions/mean_terminated_length": 684.7445678710938,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7380728855509089,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 60.527931257144765,
      "learning_rate": 2.7600419460362416e-07,
      "loss": -0.0093,
      "num_tokens": 2057617881.0,
      "reward": 0.064453125,
      "reward_std": 0.12121544778347015,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 2162
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1966.0,
      "completions/mean_length": 1565.96484375,
      "completions/mean_terminated_length": 596.2235107421875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7384142698642997,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 22.361342581883594,
      "learning_rate": 2.7557911471194167e-07,
      "loss": 0.0094,
      "num_tokens": 2058502711.0,
      "reward": 0.0546875,
      "reward_std": 0.10821780562400818,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 2163
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.626953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1916.0,
      "completions/mean_length": 1533.142578125,
      "completions/mean_terminated_length": 667.858642578125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.7387556541776905,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 45.85597883938025,
      "learning_rate": 2.7515442431051363e-07,
      "loss": 0.0137,
      "num_tokens": 2059363856.0,
      "reward": 0.037109375,
      "reward_std": 0.0946291983127594,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 2164
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1613.0,
      "completions/mean_length": 1629.923828125,
      "completions/mean_terminated_length": 639.743408203125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7390970384910813,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 2.820880402168814,
      "learning_rate": 2.747301240021101e-07,
      "loss": 0.0063,
      "num_tokens": 2060266537.0,
      "reward": 0.0390625,
      "reward_std": 0.06563520431518555,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 2165
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.611328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1971.0,
      "completions/mean_length": 1481.8671875,
      "completions/mean_terminated_length": 591.4170532226562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7394384228044721,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 21.805108863433123,
      "learning_rate": 2.7430621438894816e-07,
      "loss": 0.0114,
      "num_tokens": 2061106533.0,
      "reward": 0.025390625,
      "reward_std": 0.06755761057138443,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 2166
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.63671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1988.0,
      "completions/mean_length": 1527.873046875,
      "completions/mean_terminated_length": 616.252685546875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.739779807117863,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 30.656893189894657,
      "learning_rate": 2.7388269607268967e-07,
      "loss": 0.0056,
      "num_tokens": 2061964900.0,
      "reward": 0.044921875,
      "reward_std": 0.09369117021560669,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 2167
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.716796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1771.0,
      "completions/mean_length": 1640.94921875,
      "completions/mean_terminated_length": 611.0689697265625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7401211914312538,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 29.407697003197022,
      "learning_rate": 2.734595696544416e-07,
      "loss": 0.0145,
      "num_tokens": 2062891722.0,
      "reward": 0.052734375,
      "reward_std": 0.1062953919172287,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 2168
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2035.0,
      "completions/mean_length": 1635.716796875,
      "completions/mean_terminated_length": 694.8654174804688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.7404625757446446,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 45.01250673342925,
      "learning_rate": 2.730368357347548e-07,
      "loss": 0.0071,
      "num_tokens": 2063803785.0,
      "reward": 0.048828125,
      "reward_std": 0.09380322694778442,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 2169
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1953.0,
      "completions/mean_length": 1538.35546875,
      "completions/mean_terminated_length": 660.0319213867188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.7408039600580353,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 11.373094852052875,
      "learning_rate": 2.7261449491362197e-07,
      "loss": 0.0227,
      "num_tokens": 2064671775.0,
      "reward": 0.115234375,
      "reward_std": 0.136160746216774,
      "rewards/accuracy_reward/mean": 0.115234375,
      "rewards/accuracy_reward/std": 0.3196168541908264,
      "step": 2170
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.666015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 1568.78515625,
      "completions/mean_terminated_length": 613.1578979492188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.7411453443714261,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 19.92048153553588,
      "learning_rate": 2.721925477904794e-07,
      "loss": 0.0085,
      "num_tokens": 2065550017.0,
      "reward": 0.056640625,
      "reward_std": 0.11261671781539917,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 2171
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.62109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1764.0,
      "completions/mean_length": 1496.28515625,
      "completions/mean_terminated_length": 591.9277954101562,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7414867286848169,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 32.187289055574006,
      "learning_rate": 2.717709949642034e-07,
      "loss": 0.0126,
      "num_tokens": 2066386595.0,
      "reward": 0.04296875,
      "reward_std": 0.07873040437698364,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 2172
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.58203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2009.0,
      "completions/mean_length": 1460.166015625,
      "completions/mean_terminated_length": 641.5934448242188,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7418281129982077,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 78.86628871886052,
      "learning_rate": 2.7134983703311136e-07,
      "loss": 0.0348,
      "num_tokens": 2067211976.0,
      "reward": 0.05078125,
      "reward_std": 0.0918210819363594,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 2173
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.595703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1929.0,
      "completions/mean_length": 1448.177734375,
      "completions/mean_terminated_length": 564.3816528320312,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.7421694973115985,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 27.311016956466656,
      "learning_rate": 2.7092907459495973e-07,
      "loss": 0.0135,
      "num_tokens": 2068028291.0,
      "reward": 0.037109375,
      "reward_std": 0.07768725603818893,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 2174
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.728515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1867.0,
      "completions/mean_length": 1684.357421875,
      "completions/mean_terminated_length": 708.5396118164062,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.7425108816249893,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 8.541195220484141,
      "learning_rate": 2.7050870824694407e-07,
      "loss": 0.0207,
      "num_tokens": 2068974298.0,
      "reward": 0.0390625,
      "reward_std": 0.06948098540306091,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 2175
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.67578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1943.0,
      "completions/mean_length": 1619.640625,
      "completions/mean_terminated_length": 727.1325073242188,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7428522659383802,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 0.13536573905432692,
      "learning_rate": 2.700887385856974e-07,
      "loss": 0.0057,
      "num_tokens": 2069882978.0,
      "reward": 0.05859375,
      "reward_std": 0.05806133896112442,
      "rewards/accuracy_reward/mean": 0.060483869165182114,
      "rewards/accuracy_reward/std": 0.2386218160390854,
      "step": 2176
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1932.0,
      "completions/mean_length": 1648.224609375,
      "completions/mean_terminated_length": 626.576416015625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.743193650251771,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 10.49712994726984,
      "learning_rate": 2.6966916620728966e-07,
      "loss": -0.0,
      "num_tokens": 2070815765.0,
      "reward": 0.00390625,
      "reward_std": 0.015625,
      "rewards/accuracy_reward/mean": 0.00390625,
      "rewards/accuracy_reward/std": 0.06243881583213806,
      "step": 2177
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2015.0,
      "completions/mean_length": 1584.033203125,
      "completions/mean_terminated_length": 698.2784423828125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7435350345651617,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 24.690880484134997,
      "learning_rate": 2.6924999170722743e-07,
      "loss": 0.0142,
      "num_tokens": 2071700918.0,
      "reward": 0.06640625,
      "reward_std": 0.08917921781539917,
      "rewards/accuracy_reward/mean": 0.06854838877916336,
      "rewards/accuracy_reward/std": 0.25293973088264465,
      "step": 2178
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.62109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1943.0,
      "completions/mean_length": 1509.478515625,
      "completions/mean_terminated_length": 627.0360717773438,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7438764188785525,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 11.72867863873013,
      "learning_rate": 2.6883121568045197e-07,
      "loss": 0.0077,
      "num_tokens": 2072559627.0,
      "reward": 0.048828125,
      "reward_std": 0.05782270431518555,
      "rewards/accuracy_reward/mean": 0.05040322616696358,
      "rewards/accuracy_reward/std": 0.21899642050266266,
      "step": 2179
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2035.0,
      "completions/mean_length": 1509.763671875,
      "completions/mean_terminated_length": 641.994873046875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7442178031919433,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 9.443728891362426,
      "learning_rate": 2.6841283872133954e-07,
      "loss": 0.018,
      "num_tokens": 2073403794.0,
      "reward": 0.0390625,
      "reward_std": 0.09836846590042114,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 2180
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1990.0,
      "completions/mean_length": 1672.67578125,
      "completions/mean_terminated_length": 675.7857055664062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7445591875053341,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 24.265957413288792,
      "learning_rate": 2.6799486142369955e-07,
      "loss": 0.0191,
      "num_tokens": 2074350796.0,
      "reward": 0.07421875,
      "reward_std": 0.1152530387043953,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "step": 2181
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.626953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1991.0,
      "completions/mean_length": 1517.095703125,
      "completions/mean_terminated_length": 624.8429565429688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.7449005718187249,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 148.11759865013968,
      "learning_rate": 2.6757728438077414e-07,
      "loss": 0.0042,
      "num_tokens": 2075205309.0,
      "reward": 0.064453125,
      "reward_std": 0.1170879527926445,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 2182
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1816.0,
      "completions/mean_length": 1618.65625,
      "completions/mean_terminated_length": 638.871826171875,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.7452419561321157,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 71.36047709872058,
      "learning_rate": 2.6716010818523794e-07,
      "loss": 0.0226,
      "num_tokens": 2076106925.0,
      "reward": 0.021484375,
      "reward_std": 0.06425705552101135,
      "rewards/accuracy_reward/mean": 0.02217741869390011,
      "rewards/accuracy_reward/std": 0.14740891754627228,
      "step": 2183
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.72265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1787.0,
      "completions/mean_length": 1637.396484375,
      "completions/mean_terminated_length": 567.5140991210938,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7455833404455066,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 28.292484266741525,
      "learning_rate": 2.667433334291958e-07,
      "loss": 0.0108,
      "num_tokens": 2077030104.0,
      "reward": 0.068359375,
      "reward_std": 0.11910437047481537,
      "rewards/accuracy_reward/mean": 0.068359375,
      "rewards/accuracy_reward/std": 0.25260838866233826,
      "step": 2184
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.673828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1957.0,
      "completions/mean_length": 1627.892578125,
      "completions/mean_terminated_length": 760.0060424804688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7459247247588974,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 14.0906711081578,
      "learning_rate": 2.663269607041837e-07,
      "loss": -0.0025,
      "num_tokens": 2077936529.0,
      "reward": 0.041015625,
      "reward_std": 0.09258585423231125,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 2185
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1877.0,
      "completions/mean_length": 1548.1015625,
      "completions/mean_terminated_length": 559.9302368164062,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.7462661090722881,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 55.286040781531426,
      "learning_rate": 2.6591099060116625e-07,
      "loss": -0.0048,
      "num_tokens": 2078810213.0,
      "reward": 0.048828125,
      "reward_std": 0.10598184913396835,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 2186
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.67578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2000.0,
      "completions/mean_length": 1591.287109375,
      "completions/mean_terminated_length": 639.3433227539062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7466074933856789,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 24.122156358625357,
      "learning_rate": 2.6549542371053714e-07,
      "loss": -0.0112,
      "num_tokens": 2079701944.0,
      "reward": 0.109375,
      "reward_std": 0.13574068248271942,
      "rewards/accuracy_reward/mean": 0.109375,
      "rewards/accuracy_reward/std": 0.31241437792778015,
      "step": 2187
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1925.0,
      "completions/mean_length": 1615.23828125,
      "completions/mean_terminated_length": 774.586181640625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7469488776990697,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 40.63971003997408,
      "learning_rate": 2.650802606221175e-07,
      "loss": 0.0374,
      "num_tokens": 2080605314.0,
      "reward": 0.103515625,
      "reward_std": 0.11122028529644012,
      "rewards/accuracy_reward/mean": 0.103515625,
      "rewards/accuracy_reward/std": 0.30492907762527466,
      "step": 2188
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.708984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1894.0,
      "completions/mean_length": 1627.2890625,
      "completions/mean_terminated_length": 602.3355712890625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7472902620124605,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 13.954175533464152,
      "learning_rate": 2.6466550192515526e-07,
      "loss": 0.0136,
      "num_tokens": 2081513350.0,
      "reward": 0.080078125,
      "reward_std": 0.11372203379869461,
      "rewards/accuracy_reward/mean": 0.080078125,
      "rewards/accuracy_reward/std": 0.271679550409317,
      "step": 2189
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.638671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1549.09765625,
      "completions/mean_terminated_length": 667.2540893554688,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7476316463258513,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 57.169619310039074,
      "learning_rate": 2.642511482083247e-07,
      "loss": 0.0219,
      "num_tokens": 2082380552.0,
      "reward": 0.064453125,
      "reward_std": 0.10296767204999924,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 2190
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.697265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2015.0,
      "completions/mean_length": 1617.09375,
      "completions/mean_terminated_length": 624.6193237304688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7479730306392421,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 31.768027290790332,
      "learning_rate": 2.638372000597251e-07,
      "loss": -0.0031,
      "num_tokens": 2083288728.0,
      "reward": 0.052734375,
      "reward_std": 0.0985880047082901,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 2191
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.68359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1943.0,
      "completions/mean_length": 1601.693359375,
      "completions/mean_terminated_length": 637.4506225585938,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.748314414952633,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 23.65628818225515,
      "learning_rate": 2.634236580668802e-07,
      "loss": 0.0163,
      "num_tokens": 2084188651.0,
      "reward": 0.05078125,
      "reward_std": 0.09775100648403168,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 2192
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1883.0,
      "completions/mean_length": 1674.216796875,
      "completions/mean_terminated_length": 598.1742553710938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7486557992660238,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 0.21046703825872332,
      "learning_rate": 2.630105228167369e-07,
      "loss": -0.0013,
      "num_tokens": 2085113130.0,
      "reward": 0.02734375,
      "reward_std": 0.02960042469203472,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 2193
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.673828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1800.0,
      "completions/mean_length": 1591.76953125,
      "completions/mean_terminated_length": 649.2575073242188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.7489971835794145,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 23.454985233805438,
      "learning_rate": 2.625977948956656e-07,
      "loss": 0.0042,
      "num_tokens": 2086005556.0,
      "reward": 0.048828125,
      "reward_std": 0.07328139245510101,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 2194
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1820.0,
      "completions/mean_length": 1602.3203125,
      "completions/mean_terminated_length": 463.75,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7493385678928053,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 35.59483977039,
      "learning_rate": 2.621854748894578e-07,
      "loss": -0.0134,
      "num_tokens": 2086908360.0,
      "reward": 0.05859375,
      "reward_std": 0.08985796570777893,
      "rewards/accuracy_reward/mean": 0.060483869165182114,
      "rewards/accuracy_reward/std": 0.2386218160390854,
      "step": 2195
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.73046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1679.0,
      "completions/mean_length": 1669.181640625,
      "completions/mean_terminated_length": 642.5289916992188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7496799522061961,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 23.94870590034524,
      "learning_rate": 2.6177356338332635e-07,
      "loss": 0.0088,
      "num_tokens": 2087844149.0,
      "reward": 0.02734375,
      "reward_std": 0.06233368441462517,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 2196
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1590.728515625,
      "completions/mean_terminated_length": 584.7312622070312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7500213365195869,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 24.11278995839209,
      "learning_rate": 2.6136206096190445e-07,
      "loss": 0.0136,
      "num_tokens": 2088740170.0,
      "reward": 0.11328125,
      "reward_std": 0.1556394398212433,
      "rewards/accuracy_reward/mean": 0.11328125,
      "rewards/accuracy_reward/std": 0.3172462284564972,
      "step": 2197
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.697265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1899.0,
      "completions/mean_length": 1629.341796875,
      "completions/mean_terminated_length": 665.077392578125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.7503627208329777,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 15.749584835939213,
      "learning_rate": 2.609509682092442e-07,
      "loss": 0.0256,
      "num_tokens": 2089648905.0,
      "reward": 0.099609375,
      "reward_std": 0.11646249890327454,
      "rewards/accuracy_reward/mean": 0.099609375,
      "rewards/accuracy_reward/std": 0.29977133870124817,
      "step": 2198
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.689453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2007.0,
      "completions/mean_length": 1607.853515625,
      "completions/mean_terminated_length": 630.6729125976562,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7507041051463685,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 23.210973572572502,
      "learning_rate": 2.6054028570881697e-07,
      "loss": 0.0308,
      "num_tokens": 2090544270.0,
      "reward": 0.076171875,
      "reward_std": 0.1091160923242569,
      "rewards/accuracy_reward/mean": 0.0786290317773819,
      "rewards/accuracy_reward/std": 0.26943066716194153,
      "step": 2199
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.642578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1818.0,
      "completions/mean_length": 1560.884765625,
      "completions/mean_terminated_length": 685.1420288085938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7510454894597594,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 299.73813234765373,
      "learning_rate": 2.6013001404351133e-07,
      "loss": 0.0248,
      "num_tokens": 2091415171.0,
      "reward": 0.0625,
      "reward_std": 0.09959383308887482,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 2200
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.673828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1858.0,
      "completions/mean_length": 1600.435546875,
      "completions/mean_terminated_length": 675.8263549804688,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7513868737731502,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 9.78397215096451,
      "learning_rate": 2.5972015379563263e-07,
      "loss": 0.0217,
      "num_tokens": 2092311362.0,
      "reward": 0.041015625,
      "reward_std": 0.080595001578331,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 2201
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1856.0,
      "completions/mean_length": 1500.95703125,
      "completions/mean_terminated_length": 589.21875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7517282580865409,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 52.00350531787857,
      "learning_rate": 2.5931070554690284e-07,
      "loss": -0.0079,
      "num_tokens": 2093148780.0,
      "reward": 0.048828125,
      "reward_std": 0.09673243016004562,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 2202
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.681640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1973.0,
      "completions/mean_length": 1604.19921875,
      "completions/mean_terminated_length": 653.9754638671875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.7520696423999317,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 30.421510031241386,
      "learning_rate": 2.589016698784585e-07,
      "loss": 0.0474,
      "num_tokens": 2094041970.0,
      "reward": 0.078125,
      "reward_std": 0.15327692031860352,
      "rewards/accuracy_reward/mean": 0.078125,
      "rewards/accuracy_reward/std": 0.26863065361976624,
      "step": 2203
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.794921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1790.36328125,
      "completions/mean_terminated_length": 791.7142944335938,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7524110267133225,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 20.689079597521534,
      "learning_rate": 2.5849304737085143e-07,
      "loss": 0.0177,
      "num_tokens": 2095040220.0,
      "reward": 0.05859375,
      "reward_std": 0.08522041887044907,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 2204
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.685546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1882.0,
      "completions/mean_length": 1585.25390625,
      "completions/mean_terminated_length": 576.4099731445312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7527524110267133,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 10.800193253984355,
      "learning_rate": 2.5808483860404605e-07,
      "loss": 0.0155,
      "num_tokens": 2095925854.0,
      "reward": 0.017578125,
      "reward_std": 0.03630761057138443,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "step": 2205
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2024.0,
      "completions/mean_length": 1612.21875,
      "completions/mean_terminated_length": 653.5,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.7530937953401041,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 76.97829901393658,
      "learning_rate": 2.576770441574204e-07,
      "loss": 0.014,
      "num_tokens": 2096829726.0,
      "reward": 0.095703125,
      "reward_std": 0.10100477933883667,
      "rewards/accuracy_reward/mean": 0.09879032522439957,
      "rewards/accuracy_reward/std": 0.2986815273761749,
      "step": 2206
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1942.0,
      "completions/mean_length": 1616.7734375,
      "completions/mean_terminated_length": 614.3117065429688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7534351796534949,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 36.333292473213305,
      "learning_rate": 2.5726966460976406e-07,
      "loss": 0.0204,
      "num_tokens": 2097735418.0,
      "reward": 0.09375,
      "reward_std": 0.1331530660390854,
      "rewards/accuracy_reward/mean": 0.09375,
      "rewards/accuracy_reward/std": 0.29176566004753113,
      "step": 2207
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.638671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2001.0,
      "completions/mean_length": 1518.87890625,
      "completions/mean_terminated_length": 583.6216430664062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7537765639668857,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 31.126770946411682,
      "learning_rate": 2.5686270053927743e-07,
      "loss": 0.0057,
      "num_tokens": 2098596396.0,
      "reward": 0.0234375,
      "reward_std": 0.05721627548336983,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 2208
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1866.0,
      "completions/mean_length": 1621.298828125,
      "completions/mean_terminated_length": 571.8446044921875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7541179482802766,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 27.15206895078571,
      "learning_rate": 2.5645615252357205e-07,
      "loss": 0.0393,
      "num_tokens": 2099508501.0,
      "reward": 0.056640625,
      "reward_std": 0.08804761618375778,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 2209
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.677734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1961.0,
      "completions/mean_length": 1586.888671875,
      "completions/mean_terminated_length": 617.4909057617188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.7544593325936674,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 29.903027516589866,
      "learning_rate": 2.560500211396681e-07,
      "loss": 0.0087,
      "num_tokens": 2100402124.0,
      "reward": 0.025390625,
      "reward_std": 0.05452118441462517,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 2210
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.728515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1986.0,
      "completions/mean_length": 1650.51953125,
      "completions/mean_terminated_length": 583.8992919921875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7548007169070581,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 17.082855226278348,
      "learning_rate": 2.556443069639951e-07,
      "loss": 0.0098,
      "num_tokens": 2101318790.0,
      "reward": 0.037109375,
      "reward_std": 0.07085913419723511,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 2211
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1837.0,
      "completions/mean_length": 1662.154296875,
      "completions/mean_terminated_length": 595.4044189453125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7551421012204489,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 34.739905982749036,
      "learning_rate": 2.5523901057238994e-07,
      "loss": 0.0122,
      "num_tokens": 2102249525.0,
      "reward": 0.060546875,
      "reward_std": 0.10960254818201065,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 2212
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1918.0,
      "completions/mean_length": 1562.990234375,
      "completions/mean_terminated_length": 637.0625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7554834855338397,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 63.57129223584638,
      "learning_rate": 2.5483413254009666e-07,
      "loss": 0.0078,
      "num_tokens": 2103123360.0,
      "reward": 0.029296875,
      "reward_std": 0.06271954625844955,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 2213
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1996.0,
      "completions/mean_length": 1557.833984375,
      "completions/mean_terminated_length": 653.75,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7558248698472305,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 20.925509821042002,
      "learning_rate": 2.544296734417658e-07,
      "loss": 0.0196,
      "num_tokens": 2103999483.0,
      "reward": 0.095703125,
      "reward_std": 0.11623741686344147,
      "rewards/accuracy_reward/mean": 0.095703125,
      "rewards/accuracy_reward/std": 0.2944713830947876,
      "step": 2214
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.669921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1997.0,
      "completions/mean_length": 1572.49609375,
      "completions/mean_terminated_length": 607.4201049804688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7561662541606213,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 7.114035088567925,
      "learning_rate": 2.540256338514528e-07,
      "loss": -0.0115,
      "num_tokens": 2104874457.0,
      "reward": 0.05078125,
      "reward_std": 0.08746850490570068,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 2215
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2002.0,
      "completions/mean_length": 1611.82421875,
      "completions/mean_terminated_length": 597.8571166992188,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7565076384740121,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 26.20568632094594,
      "learning_rate": 2.536220143426182e-07,
      "loss": 0.0357,
      "num_tokens": 2105772319.0,
      "reward": 0.09375,
      "reward_std": 0.11630973219871521,
      "rewards/accuracy_reward/mean": 0.09677419066429138,
      "rewards/accuracy_reward/std": 0.2959485352039337,
      "step": 2216
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.701171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2025.0,
      "completions/mean_length": 1610.703125,
      "completions/mean_terminated_length": 584.62744140625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.756849022787403,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 42.62162106043469,
      "learning_rate": 2.532188154881258e-07,
      "loss": 0.0013,
      "num_tokens": 2106669223.0,
      "reward": 0.08203125,
      "reward_std": 0.12754468619823456,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 2217
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.751953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1943.0,
      "completions/mean_length": 1688.08984375,
      "completions/mean_terminated_length": 597.0236206054688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7571904071007938,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 113.5196805740244,
      "learning_rate": 2.528160378602431e-07,
      "loss": 0.0208,
      "num_tokens": 2107605237.0,
      "reward": 0.068359375,
      "reward_std": 0.11685723811388016,
      "rewards/accuracy_reward/mean": 0.068359375,
      "rewards/accuracy_reward/std": 0.25260838866233826,
      "step": 2218
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.732421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2010.0,
      "completions/mean_length": 1667.0546875,
      "completions/mean_terminated_length": 624.7372436523438,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7575317914141845,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 85.14756383016665,
      "learning_rate": 2.5241368203063875e-07,
      "loss": -0.0011,
      "num_tokens": 2108532369.0,
      "reward": 0.046875,
      "reward_std": 0.08670367300510406,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 2219
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.73828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1784.0,
      "completions/mean_length": 1672.328125,
      "completions/mean_terminated_length": 612.5969848632812,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7578731757275753,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 116.23516740238533,
      "learning_rate": 2.5201174857038344e-07,
      "loss": 0.0042,
      "num_tokens": 2109469753.0,
      "reward": 0.04296875,
      "reward_std": 0.11123858392238617,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 2220
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.70703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1829.0,
      "completions/mean_length": 1617.462890625,
      "completions/mean_terminated_length": 578.433349609375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7582145600409661,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 29.659560851519238,
      "learning_rate": 2.516102380499483e-07,
      "loss": -0.0018,
      "num_tokens": 2110374358.0,
      "reward": 0.078125,
      "reward_std": 0.10150270164012909,
      "rewards/accuracy_reward/mean": 0.078125,
      "rewards/accuracy_reward/std": 0.26863065361976624,
      "step": 2221
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.744140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1997.0,
      "completions/mean_length": 1695.029296875,
      "completions/mean_terminated_length": 668.4503784179688,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7585559443543569,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 50.878008347479955,
      "learning_rate": 2.51209151039204e-07,
      "loss": 0.0238,
      "num_tokens": 2111319525.0,
      "reward": 0.048828125,
      "reward_std": 0.09143522381782532,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 2222
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.751953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1993.0,
      "completions/mean_length": 1700.81640625,
      "completions/mean_terminated_length": 648.3306884765625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7588973286677477,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 90.00534449616639,
      "learning_rate": 2.5080848810742027e-07,
      "loss": -0.0093,
      "num_tokens": 2112276359.0,
      "reward": 0.0390625,
      "reward_std": 0.07187044620513916,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 2223
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.736328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1936.0,
      "completions/mean_length": 1665.08203125,
      "completions/mean_terminated_length": 595.7481079101562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7592387129811385,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 27.476201648315932,
      "learning_rate": 2.504082498232648e-07,
      "loss": 0.0002,
      "num_tokens": 2113199985.0,
      "reward": 0.048828125,
      "reward_std": 0.09072227776050568,
      "rewards/accuracy_reward/mean": 0.05040322616696358,
      "rewards/accuracy_reward/std": 0.21899642050266266,
      "step": 2224
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1989.0,
      "completions/mean_length": 1631.884765625,
      "completions/mean_terminated_length": 682.2884521484375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7595800972945294,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 21.959572119266443,
      "learning_rate": 2.5000843675480264e-07,
      "loss": -0.0017,
      "num_tokens": 2114109126.0,
      "reward": 0.013671875,
      "reward_std": 0.043847277760505676,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "step": 2225
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1961.0,
      "completions/mean_length": 1618.904296875,
      "completions/mean_terminated_length": 657.5126953125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7599214816079202,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 9.159964388175874,
      "learning_rate": 2.4960904946949513e-07,
      "loss": 0.0114,
      "num_tokens": 2115011749.0,
      "reward": 0.0625,
      "reward_std": 0.103780098259449,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 2226
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.693359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2015.0,
      "completions/mean_length": 1614.96484375,
      "completions/mean_terminated_length": 635.8088989257812,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7602628659213109,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 19.31017129891979,
      "learning_rate": 2.492100885341997e-07,
      "loss": -0.0014,
      "num_tokens": 2115922643.0,
      "reward": 0.072265625,
      "reward_std": 0.09886875748634338,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 2227
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.654296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1995.0,
      "completions/mean_length": 1553.955078125,
      "completions/mean_terminated_length": 618.8983154296875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7606042502347017,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 16.480227990445243,
      "learning_rate": 2.4881155451516844e-07,
      "loss": 0.0135,
      "num_tokens": 2116791036.0,
      "reward": 0.021484375,
      "reward_std": 0.03462383896112442,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 2228
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.712890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1713.0,
      "completions/mean_length": 1627.5234375,
      "completions/mean_terminated_length": 583.4829711914062,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.7609456345480925,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 19.099363321025816,
      "learning_rate": 2.484134479780473e-07,
      "loss": 0.0081,
      "num_tokens": 2117700504.0,
      "reward": 0.03125,
      "reward_std": 0.07559756934642792,
      "rewards/accuracy_reward/mean": 0.032258063554763794,
      "rewards/accuracy_reward/std": 0.17686307430267334,
      "step": 2229
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1947.0,
      "completions/mean_length": 1667.388671875,
      "completions/mean_terminated_length": 571.6893920898438,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.7612870188614833,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 128.03170989470343,
      "learning_rate": 2.48015769487876e-07,
      "loss": -0.0015,
      "num_tokens": 2118630031.0,
      "reward": 0.056640625,
      "reward_std": 0.11492595076560974,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 2230
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1815.0,
      "completions/mean_length": 1628.451171875,
      "completions/mean_terminated_length": 596.5878295898438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7616284031748741,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 42.09742917566184,
      "learning_rate": 2.476185196090862e-07,
      "loss": 0.0145,
      "num_tokens": 2119536438.0,
      "reward": 0.048828125,
      "reward_std": 0.05688370764255524,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 2231
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.775390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1958.0,
      "completions/mean_length": 1735.60546875,
      "completions/mean_terminated_length": 657.6956176757812,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7619697874882649,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 5.257373094538761,
      "learning_rate": 2.472216989055015e-07,
      "loss": 0.0237,
      "num_tokens": 2120497532.0,
      "reward": 0.037109375,
      "reward_std": 0.061668481677770615,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 2232
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.755859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1862.0,
      "completions/mean_length": 1700.455078125,
      "completions/mean_terminated_length": 624.4560546875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7623111718016558,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 89.81193074808552,
      "learning_rate": 2.468253079403362e-07,
      "loss": 0.0062,
      "num_tokens": 2121458517.0,
      "reward": 0.060546875,
      "reward_std": 0.10271549224853516,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 2233
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1999.0,
      "completions/mean_length": 1619.662109375,
      "completions/mean_terminated_length": 642.173095703125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7626525561150466,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 16.615257359413665,
      "learning_rate": 2.464293472761948e-07,
      "loss": -0.0042,
      "num_tokens": 2122369640.0,
      "reward": 0.037109375,
      "reward_std": 0.08537977933883667,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 2234
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.70703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1542.0,
      "completions/mean_length": 1611.814453125,
      "completions/mean_terminated_length": 559.1533203125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7629939404284373,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 12.452921701009723,
      "learning_rate": 2.460338174750713e-07,
      "loss": 0.0235,
      "num_tokens": 2123273241.0,
      "reward": 0.076171875,
      "reward_std": 0.1247420534491539,
      "rewards/accuracy_reward/mean": 0.0786290317773819,
      "rewards/accuracy_reward/std": 0.26943066716194153,
      "step": 2235
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.708984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1860.0,
      "completions/mean_length": 1633.9765625,
      "completions/mean_terminated_length": 625.6778564453125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7633353247418281,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 61.45202360904577,
      "learning_rate": 2.4563871909834755e-07,
      "loss": 0.0027,
      "num_tokens": 2124195325.0,
      "reward": 0.052734375,
      "reward_std": 0.0881948471069336,
      "rewards/accuracy_reward/mean": 0.05443548411130905,
      "rewards/accuracy_reward/std": 0.227104052901268,
      "step": 2236
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.669921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1825.0,
      "completions/mean_length": 1561.017578125,
      "completions/mean_terminated_length": 572.6449584960938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7636767090552189,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 29.91338125473617,
      "learning_rate": 2.4524405270679386e-07,
      "loss": -0.0119,
      "num_tokens": 2125075142.0,
      "reward": 0.091796875,
      "reward_std": 0.12402814626693726,
      "rewards/accuracy_reward/mean": 0.091796875,
      "rewards/accuracy_reward/std": 0.289021372795105,
      "step": 2237
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.689453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2013.0,
      "completions/mean_length": 1587.421875,
      "completions/mean_terminated_length": 564.8804931640625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.7640180933686097,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 54.1485772617581,
      "learning_rate": 2.4484981886056647e-07,
      "loss": 0.0221,
      "num_tokens": 2125970958.0,
      "reward": 0.09375,
      "reward_std": 0.11018751561641693,
      "rewards/accuracy_reward/mean": 0.09375,
      "rewards/accuracy_reward/std": 0.29176566004753113,
      "step": 2238
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.681640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1799.0,
      "completions/mean_length": 1620.25390625,
      "completions/mean_terminated_length": 704.4049072265625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7643594776820005,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 93.1376682315806,
      "learning_rate": 2.444560181192087e-07,
      "loss": 0.0118,
      "num_tokens": 2126874496.0,
      "reward": 0.052734375,
      "reward_std": 0.08505964279174805,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 2239
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.662109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2015.0,
      "completions/mean_length": 1562.755859375,
      "completions/mean_terminated_length": 611.9017333984375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7647008619953913,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 24.055875642354074,
      "learning_rate": 2.4406265104164814e-07,
      "loss": -0.0064,
      "num_tokens": 2127754371.0,
      "reward": 0.060546875,
      "reward_std": 0.10672760009765625,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 2240
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.751953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1968.0,
      "completions/mean_length": 1684.4375,
      "completions/mean_terminated_length": 582.2991943359375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7650422463087821,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 71.7981845041056,
      "learning_rate": 2.4366971818619785e-07,
      "loss": 0.0048,
      "num_tokens": 2128689859.0,
      "reward": 0.033203125,
      "reward_std": 0.06975477933883667,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 2241
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1929.0,
      "completions/mean_length": 1699.46875,
      "completions/mean_terminated_length": 631.74609375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.765383630622173,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 24.872947986499543,
      "learning_rate": 2.4327722011055407e-07,
      "loss": 0.0042,
      "num_tokens": 2129642835.0,
      "reward": 0.087890625,
      "reward_std": 0.1310567855834961,
      "rewards/accuracy_reward/mean": 0.087890625,
      "rewards/accuracy_reward/std": 0.2834126651287079,
      "step": 2242
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.72265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1887.0,
      "completions/mean_length": 1641.755859375,
      "completions/mean_terminated_length": 583.2323608398438,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7657250149355637,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 15.046160899195035,
      "learning_rate": 2.428851573717961e-07,
      "loss": 0.0147,
      "num_tokens": 2130561974.0,
      "reward": 0.01953125,
      "reward_std": 0.06085042655467987,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 2243
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1763.599609375,
      "completions/mean_terminated_length": 700.1851806640625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7660663992489545,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 41.293665093029254,
      "learning_rate": 2.42493530526385e-07,
      "loss": 0.0042,
      "num_tokens": 2131550185.0,
      "reward": 0.046875,
      "reward_std": 0.1049705296754837,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 2244
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.666015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1940.0,
      "completions/mean_length": 1577.033203125,
      "completions/mean_terminated_length": 637.8538208007812,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7664077835623453,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 11.66245887058529,
      "learning_rate": 2.421023401301636e-07,
      "loss": 0.0052,
      "num_tokens": 2132438378.0,
      "reward": 0.025390625,
      "reward_std": 0.06755761057138443,
      "rewards/accuracy_reward/mean": 0.026209676638245583,
      "rewards/accuracy_reward/std": 0.1599196344614029,
      "step": 2245
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.740234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 1678.810546875,
      "completions/mean_terminated_length": 627.481201171875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7667491678757361,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 41.12605647594873,
      "learning_rate": 2.417115867383553e-07,
      "loss": 0.0109,
      "num_tokens": 2133386265.0,
      "reward": 0.060546875,
      "reward_std": 0.12254488468170166,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 2246
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1909.0,
      "completions/mean_length": 1580.08984375,
      "completions/mean_terminated_length": 531.7341918945312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7670905521891269,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 17.17961463986252,
      "learning_rate": 2.4132127090556265e-07,
      "loss": 0.0057,
      "num_tokens": 2134272871.0,
      "reward": 0.046875,
      "reward_std": 0.0687694400548935,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 2247
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.751953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1881.0,
      "completions/mean_length": 1694.26171875,
      "completions/mean_terminated_length": 622.3228149414062,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.7674319365025177,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 62.936927524265975,
      "learning_rate": 2.4093139318576793e-07,
      "loss": 0.0311,
      "num_tokens": 2135221981.0,
      "reward": 0.078125,
      "reward_std": 0.08098085969686508,
      "rewards/accuracy_reward/mean": 0.078125,
      "rewards/accuracy_reward/std": 0.26863065361976624,
      "step": 2248
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1898.0,
      "completions/mean_length": 1594.37109375,
      "completions/mean_terminated_length": 559.1666870117188,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7677733208159085,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 101.35861448461155,
      "learning_rate": 2.405419541323314e-07,
      "loss": 0.0205,
      "num_tokens": 2136115739.0,
      "reward": 0.05078125,
      "reward_std": 0.11585800349712372,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 2249
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.708984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2031.0,
      "completions/mean_length": 1642.927734375,
      "completions/mean_terminated_length": 656.4563598632812,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7681147051292994,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 49.48775176084035,
      "learning_rate": 2.4015295429799e-07,
      "loss": 0.0095,
      "num_tokens": 2137032454.0,
      "reward": 0.03125,
      "reward_std": 0.07795868813991547,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 2250
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.716796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1996.0,
      "completions/mean_length": 1652.755859375,
      "completions/mean_terminated_length": 652.3793334960938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7684560894426901,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 10.934399068283161,
      "learning_rate": 2.397643942348584e-07,
      "loss": 0.0237,
      "num_tokens": 2137955817.0,
      "reward": 0.0390625,
      "reward_std": 0.06068410724401474,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 2251
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1639.0,
      "completions/mean_length": 1675.205078125,
      "completions/mean_terminated_length": 602.007568359375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7687974737560809,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 8.562930842663347,
      "learning_rate": 2.393762744944261e-07,
      "loss": 0.0029,
      "num_tokens": 2138893970.0,
      "reward": 0.021484375,
      "reward_std": 0.05341683328151703,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 2252
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.740234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1996.0,
      "completions/mean_length": 1689.5,
      "completions/mean_terminated_length": 667.9097900390625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.7691388580694717,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 30.13805288560358,
      "learning_rate": 2.389885956275585e-07,
      "loss": 0.0108,
      "num_tokens": 2139847634.0,
      "reward": 0.037109375,
      "reward_std": 0.08780203759670258,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 2253
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2026.0,
      "completions/mean_length": 1621.458984375,
      "completions/mean_terminated_length": 683.0687866210938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7694802423828625,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 38.450479179537155,
      "learning_rate": 2.386013581844945e-07,
      "loss": 0.0146,
      "num_tokens": 2140755613.0,
      "reward": 0.04296875,
      "reward_std": 0.06469620764255524,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 2254
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1916.0,
      "completions/mean_length": 1580.37890625,
      "completions/mean_terminated_length": 551.6124877929688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7698216266962533,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 6.423560684812626,
      "learning_rate": 2.3821456271484704e-07,
      "loss": 0.0256,
      "num_tokens": 2141633007.0,
      "reward": 0.04296875,
      "reward_std": 0.09486784040927887,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 2255
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.681640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1816.0,
      "completions/mean_length": 1614.126953125,
      "completions/mean_terminated_length": 685.1594848632812,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7701630110096441,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 31.229647576363117,
      "learning_rate": 2.3782820976760153e-07,
      "loss": 0.0115,
      "num_tokens": 2142540320.0,
      "reward": 0.05078125,
      "reward_std": 0.09671792387962341,
      "rewards/accuracy_reward/mean": 0.052419353276491165,
      "rewards/accuracy_reward/std": 0.22309619188308716,
      "step": 2256
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.794921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1864.0,
      "completions/mean_length": 1735.603515625,
      "completions/mean_terminated_length": 524.6952514648438,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7705043953230349,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 94.13466649496164,
      "learning_rate": 2.374422998911151e-07,
      "loss": -0.0047,
      "num_tokens": 2143504469.0,
      "reward": 0.033203125,
      "reward_std": 0.07586899399757385,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 2257
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.708984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2020.0,
      "completions/mean_length": 1626.51953125,
      "completions/mean_terminated_length": 599.6912841796875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7708457796364258,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 26.25394798125014,
      "learning_rate": 2.3705683363311656e-07,
      "loss": 0.0085,
      "num_tokens": 2144411375.0,
      "reward": 0.048828125,
      "reward_std": 0.06536141037940979,
      "rewards/accuracy_reward/mean": 0.05040322616696358,
      "rewards/accuracy_reward/std": 0.21899642050266266,
      "step": 2258
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1992.0,
      "completions/mean_length": 1566.763671875,
      "completions/mean_terminated_length": 648.039794921875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7711871639498165,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 116.8396280220696,
      "learning_rate": 2.3667181154070443e-07,
      "loss": 0.0289,
      "num_tokens": 2145281910.0,
      "reward": 0.056640625,
      "reward_std": 0.11036626249551773,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 2259
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.77734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1947.0,
      "completions/mean_length": 1728.2890625,
      "completions/mean_terminated_length": 612.1052856445312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7715285482632073,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 13.737625679621692,
      "learning_rate": 2.3628723416034742e-07,
      "loss": 0.0133,
      "num_tokens": 2146241626.0,
      "reward": 0.064453125,
      "reward_std": 0.11982375383377075,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 2260
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.751953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1962.0,
      "completions/mean_length": 1688.43359375,
      "completions/mean_terminated_length": 598.409423828125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7718699325765981,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 7.1005908054538525,
      "learning_rate": 2.359031020378827e-07,
      "loss": 0.0258,
      "num_tokens": 2147174296.0,
      "reward": 0.04296875,
      "reward_std": 0.09392979741096497,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 2261
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2002.0,
      "completions/mean_length": 1648.703125,
      "completions/mean_terminated_length": 720.467529296875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7722113168899889,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 45.68550030411381,
      "learning_rate": 2.3551941571851534e-07,
      "loss": 0.0112,
      "num_tokens": 2148090000.0,
      "reward": 0.025390625,
      "reward_std": 0.06943464279174805,
      "rewards/accuracy_reward/mean": 0.026209676638245583,
      "rewards/accuracy_reward/std": 0.1599196493625641,
      "step": 2262
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.763671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1873.0,
      "completions/mean_length": 1718.291015625,
      "completions/mean_terminated_length": 653.322265625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7725527012033797,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 38.33644412997567,
      "learning_rate": 2.3513617574681828e-07,
      "loss": 0.0121,
      "num_tokens": 2149041413.0,
      "reward": 0.015625,
      "reward_std": 0.051659777760505676,
      "rewards/accuracy_reward/mean": 0.015625,
      "rewards/accuracy_reward/std": 0.12414088100194931,
      "step": 2263
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.701171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1978.0,
      "completions/mean_length": 1625.869140625,
      "completions/mean_terminated_length": 635.3790893554688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7728940855167705,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 18.03017966919862,
      "learning_rate": 2.347533826667302e-07,
      "loss": 0.0077,
      "num_tokens": 2149947922.0,
      "reward": 0.037109375,
      "reward_std": 0.046437256038188934,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 2264
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.73046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1754.0,
      "completions/mean_length": 1663.490234375,
      "completions/mean_terminated_length": 621.4130249023438,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7732354698301613,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 20.149293631711878,
      "learning_rate": 2.3437103702155617e-07,
      "loss": 0.0083,
      "num_tokens": 2150869933.0,
      "reward": 0.10546875,
      "reward_std": 0.11780644953250885,
      "rewards/accuracy_reward/mean": 0.10546875,
      "rewards/accuracy_reward/std": 0.3074568510055542,
      "step": 2265
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.693359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1889.0,
      "completions/mean_length": 1570.021484375,
      "completions/mean_terminated_length": 489.2420349121094,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7735768541435522,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 48.9125508012108,
      "learning_rate": 2.339891393539656e-07,
      "loss": 0.0292,
      "num_tokens": 2151746504.0,
      "reward": 0.083984375,
      "reward_std": 0.10188952088356018,
      "rewards/accuracy_reward/mean": 0.083984375,
      "rewards/accuracy_reward/std": 0.2776356339454651,
      "step": 2266
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.732421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1686.423828125,
      "completions/mean_terminated_length": 696.7080078125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7739182384569429,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 22.127961104248037,
      "learning_rate": 2.336076902059927e-07,
      "loss": 0.0101,
      "num_tokens": 2152687441.0,
      "reward": 0.0546875,
      "reward_std": 0.08812816441059113,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 2267
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.779296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1917.0,
      "completions/mean_length": 1727.140625,
      "completions/mean_terminated_length": 594.1947021484375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7742596227703337,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 76.19844901662259,
      "learning_rate": 2.3322669011903461e-07,
      "loss": 0.0021,
      "num_tokens": 2153647369.0,
      "reward": 0.052734375,
      "reward_std": 0.0971580371260643,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 2268
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.775390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1974.0,
      "completions/mean_length": 1732.42578125,
      "completions/mean_terminated_length": 643.0086669921875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7746010070837245,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 43.14853579607224,
      "learning_rate": 2.3284613963385113e-07,
      "loss": 0.0062,
      "num_tokens": 2154610899.0,
      "reward": 0.0390625,
      "reward_std": 0.09413031488656998,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 2269
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1782.0,
      "completions/mean_length": 1716.205078125,
      "completions/mean_terminated_length": 632.3416748046875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7749423913971153,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 68.43877699446037,
      "learning_rate": 2.3246603929056435e-07,
      "loss": 0.0086,
      "num_tokens": 2155563580.0,
      "reward": 0.0859375,
      "reward_std": 0.11533089727163315,
      "rewards/accuracy_reward/mean": 0.0859375,
      "rewards/accuracy_reward/std": 0.28054583072662354,
      "step": 2270
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 1644.640625,
      "completions/mean_terminated_length": 572.8571166992188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7752837757105061,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 12.858585896342689,
      "learning_rate": 2.320863896286569e-07,
      "loss": -0.0009,
      "num_tokens": 2156481476.0,
      "reward": 0.03125,
      "reward_std": 0.031083684414625168,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 2271
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.689453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1823.0,
      "completions/mean_length": 1619.005859375,
      "completions/mean_terminated_length": 666.5848999023438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7756251600238969,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 44.803110703627276,
      "learning_rate": 2.3170719118697228e-07,
      "loss": 0.0069,
      "num_tokens": 2157384711.0,
      "reward": 0.048828125,
      "reward_std": 0.0815330371260643,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 2272
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.732421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1614.0,
      "completions/mean_length": 1659.51171875,
      "completions/mean_terminated_length": 596.1314086914062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7759665443372877,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 59.62908370981753,
      "learning_rate": 2.3132844450371314e-07,
      "loss": 0.0135,
      "num_tokens": 2158306685.0,
      "reward": 0.052734375,
      "reward_std": 0.09585316479206085,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 2273
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1978.0,
      "completions/mean_length": 1629.611328125,
      "completions/mean_terminated_length": 656.9935302734375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7763079286506785,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 12.710164054319664,
      "learning_rate": 2.3095015011644128e-07,
      "loss": 0.0198,
      "num_tokens": 2159220406.0,
      "reward": 0.01953125,
      "reward_std": 0.05243149772286415,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 2274
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 1696.1484375,
      "completions/mean_terminated_length": 814.1095581054688,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7766493129640692,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 16.162100384017894,
      "learning_rate": 2.3057230856207633e-07,
      "loss": 0.0133,
      "num_tokens": 2160163202.0,
      "reward": 0.03125,
      "reward_std": 0.053257472813129425,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 2275
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1984.0,
      "completions/mean_length": 1649.50390625,
      "completions/mean_terminated_length": 650.5342407226562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7769906972774601,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 69.65292993387105,
      "learning_rate": 2.3019492037689518e-07,
      "loss": 0.0034,
      "num_tokens": 2161081620.0,
      "reward": 0.0859375,
      "reward_std": 0.07642117142677307,
      "rewards/accuracy_reward/mean": 0.0859375,
      "rewards/accuracy_reward/std": 0.28054583072662354,
      "step": 2276
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 1604.58203125,
      "completions/mean_terminated_length": 514.0135498046875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7773320815908509,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 13.4489645987364,
      "learning_rate": 2.2981798609653148e-07,
      "loss": 0.0078,
      "num_tokens": 2161973998.0,
      "reward": 0.0625,
      "reward_std": 0.09060370922088623,
      "rewards/accuracy_reward/mean": 0.06451612710952759,
      "rewards/accuracy_reward/std": 0.2459181249141693,
      "step": 2277
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.748046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2024.0,
      "completions/mean_length": 1691.734375,
      "completions/mean_terminated_length": 633.9844970703125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7776734659042417,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 7.901140110434337,
      "learning_rate": 2.294415062559743e-07,
      "loss": -0.0011,
      "num_tokens": 2162918230.0,
      "reward": 0.015625,
      "reward_std": 0.03344620764255524,
      "rewards/accuracy_reward/mean": 0.015625,
      "rewards/accuracy_reward/std": 0.12414088100194931,
      "step": 2278
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.689453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2015.0,
      "completions/mean_length": 1624.31640625,
      "completions/mean_terminated_length": 683.6854858398438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7780148502176325,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 55.95168223664823,
      "learning_rate": 2.2906548138956815e-07,
      "loss": -0.0199,
      "num_tokens": 2163827736.0,
      "reward": 0.052734375,
      "reward_std": 0.06260748207569122,
      "rewards/accuracy_reward/mean": 0.05624999850988388,
      "rewards/accuracy_reward/std": 0.23064424097537994,
      "step": 2279
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.708984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1591.580078125,
      "completions/mean_terminated_length": 479.6308898925781,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7783562345310233,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 64.12161728163721,
      "learning_rate": 2.2868991203101145e-07,
      "loss": -0.0078,
      "num_tokens": 2164723505.0,
      "reward": 0.060546875,
      "reward_std": 0.10123127698898315,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 2280
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.662109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1587.568359375,
      "completions/mean_terminated_length": 685.3352661132812,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.7786976188444141,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 184.79244638455157,
      "learning_rate": 2.283147987133561e-07,
      "loss": 0.0092,
      "num_tokens": 2165608084.0,
      "reward": 0.048828125,
      "reward_std": 0.06661957502365112,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 2281
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.708984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1792.0,
      "completions/mean_length": 1607.267578125,
      "completions/mean_terminated_length": 533.5369262695312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7790390031578049,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 21.309914748058596,
      "learning_rate": 2.2794014196900704e-07,
      "loss": 0.0192,
      "num_tokens": 2166513277.0,
      "reward": 0.072265625,
      "reward_std": 0.08285094052553177,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 2282
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.736328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1938.0,
      "completions/mean_length": 1688.755859375,
      "completions/mean_terminated_length": 685.5333251953125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7793803874711956,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 25.328743280804456,
      "learning_rate": 2.275659423297208e-07,
      "loss": 0.0113,
      "num_tokens": 2167456480.0,
      "reward": 0.041015625,
      "reward_std": 0.07085912674665451,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 2283
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.759765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1768.0,
      "completions/mean_length": 1703.51171875,
      "completions/mean_terminated_length": 614.032470703125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7797217717845865,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 8.291370927237596,
      "learning_rate": 2.2719220032660553e-07,
      "loss": 0.0052,
      "num_tokens": 2168403238.0,
      "reward": 0.029296875,
      "reward_std": 0.06271954625844955,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 2284
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.681640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1755.0,
      "completions/mean_length": 1580.572265625,
      "completions/mean_terminated_length": 579.7607421875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7800631560979773,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 12.515069183193397,
      "learning_rate": 2.2681891649011942e-07,
      "loss": -0.0059,
      "num_tokens": 2169291307.0,
      "reward": 0.05859375,
      "reward_std": 0.08109388500452042,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 2285
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.720703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2009.0,
      "completions/mean_length": 1670.35546875,
      "completions/mean_terminated_length": 695.8741455078125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7804045404113681,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 24.44283840250339,
      "learning_rate": 2.2644609135007088e-07,
      "loss": 0.0003,
      "num_tokens": 2170220593.0,
      "reward": 0.009765625,
      "reward_std": 0.0390625,
      "rewards/accuracy_reward/mean": 0.009765625,
      "rewards/accuracy_reward/std": 0.09843364357948303,
      "step": 2286
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1858.0,
      "completions/mean_length": 1630.17578125,
      "completions/mean_terminated_length": 658.8701171875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7807459247247589,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 26.817337426239106,
      "learning_rate": 2.2607372543561681e-07,
      "loss": 0.005,
      "num_tokens": 2171129419.0,
      "reward": 0.0234375,
      "reward_std": 0.06657323241233826,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 2287
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.755859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1711.0,
      "completions/mean_length": 1677.357421875,
      "completions/mean_terminated_length": 529.8480224609375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7810873090381497,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 7.395331565369919,
      "learning_rate": 2.257018192752625e-07,
      "loss": 0.02,
      "num_tokens": 2172061730.0,
      "reward": 0.041015625,
      "reward_std": 0.08329563587903976,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 2288
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.697265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1874.0,
      "completions/mean_length": 1613.021484375,
      "completions/mean_terminated_length": 611.167724609375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7814286933515405,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 23.689557717830898,
      "learning_rate": 2.2533037339686085e-07,
      "loss": 0.0187,
      "num_tokens": 2172971261.0,
      "reward": 0.0390625,
      "reward_std": 0.05864076316356659,
      "rewards/accuracy_reward/mean": 0.04032257944345474,
      "rewards/accuracy_reward/std": 0.19691328704357147,
      "step": 2289
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1857.0,
      "completions/mean_length": 1703.431640625,
      "completions/mean_terminated_length": 669.7265625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7817700776649313,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 36.07385360632374,
      "learning_rate": 2.2495938832761114e-07,
      "loss": -0.0062,
      "num_tokens": 2173924714.0,
      "reward": 0.046875,
      "reward_std": 0.06574726849794388,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 2290
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.654296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1962.0,
      "completions/mean_length": 1542.609375,
      "completions/mean_terminated_length": 586.0791015625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.782111461978322,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 67.19638597512645,
      "learning_rate": 2.245888645940591e-07,
      "loss": -0.0012,
      "num_tokens": 2174784658.0,
      "reward": 0.103515625,
      "reward_std": 0.13837699592113495,
      "rewards/accuracy_reward/mean": 0.103515625,
      "rewards/accuracy_reward/std": 0.30492907762527466,
      "step": 2291
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.798828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1886.0,
      "completions/mean_length": 1761.84765625,
      "completions/mean_terminated_length": 625.5728149414062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7824528462917129,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 16.926319847930557,
      "learning_rate": 2.2421880272209524e-07,
      "loss": 0.0024,
      "num_tokens": 2175768132.0,
      "reward": 0.017578125,
      "reward_std": 0.04461899772286415,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "step": 2292
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.736328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2023.0,
      "completions/mean_length": 1682.759765625,
      "completions/mean_terminated_length": 662.7926025390625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7827942306051037,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 137.8500087942121,
      "learning_rate": 2.238492032369546e-07,
      "loss": 0.0058,
      "num_tokens": 2176710217.0,
      "reward": 0.07421875,
      "reward_std": 0.10890313982963562,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "step": 2293
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.677734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1998.0,
      "completions/mean_length": 1578.267578125,
      "completions/mean_terminated_length": 590.4060668945312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7831356149184945,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 29.109965465128127,
      "learning_rate": 2.2348006666321633e-07,
      "loss": 0.0247,
      "num_tokens": 2177587650.0,
      "reward": 0.05859375,
      "reward_std": 0.10270209610462189,
      "rewards/accuracy_reward/mean": 0.060483869165182114,
      "rewards/accuracy_reward/std": 0.2386218160390854,
      "step": 2294
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.708984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1851.0,
      "completions/mean_length": 1632.9375,
      "completions/mean_terminated_length": 621.7449951171875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.7834769992318853,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 23.843420872438283,
      "learning_rate": 2.2311139352480196e-07,
      "loss": 0.0128,
      "num_tokens": 2178500722.0,
      "reward": 0.01953125,
      "reward_std": 0.04318207502365112,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 2295
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.74609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2019.0,
      "completions/mean_length": 1679.150390625,
      "completions/mean_terminated_length": 595.2999877929688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7838183835452761,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 51.081935887958686,
      "learning_rate": 2.227431843449759e-07,
      "loss": 0.003,
      "num_tokens": 2179438847.0,
      "reward": 0.091796875,
      "reward_std": 0.11261671781539917,
      "rewards/accuracy_reward/mean": 0.091796875,
      "rewards/accuracy_reward/std": 0.289021372795105,
      "step": 2296
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.744140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1859.0,
      "completions/mean_length": 1701.13671875,
      "completions/mean_terminated_length": 692.3206176757812,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7841597678586669,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 57.558526105451904,
      "learning_rate": 2.2237543964634343e-07,
      "loss": 0.0048,
      "num_tokens": 2180388837.0,
      "reward": 0.0859375,
      "reward_std": 0.09561356902122498,
      "rewards/accuracy_reward/mean": 0.0859375,
      "rewards/accuracy_reward/std": 0.28054583072662354,
      "step": 2297
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.73046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2022.0,
      "completions/mean_length": 1675.9453125,
      "completions/mean_terminated_length": 667.6232299804688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7845011521720577,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 92.13754857222388,
      "learning_rate": 2.2200815995085132e-07,
      "loss": 0.0096,
      "num_tokens": 2181333417.0,
      "reward": 0.064453125,
      "reward_std": 0.11182428896427155,
      "rewards/accuracy_reward/mean": 0.06653226166963577,
      "rewards/accuracy_reward/std": 0.2494617998600006,
      "step": 2298
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.755859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 1709.51171875,
      "completions/mean_terminated_length": 661.552001953125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7848425364854484,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 134.59491057686972,
      "learning_rate": 2.2164134577978528e-07,
      "loss": 0.0214,
      "num_tokens": 2182285583.0,
      "reward": 0.080078125,
      "reward_std": 0.1344509869813919,
      "rewards/accuracy_reward/mean": 0.080078125,
      "rewards/accuracy_reward/std": 0.271679550409317,
      "step": 2299
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2017.0,
      "completions/mean_length": 1689.658203125,
      "completions/mean_terminated_length": 591.8809814453125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.7851839207988393,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 41.889077917804855,
      "learning_rate": 2.2127499765377133e-07,
      "loss": 0.0004,
      "num_tokens": 2183230864.0,
      "reward": 0.0234375,
      "reward_std": 0.05012226849794388,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 2300
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.775390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2025.0,
      "completions/mean_length": 1727.33203125,
      "completions/mean_terminated_length": 620.3303833007812,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7855253051122301,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 3.948665500334967,
      "learning_rate": 2.2090911609277375e-07,
      "loss": 0.014,
      "num_tokens": 2184196538.0,
      "reward": 0.01953125,
      "reward_std": 0.05001020431518555,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 2301
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.728515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1996.0,
      "completions/mean_length": 1665.33203125,
      "completions/mean_terminated_length": 638.798583984375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7858666894256209,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 44.789119092323645,
      "learning_rate": 2.2054370161609415e-07,
      "loss": 0.0061,
      "num_tokens": 2185125012.0,
      "reward": 0.11328125,
      "reward_std": 0.14328739047050476,
      "rewards/accuracy_reward/mean": 0.11328125,
      "rewards/accuracy_reward/std": 0.3172462284564972,
      "step": 2302
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.720703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1978.0,
      "completions/mean_length": 1684.41796875,
      "completions/mean_terminated_length": 746.2237548828125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7862080737390117,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 23.803980615278903,
      "learning_rate": 2.201787547423719e-07,
      "loss": 0.0157,
      "num_tokens": 2186064394.0,
      "reward": 0.056640625,
      "reward_std": 0.10541516542434692,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 2303
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.701171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1820.0,
      "completions/mean_length": 1607.771484375,
      "completions/mean_terminated_length": 574.8170166015625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7865494580524025,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 10.543524482588994,
      "learning_rate": 2.19814275989582e-07,
      "loss": 0.0174,
      "num_tokens": 2186962181.0,
      "reward": 0.0703125,
      "reward_std": 0.08177263289690018,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 2304
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.740234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1729.0,
      "completions/mean_length": 1666.556640625,
      "completions/mean_terminated_length": 579.5864868164062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7868908423657933,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 33.94891767565079,
      "learning_rate": 2.1945026587503578e-07,
      "loss": 0.0481,
      "num_tokens": 2187889634.0,
      "reward": 0.068359375,
      "reward_std": 0.12241136282682419,
      "rewards/accuracy_reward/mean": 0.07056451588869095,
      "rewards/accuracy_reward/std": 0.25635457038879395,
      "step": 2305
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.67578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1684.0,
      "completions/mean_length": 1557.46875,
      "completions/mean_terminated_length": 535.0361328125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7872322266791841,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 18.71055783535664,
      "learning_rate": 2.1908672491537854e-07,
      "loss": 0.0085,
      "num_tokens": 2188764194.0,
      "reward": 0.046875,
      "reward_std": 0.06574726849794388,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 2306
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1957.0,
      "completions/mean_length": 1718.74609375,
      "completions/mean_terminated_length": 594.7413940429688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.787573610992575,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 88.38607462571197,
      "learning_rate": 2.187236536265904e-07,
      "loss": 0.0085,
      "num_tokens": 2189722112.0,
      "reward": 0.095703125,
      "reward_std": 0.13055095076560974,
      "rewards/accuracy_reward/mean": 0.095703125,
      "rewards/accuracy_reward/std": 0.2944713830947876,
      "step": 2307
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.67578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 1555.216796875,
      "completions/mean_terminated_length": 528.09033203125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7879149953059656,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 88.92295878105215,
      "learning_rate": 2.1836105252398483e-07,
      "loss": 0.0263,
      "num_tokens": 2190596991.0,
      "reward": 0.033203125,
      "reward_std": 0.08082009106874466,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 2308
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.748046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1990.0,
      "completions/mean_length": 1691.08984375,
      "completions/mean_terminated_length": 631.4263305664062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7882563796193565,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 74.41291579298706,
      "learning_rate": 2.1799892212220745e-07,
      "loss": 0.0093,
      "num_tokens": 2191538669.0,
      "reward": 0.068359375,
      "reward_std": 0.11104501038789749,
      "rewards/accuracy_reward/mean": 0.068359375,
      "rewards/accuracy_reward/std": 0.25260838866233826,
      "step": 2309
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1717.0,
      "completions/mean_length": 1595.52734375,
      "completions/mean_terminated_length": 562.9615478515625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7885977639327473,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 28.333364814729112,
      "learning_rate": 2.1763726293523642e-07,
      "loss": 0.0204,
      "num_tokens": 2192430171.0,
      "reward": 0.080078125,
      "reward_std": 0.11845915019512177,
      "rewards/accuracy_reward/mean": 0.080078125,
      "rewards/accuracy_reward/std": 0.271679550409317,
      "step": 2310
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.720703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1990.0,
      "completions/mean_length": 1644.87109375,
      "completions/mean_terminated_length": 604.6293334960938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7889391482461381,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 123.15233850278845,
      "learning_rate": 2.1727607547638073e-07,
      "loss": 0.0035,
      "num_tokens": 2193348697.0,
      "reward": 0.0703125,
      "reward_std": 0.08126020431518555,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 2311
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.740234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1940.0,
      "completions/mean_length": 1680.373046875,
      "completions/mean_terminated_length": 632.7744750976562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.7892805325595289,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 50.89979454443745,
      "learning_rate": 2.1691536025827982e-07,
      "loss": -0.0095,
      "num_tokens": 2194291704.0,
      "reward": 0.033203125,
      "reward_std": 0.05628519132733345,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 2312
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.748046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1948.0,
      "completions/mean_length": 1672.73046875,
      "completions/mean_terminated_length": 558.5581665039062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7896219168729197,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 106.32848149359123,
      "learning_rate": 2.1655511779290285e-07,
      "loss": 0.0014,
      "num_tokens": 2195229534.0,
      "reward": 0.01953125,
      "reward_std": 0.04192390665411949,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 2313
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.78125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2017.0,
      "completions/mean_length": 1740.400390625,
      "completions/mean_terminated_length": 641.8303833007812,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7899633011863105,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 33.28195273694159,
      "learning_rate": 2.161953485915483e-07,
      "loss": 0.0168,
      "num_tokens": 2196196347.0,
      "reward": 0.07421875,
      "reward_std": 0.09586012363433838,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "step": 2314
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1859.0,
      "completions/mean_length": 1637.08203125,
      "completions/mean_terminated_length": 586.9583129882812,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7903046854997013,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 5.026846147922716,
      "learning_rate": 2.1583605316484286e-07,
      "loss": 0.0075,
      "num_tokens": 2197105093.0,
      "reward": 0.056640625,
      "reward_std": 0.08130794763565063,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 2315
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1981.0,
      "completions/mean_length": 1832.8359375,
      "completions/mean_terminated_length": 796.1364135742188,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.790646069813092,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 52.12857852157952,
      "learning_rate": 2.1547723202274039e-07,
      "loss": 0.0142,
      "num_tokens": 2198121121.0,
      "reward": 0.029296875,
      "reward_std": 0.07014618813991547,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 2316
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.716796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1909.0,
      "completions/mean_length": 1651.078125,
      "completions/mean_terminated_length": 646.4552001953125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.7909874541264829,
      "frac_reward_zero_std": 0.53125,
      "grad_norm": 16.692409896267765,
      "learning_rate": 2.1511888567452224e-07,
      "loss": 0.0215,
      "num_tokens": 2199041225.0,
      "reward": 0.05078125,
      "reward_std": 0.1443656086921692,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 2317
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.681640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2027.0,
      "completions/mean_length": 1592.9921875,
      "completions/mean_terminated_length": 619.0858764648438,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.7913288384398737,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 73.11480541739448,
      "learning_rate": 2.1476101462879504e-07,
      "loss": 0.0131,
      "num_tokens": 2199936213.0,
      "reward": 0.07421875,
      "reward_std": 0.11482083797454834,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "step": 2318
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.806640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1903.0,
      "completions/mean_length": 1792.884765625,
      "completions/mean_terminated_length": 728.6161499023438,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7916702227532645,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 32.51115316340324,
      "learning_rate": 2.1440361939349166e-07,
      "loss": 0.0078,
      "num_tokens": 2200935258.0,
      "reward": 0.01953125,
      "reward_std": 0.050948239862918854,
      "rewards/accuracy_reward/mean": 0.02016128972172737,
      "rewards/accuracy_reward/std": 0.14069372415542603,
      "step": 2319
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2009.0,
      "completions/mean_length": 1695.857421875,
      "completions/mean_terminated_length": 593.991943359375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7920116070666553,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 14.80027609902641,
      "learning_rate": 2.1404670047586905e-07,
      "loss": 0.0155,
      "num_tokens": 2201868705.0,
      "reward": 0.044921875,
      "reward_std": 0.09126890450716019,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 2320
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.728515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2012.0,
      "completions/mean_length": 1672.931640625,
      "completions/mean_terminated_length": 666.4532470703125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7923529913800461,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 11.825715894944935,
      "learning_rate": 2.1369025838250847e-07,
      "loss": 0.0277,
      "num_tokens": 2202803022.0,
      "reward": 0.07421875,
      "reward_std": 0.12097585201263428,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "step": 2321
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.794921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1735.0,
      "completions/mean_length": 1759.318359375,
      "completions/mean_terminated_length": 640.3333740234375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7926943756934369,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 23.739636668733542,
      "learning_rate": 2.1333429361931412e-07,
      "loss": 0.0195,
      "num_tokens": 2203780081.0,
      "reward": 0.033203125,
      "reward_std": 0.06018522381782532,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 2322
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1982.0,
      "completions/mean_length": 1642.646484375,
      "completions/mean_terminated_length": 645.6959838867188,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.7930357600068277,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 1.9998989742191888,
      "learning_rate": 2.1297880669151315e-07,
      "loss": 0.008,
      "num_tokens": 2204691932.0,
      "reward": 0.052734375,
      "reward_std": 0.06997986882925034,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 2323
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.759765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1713.384765625,
      "completions/mean_terminated_length": 655.1300659179688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7933771443202184,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 59.33059729153684,
      "learning_rate": 2.1262379810365404e-07,
      "loss": -0.0015,
      "num_tokens": 2205647505.0,
      "reward": 0.064453125,
      "reward_std": 0.07737371325492859,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 2324
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.775390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1905.0,
      "completions/mean_length": 1730.0234375,
      "completions/mean_terminated_length": 632.31298828125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7937185286336093,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 102.86449558097105,
      "learning_rate": 2.122692683596064e-07,
      "loss": 0.0153,
      "num_tokens": 2206608493.0,
      "reward": 0.048828125,
      "reward_std": 0.06371183693408966,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 2325
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.798828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1832.0,
      "completions/mean_length": 1763.619140625,
      "completions/mean_terminated_length": 634.378662109375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.7940599129470001,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 16.178142510863946,
      "learning_rate": 2.1191521796256067e-07,
      "loss": 0.018,
      "num_tokens": 2207588810.0,
      "reward": 0.08203125,
      "reward_std": 0.09629231691360474,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 2326
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.724609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1869.0,
      "completions/mean_length": 1652.4609375,
      "completions/mean_terminated_length": 611.71630859375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7944012972603909,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 69.64055666030006,
      "learning_rate": 2.1156164741502639e-07,
      "loss": 0.0146,
      "num_tokens": 2208508854.0,
      "reward": 0.095703125,
      "reward_std": 0.136918306350708,
      "rewards/accuracy_reward/mean": 0.095703125,
      "rewards/accuracy_reward/std": 0.2944713830947876,
      "step": 2327
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1853.0,
      "completions/mean_length": 1742.3046875,
      "completions/mean_terminated_length": 598.7777709960938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7947426815737817,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 21.912850176433427,
      "learning_rate": 2.1120855721883253e-07,
      "loss": 0.0021,
      "num_tokens": 2209486978.0,
      "reward": 0.0390625,
      "reward_std": 0.07289456576108932,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 2328
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.685546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1980.0,
      "completions/mean_length": 1561.826171875,
      "completions/mean_terminated_length": 501.9068298339844,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7950840658871725,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 12.448221811656028,
      "learning_rate": 2.1085594787512579e-07,
      "loss": 0.0296,
      "num_tokens": 2210368569.0,
      "reward": 0.0625,
      "reward_std": 0.1045791357755661,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 2329
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.775390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2038.0,
      "completions/mean_length": 1725.544921875,
      "completions/mean_terminated_length": 612.3739013671875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7954254502005633,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 54.44276330759143,
      "learning_rate": 2.105038198843707e-07,
      "loss": 0.008,
      "num_tokens": 2211318880.0,
      "reward": 0.0546875,
      "reward_std": 0.11587189882993698,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 2330
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.716796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1914.0,
      "completions/mean_length": 1633.37890625,
      "completions/mean_terminated_length": 583.9586181640625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7957668345139541,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 13.101626387600385,
      "learning_rate": 2.101521737463487e-07,
      "loss": 0.0254,
      "num_tokens": 2212234402.0,
      "reward": 0.021484375,
      "reward_std": 0.059305962175130844,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 2331
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1839.0,
      "completions/mean_length": 1721.04296875,
      "completions/mean_terminated_length": 629.3389892578125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7961082188273448,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 10.763200878778521,
      "learning_rate": 2.0980100996015694e-07,
      "loss": -0.0012,
      "num_tokens": 2213204824.0,
      "reward": 0.00390625,
      "reward_std": 0.015625,
      "rewards/accuracy_reward/mean": 0.00390625,
      "rewards/accuracy_reward/std": 0.06243881583213806,
      "step": 2332
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2002.0,
      "completions/mean_length": 1694.966796875,
      "completions/mean_terminated_length": 590.3145141601562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.7964496031407357,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 0.21259084836439932,
      "learning_rate": 2.094503290242084e-07,
      "loss": 0.003,
      "num_tokens": 2214156023.0,
      "reward": 0.02734375,
      "reward_std": 0.024649331346154213,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 2333
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.759765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1877.0,
      "completions/mean_length": 1706.279296875,
      "completions/mean_terminated_length": 625.5527954101562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7967909874541265,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 52.74611986954134,
      "learning_rate": 2.091001314362303e-07,
      "loss": -0.0056,
      "num_tokens": 2215105702.0,
      "reward": 0.025390625,
      "reward_std": 0.052045635879039764,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 2334
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.716796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1901.0,
      "completions/mean_length": 1632.55078125,
      "completions/mean_terminated_length": 581.0344848632812,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7971323717675173,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 85.39331085513942,
      "learning_rate": 2.087504176932643e-07,
      "loss": -0.0087,
      "num_tokens": 2216013488.0,
      "reward": 0.048828125,
      "reward_std": 0.06391072273254395,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 2335
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.818359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1947.0,
      "completions/mean_length": 1760.265625,
      "completions/mean_terminated_length": 463.9139709472656,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.7974737560809081,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 7.826832074826158,
      "learning_rate": 2.0840118829166498e-07,
      "loss": 0.0072,
      "num_tokens": 2216994296.0,
      "reward": 0.033203125,
      "reward_std": 0.05739613622426987,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 2336
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.724609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2022.0,
      "completions/mean_length": 1688.720703125,
      "completions/mean_terminated_length": 743.3829345703125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7978151403942989,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 82.417618219873,
      "learning_rate": 2.0805244372709952e-07,
      "loss": 0.0046,
      "num_tokens": 2217934457.0,
      "reward": 0.08203125,
      "reward_std": 0.10360593348741531,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 2337
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1976.0,
      "completions/mean_length": 1704.603515625,
      "completions/mean_terminated_length": 674.4140625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7981565247076897,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 15.470507158949687,
      "learning_rate": 2.077041844945472e-07,
      "loss": 0.0228,
      "num_tokens": 2218884110.0,
      "reward": 0.03515625,
      "reward_std": 0.08989076316356659,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 2338
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.77734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1883.0,
      "completions/mean_length": 1717.638671875,
      "completions/mean_terminated_length": 564.2719116210938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7984979090210805,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 5.885528113056802,
      "learning_rate": 2.0735641108829813e-07,
      "loss": 0.0013,
      "num_tokens": 2219843541.0,
      "reward": 0.02734375,
      "reward_std": 0.05032937601208687,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 2339
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.806640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1935.0,
      "completions/mean_length": 1782.3984375,
      "completions/mean_terminated_length": 674.80810546875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.7988392933344712,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 37.54077045209394,
      "learning_rate": 2.070091240019533e-07,
      "loss": 0.0038,
      "num_tokens": 2220832177.0,
      "reward": 0.06640625,
      "reward_std": 0.0907755047082901,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "step": 2340
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.783203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2015.0,
      "completions/mean_length": 1727.375,
      "completions/mean_terminated_length": 569.0811157226562,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.799180677647862,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 44.208385368551035,
      "learning_rate": 2.066623237284229e-07,
      "loss": 0.0051,
      "num_tokens": 2221787137.0,
      "reward": 0.01953125,
      "reward_std": 0.050948236137628555,
      "rewards/accuracy_reward/mean": 0.02016128972172737,
      "rewards/accuracy_reward/std": 0.14069372415542603,
      "step": 2341
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1674.53515625,
      "completions/mean_terminated_length": 642.0147094726562,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7995220619612529,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 47.00040401198957,
      "learning_rate": 2.0631601075992677e-07,
      "loss": -0.0165,
      "num_tokens": 2222725283.0,
      "reward": 0.064453125,
      "reward_std": 0.09673243016004562,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 2342
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2026.0,
      "completions/mean_length": 1666.021484375,
      "completions/mean_terminated_length": 689.8541870117188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.7998634462746437,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 32.50472099380991,
      "learning_rate": 2.0597018558799272e-07,
      "loss": 0.0072,
      "num_tokens": 2223658622.0,
      "reward": 0.04296875,
      "reward_std": 0.09028453379869461,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 2343
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.77734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1717.0,
      "completions/mean_length": 1733.44921875,
      "completions/mean_terminated_length": 635.2807006835938,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.8002048305880345,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 20.74382005587314,
      "learning_rate": 2.056248487034562e-07,
      "loss": 0.0184,
      "num_tokens": 2224625332.0,
      "reward": 0.033203125,
      "reward_std": 0.03411140665411949,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 2344
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.73046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1928.0,
      "completions/mean_length": 1655.8359375,
      "completions/mean_terminated_length": 593.0145263671875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8005462149014253,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 74.26744304555125,
      "learning_rate": 2.0528000059645995e-07,
      "loss": 0.001,
      "num_tokens": 2225555552.0,
      "reward": 0.09765625,
      "reward_std": 0.11478663980960846,
      "rewards/accuracy_reward/mean": 0.09765625,
      "rewards/accuracy_reward/std": 0.29713961482048035,
      "step": 2345
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2003.0,
      "completions/mean_length": 1638.267578125,
      "completions/mean_terminated_length": 458.73486328125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8008875992148161,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 21.947734292105586,
      "learning_rate": 2.0493564175645256e-07,
      "loss": 0.0117,
      "num_tokens": 2226482169.0,
      "reward": 0.044921875,
      "reward_std": 0.06417001038789749,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 2346
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1790.0,
      "completions/mean_length": 1782.31640625,
      "completions/mean_terminated_length": 631.6041870117188,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.8012289835282069,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 157.2591046553519,
      "learning_rate": 2.0459177267218878e-07,
      "loss": 0.0157,
      "num_tokens": 2227472123.0,
      "reward": 0.037109375,
      "reward_std": 0.07355976849794388,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 2347
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2019.0,
      "completions/mean_length": 1715.0,
      "completions/mean_terminated_length": 694.857177734375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8015703678415976,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 36.2203498230152,
      "learning_rate": 2.042483938317276e-07,
      "loss": 0.0135,
      "num_tokens": 2228425995.0,
      "reward": 0.0390625,
      "reward_std": 0.07368633151054382,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 2348
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.78125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2003.0,
      "completions/mean_length": 1732.705078125,
      "completions/mean_terminated_length": 606.6517944335938,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.8019117521549884,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 4.299639884149655,
      "learning_rate": 2.0390550572243242e-07,
      "loss": 0.0214,
      "num_tokens": 2229392084.0,
      "reward": 0.03125,
      "reward_std": 0.07394562661647797,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 2349
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.78515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1728.14453125,
      "completions/mean_terminated_length": 559.2181396484375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.8022531364683793,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 112.52357578693962,
      "learning_rate": 2.0356310883097045e-07,
      "loss": 0.0141,
      "num_tokens": 2230350286.0,
      "reward": 0.048828125,
      "reward_std": 0.09038965404033661,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 2350
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.740234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1910.0,
      "completions/mean_length": 1669.35546875,
      "completions/mean_terminated_length": 590.3609008789062,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.8025945207817701,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 35.321963871602684,
      "learning_rate": 2.0322120364331119e-07,
      "loss": 0.0159,
      "num_tokens": 2231290308.0,
      "reward": 0.052734375,
      "reward_std": 0.07669496536254883,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 2351
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.72265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1941.0,
      "completions/mean_length": 1617.904296875,
      "completions/mean_terminated_length": 497.2323913574219,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8029359050951609,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 16.453376373906206,
      "learning_rate": 2.028797906447268e-07,
      "loss": 0.0175,
      "num_tokens": 2232200227.0,
      "reward": 0.064453125,
      "reward_std": 0.09271937608718872,
      "rewards/accuracy_reward/mean": 0.06653226166963577,
      "rewards/accuracy_reward/std": 0.2494617998600006,
      "step": 2352
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.775390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1788.0,
      "completions/mean_length": 1701.537109375,
      "completions/mean_terminated_length": 505.4869384765625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8032772894085517,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 102.45565549040242,
      "learning_rate": 2.025388703197903e-07,
      "loss": -0.0068,
      "num_tokens": 2233151638.0,
      "reward": 0.0546875,
      "reward_std": 0.07795868813991547,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 2353
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.720703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1985.0,
      "completions/mean_length": 1631.576171875,
      "completions/mean_terminated_length": 557.0279541015625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.8036186737219425,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 116.76885641439672,
      "learning_rate": 2.0219844315237595e-07,
      "loss": 0.0105,
      "num_tokens": 2234070349.0,
      "reward": 0.0390625,
      "reward_std": 0.1049705445766449,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 2354
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1991.0,
      "completions/mean_length": 1633.908203125,
      "completions/mean_terminated_length": 533.6071166992188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8039600580353333,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 21.94041085031215,
      "learning_rate": 2.0185850962565782e-07,
      "loss": 0.0124,
      "num_tokens": 2234989886.0,
      "reward": 0.037109375,
      "reward_std": 0.07003315538167953,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 2355
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.775390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1933.0,
      "completions/mean_length": 1716.90234375,
      "completions/mean_terminated_length": 573.8956298828125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.804301442348724,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 33.16970325150947,
      "learning_rate": 2.0151907022210917e-07,
      "loss": 0.0109,
      "num_tokens": 2235942428.0,
      "reward": 0.09765625,
      "reward_std": 0.11564444750547409,
      "rewards/accuracy_reward/mean": 0.09765625,
      "rewards/accuracy_reward/std": 0.29713961482048035,
      "step": 2356
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.818359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2015.0,
      "completions/mean_length": 1829.171875,
      "completions/mean_terminated_length": 843.8709716796875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.8046428266621148,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 57.933791522022574,
      "learning_rate": 2.0118012542350245e-07,
      "loss": -0.005,
      "num_tokens": 2236963508.0,
      "reward": 0.03515625,
      "reward_std": 0.07477159798145294,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 2357
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.724609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2014.0,
      "completions/mean_length": 1683.123046875,
      "completions/mean_terminated_length": 723.0567016601562,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.8049842109755057,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 47.865891343447345,
      "learning_rate": 2.0084167571090753e-07,
      "loss": 0.0175,
      "num_tokens": 2237913763.0,
      "reward": 0.064453125,
      "reward_std": 0.12451459467411041,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 2358
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.77734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1971.0,
      "completions/mean_length": 1732.88671875,
      "completions/mean_terminated_length": 632.75439453125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8053255952888965,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 14.918082991180457,
      "learning_rate": 2.0050372156469214e-07,
      "loss": 0.0016,
      "num_tokens": 2238882521.0,
      "reward": 0.009765625,
      "reward_std": 0.0390625,
      "rewards/accuracy_reward/mean": 0.009765625,
      "rewards/accuracy_reward/std": 0.09843364357948303,
      "step": 2359
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2028.0,
      "completions/mean_length": 1695.150390625,
      "completions/mean_terminated_length": 719.625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.8056669796022873,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 41.19709490896527,
      "learning_rate": 2.0016626346452027e-07,
      "loss": 0.005,
      "num_tokens": 2239820374.0,
      "reward": 0.05078125,
      "reward_std": 0.08477334678173065,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 2360
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2003.0,
      "completions/mean_length": 1632.69140625,
      "completions/mean_terminated_length": 667.2337646484375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.8060083639156781,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 20.18628436938063,
      "learning_rate": 1.998293018893518e-07,
      "loss": 0.0205,
      "num_tokens": 2240727432.0,
      "reward": 0.0703125,
      "reward_std": 0.10975530743598938,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 2361
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1687.0,
      "completions/mean_length": 1674.759765625,
      "completions/mean_terminated_length": 555.0390625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.8063497482290689,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 116.11114513473144,
      "learning_rate": 1.994928373174423e-07,
      "loss": 0.0221,
      "num_tokens": 2241666797.0,
      "reward": 0.103515625,
      "reward_std": 0.15234442055225372,
      "rewards/accuracy_reward/mean": 0.103515625,
      "rewards/accuracy_reward/std": 0.30492907762527466,
      "step": 2362
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.771484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1760.0,
      "completions/mean_length": 1740.38671875,
      "completions/mean_terminated_length": 701.86328125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8066911325424597,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 24.14525121534884,
      "learning_rate": 1.991568702263415e-07,
      "loss": 0.0072,
      "num_tokens": 2242635955.0,
      "reward": 0.060546875,
      "reward_std": 0.10712136328220367,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 2363
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1816.0,
      "completions/mean_length": 1559.375,
      "completions/mean_terminated_length": 576.37646484375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.8070325168558504,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 71.130220900969,
      "learning_rate": 1.9882140109289342e-07,
      "loss": 0.013,
      "num_tokens": 2243504547.0,
      "reward": 0.0625,
      "reward_std": 0.11211783438920975,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 2364
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.806640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1960.0,
      "completions/mean_length": 1788.83203125,
      "completions/mean_terminated_length": 707.6565551757812,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.8073739011692412,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 22.30202303856115,
      "learning_rate": 1.9848643039323499e-07,
      "loss": 0.0057,
      "num_tokens": 2244499325.0,
      "reward": 0.029296875,
      "reward_std": 0.06271954625844955,
      "rewards/accuracy_reward/mean": 0.030241934582591057,
      "rewards/accuracy_reward/std": 0.1714252382516861,
      "step": 2365
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.74609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1962.0,
      "completions/mean_length": 1694.49609375,
      "completions/mean_terminated_length": 655.7384643554688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.807715285482632,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 50.665211144706575,
      "learning_rate": 1.9815195860279594e-07,
      "loss": 0.0044,
      "num_tokens": 2245448715.0,
      "reward": 0.072265625,
      "reward_std": 0.10057821869850159,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 2366
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.779296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1940.0,
      "completions/mean_length": 1733.873046875,
      "completions/mean_terminated_length": 625.1150512695312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8080566697960229,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 27.702283829907472,
      "learning_rate": 1.978179861962978e-07,
      "loss": -0.0013,
      "num_tokens": 2246422058.0,
      "reward": 0.025390625,
      "reward_std": 0.06041031330823898,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 2367
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1906.0,
      "completions/mean_length": 1666.736328125,
      "completions/mean_terminated_length": 522.9453125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8083980541094137,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 19.522280171389955,
      "learning_rate": 1.9748451364775318e-07,
      "loss": 0.0219,
      "num_tokens": 2247358627.0,
      "reward": 0.0546875,
      "reward_std": 0.08408090472221375,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 2368
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.67578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1943.0,
      "completions/mean_length": 1575.283203125,
      "completions/mean_terminated_length": 589.98193359375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8087394384228045,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 73.70984172439701,
      "learning_rate": 1.9715154143046558e-07,
      "loss": 0.0295,
      "num_tokens": 2248240788.0,
      "reward": 0.09765625,
      "reward_std": 0.13566282391548157,
      "rewards/accuracy_reward/mean": 0.09765625,
      "rewards/accuracy_reward/std": 0.29713961482048035,
      "step": 2369
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.666015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1996.0,
      "completions/mean_length": 1572.32421875,
      "completions/mean_terminated_length": 623.75439453125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8090808227361953,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 53.90882983180562,
      "learning_rate": 1.968190700170279e-07,
      "loss": 0.009,
      "num_tokens": 2249119210.0,
      "reward": 0.0390625,
      "reward_std": 0.072782501578331,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 2370
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.751953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2003.0,
      "completions/mean_length": 1687.39453125,
      "completions/mean_terminated_length": 594.55908203125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.8094222070495861,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 69.25571043436973,
      "learning_rate": 1.9648709987932282e-07,
      "loss": -0.0103,
      "num_tokens": 2250075780.0,
      "reward": 0.076171875,
      "reward_std": 0.13715851306915283,
      "rewards/accuracy_reward/mean": 0.076171875,
      "rewards/accuracy_reward/std": 0.26553234457969666,
      "step": 2371
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2024.0,
      "completions/mean_length": 1623.03125,
      "completions/mean_terminated_length": 537.0,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.8097635913629768,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 28.3392348422541,
      "learning_rate": 1.9615563148852092e-07,
      "loss": 0.0008,
      "num_tokens": 2250988084.0,
      "reward": 0.021484375,
      "reward_std": 0.043847277760505676,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 2372
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1910.0,
      "completions/mean_length": 1686.654296875,
      "completions/mean_terminated_length": 646.4166870117188,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8101049756763676,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 32.84198159466664,
      "learning_rate": 1.9582466531508135e-07,
      "loss": 0.0172,
      "num_tokens": 2251933747.0,
      "reward": 0.072265625,
      "reward_std": 0.06975477933883667,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 2373
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1879.0,
      "completions/mean_length": 1730.986328125,
      "completions/mean_terminated_length": 648.7672119140625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.8104463599897584,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 69.37044844794951,
      "learning_rate": 1.9549420182874956e-07,
      "loss": -0.0041,
      "num_tokens": 2252900156.0,
      "reward": 0.03515625,
      "reward_std": 0.07630910724401474,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 2374
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.77734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1983.0,
      "completions/mean_length": 1760.072265625,
      "completions/mean_terminated_length": 754.8508911132812,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8107877443031493,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 43.44709629053883,
      "learning_rate": 1.9516424149855829e-07,
      "loss": 0.0119,
      "num_tokens": 2253877697.0,
      "reward": 0.056640625,
      "reward_std": 0.08930579572916031,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 2375
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.716796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2019.0,
      "completions/mean_length": 1638.06640625,
      "completions/mean_terminated_length": 600.8482666015625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8111291286165401,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 40.367098943748786,
      "learning_rate": 1.94834784792826e-07,
      "loss": 0.0058,
      "num_tokens": 2254795859.0,
      "reward": 0.0546875,
      "reward_std": 0.08587866276502609,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 2376
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.822265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1643.0,
      "completions/mean_length": 1773.3125,
      "completions/mean_terminated_length": 502.5054931640625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8114705129299309,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 23.0068042387084,
      "learning_rate": 1.9450583217915595e-07,
      "loss": 0.0083,
      "num_tokens": 2255784499.0,
      "reward": 0.0546875,
      "reward_std": 0.07961063086986542,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 2377
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1991.0,
      "completions/mean_length": 1614.546875,
      "completions/mean_terminated_length": 587.9473876953125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8118118972433217,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 113.46613152516082,
      "learning_rate": 1.9417738412443647e-07,
      "loss": 0.0177,
      "num_tokens": 2256685483.0,
      "reward": 0.0390625,
      "reward_std": 0.08604402840137482,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 2378
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1966.0,
      "completions/mean_length": 1707.02734375,
      "completions/mean_terminated_length": 617.0327758789062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8121532815567125,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 11.710617153134043,
      "learning_rate": 1.938494410948394e-07,
      "loss": 0.0159,
      "num_tokens": 2257635721.0,
      "reward": 0.03515625,
      "reward_std": 0.06948098540306091,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 2379
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.78515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1998.0,
      "completions/mean_length": 1759.46484375,
      "completions/mean_terminated_length": 705.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8124946658701032,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 31.73394419984976,
      "learning_rate": 1.9352200355581988e-07,
      "loss": 0.0014,
      "num_tokens": 2258610215.0,
      "reward": 0.08984375,
      "reward_std": 0.08851956576108932,
      "rewards/accuracy_reward/mean": 0.0927419364452362,
      "rewards/accuracy_reward/std": 0.2903633117675781,
      "step": 2380
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.708984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 1630.16796875,
      "completions/mean_terminated_length": 612.2282104492188,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.812836050183494,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 7.5451585846974485,
      "learning_rate": 1.9319507197211538e-07,
      "loss": 0.024,
      "num_tokens": 2259521261.0,
      "reward": 0.037109375,
      "reward_std": 0.07729348540306091,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 2381
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2005.0,
      "completions/mean_length": 1633.615234375,
      "completions/mean_terminated_length": 594.8150634765625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8131774344968848,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 10.269616327278692,
      "learning_rate": 1.9286864680774578e-07,
      "loss": 0.0121,
      "num_tokens": 2260441176.0,
      "reward": 0.025390625,
      "reward_std": 0.06755761802196503,
      "rewards/accuracy_reward/mean": 0.026209676638245583,
      "rewards/accuracy_reward/std": 0.1599196344614029,
      "step": 2382
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.736328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2020.0,
      "completions/mean_length": 1655.54296875,
      "completions/mean_terminated_length": 559.5703735351562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8135188188102757,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 86.46369488355815,
      "learning_rate": 1.9254272852601193e-07,
      "loss": 0.0066,
      "num_tokens": 2261361374.0,
      "reward": 0.095703125,
      "reward_std": 0.127755269408226,
      "rewards/accuracy_reward/mean": 0.095703125,
      "rewards/accuracy_reward/std": 0.2944713830947876,
      "step": 2383
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1991.0,
      "completions/mean_length": 1704.91796875,
      "completions/mean_terminated_length": 653.888916015625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8138602031236665,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 86.507797004118,
      "learning_rate": 1.9221731758949498e-07,
      "loss": 0.0059,
      "num_tokens": 2262311892.0,
      "reward": 0.05859375,
      "reward_std": 0.08798330277204514,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 2384
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2007.0,
      "completions/mean_length": 1717.48828125,
      "completions/mean_terminated_length": 725.953125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8142015874370573,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 36.90936366866858,
      "learning_rate": 1.918924144600566e-07,
      "loss": 0.011,
      "num_tokens": 2263277118.0,
      "reward": 0.0546875,
      "reward_std": 0.10571042448282242,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 2385
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1619.349609375,
      "completions/mean_terminated_length": 676.3187866210938,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.8145429717504481,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 59.32024847477373,
      "learning_rate": 1.915680195988369e-07,
      "loss": -0.0146,
      "num_tokens": 2264187777.0,
      "reward": 0.03125,
      "reward_std": 0.06469620764255524,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 2386
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.669921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1930.0,
      "completions/mean_length": 1562.865234375,
      "completions/mean_terminated_length": 578.2426147460938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8148843560638389,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 36.401452128281754,
      "learning_rate": 1.912441334662554e-07,
      "loss": -0.0007,
      "num_tokens": 2265067804.0,
      "reward": 0.048828125,
      "reward_std": 0.0673912912607193,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 2387
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.748046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2005.0,
      "completions/mean_length": 1674.37890625,
      "completions/mean_terminated_length": 565.534912109375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8152257403772296,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 37.00439333912248,
      "learning_rate": 1.9092075652200894e-07,
      "loss": -0.0073,
      "num_tokens": 2266010782.0,
      "reward": 0.0390625,
      "reward_std": 0.104032501578331,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 2388
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 1676.6875,
      "completions/mean_terminated_length": 650.11767578125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.8155671246906204,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 7.940244333587208,
      "learning_rate": 1.9059788922507213e-07,
      "loss": 0.0054,
      "num_tokens": 2266946702.0,
      "reward": 0.041015625,
      "reward_std": 0.06943464279174805,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 2389
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.716796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1804.0,
      "completions/mean_length": 1648.38671875,
      "completions/mean_terminated_length": 636.9517211914062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8159085090040112,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 73.98209700648937,
      "learning_rate": 1.902755320336961e-07,
      "loss": 0.0003,
      "num_tokens": 2267868324.0,
      "reward": 0.041015625,
      "reward_std": 0.06234722584486008,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 2390
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1919.0,
      "completions/mean_length": 1568.408203125,
      "completions/mean_terminated_length": 586.3869018554688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.816249893317402,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 31.503703525765093,
      "learning_rate": 1.899536854054079e-07,
      "loss": 0.0103,
      "num_tokens": 2268751061.0,
      "reward": 0.072265625,
      "reward_std": 0.12576615810394287,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 2391
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.681640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2021.0,
      "completions/mean_length": 1587.634765625,
      "completions/mean_terminated_length": 601.9447631835938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8165912776307929,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 37.65155847365325,
      "learning_rate": 1.8963234979700986e-07,
      "loss": 0.0106,
      "num_tokens": 2269639578.0,
      "reward": 0.0859375,
      "reward_std": 0.1249004602432251,
      "rewards/accuracy_reward/mean": 0.0859375,
      "rewards/accuracy_reward/std": 0.28054583072662354,
      "step": 2392
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.73046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1798.0,
      "completions/mean_length": 1665.451171875,
      "completions/mean_terminated_length": 628.6884155273438,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.8169326619441837,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 49.74834220852008,
      "learning_rate": 1.8931152566457903e-07,
      "loss": 0.0297,
      "num_tokens": 2270566833.0,
      "reward": 0.04296875,
      "reward_std": 0.09424237906932831,
      "rewards/accuracy_reward/mean": 0.04435483738780022,
      "rewards/accuracy_reward/std": 0.2060900777578354,
      "step": 2393
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1917.0,
      "completions/mean_length": 1561.248046875,
      "completions/mean_terminated_length": 647.9044799804688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.8172740462575745,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 41.887315389048005,
      "learning_rate": 1.8899121346346682e-07,
      "loss": 0.0226,
      "num_tokens": 2271438928.0,
      "reward": 0.087890625,
      "reward_std": 0.10735208541154861,
      "rewards/accuracy_reward/mean": 0.087890625,
      "rewards/accuracy_reward/std": 0.2834126651287079,
      "step": 2394
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1843.0,
      "completions/mean_length": 1603.369140625,
      "completions/mean_terminated_length": 550.2960815429688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8176154305709653,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 34.538404230653015,
      "learning_rate": 1.8867141364829758e-07,
      "loss": -0.0122,
      "num_tokens": 2272344381.0,
      "reward": 0.060546875,
      "reward_std": 0.08175812661647797,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 2395
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.716796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1980.0,
      "completions/mean_length": 1653.189453125,
      "completions/mean_terminated_length": 653.9103393554688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.817956814884356,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 24.446545873701883,
      "learning_rate": 1.8835212667296873e-07,
      "loss": 0.0089,
      "num_tokens": 2273268622.0,
      "reward": 0.02734375,
      "reward_std": 0.05311024561524391,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 2396
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1963.0,
      "completions/mean_length": 1565.212890625,
      "completions/mean_terminated_length": 674.7388916015625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8182981991977468,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 35.2135498172077,
      "learning_rate": 1.8803335299064998e-07,
      "loss": 0.0131,
      "num_tokens": 2274145355.0,
      "reward": 0.048828125,
      "reward_std": 0.1022823378443718,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 2397
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.666015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1848.0,
      "completions/mean_length": 1558.78125,
      "completions/mean_terminated_length": 583.2047119140625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8186395835111376,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 37.83840348933624,
      "learning_rate": 1.8771509305378186e-07,
      "loss": -0.0026,
      "num_tokens": 2275023899.0,
      "reward": 0.064453125,
      "reward_std": 0.09268517792224884,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 2398
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.693359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1840.0,
      "completions/mean_length": 1601.35546875,
      "completions/mean_terminated_length": 591.4267578125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8189809678245284,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 60.504144028243445,
      "learning_rate": 1.8739734731407646e-07,
      "loss": 0.0129,
      "num_tokens": 2275919105.0,
      "reward": 0.05078125,
      "reward_std": 0.06558094918727875,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 2399
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1949.0,
      "completions/mean_length": 1625.76953125,
      "completions/mean_terminated_length": 679.759521484375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8193223521379193,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 22.556367182540573,
      "learning_rate": 1.870801162225155e-07,
      "loss": 0.0295,
      "num_tokens": 2276832283.0,
      "reward": 0.0546875,
      "reward_std": 0.08940431475639343,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 2400
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.673828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1945.0,
      "completions/mean_length": 1600.62109375,
      "completions/mean_terminated_length": 676.395263671875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8196637364513101,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 41.92500824991513,
      "learning_rate": 1.8676340022935073e-07,
      "loss": 0.0217,
      "num_tokens": 2277723465.0,
      "reward": 0.0390625,
      "reward_std": 0.0625,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 2401
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.724609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1907.0,
      "completions/mean_length": 1632.537109375,
      "completions/mean_terminated_length": 539.3687744140625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8200051207647009,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 39.20352764063072,
      "learning_rate": 1.8644719978410227e-07,
      "loss": 0.047,
      "num_tokens": 2278633276.0,
      "reward": 0.080078125,
      "reward_std": 0.11984425783157349,
      "rewards/accuracy_reward/mean": 0.080078125,
      "rewards/accuracy_reward/std": 0.271679550409317,
      "step": 2402
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1931.0,
      "completions/mean_length": 1501.068359375,
      "completions/mean_terminated_length": 619.2805786132812,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8203465050780917,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 27.758081654029525,
      "learning_rate": 1.861315153355592e-07,
      "loss": 0.0212,
      "num_tokens": 2279474719.0,
      "reward": 0.060546875,
      "reward_std": 0.08532553166151047,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 2403
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.67578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2038.0,
      "completions/mean_length": 1594.44140625,
      "completions/mean_terminated_length": 649.072265625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.8206878893914825,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 3.270552634233083,
      "learning_rate": 1.8581634733177758e-07,
      "loss": -0.0065,
      "num_tokens": 2280364929.0,
      "reward": 0.017578125,
      "reward_std": 0.043680962175130844,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "step": 2404
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1915.0,
      "completions/mean_length": 1543.373046875,
      "completions/mean_terminated_length": 545.8546752929688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8210292737048732,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 40.85661673042326,
      "learning_rate": 1.8550169622008078e-07,
      "loss": 0.007,
      "num_tokens": 2281231232.0,
      "reward": 0.044921875,
      "reward_std": 0.07718046009540558,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 2405
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1792.0,
      "completions/mean_length": 1543.181640625,
      "completions/mean_terminated_length": 579.7556762695312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.821370658018264,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 43.14568712145668,
      "learning_rate": 1.851875624470586e-07,
      "loss": 0.0147,
      "num_tokens": 2282103037.0,
      "reward": 0.072265625,
      "reward_std": 0.09926675260066986,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 2406
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1973.0,
      "completions/mean_length": 1572.271484375,
      "completions/mean_terminated_length": 631.8779296875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8217120423316548,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 97.20281769766575,
      "learning_rate": 1.8487394645856636e-07,
      "loss": 0.0101,
      "num_tokens": 2282981304.0,
      "reward": 0.072265625,
      "reward_std": 0.0971047431230545,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 2407
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.697265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1982.0,
      "completions/mean_length": 1637.72265625,
      "completions/mean_terminated_length": 692.7612915039062,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.8220534266450457,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 49.18603278315709,
      "learning_rate": 1.8456084869972472e-07,
      "loss": 0.0193,
      "num_tokens": 2283887066.0,
      "reward": 0.05859375,
      "reward_std": 0.08325589448213577,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 2408
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1922.0,
      "completions/mean_length": 1602.955078125,
      "completions/mean_terminated_length": 623.8562622070312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8223948109584365,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 30.633507567713714,
      "learning_rate": 1.8424826961491852e-07,
      "loss": -0.0071,
      "num_tokens": 2284783571.0,
      "reward": 0.0546875,
      "reward_std": 0.08518621325492859,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 2409
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2017.0,
      "completions/mean_length": 1696.619140625,
      "completions/mean_terminated_length": 642.4765625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8227361952718273,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 13.546852089517008,
      "learning_rate": 1.8393620964779675e-07,
      "loss": 0.0071,
      "num_tokens": 2285725712.0,
      "reward": 0.029296875,
      "reward_std": 0.07917051017284393,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 2410
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.716796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1666.19921875,
      "completions/mean_terminated_length": 699.8482666015625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8230775795852181,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 24.233103509627846,
      "learning_rate": 1.8362466924127145e-07,
      "loss": 0.0182,
      "num_tokens": 2286654038.0,
      "reward": 0.099609375,
      "reward_std": 0.12380305677652359,
      "rewards/accuracy_reward/mean": 0.099609375,
      "rewards/accuracy_reward/std": 0.29977133870124817,
      "step": 2411
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1845.0,
      "completions/mean_length": 1653.6953125,
      "completions/mean_terminated_length": 563.558837890625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8234189638986089,
      "frac_reward_zero_std": 0.53125,
      "grad_norm": 42.865394002176984,
      "learning_rate": 1.833136488375171e-07,
      "loss": 0.0271,
      "num_tokens": 2287570874.0,
      "reward": 0.06640625,
      "reward_std": 0.15498074889183044,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "step": 2412
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.630859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1966.0,
      "completions/mean_length": 1557.28515625,
      "completions/mean_terminated_length": 718.6560668945312,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.8237603482119996,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 36.85989035570521,
      "learning_rate": 1.8300314887797048e-07,
      "loss": 0.0084,
      "num_tokens": 2288450668.0,
      "reward": 0.060546875,
      "reward_std": 0.09753695130348206,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 2413
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.669921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1981.0,
      "completions/mean_length": 1591.40234375,
      "completions/mean_terminated_length": 664.6982421875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8241017325253904,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 49.09549285609956,
      "learning_rate": 1.8269316980332926e-07,
      "loss": 0.0093,
      "num_tokens": 2289348170.0,
      "reward": 0.052734375,
      "reward_std": 0.11537161469459534,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 2414
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.658203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1587.0,
      "completions/mean_length": 1543.24609375,
      "completions/mean_terminated_length": 571.2342529296875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8244431168387812,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 4.402577956899501,
      "learning_rate": 1.823837120535523e-07,
      "loss": 0.004,
      "num_tokens": 2290215400.0,
      "reward": 0.072265625,
      "reward_std": 0.09178133308887482,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 2415
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1887.0,
      "completions/mean_length": 1681.0234375,
      "completions/mean_terminated_length": 743.1944580078125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8247845011521721,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 9.130866845995918,
      "learning_rate": 1.820747760678581e-07,
      "loss": 0.0129,
      "num_tokens": 2291153140.0,
      "reward": 0.025390625,
      "reward_std": 0.06409074366092682,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 2416
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.712890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 1637.51171875,
      "completions/mean_terminated_length": 618.2720947265625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8251258854655629,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 19.89982665056698,
      "learning_rate": 1.8176636228472476e-07,
      "loss": 0.0181,
      "num_tokens": 2292069274.0,
      "reward": 0.064453125,
      "reward_std": 0.09519492089748383,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 2417
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.634765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1985.0,
      "completions/mean_length": 1544.58984375,
      "completions/mean_terminated_length": 669.6791381835938,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.8254672697789537,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 40.93666344695196,
      "learning_rate": 1.814584711418894e-07,
      "loss": 0.0016,
      "num_tokens": 2292936920.0,
      "reward": 0.091796875,
      "reward_std": 0.12502044439315796,
      "rewards/accuracy_reward/mean": 0.091796875,
      "rewards/accuracy_reward/std": 0.289021372795105,
      "step": 2418
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.62109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1942.0,
      "completions/mean_length": 1522.177734375,
      "completions/mean_terminated_length": 660.2628784179688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8258086540923445,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 5.5577390552976,
      "learning_rate": 1.8115110307634695e-07,
      "loss": 0.0268,
      "num_tokens": 2293792595.0,
      "reward": 0.06640625,
      "reward_std": 0.08493966609239578,
      "rewards/accuracy_reward/mean": 0.07083333283662796,
      "rewards/accuracy_reward/std": 0.2568138837814331,
      "step": 2419
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.634765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1970.0,
      "completions/mean_length": 1532.4140625,
      "completions/mean_terminated_length": 636.34228515625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8261500384057353,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 3.73311428328459,
      "learning_rate": 1.8084425852435044e-07,
      "loss": 0.0175,
      "num_tokens": 2294655687.0,
      "reward": 0.07421875,
      "reward_std": 0.09094182401895523,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "step": 2420
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.701171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1652.044921875,
      "completions/mean_terminated_length": 722.973876953125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.826491422719126,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 114.67779104481684,
      "learning_rate": 1.805379379214093e-07,
      "loss": 0.008,
      "num_tokens": 2295577390.0,
      "reward": 0.060546875,
      "reward_std": 0.10689391195774078,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 2421
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.72265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2016.0,
      "completions/mean_length": 1668.208984375,
      "completions/mean_terminated_length": 678.6126708984375,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.8268328070325168,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 32.207437152004296,
      "learning_rate": 1.802321417022899e-07,
      "loss": 0.0077,
      "num_tokens": 2296514361.0,
      "reward": 0.04296875,
      "reward_std": 0.08655644953250885,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 2422
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.728515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1898.0,
      "completions/mean_length": 1653.525390625,
      "completions/mean_terminated_length": 594.9712524414062,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.8271741913459076,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 23.643178987099184,
      "learning_rate": 1.7992687030101388e-07,
      "loss": 0.0094,
      "num_tokens": 2297438278.0,
      "reward": 0.0625,
      "reward_std": 0.07528402656316757,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 2423
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.70703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2024.0,
      "completions/mean_length": 1663.80078125,
      "completions/mean_terminated_length": 736.6000366210938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8275155756592985,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 11.054214289995855,
      "learning_rate": 1.7962212415085804e-07,
      "loss": 0.0314,
      "num_tokens": 2298369024.0,
      "reward": 0.04296875,
      "reward_std": 0.10574321448802948,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 2424
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.666015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2012.0,
      "completions/mean_length": 1562.69921875,
      "completions/mean_terminated_length": 594.9356689453125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8278569599726893,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 44.627553102602704,
      "learning_rate": 1.7931790368435403e-07,
      "loss": 0.0118,
      "num_tokens": 2299246086.0,
      "reward": 0.078125,
      "reward_std": 0.10787243396043777,
      "rewards/accuracy_reward/mean": 0.078125,
      "rewards/accuracy_reward/std": 0.26863065361976624,
      "step": 2425
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2017.0,
      "completions/mean_length": 1514.845703125,
      "completions/mean_terminated_length": 564.4402465820312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8281983442860801,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 126.73481564879475,
      "learning_rate": 1.7901420933328696e-07,
      "loss": 0.0163,
      "num_tokens": 2300100007.0,
      "reward": 0.115234375,
      "reward_std": 0.15972480177879333,
      "rewards/accuracy_reward/mean": 0.115234375,
      "rewards/accuracy_reward/std": 0.3196168541908264,
      "step": 2426
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.662109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1937.0,
      "completions/mean_length": 1605.845703125,
      "completions/mean_terminated_length": 739.427734375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8285397285994709,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 85.22665679670817,
      "learning_rate": 1.787110415286956e-07,
      "loss": 0.02,
      "num_tokens": 2300999912.0,
      "reward": 0.10546875,
      "reward_std": 0.14591112732887268,
      "rewards/accuracy_reward/mean": 0.10546875,
      "rewards/accuracy_reward/std": 0.3074568510055542,
      "step": 2427
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1555.646484375,
      "completions/mean_terminated_length": 631.7921142578125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8288811129128617,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 26.956761066932827,
      "learning_rate": 1.784084007008711e-07,
      "loss": 0.0344,
      "num_tokens": 2301871027.0,
      "reward": 0.083984375,
      "reward_std": 0.11674421280622482,
      "rewards/accuracy_reward/mean": 0.083984375,
      "rewards/accuracy_reward/std": 0.2776356339454651,
      "step": 2428
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.666015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1565.625,
      "completions/mean_terminated_length": 603.6959228515625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8292224972262524,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 12.051030145973305,
      "learning_rate": 1.781062872793567e-07,
      "loss": 0.027,
      "num_tokens": 2302749251.0,
      "reward": 0.142578125,
      "reward_std": 0.16356495022773743,
      "rewards/accuracy_reward/mean": 0.142578125,
      "rewards/accuracy_reward/std": 0.3499840497970581,
      "step": 2429
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1922.0,
      "completions/mean_length": 1599.93359375,
      "completions/mean_terminated_length": 614.1875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8295638815396432,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 52.219044121701,
      "learning_rate": 1.778047016929473e-07,
      "loss": 0.0151,
      "num_tokens": 2303643105.0,
      "reward": 0.072265625,
      "reward_std": 0.0790574848651886,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 2430
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.615234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2016.0,
      "completions/mean_length": 1498.708984375,
      "completions/mean_terminated_length": 620.4010009765625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.829905265853034,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 21.323735135268663,
      "learning_rate": 1.7750364436968836e-07,
      "loss": 0.0009,
      "num_tokens": 2304488716.0,
      "reward": 0.068359375,
      "reward_std": 0.11113345623016357,
      "rewards/accuracy_reward/mean": 0.068359375,
      "rewards/accuracy_reward/std": 0.25260838866233826,
      "step": 2431
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.658203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2013.0,
      "completions/mean_length": 1559.603515625,
      "completions/mean_terminated_length": 619.0914306640625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.8302466501664248,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 20.875935578684587,
      "learning_rate": 1.7720311573687575e-07,
      "loss": -0.0018,
      "num_tokens": 2305363937.0,
      "reward": 0.052734375,
      "reward_std": 0.1188259869813919,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 2432
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.68359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2031.0,
      "completions/mean_length": 1596.103515625,
      "completions/mean_terminated_length": 620.0308837890625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8305880344798157,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 45.548370607970504,
      "learning_rate": 1.769031162210548e-07,
      "loss": 0.0028,
      "num_tokens": 2306261510.0,
      "reward": 0.041015625,
      "reward_std": 0.08753519505262375,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 2433
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2000.0,
      "completions/mean_length": 1634.892578125,
      "completions/mean_terminated_length": 692.1602783203125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8309294187932065,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 29.62303726552462,
      "learning_rate": 1.766036462480201e-07,
      "loss": 0.0408,
      "num_tokens": 2307179183.0,
      "reward": 0.0703125,
      "reward_std": 0.13067752122879028,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 2434
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.712890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1963.0,
      "completions/mean_length": 1654.705078125,
      "completions/mean_terminated_length": 678.1564331054688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8312708031065973,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 37.45353910406339,
      "learning_rate": 1.7630470624281442e-07,
      "loss": -0.0151,
      "num_tokens": 2308100648.0,
      "reward": 0.064453125,
      "reward_std": 0.1277552843093872,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 2435
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1936.0,
      "completions/mean_length": 1546.240234375,
      "completions/mean_terminated_length": 620.772216796875,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.8316121874199881,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 9.376547185875587,
      "learning_rate": 1.7600629662972832e-07,
      "loss": 0.0072,
      "num_tokens": 2308973827.0,
      "reward": 0.048828125,
      "reward_std": 0.08395528793334961,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 2436
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.740234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2026.0,
      "completions/mean_length": 1684.125,
      "completions/mean_terminated_length": 647.2180786132812,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8319535717333788,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 29.47996986565397,
      "learning_rate": 1.757084178322999e-07,
      "loss": -0.0023,
      "num_tokens": 2309909251.0,
      "reward": 0.056640625,
      "reward_std": 0.06767063587903976,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 2437
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.73046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 1657.455078125,
      "completions/mean_terminated_length": 599.021728515625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.8322949560467696,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 16.292955670914946,
      "learning_rate": 1.754110702733134e-07,
      "loss": 0.0115,
      "num_tokens": 2310840076.0,
      "reward": 0.05078125,
      "reward_std": 0.10295412689447403,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 2438
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.763671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1829.0,
      "completions/mean_length": 1715.990234375,
      "completions/mean_terminated_length": 643.1322021484375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8326363403601604,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 84.10901240630035,
      "learning_rate": 1.7511425437479946e-07,
      "loss": 0.0158,
      "num_tokens": 2311800247.0,
      "reward": 0.0703125,
      "reward_std": 0.11305683851242065,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 2439
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.755859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1962.0,
      "completions/mean_length": 1703.0625,
      "completions/mean_terminated_length": 635.1360473632812,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8329777246735512,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 28.049883394507432,
      "learning_rate": 1.7481797055803382e-07,
      "loss": 0.0184,
      "num_tokens": 2312747815.0,
      "reward": 0.046875,
      "reward_std": 0.07966843992471695,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 2440
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1990.0,
      "completions/mean_length": 1544.76171875,
      "completions/mean_terminated_length": 584.0341186523438,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.8333191089869421,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 14.721422240156052,
      "learning_rate": 1.7452221924353733e-07,
      "loss": 0.0318,
      "num_tokens": 2313620301.0,
      "reward": 0.126953125,
      "reward_std": 0.17244866490364075,
      "rewards/accuracy_reward/mean": 0.13104838132858276,
      "rewards/accuracy_reward/std": 0.3377939462661743,
      "step": 2441
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.73828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2021.0,
      "completions/mean_length": 1706.8984375,
      "completions/mean_terminated_length": 744.6865234375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8336604933003329,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 7.546618130082427,
      "learning_rate": 1.7422700085107485e-07,
      "loss": 0.0059,
      "num_tokens": 2314570681.0,
      "reward": 0.048828125,
      "reward_std": 0.043847277760505676,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 2442
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.73046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1829.0,
      "completions/mean_length": 1680.009765625,
      "completions/mean_terminated_length": 683.1087036132812,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8340018776137237,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 24.690283639551385,
      "learning_rate": 1.7393231579965467e-07,
      "loss": -0.0042,
      "num_tokens": 2315520990.0,
      "reward": 0.0234375,
      "reward_std": 0.06805649399757385,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 2443
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.666015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1585.6171875,
      "completions/mean_terminated_length": 663.5555419921875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8343432619271145,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 57.897965728013894,
      "learning_rate": 1.736381645075286e-07,
      "loss": 0.0139,
      "num_tokens": 2316409178.0,
      "reward": 0.044921875,
      "reward_std": 0.08780106902122498,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 2444
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1700.16796875,
      "completions/mean_terminated_length": 634.5873413085938,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.8346846462405052,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 15.03079739787849,
      "learning_rate": 1.733445473921904e-07,
      "loss": 0.0248,
      "num_tokens": 2317353712.0,
      "reward": 0.056640625,
      "reward_std": 0.10837717354297638,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 2445
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.759765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1870.0,
      "completions/mean_length": 1707.330078125,
      "completions/mean_terminated_length": 629.9268188476562,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.835026030553896,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 7.0236254032041066,
      "learning_rate": 1.7305146487037603e-07,
      "loss": 0.0237,
      "num_tokens": 2318301001.0,
      "reward": 0.044921875,
      "reward_std": 0.08164606243371964,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 2446
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1864.0,
      "completions/mean_length": 1691.9453125,
      "completions/mean_terminated_length": 666.9393920898438,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.8353674148672868,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 114.86323201527865,
      "learning_rate": 1.727589173580625e-07,
      "loss": 0.0156,
      "num_tokens": 2319242669.0,
      "reward": 0.037109375,
      "reward_std": 0.08295752108097076,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 2447
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.68359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1602.259765625,
      "completions/mean_terminated_length": 639.24072265625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8357087991806776,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 12.385882218391371,
      "learning_rate": 1.724669052704673e-07,
      "loss": 0.0362,
      "num_tokens": 2320133314.0,
      "reward": 0.08203125,
      "reward_std": 0.13352441787719727,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 2448
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.771484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1970.0,
      "completions/mean_length": 1711.5546875,
      "completions/mean_terminated_length": 575.6923217773438,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8360501834940685,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 1.4256683652082736,
      "learning_rate": 1.7217542902204847e-07,
      "loss": 0.0229,
      "num_tokens": 2321092654.0,
      "reward": 0.05078125,
      "reward_std": 0.07383356243371964,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 2449
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.73046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1951.0,
      "completions/mean_length": 1675.33984375,
      "completions/mean_terminated_length": 665.3768310546875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.8363915678074593,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 10.29077629259919,
      "learning_rate": 1.7188448902650287e-07,
      "loss": 0.012,
      "num_tokens": 2322026860.0,
      "reward": 0.01171875,
      "reward_std": 0.04192390665411949,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "step": 2450
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.73046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1825.0,
      "completions/mean_length": 1671.484375,
      "completions/mean_terminated_length": 651.0724487304688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8367329521208501,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 31.571270963268077,
      "learning_rate": 1.7159408569676704e-07,
      "loss": 0.0162,
      "num_tokens": 2322953828.0,
      "reward": 0.064453125,
      "reward_std": 0.1156046986579895,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 2451
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.72265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2031.0,
      "completions/mean_length": 1681.0625,
      "completions/mean_terminated_length": 724.9577026367188,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.8370743364342409,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 54.7029599860654,
      "learning_rate": 1.71304219445015e-07,
      "loss": 0.0053,
      "num_tokens": 2323898308.0,
      "reward": 0.05078125,
      "reward_std": 0.0961260050535202,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 2452
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.677734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1935.0,
      "completions/mean_length": 1593.720703125,
      "completions/mean_terminated_length": 638.3575439453125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8374157207476316,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 40.60600258790086,
      "learning_rate": 1.7101489068265935e-07,
      "loss": 0.0006,
      "num_tokens": 2324791125.0,
      "reward": 0.056640625,
      "reward_std": 0.08318261057138443,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 2453
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1929.0,
      "completions/mean_length": 1652.125,
      "completions/mean_terminated_length": 678.4865112304688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8377571050610224,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 51.89640687927489,
      "learning_rate": 1.7072609982034874e-07,
      "loss": 0.0269,
      "num_tokens": 2325711189.0,
      "reward": 0.072265625,
      "reward_std": 0.11046035587787628,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 2454
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1999.0,
      "completions/mean_length": 1726.75390625,
      "completions/mean_terminated_length": 654.11865234375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8380984893744132,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 22.367061206155213,
      "learning_rate": 1.7043784726796934e-07,
      "loss": 0.0158,
      "num_tokens": 2326676951.0,
      "reward": 0.041015625,
      "reward_std": 0.07322713732719421,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 2455
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.689453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1904.0,
      "completions/mean_length": 1581.43359375,
      "completions/mean_terminated_length": 545.5974731445312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.838439873687804,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 98.02973095646546,
      "learning_rate": 1.7015013343464302e-07,
      "loss": -0.0102,
      "num_tokens": 2327561781.0,
      "reward": 0.060546875,
      "reward_std": 0.13550205528736115,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.24230584502220154,
      "step": 2456
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2004.0,
      "completions/mean_length": 1597.53125,
      "completions/mean_terminated_length": 588.253173828125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8387812580011949,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 34.01524040146943,
      "learning_rate": 1.698629587287266e-07,
      "loss": 0.043,
      "num_tokens": 2328449317.0,
      "reward": 0.099609375,
      "reward_std": 0.1253339648246765,
      "rewards/accuracy_reward/mean": 0.099609375,
      "rewards/accuracy_reward/std": 0.29977133870124817,
      "step": 2457
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.701171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1964.0,
      "completions/mean_length": 1618.376953125,
      "completions/mean_terminated_length": 610.6732177734375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8391226423145857,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 23.28112767502058,
      "learning_rate": 1.6957632355781243e-07,
      "loss": 0.0167,
      "num_tokens": 2329355526.0,
      "reward": 0.060546875,
      "reward_std": 0.09759023040533066,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 2458
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1898.0,
      "completions/mean_length": 1624.89453125,
      "completions/mean_terminated_length": 641.3117065429688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8394640266279765,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 7.013998671697308,
      "learning_rate": 1.6929022832872653e-07,
      "loss": 0.0125,
      "num_tokens": 2330265648.0,
      "reward": 0.07421875,
      "reward_std": 0.08549975603818893,
      "rewards/accuracy_reward/mean": 0.07661290466785431,
      "rewards/accuracy_reward/std": 0.2662447690963745,
      "step": 2459
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2035.0,
      "completions/mean_length": 1567.09765625,
      "completions/mean_terminated_length": 680.1000366210938,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.8398054109413673,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 14.973416587170767,
      "learning_rate": 1.6900467344752872e-07,
      "loss": 0.0095,
      "num_tokens": 2331139458.0,
      "reward": 0.0390625,
      "reward_std": 0.09039659798145294,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 2460
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.736328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1803.0,
      "completions/mean_length": 1666.265625,
      "completions/mean_terminated_length": 600.2369995117188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.840146795254758,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 6.299233016236517,
      "learning_rate": 1.6871965931951178e-07,
      "loss": 0.0255,
      "num_tokens": 2332060938.0,
      "reward": 0.0546875,
      "reward_std": 0.0993933230638504,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 2461
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1968.0,
      "completions/mean_length": 1616.279296875,
      "completions/mean_terminated_length": 593.7828979492188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8404881795681488,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 2.8045307953462024,
      "learning_rate": 1.684351863492014e-07,
      "loss": 0.0025,
      "num_tokens": 2332962521.0,
      "reward": 0.021484375,
      "reward_std": 0.0234375,
      "rewards/accuracy_reward/mean": 0.02217741869390011,
      "rewards/accuracy_reward/std": 0.14740893244743347,
      "step": 2462
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.708984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1679.0,
      "completions/mean_length": 1624.1640625,
      "completions/mean_terminated_length": 591.5972900390625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8408295638815396,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 42.726586959532604,
      "learning_rate": 1.6815125494035494e-07,
      "loss": 0.0067,
      "num_tokens": 2333875181.0,
      "reward": 0.041015625,
      "reward_std": 0.08521346747875214,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 2463
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.712890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1806.0,
      "completions/mean_length": 1627.390625,
      "completions/mean_terminated_length": 583.0203857421875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8411709481949304,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 24.174258047440986,
      "learning_rate": 1.678678654959609e-07,
      "loss": 0.003,
      "num_tokens": 2334783669.0,
      "reward": 0.04296875,
      "reward_std": 0.06398563086986542,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 2464
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.802734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1987.0,
      "completions/mean_length": 1762.734375,
      "completions/mean_terminated_length": 601.9010009765625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8415123325083212,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 23.78048564079735,
      "learning_rate": 1.6758501841823902e-07,
      "loss": -0.0017,
      "num_tokens": 2335775469.0,
      "reward": 0.029296875,
      "reward_std": 0.07108421623706818,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 2465
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2007.0,
      "completions/mean_length": 1666.48046875,
      "completions/mean_terminated_length": 691.4861450195312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8418537168217121,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 24.300035680049852,
      "learning_rate": 1.6730271410863864e-07,
      "loss": 0.009,
      "num_tokens": 2336708163.0,
      "reward": 0.0390625,
      "reward_std": 0.07889671623706818,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 2466
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.70703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1896.0,
      "completions/mean_length": 1634.92578125,
      "completions/mean_terminated_length": 638.0400390625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.8421951011351029,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 99.19390883899514,
      "learning_rate": 1.6702095296783942e-07,
      "loss": 0.0093,
      "num_tokens": 2337620109.0,
      "reward": 0.060546875,
      "reward_std": 0.07669496536254883,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 2467
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1784.0,
      "completions/mean_length": 1722.70703125,
      "completions/mean_terminated_length": 704.8547973632812,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.8425364854484937,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 47.42924908200362,
      "learning_rate": 1.6673973539574953e-07,
      "loss": 0.0133,
      "num_tokens": 2338585607.0,
      "reward": 0.044921875,
      "reward_std": 0.09935424476861954,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 2468
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1700.0,
      "completions/mean_length": 1585.93359375,
      "completions/mean_terminated_length": 550.6708984375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8428778697618844,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 13.8061170052062,
      "learning_rate": 1.6645906179150592e-07,
      "loss": 0.0202,
      "num_tokens": 2339475109.0,
      "reward": 0.052734375,
      "reward_std": 0.11073969304561615,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 2469
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.712890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1708.0,
      "completions/mean_length": 1641.55859375,
      "completions/mean_terminated_length": 632.3673095703125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.8432192540752752,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 31.773593954625255,
      "learning_rate": 1.661789325534737e-07,
      "loss": 0.0372,
      "num_tokens": 2340385523.0,
      "reward": 0.1015625,
      "reward_std": 0.14435848593711853,
      "rewards/accuracy_reward/mean": 0.10483870655298233,
      "rewards/accuracy_reward/std": 0.30665475130081177,
      "step": 2470
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2019.0,
      "completions/mean_length": 1733.9140625,
      "completions/mean_terminated_length": 729.8688354492188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.843560638388666,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 27.74800951683062,
      "learning_rate": 1.6589934807924482e-07,
      "loss": 0.0095,
      "num_tokens": 2341348071.0,
      "reward": 0.03515625,
      "reward_std": 0.07135801017284393,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 2471
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.744140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1622.0,
      "completions/mean_length": 1669.490234375,
      "completions/mean_terminated_length": 568.6336059570312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8439020227020568,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 69.43353361004944,
      "learning_rate": 1.6562030876563843e-07,
      "loss": 0.0171,
      "num_tokens": 2342277586.0,
      "reward": 0.06640625,
      "reward_std": 0.10821876674890518,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "step": 2472
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1819.0,
      "completions/mean_length": 1729.23828125,
      "completions/mean_terminated_length": 641.0516967773438,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.8442434070154476,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 108.39637458379629,
      "learning_rate": 1.653418150086996e-07,
      "loss": -0.0024,
      "num_tokens": 2343238652.0,
      "reward": 0.0625,
      "reward_std": 0.09908141195774078,
      "rewards/accuracy_reward/mean": 0.06451612710952759,
      "rewards/accuracy_reward/std": 0.2459181249141693,
      "step": 2473
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1952.0,
      "completions/mean_length": 1549.693359375,
      "completions/mean_terminated_length": 614.6685180664062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8445847913288385,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 58.10240198511226,
      "learning_rate": 1.6506386720369953e-07,
      "loss": 0.0369,
      "num_tokens": 2344104735.0,
      "reward": 0.10546875,
      "reward_std": 0.1544739454984665,
      "rewards/accuracy_reward/mean": 0.10546875,
      "rewards/accuracy_reward/std": 0.3074568510055542,
      "step": 2474
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.669921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2033.0,
      "completions/mean_length": 1586.568359375,
      "completions/mean_terminated_length": 650.0532836914062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8449261756422293,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 41.46929513024965,
      "learning_rate": 1.6478646574513409e-07,
      "loss": -0.0125,
      "num_tokens": 2345004674.0,
      "reward": 0.041015625,
      "reward_std": 0.062062256038188934,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 2475
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.78515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1949.0,
      "completions/mean_length": 1738.802734375,
      "completions/mean_terminated_length": 608.8272705078125,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.8452675599556201,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 82.17590699252654,
      "learning_rate": 1.6450961102672394e-07,
      "loss": 0.0302,
      "num_tokens": 2345981037.0,
      "reward": 0.0859375,
      "reward_std": 0.1245543360710144,
      "rewards/accuracy_reward/mean": 0.0859375,
      "rewards/accuracy_reward/std": 0.28054583072662354,
      "step": 2476
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1863.0,
      "completions/mean_length": 1718.568359375,
      "completions/mean_terminated_length": 618.6016845703125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8456089442690108,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 38.68191746576925,
      "learning_rate": 1.6423330344141401e-07,
      "loss": 0.013,
      "num_tokens": 2346938416.0,
      "reward": 0.041015625,
      "reward_std": 0.06849660724401474,
      "rewards/accuracy_reward/mean": 0.04233871027827263,
      "rewards/accuracy_reward/std": 0.2015640139579773,
      "step": 2477
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.736328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1973.0,
      "completions/mean_length": 1696.4140625,
      "completions/mean_terminated_length": 714.5777587890625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8459503285824016,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 116.31613822570915,
      "learning_rate": 1.6395754338137203e-07,
      "loss": 0.0031,
      "num_tokens": 2347885860.0,
      "reward": 0.0390625,
      "reward_std": 0.08917921781539917,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 2478
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.697265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1793.0,
      "completions/mean_length": 1590.57421875,
      "completions/mean_terminated_length": 537.0193481445312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8462917128957924,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 69.25798212378204,
      "learning_rate": 1.6368233123798913e-07,
      "loss": 0.0213,
      "num_tokens": 2348766746.0,
      "reward": 0.068359375,
      "reward_std": 0.14085254073143005,
      "rewards/accuracy_reward/mean": 0.068359375,
      "rewards/accuracy_reward/std": 0.25260838866233826,
      "step": 2479
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.603515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2018.0,
      "completions/mean_length": 1501.287109375,
      "completions/mean_terminated_length": 669.0985107421875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.8466330972091832,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 27.80232104779852,
      "learning_rate": 1.634076674018785e-07,
      "loss": 0.0246,
      "num_tokens": 2349610973.0,
      "reward": 0.056640625,
      "reward_std": 0.09331226348876953,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 2480
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1896.0,
      "completions/mean_length": 1622.630859375,
      "completions/mean_terminated_length": 535.576416015625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.846974481522574,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 117.39871145366247,
      "learning_rate": 1.6313355226287535e-07,
      "loss": 0.0168,
      "num_tokens": 2350519328.0,
      "reward": 0.048828125,
      "reward_std": 0.05463561788201332,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 2481
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.779296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1913.0,
      "completions/mean_length": 1717.986328125,
      "completions/mean_terminated_length": 552.716796875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8473158658359649,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 7.967204647278244,
      "learning_rate": 1.6285998621003581e-07,
      "loss": 0.0037,
      "num_tokens": 2351478089.0,
      "reward": 0.009765625,
      "reward_std": 0.03411140665411949,
      "rewards/accuracy_reward/mean": 0.009765625,
      "rewards/accuracy_reward/std": 0.09843364357948303,
      "step": 2482
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.763671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1955.0,
      "completions/mean_length": 1734.9765625,
      "completions/mean_terminated_length": 723.4710693359375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8476572501493557,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 13.644900709205602,
      "learning_rate": 1.6258696963163704e-07,
      "loss": 0.0205,
      "num_tokens": 2352449341.0,
      "reward": 0.0625,
      "reward_std": 0.09110813587903976,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 2483
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.720703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1894.0,
      "completions/mean_length": 1675.392578125,
      "completions/mean_terminated_length": 713.9090576171875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.8479986344627465,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 313.4522522875926,
      "learning_rate": 1.6231450291517617e-07,
      "loss": 0.0039,
      "num_tokens": 2353386534.0,
      "reward": 0.072265625,
      "reward_std": 0.10695268213748932,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 2484
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1911.0,
      "completions/mean_length": 1575.779296875,
      "completions/mean_terminated_length": 608.8511962890625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8483400187761372,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 56.12616883071651,
      "learning_rate": 1.6204258644736966e-07,
      "loss": 0.0162,
      "num_tokens": 2354271285.0,
      "reward": 0.060546875,
      "reward_std": 0.09633205831050873,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 2485
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1911.0,
      "completions/mean_length": 1639.482421875,
      "completions/mean_terminated_length": 615.390380859375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.848681403089528,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 3.873614259818,
      "learning_rate": 1.6177122061415337e-07,
      "loss": 0.0327,
      "num_tokens": 2355181916.0,
      "reward": 0.078125,
      "reward_std": 0.11195248365402222,
      "rewards/accuracy_reward/mean": 0.08064515888690948,
      "rewards/accuracy_reward/std": 0.2725643217563629,
      "step": 2486
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2021.0,
      "completions/mean_length": 1720.359375,
      "completions/mean_terminated_length": 716.6349487304688,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.8490227874029188,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 36.28393802281751,
      "learning_rate": 1.615004058006812e-07,
      "loss": 0.0171,
      "num_tokens": 2356148356.0,
      "reward": 0.048828125,
      "reward_std": 0.08808182179927826,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 2487
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.67578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2014.0,
      "completions/mean_length": 1581.90625,
      "completions/mean_terminated_length": 610.4096069335938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8493641717163096,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 2.365506156421668,
      "learning_rate": 1.6123014239132568e-07,
      "loss": 0.0021,
      "num_tokens": 2357029412.0,
      "reward": 0.02734375,
      "reward_std": 0.04847269132733345,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 2488
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.73828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2026.0,
      "completions/mean_length": 1681.94921875,
      "completions/mean_terminated_length": 649.3582153320312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8497055560297004,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 126.78743633048502,
      "learning_rate": 1.6096043076967592e-07,
      "loss": 0.0075,
      "num_tokens": 2357968330.0,
      "reward": 0.099609375,
      "reward_std": 0.14469268918037415,
      "rewards/accuracy_reward/mean": 0.099609375,
      "rewards/accuracy_reward/std": 0.29977133870124817,
      "step": 2489
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2019.0,
      "completions/mean_length": 1739.74609375,
      "completions/mean_terminated_length": 732.7833862304688,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.8500469403430913,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 3.1716992311816425,
      "learning_rate": 1.6069127131853846e-07,
      "loss": 0.0182,
      "num_tokens": 2358930376.0,
      "reward": 0.025390625,
      "reward_std": 0.04957009106874466,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 2490
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1891.0,
      "completions/mean_length": 1692.513671875,
      "completions/mean_terminated_length": 626.5234375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8503883246564821,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 84.24221189164871,
      "learning_rate": 1.6042266441993583e-07,
      "loss": 0.0129,
      "num_tokens": 2359877935.0,
      "reward": 0.05859375,
      "reward_std": 0.10216332226991653,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 2491
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1894.0,
      "completions/mean_length": 1595.748046875,
      "completions/mean_terminated_length": 636.0914306640625,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.8507297089698729,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 198.28256082638302,
      "learning_rate": 1.6015461045510626e-07,
      "loss": 0.0196,
      "num_tokens": 2360779406.0,
      "reward": 0.103515625,
      "reward_std": 0.14421284198760986,
      "rewards/accuracy_reward/mean": 0.103515625,
      "rewards/accuracy_reward/std": 0.30492907762527466,
      "step": 2492
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.72265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1941.0,
      "completions/mean_length": 1626.03125,
      "completions/mean_terminated_length": 526.5352172851562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8510710932832636,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 10.510558003897218,
      "learning_rate": 1.598871098045036e-07,
      "loss": 0.0047,
      "num_tokens": 2361686302.0,
      "reward": 0.013671875,
      "reward_std": 0.04973640665411949,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "step": 2493
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.759765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1760.0,
      "completions/mean_length": 1689.2265625,
      "completions/mean_terminated_length": 554.569091796875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.8514124775966544,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 18.77744092290952,
      "learning_rate": 1.5962016284779578e-07,
      "loss": 0.0092,
      "num_tokens": 2362627874.0,
      "reward": 0.091796875,
      "reward_std": 0.1260877102613449,
      "rewards/accuracy_reward/mean": 0.0947580635547638,
      "rewards/accuracy_reward/std": 0.29317617416381836,
      "step": 2494
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.62890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1999.0,
      "completions/mean_length": 1507.29296875,
      "completions/mean_terminated_length": 590.9368896484375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8517538619100452,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 26.690213498562148,
      "learning_rate": 1.5935376996386552e-07,
      "loss": 0.0189,
      "num_tokens": 2363471672.0,
      "reward": 0.064453125,
      "reward_std": 0.10794496536254883,
      "rewards/accuracy_reward/mean": 0.06653226166963577,
      "rewards/accuracy_reward/std": 0.2494617998600006,
      "step": 2495
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1898.0,
      "completions/mean_length": 1699.416015625,
      "completions/mean_terminated_length": 653.6640625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.852095246223436,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 23.369315616820842,
      "learning_rate": 1.590879315308086e-07,
      "loss": 0.0159,
      "num_tokens": 2364424605.0,
      "reward": 0.080078125,
      "reward_std": 0.10760685801506042,
      "rewards/accuracy_reward/mean": 0.080078125,
      "rewards/accuracy_reward/std": 0.271679550409317,
      "step": 2496
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.693359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1972.0,
      "completions/mean_length": 1607.916015625,
      "completions/mean_terminated_length": 612.8216552734375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8524366305368268,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 25.250005551357134,
      "learning_rate": 1.5882264792593397e-07,
      "loss": 0.0097,
      "num_tokens": 2365324274.0,
      "reward": 0.0390625,
      "reward_std": 0.06355905532836914,
      "rewards/accuracy_reward/mean": 0.04032257944345474,
      "rewards/accuracy_reward/std": 0.19691328704357147,
      "step": 2497
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.748046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1930.0,
      "completions/mean_length": 1687.103515625,
      "completions/mean_terminated_length": 615.9767456054688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.8527780148502176,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 58.161082483966034,
      "learning_rate": 1.5855791952576342e-07,
      "loss": 0.0048,
      "num_tokens": 2366277175.0,
      "reward": 0.044921875,
      "reward_std": 0.06260748207569122,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 2498
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.73046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1918.0,
      "completions/mean_length": 1684.2890625,
      "completions/mean_terminated_length": 698.5797119140625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.8531193991636085,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 37.843486555756215,
      "learning_rate": 1.582937467060302e-07,
      "loss": 0.0031,
      "num_tokens": 2367221611.0,
      "reward": 0.064453125,
      "reward_std": 0.060957904905080795,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 2499
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.787109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1881.0,
      "completions/mean_length": 1754.24609375,
      "completions/mean_terminated_length": 668.6972045898438,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.8534607834769993,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 45.53725030401169,
      "learning_rate": 1.5803012984167963e-07,
      "loss": 0.0124,
      "num_tokens": 2368205209.0,
      "reward": 0.05078125,
      "reward_std": 0.062054343521595,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 2500
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1966.0,
      "completions/mean_length": 1637.3359375,
      "completions/mean_terminated_length": 627.3243408203125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.85380216779039,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 23.381935556505667,
      "learning_rate": 1.5776706930686738e-07,
      "loss": -0.0021,
      "num_tokens": 2369120229.0,
      "reward": 0.048828125,
      "reward_std": 0.08042772114276886,
      "rewards/accuracy_reward/mean": 0.05040322616696358,
      "rewards/accuracy_reward/std": 0.21899642050266266,
      "step": 2501
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.681640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1816.0,
      "completions/mean_length": 1587.3828125,
      "completions/mean_terminated_length": 601.1533813476562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8541435521037808,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 102.70727538071121,
      "learning_rate": 1.5750456547495995e-07,
      "loss": -0.0049,
      "num_tokens": 2370012601.0,
      "reward": 0.041015625,
      "reward_std": 0.06642046570777893,
      "rewards/accuracy_reward/mean": 0.04233871027827263,
      "rewards/accuracy_reward/std": 0.2015640139579773,
      "step": 2502
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.57421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1945.0,
      "completions/mean_length": 1442.87890625,
      "completions/mean_terminated_length": 626.7981567382812,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.8544849364171716,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 66.00511986661476,
      "learning_rate": 1.572426187185334e-07,
      "loss": 0.012,
      "num_tokens": 2370823515.0,
      "reward": 0.0859375,
      "reward_std": 0.12740197777748108,
      "rewards/accuracy_reward/mean": 0.08870967477560043,
      "rewards/accuracy_reward/std": 0.2846112847328186,
      "step": 2503
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1896.0,
      "completions/mean_length": 1560.91796875,
      "completions/mean_terminated_length": 646.955078125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8548263207305624,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 3.9006223281016066,
      "learning_rate": 1.5698122940937325e-07,
      "loss": 0.005,
      "num_tokens": 2371692049.0,
      "reward": 0.013671875,
      "reward_std": 0.0234375,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "step": 2504
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1948.0,
      "completions/mean_length": 1723.134765625,
      "completions/mean_terminated_length": 661.9083862304688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8551677050439532,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 29.66417409840493,
      "learning_rate": 1.5672039791847385e-07,
      "loss": 0.0113,
      "num_tokens": 2372644150.0,
      "reward": 0.052734375,
      "reward_std": 0.08868034183979034,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 2505
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1976.0,
      "completions/mean_length": 1617.95703125,
      "completions/mean_terminated_length": 654.4430541992188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.855509089357344,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 69.17180149536003,
      "learning_rate": 1.5646012461603773e-07,
      "loss": 0.0094,
      "num_tokens": 2373550160.0,
      "reward": 0.017578125,
      "reward_std": 0.050508126616477966,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "step": 2506
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.728515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1979.0,
      "completions/mean_length": 1668.06640625,
      "completions/mean_terminated_length": 648.5324096679688,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.8558504736707349,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 14.844015147095856,
      "learning_rate": 1.5620040987147536e-07,
      "loss": 0.0215,
      "num_tokens": 2374483346.0,
      "reward": 0.03125,
      "reward_std": 0.06657323986291885,
      "rewards/accuracy_reward/mean": 0.032258063554763794,
      "rewards/accuracy_reward/std": 0.17686308920383453,
      "step": 2507
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.693359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1977.0,
      "completions/mean_length": 1606.63671875,
      "completions/mean_terminated_length": 608.6497192382812,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8561918579841257,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 23.364996652415677,
      "learning_rate": 1.559412540534043e-07,
      "loss": 0.0005,
      "num_tokens": 2375383816.0,
      "reward": 0.0546875,
      "reward_std": 0.09908141195774078,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 2508
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1953.0,
      "completions/mean_length": 1696.10546875,
      "completions/mean_terminated_length": 640.421875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8565332422975165,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 17.16084546199387,
      "learning_rate": 1.5568265752964865e-07,
      "loss": 0.0067,
      "num_tokens": 2376329614.0,
      "reward": 0.029296875,
      "reward_std": 0.05870648846030235,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 2509
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1472.0,
      "completions/mean_length": 1746.1953125,
      "completions/mean_terminated_length": 562.75,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.8568746266109072,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 29.371474981579752,
      "learning_rate": 1.5542462066723912e-07,
      "loss": 0.0284,
      "num_tokens": 2377307122.0,
      "reward": 0.09375,
      "reward_std": 0.1272030919790268,
      "rewards/accuracy_reward/mean": 0.09375,
      "rewards/accuracy_reward/std": 0.29176566004753113,
      "step": 2510
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1872.0,
      "completions/mean_length": 1629.189453125,
      "completions/mean_terminated_length": 599.1419067382812,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.857216010924298,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 14.070377713547375,
      "learning_rate": 1.551671438324116e-07,
      "loss": 0.0028,
      "num_tokens": 2378229907.0,
      "reward": 0.052734375,
      "reward_std": 0.06271954625844955,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 2511
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.728515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1882.0,
      "completions/mean_length": 1686.822265625,
      "completions/mean_terminated_length": 717.6187133789062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8575573952376888,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 47.8749366852237,
      "learning_rate": 1.549102273906076e-07,
      "loss": 0.0095,
      "num_tokens": 2379166776.0,
      "reward": 0.05078125,
      "reward_std": 0.10232867300510406,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 2512
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.755859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1959.0,
      "completions/mean_length": 1691.681640625,
      "completions/mean_terminated_length": 588.52001953125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8578987795510796,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 31.53900131626096,
      "learning_rate": 1.5465387170647284e-07,
      "loss": 0.0096,
      "num_tokens": 2380113589.0,
      "reward": 0.05859375,
      "reward_std": 0.10085200518369675,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 2513
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.751953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1986.0,
      "completions/mean_length": 1708.517578125,
      "completions/mean_terminated_length": 679.3779296875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.8582401638644704,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 83.50638973783518,
      "learning_rate": 1.5439807714385747e-07,
      "loss": 0.0194,
      "num_tokens": 2381069694.0,
      "reward": 0.0625,
      "reward_std": 0.07863742858171463,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 2514
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2021.0,
      "completions/mean_length": 1691.732421875,
      "completions/mean_terminated_length": 706.7573852539062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8585815481778613,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 71.44569889232295,
      "learning_rate": 1.54142844065815e-07,
      "loss": 0.0243,
      "num_tokens": 2382012053.0,
      "reward": 0.048828125,
      "reward_std": 0.06519509106874466,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 2515
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.744140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1959.0,
      "completions/mean_length": 1688.953125,
      "completions/mean_terminated_length": 644.7022705078125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8589229324912521,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 20.395569760908234,
      "learning_rate": 1.5388817283460205e-07,
      "loss": 0.0098,
      "num_tokens": 2382955405.0,
      "reward": 0.017578125,
      "reward_std": 0.043680962175130844,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "step": 2516
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.794921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1680.0,
      "completions/mean_length": 1748.953125,
      "completions/mean_terminated_length": 589.79052734375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8592643168046429,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 24.08760944224268,
      "learning_rate": 1.5363406381167798e-07,
      "loss": -0.0025,
      "num_tokens": 2383926293.0,
      "reward": 0.046875,
      "reward_std": 0.06948098540306091,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 2517
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.759765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1978.0,
      "completions/mean_length": 1709.533203125,
      "completions/mean_terminated_length": 639.0975341796875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8596057011180336,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 53.0481258390873,
      "learning_rate": 1.533805173577039e-07,
      "loss": 0.0253,
      "num_tokens": 2384881878.0,
      "reward": 0.09765625,
      "reward_std": 0.12600576877593994,
      "rewards/accuracy_reward/mean": 0.09765625,
      "rewards/accuracy_reward/std": 0.29713961482048035,
      "step": 2518
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.787109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1753.716796875,
      "completions/mean_terminated_length": 666.2201538085938,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8599470854314244,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 18.965083164635452,
      "learning_rate": 1.531275338325429e-07,
      "loss": 0.0109,
      "num_tokens": 2385863637.0,
      "reward": 0.05859375,
      "reward_std": 0.08413515239953995,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 2519
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.70703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1944.0,
      "completions/mean_length": 1642.27734375,
      "completions/mean_terminated_length": 663.1333618164062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8602884697448152,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 24.364264106239652,
      "learning_rate": 1.528751135952585e-07,
      "loss": 0.0268,
      "num_tokens": 2386783107.0,
      "reward": 0.056640625,
      "reward_std": 0.08557207137346268,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 2520
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.705078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1801.0,
      "completions/mean_length": 1621.244140625,
      "completions/mean_terminated_length": 600.9867553710938,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.860629854058206,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 37.276246095158555,
      "learning_rate": 1.5262325700411534e-07,
      "loss": 0.0207,
      "num_tokens": 2387693648.0,
      "reward": 0.0859375,
      "reward_std": 0.1260918527841568,
      "rewards/accuracy_reward/mean": 0.0859375,
      "rewards/accuracy_reward/std": 0.28054583072662354,
      "step": 2521
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.759765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2015.0,
      "completions/mean_length": 1745.94921875,
      "completions/mean_terminated_length": 791.0650024414062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8609712383715968,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 79.12150500995565,
      "learning_rate": 1.5237196441657767e-07,
      "loss": 0.0188,
      "num_tokens": 2388668902.0,
      "reward": 0.0390625,
      "reward_std": 0.09187985956668854,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 2522
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.705078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 1618.66015625,
      "completions/mean_terminated_length": 592.2251586914062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8613126226849876,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 13.573922224854496,
      "learning_rate": 1.5212123618930924e-07,
      "loss": 0.0222,
      "num_tokens": 2389574200.0,
      "reward": 0.087890625,
      "reward_std": 0.11575096100568771,
      "rewards/accuracy_reward/mean": 0.087890625,
      "rewards/accuracy_reward/std": 0.2834126651287079,
      "step": 2523
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.724609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2025.0,
      "completions/mean_length": 1625.2890625,
      "completions/mean_terminated_length": 513.0496215820312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8616540069983785,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 144.09319928591722,
      "learning_rate": 1.518710726781731e-07,
      "loss": 0.0222,
      "num_tokens": 2390486444.0,
      "reward": 0.115234375,
      "reward_std": 0.16641941666603088,
      "rewards/accuracy_reward/mean": 0.115234375,
      "rewards/accuracy_reward/std": 0.3196168541908264,
      "step": 2524
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.70703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2026.0,
      "completions/mean_length": 1646.890625,
      "completions/mean_terminated_length": 678.8800048828125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8619953913117693,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 75.02715437384968,
      "learning_rate": 1.5162147423823043e-07,
      "loss": 0.0129,
      "num_tokens": 2391402804.0,
      "reward": 0.08203125,
      "reward_std": 0.12615203857421875,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 2525
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.70703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1776.0,
      "completions/mean_length": 1623.779296875,
      "completions/mean_terminated_length": 600.3800048828125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.86233677562516,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 28.723624270462768,
      "learning_rate": 1.5137244122374076e-07,
      "loss": 0.0088,
      "num_tokens": 2392314467.0,
      "reward": 0.0703125,
      "reward_std": 0.0896916538476944,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 2526
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.751953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1779.0,
      "completions/mean_length": 1679.84375,
      "completions/mean_terminated_length": 563.779541015625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.8626781599385508,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 0.2915581539763728,
      "learning_rate": 1.5112397398816076e-07,
      "loss": 0.0092,
      "num_tokens": 2393255619.0,
      "reward": 0.013671875,
      "reward_std": 0.030584799125790596,
      "rewards/accuracy_reward/mean": 0.01411290280520916,
      "rewards/accuracy_reward/std": 0.11807557195425034,
      "step": 2527
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.818359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1801.044921875,
      "completions/mean_terminated_length": 688.4193725585938,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.8630195442519416,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 64.31801265760994,
      "learning_rate": 1.508760728841442e-07,
      "loss": -0.0004,
      "num_tokens": 2394273466.0,
      "reward": 0.046875,
      "reward_std": 0.07494450360536575,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 2528
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.728515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1793.0,
      "completions/mean_length": 1646.798828125,
      "completions/mean_terminated_length": 570.1942749023438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8633609285653324,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 29.97471088726863,
      "learning_rate": 1.506287382635415e-07,
      "loss": 0.0087,
      "num_tokens": 2395192803.0,
      "reward": 0.052734375,
      "reward_std": 0.07285481691360474,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 2529
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.666015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2024.0,
      "completions/mean_length": 1570.09375,
      "completions/mean_terminated_length": 617.0760498046875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8637023128787232,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 23.7320107106578,
      "learning_rate": 1.503819704773987e-07,
      "loss": -0.0002,
      "num_tokens": 2396068515.0,
      "reward": 0.05859375,
      "reward_std": 0.10542967170476913,
      "rewards/accuracy_reward/mean": 0.060483869165182114,
      "rewards/accuracy_reward/std": 0.2386218160390854,
      "step": 2530
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 1630.630859375,
      "completions/mean_terminated_length": 678.173095703125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.864043697192114,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 27.72477845038621,
      "learning_rate": 1.501357698759578e-07,
      "loss": 0.0051,
      "num_tokens": 2396978726.0,
      "reward": 0.060546875,
      "reward_std": 0.06095694378018379,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 2531
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.712890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1926.0,
      "completions/mean_length": 1640.2734375,
      "completions/mean_terminated_length": 627.89111328125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8643850815055049,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 26.18784945468839,
      "learning_rate": 1.498901368086553e-07,
      "loss": 0.0133,
      "num_tokens": 2397899538.0,
      "reward": 0.041015625,
      "reward_std": 0.09820909798145294,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 2532
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1705.0,
      "completions/mean_length": 1623.423828125,
      "completions/mean_terminated_length": 654.5192260742188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.8647264658188957,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 47.10658308738134,
      "learning_rate": 1.4964507162412268e-07,
      "loss": -0.0061,
      "num_tokens": 2398802971.0,
      "reward": 0.05859375,
      "reward_std": 0.09919347614049911,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 2533
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.72265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1659.291015625,
      "completions/mean_terminated_length": 646.457763671875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.8650678501322864,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 53.06636707655769,
      "learning_rate": 1.4940057467018482e-07,
      "loss": -0.0004,
      "num_tokens": 2399730896.0,
      "reward": 0.08984375,
      "reward_std": 0.13326609134674072,
      "rewards/accuracy_reward/mean": 0.08984375,
      "rewards/accuracy_reward/std": 0.2862374484539032,
      "step": 2534
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2036.0,
      "completions/mean_length": 1634.45703125,
      "completions/mean_terminated_length": 617.3648681640625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8654092344456772,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 62.99446167026236,
      "learning_rate": 1.491566462938605e-07,
      "loss": 0.0155,
      "num_tokens": 2400654618.0,
      "reward": 0.03125,
      "reward_std": 0.08054865896701813,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 2535
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.740234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1875.0,
      "completions/mean_length": 1668.517578125,
      "completions/mean_terminated_length": 587.1353759765625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.865750618759068,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 32.41018710459828,
      "learning_rate": 1.489132868413617e-07,
      "loss": 0.011,
      "num_tokens": 2401586403.0,
      "reward": 0.0703125,
      "reward_std": 0.10084541141986847,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 2536
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.708984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2027.0,
      "completions/mean_length": 1645.125,
      "completions/mean_terminated_length": 663.6241455078125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8660920030724588,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 15.164600057614315,
      "learning_rate": 1.4867049665809232e-07,
      "loss": 0.0121,
      "num_tokens": 2402508771.0,
      "reward": 0.078125,
      "reward_std": 0.09011821448802948,
      "rewards/accuracy_reward/mean": 0.08064515888690948,
      "rewards/accuracy_reward/std": 0.2725643217563629,
      "step": 2537
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.77734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1929.0,
      "completions/mean_length": 1747.642578125,
      "completions/mean_terminated_length": 699.0263061523438,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.8664333873858496,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 51.61400381805593,
      "learning_rate": 1.4842827608864886e-07,
      "loss": 0.0307,
      "num_tokens": 2403482924.0,
      "reward": 0.125,
      "reward_std": 0.1436934620141983,
      "rewards/accuracy_reward/mean": 0.12903225421905518,
      "rewards/accuracy_reward/std": 0.33557409048080444,
      "step": 2538
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.681640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1994.0,
      "completions/mean_length": 1574.974609375,
      "completions/mean_terminated_length": 562.1779174804688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8667747716992404,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 64.91194608452355,
      "learning_rate": 1.48186625476819e-07,
      "loss": 0.014,
      "num_tokens": 2404367519.0,
      "reward": 0.060546875,
      "reward_std": 0.10074453055858612,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 2539
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.720703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1778.0,
      "completions/mean_length": 1650.431640625,
      "completions/mean_terminated_length": 624.5384521484375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8671161560126313,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 43.304218139688395,
      "learning_rate": 1.4794554516558166e-07,
      "loss": 0.0231,
      "num_tokens": 2405287612.0,
      "reward": 0.046875,
      "reward_std": 0.07340699434280396,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 2540
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.689453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1967.0,
      "completions/mean_length": 1633.46484375,
      "completions/mean_terminated_length": 713.1446533203125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.8674575403260221,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 26.235841971079545,
      "learning_rate": 1.477050354971061e-07,
      "loss": -0.0008,
      "num_tokens": 2406205754.0,
      "reward": 0.01171875,
      "reward_std": 0.046875,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "step": 2541
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.720703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1923.0,
      "completions/mean_length": 1670.884765625,
      "completions/mean_terminated_length": 698.468505859375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8677989246394128,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 47.57754469776387,
      "learning_rate": 1.474650968127518e-07,
      "loss": -0.0035,
      "num_tokens": 2407149679.0,
      "reward": 0.015625,
      "reward_std": 0.051659777760505676,
      "rewards/accuracy_reward/mean": 0.015625,
      "rewards/accuracy_reward/std": 0.12414088100194931,
      "step": 2542
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.736328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1810.0,
      "completions/mean_length": 1665.06640625,
      "completions/mean_terminated_length": 595.6889038085938,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8681403089528036,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 115.63878532873551,
      "learning_rate": 1.4722572945306812e-07,
      "loss": 0.002,
      "num_tokens": 2408089649.0,
      "reward": 0.103515625,
      "reward_std": 0.1329134702682495,
      "rewards/accuracy_reward/mean": 0.103515625,
      "rewards/accuracy_reward/std": 0.30492907762527466,
      "step": 2543
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1776.0,
      "completions/mean_length": 1667.603515625,
      "completions/mean_terminated_length": 656.835693359375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8684816932661944,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 21.627427738617595,
      "learning_rate": 1.4698693375779296e-07,
      "loss": 0.0064,
      "num_tokens": 2409025430.0,
      "reward": 0.0078125,
      "reward_std": 0.02629890665411949,
      "rewards/accuracy_reward/mean": 0.0078125,
      "rewards/accuracy_reward/std": 0.08812850713729858,
      "step": 2544
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1970.0,
      "completions/mean_length": 1651.248046875,
      "completions/mean_terminated_length": 597.0214233398438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8688230775795852,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 72.38246405724892,
      "learning_rate": 1.4674871006585326e-07,
      "loss": 0.0071,
      "num_tokens": 2409941989.0,
      "reward": 0.025390625,
      "reward_std": 0.05793476477265358,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 2545
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1975.0,
      "completions/mean_length": 1652.400390625,
      "completions/mean_terminated_length": 679.439208984375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.869164461892976,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 29.746942194299564,
      "learning_rate": 1.46511058715364e-07,
      "loss": 0.0068,
      "num_tokens": 2410874594.0,
      "reward": 0.03125,
      "reward_std": 0.06558094918727875,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 2546
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2007.0,
      "completions/mean_length": 1657.62109375,
      "completions/mean_terminated_length": 578.3382568359375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.8695058462063668,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 12.019235556000336,
      "learning_rate": 1.4627398004362774e-07,
      "loss": 0.0076,
      "num_tokens": 2411803520.0,
      "reward": 0.021484375,
      "reward_std": 0.05287160724401474,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 2547
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.716796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2031.0,
      "completions/mean_length": 1624.291015625,
      "completions/mean_terminated_length": 551.8689575195312,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.8698472305197577,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 21.472414184119458,
      "learning_rate": 1.4603747438713426e-07,
      "loss": -0.0065,
      "num_tokens": 2412713861.0,
      "reward": 0.064453125,
      "reward_std": 0.08725585043430328,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 2548
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.634765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1950.0,
      "completions/mean_length": 1524.498046875,
      "completions/mean_terminated_length": 614.66845703125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8701886148331485,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 61.347934229940904,
      "learning_rate": 1.458015420815601e-07,
      "loss": 0.0112,
      "num_tokens": 2413578116.0,
      "reward": 0.052734375,
      "reward_std": 0.09653191268444061,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 2549
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.705078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1996.0,
      "completions/mean_length": 1626.38671875,
      "completions/mean_terminated_length": 618.423828125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.8705299991465392,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 33.136947453045245,
      "learning_rate": 1.4556618346176813e-07,
      "loss": -0.0029,
      "num_tokens": 2414490810.0,
      "reward": 0.06640625,
      "reward_std": 0.10300742089748383,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "step": 2550
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 1584.25390625,
      "completions/mean_terminated_length": 564.0125122070312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.87087138345993,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 16.613480975147798,
      "learning_rate": 1.453313988618067e-07,
      "loss": 0.0278,
      "num_tokens": 2415366460.0,
      "reward": 0.06640625,
      "reward_std": 0.10124340653419495,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "step": 2551
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1932.0,
      "completions/mean_length": 1664.92578125,
      "completions/mean_terminated_length": 647.3928833007812,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.8712127677733208,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 42.12043570543552,
      "learning_rate": 1.4509718861490983e-07,
      "loss": 0.0288,
      "num_tokens": 2416302118.0,
      "reward": 0.072265625,
      "reward_std": 0.11970524489879608,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 2552
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.70703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2036.0,
      "completions/mean_length": 1654.19921875,
      "completions/mean_terminated_length": 703.82666015625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8715541520867116,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 5.943678718648693,
      "learning_rate": 1.4486355305349583e-07,
      "loss": 0.0171,
      "num_tokens": 2417226860.0,
      "reward": 0.01953125,
      "reward_std": 0.04505910724401474,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 2553
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.70703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1917.0,
      "completions/mean_length": 1631.265625,
      "completions/mean_terminated_length": 625.5466918945312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8718955364001024,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 40.84797510998624,
      "learning_rate": 1.4463049250916792e-07,
      "loss": 0.013,
      "num_tokens": 2418145748.0,
      "reward": 0.06640625,
      "reward_std": 0.09205513447523117,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "step": 2554
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2018.0,
      "completions/mean_length": 1679.697265625,
      "completions/mean_terminated_length": 756.4177856445312,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.8722369207134932,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 78.3288705860024,
      "learning_rate": 1.4439800731271267e-07,
      "loss": 0.0047,
      "num_tokens": 2419088041.0,
      "reward": 0.033203125,
      "reward_std": 0.06755761057138443,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 2555
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.748046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1957.0,
      "completions/mean_length": 1668.708984375,
      "completions/mean_terminated_length": 542.596923828125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.872578305026884,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 58.931756272459666,
      "learning_rate": 1.4416609779410049e-07,
      "loss": 0.0227,
      "num_tokens": 2420028740.0,
      "reward": 0.09375,
      "reward_std": 0.13984671235084534,
      "rewards/accuracy_reward/mean": 0.09677419066429138,
      "rewards/accuracy_reward/std": 0.2959485352039337,
      "step": 2556
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1993.0,
      "completions/mean_length": 1634.37109375,
      "completions/mean_terminated_length": 707.6329345703125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8729196893402749,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 118.9299061542486,
      "learning_rate": 1.4393476428248442e-07,
      "loss": 0.0216,
      "num_tokens": 2420939122.0,
      "reward": 0.05078125,
      "reward_std": 0.11877964437007904,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 2557
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1913.0,
      "completions/mean_length": 1638.830078125,
      "completions/mean_terminated_length": 613.4794311523438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8732610736536656,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 19.442090108449733,
      "learning_rate": 1.4370400710620017e-07,
      "loss": 0.017,
      "num_tokens": 2421855403.0,
      "reward": 0.05078125,
      "reward_std": 0.10519799590110779,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 2558
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.650390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1557.6640625,
      "completions/mean_terminated_length": 645.474853515625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8736024579670564,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 26.246613710760492,
      "learning_rate": 1.4347382659276529e-07,
      "loss": 0.007,
      "num_tokens": 2422730047.0,
      "reward": 0.0390625,
      "reward_std": 0.08109389245510101,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 2559
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.751953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1927.0,
      "completions/mean_length": 1690.6171875,
      "completions/mean_terminated_length": 607.2125854492188,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8739438422804472,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 44.743599172277605,
      "learning_rate": 1.4324422306887873e-07,
      "loss": 0.0246,
      "num_tokens": 2423676843.0,
      "reward": 0.09765625,
      "reward_std": 0.14750081300735474,
      "rewards/accuracy_reward/mean": 0.09765625,
      "rewards/accuracy_reward/std": 0.29713961482048035,
      "step": 2560
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.763671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1842.0,
      "completions/mean_length": 1738.49609375,
      "completions/mean_terminated_length": 738.3635864257812,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.874285226593838,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 32.515682819638286,
      "learning_rate": 1.430151968604211e-07,
      "loss": 0.0201,
      "num_tokens": 2424639177.0,
      "reward": 0.08203125,
      "reward_std": 0.09777135401964188,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 2561
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2020.0,
      "completions/mean_length": 1604.173828125,
      "completions/mean_terminated_length": 662.3963012695312,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.8746266109072288,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 21.455049688208057,
      "learning_rate": 1.4278674829245282e-07,
      "loss": 0.0244,
      "num_tokens": 2425539090.0,
      "reward": 0.087890625,
      "reward_std": 0.12038947641849518,
      "rewards/accuracy_reward/mean": 0.087890625,
      "rewards/accuracy_reward/std": 0.2834126651287079,
      "step": 2562
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1895.0,
      "completions/mean_length": 1620.07421875,
      "completions/mean_terminated_length": 606.5657958984375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.8749679952206196,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 103.71023040941192,
      "learning_rate": 1.425588776892151e-07,
      "loss": -0.0003,
      "num_tokens": 2426447272.0,
      "reward": 0.08984375,
      "reward_std": 0.08832141757011414,
      "rewards/accuracy_reward/mean": 0.0927419364452362,
      "rewards/accuracy_reward/std": 0.2903633117675781,
      "step": 2563
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1862.0,
      "completions/mean_length": 1665.513671875,
      "completions/mean_terminated_length": 707.0684814453125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8753093795340104,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 52.748714641869284,
      "learning_rate": 1.423315853741285e-07,
      "loss": 0.0112,
      "num_tokens": 2427390847.0,
      "reward": 0.02734375,
      "reward_std": 0.057157501578330994,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 2564
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.673828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1897.0,
      "completions/mean_length": 1594.484375,
      "completions/mean_terminated_length": 657.5808715820312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8756507638474013,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 52.669373645234714,
      "learning_rate": 1.4210487166979283e-07,
      "loss": 0.0136,
      "num_tokens": 2428283591.0,
      "reward": 0.0625,
      "reward_std": 0.11487960815429688,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 2565
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2005.0,
      "completions/mean_length": 1682.97265625,
      "completions/mean_terminated_length": 632.1363525390625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.875992148160792,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 186.6926399034019,
      "learning_rate": 1.4187873689798684e-07,
      "loss": 0.018,
      "num_tokens": 2429225849.0,
      "reward": 0.02734375,
      "reward_std": 0.07779236882925034,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 2566
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.634765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1831.0,
      "completions/mean_length": 1525.42578125,
      "completions/mean_terminated_length": 617.2085571289062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8763335324741828,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 38.87229133279316,
      "learning_rate": 1.416531813796674e-07,
      "loss": 0.0384,
      "num_tokens": 2430085747.0,
      "reward": 0.111328125,
      "reward_std": 0.14585818350315094,
      "rewards/accuracy_reward/mean": 0.111328125,
      "rewards/accuracy_reward/std": 0.31484565138816833,
      "step": 2567
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.689453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1890.0,
      "completions/mean_length": 1616.32421875,
      "completions/mean_terminated_length": 657.9496459960938,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.8766749167875736,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 28.3617982027029,
      "learning_rate": 1.4142820543496936e-07,
      "loss": 0.0141,
      "num_tokens": 2430985737.0,
      "reward": 0.037109375,
      "reward_std": 0.07768725603818893,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 2568
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.677734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1614.0,
      "completions/mean_length": 1588.41796875,
      "completions/mean_terminated_length": 621.9030151367188,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8770163011009644,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 5.259464741690593,
      "learning_rate": 1.4120380938320487e-07,
      "loss": 0.0259,
      "num_tokens": 2431869999.0,
      "reward": 0.0546875,
      "reward_std": 0.09341736882925034,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 2569
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.712890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1646.712890625,
      "completions/mean_terminated_length": 650.3197021484375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8773576854143552,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 56.90335062692698,
      "learning_rate": 1.4097999354286316e-07,
      "loss": 0.0256,
      "num_tokens": 2432796780.0,
      "reward": 0.048828125,
      "reward_std": 0.10068464279174805,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 2570
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.744140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1865.0,
      "completions/mean_length": 1668.396484375,
      "completions/mean_terminated_length": 564.3587646484375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.877699069727746,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 10.67960608019363,
      "learning_rate": 1.4075675823160982e-07,
      "loss": 0.0017,
      "num_tokens": 2433720279.0,
      "reward": 0.033203125,
      "reward_std": 0.06536141037940979,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 2571
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2038.0,
      "completions/mean_length": 1652.53125,
      "completions/mean_terminated_length": 642.263916015625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8780404540411368,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 10.622973922019595,
      "learning_rate": 1.4053410376628647e-07,
      "loss": 0.0011,
      "num_tokens": 2434650695.0,
      "reward": 0.037109375,
      "reward_std": 0.06024399772286415,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 2572
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1788.0,
      "completions/mean_length": 1610.2578125,
      "completions/mean_terminated_length": 647.2250366210938,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.8783818383545277,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 27.888683357959476,
      "learning_rate": 1.403120304629106e-07,
      "loss": -0.0058,
      "num_tokens": 2435552203.0,
      "reward": 0.0390625,
      "reward_std": 0.06936796009540558,
      "rewards/accuracy_reward/mean": 0.04032257944345474,
      "rewards/accuracy_reward/std": 0.19691328704357147,
      "step": 2573
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.720703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1951.0,
      "completions/mean_length": 1631.005859375,
      "completions/mean_terminated_length": 555.3916015625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.8787232226679184,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 35.48613274007586,
      "learning_rate": 1.4009053863667448e-07,
      "loss": 0.0093,
      "num_tokens": 2436467262.0,
      "reward": 0.044921875,
      "reward_std": 0.0922069400548935,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 2574
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.681640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1882.0,
      "completions/mean_length": 1592.49609375,
      "completions/mean_terminated_length": 617.2147216796875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.8790646069813092,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 146.72170005130323,
      "learning_rate": 1.3986962860194528e-07,
      "loss": 0.0105,
      "num_tokens": 2437359852.0,
      "reward": 0.0546875,
      "reward_std": 0.08855310082435608,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 2575
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.689453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1768.0,
      "completions/mean_length": 1608.451171875,
      "completions/mean_terminated_length": 632.5974731445312,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.8794059912947,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 23.59732952144052,
      "learning_rate": 1.396493006722645e-07,
      "loss": 0.0246,
      "num_tokens": 2438257075.0,
      "reward": 0.0546875,
      "reward_std": 0.07218398153781891,
      "rewards/accuracy_reward/mean": 0.05645161122083664,
      "rewards/accuracy_reward/std": 0.23102475702762604,
      "step": 2576
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.701171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1993.0,
      "completions/mean_length": 1651.1171875,
      "completions/mean_terminated_length": 719.8692626953125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8797473756080908,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 261.621415972833,
      "learning_rate": 1.3942955516034715e-07,
      "loss": 0.0175,
      "num_tokens": 2439178463.0,
      "reward": 0.068359375,
      "reward_std": 0.10837717354297638,
      "rewards/accuracy_reward/mean": 0.068359375,
      "rewards/accuracy_reward/std": 0.25260838866233826,
      "step": 2577
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.705078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2013.0,
      "completions/mean_length": 1656.314453125,
      "completions/mean_terminated_length": 720.25830078125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8800887599214816,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 24.462477399923486,
      "learning_rate": 1.3921039237808198e-07,
      "loss": 0.0148,
      "num_tokens": 2440117024.0,
      "reward": 0.0234375,
      "reward_std": 0.05479498207569122,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 2578
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.705078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1626.298828125,
      "completions/mean_terminated_length": 618.1258544921875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8804301442348724,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 101.11125030362852,
      "learning_rate": 1.3899181263653026e-07,
      "loss": -0.0097,
      "num_tokens": 2441026633.0,
      "reward": 0.0859375,
      "reward_std": 0.09270582348108292,
      "rewards/accuracy_reward/mean": 0.0859375,
      "rewards/accuracy_reward/std": 0.28054583072662354,
      "step": 2579
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.697265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 1630.314453125,
      "completions/mean_terminated_length": 668.290283203125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.8807715285482632,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 16.90407167905349,
      "learning_rate": 1.3877381624592616e-07,
      "loss": 0.018,
      "num_tokens": 2441932122.0,
      "reward": 0.064453125,
      "reward_std": 0.10746511071920395,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 2580
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.759765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2013.0,
      "completions/mean_length": 1697.818359375,
      "completions/mean_terminated_length": 590.8292236328125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.881112912861654,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 7.255693142079199,
      "learning_rate": 1.3855640351567553e-07,
      "loss": 0.0104,
      "num_tokens": 2442887453.0,
      "reward": 0.02734375,
      "reward_std": 0.06079617515206337,
      "rewards/accuracy_reward/mean": 0.02822580561041832,
      "rewards/accuracy_reward/std": 0.1657845675945282,
      "step": 2581
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.716796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2008.0,
      "completions/mean_length": 1639.5078125,
      "completions/mean_terminated_length": 605.5999755859375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8814542971750448,
      "frac_reward_zero_std": 0.96875,
      "grad_norm": 0.04592259591068291,
      "learning_rate": 1.3833957475435613e-07,
      "loss": 0.0048,
      "num_tokens": 2443809649.0,
      "reward": 0.001953125,
      "reward_std": 0.0078125,
      "rewards/accuracy_reward/mean": 0.001953125,
      "rewards/accuracy_reward/std": 0.04419417306780815,
      "step": 2582
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1975.0,
      "completions/mean_length": 1607.474609375,
      "completions/mean_terminated_length": 673.0304565429688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8817956814884356,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 168.8098879434718,
      "learning_rate": 1.3812333026971663e-07,
      "loss": 0.0056,
      "num_tokens": 2444715348.0,
      "reward": 0.05078125,
      "reward_std": 0.12114076316356659,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 2583
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.72265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2020.0,
      "completions/mean_length": 1674.541015625,
      "completions/mean_terminated_length": 701.4436645507812,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8821370658018264,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 12.044235324881859,
      "learning_rate": 1.3790767036867645e-07,
      "loss": 0.0013,
      "num_tokens": 2445649193.0,
      "reward": 0.0703125,
      "reward_std": 0.14334052801132202,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 2584
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1967.0,
      "completions/mean_length": 1562.564453125,
      "completions/mean_terminated_length": 651.6910400390625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8824784501152172,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 39.24333446966561,
      "learning_rate": 1.3769259535732561e-07,
      "loss": 0.0294,
      "num_tokens": 2446520474.0,
      "reward": 0.05078125,
      "reward_std": 0.07972268760204315,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 2585
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1872.0,
      "completions/mean_length": 1657.4765625,
      "completions/mean_terminated_length": 678.4931640625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.882819834428608,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 12.129675679783324,
      "learning_rate": 1.374781055409235e-07,
      "loss": 0.0386,
      "num_tokens": 2447442910.0,
      "reward": 0.083984375,
      "reward_std": 0.1506405919790268,
      "rewards/accuracy_reward/mean": 0.083984375,
      "rewards/accuracy_reward/std": 0.2776356339454651,
      "step": 2586
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.685546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2010.0,
      "completions/mean_length": 1587.4765625,
      "completions/mean_terminated_length": 583.478271484375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8831612187419988,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 94.50477724956566,
      "learning_rate": 1.372642012238993e-07,
      "loss": 0.018,
      "num_tokens": 2448332258.0,
      "reward": 0.06640625,
      "reward_std": 0.09734337776899338,
      "rewards/accuracy_reward/mean": 0.06854838877916336,
      "rewards/accuracy_reward/std": 0.25293970108032227,
      "step": 2587
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.724609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1781.0,
      "completions/mean_length": 1662.3203125,
      "completions/mean_terminated_length": 647.5177001953125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.8835026030553896,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 0.9545814130024881,
      "learning_rate": 1.3705088270985103e-07,
      "loss": 0.0173,
      "num_tokens": 2449257478.0,
      "reward": 0.0234375,
      "reward_std": 0.05754890665411949,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 2588
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2011.0,
      "completions/mean_length": 1605.177734375,
      "completions/mean_terminated_length": 630.96875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.8838439873687804,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 28.325213225573407,
      "learning_rate": 1.3683815030154538e-07,
      "loss": -0.0029,
      "num_tokens": 2450162497.0,
      "reward": 0.06640625,
      "reward_std": 0.0727282464504242,
      "rewards/accuracy_reward/mean": 0.06854838877916336,
      "rewards/accuracy_reward/std": 0.25293973088264465,
      "step": 2589
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.732421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1904.0,
      "completions/mean_length": 1654.89453125,
      "completions/mean_terminated_length": 579.211669921875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8841853716821712,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 38.9051811748272,
      "learning_rate": 1.3662600430091707e-07,
      "loss": 0.0054,
      "num_tokens": 2451091179.0,
      "reward": 0.02734375,
      "reward_std": 0.06492365896701813,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 2590
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1994.0,
      "completions/mean_length": 1562.1484375,
      "completions/mean_terminated_length": 601.7442016601562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.884526755995562,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 99.74967040805672,
      "learning_rate": 1.3641444500906846e-07,
      "loss": 0.0209,
      "num_tokens": 2451960471.0,
      "reward": 0.09765625,
      "reward_std": 0.10045164078474045,
      "rewards/accuracy_reward/mean": 0.09765625,
      "rewards/accuracy_reward/std": 0.29713961482048035,
      "step": 2591
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.685546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2011.0,
      "completions/mean_length": 1606.28515625,
      "completions/mean_terminated_length": 643.2919311523438,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8848681403089528,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 94.91501332618131,
      "learning_rate": 1.3620347272626933e-07,
      "loss": 0.0181,
      "num_tokens": 2452852169.0,
      "reward": 0.109375,
      "reward_std": 0.15148106217384338,
      "rewards/accuracy_reward/mean": 0.109375,
      "rewards/accuracy_reward/std": 0.31241437792778015,
      "step": 2592
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.708984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2010.0,
      "completions/mean_length": 1648.7265625,
      "completions/mean_terminated_length": 676.0,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8852095246223436,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 81.26350004864784,
      "learning_rate": 1.359930877519562e-07,
      "loss": 0.014,
      "num_tokens": 2453772141.0,
      "reward": 0.013671875,
      "reward_std": 0.04973640665411949,
      "rewards/accuracy_reward/mean": 0.01411290280520916,
      "rewards/accuracy_reward/std": 0.11807556450366974,
      "step": 2593
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.67578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1864.0,
      "completions/mean_length": 1562.9375,
      "completions/mean_terminated_length": 551.903564453125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8855509089357344,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 21.468465891854425,
      "learning_rate": 1.3578329038473222e-07,
      "loss": 0.0287,
      "num_tokens": 2454645933.0,
      "reward": 0.10546875,
      "reward_std": 0.13625310361385345,
      "rewards/accuracy_reward/mean": 0.10546875,
      "rewards/accuracy_reward/std": 0.3074568510055542,
      "step": 2594
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1910.0,
      "completions/mean_length": 1643.763671875,
      "completions/mean_terminated_length": 610.7152709960938,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8858922932491252,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 91.10358357724911,
      "learning_rate": 1.355740809223662e-07,
      "loss": 0.0264,
      "num_tokens": 2455556692.0,
      "reward": 0.048828125,
      "reward_std": 0.07933682948350906,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 2595
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.759765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1694.693359375,
      "completions/mean_terminated_length": 577.3251953125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.886233677562516,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 9.031722356817268,
      "learning_rate": 1.3536545966179274e-07,
      "loss": 0.0237,
      "num_tokens": 2456496999.0,
      "reward": 0.03515625,
      "reward_std": 0.10998040437698364,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 2596
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.732421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2017.0,
      "completions/mean_length": 1659.37109375,
      "completions/mean_terminated_length": 595.6058349609375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8865750618759068,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 97.03339588721056,
      "learning_rate": 1.3515742689911166e-07,
      "loss": -0.0032,
      "num_tokens": 2457423269.0,
      "reward": 0.048828125,
      "reward_std": 0.07163181155920029,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 2597
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.646484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1892.0,
      "completions/mean_length": 1577.64453125,
      "completions/mean_terminated_length": 717.4917602539062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8869164461892975,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 9.032317677306038,
      "learning_rate": 1.3494998292958725e-07,
      "loss": -0.0011,
      "num_tokens": 2458305743.0,
      "reward": 0.044921875,
      "reward_std": 0.09528100490570068,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 2598
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1871.0,
      "completions/mean_length": 1603.654296875,
      "completions/mean_terminated_length": 510.8040771484375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8872578305026884,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 7.387011818490193,
      "learning_rate": 1.3474312804764853e-07,
      "loss": 0.0004,
      "num_tokens": 2459204798.0,
      "reward": 0.052734375,
      "reward_std": 0.06695909798145294,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 2599
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1935.0,
      "completions/mean_length": 1503.6328125,
      "completions/mean_terminated_length": 625.9795532226562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8875992148160792,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 34.552076243846884,
      "learning_rate": 1.345368625468879e-07,
      "loss": 0.0108,
      "num_tokens": 2460048114.0,
      "reward": 0.05078125,
      "reward_std": 0.10519799590110779,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 2600
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2016.0,
      "completions/mean_length": 1560.275390625,
      "completions/mean_terminated_length": 612.8563232421875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.88794059912947,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 4.611143230975061,
      "learning_rate": 1.3433118672006173e-07,
      "loss": 0.0063,
      "num_tokens": 2460920415.0,
      "reward": 0.03515625,
      "reward_std": 0.07662828266620636,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 2601
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2017.0,
      "completions/mean_length": 1698.681640625,
      "completions/mean_terminated_length": 693.0681762695312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8882819834428608,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 12.358095907107318,
      "learning_rate": 1.3412610085908912e-07,
      "loss": 0.0285,
      "num_tokens": 2461865180.0,
      "reward": 0.068359375,
      "reward_std": 0.09506140649318695,
      "rewards/accuracy_reward/mean": 0.07056451588869095,
      "rewards/accuracy_reward/std": 0.25635457038879395,
      "step": 2602
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1967.0,
      "completions/mean_length": 1710.029296875,
      "completions/mean_terminated_length": 652.508056640625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8886233677562516,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 14.946390105870801,
      "learning_rate": 1.3392160525505191e-07,
      "loss": 0.0104,
      "num_tokens": 2462815339.0,
      "reward": 0.041015625,
      "reward_std": 0.07328042387962341,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 2603
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1972.0,
      "completions/mean_length": 1592.13671875,
      "completions/mean_terminated_length": 675.0470581054688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.8889647520696424,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 21.693148858229854,
      "learning_rate": 1.3371770019819433e-07,
      "loss": 0.0397,
      "num_tokens": 2463702641.0,
      "reward": 0.080078125,
      "reward_std": 0.10896982997655869,
      "rewards/accuracy_reward/mean": 0.080078125,
      "rewards/accuracy_reward/std": 0.271679550409317,
      "step": 2604
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.73046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1893.0,
      "completions/mean_length": 1661.62890625,
      "completions/mean_terminated_length": 614.5072631835938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8893061363830332,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 62.488641186456604,
      "learning_rate": 1.3351438597792218e-07,
      "loss": 0.0064,
      "num_tokens": 2464628339.0,
      "reward": 0.06640625,
      "reward_std": 0.12857332825660706,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "step": 2605
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2021.0,
      "completions/mean_length": 1570.572265625,
      "completions/mean_terminated_length": 610.1000366210938,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.889647520696424,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 31.26239939018135,
      "learning_rate": 1.3331166288280295e-07,
      "loss": 0.0061,
      "num_tokens": 2465504680.0,
      "reward": 0.103515625,
      "reward_std": 0.1269710510969162,
      "rewards/accuracy_reward/mean": 0.103515625,
      "rewards/accuracy_reward/std": 0.30492907762527466,
      "step": 2606
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1890.0,
      "completions/mean_length": 1606.53125,
      "completions/mean_terminated_length": 599.076904296875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8899889050098148,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 36.76547732652226,
      "learning_rate": 1.3310953120056488e-07,
      "loss": 0.0218,
      "num_tokens": 2466401272.0,
      "reward": 0.064453125,
      "reward_std": 0.09385748207569122,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 2607
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.685546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1809.0,
      "completions/mean_length": 1610.966796875,
      "completions/mean_terminated_length": 658.1801147460938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8903302893232056,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 18.193858885167952,
      "learning_rate": 1.3290799121809702e-07,
      "loss": 0.0116,
      "num_tokens": 2467303351.0,
      "reward": 0.044921875,
      "reward_std": 0.06943464279174805,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 2608
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.669921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1893.0,
      "completions/mean_length": 1551.76171875,
      "completions/mean_terminated_length": 544.6035766601562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.8906716736365964,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 40.61312553932107,
      "learning_rate": 1.3270704322144832e-07,
      "loss": 0.026,
      "num_tokens": 2468171437.0,
      "reward": 0.0703125,
      "reward_std": 0.11123858392238617,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 2609
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1980.0,
      "completions/mean_length": 1623.931640625,
      "completions/mean_terminated_length": 690.9812622070312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8910130579499872,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 150.40773777458514,
      "learning_rate": 1.3250668749582782e-07,
      "loss": 0.0056,
      "num_tokens": 2469076618.0,
      "reward": 0.05078125,
      "reward_std": 0.08477335423231125,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 2610
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.70703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2021.0,
      "completions/mean_length": 1599.83984375,
      "completions/mean_terminated_length": 518.280029296875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.891354442263378,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 19.83605761959913,
      "learning_rate": 1.3230692432560403e-07,
      "loss": 0.0118,
      "num_tokens": 2469978424.0,
      "reward": 0.060546875,
      "reward_std": 0.08847981691360474,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 2611
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1518.25390625,
      "completions/mean_terminated_length": 605.2872314453125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8916958265767688,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 46.46193918051186,
      "learning_rate": 1.321077539943039e-07,
      "loss": 0.0334,
      "num_tokens": 2470828314.0,
      "reward": 0.056640625,
      "reward_std": 0.10715978592634201,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 2612
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.716796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1644.53515625,
      "completions/mean_terminated_length": 623.3517456054688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8920372108901596,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 3.2328445596171917,
      "learning_rate": 1.319091767846136e-07,
      "loss": -0.0086,
      "num_tokens": 2471753868.0,
      "reward": 0.015625,
      "reward_std": 0.03125,
      "rewards/accuracy_reward/mean": 0.015625,
      "rewards/accuracy_reward/std": 0.12414088100194931,
      "step": 2613
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.701171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2008.0,
      "completions/mean_length": 1618.69140625,
      "completions/mean_terminated_length": 611.3594970703125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8923785952035505,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 15.513461665661305,
      "learning_rate": 1.3171119297837686e-07,
      "loss": 0.0045,
      "num_tokens": 2472658318.0,
      "reward": 0.033203125,
      "reward_std": 0.07564390450716019,
      "rewards/accuracy_reward/mean": 0.03427419438958168,
      "rewards/accuracy_reward/std": 0.18211629986763,
      "step": 2614
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.740234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1677.0,
      "completions/mean_length": 1670.748046875,
      "completions/mean_terminated_length": 595.7218017578125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8927199795169412,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 21.5804476201088,
      "learning_rate": 1.3151380285659565e-07,
      "loss": 0.0243,
      "num_tokens": 2473589981.0,
      "reward": 0.056640625,
      "reward_std": 0.07965600490570068,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 2615
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.794921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1894.0,
      "completions/mean_length": 1755.296875,
      "completions/mean_terminated_length": 620.7238159179688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.893061363830332,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 37.49575633053217,
      "learning_rate": 1.3131700669942907e-07,
      "loss": 0.0061,
      "num_tokens": 2474571797.0,
      "reward": 0.0390625,
      "reward_std": 0.06999341398477554,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 2616
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.732421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1893.0,
      "completions/mean_length": 1660.486328125,
      "completions/mean_terminated_length": 599.7737426757812,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.8934027481437228,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 4.9263298705819425,
      "learning_rate": 1.3112080478619333e-07,
      "loss": 0.0013,
      "num_tokens": 2475496798.0,
      "reward": 0.037109375,
      "reward_std": 0.06069765239953995,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 2617
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.712890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1995.0,
      "completions/mean_length": 1641.166015625,
      "completions/mean_terminated_length": 631.0,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.8937441324571136,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 61.07437299990088,
      "learning_rate": 1.309251973953612e-07,
      "loss": 0.0291,
      "num_tokens": 2476416867.0,
      "reward": 0.04296875,
      "reward_std": 0.0878089889883995,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 2618
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1917.0,
      "completions/mean_length": 1643.3203125,
      "completions/mean_terminated_length": 648.0270385742188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8940855167705044,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 4.163058106177084,
      "learning_rate": 1.3073018480456148e-07,
      "loss": 0.0023,
      "num_tokens": 2477339863.0,
      "reward": 0.015625,
      "reward_std": 0.040274329483509064,
      "rewards/accuracy_reward/mean": 0.015625,
      "rewards/accuracy_reward/std": 0.12414088100194931,
      "step": 2619
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1959.0,
      "completions/mean_length": 1594.013671875,
      "completions/mean_terminated_length": 681.0352783203125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8944269010838952,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 28.762413729651612,
      "learning_rate": 1.3053576729057902e-07,
      "loss": 0.0356,
      "num_tokens": 2478239342.0,
      "reward": 0.083984375,
      "reward_std": 0.14133691787719727,
      "rewards/accuracy_reward/mean": 0.083984375,
      "rewards/accuracy_reward/std": 0.2776356339454651,
      "step": 2620
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.740234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1853.0,
      "completions/mean_length": 1676.376953125,
      "completions/mean_terminated_length": 617.3909912109375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.894768285397286,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 9.086692749327831,
      "learning_rate": 1.3034194512935377e-07,
      "loss": 0.0108,
      "num_tokens": 2479177359.0,
      "reward": 0.03125,
      "reward_std": 0.06783141195774078,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 2621
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.72265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1942.0,
      "completions/mean_length": 1638.021484375,
      "completions/mean_terminated_length": 569.767578125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8951096697106768,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 60.892173662204,
      "learning_rate": 1.3014871859598092e-07,
      "loss": -0.0016,
      "num_tokens": 2480100122.0,
      "reward": 0.064453125,
      "reward_std": 0.09315156936645508,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 2622
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.720703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1982.0,
      "completions/mean_length": 1709.080078125,
      "completions/mean_terminated_length": 834.5244750976562,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8954510540240675,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 46.55378966705435,
      "learning_rate": 1.299560879647101e-07,
      "loss": 0.0145,
      "num_tokens": 2481051299.0,
      "reward": 0.044921875,
      "reward_std": 0.09217274188995361,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 2623
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.68359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2028.0,
      "completions/mean_length": 1602.63671875,
      "completions/mean_terminated_length": 640.43212890625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8957924383374584,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 16.744258406877986,
      "learning_rate": 1.2976405350894536e-07,
      "loss": 0.0148,
      "num_tokens": 2481945993.0,
      "reward": 0.0390625,
      "reward_std": 0.061308603733778,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 2624
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.716796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1895.0,
      "completions/mean_length": 1640.171875,
      "completions/mean_terminated_length": 607.94482421875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8961338226508492,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 9.01510158767108,
      "learning_rate": 1.295726155012445e-07,
      "loss": 0.0091,
      "num_tokens": 2482862721.0,
      "reward": 0.048828125,
      "reward_std": 0.0921536535024643,
      "rewards/accuracy_reward/mean": 0.05040322616696358,
      "rewards/accuracy_reward/std": 0.21899642050266266,
      "step": 2625
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.716796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1642.0234375,
      "completions/mean_terminated_length": 614.4827270507812,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.89647520696424,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 35.766057823413796,
      "learning_rate": 1.2938177421331875e-07,
      "loss": 0.0134,
      "num_tokens": 2483787229.0,
      "reward": 0.08984375,
      "reward_std": 0.12364465743303299,
      "rewards/accuracy_reward/mean": 0.08984375,
      "rewards/accuracy_reward/std": 0.2862374484539032,
      "step": 2626
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.658203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1570.669921875,
      "completions/mean_terminated_length": 651.4685668945312,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.8968165912776308,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 170.32923708655812,
      "learning_rate": 1.2919152991603235e-07,
      "loss": 0.0117,
      "num_tokens": 2484666276.0,
      "reward": 0.05859375,
      "reward_std": 0.11527761071920395,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 2627
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.736328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2009.0,
      "completions/mean_length": 1685.908203125,
      "completions/mean_terminated_length": 674.7333374023438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8971579755910216,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 19.84809704462691,
      "learning_rate": 1.2900188287940223e-07,
      "loss": 0.0058,
      "num_tokens": 2485603029.0,
      "reward": 0.04296875,
      "reward_std": 0.09338457882404327,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 2628
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.73046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2012.0,
      "completions/mean_length": 1677.45703125,
      "completions/mean_terminated_length": 673.2318725585938,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8974993599044124,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 12.473307080978197,
      "learning_rate": 1.2881283337259784e-07,
      "loss": 0.0062,
      "num_tokens": 2486532959.0,
      "reward": 0.05078125,
      "reward_std": 0.10292718559503555,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 2629
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.791015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1616.0,
      "completions/mean_length": 1735.9375,
      "completions/mean_terminated_length": 554.766357421875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8978407442178032,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 40.78364422485982,
      "learning_rate": 1.2862438166394022e-07,
      "loss": 0.0072,
      "num_tokens": 2487505519.0,
      "reward": 0.05078125,
      "reward_std": 0.0595787949860096,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 2630
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1982.0,
      "completions/mean_length": 1589.44140625,
      "completions/mean_terminated_length": 562.0379638671875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.898182128531194,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 25.5979100696351,
      "learning_rate": 1.284365280209022e-07,
      "loss": 0.0107,
      "num_tokens": 2488394705.0,
      "reward": 0.0625,
      "reward_std": 0.10975532233715057,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 2631
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1944.0,
      "completions/mean_length": 1665.7421875,
      "completions/mean_terminated_length": 608.9117431640625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8985235128445848,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 25.11178383328119,
      "learning_rate": 1.2824927271010777e-07,
      "loss": -0.0016,
      "num_tokens": 2489326829.0,
      "reward": 0.03515625,
      "reward_std": 0.04318207502365112,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 2632
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.73828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1975.0,
      "completions/mean_length": 1662.48828125,
      "completions/mean_terminated_length": 575.0,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8988648971579756,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 154.26176219196702,
      "learning_rate": 1.2806261599733127e-07,
      "loss": 0.0301,
      "num_tokens": 2490256343.0,
      "reward": 0.068359375,
      "reward_std": 0.12042367458343506,
      "rewards/accuracy_reward/mean": 0.068359375,
      "rewards/accuracy_reward/std": 0.25260838866233826,
      "step": 2633
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1770.0,
      "completions/mean_length": 1630.689453125,
      "completions/mean_terminated_length": 584.5548095703125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8992062814713664,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 50.20600807016281,
      "learning_rate": 1.278765581474981e-07,
      "loss": -0.0133,
      "num_tokens": 2491166456.0,
      "reward": 0.05859375,
      "reward_std": 0.09814241528511047,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 2634
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.673828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1617.0,
      "completions/mean_length": 1561.33984375,
      "completions/mean_terminated_length": 555.964111328125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8995476657847572,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 8.533965691232433,
      "learning_rate": 1.276910994246831e-07,
      "loss": 0.0053,
      "num_tokens": 2492039974.0,
      "reward": 0.0546875,
      "reward_std": 0.052172206342220306,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 2635
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.716796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1924.0,
      "completions/mean_length": 1632.93359375,
      "completions/mean_terminated_length": 582.38623046875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.899889050098148,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 0.0892123151270677,
      "learning_rate": 1.275062400921112e-07,
      "loss": 0.0055,
      "num_tokens": 2492958772.0,
      "reward": 0.005859375,
      "reward_std": 0.01848640665411949,
      "rewards/accuracy_reward/mean": 0.006048386916518211,
      "rewards/accuracy_reward/std": 0.07761410623788834,
      "step": 2636
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.763671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2015.0,
      "completions/mean_length": 1717.1796875,
      "completions/mean_terminated_length": 648.165283203125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9002304344115388,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 17.465575480785812,
      "learning_rate": 1.273219804121562e-07,
      "loss": 0.0071,
      "num_tokens": 2493919536.0,
      "reward": 0.07421875,
      "reward_std": 0.0721839889883995,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "step": 2637
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.68359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1914.0,
      "completions/mean_length": 1612.67578125,
      "completions/mean_terminated_length": 672.1605224609375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9005718187249296,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 76.31475256587117,
      "learning_rate": 1.2713832064634125e-07,
      "loss": 0.0051,
      "num_tokens": 2494823994.0,
      "reward": 0.0546875,
      "reward_std": 0.11803390085697174,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 2638
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.68359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2019.0,
      "completions/mean_length": 1602.498046875,
      "completions/mean_terminated_length": 639.9938354492188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9009132030383203,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 13.583995743047526,
      "learning_rate": 1.2695526105533768e-07,
      "loss": 0.0049,
      "num_tokens": 2495726009.0,
      "reward": 0.048828125,
      "reward_std": 0.09550845623016357,
      "rewards/accuracy_reward/mean": 0.05040322616696358,
      "rewards/accuracy_reward/std": 0.21899642050266266,
      "step": 2639
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.748046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1993.0,
      "completions/mean_length": 1697.796875,
      "completions/mean_terminated_length": 658.8062133789062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9012545873517112,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 41.06824188920856,
      "learning_rate": 1.26772801898965e-07,
      "loss": 0.0185,
      "num_tokens": 2496677569.0,
      "reward": 0.0703125,
      "reward_std": 0.11607901006937027,
      "rewards/accuracy_reward/mean": 0.07258064299821854,
      "rewards/accuracy_reward/std": 0.25970885157585144,
      "step": 2640
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.697265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2036.0,
      "completions/mean_length": 1605.6640625,
      "completions/mean_terminated_length": 586.864501953125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.901595971665102,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 46.015264450247926,
      "learning_rate": 1.2659094343619087e-07,
      "loss": 0.0154,
      "num_tokens": 2497574901.0,
      "reward": 0.056640625,
      "reward_std": 0.08395528793334961,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 2641
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.759765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1982.0,
      "completions/mean_length": 1724.259765625,
      "completions/mean_terminated_length": 700.3983764648438,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9019373559784928,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 14.438538727672466,
      "learning_rate": 1.2640968592512978e-07,
      "loss": -0.006,
      "num_tokens": 2498536506.0,
      "reward": 0.029296875,
      "reward_std": 0.04478531330823898,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 2642
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.70703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2019.0,
      "completions/mean_length": 1615.744140625,
      "completions/mean_terminated_length": 572.8400268554688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9022787402918836,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 168.58015065708898,
      "learning_rate": 1.2622902962304394e-07,
      "loss": 0.0058,
      "num_tokens": 2499447991.0,
      "reward": 0.072265625,
      "reward_std": 0.15269054472446442,
      "rewards/accuracy_reward/mean": 0.07459677755832672,
      "rewards/accuracy_reward/std": 0.263004869222641,
      "step": 2643
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.685546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1936.0,
      "completions/mean_length": 1588.134765625,
      "completions/mean_terminated_length": 585.5714111328125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9026201246052744,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 13.140874581705576,
      "learning_rate": 1.2604897478634172e-07,
      "loss": 0.0096,
      "num_tokens": 2500342092.0,
      "reward": 0.03515625,
      "reward_std": 0.06085042655467987,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 2644
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1684.0,
      "completions/mean_length": 1587.537109375,
      "completions/mean_terminated_length": 610.457275390625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9029615089186652,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 39.653449839858204,
      "learning_rate": 1.2586952167057805e-07,
      "loss": 0.0181,
      "num_tokens": 2501229039.0,
      "reward": 0.056640625,
      "reward_std": 0.0815330371260643,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 2645
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1912.0,
      "completions/mean_length": 1551.962890625,
      "completions/mean_terminated_length": 588.7069091796875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.903302893232056,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 27.250919203736725,
      "learning_rate": 1.256906705304539e-07,
      "loss": 0.01,
      "num_tokens": 2502107020.0,
      "reward": 0.03125,
      "reward_std": 0.05969182401895523,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 2646
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.708984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1721.0,
      "completions/mean_length": 1619.638671875,
      "completions/mean_terminated_length": 576.0469970703125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9036442775454467,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 158.77926796001358,
      "learning_rate": 1.2551242161981563e-07,
      "loss": 0.0253,
      "num_tokens": 2503012787.0,
      "reward": 0.083984375,
      "reward_std": 0.1171422079205513,
      "rewards/accuracy_reward/mean": 0.08669354766607285,
      "rewards/accuracy_reward/std": 0.281669557094574,
      "step": 2647
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1826.0,
      "completions/mean_length": 1575.03515625,
      "completions/mean_terminated_length": 640.4302368164062,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.9039856618588376,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 36.63063415437126,
      "learning_rate": 1.253347751916551e-07,
      "loss": 0.0249,
      "num_tokens": 2503901381.0,
      "reward": 0.044921875,
      "reward_std": 0.08298446238040924,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 2648
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.693359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1865.0,
      "completions/mean_length": 1585.76953125,
      "completions/mean_terminated_length": 540.5987548828125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.9043270461722284,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 15.716262182459976,
      "learning_rate": 1.2515773149810875e-07,
      "loss": 0.0282,
      "num_tokens": 2504795439.0,
      "reward": 0.048828125,
      "reward_std": 0.07988205552101135,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 2649
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1952.0,
      "completions/mean_length": 1658.484375,
      "completions/mean_terminated_length": 663.0555419921875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9046684304856192,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 47.247834642059466,
      "learning_rate": 1.24981290790458e-07,
      "loss": 0.0118,
      "num_tokens": 2505717591.0,
      "reward": 0.05078125,
      "reward_std": 0.08501990139484406,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 2650
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1944.0,
      "completions/mean_length": 1604.31640625,
      "completions/mean_terminated_length": 591.8076782226562,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.90500981479901,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 39.76025085675048,
      "learning_rate": 1.2480545331912786e-07,
      "loss": -0.0214,
      "num_tokens": 2506610393.0,
      "reward": 0.119140625,
      "reward_std": 0.159950852394104,
      "rewards/accuracy_reward/mean": 0.119140625,
      "rewards/accuracy_reward/std": 0.32427072525024414,
      "step": 2651
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.619140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2020.0,
      "completions/mean_length": 1476.30078125,
      "completions/mean_terminated_length": 546.923095703125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.9053511991124008,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 17.72578038485607,
      "learning_rate": 1.246302193336876e-07,
      "loss": 0.0079,
      "num_tokens": 2507443827.0,
      "reward": 0.025390625,
      "reward_std": 0.04803257808089256,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 2652
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.681640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1997.0,
      "completions/mean_length": 1586.63671875,
      "completions/mean_terminated_length": 598.809814453125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.9056925834257916,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 20.682672032273622,
      "learning_rate": 1.2445558908284983e-07,
      "loss": 0.0115,
      "num_tokens": 2508332889.0,
      "reward": 0.03515625,
      "reward_std": 0.0853334367275238,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 2653
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.63671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2011.0,
      "completions/mean_length": 1523.498046875,
      "completions/mean_terminated_length": 604.2096557617188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9060339677391824,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 45.37614648473046,
      "learning_rate": 1.2428156281447017e-07,
      "loss": 0.0218,
      "num_tokens": 2509180072.0,
      "reward": 0.06640625,
      "reward_std": 0.13144925236701965,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "step": 2654
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 1656.8203125,
      "completions/mean_terminated_length": 730.625,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.9063753520525731,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 13.082394539660557,
      "learning_rate": 1.2410814077554717e-07,
      "loss": 0.0085,
      "num_tokens": 2510108940.0,
      "reward": 0.06640625,
      "reward_std": 0.0961792916059494,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "step": 2655
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1930.0,
      "completions/mean_length": 1688.666015625,
      "completions/mean_terminated_length": 587.8492431640625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.906716736365964,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 72.0515138179927,
      "learning_rate": 1.239353232122216e-07,
      "loss": 0.02,
      "num_tokens": 2511041393.0,
      "reward": 0.076171875,
      "reward_std": 0.0986412987112999,
      "rewards/accuracy_reward/mean": 0.076171875,
      "rewards/accuracy_reward/std": 0.26553234457969666,
      "step": 2656
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.705078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 1648.125,
      "completions/mean_terminated_length": 692.52978515625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9070581206793548,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 40.247496353478766,
      "learning_rate": 1.2376311036977652e-07,
      "loss": -0.0018,
      "num_tokens": 2511964465.0,
      "reward": 0.0546875,
      "reward_std": 0.10276086628437042,
      "rewards/accuracy_reward/mean": 0.05645161122083664,
      "rewards/accuracy_reward/std": 0.23102475702762604,
      "step": 2657
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.697265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2007.0,
      "completions/mean_length": 1605.15625,
      "completions/mean_terminated_length": 585.1870727539062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9073995049927456,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 46.041308995122705,
      "learning_rate": 1.2359150249263649e-07,
      "loss": 0.0153,
      "num_tokens": 2512862785.0,
      "reward": 0.080078125,
      "reward_std": 0.11981005221605301,
      "rewards/accuracy_reward/mean": 0.080078125,
      "rewards/accuracy_reward/std": 0.271679550409317,
      "step": 2658
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1628.0,
      "completions/mean_length": 1670.048828125,
      "completions/mean_terminated_length": 625.125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9077408893061364,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 15.350329247527585,
      "learning_rate": 1.2342049982436734e-07,
      "loss": 0.0234,
      "num_tokens": 2513794330.0,
      "reward": 0.0546875,
      "reward_std": 0.11300258338451385,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 2659
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.759765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1743.0,
      "completions/mean_length": 1687.546875,
      "completions/mean_terminated_length": 547.5772094726562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9080822736195272,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 10.41944089934865,
      "learning_rate": 1.2325010260767639e-07,
      "loss": -0.002,
      "num_tokens": 2514729954.0,
      "reward": 0.017578125,
      "reward_std": 0.048086829483509064,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "step": 2660
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.697265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1826.0,
      "completions/mean_length": 1600.4609375,
      "completions/mean_terminated_length": 569.6774291992188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.908423657932918,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 11.967166108161393,
      "learning_rate": 1.2308031108441105e-07,
      "loss": 0.0065,
      "num_tokens": 2515621982.0,
      "reward": 0.015625,
      "reward_std": 0.04522542655467987,
      "rewards/accuracy_reward/mean": 0.015625,
      "rewards/accuracy_reward/std": 0.12414088100194931,
      "step": 2661
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.693359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 1668.9921875,
      "completions/mean_terminated_length": 812.3312377929688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9087650422463088,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 83.27295747751994,
      "learning_rate": 1.2291112549555952e-07,
      "loss": 0.0142,
      "num_tokens": 2516555258.0,
      "reward": 0.08203125,
      "reward_std": 0.12404991686344147,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 2662
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.68359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1971.0,
      "completions/mean_length": 1610.734375,
      "completions/mean_terminated_length": 666.0247192382812,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.9091064265596995,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 41.31636336103393,
      "learning_rate": 1.2274254608124973e-07,
      "loss": -0.0033,
      "num_tokens": 2517459170.0,
      "reward": 0.0390625,
      "reward_std": 0.08510598540306091,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 2663
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.654296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1850.0,
      "completions/mean_length": 1534.189453125,
      "completions/mean_terminated_length": 561.72314453125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9094478108730903,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 39.37147980159843,
      "learning_rate": 1.2257457308074925e-07,
      "loss": 0.0095,
      "num_tokens": 2518316739.0,
      "reward": 0.095703125,
      "reward_std": 0.09121561795473099,
      "rewards/accuracy_reward/mean": 0.095703125,
      "rewards/accuracy_reward/std": 0.2944713830947876,
      "step": 2664
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1900.0,
      "completions/mean_length": 1595.177734375,
      "completions/mean_terminated_length": 580.6265869140625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9097891951864812,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 39.55892785541511,
      "learning_rate": 1.2240720673246515e-07,
      "loss": 0.0129,
      "num_tokens": 2519217310.0,
      "reward": 0.052734375,
      "reward_std": 0.06750431656837463,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 2665
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.74609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1978.0,
      "completions/mean_length": 1698.146484375,
      "completions/mean_terminated_length": 670.1153564453125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.910130579499872,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 20.896062231083587,
      "learning_rate": 1.2224044727394326e-07,
      "loss": 0.0281,
      "num_tokens": 2520161273.0,
      "reward": 0.02734375,
      "reward_std": 0.06546888500452042,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 2666
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1985.0,
      "completions/mean_length": 1611.3515625,
      "completions/mean_terminated_length": 633.0379638671875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9104719638132628,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 179.0401877357721,
      "learning_rate": 1.2207429494186826e-07,
      "loss": 0.0239,
      "num_tokens": 2521058765.0,
      "reward": 0.06640625,
      "reward_std": 0.11439286172389984,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "step": 2667
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.693359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1923.0,
      "completions/mean_length": 1615.880859375,
      "completions/mean_terminated_length": 638.7962036132812,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9108133481266536,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 18.719464381534646,
      "learning_rate": 1.219087499720628e-07,
      "loss": 0.012,
      "num_tokens": 2521968784.0,
      "reward": 0.046875,
      "reward_std": 0.06942769140005112,
      "rewards/accuracy_reward/mean": 0.04838709533214569,
      "rewards/accuracy_reward/std": 0.21479946374893188,
      "step": 2668
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.732421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1803.0,
      "completions/mean_length": 1644.283203125,
      "completions/mean_terminated_length": 539.218994140625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.9111547324400444,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 20.655388367207642,
      "learning_rate": 1.2174381259948785e-07,
      "loss": -0.0066,
      "num_tokens": 2522900417.0,
      "reward": 0.037109375,
      "reward_std": 0.06767063587903976,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 2669
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2019.0,
      "completions/mean_length": 1668.72265625,
      "completions/mean_terminated_length": 577.2651977539062,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.9114961167534352,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 37.62800210261315,
      "learning_rate": 1.2157948305824184e-07,
      "loss": 0.0263,
      "num_tokens": 2523833507.0,
      "reward": 0.052734375,
      "reward_std": 0.09448197484016418,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 2670
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1897.0,
      "completions/mean_length": 1583.7109375,
      "completions/mean_terminated_length": 598.5121459960938,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.9118375010668259,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 9.313348720614254,
      "learning_rate": 1.2141576158156031e-07,
      "loss": 0.0293,
      "num_tokens": 2524719647.0,
      "reward": 0.0546875,
      "reward_std": 0.09193411469459534,
      "rewards/accuracy_reward/mean": 0.05645161122083664,
      "rewards/accuracy_reward/std": 0.23102475702762604,
      "step": 2671
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.72265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1660.62109375,
      "completions/mean_terminated_length": 651.2534790039062,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.9121788853802167,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 37.55590370457818,
      "learning_rate": 1.2125264840181623e-07,
      "loss": 0.0187,
      "num_tokens": 2525643981.0,
      "reward": 0.037109375,
      "reward_std": 0.08010855317115784,
      "rewards/accuracy_reward/mean": 0.038306452333927155,
      "rewards/accuracy_reward/std": 0.19212885200977325,
      "step": 2672
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1833.0,
      "completions/mean_length": 1687.822265625,
      "completions/mean_terminated_length": 650.9469604492188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9125202696936076,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 8.962958414511192,
      "learning_rate": 1.2109014375051868e-07,
      "loss": -0.004,
      "num_tokens": 2526582706.0,
      "reward": 0.046875,
      "reward_std": 0.056411758065223694,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 2673
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1573.078125,
      "completions/mean_terminated_length": 600.8988037109375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9128616540069984,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 20.049250347128748,
      "learning_rate": 1.2092824785831342e-07,
      "loss": 0.0221,
      "num_tokens": 2527470810.0,
      "reward": 0.060546875,
      "reward_std": 0.0706586092710495,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 2674
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1999.0,
      "completions/mean_length": 1618.92578125,
      "completions/mean_terminated_length": 657.582275390625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9132030383203892,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 30.752971624082775,
      "learning_rate": 1.2076696095498203e-07,
      "loss": 0.0067,
      "num_tokens": 2528381556.0,
      "reward": 0.10546875,
      "reward_std": 0.1424015313386917,
      "rewards/accuracy_reward/mean": 0.10546875,
      "rewards/accuracy_reward/std": 0.3074568510055542,
      "step": 2675
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1591.306640625,
      "completions/mean_terminated_length": 622.2255859375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.91354442263378,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 14.31009969927499,
      "learning_rate": 1.2060628326944175e-07,
      "loss": 0.0103,
      "num_tokens": 2529274353.0,
      "reward": 0.04296875,
      "reward_std": 0.059520021080970764,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 2676
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1654.22265625,
      "completions/mean_terminated_length": 648.2569580078125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9138858069471708,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 29.36820642916589,
      "learning_rate": 1.204462150297452e-07,
      "loss": 0.0091,
      "num_tokens": 2530207923.0,
      "reward": 0.08984375,
      "reward_std": 0.13106893002986908,
      "rewards/accuracy_reward/mean": 0.08984375,
      "rewards/accuracy_reward/std": 0.2862374484539032,
      "step": 2677
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1964.0,
      "completions/mean_length": 1711.08203125,
      "completions/mean_terminated_length": 634.0491333007812,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9142271912605616,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 33.145445098029626,
      "learning_rate": 1.202867564630799e-07,
      "loss": 0.0275,
      "num_tokens": 2531165709.0,
      "reward": 0.064453125,
      "reward_std": 0.09414386004209518,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 2678
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.68359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1805.0,
      "completions/mean_length": 1592.2890625,
      "completions/mean_terminated_length": 607.7283935546875,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.9145685755739523,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 65.36714056252808,
      "learning_rate": 1.2012790779576833e-07,
      "loss": 0.0214,
      "num_tokens": 2532060193.0,
      "reward": 0.064453125,
      "reward_std": 0.12240587919950485,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 2679
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.74609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2036.0,
      "completions/mean_length": 1691.6015625,
      "completions/mean_terminated_length": 644.3384399414062,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9149099598873431,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 62.036618332383505,
      "learning_rate": 1.1996966925326677e-07,
      "loss": 0.0208,
      "num_tokens": 2533008581.0,
      "reward": 0.056640625,
      "reward_std": 0.1190534457564354,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 2680
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.712890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1897.0,
      "completions/mean_length": 1625.595703125,
      "completions/mean_terminated_length": 577.1564331054688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.915251344200734,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 18.0400499286434,
      "learning_rate": 1.1981204106016626e-07,
      "loss": 0.0296,
      "num_tokens": 2533930246.0,
      "reward": 0.052734375,
      "reward_std": 0.08285094797611237,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 2681
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1947.0,
      "completions/mean_length": 1637.884765625,
      "completions/mean_terminated_length": 629.2229614257812,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.9155927285141248,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 50.519699906866606,
      "learning_rate": 1.196550234401909e-07,
      "loss": -0.0009,
      "num_tokens": 2534849035.0,
      "reward": 0.017578125,
      "reward_std": 0.05452118441462517,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "step": 2682
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1840.0,
      "completions/mean_length": 1542.904296875,
      "completions/mean_terminated_length": 642.5162963867188,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9159341128275156,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 26.154366962389492,
      "learning_rate": 1.194986166161986e-07,
      "loss": 0.0184,
      "num_tokens": 2535710554.0,
      "reward": 0.05859375,
      "reward_std": 0.0921671986579895,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 2683
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1985.0,
      "completions/mean_length": 1643.130859375,
      "completions/mean_terminated_length": 628.1849365234375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9162754971409064,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 45.78015294625495,
      "learning_rate": 1.1934282081018023e-07,
      "loss": 0.0138,
      "num_tokens": 2536626189.0,
      "reward": 0.08203125,
      "reward_std": 0.09655161201953888,
      "rewards/accuracy_reward/mean": 0.08467742055654526,
      "rewards/accuracy_reward/std": 0.278682142496109,
      "step": 2684
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.724609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1867.0,
      "completions/mean_length": 1635.498046875,
      "completions/mean_terminated_length": 550.1205444335938,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9166168814542972,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 62.569540792616586,
      "learning_rate": 1.1918763624325942e-07,
      "loss": 0.0275,
      "num_tokens": 2537536332.0,
      "reward": 0.03515625,
      "reward_std": 0.08493967354297638,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 2685
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.732421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2019.0,
      "completions/mean_length": 1686.67578125,
      "completions/mean_terminated_length": 697.649658203125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.916958265767688,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 39.352672277183366,
      "learning_rate": 1.1903306313569242e-07,
      "loss": 0.0289,
      "num_tokens": 2538470694.0,
      "reward": 0.04296875,
      "reward_std": 0.09891509264707565,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 2686
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1959.0,
      "completions/mean_length": 1576.2265625,
      "completions/mean_terminated_length": 610.2142944335938,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9172996500810787,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 7.483814036372228,
      "learning_rate": 1.1887910170686726e-07,
      "loss": 0.0262,
      "num_tokens": 2539344250.0,
      "reward": 0.05078125,
      "reward_std": 0.09099511802196503,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 2687
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.689453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2009.0,
      "completions/mean_length": 1625.06640625,
      "completions/mean_terminated_length": 686.1005859375,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.9176410343944695,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 66.21176219708461,
      "learning_rate": 1.1872575217530422e-07,
      "loss": 0.0042,
      "num_tokens": 2540249292.0,
      "reward": 0.0234375,
      "reward_std": 0.05974511057138443,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 2688
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1848.0,
      "completions/mean_length": 1614.158203125,
      "completions/mean_terminated_length": 624.1090087890625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9179824187078603,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 16.858899246772374,
      "learning_rate": 1.1857301475865477e-07,
      "loss": 0.0273,
      "num_tokens": 2541145277.0,
      "reward": 0.041015625,
      "reward_std": 0.07780591398477554,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 2689
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.740234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1875.0,
      "completions/mean_length": 1677.619140625,
      "completions/mean_terminated_length": 622.1729736328125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9183238030212512,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 40.27477226381563,
      "learning_rate": 1.1842088967370173e-07,
      "loss": 0.0084,
      "num_tokens": 2542088234.0,
      "reward": 0.07421875,
      "reward_std": 0.13915522396564484,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "step": 2690
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.70703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1817.0,
      "completions/mean_length": 1629.306640625,
      "completions/mean_terminated_length": 618.8599853515625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.918665187334642,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 6.943651406759533,
      "learning_rate": 1.1826937713635902e-07,
      "loss": 0.017,
      "num_tokens": 2542993783.0,
      "reward": 0.04296875,
      "reward_std": 0.0843411535024643,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 2691
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.755859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1964.0,
      "completions/mean_length": 1687.072265625,
      "completions/mean_terminated_length": 569.6400146484375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9190065716480328,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 14.010187967142512,
      "learning_rate": 1.1811847736167078e-07,
      "loss": 0.035,
      "num_tokens": 2543928236.0,
      "reward": 0.091796875,
      "reward_std": 0.11879178136587143,
      "rewards/accuracy_reward/mean": 0.091796875,
      "rewards/accuracy_reward/std": 0.289021372795105,
      "step": 2692
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2026.0,
      "completions/mean_length": 1705.23828125,
      "completions/mean_terminated_length": 585.550048828125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9193479559614236,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 153.15785904263205,
      "learning_rate": 1.1796819056381175e-07,
      "loss": 0.0141,
      "num_tokens": 2544882630.0,
      "reward": 0.0703125,
      "reward_std": 0.10229447484016418,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 2693
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.810546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1780.873046875,
      "completions/mean_terminated_length": 638.6288452148438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9196893402748144,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 89.2986222257337,
      "learning_rate": 1.178185169560865e-07,
      "loss": 0.002,
      "num_tokens": 2545875605.0,
      "reward": 0.0546875,
      "reward_std": 0.07873040437698364,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 2694
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.724609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1958.0,
      "completions/mean_length": 1649.33984375,
      "completions/mean_terminated_length": 600.3829345703125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9200307245882051,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 30.40658796290671,
      "learning_rate": 1.1766945675092938e-07,
      "loss": 0.0157,
      "num_tokens": 2546789107.0,
      "reward": 0.072265625,
      "reward_std": 0.12556564807891846,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 2695
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1993.0,
      "completions/mean_length": 1674.5,
      "completions/mean_terminated_length": 682.05712890625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9203721089015959,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 13.427209547897768,
      "learning_rate": 1.1752101015990404e-07,
      "loss": 0.0133,
      "num_tokens": 2547727987.0,
      "reward": 0.033203125,
      "reward_std": 0.06602106243371964,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 2696
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1987.0,
      "completions/mean_length": 1666.92578125,
      "completions/mean_terminated_length": 654.6500244140625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9207134932149867,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 39.34895516214777,
      "learning_rate": 1.1737317739370323e-07,
      "loss": 0.0118,
      "num_tokens": 2548663261.0,
      "reward": 0.048828125,
      "reward_std": 0.0775209367275238,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 2697
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.697265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1992.0,
      "completions/mean_length": 1614.271484375,
      "completions/mean_terminated_length": 615.2967529296875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9210548775283776,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 95.13349357725528,
      "learning_rate": 1.172259586621487e-07,
      "loss": 0.0087,
      "num_tokens": 2549563128.0,
      "reward": 0.056640625,
      "reward_std": 0.08309652656316757,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 2698
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.791015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1899.0,
      "completions/mean_length": 1729.455078125,
      "completions/mean_terminated_length": 523.7476196289062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9213962618417684,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 10.563887941527359,
      "learning_rate": 1.170793541741903e-07,
      "loss": 0.0142,
      "num_tokens": 2550532513.0,
      "reward": 0.037109375,
      "reward_std": 0.06405794620513916,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 2699
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.720703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1739.0,
      "completions/mean_length": 1634.998046875,
      "completions/mean_terminated_length": 569.2797241210938,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9217376461551592,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 78.0721216098307,
      "learning_rate": 1.169333641379065e-07,
      "loss": -0.0029,
      "num_tokens": 2551449840.0,
      "reward": 0.021484375,
      "reward_std": 0.06519509106874466,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 2700
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1981.0,
      "completions/mean_length": 1681.302734375,
      "completions/mean_terminated_length": 667.49267578125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.92207903046855,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 24.034663805380703,
      "learning_rate": 1.167879887605032e-07,
      "loss": 0.0044,
      "num_tokens": 2552383979.0,
      "reward": 0.01171875,
      "reward_std": 0.04192390665411949,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "step": 2701
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.697265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2022.0,
      "completions/mean_length": 1623.3359375,
      "completions/mean_terminated_length": 645.2387084960938,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9224204147819408,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 32.66605559967687,
      "learning_rate": 1.1664322824831437e-07,
      "loss": 0.0087,
      "num_tokens": 2553288887.0,
      "reward": 0.072265625,
      "reward_std": 0.12004335969686508,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 2702
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.705078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1721.0,
      "completions/mean_length": 1640.97265625,
      "completions/mean_terminated_length": 667.8807983398438,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9227617990953315,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 119.11291744511516,
      "learning_rate": 1.1649908280680094e-07,
      "loss": 0.0232,
      "num_tokens": 2554206729.0,
      "reward": 0.052734375,
      "reward_std": 0.10095052421092987,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 2703
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.712890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1872.0,
      "completions/mean_length": 1622.822265625,
      "completions/mean_terminated_length": 567.1088256835938,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9231031834087223,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 91.50595019641537,
      "learning_rate": 1.1635555264055105e-07,
      "loss": 0.0227,
      "num_tokens": 2555119742.0,
      "reward": 0.041015625,
      "reward_std": 0.08279120922088623,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 2704
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.693359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2042.0,
      "completions/mean_length": 1602.443359375,
      "completions/mean_terminated_length": 595.2993774414062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9234445677221131,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 65.47751476389558,
      "learning_rate": 1.1621263795327965e-07,
      "loss": 0.0003,
      "num_tokens": 2556019793.0,
      "reward": 0.041015625,
      "reward_std": 0.06548243016004562,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 2705
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.712890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2018.0,
      "completions/mean_length": 1628.95703125,
      "completions/mean_terminated_length": 588.4761962890625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.923785952035504,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 50.20686603189005,
      "learning_rate": 1.1607033894782782e-07,
      "loss": 0.0047,
      "num_tokens": 2556931211.0,
      "reward": 0.095703125,
      "reward_std": 0.13593661785125732,
      "rewards/accuracy_reward/mean": 0.095703125,
      "rewards/accuracy_reward/std": 0.2944713830947876,
      "step": 2706
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.775390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 1724.6171875,
      "completions/mean_terminated_length": 608.2434692382812,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9241273363488948,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 118.78194004543495,
      "learning_rate": 1.1592865582616306e-07,
      "loss": 0.0263,
      "num_tokens": 2557897047.0,
      "reward": 0.08203125,
      "reward_std": 0.15197205543518066,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 2707
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1892.0,
      "completions/mean_length": 1530.962890625,
      "completions/mean_terminated_length": 526.6034545898438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9244687206622856,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 29.48698281235695,
      "learning_rate": 1.1578758878937856e-07,
      "loss": -0.0057,
      "num_tokens": 2558754084.0,
      "reward": 0.033203125,
      "reward_std": 0.08489333093166351,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 2708
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1786.0,
      "completions/mean_length": 1657.443359375,
      "completions/mean_terminated_length": 619.6785888671875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.9248101049756764,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 24.305023217987692,
      "learning_rate": 1.1564713803769327e-07,
      "loss": -0.0014,
      "num_tokens": 2559685383.0,
      "reward": 0.060546875,
      "reward_std": 0.09226682782173157,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 2709
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.798828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2024.0,
      "completions/mean_length": 1770.923828125,
      "completions/mean_terminated_length": 671.2621459960938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9251514892890672,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 3.011637490709759,
      "learning_rate": 1.1550730377045126e-07,
      "loss": 0.0171,
      "num_tokens": 2560673504.0,
      "reward": 0.013671875,
      "reward_std": 0.03889618441462517,
      "rewards/accuracy_reward/mean": 0.013671875,
      "rewards/accuracy_reward/std": 0.1162383034825325,
      "step": 2710
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1681.56640625,
      "completions/mean_terminated_length": 668.8235473632812,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.925492873602458,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 41.004865879357595,
      "learning_rate": 1.1536808618612175e-07,
      "loss": -0.0023,
      "num_tokens": 2561619506.0,
      "reward": 0.046875,
      "reward_std": 0.06739920377731323,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 2711
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.72265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1998.0,
      "completions/mean_length": 1654.224609375,
      "completions/mean_terminated_length": 628.1901245117188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9258342579158487,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 84.52424197698116,
      "learning_rate": 1.1522948548229875e-07,
      "loss": 0.0394,
      "num_tokens": 2562541621.0,
      "reward": 0.08203125,
      "reward_std": 0.09637914597988129,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 2712
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.72265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1736.0,
      "completions/mean_length": 1647.44921875,
      "completions/mean_terminated_length": 603.7605590820312,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.9261756422292395,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 43.66863055563476,
      "learning_rate": 1.150915018557004e-07,
      "loss": 0.0072,
      "num_tokens": 2563460987.0,
      "reward": 0.0546875,
      "reward_std": 0.10716674476861954,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 2713
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1881.0,
      "completions/mean_length": 1623.728515625,
      "completions/mean_terminated_length": 560.1438598632812,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9265170265426304,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 25.863699183515383,
      "learning_rate": 1.1495413550216933e-07,
      "loss": 0.0164,
      "num_tokens": 2564367264.0,
      "reward": 0.02734375,
      "reward_std": 0.0635918527841568,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 2714
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.685546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1764.0,
      "completions/mean_length": 1576.21875,
      "completions/mean_terminated_length": 547.677001953125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9268584108560212,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 36.15677394177097,
      "learning_rate": 1.1481738661667192e-07,
      "loss": 0.006,
      "num_tokens": 2565250128.0,
      "reward": 0.05078125,
      "reward_std": 0.08599072694778442,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 2715
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.681640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2020.0,
      "completions/mean_length": 1627.0390625,
      "completions/mean_terminated_length": 725.7177734375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.927199795169412,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 20.22435238516492,
      "learning_rate": 1.1468125539329826e-07,
      "loss": 0.0117,
      "num_tokens": 2566163012.0,
      "reward": 0.04296875,
      "reward_std": 0.08368149399757385,
      "rewards/accuracy_reward/mean": 0.04435483738780022,
      "rewards/accuracy_reward/std": 0.2060900777578354,
      "step": 2716
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1839.0,
      "completions/mean_length": 1683.9921875,
      "completions/mean_terminated_length": 716.7714233398438,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9275411794828028,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 10.40211046814643,
      "learning_rate": 1.1454574202526165e-07,
      "loss": 0.0081,
      "num_tokens": 2567096784.0,
      "reward": 0.04296875,
      "reward_std": 0.08274346590042114,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 2717
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.82421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2035.0,
      "completions/mean_length": 1826.90234375,
      "completions/mean_terminated_length": 790.822265625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9278825637961936,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 57.33836108659242,
      "learning_rate": 1.1441084670489857e-07,
      "loss": -0.0051,
      "num_tokens": 2568117566.0,
      "reward": 0.046875,
      "reward_std": 0.08296996355056763,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 2718
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1913.0,
      "completions/mean_length": 1655.236328125,
      "completions/mean_terminated_length": 651.5069580078125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9282239481095844,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 29.284215973339307,
      "learning_rate": 1.1427656962366829e-07,
      "loss": -0.0046,
      "num_tokens": 2569043095.0,
      "reward": 0.044921875,
      "reward_std": 0.06409074366092682,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 2719
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.68359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1874.0,
      "completions/mean_length": 1585.908203125,
      "completions/mean_terminated_length": 587.5617065429688,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9285653324229751,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 17.209251690673963,
      "learning_rate": 1.1414291097215244e-07,
      "loss": 0.0119,
      "num_tokens": 2569926408.0,
      "reward": 0.037109375,
      "reward_std": 0.08042868226766586,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 2720
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.724609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1982.0,
      "completions/mean_length": 1668.431640625,
      "completions/mean_terminated_length": 669.709228515625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9289067167363659,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 23.160842792222304,
      "learning_rate": 1.1400987094005518e-07,
      "loss": 0.0048,
      "num_tokens": 2570858789.0,
      "reward": 0.05859375,
      "reward_std": 0.09259280562400818,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 2721
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.748046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1950.0,
      "completions/mean_length": 1694.875,
      "completions/mean_terminated_length": 646.4495849609375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9292481010497567,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 241.11601149971844,
      "learning_rate": 1.1387744971620236e-07,
      "loss": 0.0039,
      "num_tokens": 2571804261.0,
      "reward": 0.1015625,
      "reward_std": 0.12356442213058472,
      "rewards/accuracy_reward/mean": 0.1015625,
      "rewards/accuracy_reward/std": 0.30236753821372986,
      "step": 2722
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.681640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1819.0,
      "completions/mean_length": 1568.29296875,
      "completions/mean_terminated_length": 541.190185546875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9295894853631476,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 168.52586375181815,
      "learning_rate": 1.137456474885418e-07,
      "loss": 0.0182,
      "num_tokens": 2572685355.0,
      "reward": 0.0625,
      "reward_std": 0.103780098259449,
      "rewards/accuracy_reward/mean": 0.06451612710952759,
      "rewards/accuracy_reward/std": 0.2459181249141693,
      "step": 2723
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.705078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1929.0,
      "completions/mean_length": 1623.955078125,
      "completions/mean_terminated_length": 610.1788330078125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9299308696765384,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 48.45278978914982,
      "learning_rate": 1.136144644441426e-07,
      "loss": 0.0194,
      "num_tokens": 2573592212.0,
      "reward": 0.03515625,
      "reward_std": 0.0868166983127594,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 2724
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.681640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1847.0,
      "completions/mean_length": 1581.89453125,
      "completions/mean_terminated_length": 584.2637939453125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9302722539899292,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 144.97910134563475,
      "learning_rate": 1.1348390076919519e-07,
      "loss": 0.0038,
      "num_tokens": 2574489374.0,
      "reward": 0.06640625,
      "reward_std": 0.11117979884147644,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "step": 2725
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2016.0,
      "completions/mean_length": 1737.857421875,
      "completions/mean_terminated_length": 724.7250366210938,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.93061363830332,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 92.00130666992754,
      "learning_rate": 1.1335395664901071e-07,
      "loss": 0.0037,
      "num_tokens": 2575465221.0,
      "reward": 0.044921875,
      "reward_std": 0.09001073986291885,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 2726
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.720703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1959.0,
      "completions/mean_length": 1663.509765625,
      "completions/mean_terminated_length": 671.3636474609375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9309550226167108,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 5.960058399792734,
      "learning_rate": 1.1322463226802109e-07,
      "loss": 0.0272,
      "num_tokens": 2576397754.0,
      "reward": 0.05078125,
      "reward_std": 0.0908287987112999,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 2727
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1882.0,
      "completions/mean_length": 1644.53125,
      "completions/mean_terminated_length": 652.2162475585938,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.9312964069301015,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 28.097684850926672,
      "learning_rate": 1.1309592780977867e-07,
      "loss": 0.0218,
      "num_tokens": 2577317818.0,
      "reward": 0.05078125,
      "reward_std": 0.08171837776899338,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 2728
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.67578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1975.0,
      "completions/mean_length": 1589.921875,
      "completions/mean_terminated_length": 635.1325073242188,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9316377912434923,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 80.1537414749105,
      "learning_rate": 1.1296784345695585e-07,
      "loss": 0.0139,
      "num_tokens": 2578212338.0,
      "reward": 0.05859375,
      "reward_std": 0.09959384053945541,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 2729
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.767578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1960.0,
      "completions/mean_length": 1710.052734375,
      "completions/mean_terminated_length": 593.9747924804688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9319791755568831,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 55.22754725480669,
      "learning_rate": 1.1284037939134502e-07,
      "loss": 0.0165,
      "num_tokens": 2579164253.0,
      "reward": 0.0390625,
      "reward_std": 0.08698301762342453,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 2730
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1878.0,
      "completions/mean_length": 1673.841796875,
      "completions/mean_terminated_length": 735.883544921875,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.932320559870274,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 30.091282368687303,
      "learning_rate": 1.1271353579385804e-07,
      "loss": 0.0093,
      "num_tokens": 2580096716.0,
      "reward": 0.03515625,
      "reward_std": 0.06728477776050568,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 2731
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.701171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1934.0,
      "completions/mean_length": 1616.3984375,
      "completions/mean_terminated_length": 603.686279296875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9326619441836648,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 15.01036058263,
      "learning_rate": 1.1258731284452616e-07,
      "loss": 0.0219,
      "num_tokens": 2581000264.0,
      "reward": 0.060546875,
      "reward_std": 0.10051832348108292,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 2732
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1987.0,
      "completions/mean_length": 1662.4453125,
      "completions/mean_terminated_length": 695.9177856445312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9330033284970556,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 157.00944534543342,
      "learning_rate": 1.1246171072249991e-07,
      "loss": 0.0334,
      "num_tokens": 2581922508.0,
      "reward": 0.05078125,
      "reward_std": 0.11399346590042114,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 2733
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.732421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1672.53125,
      "completions/mean_terminated_length": 644.788330078125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9333447128104464,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 67.66819588666246,
      "learning_rate": 1.1233672960604836e-07,
      "loss": 0.0053,
      "num_tokens": 2582863100.0,
      "reward": 0.083984375,
      "reward_std": 0.09488959610462189,
      "rewards/accuracy_reward/mean": 0.083984375,
      "rewards/accuracy_reward/std": 0.2776356339454651,
      "step": 2734
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.638671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2028.0,
      "completions/mean_length": 1573.80859375,
      "completions/mean_terminated_length": 735.6432495117188,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.9336860971238372,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 37.98364460807045,
      "learning_rate": 1.1221236967255949e-07,
      "loss": 0.0158,
      "num_tokens": 2583743226.0,
      "reward": 0.048828125,
      "reward_std": 0.09995920211076736,
      "rewards/accuracy_reward/mean": 0.05040322616696358,
      "rewards/accuracy_reward/std": 0.21899642050266266,
      "step": 2735
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2009.0,
      "completions/mean_length": 1534.20703125,
      "completions/mean_terminated_length": 618.3152465820312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9340274814372279,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 44.67205170004858,
      "learning_rate": 1.1208863109853938e-07,
      "loss": 0.0095,
      "num_tokens": 2584603140.0,
      "reward": 0.087890625,
      "reward_std": 0.09291848540306091,
      "rewards/accuracy_reward/mean": 0.087890625,
      "rewards/accuracy_reward/std": 0.2834126651287079,
      "step": 2736
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.70703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1823.0,
      "completions/mean_length": 1633.912109375,
      "completions/mean_terminated_length": 634.5800170898438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9343688657506187,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 115.5863804021599,
      "learning_rate": 1.1196551405961232e-07,
      "loss": 0.0076,
      "num_tokens": 2585514087.0,
      "reward": 0.078125,
      "reward_std": 0.11599055677652359,
      "rewards/accuracy_reward/mean": 0.08064515888690948,
      "rewards/accuracy_reward/std": 0.2725643217563629,
      "step": 2737
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.697265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1851.0,
      "completions/mean_length": 1628.193359375,
      "completions/mean_terminated_length": 661.5806274414062,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9347102500640095,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 20.030953089130673,
      "learning_rate": 1.1184301873052052e-07,
      "loss": 0.0247,
      "num_tokens": 2586430154.0,
      "reward": 0.056640625,
      "reward_std": 0.09793071448802948,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 2738
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.716796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1984.0,
      "completions/mean_length": 1656.51171875,
      "completions/mean_terminated_length": 665.641357421875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9350516343774004,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 15.98536389419781,
      "learning_rate": 1.1172114528512358e-07,
      "loss": 0.0124,
      "num_tokens": 2587352576.0,
      "reward": 0.02734375,
      "reward_std": 0.07041998207569122,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 2739
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1778.0,
      "completions/mean_length": 1700.65625,
      "completions/mean_terminated_length": 566.0,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.9353930186907912,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 66.11732070506618,
      "learning_rate": 1.115998938963986e-07,
      "loss": -0.0109,
      "num_tokens": 2588304880.0,
      "reward": 0.025390625,
      "reward_std": 0.05913964658975601,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 2740
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.669921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1880.0,
      "completions/mean_length": 1563.783203125,
      "completions/mean_terminated_length": 581.023681640625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.935734403004182,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 51.30875452422642,
      "learning_rate": 1.1147926473643973e-07,
      "loss": 0.018,
      "num_tokens": 2589178081.0,
      "reward": 0.076171875,
      "reward_std": 0.06713200360536575,
      "rewards/accuracy_reward/mean": 0.076171875,
      "rewards/accuracy_reward/std": 0.26553234457969666,
      "step": 2741
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1756.0,
      "completions/mean_length": 1648.302734375,
      "completions/mean_terminated_length": 626.8541870117188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9360757873175728,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 12.031535529426744,
      "learning_rate": 1.1135925797645812e-07,
      "loss": 0.0281,
      "num_tokens": 2590099516.0,
      "reward": 0.05078125,
      "reward_std": 0.10107050836086273,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 2742
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.669921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2003.0,
      "completions/mean_length": 1601.560546875,
      "completions/mean_terminated_length": 695.473388671875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9364171716309636,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 91.9273160364728,
      "learning_rate": 1.1123987378678127e-07,
      "loss": -0.0145,
      "num_tokens": 2591001099.0,
      "reward": 0.06640625,
      "reward_std": 0.12862756848335266,
      "rewards/accuracy_reward/mean": 0.06640625,
      "rewards/accuracy_reward/std": 0.2492343932390213,
      "step": 2743
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2028.0,
      "completions/mean_length": 1624.4921875,
      "completions/mean_terminated_length": 675.6203002929688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.9367585559443543,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 58.1854467136591,
      "learning_rate": 1.1112111233685323e-07,
      "loss": 0.0133,
      "num_tokens": 2591915447.0,
      "reward": 0.080078125,
      "reward_std": 0.11256343126296997,
      "rewards/accuracy_reward/mean": 0.080078125,
      "rewards/accuracy_reward/std": 0.271679550409317,
      "step": 2744
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.681640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1945.0,
      "completions/mean_length": 1611.607421875,
      "completions/mean_terminated_length": 677.245361328125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9370999402577451,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 52.80460932369611,
      "learning_rate": 1.1100297379523423e-07,
      "loss": 0.0325,
      "num_tokens": 2592813342.0,
      "reward": 0.078125,
      "reward_std": 0.12452814728021622,
      "rewards/accuracy_reward/mean": 0.078125,
      "rewards/accuracy_reward/std": 0.26863065361976624,
      "step": 2745
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.732421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2027.0,
      "completions/mean_length": 1676.80078125,
      "completions/mean_terminated_length": 660.7445068359375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9374413245711359,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 186.4548638799342,
      "learning_rate": 1.108854583296002e-07,
      "loss": 0.0135,
      "num_tokens": 2593764888.0,
      "reward": 0.03515625,
      "reward_std": 0.09358368813991547,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 2746
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.736328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1923.0,
      "completions/mean_length": 1677.2578125,
      "completions/mean_terminated_length": 642.34814453125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9377827088845268,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 23.75898137694367,
      "learning_rate": 1.1076856610674298e-07,
      "loss": 0.0023,
      "num_tokens": 2594706412.0,
      "reward": 0.041015625,
      "reward_std": 0.051707521080970764,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 2747
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.607421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1986.0,
      "completions/mean_length": 1497.279296875,
      "completions/mean_terminated_length": 645.1691284179688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9381240931979176,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 35.61007948697366,
      "learning_rate": 1.106522972925696e-07,
      "loss": 0.0227,
      "num_tokens": 2595543211.0,
      "reward": 0.056640625,
      "reward_std": 0.11520528793334961,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 2748
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1995.0,
      "completions/mean_length": 1548.552734375,
      "completions/mean_terminated_length": 611.3876342773438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9384654775113084,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 44.124940602045285,
      "learning_rate": 1.1053665205210249e-07,
      "loss": 0.006,
      "num_tokens": 2596410790.0,
      "reward": 0.044921875,
      "reward_std": 0.08626452088356018,
      "rewards/accuracy_reward/mean": 0.04791666567325592,
      "rewards/accuracy_reward/std": 0.21381278336048126,
      "step": 2749
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1888.0,
      "completions/mean_length": 1543.865234375,
      "completions/mean_terminated_length": 614.0166625976562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9388068618246992,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 44.82521235730442,
      "learning_rate": 1.1042163054947881e-07,
      "loss": 0.008,
      "num_tokens": 2597278337.0,
      "reward": 0.0546875,
      "reward_std": 0.09908141195774078,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 2750
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.72265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1955.0,
      "completions/mean_length": 1641.416015625,
      "completions/mean_terminated_length": 582.0070190429688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.93914824613809,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 51.29181778064116,
      "learning_rate": 1.1030723294795055e-07,
      "loss": -0.0011,
      "num_tokens": 2598198502.0,
      "reward": 0.021484375,
      "reward_std": 0.036420635879039764,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 2751
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.74609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2031.0,
      "completions/mean_length": 1679.267578125,
      "completions/mean_terminated_length": 595.7615356445312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9394896304514807,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 75.63200972463699,
      "learning_rate": 1.1019345940988427e-07,
      "loss": 0.0057,
      "num_tokens": 2599148095.0,
      "reward": 0.03125,
      "reward_std": 0.06805649399757385,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 2752
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.708984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2028.0,
      "completions/mean_length": 1632.703125,
      "completions/mean_terminated_length": 621.2684326171875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9398310147648715,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 22.784256299772256,
      "learning_rate": 1.1008031009676061e-07,
      "loss": 0.007,
      "num_tokens": 2600065271.0,
      "reward": 0.107421875,
      "reward_std": 0.10524433106184006,
      "rewards/accuracy_reward/mean": 0.107421875,
      "rewards/accuracy_reward/std": 0.30995169281959534,
      "step": 2753
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.697265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1972.0,
      "completions/mean_length": 1608.4765625,
      "completions/mean_terminated_length": 596.1548461914062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9401723990782623,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 21.234588876096534,
      "learning_rate": 1.0996778516917438e-07,
      "loss": 0.0025,
      "num_tokens": 2600965051.0,
      "reward": 0.048828125,
      "reward_std": 0.1072334349155426,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 2754
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1924.0,
      "completions/mean_length": 1612.96484375,
      "completions/mean_terminated_length": 620.1923217773438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9405137833916531,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 132.7547833821959,
      "learning_rate": 1.0985588478683407e-07,
      "loss": -0.0065,
      "num_tokens": 2601869161.0,
      "reward": 0.068359375,
      "reward_std": 0.11258251965045929,
      "rewards/accuracy_reward/mean": 0.068359375,
      "rewards/accuracy_reward/std": 0.25260838866233826,
      "step": 2755
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.677734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2038.0,
      "completions/mean_length": 1600.240234375,
      "completions/mean_terminated_length": 658.9030151367188,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.940855167705044,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 112.81075642327981,
      "learning_rate": 1.0974460910856182e-07,
      "loss": 0.0141,
      "num_tokens": 2602767652.0,
      "reward": 0.15234375,
      "reward_std": 0.17551058530807495,
      "rewards/accuracy_reward/mean": 0.15234375,
      "rewards/accuracy_reward/std": 0.35970520973205566,
      "step": 2756
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.736328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1977.0,
      "completions/mean_length": 1686.90625,
      "completions/mean_terminated_length": 678.5184936523438,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9411965520184348,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 38.14059005238338,
      "learning_rate": 1.0963395829229322e-07,
      "loss": 0.0071,
      "num_tokens": 2603712660.0,
      "reward": 0.08203125,
      "reward_std": 0.11199754476547241,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 2757
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1734.0,
      "completions/mean_length": 1574.728515625,
      "completions/mean_terminated_length": 622.61767578125,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.9415379363318256,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 18.127502583790335,
      "learning_rate": 1.0952393249507669e-07,
      "loss": 0.0043,
      "num_tokens": 2604598345.0,
      "reward": 0.029296875,
      "reward_std": 0.04859926179051399,
      "rewards/accuracy_reward/mean": 0.030241934582591057,
      "rewards/accuracy_reward/std": 0.1714252382516861,
      "step": 2758
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1983.0,
      "completions/mean_length": 1727.6328125,
      "completions/mean_terminated_length": 634.336181640625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9418793206452164,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 33.65955123591645,
      "learning_rate": 1.0941453187307386e-07,
      "loss": 0.0072,
      "num_tokens": 2605575837.0,
      "reward": 0.02734375,
      "reward_std": 0.0580955371260643,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 2759
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.634765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1959.0,
      "completions/mean_length": 1534.009765625,
      "completions/mean_terminated_length": 640.7112426757812,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9422207049586071,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 60.070594072096036,
      "learning_rate": 1.0930575658155882e-07,
      "loss": -0.0078,
      "num_tokens": 2606437602.0,
      "reward": 0.044921875,
      "reward_std": 0.08384227007627487,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 2760
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.712890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2019.0,
      "completions/mean_length": 1645.064453125,
      "completions/mean_terminated_length": 644.5782470703125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9425620892719979,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 80.31843034298011,
      "learning_rate": 1.0919760677491827e-07,
      "loss": -0.0115,
      "num_tokens": 2607369187.0,
      "reward": 0.02734375,
      "reward_std": 0.05311024188995361,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 2761
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.712890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1992.0,
      "completions/mean_length": 1657.345703125,
      "completions/mean_terminated_length": 687.3536987304688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9429034735853887,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 55.031603826544206,
      "learning_rate": 1.0909008260665102e-07,
      "loss": 0.0086,
      "num_tokens": 2608292164.0,
      "reward": 0.076171875,
      "reward_std": 0.1269710510969162,
      "rewards/accuracy_reward/mean": 0.076171875,
      "rewards/accuracy_reward/std": 0.26553234457969666,
      "step": 2762
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.689453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2025.0,
      "completions/mean_length": 1624.078125,
      "completions/mean_terminated_length": 682.918212890625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9432448578987795,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 20.479561532627358,
      "learning_rate": 1.0898318422936796e-07,
      "loss": 0.0234,
      "num_tokens": 2609202188.0,
      "reward": 0.046875,
      "reward_std": 0.09668608754873276,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 2763
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.689453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1947.0,
      "completions/mean_length": 1618.396484375,
      "completions/mean_terminated_length": 664.6226196289062,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9435862422121704,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 33.901773629948465,
      "learning_rate": 1.0887691179479182e-07,
      "loss": 0.02,
      "num_tokens": 2610106903.0,
      "reward": 0.09375,
      "reward_std": 0.12933912873268127,
      "rewards/accuracy_reward/mean": 0.09375,
      "rewards/accuracy_reward/std": 0.29176566004753113,
      "step": 2764
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1977.0,
      "completions/mean_length": 1615.375,
      "completions/mean_terminated_length": 646.0759887695312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9439276265255612,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 53.82307433033624,
      "learning_rate": 1.0877126545375688e-07,
      "loss": -0.0011,
      "num_tokens": 2611007559.0,
      "reward": 0.03125,
      "reward_std": 0.07152432948350906,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 2765
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2016.0,
      "completions/mean_length": 1592.484375,
      "completions/mean_terminated_length": 625.9024047851562,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.944269010838952,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 20.78402405448913,
      "learning_rate": 1.0866624535620878e-07,
      "loss": 0.0274,
      "num_tokens": 2611894287.0,
      "reward": 0.072265625,
      "reward_std": 0.10606793314218521,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 2766
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.689453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1948.0,
      "completions/mean_length": 1618.310546875,
      "completions/mean_terminated_length": 664.3458862304688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.9446103951523428,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 41.76218707746861,
      "learning_rate": 1.0856185165120433e-07,
      "loss": 0.0131,
      "num_tokens": 2612793422.0,
      "reward": 0.0546875,
      "reward_std": 0.09737758338451385,
      "rewards/accuracy_reward/mean": 0.05645161122083664,
      "rewards/accuracy_reward/std": 0.23102475702762604,
      "step": 2767
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.70703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1984.0,
      "completions/mean_length": 1627.8671875,
      "completions/mean_terminated_length": 613.9466552734375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9449517794657335,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 74.51461093563086,
      "learning_rate": 1.0845808448691141e-07,
      "loss": 0.0193,
      "num_tokens": 2613704890.0,
      "reward": 0.064453125,
      "reward_std": 0.09804278612136841,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 2768
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.673828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2002.0,
      "completions/mean_length": 1617.619140625,
      "completions/mean_terminated_length": 728.509033203125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9452931637791243,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 17.538055564272263,
      "learning_rate": 1.0835494401060835e-07,
      "loss": 0.0308,
      "num_tokens": 2614605127.0,
      "reward": 0.060546875,
      "reward_std": 0.11081992089748383,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 2769
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1627.767578125,
      "completions/mean_terminated_length": 632.4802856445312,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.9456345480925151,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 28.037376991573883,
      "learning_rate": 1.0825243036868424e-07,
      "loss": -0.0077,
      "num_tokens": 2615524032.0,
      "reward": 0.03125,
      "reward_std": 0.054907046258449554,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 2770
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.673828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2018.0,
      "completions/mean_length": 1565.544921875,
      "completions/mean_terminated_length": 568.8563232421875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9459759324059059,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 59.25682114292323,
      "learning_rate": 1.081505437066386e-07,
      "loss": 0.0131,
      "num_tokens": 2616403655.0,
      "reward": 0.052734375,
      "reward_std": 0.09875431656837463,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 2771
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.73828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2007.0,
      "completions/mean_length": 1678.3671875,
      "completions/mean_terminated_length": 635.671630859375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9463173167192968,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 13.608687420642724,
      "learning_rate": 1.0804928416908073e-07,
      "loss": 0.0141,
      "num_tokens": 2617352851.0,
      "reward": 0.025390625,
      "reward_std": 0.07344770431518555,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 2772
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1658.947265625,
      "completions/mean_terminated_length": 625.1785888671875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9466587010326876,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 26.398674378618324,
      "learning_rate": 1.0794865189973011e-07,
      "loss": 0.0031,
      "num_tokens": 2618280008.0,
      "reward": 0.083984375,
      "reward_std": 0.10854348540306091,
      "rewards/accuracy_reward/mean": 0.083984375,
      "rewards/accuracy_reward/std": 0.2776356339454651,
      "step": 2773
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.740234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1924.0,
      "completions/mean_length": 1701.197265625,
      "completions/mean_terminated_length": 713.26318359375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9470000853460784,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 14.626394930538885,
      "learning_rate": 1.0784864704141585e-07,
      "loss": 0.0121,
      "num_tokens": 2619228205.0,
      "reward": 0.046875,
      "reward_std": 0.07104447484016418,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 2774
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1659.0,
      "completions/mean_length": 1552.5234375,
      "completions/mean_terminated_length": 537.9761962890625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.9473414696594692,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 6.401036062291176,
      "learning_rate": 1.0774926973607648e-07,
      "loss": 0.0223,
      "num_tokens": 2620094409.0,
      "reward": 0.03515625,
      "reward_std": 0.058320626616477966,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 2775
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.720703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1944.0,
      "completions/mean_length": 1624.53125,
      "completions/mean_terminated_length": 531.80419921875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.9476828539728599,
      "frac_reward_zero_std": 0.59375,
      "grad_norm": 76.81720096755522,
      "learning_rate": 1.0765052012475998e-07,
      "loss": 0.0102,
      "num_tokens": 2621001897.0,
      "reward": 0.115234375,
      "reward_std": 0.15497520565986633,
      "rewards/accuracy_reward/mean": 0.115234375,
      "rewards/accuracy_reward/std": 0.3196168541908264,
      "step": 2776
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.744140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1771.0,
      "completions/mean_length": 1684.212890625,
      "completions/mean_terminated_length": 626.1755981445312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9480242382862507,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 18.185943554605814,
      "learning_rate": 1.0755239834762326e-07,
      "loss": -0.0038,
      "num_tokens": 2621946630.0,
      "reward": 0.02734375,
      "reward_std": 0.07284127175807953,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 2777
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1757.0,
      "completions/mean_length": 1544.9453125,
      "completions/mean_terminated_length": 584.5681762695312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9483656225996415,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 40.33436865191403,
      "learning_rate": 1.0745490454393239e-07,
      "loss": 0.0179,
      "num_tokens": 2622817530.0,
      "reward": 0.04296875,
      "reward_std": 0.09820214658975601,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 2778
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1633.0,
      "completions/mean_length": 1633.5703125,
      "completions/mean_terminated_length": 574.4722290039062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9487070069130323,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 23.7167455906824,
      "learning_rate": 1.0735803885206191e-07,
      "loss": 0.0191,
      "num_tokens": 2623740334.0,
      "reward": 0.0546875,
      "reward_std": 0.10810574144124985,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 2779
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.681640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1945.0,
      "completions/mean_length": 1597.3125,
      "completions/mean_terminated_length": 632.3435668945312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9490483912264231,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 31.43848003449339,
      "learning_rate": 1.0726180140949497e-07,
      "loss": 0.0202,
      "num_tokens": 2624638670.0,
      "reward": 0.0625,
      "reward_std": 0.09055041521787643,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 2780
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.626953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1934.0,
      "completions/mean_length": 1524.224609375,
      "completions/mean_terminated_length": 643.952880859375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.949389775539814,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 298.9405124362075,
      "learning_rate": 1.0716619235282295e-07,
      "loss": -0.0019,
      "num_tokens": 2625502833.0,
      "reward": 0.078125,
      "reward_std": 0.13090261816978455,
      "rewards/accuracy_reward/mean": 0.078125,
      "rewards/accuracy_reward/std": 0.26863065361976624,
      "step": 2781
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.716796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1763.0,
      "completions/mean_length": 1657.978515625,
      "completions/mean_terminated_length": 670.8206787109375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9497311598532048,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 32.508942537629025,
      "learning_rate": 1.0707121181774556e-07,
      "loss": -0.001,
      "num_tokens": 2626425366.0,
      "reward": 0.041015625,
      "reward_std": 0.052558064460754395,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 2782
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.775390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1869.0,
      "completions/mean_length": 1731.576171875,
      "completions/mean_terminated_length": 639.22607421875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.9500725441665956,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 43.158924354935245,
      "learning_rate": 1.0697685993907009e-07,
      "loss": 0.005,
      "num_tokens": 2627382765.0,
      "reward": 0.076171875,
      "reward_std": 0.09994550049304962,
      "rewards/accuracy_reward/mean": 0.076171875,
      "rewards/accuracy_reward/std": 0.26553234457969666,
      "step": 2783
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.72265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1791.0,
      "completions/mean_length": 1669.041015625,
      "completions/mean_terminated_length": 681.6126708984375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9504139284799863,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 22.36082329580779,
      "learning_rate": 1.0688313685071194e-07,
      "loss": 0.0558,
      "num_tokens": 2628306546.0,
      "reward": 0.08203125,
      "reward_std": 0.11379435658454895,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 2784
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.712890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1875.0,
      "completions/mean_length": 1636.912109375,
      "completions/mean_terminated_length": 616.1836547851562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9507553127933771,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 2.4984989202737102,
      "learning_rate": 1.067900426856939e-07,
      "loss": 0.012,
      "num_tokens": 2629230789.0,
      "reward": 0.041015625,
      "reward_std": 0.04957009106874466,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 2785
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.669921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1894.0,
      "completions/mean_length": 1557.091796875,
      "completions/mean_terminated_length": 560.75146484375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9510966971067679,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 87.08025709085688,
      "learning_rate": 1.0669757757614602e-07,
      "loss": 0.0063,
      "num_tokens": 2630104660.0,
      "reward": 0.072265625,
      "reward_std": 0.11190377175807953,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 2786
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.708984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2033.0,
      "completions/mean_length": 1647.5703125,
      "completions/mean_terminated_length": 672.02685546875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.9514380814201587,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 57.59161770725123,
      "learning_rate": 1.0660574165330567e-07,
      "loss": 0.0135,
      "num_tokens": 2631026280.0,
      "reward": 0.07421875,
      "reward_std": 0.13241459429264069,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "step": 2787
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.67578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1872.0,
      "completions/mean_length": 1591.00390625,
      "completions/mean_terminated_length": 638.4698486328125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9517794657335495,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 5.165057838133421,
      "learning_rate": 1.065145350475171e-07,
      "loss": 0.0196,
      "num_tokens": 2631920746.0,
      "reward": 0.048828125,
      "reward_std": 0.069866843521595,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 2788
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.759765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1994.0,
      "completions/mean_length": 1699.037109375,
      "completions/mean_terminated_length": 595.406494140625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9521208500469404,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 17.10636160723882,
      "learning_rate": 1.0642395788823144e-07,
      "loss": 0.0076,
      "num_tokens": 2632869517.0,
      "reward": 0.029296875,
      "reward_std": 0.055033616721630096,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 2789
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.740234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1997.0,
      "completions/mean_length": 1679.794921875,
      "completions/mean_terminated_length": 630.5488891601562,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9524622343603312,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 57.983493154565714,
      "learning_rate": 1.0633401030400637e-07,
      "loss": 0.0042,
      "num_tokens": 2633816116.0,
      "reward": 0.056640625,
      "reward_std": 0.1038198471069336,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 2790
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1928.0,
      "completions/mean_length": 1711.263671875,
      "completions/mean_terminated_length": 611.2583618164062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.952803618673722,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 17.62248292375537,
      "learning_rate": 1.0624469242250607e-07,
      "loss": 0.0217,
      "num_tokens": 2634773547.0,
      "reward": 0.072265625,
      "reward_std": 0.1204371303319931,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 2791
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1801.0,
      "completions/mean_length": 1613.625,
      "completions/mean_terminated_length": 658.0,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9531450029871127,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 45.22998985733543,
      "learning_rate": 1.0615600437050094e-07,
      "loss": 0.0015,
      "num_tokens": 2635678219.0,
      "reward": 0.064453125,
      "reward_std": 0.08995649218559265,
      "rewards/accuracy_reward/mean": 0.06653226166963577,
      "rewards/accuracy_reward/std": 0.2494617998600006,
      "step": 2792
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1981.0,
      "completions/mean_length": 1606.55078125,
      "completions/mean_terminated_length": 617.4810180664062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9534863873005035,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 41.57475349412186,
      "learning_rate": 1.0606794627386739e-07,
      "loss": 0.0239,
      "num_tokens": 2636578533.0,
      "reward": 0.064453125,
      "reward_std": 0.13596117496490479,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 2793
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.74609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2024.0,
      "completions/mean_length": 1679.35546875,
      "completions/mean_terminated_length": 596.107666015625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9538277716138943,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 339.1737721130537,
      "learning_rate": 1.0598051825758785e-07,
      "loss": 0.0078,
      "num_tokens": 2637517531.0,
      "reward": 0.021484375,
      "reward_std": 0.04478531330823898,
      "rewards/accuracy_reward/mean": 0.02217741869390011,
      "rewards/accuracy_reward/std": 0.14740893244743347,
      "step": 2794
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1979.0,
      "completions/mean_length": 1714.236328125,
      "completions/mean_terminated_length": 691.7540283203125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9541691559272851,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 76.41940991299751,
      "learning_rate": 1.0589372044575035e-07,
      "loss": 0.0046,
      "num_tokens": 2638473316.0,
      "reward": 0.05859375,
      "reward_std": 0.09644614160060883,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 2795
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1866.0,
      "completions/mean_length": 1696.296875,
      "completions/mean_terminated_length": 683.8181762695312,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9545105402406759,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 67.88778799587612,
      "learning_rate": 1.0580755296154857e-07,
      "loss": 0.0097,
      "num_tokens": 2639414524.0,
      "reward": 0.052734375,
      "reward_std": 0.10453138500452042,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 2796
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.77734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1745.779296875,
      "completions/mean_terminated_length": 691.0877075195312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9548519245540668,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 76.90895384380907,
      "learning_rate": 1.0572201592728136e-07,
      "loss": 0.0061,
      "num_tokens": 2640393947.0,
      "reward": 0.041015625,
      "reward_std": 0.09820909798145294,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 2797
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.748046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1973.0,
      "completions/mean_length": 1686.509765625,
      "completions/mean_terminated_length": 613.248046875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9551933088674576,
      "frac_reward_zero_std": 0.96875,
      "grad_norm": 34.13231580675639,
      "learning_rate": 1.0563710946435309e-07,
      "loss": -0.001,
      "num_tokens": 2641329760.0,
      "reward": 0.009765625,
      "reward_std": 0.014959799125790596,
      "rewards/accuracy_reward/mean": 0.009765625,
      "rewards/accuracy_reward/std": 0.09843364357948303,
      "step": 2798
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1964.0,
      "completions/mean_length": 1562.044921875,
      "completions/mean_terminated_length": 584.4176635742188,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9555346931808484,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 62.972893025440385,
      "learning_rate": 1.0555283369327283e-07,
      "loss": 0.0196,
      "num_tokens": 2642204839.0,
      "reward": 0.099609375,
      "reward_std": 0.11646249890327454,
      "rewards/accuracy_reward/mean": 0.10282257944345474,
      "rewards/accuracy_reward/std": 0.30403366684913635,
      "step": 2799
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.732421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1889.0,
      "completions/mean_length": 1669.064453125,
      "completions/mean_terminated_length": 631.8320922851562,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9558760774942391,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 4.8162017538011455,
      "learning_rate": 1.0546918873365457e-07,
      "loss": 0.0113,
      "num_tokens": 2643133128.0,
      "reward": 0.029296875,
      "reward_std": 0.0688755214214325,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 2800
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.701171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1849.0,
      "completions/mean_length": 1629.677734375,
      "completions/mean_terminated_length": 648.1242065429688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9562174618076299,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 50.36986136012113,
      "learning_rate": 1.0538617470421715e-07,
      "loss": 0.0169,
      "num_tokens": 2644050003.0,
      "reward": 0.072265625,
      "reward_std": 0.08329563587903976,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 2801
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.740234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1887.0,
      "completions/mean_length": 1659.3515625,
      "completions/mean_terminated_length": 551.849609375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9565588461210207,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 15.521473220634554,
      "learning_rate": 1.0530379172278375e-07,
      "loss": 0.0134,
      "num_tokens": 2644976567.0,
      "reward": 0.078125,
      "reward_std": 0.11983070522546768,
      "rewards/accuracy_reward/mean": 0.078125,
      "rewards/accuracy_reward/std": 0.26863065361976624,
      "step": 2802
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.79296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1750.0,
      "completions/mean_length": 1780.99609375,
      "completions/mean_terminated_length": 758.3207397460938,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9569002304344115,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 54.45079795062958,
      "learning_rate": 1.0522203990628196e-07,
      "loss": 0.0122,
      "num_tokens": 2645965877.0,
      "reward": 0.064453125,
      "reward_std": 0.11409997940063477,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 2803
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1841.0,
      "completions/mean_length": 1647.619140625,
      "completions/mean_terminated_length": 662.898681640625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9572416147478023,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 30.585366899428866,
      "learning_rate": 1.0514091937074349e-07,
      "loss": 0.0075,
      "num_tokens": 2646888594.0,
      "reward": 0.087890625,
      "reward_std": 0.11674422025680542,
      "rewards/accuracy_reward/mean": 0.087890625,
      "rewards/accuracy_reward/std": 0.2834126651287079,
      "step": 2804
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 1699.06640625,
      "completions/mean_terminated_length": 559.2166748046875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9575829990611932,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 21.93363708169347,
      "learning_rate": 1.050604302313042e-07,
      "loss": -0.0009,
      "num_tokens": 2647835060.0,
      "reward": 0.021484375,
      "reward_std": 0.05688370764255524,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 2805
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.76953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2024.0,
      "completions/mean_length": 1752.119140625,
      "completions/mean_terminated_length": 764.177978515625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.957924383374584,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 23.87044937716521,
      "learning_rate": 1.0498057260220361e-07,
      "loss": 0.0219,
      "num_tokens": 2648811681.0,
      "reward": 0.0390625,
      "reward_std": 0.1009584367275238,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 2806
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1870.0,
      "completions/mean_length": 1627.259765625,
      "completions/mean_terminated_length": 592.4662475585938,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9582657676879748,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 5.046464069521894,
      "learning_rate": 1.0490134659678501e-07,
      "loss": 0.0012,
      "num_tokens": 2649722934.0,
      "reward": 0.05078125,
      "reward_std": 0.07261618971824646,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 2807
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.779296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2024.0,
      "completions/mean_length": 1746.93359375,
      "completions/mean_terminated_length": 683.8760986328125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9586071520013656,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 33.981074276649544,
      "learning_rate": 1.0482275232749527e-07,
      "loss": -0.0007,
      "num_tokens": 2650698244.0,
      "reward": 0.060546875,
      "reward_std": 0.06570751965045929,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 2808
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.705078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1960.0,
      "completions/mean_length": 1603.669921875,
      "completions/mean_terminated_length": 541.3973388671875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9589485363147563,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 63.151196722593745,
      "learning_rate": 1.0474478990588456e-07,
      "loss": 0.0097,
      "num_tokens": 2651597083.0,
      "reward": 0.048828125,
      "reward_std": 0.10310593992471695,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 2809
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.697265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2015.0,
      "completions/mean_length": 1641.763671875,
      "completions/mean_terminated_length": 706.1096801757812,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9592899206281471,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 34.988260941838924,
      "learning_rate": 1.0466745944260631e-07,
      "loss": -0.0037,
      "num_tokens": 2652516258.0,
      "reward": 0.0234375,
      "reward_std": 0.05974511429667473,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 2810
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.759765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2016.0,
      "completions/mean_length": 1717.16796875,
      "completions/mean_terminated_length": 671.3495483398438,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.9596313049415379,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 55.5623160185692,
      "learning_rate": 1.0459076104741699e-07,
      "loss": 0.0234,
      "num_tokens": 2653483592.0,
      "reward": 0.078125,
      "reward_std": 0.13841770589351654,
      "rewards/accuracy_reward/mean": 0.078125,
      "rewards/accuracy_reward/std": 0.26863065361976624,
      "step": 2811
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.724609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2017.0,
      "completions/mean_length": 1661.779296875,
      "completions/mean_terminated_length": 645.5531616210938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9599726892549287,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 59.45413373747896,
      "learning_rate": 1.045146948291758e-07,
      "loss": -0.0079,
      "num_tokens": 2654414679.0,
      "reward": 0.02734375,
      "reward_std": 0.07190324366092682,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 2812
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.732421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1912.0,
      "completions/mean_length": 1646.609375,
      "completions/mean_terminated_length": 548.2554931640625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9603140735683195,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 40.86011064635546,
      "learning_rate": 1.0443926089584498e-07,
      "loss": 0.0155,
      "num_tokens": 2655343391.0,
      "reward": 0.060546875,
      "reward_std": 0.12559986114501953,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 2813
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.736328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1993.0,
      "completions/mean_length": 1676.404296875,
      "completions/mean_terminated_length": 638.6889038085938,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9606554578817104,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 55.44845658479631,
      "learning_rate": 1.0436445935448916e-07,
      "loss": 0.0357,
      "num_tokens": 2656277534.0,
      "reward": 0.078125,
      "reward_std": 0.12275754660367966,
      "rewards/accuracy_reward/mean": 0.078125,
      "rewards/accuracy_reward/std": 0.26863065361976624,
      "step": 2814
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.73828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1964.0,
      "completions/mean_length": 1670.787109375,
      "completions/mean_terminated_length": 606.7089233398438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9609968421951012,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 90.74118891929331,
      "learning_rate": 1.0429029031127539e-07,
      "loss": 0.014,
      "num_tokens": 2657212945.0,
      "reward": 0.017578125,
      "reward_std": 0.04125870764255524,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "step": 2815
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1979.0,
      "completions/mean_length": 1628.625,
      "completions/mean_terminated_length": 653.7142944335938,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.961338226508492,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 18.735630472699523,
      "learning_rate": 1.042167538714731e-07,
      "loss": 0.0027,
      "num_tokens": 2658126657.0,
      "reward": 0.037109375,
      "reward_std": 0.07509727776050568,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 2816
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1614.47265625,
      "completions/mean_terminated_length": 694.5487670898438,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9616796108218827,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 24.325912136683133,
      "learning_rate": 1.0414385013945384e-07,
      "loss": 0.02,
      "num_tokens": 2659026147.0,
      "reward": 0.07421875,
      "reward_std": 0.09765692055225372,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "step": 2817
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.654296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1925.0,
      "completions/mean_length": 1567.1796875,
      "completions/mean_terminated_length": 657.1525268554688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9620209951352735,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 17.69442269370351,
      "learning_rate": 1.040715792186911e-07,
      "loss": 0.0101,
      "num_tokens": 2659902079.0,
      "reward": 0.02734375,
      "reward_std": 0.044233135879039764,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 2818
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.736328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1766.0,
      "completions/mean_length": 1677.353515625,
      "completions/mean_terminated_length": 642.2888793945312,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9623623794486643,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 38.01990825802955,
      "learning_rate": 1.0399994121176025e-07,
      "loss": 0.0056,
      "num_tokens": 2660832932.0,
      "reward": 0.041015625,
      "reward_std": 0.07933683693408966,
      "rewards/accuracy_reward/mean": 0.04233871027827263,
      "rewards/accuracy_reward/std": 0.2015640139579773,
      "step": 2819
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.685546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1786.0,
      "completions/mean_length": 1625.408203125,
      "completions/mean_terminated_length": 704.1055908203125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9627037637620551,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 46.61658634146071,
      "learning_rate": 1.039289362203383e-07,
      "loss": 0.0158,
      "num_tokens": 2661735397.0,
      "reward": 0.060546875,
      "reward_std": 0.10766562819480896,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 2820
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.728515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1823.0,
      "completions/mean_length": 1654.796875,
      "completions/mean_terminated_length": 599.6546630859375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.963045148075446,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 69.15783122166368,
      "learning_rate": 1.0385856434520387e-07,
      "loss": 0.0164,
      "num_tokens": 2662655485.0,
      "reward": 0.08984375,
      "reward_std": 0.10579650849103928,
      "rewards/accuracy_reward/mean": 0.08984375,
      "rewards/accuracy_reward/std": 0.2862374484539032,
      "step": 2821
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1972.0,
      "completions/mean_length": 1693.587890625,
      "completions/mean_terminated_length": 673.3106079101562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9633865323888368,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 28.06453506220154,
      "learning_rate": 1.0378882568623697e-07,
      "loss": 0.0261,
      "num_tokens": 2663607258.0,
      "reward": 0.111328125,
      "reward_std": 0.14288339018821716,
      "rewards/accuracy_reward/mean": 0.111328125,
      "rewards/accuracy_reward/std": 0.31484565138816833,
      "step": 2822
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.72265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1855.0,
      "completions/mean_length": 1636.26953125,
      "completions/mean_terminated_length": 563.45068359375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9637279167022276,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 18.20562581942902,
      "learning_rate": 1.037197203424189e-07,
      "loss": -0.0064,
      "num_tokens": 2664519076.0,
      "reward": 0.03125,
      "reward_std": 0.06574726104736328,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 2823
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.693359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1998.0,
      "completions/mean_length": 1630.193359375,
      "completions/mean_terminated_length": 685.8344116210938,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.9640693010156184,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 29.577812066301465,
      "learning_rate": 1.0365124841183199e-07,
      "loss": 0.0065,
      "num_tokens": 2665438599.0,
      "reward": 0.060546875,
      "reward_std": 0.09110259264707565,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 2824
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1950.0,
      "completions/mean_length": 1617.326171875,
      "completions/mean_terminated_length": 616.1493530273438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9644106853290091,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 33.911037396901534,
      "learning_rate": 1.0358340999165966e-07,
      "loss": -0.0029,
      "num_tokens": 2666347822.0,
      "reward": 0.064453125,
      "reward_std": 0.08879335969686508,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 2825
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1937.0,
      "completions/mean_length": 1627.984375,
      "completions/mean_terminated_length": 669.4871826171875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.9647520696423999,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 118.65605462662212,
      "learning_rate": 1.035162051781861e-07,
      "loss": 0.0135,
      "num_tokens": 2667262870.0,
      "reward": 0.048828125,
      "reward_std": 0.09958265721797943,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 2826
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2014.0,
      "completions/mean_length": 1564.994140625,
      "completions/mean_terminated_length": 642.89208984375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9650934539557907,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 5.879935933961238,
      "learning_rate": 1.0344963406679633e-07,
      "loss": 0.0255,
      "num_tokens": 2668136819.0,
      "reward": 0.037109375,
      "reward_std": 0.07399433106184006,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 2827
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.70703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2011.0,
      "completions/mean_length": 1621.35546875,
      "completions/mean_terminated_length": 592.0933227539062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9654348382691815,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 7.246639897713062,
      "learning_rate": 1.0338369675197584e-07,
      "loss": -0.0122,
      "num_tokens": 2669057049.0,
      "reward": 0.029296875,
      "reward_std": 0.06183479726314545,
      "rewards/accuracy_reward/mean": 0.029296875,
      "rewards/accuracy_reward/std": 0.16880230605602264,
      "step": 2828
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1920.0,
      "completions/mean_length": 1643.900390625,
      "completions/mean_terminated_length": 611.201416015625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9657762225825723,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 183.4541280317545,
      "learning_rate": 1.0331839332731053e-07,
      "loss": 0.04,
      "num_tokens": 2669978902.0,
      "reward": 0.083984375,
      "reward_std": 0.14557647705078125,
      "rewards/accuracy_reward/mean": 0.083984375,
      "rewards/accuracy_reward/std": 0.2776356339454651,
      "step": 2829
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.666015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2011.0,
      "completions/mean_length": 1579.33984375,
      "completions/mean_terminated_length": 644.76025390625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9661176068959632,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 33.91036134659535,
      "learning_rate": 1.0325372388548673e-07,
      "loss": 0.0272,
      "num_tokens": 2670868756.0,
      "reward": 0.0859375,
      "reward_std": 0.1156454086303711,
      "rewards/accuracy_reward/mean": 0.0859375,
      "rewards/accuracy_reward/std": 0.28054583072662354,
      "step": 2830
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.74609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1679.0,
      "completions/mean_length": 1678.3359375,
      "completions/mean_terminated_length": 592.09228515625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.966458991209354,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 37.96084090286949,
      "learning_rate": 1.0318968851829084e-07,
      "loss": 0.0173,
      "num_tokens": 2671804016.0,
      "reward": 0.064453125,
      "reward_std": 0.09656515717506409,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 2831
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.783203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1925.0,
      "completions/mean_length": 1751.162109375,
      "completions/mean_terminated_length": 678.8018188476562,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9668003755227448,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 13.438861638100496,
      "learning_rate": 1.031262873166094e-07,
      "loss": 0.0102,
      "num_tokens": 2672778499.0,
      "reward": 0.072265625,
      "reward_std": 0.09212085604667664,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 2832
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.728515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1986.0,
      "completions/mean_length": 1674.5703125,
      "completions/mean_terminated_length": 672.4891967773438,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9671417598361355,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 43.35877301799867,
      "learning_rate": 1.0306352037042878e-07,
      "loss": 0.0009,
      "num_tokens": 2673705383.0,
      "reward": 0.0390625,
      "reward_std": 0.06310540437698364,
      "rewards/accuracy_reward/mean": 0.0390625,
      "rewards/accuracy_reward/std": 0.1939331740140915,
      "step": 2833
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.720703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 1661.263671875,
      "completions/mean_terminated_length": 663.3216552734375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9674831441495263,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 23.77442352253742,
      "learning_rate": 1.0300138776883521e-07,
      "loss": 0.0075,
      "num_tokens": 2674631822.0,
      "reward": 0.02734375,
      "reward_std": 0.07685433328151703,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 2834
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1990.0,
      "completions/mean_length": 1640.572265625,
      "completions/mean_terminated_length": 619.2123413085938,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9678245284629171,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 5.553063967294221,
      "learning_rate": 1.0293988960001453e-07,
      "loss": -0.0018,
      "num_tokens": 2675539651.0,
      "reward": 0.005859375,
      "reward_std": 0.0234375,
      "rewards/accuracy_reward/mean": 0.005859375,
      "rewards/accuracy_reward/std": 0.07639661431312561,
      "step": 2835
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1851.0,
      "completions/mean_length": 1564.513671875,
      "completions/mean_terminated_length": 702.9619750976562,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9681659127763079,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 99.31695707004648,
      "learning_rate": 1.0287902595125212e-07,
      "loss": 0.0176,
      "num_tokens": 2676420522.0,
      "reward": 0.0703125,
      "reward_std": 0.12999781966209412,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 2836
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.666015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1995.0,
      "completions/mean_length": 1602.0703125,
      "completions/mean_terminated_length": 712.8187255859375,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.9685072970896987,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 39.407667955363884,
      "learning_rate": 1.0281879690893287e-07,
      "loss": 0.0022,
      "num_tokens": 2677317454.0,
      "reward": 0.048828125,
      "reward_std": 0.07797222584486008,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 2837
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.681640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2018.0,
      "completions/mean_length": 1606.453125,
      "completions/mean_terminated_length": 661.05517578125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9688486814030896,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 62.54421755793352,
      "learning_rate": 1.0275920255854082e-07,
      "loss": 0.0156,
      "num_tokens": 2678222534.0,
      "reward": 0.0625,
      "reward_std": 0.06409768760204315,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 2838
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2023.0,
      "completions/mean_length": 1552.720703125,
      "completions/mean_terminated_length": 573.6802368164062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9691900657164804,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 7.044876609113754,
      "learning_rate": 1.0270024298465929e-07,
      "loss": 0.0111,
      "num_tokens": 2679093863.0,
      "reward": 0.08203125,
      "reward_std": 0.11379435658454895,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 2839
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.666015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1943.0,
      "completions/mean_length": 1591.505859375,
      "completions/mean_terminated_length": 681.1871337890625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9695314500298712,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 28.719995002620692,
      "learning_rate": 1.0264191827097057e-07,
      "loss": 0.0081,
      "num_tokens": 2679991450.0,
      "reward": 0.015625,
      "reward_std": 0.04175759106874466,
      "rewards/accuracy_reward/mean": 0.015625,
      "rewards/accuracy_reward/std": 0.12414088100194931,
      "step": 2840
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.697265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2007.0,
      "completions/mean_length": 1642.142578125,
      "completions/mean_terminated_length": 707.3612670898438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9698728343432619,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 7.560760169506948,
      "learning_rate": 1.0258422850025601e-07,
      "loss": 0.0043,
      "num_tokens": 2680913187.0,
      "reward": 0.041015625,
      "reward_std": 0.09511469304561615,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 2841
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.673828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1590.009765625,
      "completions/mean_terminated_length": 644.1377563476562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9702142186566527,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 99.34078843576712,
      "learning_rate": 1.025271737543956e-07,
      "loss": 0.0103,
      "num_tokens": 2681813336.0,
      "reward": 0.037109375,
      "reward_std": 0.08725681155920029,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 2842
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2038.0,
      "completions/mean_length": 1624.525390625,
      "completions/mean_terminated_length": 658.4807739257812,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.9705556029700435,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 84.22504371034195,
      "learning_rate": 1.0247075411436815e-07,
      "loss": 0.0181,
      "num_tokens": 2682720581.0,
      "reward": 0.064453125,
      "reward_std": 0.11563889682292938,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 2843
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.650390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1936.0,
      "completions/mean_length": 1583.681640625,
      "completions/mean_terminated_length": 719.8938598632812,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9708969872834343,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 71.82667001143582,
      "learning_rate": 1.0241496966025103e-07,
      "loss": 0.0184,
      "num_tokens": 2683611554.0,
      "reward": 0.0625,
      "reward_std": 0.09145425260066986,
      "rewards/accuracy_reward/mean": 0.0625,
      "rewards/accuracy_reward/std": 0.2422981858253479,
      "step": 2844
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.716796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2035.0,
      "completions/mean_length": 1624.330078125,
      "completions/mean_terminated_length": 552.0068969726562,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.9712383715968251,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 5.761440234800375,
      "learning_rate": 1.0235982047121997e-07,
      "loss": -0.0012,
      "num_tokens": 2684524939.0,
      "reward": 0.044921875,
      "reward_std": 0.05287160724401474,
      "rewards/accuracy_reward/mean": 0.044921875,
      "rewards/accuracy_reward/std": 0.20733514428138733,
      "step": 2845
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1720.0,
      "completions/mean_length": 1604.26171875,
      "completions/mean_terminated_length": 572.7142944335938,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.971579755910216,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 31.834671505250128,
      "learning_rate": 1.0230530662554937e-07,
      "loss": 0.0049,
      "num_tokens": 2685424609.0,
      "reward": 0.064453125,
      "reward_std": 0.08241777867078781,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 2846
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.708984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1866.0,
      "completions/mean_length": 1651.60546875,
      "completions/mean_terminated_length": 685.8926391601562,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9719211402236068,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 17.713543753850242,
      "learning_rate": 1.0225142820061143e-07,
      "loss": -0.0032,
      "num_tokens": 2686348503.0,
      "reward": 0.09765625,
      "reward_std": 0.1158457025885582,
      "rewards/accuracy_reward/mean": 0.09765625,
      "rewards/accuracy_reward/std": 0.29713961482048035,
      "step": 2847
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.66796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1966.0,
      "completions/mean_length": 1576.96484375,
      "completions/mean_terminated_length": 629.3529663085938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9722625245369976,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 62.51068606352571,
      "learning_rate": 1.0219818527287691e-07,
      "loss": -0.0067,
      "num_tokens": 2687238277.0,
      "reward": 0.068359375,
      "reward_std": 0.09820909798145294,
      "rewards/accuracy_reward/mean": 0.068359375,
      "rewards/accuracy_reward/std": 0.25260838866233826,
      "step": 2848
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1666.322265625,
      "completions/mean_terminated_length": 709.9383544921875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9726039088503883,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 23.731152225323623,
      "learning_rate": 1.0214557791791425e-07,
      "loss": 0.005,
      "num_tokens": 2688177194.0,
      "reward": 0.02734375,
      "reward_std": 0.07630910724401474,
      "rewards/accuracy_reward/mean": 0.02822580561041832,
      "rewards/accuracy_reward/std": 0.1657845675945282,
      "step": 2849
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1979.0,
      "completions/mean_length": 1689.6875,
      "completions/mean_terminated_length": 699.058837890625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9729452931637791,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 49.90230468108034,
      "learning_rate": 1.0209360621039007e-07,
      "loss": 0.0196,
      "num_tokens": 2689120106.0,
      "reward": 0.1015625,
      "reward_std": 0.14755409955978394,
      "rewards/accuracy_reward/mean": 0.1015625,
      "rewards/accuracy_reward/std": 0.30236753821372986,
      "step": 2850
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.677734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1884.0,
      "completions/mean_length": 1590.05078125,
      "completions/mean_terminated_length": 627.5575561523438,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9732866774771699,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 129.85617536703256,
      "learning_rate": 1.0204227022406866e-07,
      "loss": 0.022,
      "num_tokens": 2690023348.0,
      "reward": 0.09375,
      "reward_std": 0.10590954124927521,
      "rewards/accuracy_reward/mean": 0.09375,
      "rewards/accuracy_reward/std": 0.29176566004753113,
      "step": 2851
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.701171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1998.0,
      "completions/mean_length": 1625.298828125,
      "completions/mean_terminated_length": 633.7843017578125,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9736280617905607,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 18.83981082691846,
      "learning_rate": 1.019915700318121e-07,
      "loss": 0.0007,
      "num_tokens": 2690941501.0,
      "reward": 0.041015625,
      "reward_std": 0.0706586092710495,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 2852
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.767578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1944.0,
      "completions/mean_length": 1725.42578125,
      "completions/mean_terminated_length": 660.11767578125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9739694461039515,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 40.20038317790714,
      "learning_rate": 1.0194150570558e-07,
      "loss": 0.004,
      "num_tokens": 2691906279.0,
      "reward": 0.05078125,
      "reward_std": 0.09225328266620636,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 2853
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1673.64453125,
      "completions/mean_terminated_length": 752.9324340820312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9743108304173423,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 8.834654771958183,
      "learning_rate": 1.0189207731642956e-07,
      "loss": 0.0085,
      "num_tokens": 2692845809.0,
      "reward": 0.017578125,
      "reward_std": 0.04219770431518555,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "step": 2854
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.634765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2007.0,
      "completions/mean_length": 1533.400390625,
      "completions/mean_terminated_length": 639.0427856445312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9746522147307332,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 52.922226261703905,
      "learning_rate": 1.0184328493451527e-07,
      "loss": 0.0211,
      "num_tokens": 2693707918.0,
      "reward": 0.0546875,
      "reward_std": 0.11889266967773438,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 2855
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.732421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1797.0,
      "completions/mean_length": 1645.373046875,
      "completions/mean_terminated_length": 543.2919921875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.974993599044124,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 32.55534278262826,
      "learning_rate": 1.0179512862908905e-07,
      "loss": 0.0147,
      "num_tokens": 2694626237.0,
      "reward": 0.078125,
      "reward_std": 0.09554657340049744,
      "rewards/accuracy_reward/mean": 0.078125,
      "rewards/accuracy_reward/std": 0.26863065361976624,
      "step": 2856
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.74609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1950.0,
      "completions/mean_length": 1676.3046875,
      "completions/mean_terminated_length": 584.515380859375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9753349833575147,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 28.232388209437882,
      "learning_rate": 1.0174760846849994e-07,
      "loss": 0.0163,
      "num_tokens": 2695569193.0,
      "reward": 0.064453125,
      "reward_std": 0.08577118813991547,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 2857
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.62890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1968.0,
      "completions/mean_length": 1521.71484375,
      "completions/mean_terminated_length": 629.800048828125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9756763676709055,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 7.724777644549814,
      "learning_rate": 1.0170072452019414e-07,
      "loss": 0.0261,
      "num_tokens": 2696424263.0,
      "reward": 0.068359375,
      "reward_std": 0.11218452453613281,
      "rewards/accuracy_reward/mean": 0.068359375,
      "rewards/accuracy_reward/std": 0.25260838866233826,
      "step": 2858
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.72265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1938.0,
      "completions/mean_length": 1654.435546875,
      "completions/mean_terminated_length": 628.95068359375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.9760177519842963,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 40.930787500666476,
      "learning_rate": 1.0165447685071481e-07,
      "loss": 0.0022,
      "num_tokens": 2697355366.0,
      "reward": 0.03515625,
      "reward_std": 0.06987475603818893,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 2859
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2017.0,
      "completions/mean_length": 1673.90234375,
      "completions/mean_terminated_length": 787.881591796875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.9763591362976871,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 19.793387056519745,
      "learning_rate": 1.0160886552570211e-07,
      "loss": -0.0041,
      "num_tokens": 2698287332.0,
      "reward": 0.025390625,
      "reward_std": 0.06661957502365112,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 2860
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1975.0,
      "completions/mean_length": 1613.166015625,
      "completions/mean_terminated_length": 620.8526000976562,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9767005206110779,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 24.960496320120647,
      "learning_rate": 1.0156389060989289e-07,
      "loss": 0.0126,
      "num_tokens": 2699188601.0,
      "reward": 0.05078125,
      "reward_std": 0.06904878467321396,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 2861
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1998.0,
      "completions/mean_length": 1600.416015625,
      "completions/mean_terminated_length": 597.6012573242188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9770419049244687,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 11.719228620061923,
      "learning_rate": 1.0151955216712089e-07,
      "loss": 0.0039,
      "num_tokens": 2700089150.0,
      "reward": 0.046875,
      "reward_std": 0.08698301762342453,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 2862
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.62109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1530.20703125,
      "completions/mean_terminated_length": 681.45361328125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9773832892378596,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 102.18074082610285,
      "learning_rate": 1.0147585026031632e-07,
      "loss": -0.001,
      "num_tokens": 2700950072.0,
      "reward": 0.03125,
      "reward_std": 0.05641176179051399,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 2863
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.708984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1950.0,
      "completions/mean_length": 1631.70703125,
      "completions/mean_terminated_length": 617.5167846679688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9777246735512504,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 111.70927043383357,
      "learning_rate": 1.0143278495150619e-07,
      "loss": 0.0016,
      "num_tokens": 2701863762.0,
      "reward": 0.091796875,
      "reward_std": 0.1298992931842804,
      "rewards/accuracy_reward/mean": 0.091796875,
      "rewards/accuracy_reward/std": 0.289021372795105,
      "step": 2864
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.751953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1671.275390625,
      "completions/mean_terminated_length": 529.2362060546875,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.9780660578646411,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 59.11837716668084,
      "learning_rate": 1.0139035630181373e-07,
      "loss": 0.0051,
      "num_tokens": 2702789039.0,
      "reward": 0.078125,
      "reward_std": 0.11575747281312943,
      "rewards/accuracy_reward/mean": 0.078125,
      "rewards/accuracy_reward/std": 0.26863065361976624,
      "step": 2865
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.736328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2012.0,
      "completions/mean_length": 1693.677734375,
      "completions/mean_terminated_length": 704.5481567382812,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9784074421780319,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 25.07626925278958,
      "learning_rate": 1.0134856437145871e-07,
      "loss": 0.0077,
      "num_tokens": 2703743274.0,
      "reward": 0.076171875,
      "reward_std": 0.13808965682983398,
      "rewards/accuracy_reward/mean": 0.0786290317773819,
      "rewards/accuracy_reward/std": 0.26943066716194153,
      "step": 2866
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.759765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1972.0,
      "completions/mean_length": 1721.595703125,
      "completions/mean_terminated_length": 689.3088989257812,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9787488264914227,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 34.63252352992463,
      "learning_rate": 1.0130740921975706e-07,
      "loss": 0.0136,
      "num_tokens": 2704700139.0,
      "reward": 0.048828125,
      "reward_std": 0.07686128467321396,
      "rewards/accuracy_reward/mean": 0.048828125,
      "rewards/accuracy_reward/std": 0.2157193273305893,
      "step": 2867
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.716796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1939.0,
      "completions/mean_length": 1649.58984375,
      "completions/mean_terminated_length": 641.6206665039062,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.9790902108048135,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 41.104247776451146,
      "learning_rate": 1.0126689090512102e-07,
      "loss": 0.0085,
      "num_tokens": 2705626697.0,
      "reward": 0.068359375,
      "reward_std": 0.12576615810394287,
      "rewards/accuracy_reward/mean": 0.07056451588869095,
      "rewards/accuracy_reward/std": 0.25635457038879395,
      "step": 2868
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.611328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1895.0,
      "completions/mean_length": 1474.27734375,
      "completions/mean_terminated_length": 571.8894653320312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9794315951182043,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 122.05249741625943,
      "learning_rate": 1.0122700948505894e-07,
      "loss": 0.0627,
      "num_tokens": 2706455031.0,
      "reward": 0.080078125,
      "reward_std": 0.1427883505821228,
      "rewards/accuracy_reward/mean": 0.080078125,
      "rewards/accuracy_reward/std": 0.271679550409317,
      "step": 2869
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.73828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2015.0,
      "completions/mean_length": 1710.9375,
      "completions/mean_terminated_length": 760.119384765625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9797729794315951,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 21.736023448111542,
      "learning_rate": 1.0118776501617519e-07,
      "loss": 0.0045,
      "num_tokens": 2707418263.0,
      "reward": 0.0546875,
      "reward_std": 0.07355422526597977,
      "rewards/accuracy_reward/mean": 0.05645161122083664,
      "rewards/accuracy_reward/std": 0.23102474212646484,
      "step": 2870
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.712890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1793.0,
      "completions/mean_length": 1640.564453125,
      "completions/mean_terminated_length": 628.9047241210938,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.980114363744986,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 10.859671086388024,
      "learning_rate": 1.0114915755417014e-07,
      "loss": 0.0222,
      "num_tokens": 2708337496.0,
      "reward": 0.025390625,
      "reward_std": 0.06166848540306091,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 2871
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.701171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1695.0,
      "completions/mean_length": 1609.236328125,
      "completions/mean_terminated_length": 579.7189331054688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9804557480583768,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 13.69726999787243,
      "learning_rate": 1.0111118715383995e-07,
      "loss": 0.0017,
      "num_tokens": 2709238001.0,
      "reward": 0.072265625,
      "reward_std": 0.10556350648403168,
      "rewards/accuracy_reward/mean": 0.072265625,
      "rewards/accuracy_reward/std": 0.2591804563999176,
      "step": 2872
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.626953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1975.0,
      "completions/mean_length": 1507.169921875,
      "completions/mean_terminated_length": 598.235595703125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.9807971323717675,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 53.87182218245254,
      "learning_rate": 1.0107385386907679e-07,
      "loss": 0.0013,
      "num_tokens": 2710084472.0,
      "reward": 0.076171875,
      "reward_std": 0.08588561415672302,
      "rewards/accuracy_reward/mean": 0.076171875,
      "rewards/accuracy_reward/std": 0.26553234457969666,
      "step": 2873
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.689453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1992.0,
      "completions/mean_length": 1614.236328125,
      "completions/mean_terminated_length": 651.2263793945312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9811385166851583,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 27.147053981870883,
      "learning_rate": 1.0103715775286826e-07,
      "loss": 0.0141,
      "num_tokens": 2710985505.0,
      "reward": 0.0859375,
      "reward_std": 0.10107050091028214,
      "rewards/accuracy_reward/mean": 0.0859375,
      "rewards/accuracy_reward/std": 0.28054583072662354,
      "step": 2874
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.748046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1962.0,
      "completions/mean_length": 1667.822265625,
      "completions/mean_terminated_length": 539.43408203125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9814799009985491,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 37.13452790883539,
      "learning_rate": 1.010010988572979e-07,
      "loss": 0.021,
      "num_tokens": 2711925462.0,
      "reward": 0.076171875,
      "reward_std": 0.12066785991191864,
      "rewards/accuracy_reward/mean": 0.0786290317773819,
      "rewards/accuracy_reward/std": 0.26943066716194153,
      "step": 2875
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.681640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1597.244140625,
      "completions/mean_terminated_length": 632.1288452148438,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9818212853119399,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 4.168044770021395,
      "learning_rate": 1.0096567723354473e-07,
      "loss": 0.0146,
      "num_tokens": 2712817251.0,
      "reward": 0.01171875,
      "reward_std": 0.022772299125790596,
      "rewards/accuracy_reward/mean": 0.01171875,
      "rewards/accuracy_reward/std": 0.10772226005792618,
      "step": 2876
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.740234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1984.0,
      "completions/mean_length": 1661.76953125,
      "completions/mean_terminated_length": 561.1578979492188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9821626696253307,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 13.704344436381552,
      "learning_rate": 1.0093089293188319e-07,
      "loss": 0.0038,
      "num_tokens": 2713744157.0,
      "reward": 0.046875,
      "reward_std": 0.07662828266620636,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 2877
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1836.0,
      "completions/mean_length": 1580.322265625,
      "completions/mean_terminated_length": 587.9329223632812,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9825040539387215,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 109.37150619209498,
      "learning_rate": 1.0089674600168329e-07,
      "loss": 0.032,
      "num_tokens": 2714629538.0,
      "reward": 0.041015625,
      "reward_std": 0.09511469304561615,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 2878
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2024.0,
      "completions/mean_length": 1584.43359375,
      "completions/mean_terminated_length": 668.0814208984375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9828454382521123,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 20.678783486882647,
      "learning_rate": 1.0086323649141032e-07,
      "loss": 0.0005,
      "num_tokens": 2715518400.0,
      "reward": 0.0703125,
      "reward_std": 0.11097268760204315,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 2879
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.689453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1868.0,
      "completions/mean_length": 1605.224609375,
      "completions/mean_terminated_length": 622.20751953125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.9831868225655032,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 14.445390615902841,
      "learning_rate": 1.0083036444862492e-07,
      "loss": 0.0205,
      "num_tokens": 2716408595.0,
      "reward": 0.052734375,
      "reward_std": 0.08791409432888031,
      "rewards/accuracy_reward/mean": 0.05443548411130905,
      "rewards/accuracy_reward/std": 0.227104052901268,
      "step": 2880
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2001.0,
      "completions/mean_length": 1701.2265625,
      "completions/mean_terminated_length": 568.433349609375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9835282068788939,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 24.624345397724333,
      "learning_rate": 1.0079812991998291e-07,
      "loss": 0.0282,
      "num_tokens": 2717361767.0,
      "reward": 0.0859375,
      "reward_std": 0.08792105317115784,
      "rewards/accuracy_reward/mean": 0.0859375,
      "rewards/accuracy_reward/std": 0.28054583072662354,
      "step": 2881
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.677734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2013.0,
      "completions/mean_length": 1624.09375,
      "completions/mean_terminated_length": 732.6060180664062,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.9838695911922847,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 26.341631261954728,
      "learning_rate": 1.0076653295123537e-07,
      "loss": -0.0015,
      "num_tokens": 2718267191.0,
      "reward": 0.0546875,
      "reward_std": 0.0893455445766449,
      "rewards/accuracy_reward/mean": 0.0546875,
      "rewards/accuracy_reward/std": 0.2275916188955307,
      "step": 2882
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.74609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1690.67578125,
      "completions/mean_terminated_length": 640.6923217773438,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9842109755056755,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 62.39520358592612,
      "learning_rate": 1.0073557358722834e-07,
      "loss": 0.0154,
      "num_tokens": 2719215185.0,
      "reward": 0.037109375,
      "reward_std": 0.08412160724401474,
      "rewards/accuracy_reward/mean": 0.038306452333927155,
      "rewards/accuracy_reward/std": 0.19212883710861206,
      "step": 2883
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.802734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1942.0,
      "completions/mean_length": 1779.228515625,
      "completions/mean_terminated_length": 685.5148315429688,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.9845523598190663,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 37.69118220102959,
      "learning_rate": 1.0070525187190301e-07,
      "loss": -0.0062,
      "num_tokens": 2720212166.0,
      "reward": 0.02734375,
      "reward_std": 0.05311024561524391,
      "rewards/accuracy_reward/mean": 0.02734375,
      "rewards/accuracy_reward/std": 0.16324250400066376,
      "step": 2884
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2018.0,
      "completions/mean_length": 1639.041015625,
      "completions/mean_terminated_length": 593.9236450195312,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9848937441324571,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 41.51337757613398,
      "learning_rate": 1.0067556784829557e-07,
      "loss": -0.009,
      "num_tokens": 2721134459.0,
      "reward": 0.05859375,
      "reward_std": 0.08450747281312943,
      "rewards/accuracy_reward/mean": 0.05859375,
      "rewards/accuracy_reward/std": 0.23509246110916138,
      "step": 2885
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.685546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1600.986328125,
      "completions/mean_terminated_length": 626.4409790039062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9852351284458479,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 23.35618071605464,
      "learning_rate": 1.0064652155853695e-07,
      "loss": 0.0222,
      "num_tokens": 2722034676.0,
      "reward": 0.0703125,
      "reward_std": 0.094524085521698,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 2886
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.654296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1576.390625,
      "completions/mean_terminated_length": 683.796630859375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9855765127592387,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 106.26054271929235,
      "learning_rate": 1.0061811304385314e-07,
      "loss": 0.0058,
      "num_tokens": 2722916876.0,
      "reward": 0.03515625,
      "reward_std": 0.05385598540306091,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 2887
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.68359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1917.0,
      "completions/mean_length": 1580.5,
      "completions/mean_terminated_length": 570.4691162109375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9859178970726296,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 55.7578031730014,
      "learning_rate": 1.0059034234456476e-07,
      "loss": 0.0212,
      "num_tokens": 2723803324.0,
      "reward": 0.07421875,
      "reward_std": 0.11943789571523666,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "step": 2888
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.720703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1948.0,
      "completions/mean_length": 1640.810546875,
      "completions/mean_terminated_length": 590.0908813476562,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9862592813860203,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 11.58924987969007,
      "learning_rate": 1.0056320950008728e-07,
      "loss": 0.0008,
      "num_tokens": 2724726091.0,
      "reward": 0.03125,
      "reward_std": 0.0737205371260643,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 2889
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.740234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1958.0,
      "completions/mean_length": 1662.82421875,
      "completions/mean_terminated_length": 565.2180786132812,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9866006656994111,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 16.871163938001057,
      "learning_rate": 1.0053671454893084e-07,
      "loss": 0.002,
      "num_tokens": 2725653249.0,
      "reward": 0.05078125,
      "reward_std": 0.07113292813301086,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 2890
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1564.34765625,
      "completions/mean_terminated_length": 608.2907104492188,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9869420500128019,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 19.01770263631309,
      "learning_rate": 1.0051085752870009e-07,
      "loss": -0.0184,
      "num_tokens": 2726532147.0,
      "reward": 0.07421875,
      "reward_std": 0.11180429905653,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "step": 2891
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.689453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2023.0,
      "completions/mean_length": 1610.35546875,
      "completions/mean_terminated_length": 638.7295532226562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9872834343261927,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 12.516958522735711,
      "learning_rate": 1.0048563847609443e-07,
      "loss": 0.0173,
      "num_tokens": 2727435081.0,
      "reward": 0.033203125,
      "reward_std": 0.0234375,
      "rewards/accuracy_reward/mean": 0.03427419438958168,
      "rewards/accuracy_reward/std": 0.18211629986763,
      "step": 2892
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2016.0,
      "completions/mean_length": 1569.640625,
      "completions/mean_terminated_length": 590.1428833007812,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9876248186395835,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 86.34302535217995,
      "learning_rate": 1.0046105742690761e-07,
      "loss": 0.0158,
      "num_tokens": 2728321905.0,
      "reward": 0.052734375,
      "reward_std": 0.10155140608549118,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 2893
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.783203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2008.0,
      "completions/mean_length": 1740.587890625,
      "completions/mean_terminated_length": 630.0270385742188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9879662029529743,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 141.25029384303645,
      "learning_rate": 1.0043711441602797e-07,
      "loss": 0.0061,
      "num_tokens": 2729288862.0,
      "reward": 0.033203125,
      "reward_std": 0.07493096590042114,
      "rewards/accuracy_reward/mean": 0.033203125,
      "rewards/accuracy_reward/std": 0.17934183776378632,
      "step": 2894
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.767578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1972.0,
      "completions/mean_length": 1730.060546875,
      "completions/mean_terminated_length": 680.058837890625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9883075872663651,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 20.967274329792332,
      "learning_rate": 1.0041380947743828e-07,
      "loss": 0.0124,
      "num_tokens": 2730253725.0,
      "reward": 0.09765625,
      "reward_std": 0.12328367680311203,
      "rewards/accuracy_reward/mean": 0.09765625,
      "rewards/accuracy_reward/std": 0.29713961482048035,
      "step": 2895
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.68359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1949.0,
      "completions/mean_length": 1595.552734375,
      "completions/mean_terminated_length": 618.043212890625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.988648971579756,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 53.658209639368096,
      "learning_rate": 1.0039114264421555e-07,
      "loss": 0.0023,
      "num_tokens": 2731140536.0,
      "reward": 0.05078125,
      "reward_std": 0.09962804615497589,
      "rewards/accuracy_reward/mean": 0.05078125,
      "rewards/accuracy_reward/std": 0.21976542472839355,
      "step": 2896
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.708984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1847.0,
      "completions/mean_length": 1639.779296875,
      "completions/mean_terminated_length": 645.2550659179688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9889903558931467,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 16.241572705096488,
      "learning_rate": 1.0036911394853133e-07,
      "loss": 0.0471,
      "num_tokens": 2732053495.0,
      "reward": 0.064453125,
      "reward_std": 0.1035923957824707,
      "rewards/accuracy_reward/mean": 0.064453125,
      "rewards/accuracy_reward/std": 0.24579854309558868,
      "step": 2897
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1909.0,
      "completions/mean_length": 1637.783203125,
      "completions/mean_terminated_length": 666.2171020507812,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9893317402065375,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 0.753228457685946,
      "learning_rate": 1.0034772342165116e-07,
      "loss": 0.0088,
      "num_tokens": 2732980232.0,
      "reward": 0.03125,
      "reward_std": 0.031083684414625168,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 2898
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.62109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1917.0,
      "completions/mean_length": 1517.029296875,
      "completions/mean_terminated_length": 646.6752319335938,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9896731245199283,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 85.30452265228864,
      "learning_rate": 1.003269710939351e-07,
      "loss": 0.0151,
      "num_tokens": 2733827559.0,
      "reward": 0.04296875,
      "reward_std": 0.07972928881645203,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 2899
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.693359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1858.0,
      "completions/mean_length": 1600.865234375,
      "completions/mean_terminated_length": 589.8280639648438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9900145088333191,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 38.20320033151139,
      "learning_rate": 1.0030685699483725e-07,
      "loss": 0.0014,
      "num_tokens": 2734721266.0,
      "reward": 0.060546875,
      "reward_std": 0.07438573241233826,
      "rewards/accuracy_reward/mean": 0.060546875,
      "rewards/accuracy_reward/std": 0.2387305200099945,
      "step": 2900
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.732421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2006.0,
      "completions/mean_length": 1678.359375,
      "completions/mean_terminated_length": 666.5693359375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9903558931467099,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 2.2930164611398363,
      "learning_rate": 1.0028738115290592e-07,
      "loss": -0.0012,
      "num_tokens": 2735661338.0,
      "reward": 0.015625,
      "reward_std": 0.04081955552101135,
      "rewards/accuracy_reward/mean": 0.015625,
      "rewards/accuracy_reward/std": 0.12414088100194931,
      "step": 2901
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1956.0,
      "completions/mean_length": 1598.896484375,
      "completions/mean_terminated_length": 610.8687744140625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.9906972774601007,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 5.315972544352134,
      "learning_rate": 1.0026854359578345e-07,
      "loss": 0.0074,
      "num_tokens": 2736558021.0,
      "reward": 0.017578125,
      "reward_std": 0.04957009106874466,
      "rewards/accuracy_reward/mean": 0.017578125,
      "rewards/accuracy_reward/std": 0.13154059648513794,
      "step": 2902
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1953.0,
      "completions/mean_length": 1669.76953125,
      "completions/mean_terminated_length": 624.4705810546875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9910386617734915,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 2.951849838514443,
      "learning_rate": 1.0025034435020633e-07,
      "loss": 0.0141,
      "num_tokens": 2737497231.0,
      "reward": 0.01953125,
      "reward_std": 0.04907120764255524,
      "rewards/accuracy_reward/mean": 0.01953125,
      "rewards/accuracy_reward/std": 0.1385180652141571,
      "step": 2903
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.685546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1834.0,
      "completions/mean_length": 1567.134765625,
      "completions/mean_terminated_length": 518.788818359375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.9913800460868823,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 47.18804606024116,
      "learning_rate": 1.0023278344200509e-07,
      "loss": 0.0076,
      "num_tokens": 2738378884.0,
      "reward": 0.056640625,
      "reward_std": 0.0870293527841568,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 2904
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.677734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1998.0,
      "completions/mean_length": 1598.876953125,
      "completions/mean_terminated_length": 654.3575439453125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9917214304002732,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 38.53162246213055,
      "learning_rate": 1.0021586089610422e-07,
      "loss": 0.0064,
      "num_tokens": 2739266485.0,
      "reward": 0.021484375,
      "reward_std": 0.05688370764255524,
      "rewards/accuracy_reward/mean": 0.021484375,
      "rewards/accuracy_reward/std": 0.14513419568538666,
      "step": 2905
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.685546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1748.0,
      "completions/mean_length": 1598.623046875,
      "completions/mean_terminated_length": 618.9254760742188,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9920628147136639,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 10.001976236915034,
      "learning_rate": 1.0019957673652214e-07,
      "loss": 0.03,
      "num_tokens": 2740164692.0,
      "reward": 0.126953125,
      "reward_std": 0.15081490576267242,
      "rewards/accuracy_reward/mean": 0.126953125,
      "rewards/accuracy_reward/std": 0.33324605226516724,
      "step": 2906
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.720703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1956.0,
      "completions/mean_length": 1660.46484375,
      "completions/mean_terminated_length": 660.4615478515625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9924041990270547,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 20.51033762081488,
      "learning_rate": 1.0018393098637125e-07,
      "loss": 0.0185,
      "num_tokens": 2741093090.0,
      "reward": 0.09765625,
      "reward_std": 0.13130201399326324,
      "rewards/accuracy_reward/mean": 0.09765625,
      "rewards/accuracy_reward/std": 0.29713961482048035,
      "step": 2907
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2000.0,
      "completions/mean_length": 1602.169921875,
      "completions/mean_terminated_length": 603.2848510742188,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9927455833404455,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 49.746729670366946,
      "learning_rate": 1.0016892366785786e-07,
      "loss": 0.0041,
      "num_tokens": 2741987561.0,
      "reward": 0.076171875,
      "reward_std": 0.12961195409297943,
      "rewards/accuracy_reward/mean": 0.076171875,
      "rewards/accuracy_reward/std": 0.26553234457969666,
      "step": 2908
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.720703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1666.03125,
      "completions/mean_terminated_length": 680.3916015625,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.9930869676538363,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 25.34545420345999,
      "learning_rate": 1.0015455480228208e-07,
      "loss": 0.006,
      "num_tokens": 2742921081.0,
      "reward": 0.052734375,
      "reward_std": 0.11993582546710968,
      "rewards/accuracy_reward/mean": 0.052734375,
      "rewards/accuracy_reward/std": 0.22372129559516907,
      "step": 2909
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.720703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1654.248046875,
      "completions/mean_terminated_length": 638.2028198242188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9934283519672271,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 124.40137879401742,
      "learning_rate": 1.0014082441003791e-07,
      "loss": 0.0184,
      "num_tokens": 2743854008.0,
      "reward": 0.0703125,
      "reward_std": 0.11641617119312286,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 2910
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.755859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1712.30859375,
      "completions/mean_terminated_length": 673.416015625,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.9937697362806179,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 136.7744002463834,
      "learning_rate": 1.001277325106131e-07,
      "loss": 0.001,
      "num_tokens": 2744813702.0,
      "reward": 0.064453125,
      "reward_std": 0.08825473487377167,
      "rewards/accuracy_reward/mean": 0.06653226166963577,
      "rewards/accuracy_reward/std": 0.2494617998600006,
      "step": 2911
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1841.0,
      "completions/mean_length": 1633.568359375,
      "completions/mean_terminated_length": 614.29052734375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9941111205940087,
      "frac_reward_zero_std": 0.90625,
      "grad_norm": 0.7808250238000148,
      "learning_rate": 1.0011527912258924e-07,
      "loss": 0.015,
      "num_tokens": 2745725305.0,
      "reward": 0.0234375,
      "reward_std": 0.03449726849794388,
      "rewards/accuracy_reward/mean": 0.0234375,
      "rewards/accuracy_reward/std": 0.15143637359142303,
      "step": 2912
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.73828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1916.0,
      "completions/mean_length": 1687.01171875,
      "completions/mean_terminated_length": 668.7014770507812,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9944525049073996,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 62.25508818880001,
      "learning_rate": 1.0010346426364161e-07,
      "loss": 0.0128,
      "num_tokens": 2746659631.0,
      "reward": 0.0625,
      "reward_std": 0.11795367300510406,
      "rewards/accuracy_reward/mean": 0.06451612710952759,
      "rewards/accuracy_reward/std": 0.2459181249141693,
      "step": 2913
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.62890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1511.3125,
      "completions/mean_terminated_length": 601.7684326171875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9947938892207903,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 34.70397896268805,
      "learning_rate": 1.0009228795053926e-07,
      "loss": 0.021,
      "num_tokens": 2747510831.0,
      "reward": 0.056640625,
      "reward_std": 0.08384227007627487,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 2914
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 1587.34765625,
      "completions/mean_terminated_length": 644.107177734375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9951352735341811,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 50.38357555931986,
      "learning_rate": 1.0008175019914494e-07,
      "loss": -0.0072,
      "num_tokens": 2748395553.0,
      "reward": 0.04296875,
      "reward_std": 0.07477159798145294,
      "rewards/accuracy_reward/mean": 0.04296875,
      "rewards/accuracy_reward/std": 0.2029850035905838,
      "step": 2915
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.705078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1871.0,
      "completions/mean_length": 1644.50390625,
      "completions/mean_terminated_length": 679.8543090820312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9954766578475719,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 46.49427255684848,
      "learning_rate": 1.0007185102441505e-07,
      "loss": 0.0045,
      "num_tokens": 2749323075.0,
      "reward": 0.037109375,
      "reward_std": 0.07894542813301086,
      "rewards/accuracy_reward/mean": 0.037109375,
      "rewards/accuracy_reward/std": 0.18921469151973724,
      "step": 2916
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1983.0,
      "completions/mean_length": 1605.68359375,
      "completions/mean_terminated_length": 632.5875244140625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9958180421609627,
      "frac_reward_zero_std": 0.8125,
      "grad_norm": 43.39794387579375,
      "learning_rate": 1.0006259044039964e-07,
      "loss": 0.0075,
      "num_tokens": 2750221601.0,
      "reward": 0.041015625,
      "reward_std": 0.0673912987112999,
      "rewards/accuracy_reward/mean": 0.041015625,
      "rewards/accuracy_reward/std": 0.19852031767368317,
      "step": 2917
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.697265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1967.0,
      "completions/mean_length": 1627.2109375,
      "completions/mean_terminated_length": 658.0386962890625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9961594264743535,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 33.95512716338215,
      "learning_rate": 1.000539684602424e-07,
      "loss": -0.005,
      "num_tokens": 2751126301.0,
      "reward": 0.025390625,
      "reward_std": 0.06755761802196503,
      "rewards/accuracy_reward/mean": 0.025390625,
      "rewards/accuracy_reward/std": 0.15746226906776428,
      "step": 2918
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1521.2890625,
      "completions/mean_terminated_length": 582.3695678710938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9965008107877443,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 19.961791525407527,
      "learning_rate": 1.0004598509618068e-07,
      "loss": 0.0082,
      "num_tokens": 2751981857.0,
      "reward": 0.107421875,
      "reward_std": 0.14617595076560974,
      "rewards/accuracy_reward/mean": 0.107421875,
      "rewards/accuracy_reward/std": 0.30995169281959534,
      "step": 2919
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.689453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1979.0,
      "completions/mean_length": 1600.60546875,
      "completions/mean_terminated_length": 607.654052734375,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9968421951011351,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 114.35284707076231,
      "learning_rate": 1.0003864035954539e-07,
      "loss": 0.0071,
      "num_tokens": 2752891511.0,
      "reward": 0.08203125,
      "reward_std": 0.11687077581882477,
      "rewards/accuracy_reward/mean": 0.08203125,
      "rewards/accuracy_reward/std": 0.2746807038784027,
      "step": 2920
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.69921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2013.0,
      "completions/mean_length": 1619.150390625,
      "completions/mean_terminated_length": 622.2142944335938,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.997183579414526,
      "frac_reward_zero_std": 0.65625,
      "grad_norm": 15.177119997846791,
      "learning_rate": 1.0003193426076107e-07,
      "loss": 0.0146,
      "num_tokens": 2753796020.0,
      "reward": 0.03515625,
      "reward_std": 0.09853477776050568,
      "rewards/accuracy_reward/mean": 0.03515625,
      "rewards/accuracy_reward/std": 0.1843547374010086,
      "step": 2921
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.666015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1937.0,
      "completions/mean_length": 1562.55859375,
      "completions/mean_terminated_length": 594.5146484375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9975249637279167,
      "frac_reward_zero_std": 0.71875,
      "grad_norm": 47.36340697254016,
      "learning_rate": 1.0002586680934577e-07,
      "loss": 0.0121,
      "num_tokens": 2754673874.0,
      "reward": 0.056640625,
      "reward_std": 0.1076337993144989,
      "rewards/accuracy_reward/mean": 0.056640625,
      "rewards/accuracy_reward/std": 0.23138070106506348,
      "step": 2922
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.63671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2028.0,
      "completions/mean_length": 1516.759765625,
      "completions/mean_terminated_length": 585.6613159179688,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9978663480413075,
      "frac_reward_zero_std": 0.84375,
      "grad_norm": 8.409006334902793,
      "learning_rate": 1.0002043801391112e-07,
      "loss": 0.0062,
      "num_tokens": 2755520679.0,
      "reward": 0.03125,
      "reward_std": 0.05641176179051399,
      "rewards/accuracy_reward/mean": 0.03125,
      "rewards/accuracy_reward/std": 0.17416280508041382,
      "step": 2923
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.716796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2031.0,
      "completions/mean_length": 1676.609375,
      "completions/mean_terminated_length": 736.6068725585938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9982077323546983,
      "frac_reward_zero_std": 0.6875,
      "grad_norm": 31.8641102083544,
      "learning_rate": 1.0001564788216237e-07,
      "loss": 0.0199,
      "num_tokens": 2756449871.0,
      "reward": 0.07421875,
      "reward_std": 0.12037497013807297,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "step": 2924
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.712890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1977.0,
      "completions/mean_length": 1663.767578125,
      "completions/mean_terminated_length": 709.7210693359375,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.9985491166680891,
      "frac_reward_zero_std": 0.5625,
      "grad_norm": 73.80738272060073,
      "learning_rate": 1.0001149642089817e-07,
      "loss": -0.0022,
      "num_tokens": 2757381752.0,
      "reward": 0.07421875,
      "reward_std": 0.1546139121055603,
      "rewards/accuracy_reward/mean": 0.07421875,
      "rewards/accuracy_reward/std": 0.2623828947544098,
      "step": 2925
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.705078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2004.0,
      "completions/mean_length": 1630.35546875,
      "completions/mean_terminated_length": 631.8807983398438,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9988905009814799,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 68.42868504750174,
      "learning_rate": 1.0000798363601074e-07,
      "loss": 0.0277,
      "num_tokens": 2758291102.0,
      "reward": 0.046875,
      "reward_std": 0.09039659798145294,
      "rewards/accuracy_reward/mean": 0.046875,
      "rewards/accuracy_reward/std": 0.21157780289649963,
      "step": 2926
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.728515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1882.0,
      "completions/mean_length": 1681.779296875,
      "completions/mean_terminated_length": 699.0431518554688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9992318852948707,
      "frac_reward_zero_std": 0.9375,
      "grad_norm": 8.45734651403176,
      "learning_rate": 1.0000510953248593e-07,
      "loss": -0.0037,
      "num_tokens": 2759226093.0,
      "reward": 0.009765625,
      "reward_std": 0.023271184414625168,
      "rewards/accuracy_reward/mean": 0.009765625,
      "rewards/accuracy_reward/std": 0.09843364357948303,
      "step": 2927
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.689453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1920.0,
      "completions/mean_length": 1603.537109375,
      "completions/mean_terminated_length": 616.7735595703125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9995732696082615,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 15.172487258378336,
      "learning_rate": 1.0000287411440292e-07,
      "loss": -0.0058,
      "num_tokens": 2760117328.0,
      "reward": 0.0546875,
      "reward_std": 0.07730703055858612,
      "rewards/accuracy_reward/mean": 0.05645161122083664,
      "rewards/accuracy_reward/std": 0.23102475702762604,
      "step": 2928
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.712890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1999.0,
      "completions/mean_length": 1657.44921875,
      "completions/mean_terminated_length": 687.7142944335938,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.9999146539216524,
      "frac_reward_zero_std": 0.78125,
      "grad_norm": 38.57795061124567,
      "learning_rate": 1.0000127738493448e-07,
      "loss": -0.0021,
      "num_tokens": 2761048070.0,
      "reward": 0.0703125,
      "reward_std": 0.08351518213748932,
      "rewards/accuracy_reward/mean": 0.0703125,
      "rewards/accuracy_reward/std": 0.25592297315597534,
      "step": 2929
    },
    {
      "epoch": 0.9999146539216524,
      "step": 2929,
      "total_flos": 0.0,
      "train_loss": 0.012068198455147701,
      "train_runtime": 103778.9718,
      "train_samples_per_second": 0.903,
      "train_steps_per_second": 0.028
    }
  ],
  "logging_steps": 1,
  "max_steps": 2930,
  "num_input_tokens_seen": 2761048070,
  "num_train_epochs": 1,
  "save_steps": 147,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}