{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.1347517730496455, "eval_steps": 20, "global_step": 320, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 2212.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2768.0, "completions/max_terminated_length": 2768.0, "completions/mean_length": 2212.75, "completions/mean_terminated_length": 2212.75, "completions/min_length": 1834.0, "completions/min_terminated_length": 1834.0, "epoch": 0.0035460992907801418, "frac_reward_zero_std": 0.0, "grad_norm": 0.5901697278022766, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0, "num_tokens": 18447.0, "reward": 6.6875, "reward_std": 1.5643821954727173, "rewards/reward_model_wrapper/mean": 6.6875, "rewards/reward_model_wrapper/std": 1.5643820762634277, "step": 1 }, { "completion_length": 3272.25, "completions/clipped_ratio": 0.0, "completions/max_length": 5006.0, "completions/max_terminated_length": 5006.0, "completions/mean_length": 3272.25, "completions/mean_terminated_length": 3272.25, "completions/min_length": 2160.0, "completions/min_terminated_length": 2160.0, "epoch": 0.0070921985815602835, "frac_reward_zero_std": 0.0, "grad_norm": 0.4964694082736969, "kl": 0.0, "learning_rate": 7.142857142857144e-08, "loss": 0.0, "num_tokens": 40956.0, "reward": 7.099946975708008, "reward_std": 2.016603708267212, "rewards/reward_model_wrapper/mean": 7.099946975708008, "rewards/reward_model_wrapper/std": 2.016603946685791, "step": 2 }, { "completion_length": 2607.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2859.0, "completions/max_terminated_length": 2859.0, "completions/mean_length": 2607.5, "completions/mean_terminated_length": 2607.5, "completions/min_length": 2366.0, "completions/min_terminated_length": 2366.0, "epoch": 0.010638297872340425, "frac_reward_zero_std": 0.0, "grad_norm": 0.1904979944229126, "kl": 0.0, "learning_rate": 1.4285714285714287e-07, "loss": 0.0, "num_tokens": 62130.0, "reward": 6.7999467849731445, "reward_std": 0.6481132507324219, "rewards/reward_model_wrapper/mean": 6.7999467849731445, "rewards/reward_model_wrapper/std": 0.6481131911277771, "step": 3 }, { "completion_length": 2666.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3441.0, "completions/max_terminated_length": 3441.0, "completions/mean_length": 2666.25, "completions/mean_terminated_length": 2666.25, "completions/min_length": 2266.0, "completions/min_terminated_length": 2266.0, "epoch": 0.014184397163120567, "frac_reward_zero_std": 0.0, "grad_norm": 0.3886282742023468, "kl": 0.0, "learning_rate": 2.142857142857143e-07, "loss": -0.0, "num_tokens": 82011.0, "reward": 7.349859237670898, "reward_std": 1.4479856491088867, "rewards/reward_model_wrapper/mean": 7.349859237670898, "rewards/reward_model_wrapper/std": 1.4479858875274658, "step": 4 }, { "completion_length": 2455.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2780.0, "completions/max_terminated_length": 2780.0, "completions/mean_length": 2455.5, "completions/mean_terminated_length": 2455.5, "completions/min_length": 2196.0, "completions/min_terminated_length": 2196.0, "epoch": 0.01773049645390071, "frac_reward_zero_std": 0.0, "grad_norm": 0.9358339905738831, "kl": 0.0, "learning_rate": 2.8571428571428575e-07, "loss": -0.0, "num_tokens": 100549.0, "reward": 6.049900054931641, "reward_std": 2.6839942932128906, "rewards/reward_model_wrapper/mean": 6.049900054931641, "rewards/reward_model_wrapper/std": 2.6839942932128906, "step": 5 }, { "completion_length": 3103.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3413.0, "completions/max_terminated_length": 3413.0, "completions/mean_length": 3103.25, "completions/mean_terminated_length": 3103.25, "completions/min_length": 2747.0, "completions/min_terminated_length": 2747.0, "epoch": 0.02127659574468085, "frac_reward_zero_std": 0.0, "grad_norm": 0.4240210950374603, "kl": 0.0, "learning_rate": 3.5714285714285716e-07, "loss": -0.0, "num_tokens": 122334.0, "reward": 6.299783706665039, "reward_std": 1.383275032043457, "rewards/reward_model_wrapper/mean": 6.299783706665039, "rewards/reward_model_wrapper/std": 1.383275032043457, "step": 6 }, { "completion_length": 2357.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2785.0, "completions/max_terminated_length": 2785.0, "completions/mean_length": 2357.0, "completions/mean_terminated_length": 2357.0, "completions/min_length": 1892.0, "completions/min_terminated_length": 1892.0, "epoch": 0.024822695035460994, "frac_reward_zero_std": 0.0, "grad_norm": 0.49061980843544006, "kl": 0.0, "learning_rate": 4.285714285714286e-07, "loss": 0.0, "num_tokens": 141122.0, "reward": 4.7996826171875, "reward_std": 1.163220763206482, "rewards/reward_model_wrapper/mean": 4.7996826171875, "rewards/reward_model_wrapper/std": 1.163220763206482, "step": 7 }, { "completion_length": 477.5, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 477.5, "completions/mean_terminated_length": 477.5, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.028368794326241134, "frac_reward_zero_std": 0.0, "grad_norm": 2.7073280811309814, "kl": 0.0, "learning_rate": 5.000000000000001e-07, "loss": -0.0, "num_tokens": 152924.0, "reward": 5.774999618530273, "reward_std": 2.194500684738159, "rewards/reward_model_wrapper/mean": 5.774999618530273, "rewards/reward_model_wrapper/std": 2.194500684738159, "step": 8 }, { "completion_length": 458.25, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 458.25, "completions/mean_terminated_length": 458.25, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 0.031914893617021274, "frac_reward_zero_std": 0.0, "grad_norm": 0.5933535695075989, "kl": 0.0, "learning_rate": 5.714285714285715e-07, "loss": -0.0, "num_tokens": 165733.0, "reward": 8.5, "reward_std": 0.4760953187942505, "rewards/reward_model_wrapper/mean": 8.5, "rewards/reward_model_wrapper/std": 0.4760953187942505, "step": 9 }, { "completion_length": 412.75, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 412.75, "completions/mean_terminated_length": 412.75, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.03546099290780142, "frac_reward_zero_std": 0.0, "grad_norm": 0.4559244215488434, "kl": 0.0, "learning_rate": 6.428571428571428e-07, "loss": 0.0, "num_tokens": 176508.0, "reward": 8.175000190734863, "reward_std": 0.4031129479408264, "rewards/reward_model_wrapper/mean": 8.175000190734863, "rewards/reward_model_wrapper/std": 0.40311288833618164, "step": 10 }, { "completion_length": 514.5, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/max_terminated_length": 583.0, "completions/mean_length": 514.5, "completions/mean_terminated_length": 514.5, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 0.03900709219858156, "frac_reward_zero_std": 0.0, "grad_norm": 0.7823895812034607, "kl": 0.0, "learning_rate": 7.142857142857143e-07, "loss": 0.0, "num_tokens": 188374.0, "reward": 9.02500057220459, "reward_std": 0.8057088851928711, "rewards/reward_model_wrapper/mean": 9.02500057220459, "rewards/reward_model_wrapper/std": 0.8057088851928711, "step": 11 }, { "completion_length": 2648.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2912.0, "completions/max_terminated_length": 2912.0, "completions/mean_length": 2648.5, "completions/mean_terminated_length": 2648.5, "completions/min_length": 2467.0, "completions/min_terminated_length": 2467.0, "epoch": 0.0425531914893617, "frac_reward_zero_std": 0.0, "grad_norm": 0.5016942024230957, "kl": 0.0, "learning_rate": 7.857142857142857e-07, "loss": 0.0, "num_tokens": 208044.0, "reward": 4.42363977432251, "reward_std": 1.5363537073135376, "rewards/reward_model_wrapper/mean": 4.42363977432251, "rewards/reward_model_wrapper/std": 1.5363537073135376, "step": 12 }, { "completion_length": 2402.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2757.0, "completions/max_terminated_length": 2757.0, "completions/mean_length": 2402.5, "completions/mean_terminated_length": 2402.5, "completions/min_length": 2210.0, "completions/min_terminated_length": 2210.0, "epoch": 0.04609929078014184, "frac_reward_zero_std": 0.0, "grad_norm": 0.2453913539648056, "kl": 0.0, "learning_rate": 8.571428571428572e-07, "loss": -0.0, "num_tokens": 227142.0, "reward": 7.173472881317139, "reward_std": 0.8060213923454285, "rewards/reward_model_wrapper/mean": 7.173472881317139, "rewards/reward_model_wrapper/std": 0.8060213923454285, "step": 13 }, { "completion_length": 416.5, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 416.5, "completions/mean_terminated_length": 416.5, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 0.04964539007092199, "frac_reward_zero_std": 0.0, "grad_norm": 0.5306812524795532, "kl": 0.0, "learning_rate": 9.285714285714287e-07, "loss": -0.0, "num_tokens": 238588.0, "reward": 8.875, "reward_std": 0.6075908541679382, "rewards/reward_model_wrapper/mean": 8.875, "rewards/reward_model_wrapper/std": 0.6075908541679382, "step": 14 }, { "completion_length": 489.25, "completions/clipped_ratio": 0.0, "completions/max_length": 685.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 489.25, "completions/mean_terminated_length": 489.25, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 0.05319148936170213, "frac_reward_zero_std": 0.0, "grad_norm": 1.0143647193908691, "kl": 0.0, "learning_rate": 1.0000000000000002e-06, "loss": 0.0, "num_tokens": 250693.0, "reward": 8.850000381469727, "reward_std": 1.3000000715255737, "rewards/reward_model_wrapper/mean": 8.850000381469727, "rewards/reward_model_wrapper/std": 1.2999999523162842, "step": 15 }, { "completion_length": 2304.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2619.0, "completions/max_terminated_length": 2619.0, "completions/mean_length": 2304.0, "completions/mean_terminated_length": 2304.0, "completions/min_length": 1817.0, "completions/min_terminated_length": 1817.0, "epoch": 0.05673758865248227, "frac_reward_zero_std": 0.0, "grad_norm": 0.40231409668922424, "kl": 0.0, "learning_rate": 1.0714285714285714e-06, "loss": -0.0, "num_tokens": 269149.0, "reward": 7.748144149780273, "reward_std": 1.2592248916625977, "rewards/reward_model_wrapper/mean": 7.748144149780273, "rewards/reward_model_wrapper/std": 1.2592248916625977, "step": 16 }, { "completion_length": 655.25, "completions/clipped_ratio": 0.0, "completions/max_length": 947.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 655.25, "completions/mean_terminated_length": 655.25, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.06028368794326241, "frac_reward_zero_std": 0.0, "grad_norm": 1.69741952419281, "kl": 0.0, "learning_rate": 1.142857142857143e-06, "loss": -0.0, "num_tokens": 280338.0, "reward": 7.024999618530273, "reward_std": 1.5966109037399292, "rewards/reward_model_wrapper/mean": 7.024999618530273, "rewards/reward_model_wrapper/std": 1.5966109037399292, "step": 17 }, { "completion_length": 2930.75, "completions/clipped_ratio": 0.0, "completions/max_length": 4488.0, "completions/max_terminated_length": 4488.0, "completions/mean_length": 2930.75, "completions/mean_terminated_length": 2930.75, "completions/min_length": 2182.0, "completions/min_terminated_length": 2182.0, "epoch": 0.06382978723404255, "frac_reward_zero_std": 0.0, "grad_norm": 0.21302948892116547, "kl": 0.0, "learning_rate": 1.2142857142857144e-06, "loss": -0.0, "num_tokens": 300997.0, "reward": 6.0449395179748535, "reward_std": 0.7859781384468079, "rewards/reward_model_wrapper/mean": 6.0449395179748535, "rewards/reward_model_wrapper/std": 0.7859781384468079, "step": 18 }, { "completion_length": 2721.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2982.0, "completions/max_terminated_length": 2982.0, "completions/mean_length": 2721.25, "completions/mean_terminated_length": 2721.25, "completions/min_length": 2248.0, "completions/min_terminated_length": 2248.0, "epoch": 0.0673758865248227, "frac_reward_zero_std": 0.0, "grad_norm": 0.2560366988182068, "kl": 0.0, "learning_rate": 1.2857142857142856e-06, "loss": 0.0, "num_tokens": 320866.0, "reward": 4.647911071777344, "reward_std": 0.7257235646247864, "rewards/reward_model_wrapper/mean": 4.647911071777344, "rewards/reward_model_wrapper/std": 0.7257235646247864, "step": 19 }, { "completion_length": 2237.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2538.0, "completions/max_terminated_length": 2538.0, "completions/mean_length": 2237.5, "completions/mean_terminated_length": 2237.5, "completions/min_length": 1939.0, "completions/min_terminated_length": 1939.0, "epoch": 0.07092198581560284, "frac_reward_zero_std": 0.0, "grad_norm": 0.35093405842781067, "kl": 0.0, "learning_rate": 1.3571428571428572e-06, "loss": 0.0, "num_tokens": 338280.0, "reward": 7.896511077880859, "reward_std": 1.0942840576171875, "rewards/reward_model_wrapper/mean": 7.896511077880859, "rewards/reward_model_wrapper/std": 1.094283938407898, "step": 20 }, { "epoch": 0.07092198581560284, "eval_completion_length": 1752.25, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 2015.3333333333333, "eval_completions/max_terminated_length": 2015.3333333333333, "eval_completions/mean_length": 1752.25, "eval_completions/mean_terminated_length": 1752.25, "eval_completions/min_length": 1506.3333333333333, "eval_completions/min_terminated_length": 1506.3333333333333, "eval_frac_reward_zero_std": 0.0, "eval_kl": 0.0, "eval_loss": -1.7270595620288987e-08, "eval_num_tokens": 338280.0, "eval_reward": 6.9638644854227705, "eval_reward_std": 1.6500025788942974, "eval_rewards/reward_model_wrapper/mean": 6.9638644854227705, "eval_rewards/reward_model_wrapper/std": 1.6500025590260823, "eval_runtime": 280.0735, "eval_samples_per_second": 0.011, "eval_steps_per_second": 0.004, "step": 20 }, { "completion_length": 563.25, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 563.25, "completions/mean_terminated_length": 563.25, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "epoch": 0.07446808510638298, "frac_reward_zero_std": 0.0, "grad_norm": 3.217763900756836, "kl": 0.0, "learning_rate": 1.4285714285714286e-06, "loss": -0.0, "num_tokens": 351209.0, "reward": 6.025000095367432, "reward_std": 3.3688526153564453, "rewards/reward_model_wrapper/mean": 6.025000095367432, "rewards/reward_model_wrapper/std": 3.3688528537750244, "step": 21 }, { "completion_length": 2458.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2919.0, "completions/max_terminated_length": 2919.0, "completions/mean_length": 2458.5, "completions/mean_terminated_length": 2458.5, "completions/min_length": 1865.0, "completions/min_terminated_length": 1865.0, "epoch": 0.07801418439716312, "frac_reward_zero_std": 0.0, "grad_norm": 0.27910012006759644, "kl": 0.0, "learning_rate": 1.5e-06, "loss": -0.0, "num_tokens": 370979.0, "reward": 6.721708297729492, "reward_std": 0.8965873122215271, "rewards/reward_model_wrapper/mean": 6.721708297729492, "rewards/reward_model_wrapper/std": 0.8965873122215271, "step": 22 }, { "completion_length": 630.25, "completions/clipped_ratio": 0.0, "completions/max_length": 665.0, "completions/max_terminated_length": 665.0, "completions/mean_length": 630.25, "completions/mean_terminated_length": 630.25, "completions/min_length": 581.0, "completions/min_terminated_length": 581.0, "epoch": 0.08156028368794327, "frac_reward_zero_std": 0.0, "grad_norm": 2.1498959064483643, "kl": 0.0, "learning_rate": 1.5714285714285714e-06, "loss": 0.0, "num_tokens": 382448.0, "reward": 6.425000190734863, "reward_std": 2.1219096183776855, "rewards/reward_model_wrapper/mean": 6.425000190734863, "rewards/reward_model_wrapper/std": 2.1219096183776855, "step": 23 }, { "completion_length": 2216.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2345.0, "completions/max_terminated_length": 2345.0, "completions/mean_length": 2216.0, "completions/mean_terminated_length": 2216.0, "completions/min_length": 2060.0, "completions/min_terminated_length": 2060.0, "epoch": 0.0851063829787234, "frac_reward_zero_std": 0.0, "grad_norm": 0.5428902506828308, "kl": 0.0, "learning_rate": 1.642857142857143e-06, "loss": 0.0, "num_tokens": 400164.0, "reward": 6.471179962158203, "reward_std": 1.9223214387893677, "rewards/reward_model_wrapper/mean": 6.471179962158203, "rewards/reward_model_wrapper/std": 1.9223215579986572, "step": 24 }, { "completion_length": 439.25, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 439.25, "completions/mean_terminated_length": 439.25, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 0.08865248226950355, "frac_reward_zero_std": 0.0, "grad_norm": 0.6949577927589417, "kl": 0.0, "learning_rate": 1.7142857142857145e-06, "loss": -0.0, "num_tokens": 410805.0, "reward": 8.524999618530273, "reward_std": 0.6849574446678162, "rewards/reward_model_wrapper/mean": 8.524999618530273, "rewards/reward_model_wrapper/std": 0.6849573254585266, "step": 25 }, { "completion_length": 1639.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1952.0, "completions/max_terminated_length": 1952.0, "completions/mean_length": 1639.25, "completions/mean_terminated_length": 1639.25, "completions/min_length": 1435.0, "completions/min_terminated_length": 1435.0, "epoch": 0.09219858156028368, "frac_reward_zero_std": 0.0, "grad_norm": 0.64581298828125, "kl": 0.0, "learning_rate": 1.7857142857142859e-06, "loss": -0.0, "num_tokens": 427102.0, "reward": 8.746435165405273, "reward_std": 1.4508137702941895, "rewards/reward_model_wrapper/mean": 8.746435165405273, "rewards/reward_model_wrapper/std": 1.450813889503479, "step": 26 }, { "completion_length": 2275.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2551.0, "completions/max_terminated_length": 2551.0, "completions/mean_length": 2275.0, "completions/mean_terminated_length": 2275.0, "completions/min_length": 2060.0, "completions/min_terminated_length": 2060.0, "epoch": 0.09574468085106383, "frac_reward_zero_std": 0.0, "grad_norm": 0.2987869381904602, "kl": 0.0, "learning_rate": 1.8571428571428573e-06, "loss": 0.0, "num_tokens": 445226.0, "reward": 7.54554557800293, "reward_std": 0.9154556393623352, "rewards/reward_model_wrapper/mean": 7.54554557800293, "rewards/reward_model_wrapper/std": 0.9154555797576904, "step": 27 }, { "completion_length": 1989.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2152.0, "completions/max_terminated_length": 2152.0, "completions/mean_length": 1989.5, "completions/mean_terminated_length": 1989.5, "completions/min_length": 1862.0, "completions/min_terminated_length": 1862.0, "epoch": 0.09929078014184398, "frac_reward_zero_std": 0.0, "grad_norm": 0.24192310869693756, "kl": 0.0, "learning_rate": 1.928571428571429e-06, "loss": -0.0, "num_tokens": 462820.0, "reward": 6.125, "reward_std": 0.7135590314865112, "rewards/reward_model_wrapper/mean": 6.125, "rewards/reward_model_wrapper/std": 0.713559091091156, "step": 28 }, { "completion_length": 704.0, "completions/clipped_ratio": 0.0, "completions/max_length": 987.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 704.0, "completions/mean_terminated_length": 704.0, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "epoch": 0.10283687943262411, "frac_reward_zero_std": 0.0, "grad_norm": 1.2528235912322998, "kl": 0.0, "learning_rate": 2.0000000000000003e-06, "loss": 0.0, "num_tokens": 476320.0, "reward": 8.125, "reward_std": 1.4103782176971436, "rewards/reward_model_wrapper/mean": 8.125, "rewards/reward_model_wrapper/std": 1.4103782176971436, "step": 29 }, { "completion_length": 779.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1438.0, "completions/max_terminated_length": 1438.0, "completions/mean_length": 779.5, "completions/mean_terminated_length": 779.5, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 0.10638297872340426, "frac_reward_zero_std": 0.0, "grad_norm": 2.181734085083008, "kl": 0.0, "learning_rate": 2.0714285714285717e-06, "loss": -0.0, "num_tokens": 489798.0, "reward": 6.347088813781738, "reward_std": 2.401723623275757, "rewards/reward_model_wrapper/mean": 6.347088813781738, "rewards/reward_model_wrapper/std": 2.401723623275757, "step": 30 }, { "completion_length": 2530.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2843.0, "completions/max_terminated_length": 2843.0, "completions/mean_length": 2530.75, "completions/mean_terminated_length": 2530.75, "completions/min_length": 2278.0, "completions/min_terminated_length": 2278.0, "epoch": 0.1099290780141844, "frac_reward_zero_std": 0.0, "grad_norm": 0.48727503418922424, "kl": 0.0, "learning_rate": 2.1428571428571427e-06, "loss": 0.0, "num_tokens": 510221.0, "reward": 6.519838809967041, "reward_std": 1.4019051790237427, "rewards/reward_model_wrapper/mean": 6.519838809967041, "rewards/reward_model_wrapper/std": 1.4019051790237427, "step": 31 }, { "completion_length": 2456.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3087.0, "completions/max_terminated_length": 3087.0, "completions/mean_length": 2456.25, "completions/mean_terminated_length": 2456.25, "completions/min_length": 1943.0, "completions/min_terminated_length": 1943.0, "epoch": 0.11347517730496454, "frac_reward_zero_std": 0.0, "grad_norm": 0.35889488458633423, "kl": 0.0, "learning_rate": 2.2142857142857146e-06, "loss": -0.0, "num_tokens": 529214.0, "reward": 7.093780040740967, "reward_std": 0.982136607170105, "rewards/reward_model_wrapper/mean": 7.093780040740967, "rewards/reward_model_wrapper/std": 0.9821364879608154, "step": 32 }, { "completion_length": 2132.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2611.0, "completions/max_terminated_length": 2611.0, "completions/mean_length": 2132.5, "completions/mean_terminated_length": 2132.5, "completions/min_length": 1824.0, "completions/min_terminated_length": 1824.0, "epoch": 0.11702127659574468, "frac_reward_zero_std": 0.0, "grad_norm": 0.5538814663887024, "kl": 0.0, "learning_rate": 2.285714285714286e-06, "loss": 0.0, "num_tokens": 547152.0, "reward": 7.193449974060059, "reward_std": 1.4048391580581665, "rewards/reward_model_wrapper/mean": 7.193449974060059, "rewards/reward_model_wrapper/std": 1.4048391580581665, "step": 33 }, { "completion_length": 3178.25, "completions/clipped_ratio": 0.25, "completions/max_length": 5290.0, "completions/max_terminated_length": 2536.0, "completions/mean_length": 3178.25, "completions/mean_terminated_length": 2474.33349609375, "completions/min_length": 2413.0, "completions/min_terminated_length": 2413.0, "epoch": 0.12056737588652482, "frac_reward_zero_std": 0.0, "grad_norm": 0.7284203171730042, "kl": 0.0, "learning_rate": 2.3571428571428574e-06, "loss": -0.0, "num_tokens": 563611.0, "reward": 5.4159464836120605, "reward_std": 3.6559295654296875, "rewards/reward_model_wrapper/mean": 5.4159464836120605, "rewards/reward_model_wrapper/std": 3.6559293270111084, "step": 34 }, { "completion_length": 409.25, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 409.25, "completions/mean_terminated_length": 409.25, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 0.12411347517730496, "frac_reward_zero_std": 0.0, "grad_norm": 0.6938031315803528, "kl": 0.0, "learning_rate": 2.428571428571429e-06, "loss": 0.0, "num_tokens": 574200.0, "reward": 8.850000381469727, "reward_std": 0.7681146264076233, "rewards/reward_model_wrapper/mean": 8.850000381469727, "rewards/reward_model_wrapper/std": 0.7681146860122681, "step": 35 }, { "completion_length": 2259.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2590.0, "completions/max_terminated_length": 2590.0, "completions/mean_length": 2259.5, "completions/mean_terminated_length": 2259.5, "completions/min_length": 1781.0, "completions/min_terminated_length": 1781.0, "epoch": 0.1276595744680851, "frac_reward_zero_std": 0.0, "grad_norm": 0.2925398051738739, "kl": 0.0, "learning_rate": 2.5e-06, "loss": 0.0, "num_tokens": 593166.0, "reward": 7.712985515594482, "reward_std": 0.8043503761291504, "rewards/reward_model_wrapper/mean": 7.712985515594482, "rewards/reward_model_wrapper/std": 0.8043503761291504, "step": 36 }, { "completion_length": 683.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1066.0, "completions/max_terminated_length": 1066.0, "completions/mean_length": 683.75, "completions/mean_terminated_length": 683.75, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 0.13120567375886524, "frac_reward_zero_std": 0.0, "grad_norm": 3.620107412338257, "kl": 0.0, "learning_rate": 2.571428571428571e-06, "loss": -0.0, "num_tokens": 605141.0, "reward": 5.125, "reward_std": 3.6417715549468994, "rewards/reward_model_wrapper/mean": 5.125, "rewards/reward_model_wrapper/std": 3.6417715549468994, "step": 37 }, { "completion_length": 2444.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2858.0, "completions/max_terminated_length": 2858.0, "completions/mean_length": 2444.75, "completions/mean_terminated_length": 2444.75, "completions/min_length": 2257.0, "completions/min_terminated_length": 2257.0, "epoch": 0.1347517730496454, "frac_reward_zero_std": 0.0, "grad_norm": 0.27157241106033325, "kl": 0.0, "learning_rate": 2.642857142857143e-06, "loss": -0.0, "num_tokens": 623636.0, "reward": 6.25681734085083, "reward_std": 0.7134534120559692, "rewards/reward_model_wrapper/mean": 6.25681734085083, "rewards/reward_model_wrapper/std": 0.7134534120559692, "step": 38 }, { "completion_length": 460.5, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 460.5, "completions/mean_terminated_length": 460.5, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 0.13829787234042554, "frac_reward_zero_std": 0.0, "grad_norm": 0.38862529397010803, "kl": 0.0, "learning_rate": 2.7142857142857144e-06, "loss": -0.0, "num_tokens": 634978.0, "reward": 8.399999618530273, "reward_std": 0.37416601181030273, "rewards/reward_model_wrapper/mean": 8.399999618530273, "rewards/reward_model_wrapper/std": 0.37416577339172363, "step": 39 }, { "completion_length": 2484.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2688.0, "completions/max_terminated_length": 2688.0, "completions/mean_length": 2484.25, "completions/mean_terminated_length": 2484.25, "completions/min_length": 2160.0, "completions/min_terminated_length": 2160.0, "epoch": 0.14184397163120568, "frac_reward_zero_std": 0.0, "grad_norm": 0.513646125793457, "kl": 0.0, "learning_rate": 2.785714285714286e-06, "loss": 0.0, "num_tokens": 654295.0, "reward": 5.962157249450684, "reward_std": 1.4933234453201294, "rewards/reward_model_wrapper/mean": 5.962157249450684, "rewards/reward_model_wrapper/std": 1.4933234453201294, "step": 40 }, { "epoch": 0.14184397163120568, "eval_completion_length": 1798.6666666666667, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1999.0, "eval_completions/max_terminated_length": 1999.0, "eval_completions/mean_length": 1798.6666666666667, "eval_completions/mean_terminated_length": 1798.6666666666667, "eval_completions/min_length": 1504.0, "eval_completions/min_terminated_length": 1504.0, "eval_frac_reward_zero_std": 0.0, "eval_kl": 0.0, "eval_loss": -4.380959950367469e-08, "eval_num_tokens": 654295.0, "eval_reward": 7.893599033355713, "eval_reward_std": 1.1092036565144856, "eval_rewards/reward_model_wrapper/mean": 7.893599033355713, "eval_rewards/reward_model_wrapper/std": 1.1092037359873455, "eval_runtime": 285.006, "eval_samples_per_second": 0.011, "eval_steps_per_second": 0.004, "step": 40 }, { "completion_length": 412.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 412.0, "completions/mean_terminated_length": 412.0, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.1453900709219858, "frac_reward_zero_std": 0.0, "grad_norm": 0.2787930965423584, "kl": 0.0, "learning_rate": 2.8571428571428573e-06, "loss": 0.0, "num_tokens": 664303.0, "reward": 9.450000762939453, "reward_std": 0.3000001907348633, "rewards/reward_model_wrapper/mean": 9.450000762939453, "rewards/reward_model_wrapper/std": 0.3000001907348633, "step": 41 }, { "completion_length": 2280.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2598.0, "completions/max_terminated_length": 2598.0, "completions/mean_length": 2280.5, "completions/mean_terminated_length": 2280.5, "completions/min_length": 1967.0, "completions/min_terminated_length": 1967.0, "epoch": 0.14893617021276595, "frac_reward_zero_std": 0.0, "grad_norm": 0.3746965229511261, "kl": 0.0, "learning_rate": 2.928571428571429e-06, "loss": -0.0, "num_tokens": 681809.0, "reward": 6.840847969055176, "reward_std": 1.1893590688705444, "rewards/reward_model_wrapper/mean": 6.840847969055176, "rewards/reward_model_wrapper/std": 1.1893590688705444, "step": 42 }, { "completion_length": 789.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1387.0, "completions/max_terminated_length": 1387.0, "completions/mean_length": 789.25, "completions/mean_terminated_length": 789.25, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "epoch": 0.1524822695035461, "frac_reward_zero_std": 0.0, "grad_norm": 2.537296772003174, "kl": 0.0, "learning_rate": 3e-06, "loss": -0.0, "num_tokens": 695326.0, "reward": 5.275000095367432, "reward_std": 3.6298532485961914, "rewards/reward_model_wrapper/mean": 5.275000095367432, "rewards/reward_model_wrapper/std": 3.6298530101776123, "step": 43 }, { "completion_length": 2584.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2780.0, "completions/max_terminated_length": 2780.0, "completions/mean_length": 2584.0, "completions/mean_terminated_length": 2584.0, "completions/min_length": 2485.0, "completions/min_terminated_length": 2485.0, "epoch": 0.15602836879432624, "frac_reward_zero_std": 0.0, "grad_norm": 0.8075482249259949, "kl": 0.0, "learning_rate": 3.071428571428572e-06, "loss": -0.0, "num_tokens": 714006.0, "reward": 4.777698993682861, "reward_std": 2.413390636444092, "rewards/reward_model_wrapper/mean": 4.777698993682861, "rewards/reward_model_wrapper/std": 2.413390636444092, "step": 44 }, { "completion_length": 797.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1803.0, "completions/max_terminated_length": 1803.0, "completions/mean_length": 797.5, "completions/mean_terminated_length": 797.5, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.1595744680851064, "frac_reward_zero_std": 0.0, "grad_norm": 1.8642765283584595, "kl": 0.0, "learning_rate": 3.142857142857143e-06, "loss": -0.0, "num_tokens": 726444.0, "reward": 6.7690887451171875, "reward_std": 3.0840888023376465, "rewards/reward_model_wrapper/mean": 6.7690887451171875, "rewards/reward_model_wrapper/std": 3.0840888023376465, "step": 45 }, { "completion_length": 510.25, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 510.25, "completions/mean_terminated_length": 510.25, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 0.16312056737588654, "frac_reward_zero_std": 0.0, "grad_norm": 1.441992998123169, "kl": 0.0, "learning_rate": 3.2142857142857147e-06, "loss": 0.0, "num_tokens": 736765.0, "reward": 7.125, "reward_std": 1.623524785041809, "rewards/reward_model_wrapper/mean": 7.125, "rewards/reward_model_wrapper/std": 1.623524785041809, "step": 46 }, { "completion_length": 2423.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2779.0, "completions/max_terminated_length": 2779.0, "completions/mean_length": 2423.75, "completions/mean_terminated_length": 2423.75, "completions/min_length": 2146.0, "completions/min_terminated_length": 2146.0, "epoch": 0.16666666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.7147428393363953, "kl": 0.0, "learning_rate": 3.285714285714286e-06, "loss": -0.0, "num_tokens": 755500.0, "reward": 6.6348724365234375, "reward_std": 2.5032196044921875, "rewards/reward_model_wrapper/mean": 6.6348724365234375, "rewards/reward_model_wrapper/std": 2.5032196044921875, "step": 47 }, { "completion_length": 579.25, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/max_terminated_length": 629.0, "completions/mean_length": 579.25, "completions/mean_terminated_length": 579.25, "completions/min_length": 538.0, "completions/min_terminated_length": 538.0, "epoch": 0.1702127659574468, "frac_reward_zero_std": 0.0, "grad_norm": 1.028961420059204, "kl": 0.0, "learning_rate": 3.357142857142857e-06, "loss": 0.0, "num_tokens": 765953.0, "reward": 7.850000381469727, "reward_std": 1.3329169750213623, "rewards/reward_model_wrapper/mean": 7.850000381469727, "rewards/reward_model_wrapper/std": 1.3329167366027832, "step": 48 }, { "completion_length": 2689.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3158.0, "completions/max_terminated_length": 3158.0, "completions/mean_length": 2689.0, "completions/mean_terminated_length": 2689.0, "completions/min_length": 2207.0, "completions/min_terminated_length": 2207.0, "epoch": 0.17375886524822695, "frac_reward_zero_std": 0.0, "grad_norm": 0.8769707679748535, "kl": 0.0, "learning_rate": 3.428571428571429e-06, "loss": 0.0, "num_tokens": 787045.0, "reward": 6.579168319702148, "reward_std": 2.897588014602661, "rewards/reward_model_wrapper/mean": 6.579168319702148, "rewards/reward_model_wrapper/std": 2.8975882530212402, "step": 49 }, { "completion_length": 2257.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2638.0, "completions/max_terminated_length": 2638.0, "completions/mean_length": 2257.5, "completions/mean_terminated_length": 2257.5, "completions/min_length": 1955.0, "completions/min_terminated_length": 1955.0, "epoch": 0.1773049645390071, "frac_reward_zero_std": 0.0, "grad_norm": 0.5193084478378296, "kl": 0.0, "learning_rate": 3.5e-06, "loss": -0.0, "num_tokens": 805199.0, "reward": 7.3177080154418945, "reward_std": 1.4581682682037354, "rewards/reward_model_wrapper/mean": 7.3177080154418945, "rewards/reward_model_wrapper/std": 1.458168387413025, "step": 50 }, { "completion_length": 2750.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3094.0, "completions/max_terminated_length": 3094.0, "completions/mean_length": 2750.75, "completions/mean_terminated_length": 2750.75, "completions/min_length": 2509.0, "completions/min_terminated_length": 2509.0, "epoch": 0.18085106382978725, "frac_reward_zero_std": 0.0, "grad_norm": 0.18432997167110443, "kl": 0.0, "learning_rate": 3.5714285714285718e-06, "loss": -0.0, "num_tokens": 827250.0, "reward": 6.096419811248779, "reward_std": 0.6123758554458618, "rewards/reward_model_wrapper/mean": 6.096419811248779, "rewards/reward_model_wrapper/std": 0.6123759150505066, "step": 51 }, { "completion_length": 584.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 584.0, "completions/mean_terminated_length": 584.0, "completions/min_length": 517.0, "completions/min_terminated_length": 517.0, "epoch": 0.18439716312056736, "frac_reward_zero_std": 0.0, "grad_norm": 0.6591967940330505, "kl": 0.0, "learning_rate": 3.642857142857143e-06, "loss": -0.0, "num_tokens": 839642.0, "reward": 9.149999618530273, "reward_std": 0.793725311756134, "rewards/reward_model_wrapper/mean": 9.149999618530273, "rewards/reward_model_wrapper/std": 0.793725311756134, "step": 52 }, { "completion_length": 2219.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2677.0, "completions/max_terminated_length": 2677.0, "completions/mean_length": 2219.75, "completions/mean_terminated_length": 2219.75, "completions/min_length": 1932.0, "completions/min_terminated_length": 1932.0, "epoch": 0.1879432624113475, "frac_reward_zero_std": 0.0, "grad_norm": 0.3163740336894989, "kl": 0.0, "learning_rate": 3.7142857142857146e-06, "loss": -0.0, "num_tokens": 857349.0, "reward": 5.83310604095459, "reward_std": 0.9162445664405823, "rewards/reward_model_wrapper/mean": 5.83310604095459, "rewards/reward_model_wrapper/std": 0.9162445068359375, "step": 53 }, { "completion_length": 554.5, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 554.5, "completions/mean_terminated_length": 554.5, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.19148936170212766, "frac_reward_zero_std": 0.0, "grad_norm": 0.2564683258533478, "kl": 0.0, "learning_rate": 3.785714285714286e-06, "loss": 0.0, "num_tokens": 868103.0, "reward": 7.800000190734863, "reward_std": 0.2943919897079468, "rewards/reward_model_wrapper/mean": 7.800000190734863, "rewards/reward_model_wrapper/std": 0.2943919897079468, "step": 54 }, { "completion_length": 527.25, "completions/clipped_ratio": 0.0, "completions/max_length": 844.0, "completions/max_terminated_length": 844.0, "completions/mean_length": 527.25, "completions/mean_terminated_length": 527.25, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 0.1950354609929078, "frac_reward_zero_std": 0.0, "grad_norm": 1.3544580936431885, "kl": 0.0, "learning_rate": 3.857142857142858e-06, "loss": -0.0, "num_tokens": 879504.0, "reward": 8.699999809265137, "reward_std": 1.8018509149551392, "rewards/reward_model_wrapper/mean": 8.699999809265137, "rewards/reward_model_wrapper/std": 1.8018509149551392, "step": 55 }, { "completion_length": 2547.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2841.0, "completions/max_terminated_length": 2841.0, "completions/mean_length": 2547.0, "completions/mean_terminated_length": 2547.0, "completions/min_length": 2386.0, "completions/min_terminated_length": 2386.0, "epoch": 0.19858156028368795, "frac_reward_zero_std": 0.0, "grad_norm": 0.2155103087425232, "kl": 0.0, "learning_rate": 3.928571428571429e-06, "loss": 0.0, "num_tokens": 899352.0, "reward": 7.150000095367432, "reward_std": 0.6557435393333435, "rewards/reward_model_wrapper/mean": 7.150000095367432, "rewards/reward_model_wrapper/std": 0.6557436585426331, "step": 56 }, { "completion_length": 404.25, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 404.25, "completions/mean_terminated_length": 404.25, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 0.20212765957446807, "frac_reward_zero_std": 0.0, "grad_norm": 0.2633416950702667, "kl": 0.0, "learning_rate": 4.000000000000001e-06, "loss": 0.0, "num_tokens": 909577.0, "reward": 9.125, "reward_std": 0.22173549234867096, "rewards/reward_model_wrapper/mean": 9.125, "rewards/reward_model_wrapper/std": 0.22173553705215454, "step": 57 }, { "completion_length": 3444.75, "completions/clipped_ratio": 0.0, "completions/max_length": 4210.0, "completions/max_terminated_length": 4210.0, "completions/mean_length": 3444.75, "completions/mean_terminated_length": 3444.75, "completions/min_length": 2770.0, "completions/min_terminated_length": 2770.0, "epoch": 0.20567375886524822, "frac_reward_zero_std": 0.0, "grad_norm": 0.49284127354621887, "kl": 0.0, "learning_rate": 4.071428571428572e-06, "loss": 0.0, "num_tokens": 934052.0, "reward": 5.5309529304504395, "reward_std": 1.5419420003890991, "rewards/reward_model_wrapper/mean": 5.5309529304504395, "rewards/reward_model_wrapper/std": 1.5419420003890991, "step": 58 }, { "completion_length": 576.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 576.0, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 0.20921985815602837, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484453678131104, "kl": 0.0, "learning_rate": 4.1428571428571435e-06, "loss": -0.0, "num_tokens": 945920.0, "reward": 8.34999942779541, "reward_std": 1.6663329601287842, "rewards/reward_model_wrapper/mean": 8.34999942779541, "rewards/reward_model_wrapper/std": 1.6663331985473633, "step": 59 }, { "completion_length": 2335.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2595.0, "completions/max_terminated_length": 2595.0, "completions/mean_length": 2335.5, "completions/mean_terminated_length": 2335.5, "completions/min_length": 1921.0, "completions/min_terminated_length": 1921.0, "epoch": 0.2127659574468085, "frac_reward_zero_std": 0.0, "grad_norm": 0.1321672797203064, "kl": 0.0, "learning_rate": 4.2142857142857145e-06, "loss": 0.0, "num_tokens": 965374.0, "reward": 6.520479679107666, "reward_std": 0.4530620276927948, "rewards/reward_model_wrapper/mean": 6.520479679107666, "rewards/reward_model_wrapper/std": 0.4530620276927948, "step": 60 }, { "epoch": 0.2127659574468085, "eval_completion_length": 1793.3333333333333, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 2035.6666666666667, "eval_completions/max_terminated_length": 2035.6666666666667, "eval_completions/mean_length": 1793.3333333333333, "eval_completions/mean_terminated_length": 1793.3333333333333, "eval_completions/min_length": 1633.0, "eval_completions/min_terminated_length": 1633.0, "eval_frac_reward_zero_std": 0.0, "eval_kl": 0.0, "eval_loss": -7.477217955909055e-09, "eval_num_tokens": 965374.0, "eval_reward": 7.822843869527181, "eval_reward_std": 1.000960757335027, "eval_rewards/reward_model_wrapper/mean": 7.822843869527181, "eval_rewards/reward_model_wrapper/std": 1.000960757335027, "eval_runtime": 280.7169, "eval_samples_per_second": 0.011, "eval_steps_per_second": 0.004, "step": 60 }, { "completion_length": 438.25, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 438.25, "completions/mean_terminated_length": 438.25, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 0.21631205673758866, "frac_reward_zero_std": 0.0, "grad_norm": 2.3442351818084717, "kl": 0.0, "learning_rate": 4.2857142857142855e-06, "loss": 0.0, "num_tokens": 976607.0, "reward": 6.200000286102295, "reward_std": 2.0976176261901855, "rewards/reward_model_wrapper/mean": 6.200000286102295, "rewards/reward_model_wrapper/std": 2.0976178646087646, "step": 61 }, { "completion_length": 1946.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1975.0, "completions/max_terminated_length": 1975.0, "completions/mean_length": 1946.0, "completions/mean_terminated_length": 1946.0, "completions/min_length": 1924.0, "completions/min_terminated_length": 1924.0, "epoch": 0.2198581560283688, "frac_reward_zero_std": 0.0, "grad_norm": 0.3163416385650635, "kl": 0.0, "learning_rate": 4.357142857142857e-06, "loss": 0.0, "num_tokens": 993871.0, "reward": 7.9882049560546875, "reward_std": 0.8443927764892578, "rewards/reward_model_wrapper/mean": 7.9882049560546875, "rewards/reward_model_wrapper/std": 0.8443926572799683, "step": 62 }, { "completion_length": 453.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/max_terminated_length": 612.0, "completions/mean_length": 453.0, "completions/mean_terminated_length": 453.0, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 0.22340425531914893, "frac_reward_zero_std": 0.0, "grad_norm": 0.8453798890113831, "kl": 0.0, "learning_rate": 4.428571428571429e-06, "loss": -0.0, "num_tokens": 1004839.0, "reward": 8.824999809265137, "reward_std": 0.8845905065536499, "rewards/reward_model_wrapper/mean": 8.824999809265137, "rewards/reward_model_wrapper/std": 0.8845903873443604, "step": 63 }, { "completion_length": 2117.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2915.0, "completions/max_terminated_length": 2915.0, "completions/mean_length": 2117.0, "completions/mean_terminated_length": 2117.0, "completions/min_length": 1495.0, "completions/min_terminated_length": 1495.0, "epoch": 0.22695035460992907, "frac_reward_zero_std": 0.0, "grad_norm": 0.3863529562950134, "kl": 0.0, "learning_rate": 4.5e-06, "loss": 0.0, "num_tokens": 1021815.0, "reward": 7.234441757202148, "reward_std": 0.9137738943099976, "rewards/reward_model_wrapper/mean": 7.234441757202148, "rewards/reward_model_wrapper/std": 0.9137738347053528, "step": 64 }, { "completion_length": 454.25, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 454.25, "completions/mean_terminated_length": 454.25, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 0.23049645390070922, "frac_reward_zero_std": 0.0, "grad_norm": 0.6597458720207214, "kl": 0.0, "learning_rate": 4.571428571428572e-06, "loss": -0.0, "num_tokens": 1033140.0, "reward": 9.174999237060547, "reward_std": 0.6652067303657532, "rewards/reward_model_wrapper/mean": 9.174999237060547, "rewards/reward_model_wrapper/std": 0.6652067303657532, "step": 65 }, { "completion_length": 2188.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2373.0, "completions/max_terminated_length": 2373.0, "completions/mean_length": 2188.5, "completions/mean_terminated_length": 2188.5, "completions/min_length": 1955.0, "completions/min_terminated_length": 1955.0, "epoch": 0.23404255319148937, "frac_reward_zero_std": 0.0, "grad_norm": 0.25693178176879883, "kl": 0.0, "learning_rate": 4.642857142857144e-06, "loss": -0.0, "num_tokens": 1050378.0, "reward": 6.946656227111816, "reward_std": 0.6017813086509705, "rewards/reward_model_wrapper/mean": 6.946656227111816, "rewards/reward_model_wrapper/std": 0.6017813086509705, "step": 66 }, { "completion_length": 412.5, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 412.5, "completions/mean_terminated_length": 412.5, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.2375886524822695, "frac_reward_zero_std": 0.0, "grad_norm": 0.8362342715263367, "kl": 0.0, "learning_rate": 4.714285714285715e-06, "loss": 0.0, "num_tokens": 1061632.0, "reward": 8.975000381469727, "reward_std": 0.8500001430511475, "rewards/reward_model_wrapper/mean": 8.975000381469727, "rewards/reward_model_wrapper/std": 0.8500001430511475, "step": 67 }, { "completion_length": 2431.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2827.0, "completions/max_terminated_length": 2827.0, "completions/mean_length": 2431.5, "completions/mean_terminated_length": 2431.5, "completions/min_length": 2202.0, "completions/min_terminated_length": 2202.0, "epoch": 0.24113475177304963, "frac_reward_zero_std": 0.0, "grad_norm": 0.47485271096229553, "kl": 0.0, "learning_rate": 4.785714285714287e-06, "loss": -0.0, "num_tokens": 1081422.0, "reward": 7.010645866394043, "reward_std": 1.5614266395568848, "rewards/reward_model_wrapper/mean": 7.010645866394043, "rewards/reward_model_wrapper/std": 1.5614266395568848, "step": 68 }, { "completion_length": 2427.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2679.0, "completions/max_terminated_length": 2679.0, "completions/mean_length": 2427.0, "completions/mean_terminated_length": 2427.0, "completions/min_length": 1920.0, "completions/min_terminated_length": 1920.0, "epoch": 0.24468085106382978, "frac_reward_zero_std": 0.0, "grad_norm": 0.2435370534658432, "kl": 0.0, "learning_rate": 4.857142857142858e-06, "loss": 0.0, "num_tokens": 1101058.0, "reward": 7.045786380767822, "reward_std": 0.797202467918396, "rewards/reward_model_wrapper/mean": 7.045786380767822, "rewards/reward_model_wrapper/std": 0.7972025275230408, "step": 69 }, { "completion_length": 1891.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2134.0, "completions/max_terminated_length": 2134.0, "completions/mean_length": 1891.75, "completions/mean_terminated_length": 1891.75, "completions/min_length": 1650.0, "completions/min_terminated_length": 1650.0, "epoch": 0.24822695035460993, "frac_reward_zero_std": 0.0, "grad_norm": 0.2595655918121338, "kl": 0.0, "learning_rate": 4.928571428571429e-06, "loss": 0.0, "num_tokens": 1116861.0, "reward": 8.587764739990234, "reward_std": 0.8105887174606323, "rewards/reward_model_wrapper/mean": 8.587764739990234, "rewards/reward_model_wrapper/std": 0.8105888366699219, "step": 70 }, { "completion_length": 2440.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2792.0, "completions/max_terminated_length": 2792.0, "completions/mean_length": 2440.75, "completions/mean_terminated_length": 2440.75, "completions/min_length": 2125.0, "completions/min_terminated_length": 2125.0, "epoch": 0.25177304964539005, "frac_reward_zero_std": 0.0, "grad_norm": 0.19154395163059235, "kl": 0.0, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 1135580.0, "reward": 8.099620819091797, "reward_std": 0.617441713809967, "rewards/reward_model_wrapper/mean": 8.099620819091797, "rewards/reward_model_wrapper/std": 0.617441713809967, "step": 71 }, { "completion_length": 2438.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3382.0, "completions/max_terminated_length": 3382.0, "completions/mean_length": 2438.75, "completions/mean_terminated_length": 2438.75, "completions/min_length": 2061.0, "completions/min_terminated_length": 2061.0, "epoch": 0.2553191489361702, "frac_reward_zero_std": 0.0, "grad_norm": 0.16602367162704468, "kl": 0.0, "learning_rate": 4.992063492063493e-06, "loss": 0.0, "num_tokens": 1155179.0, "reward": 6.976248741149902, "reward_std": 0.4972566068172455, "rewards/reward_model_wrapper/mean": 6.976248741149902, "rewards/reward_model_wrapper/std": 0.4972566068172455, "step": 72 }, { "completion_length": 2367.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2664.0, "completions/max_terminated_length": 2664.0, "completions/mean_length": 2367.5, "completions/mean_terminated_length": 2367.5, "completions/min_length": 2091.0, "completions/min_terminated_length": 2091.0, "epoch": 0.25886524822695034, "frac_reward_zero_std": 0.0, "grad_norm": 0.39853912591934204, "kl": 0.0, "learning_rate": 4.9841269841269845e-06, "loss": 0.0, "num_tokens": 1174385.0, "reward": 7.407626152038574, "reward_std": 1.2193504571914673, "rewards/reward_model_wrapper/mean": 7.407626152038574, "rewards/reward_model_wrapper/std": 1.2193505764007568, "step": 73 }, { "completion_length": 2268.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2550.0, "completions/max_terminated_length": 2550.0, "completions/mean_length": 2268.0, "completions/mean_terminated_length": 2268.0, "completions/min_length": 2022.0, "completions/min_terminated_length": 2022.0, "epoch": 0.2624113475177305, "frac_reward_zero_std": 0.0, "grad_norm": 0.5691909193992615, "kl": 0.0, "learning_rate": 4.976190476190477e-06, "loss": -0.0, "num_tokens": 1192461.0, "reward": 6.209198951721191, "reward_std": 1.4776816368103027, "rewards/reward_model_wrapper/mean": 6.209198951721191, "rewards/reward_model_wrapper/std": 1.4776816368103027, "step": 74 }, { "completion_length": 420.75, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 420.75, "completions/mean_terminated_length": 420.75, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.26595744680851063, "frac_reward_zero_std": 0.0, "grad_norm": 2.7742209434509277, "kl": 0.0, "learning_rate": 4.968253968253969e-06, "loss": -0.0, "num_tokens": 1203096.0, "reward": 7.625, "reward_std": 2.3070545196533203, "rewards/reward_model_wrapper/mean": 7.625, "rewards/reward_model_wrapper/std": 2.3070545196533203, "step": 75 }, { "completion_length": 1959.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2381.0, "completions/max_terminated_length": 2381.0, "completions/mean_length": 1959.25, "completions/mean_terminated_length": 1959.25, "completions/min_length": 1482.0, "completions/min_terminated_length": 1482.0, "epoch": 0.2695035460992908, "frac_reward_zero_std": 0.0, "grad_norm": 0.4712032973766327, "kl": 0.0, "learning_rate": 4.960317460317461e-06, "loss": 0.0, "num_tokens": 1221129.0, "reward": 6.871856212615967, "reward_std": 1.0656650066375732, "rewards/reward_model_wrapper/mean": 6.871856212615967, "rewards/reward_model_wrapper/std": 1.0656651258468628, "step": 76 }, { "completion_length": 387.5, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 387.5, "completions/mean_terminated_length": 387.5, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 0.2730496453900709, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.952380952380953e-06, "loss": 0.0, "num_tokens": 1232287.0, "reward": 9.5, "reward_std": 0.0, "rewards/reward_model_wrapper/mean": 9.5, "rewards/reward_model_wrapper/std": 0.0, "step": 77 }, { "completion_length": 2221.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2549.0, "completions/max_terminated_length": 2549.0, "completions/mean_length": 2221.0, "completions/mean_terminated_length": 2221.0, "completions/min_length": 2036.0, "completions/min_terminated_length": 2036.0, "epoch": 0.2765957446808511, "frac_reward_zero_std": 0.0, "grad_norm": 0.23646359145641327, "kl": 0.0, "learning_rate": 4.944444444444445e-06, "loss": 0.0, "num_tokens": 1249839.0, "reward": 6.262145042419434, "reward_std": 0.6967249512672424, "rewards/reward_model_wrapper/mean": 6.262145042419434, "rewards/reward_model_wrapper/std": 0.6967249512672424, "step": 78 }, { "completion_length": 1968.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2361.0, "completions/max_terminated_length": 2361.0, "completions/mean_length": 1968.0, "completions/mean_terminated_length": 1968.0, "completions/min_length": 1651.0, "completions/min_terminated_length": 1651.0, "epoch": 0.2801418439716312, "frac_reward_zero_std": 0.0, "grad_norm": 0.15990205109119415, "kl": 0.0, "learning_rate": 4.936507936507937e-06, "loss": -0.0, "num_tokens": 1266711.0, "reward": 8.097787857055664, "reward_std": 0.4715704023838043, "rewards/reward_model_wrapper/mean": 8.097787857055664, "rewards/reward_model_wrapper/std": 0.47157034277915955, "step": 79 }, { "completion_length": 548.0, "completions/clipped_ratio": 0.0, "completions/max_length": 688.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 548.0, "completions/mean_terminated_length": 548.0, "completions/min_length": 492.0, "completions/min_terminated_length": 492.0, "epoch": 0.28368794326241137, "frac_reward_zero_std": 0.0, "grad_norm": 2.202300786972046, "kl": 0.0, "learning_rate": 4.928571428571429e-06, "loss": 0.0, "num_tokens": 1278227.0, "reward": 6.75, "reward_std": 2.111081838607788, "rewards/reward_model_wrapper/mean": 6.75, "rewards/reward_model_wrapper/std": 2.111081838607788, "step": 80 }, { "epoch": 0.28368794326241137, "eval_completion_length": 1814.4166666666667, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 2118.6666666666665, "eval_completions/max_terminated_length": 2118.6666666666665, "eval_completions/mean_length": 1814.4166666666667, "eval_completions/mean_terminated_length": 1814.4166666666667, "eval_completions/min_length": 1588.6666666666667, "eval_completions/min_terminated_length": 1588.6666666666667, "eval_frac_reward_zero_std": 0.0, "eval_kl": 0.0, "eval_loss": 1.3958366196220595e-07, "eval_num_tokens": 1278227.0, "eval_reward": 7.458119710286458, "eval_reward_std": 0.8745014270146688, "eval_rewards/reward_model_wrapper/mean": 7.458119710286458, "eval_rewards/reward_model_wrapper/std": 0.8745013078053793, "eval_runtime": 287.049, "eval_samples_per_second": 0.01, "eval_steps_per_second": 0.003, "step": 80 }, { "completion_length": 2290.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2695.0, "completions/max_terminated_length": 2695.0, "completions/mean_length": 2290.0, "completions/mean_terminated_length": 2290.0, "completions/min_length": 1958.0, "completions/min_terminated_length": 1958.0, "epoch": 0.2872340425531915, "frac_reward_zero_std": 0.0, "grad_norm": 0.564092755317688, "kl": 0.0, "learning_rate": 4.920634920634921e-06, "loss": -0.0, "num_tokens": 1297335.0, "reward": 6.31046199798584, "reward_std": 1.445178508758545, "rewards/reward_model_wrapper/mean": 6.31046199798584, "rewards/reward_model_wrapper/std": 1.445178508758545, "step": 81 }, { "completion_length": 2793.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3640.0, "completions/max_terminated_length": 3640.0, "completions/mean_length": 2793.0, "completions/mean_terminated_length": 2793.0, "completions/min_length": 2123.0, "completions/min_terminated_length": 2123.0, "epoch": 0.2907801418439716, "frac_reward_zero_std": 0.0, "grad_norm": 0.5407919883728027, "kl": 0.0, "learning_rate": 4.912698412698413e-06, "loss": -0.0, "num_tokens": 1318983.0, "reward": 6.5652995109558105, "reward_std": 1.7705683708190918, "rewards/reward_model_wrapper/mean": 6.5652995109558105, "rewards/reward_model_wrapper/std": 1.7705683708190918, "step": 82 }, { "completion_length": 2393.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2900.0, "completions/max_terminated_length": 2900.0, "completions/mean_length": 2393.25, "completions/mean_terminated_length": 2393.25, "completions/min_length": 1929.0, "completions/min_terminated_length": 1929.0, "epoch": 0.29432624113475175, "frac_reward_zero_std": 0.0, "grad_norm": 1.4402027130126953, "kl": 0.0, "learning_rate": 4.904761904761905e-06, "loss": -0.0, "num_tokens": 1338496.0, "reward": 4.660165786743164, "reward_std": 3.9108667373657227, "rewards/reward_model_wrapper/mean": 4.660165786743164, "rewards/reward_model_wrapper/std": 3.9108667373657227, "step": 83 }, { "completion_length": 438.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 438.0, "completions/mean_terminated_length": 438.0, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "epoch": 0.2978723404255319, "frac_reward_zero_std": 0.0, "grad_norm": 1.2837103605270386, "kl": 0.0, "learning_rate": 4.896825396825397e-06, "loss": 0.0, "num_tokens": 1349404.0, "reward": 8.950000762939453, "reward_std": 1.1090537309646606, "rewards/reward_model_wrapper/mean": 8.950000762939453, "rewards/reward_model_wrapper/std": 1.1090537309646606, "step": 84 }, { "completion_length": 2017.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2272.0, "completions/max_terminated_length": 2272.0, "completions/mean_length": 2017.25, "completions/mean_terminated_length": 2017.25, "completions/min_length": 1881.0, "completions/min_terminated_length": 1881.0, "epoch": 0.30141843971631205, "frac_reward_zero_std": 0.0, "grad_norm": 0.42920663952827454, "kl": 0.0, "learning_rate": 4.888888888888889e-06, "loss": 0.0, "num_tokens": 1367565.0, "reward": 6.955231666564941, "reward_std": 1.0950185060501099, "rewards/reward_model_wrapper/mean": 6.955231666564941, "rewards/reward_model_wrapper/std": 1.0950185060501099, "step": 85 }, { "completion_length": 2290.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2440.0, "completions/max_terminated_length": 2440.0, "completions/mean_length": 2290.25, "completions/mean_terminated_length": 2290.25, "completions/min_length": 2132.0, "completions/min_terminated_length": 2132.0, "epoch": 0.3049645390070922, "frac_reward_zero_std": 0.0, "grad_norm": 0.13954462110996246, "kl": 0.0, "learning_rate": 4.880952380952381e-06, "loss": 0.0, "num_tokens": 1386882.0, "reward": 8.255382537841797, "reward_std": 0.5295007824897766, "rewards/reward_model_wrapper/mean": 8.255382537841797, "rewards/reward_model_wrapper/std": 0.5295006036758423, "step": 86 }, { "completion_length": 2676.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2826.0, "completions/max_terminated_length": 2826.0, "completions/mean_length": 2676.25, "completions/mean_terminated_length": 2676.25, "completions/min_length": 2531.0, "completions/min_terminated_length": 2531.0, "epoch": 0.30851063829787234, "frac_reward_zero_std": 0.0, "grad_norm": 0.08985288441181183, "kl": 0.0, "learning_rate": 4.8730158730158735e-06, "loss": -0.0, "num_tokens": 1406523.0, "reward": 6.2893242835998535, "reward_std": 0.25009262561798096, "rewards/reward_model_wrapper/mean": 6.2893242835998535, "rewards/reward_model_wrapper/std": 0.25009262561798096, "step": 87 }, { "completion_length": 2355.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2835.0, "completions/max_terminated_length": 2835.0, "completions/mean_length": 2355.75, "completions/mean_terminated_length": 2355.75, "completions/min_length": 2156.0, "completions/min_terminated_length": 2156.0, "epoch": 0.3120567375886525, "frac_reward_zero_std": 0.0, "grad_norm": 0.3683772683143616, "kl": 0.0, "learning_rate": 4.865079365079365e-06, "loss": -0.0, "num_tokens": 1426554.0, "reward": 6.554407119750977, "reward_std": 1.027694821357727, "rewards/reward_model_wrapper/mean": 6.554407119750977, "rewards/reward_model_wrapper/std": 1.027694821357727, "step": 88 }, { "completion_length": 493.75, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 493.75, "completions/mean_terminated_length": 493.75, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 0.31560283687943264, "frac_reward_zero_std": 0.0, "grad_norm": 0.43611329793930054, "kl": 0.0, "learning_rate": 4.857142857142858e-06, "loss": -0.0, "num_tokens": 1439893.0, "reward": 9.399999618530273, "reward_std": 0.5033223032951355, "rewards/reward_model_wrapper/mean": 9.399999618530273, "rewards/reward_model_wrapper/std": 0.5033223032951355, "step": 89 }, { "completion_length": 1890.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2086.0, "completions/max_terminated_length": 2086.0, "completions/mean_length": 1890.5, "completions/mean_terminated_length": 1890.5, "completions/min_length": 1677.0, "completions/min_terminated_length": 1677.0, "epoch": 0.3191489361702128, "frac_reward_zero_std": 0.0, "grad_norm": 0.266103595495224, "kl": 0.0, "learning_rate": 4.849206349206349e-06, "loss": -0.0, "num_tokens": 1456155.0, "reward": 7.829479217529297, "reward_std": 0.7465320229530334, "rewards/reward_model_wrapper/mean": 7.829479217529297, "rewards/reward_model_wrapper/std": 0.7465320229530334, "step": 90 }, { "completion_length": 2328.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2518.0, "completions/max_terminated_length": 2518.0, "completions/mean_length": 2328.0, "completions/mean_terminated_length": 2328.0, "completions/min_length": 2088.0, "completions/min_terminated_length": 2088.0, "epoch": 0.32269503546099293, "frac_reward_zero_std": 0.0, "grad_norm": 0.32121556997299194, "kl": 0.0, "learning_rate": 4.841269841269842e-06, "loss": 0.0, "num_tokens": 1475079.0, "reward": 8.323665618896484, "reward_std": 0.9566733837127686, "rewards/reward_model_wrapper/mean": 8.323665618896484, "rewards/reward_model_wrapper/std": 0.9566734433174133, "step": 91 }, { "completion_length": 1950.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2142.0, "completions/max_terminated_length": 2142.0, "completions/mean_length": 1950.75, "completions/mean_terminated_length": 1950.75, "completions/min_length": 1667.0, "completions/min_terminated_length": 1667.0, "epoch": 0.3262411347517731, "frac_reward_zero_std": 0.0, "grad_norm": 0.20439590513706207, "kl": 0.0, "learning_rate": 4.833333333333333e-06, "loss": 0.0, "num_tokens": 1491554.0, "reward": 8.77527904510498, "reward_std": 0.6501860022544861, "rewards/reward_model_wrapper/mean": 8.77527904510498, "rewards/reward_model_wrapper/std": 0.6501861214637756, "step": 92 }, { "completion_length": 401.25, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 401.25, "completions/mean_terminated_length": 401.25, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 0.32978723404255317, "frac_reward_zero_std": 0.0, "grad_norm": 0.8270260095596313, "kl": 0.0, "learning_rate": 4.825396825396826e-06, "loss": 0.0, "num_tokens": 1502071.0, "reward": 8.475000381469727, "reward_std": 0.6849574446678162, "rewards/reward_model_wrapper/mean": 8.475000381469727, "rewards/reward_model_wrapper/std": 0.6849573254585266, "step": 93 }, { "completion_length": 2697.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3255.0, "completions/max_terminated_length": 3255.0, "completions/mean_length": 2697.25, "completions/mean_terminated_length": 2697.25, "completions/min_length": 2423.0, "completions/min_terminated_length": 2423.0, "epoch": 0.3333333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.3437507450580597, "kl": 0.0, "learning_rate": 4.817460317460318e-06, "loss": -0.0, "num_tokens": 1521412.0, "reward": 5.873003959655762, "reward_std": 1.1466795206069946, "rewards/reward_model_wrapper/mean": 5.873003959655762, "rewards/reward_model_wrapper/std": 1.1466796398162842, "step": 94 }, { "completion_length": 2076.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2385.0, "completions/max_terminated_length": 2385.0, "completions/mean_length": 2076.75, "completions/mean_terminated_length": 2076.75, "completions/min_length": 1834.0, "completions/min_terminated_length": 1834.0, "epoch": 0.33687943262411346, "frac_reward_zero_std": 0.0, "grad_norm": 0.2490464150905609, "kl": 0.0, "learning_rate": 4.80952380952381e-06, "loss": -0.0, "num_tokens": 1539275.0, "reward": 7.249784469604492, "reward_std": 0.7509192228317261, "rewards/reward_model_wrapper/mean": 7.249784469604492, "rewards/reward_model_wrapper/std": 0.7509191632270813, "step": 95 }, { "completion_length": 398.75, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 398.75, "completions/mean_terminated_length": 398.75, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.3404255319148936, "frac_reward_zero_std": 0.0, "grad_norm": 0.05594082549214363, "kl": 0.0, "learning_rate": 4.8015873015873025e-06, "loss": -0.0, "num_tokens": 1549870.0, "reward": 9.449999809265137, "reward_std": 0.057735245674848557, "rewards/reward_model_wrapper/mean": 9.449999809265137, "rewards/reward_model_wrapper/std": 0.057735245674848557, "step": 96 }, { "completion_length": 2215.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2383.0, "completions/max_terminated_length": 2383.0, "completions/mean_length": 2215.0, "completions/mean_terminated_length": 2215.0, "completions/min_length": 2117.0, "completions/min_terminated_length": 2117.0, "epoch": 0.34397163120567376, "frac_reward_zero_std": 0.0, "grad_norm": 0.2827056050300598, "kl": 0.0, "learning_rate": 4.793650793650794e-06, "loss": 0.0, "num_tokens": 1566942.0, "reward": 7.744659423828125, "reward_std": 0.7868044376373291, "rewards/reward_model_wrapper/mean": 7.744659423828125, "rewards/reward_model_wrapper/std": 0.7868044376373291, "step": 97 }, { "completion_length": 2244.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2463.0, "completions/max_terminated_length": 2463.0, "completions/mean_length": 2244.75, "completions/mean_terminated_length": 2244.75, "completions/min_length": 2069.0, "completions/min_terminated_length": 2069.0, "epoch": 0.3475177304964539, "frac_reward_zero_std": 0.0, "grad_norm": 0.24975745379924774, "kl": 0.0, "learning_rate": 4.785714285714287e-06, "loss": 0.0, "num_tokens": 1584277.0, "reward": 6.880223751068115, "reward_std": 0.7900782823562622, "rewards/reward_model_wrapper/mean": 6.880223751068115, "rewards/reward_model_wrapper/std": 0.790078341960907, "step": 98 }, { "completion_length": 2200.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2467.0, "completions/max_terminated_length": 2467.0, "completions/mean_length": 2200.25, "completions/mean_terminated_length": 2200.25, "completions/min_length": 1998.0, "completions/min_terminated_length": 1998.0, "epoch": 0.35106382978723405, "frac_reward_zero_std": 0.0, "grad_norm": 0.26868605613708496, "kl": 0.0, "learning_rate": 4.777777777777778e-06, "loss": -0.0, "num_tokens": 1601526.0, "reward": 7.865045070648193, "reward_std": 0.8130189776420593, "rewards/reward_model_wrapper/mean": 7.865045070648193, "rewards/reward_model_wrapper/std": 0.8130189180374146, "step": 99 }, { "completion_length": 387.75, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 387.75, "completions/mean_terminated_length": 387.75, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.3546099290780142, "frac_reward_zero_std": 0.0, "grad_norm": 7.494416236877441, "kl": 0.0, "learning_rate": 4.769841269841271e-06, "loss": 0.0, "num_tokens": 1612881.0, "reward": 7.175000190734863, "reward_std": 4.784262180328369, "rewards/reward_model_wrapper/mean": 7.175000190734863, "rewards/reward_model_wrapper/std": 4.784262180328369, "step": 100 }, { "epoch": 0.3546099290780142, "eval_completion_length": 1692.5, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1925.6666666666667, "eval_completions/max_terminated_length": 1925.6666666666667, "eval_completions/mean_length": 1692.5, "eval_completions/mean_terminated_length": 1692.5, "eval_completions/min_length": 1543.6666666666667, "eval_completions/min_terminated_length": 1543.6666666666667, "eval_frac_reward_zero_std": 0.0, "eval_kl": 0.0, "eval_loss": -6.889968773293731e-08, "eval_num_tokens": 1612881.0, "eval_reward": 7.3422651290893555, "eval_reward_std": 1.120375653107961, "eval_rewards/reward_model_wrapper/mean": 7.3422651290893555, "eval_rewards/reward_model_wrapper/std": 1.1203757723172505, "eval_runtime": 270.7246, "eval_samples_per_second": 0.011, "eval_steps_per_second": 0.004, "step": 100 }, { "completion_length": 2440.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2738.0, "completions/max_terminated_length": 2738.0, "completions/mean_length": 2440.5, "completions/mean_terminated_length": 2440.5, "completions/min_length": 2080.0, "completions/min_terminated_length": 2080.0, "epoch": 0.35815602836879434, "frac_reward_zero_std": 0.0, "grad_norm": 0.611094057559967, "kl": 0.0, "learning_rate": 4.761904761904762e-06, "loss": -0.0, "num_tokens": 1631939.0, "reward": 5.438066482543945, "reward_std": 1.6007986068725586, "rewards/reward_model_wrapper/mean": 5.438066482543945, "rewards/reward_model_wrapper/std": 1.6007986068725586, "step": 101 }, { "completion_length": 2332.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2636.0, "completions/max_terminated_length": 2636.0, "completions/mean_length": 2332.5, "completions/mean_terminated_length": 2332.5, "completions/min_length": 1887.0, "completions/min_terminated_length": 1887.0, "epoch": 0.3617021276595745, "frac_reward_zero_std": 0.0, "grad_norm": 0.738368570804596, "kl": 0.0, "learning_rate": 4.753968253968254e-06, "loss": 0.0, "num_tokens": 1649769.0, "reward": 6.860766410827637, "reward_std": 2.031764030456543, "rewards/reward_model_wrapper/mean": 6.860766410827637, "rewards/reward_model_wrapper/std": 2.031764030456543, "step": 102 }, { "completion_length": 2344.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2568.0, "completions/max_terminated_length": 2568.0, "completions/mean_length": 2344.25, "completions/mean_terminated_length": 2344.25, "completions/min_length": 2120.0, "completions/min_terminated_length": 2120.0, "epoch": 0.36524822695035464, "frac_reward_zero_std": 0.0, "grad_norm": 0.38690242171287537, "kl": 0.0, "learning_rate": 4.7460317460317465e-06, "loss": 0.0, "num_tokens": 1669190.0, "reward": 7.933413505554199, "reward_std": 1.203122854232788, "rewards/reward_model_wrapper/mean": 7.933413505554199, "rewards/reward_model_wrapper/std": 1.203122615814209, "step": 103 }, { "completion_length": 1983.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2086.0, "completions/max_terminated_length": 2086.0, "completions/mean_length": 1983.5, "completions/mean_terminated_length": 1983.5, "completions/min_length": 1912.0, "completions/min_terminated_length": 1912.0, "epoch": 0.36879432624113473, "frac_reward_zero_std": 0.0, "grad_norm": 0.9026746153831482, "kl": 0.0, "learning_rate": 4.738095238095238e-06, "loss": 0.0, "num_tokens": 1685676.0, "reward": 6.459442615509033, "reward_std": 2.438467502593994, "rewards/reward_model_wrapper/mean": 6.459442615509033, "rewards/reward_model_wrapper/std": 2.438467502593994, "step": 104 }, { "completion_length": 2670.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3047.0, "completions/max_terminated_length": 3047.0, "completions/mean_length": 2670.75, "completions/mean_terminated_length": 2670.75, "completions/min_length": 2471.0, "completions/min_terminated_length": 2471.0, "epoch": 0.3723404255319149, "frac_reward_zero_std": 0.0, "grad_norm": 0.08364684134721756, "kl": 0.0, "learning_rate": 4.730158730158731e-06, "loss": -0.0, "num_tokens": 1706027.0, "reward": 6.277359962463379, "reward_std": 0.2880236506462097, "rewards/reward_model_wrapper/mean": 6.277359962463379, "rewards/reward_model_wrapper/std": 0.2880236506462097, "step": 105 }, { "completion_length": 1955.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2092.0, "completions/max_terminated_length": 2092.0, "completions/mean_length": 1955.75, "completions/mean_terminated_length": 1955.75, "completions/min_length": 1695.0, "completions/min_terminated_length": 1695.0, "epoch": 0.375886524822695, "frac_reward_zero_std": 0.0, "grad_norm": 0.345714271068573, "kl": 0.0, "learning_rate": 4.722222222222222e-06, "loss": 0.0, "num_tokens": 1722950.0, "reward": 8.122148513793945, "reward_std": 1.0299512147903442, "rewards/reward_model_wrapper/mean": 8.122148513793945, "rewards/reward_model_wrapper/std": 1.0299512147903442, "step": 106 }, { "completion_length": 2156.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2367.0, "completions/max_terminated_length": 2367.0, "completions/mean_length": 2156.25, "completions/mean_terminated_length": 2156.25, "completions/min_length": 1970.0, "completions/min_terminated_length": 1970.0, "epoch": 0.37943262411347517, "frac_reward_zero_std": 0.0, "grad_norm": 0.7028115391731262, "kl": 0.0, "learning_rate": 4.714285714285715e-06, "loss": 0.0, "num_tokens": 1740355.0, "reward": 6.057631492614746, "reward_std": 2.0164129734039307, "rewards/reward_model_wrapper/mean": 6.057631492614746, "rewards/reward_model_wrapper/std": 2.0164132118225098, "step": 107 }, { "completion_length": 2465.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2859.0, "completions/max_terminated_length": 2859.0, "completions/mean_length": 2465.75, "completions/mean_terminated_length": 2465.75, "completions/min_length": 2260.0, "completions/min_terminated_length": 2260.0, "epoch": 0.3829787234042553, "frac_reward_zero_std": 0.0, "grad_norm": 0.5588269829750061, "kl": 0.0, "learning_rate": 4.706349206349206e-06, "loss": -0.0, "num_tokens": 1761454.0, "reward": 6.601025581359863, "reward_std": 1.2482508420944214, "rewards/reward_model_wrapper/mean": 6.601025581359863, "rewards/reward_model_wrapper/std": 1.2482508420944214, "step": 108 }, { "completion_length": 2602.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3270.0, "completions/max_terminated_length": 3270.0, "completions/mean_length": 2602.25, "completions/mean_terminated_length": 2602.25, "completions/min_length": 1844.0, "completions/min_terminated_length": 1844.0, "epoch": 0.38652482269503546, "frac_reward_zero_std": 0.0, "grad_norm": 0.25729504227638245, "kl": 0.0, "learning_rate": 4.698412698412699e-06, "loss": -0.0, "num_tokens": 1781971.0, "reward": 6.849999904632568, "reward_std": 0.7852813005447388, "rewards/reward_model_wrapper/mean": 6.849999904632568, "rewards/reward_model_wrapper/std": 0.7852813601493835, "step": 109 }, { "completion_length": 441.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 441.0, "completions/mean_terminated_length": 441.0, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 0.3900709219858156, "frac_reward_zero_std": 0.0, "grad_norm": 0.1176198348402977, "kl": 0.0, "learning_rate": 4.6904761904761905e-06, "loss": -0.0, "num_tokens": 1793755.0, "reward": 9.549999237060547, "reward_std": 0.12909956276416779, "rewards/reward_model_wrapper/mean": 9.549999237060547, "rewards/reward_model_wrapper/std": 0.12909957766532898, "step": 110 }, { "completion_length": 2251.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2438.0, "completions/max_terminated_length": 2438.0, "completions/mean_length": 2251.5, "completions/mean_terminated_length": 2251.5, "completions/min_length": 2122.0, "completions/min_terminated_length": 2122.0, "epoch": 0.39361702127659576, "frac_reward_zero_std": 0.0, "grad_norm": 0.43039387464523315, "kl": 0.0, "learning_rate": 4.682539682539683e-06, "loss": 0.0, "num_tokens": 1811957.0, "reward": 7.326974868774414, "reward_std": 1.4444708824157715, "rewards/reward_model_wrapper/mean": 7.326974868774414, "rewards/reward_model_wrapper/std": 1.4444708824157715, "step": 111 }, { "completion_length": 429.75, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 429.75, "completions/mean_terminated_length": 429.75, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 0.3971631205673759, "frac_reward_zero_std": 0.0, "grad_norm": 0.37695449590682983, "kl": 0.0, "learning_rate": 4.674603174603175e-06, "loss": 0.0, "num_tokens": 1822752.0, "reward": 9.350000381469727, "reward_std": 0.40414491295814514, "rewards/reward_model_wrapper/mean": 9.350000381469727, "rewards/reward_model_wrapper/std": 0.40414491295814514, "step": 112 }, { "completion_length": 2365.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2582.0, "completions/max_terminated_length": 2582.0, "completions/mean_length": 2365.5, "completions/mean_terminated_length": 2365.5, "completions/min_length": 2108.0, "completions/min_terminated_length": 2108.0, "epoch": 0.40070921985815605, "frac_reward_zero_std": 0.0, "grad_norm": 0.3936285078525543, "kl": 0.0, "learning_rate": 4.666666666666667e-06, "loss": 0.0, "num_tokens": 1842186.0, "reward": 7.997593879699707, "reward_std": 1.347114086151123, "rewards/reward_model_wrapper/mean": 7.997593879699707, "rewards/reward_model_wrapper/std": 1.3471142053604126, "step": 113 }, { "completion_length": 2147.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2305.0, "completions/max_terminated_length": 2305.0, "completions/mean_length": 2147.0, "completions/mean_terminated_length": 2147.0, "completions/min_length": 1949.0, "completions/min_terminated_length": 1949.0, "epoch": 0.40425531914893614, "frac_reward_zero_std": 0.0, "grad_norm": 0.07402750849723816, "kl": 0.0, "learning_rate": 4.658730158730159e-06, "loss": -0.0, "num_tokens": 1860510.0, "reward": 8.880744934082031, "reward_std": 0.19976162910461426, "rewards/reward_model_wrapper/mean": 8.880744934082031, "rewards/reward_model_wrapper/std": 0.19976162910461426, "step": 114 }, { "completion_length": 2052.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2210.0, "completions/max_terminated_length": 2210.0, "completions/mean_length": 2052.5, "completions/mean_terminated_length": 2052.5, "completions/min_length": 1895.0, "completions/min_terminated_length": 1895.0, "epoch": 0.4078014184397163, "frac_reward_zero_std": 0.0, "grad_norm": 0.5474584102630615, "kl": 0.0, "learning_rate": 4.650793650793651e-06, "loss": 0.0, "num_tokens": 1879284.0, "reward": 7.0091447830200195, "reward_std": 1.5641767978668213, "rewards/reward_model_wrapper/mean": 7.0091447830200195, "rewards/reward_model_wrapper/std": 1.5641767978668213, "step": 115 }, { "completion_length": 2053.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2475.0, "completions/max_terminated_length": 2475.0, "completions/mean_length": 2053.0, "completions/mean_terminated_length": 2053.0, "completions/min_length": 1717.0, "completions/min_terminated_length": 1717.0, "epoch": 0.41134751773049644, "frac_reward_zero_std": 0.0, "grad_norm": 0.31686148047447205, "kl": 0.0, "learning_rate": 4.642857142857144e-06, "loss": -0.0, "num_tokens": 1896800.0, "reward": 7.329637050628662, "reward_std": 0.7055403590202332, "rewards/reward_model_wrapper/mean": 7.329637050628662, "rewards/reward_model_wrapper/std": 0.7055404782295227, "step": 116 }, { "completion_length": 413.75, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 413.75, "completions/mean_terminated_length": 413.75, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.4148936170212766, "frac_reward_zero_std": 0.0, "grad_norm": 0.1570892482995987, "kl": 0.0, "learning_rate": 4.634920634920635e-06, "loss": -0.0, "num_tokens": 1908115.0, "reward": 9.375, "reward_std": 0.15000025928020477, "rewards/reward_model_wrapper/mean": 9.375, "rewards/reward_model_wrapper/std": 0.15000009536743164, "step": 117 }, { "completion_length": 2630.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2883.0, "completions/max_terminated_length": 2883.0, "completions/mean_length": 2630.75, "completions/mean_terminated_length": 2630.75, "completions/min_length": 2410.0, "completions/min_terminated_length": 2410.0, "epoch": 0.41843971631205673, "frac_reward_zero_std": 0.0, "grad_norm": 0.14462856948375702, "kl": 0.0, "learning_rate": 4.626984126984128e-06, "loss": -0.0, "num_tokens": 1928602.0, "reward": 7.053372383117676, "reward_std": 0.41455021500587463, "rewards/reward_model_wrapper/mean": 7.053372383117676, "rewards/reward_model_wrapper/std": 0.4145503640174866, "step": 118 }, { "completion_length": 423.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 423.0, "completions/mean_terminated_length": 423.0, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 0.4219858156028369, "frac_reward_zero_std": 0.0, "grad_norm": 1.2437680959701538, "kl": 0.0, "learning_rate": 4.6190476190476196e-06, "loss": -0.0, "num_tokens": 1940270.0, "reward": 8.449999809265137, "reward_std": 1.2819255590438843, "rewards/reward_model_wrapper/mean": 8.449999809265137, "rewards/reward_model_wrapper/std": 1.2819255590438843, "step": 119 }, { "completion_length": 2634.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2904.0, "completions/max_terminated_length": 2904.0, "completions/mean_length": 2634.0, "completions/mean_terminated_length": 2634.0, "completions/min_length": 2287.0, "completions/min_terminated_length": 2287.0, "epoch": 0.425531914893617, "frac_reward_zero_std": 0.0, "grad_norm": 0.6771729588508606, "kl": 0.0, "learning_rate": 4.611111111111112e-06, "loss": -0.0, "num_tokens": 1960874.0, "reward": 6.568253993988037, "reward_std": 2.200967311859131, "rewards/reward_model_wrapper/mean": 6.568253993988037, "rewards/reward_model_wrapper/std": 2.200967311859131, "step": 120 }, { "epoch": 0.425531914893617, "eval_completion_length": 1615.5, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1961.3333333333333, "eval_completions/max_terminated_length": 1961.3333333333333, "eval_completions/mean_length": 1615.5, "eval_completions/mean_terminated_length": 1615.5, "eval_completions/min_length": 1434.0, "eval_completions/min_terminated_length": 1434.0, "eval_frac_reward_zero_std": 0.0, "eval_kl": 0.0, "eval_loss": 1.451928177687023e-08, "eval_num_tokens": 1960874.0, "eval_reward": 8.309905370076498, "eval_reward_std": 0.9038732747236887, "eval_rewards/reward_model_wrapper/mean": 8.309905370076498, "eval_rewards/reward_model_wrapper/std": 0.9038733045260111, "eval_runtime": 267.0852, "eval_samples_per_second": 0.011, "eval_steps_per_second": 0.004, "step": 120 }, { "completion_length": 1858.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2065.0, "completions/max_terminated_length": 2065.0, "completions/mean_length": 1858.5, "completions/mean_terminated_length": 1858.5, "completions/min_length": 1733.0, "completions/min_terminated_length": 1733.0, "epoch": 0.42907801418439717, "frac_reward_zero_std": 0.0, "grad_norm": 0.4375554025173187, "kl": 0.0, "learning_rate": 4.603174603174604e-06, "loss": -0.0, "num_tokens": 1977016.0, "reward": 7.649999618530273, "reward_std": 1.2396235466003418, "rewards/reward_model_wrapper/mean": 7.649999618530273, "rewards/reward_model_wrapper/std": 1.2396235466003418, "step": 121 }, { "completion_length": 2612.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2838.0, "completions/max_terminated_length": 2838.0, "completions/mean_length": 2612.75, "completions/mean_terminated_length": 2612.75, "completions/min_length": 2418.0, "completions/min_terminated_length": 2418.0, "epoch": 0.4326241134751773, "frac_reward_zero_std": 0.0, "grad_norm": 0.18942217528820038, "kl": 0.0, "learning_rate": 4.595238095238095e-06, "loss": -0.0, "num_tokens": 1996959.0, "reward": 7.269717216491699, "reward_std": 0.6022006273269653, "rewards/reward_model_wrapper/mean": 7.269717216491699, "rewards/reward_model_wrapper/std": 0.6022007465362549, "step": 122 }, { "completion_length": 408.25, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 408.25, "completions/mean_terminated_length": 408.25, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 0.43617021276595747, "frac_reward_zero_std": 0.0, "grad_norm": 0.21342535316944122, "kl": 0.0, "learning_rate": 4.587301587301588e-06, "loss": -0.0, "num_tokens": 2007148.0, "reward": 9.524999618530273, "reward_std": 0.20615530014038086, "rewards/reward_model_wrapper/mean": 9.524999618530273, "rewards/reward_model_wrapper/std": 0.20615531504154205, "step": 123 }, { "completion_length": 468.25, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 468.25, "completions/mean_terminated_length": 468.25, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 0.4397163120567376, "frac_reward_zero_std": 0.0, "grad_norm": 4.215380668640137, "kl": 0.0, "learning_rate": 4.5793650793650795e-06, "loss": -0.0, "num_tokens": 2017853.0, "reward": 5.0, "reward_std": 3.7514443397521973, "rewards/reward_model_wrapper/mean": 5.0, "rewards/reward_model_wrapper/std": 3.7514445781707764, "step": 124 }, { "completion_length": 1985.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2467.0, "completions/max_terminated_length": 2467.0, "completions/mean_length": 1985.0, "completions/mean_terminated_length": 1985.0, "completions/min_length": 1453.0, "completions/min_terminated_length": 1453.0, "epoch": 0.4432624113475177, "frac_reward_zero_std": 0.0, "grad_norm": 0.889869749546051, "kl": 0.0, "learning_rate": 4.571428571428572e-06, "loss": -0.0, "num_tokens": 2035633.0, "reward": 7.179272651672363, "reward_std": 2.016270875930786, "rewards/reward_model_wrapper/mean": 7.179272651672363, "rewards/reward_model_wrapper/std": 2.0162711143493652, "step": 125 }, { "completion_length": 417.75, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 417.75, "completions/mean_terminated_length": 417.75, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 0.44680851063829785, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.563492063492064e-06, "loss": 0.0, "num_tokens": 2046112.0, "reward": 9.399999618530273, "reward_std": 0.0, "rewards/reward_model_wrapper/mean": 9.399999618530273, "rewards/reward_model_wrapper/std": 0.0, "step": 126 }, { "completion_length": 2298.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3914.0, "completions/max_terminated_length": 3914.0, "completions/mean_length": 2298.0, "completions/mean_terminated_length": 2298.0, "completions/min_length": 1508.0, "completions/min_terminated_length": 1508.0, "epoch": 0.450354609929078, "frac_reward_zero_std": 0.0, "grad_norm": 0.38401299715042114, "kl": 0.0, "learning_rate": 4.555555555555556e-06, "loss": 0.0, "num_tokens": 2064712.0, "reward": 6.875598430633545, "reward_std": 1.0635424852371216, "rewards/reward_model_wrapper/mean": 6.875598430633545, "rewards/reward_model_wrapper/std": 1.0635424852371216, "step": 127 }, { "completion_length": 404.5, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 404.5, "completions/mean_terminated_length": 404.5, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.45390070921985815, "frac_reward_zero_std": 0.0, "grad_norm": 0.22563499212265015, "kl": 0.0, "learning_rate": 4.547619047619048e-06, "loss": 0.0, "num_tokens": 2075530.0, "reward": 9.200000762939453, "reward_std": 0.2708013653755188, "rewards/reward_model_wrapper/mean": 9.200000762939453, "rewards/reward_model_wrapper/std": 0.2708013653755188, "step": 128 }, { "completion_length": 1583.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1794.0, "completions/max_terminated_length": 1794.0, "completions/mean_length": 1583.25, "completions/mean_terminated_length": 1583.25, "completions/min_length": 1203.0, "completions/min_terminated_length": 1203.0, "epoch": 0.4574468085106383, "frac_reward_zero_std": 0.0, "grad_norm": 0.3642731010913849, "kl": 0.0, "learning_rate": 4.53968253968254e-06, "loss": 0.0, "num_tokens": 2090567.0, "reward": 7.515666961669922, "reward_std": 0.7715764045715332, "rewards/reward_model_wrapper/mean": 7.515666961669922, "rewards/reward_model_wrapper/std": 0.7715765237808228, "step": 129 }, { "completion_length": 2288.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2568.0, "completions/max_terminated_length": 2568.0, "completions/mean_length": 2288.25, "completions/mean_terminated_length": 2288.25, "completions/min_length": 2104.0, "completions/min_terminated_length": 2104.0, "epoch": 0.46099290780141844, "frac_reward_zero_std": 0.0, "grad_norm": 0.5531026124954224, "kl": 0.0, "learning_rate": 4.531746031746032e-06, "loss": -0.0, "num_tokens": 2108236.0, "reward": 6.552907943725586, "reward_std": 1.2477290630340576, "rewards/reward_model_wrapper/mean": 6.552907943725586, "rewards/reward_model_wrapper/std": 1.2477290630340576, "step": 130 }, { "completion_length": 429.75, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 429.75, "completions/mean_terminated_length": 429.75, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.4645390070921986, "frac_reward_zero_std": 0.0, "grad_norm": 0.8561726212501526, "kl": 0.0, "learning_rate": 4.523809523809524e-06, "loss": 0.0, "num_tokens": 2119711.0, "reward": 9.0, "reward_std": 0.9380831122398376, "rewards/reward_model_wrapper/mean": 9.0, "rewards/reward_model_wrapper/std": 0.9380831718444824, "step": 131 }, { "completion_length": 439.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 439.0, "completions/mean_terminated_length": 439.0, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.46808510638297873, "frac_reward_zero_std": 0.0, "grad_norm": 1.4553359746932983, "kl": 0.0, "learning_rate": 4.515873015873016e-06, "loss": -0.0, "num_tokens": 2131331.0, "reward": 7.625, "reward_std": 1.3400870561599731, "rewards/reward_model_wrapper/mean": 7.625, "rewards/reward_model_wrapper/std": 1.3400870561599731, "step": 132 }, { "completion_length": 1789.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2038.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1789.5, "completions/mean_terminated_length": 1789.5, "completions/min_length": 1598.0, "completions/min_terminated_length": 1598.0, "epoch": 0.4716312056737589, "frac_reward_zero_std": 0.0, "grad_norm": 0.261265367269516, "kl": 0.0, "learning_rate": 4.5079365079365085e-06, "loss": -0.0, "num_tokens": 2147013.0, "reward": 8.304464340209961, "reward_std": 0.749656081199646, "rewards/reward_model_wrapper/mean": 8.304464340209961, "rewards/reward_model_wrapper/std": 0.7496560215950012, "step": 133 }, { "completion_length": 490.25, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 490.25, "completions/mean_terminated_length": 490.25, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "epoch": 0.475177304964539, "frac_reward_zero_std": 0.0, "grad_norm": 0.6956883668899536, "kl": 0.0, "learning_rate": 4.5e-06, "loss": 0.0, "num_tokens": 2158958.0, "reward": 8.475000381469727, "reward_std": 0.7135592103004456, "rewards/reward_model_wrapper/mean": 8.475000381469727, "rewards/reward_model_wrapper/std": 0.7135592103004456, "step": 134 }, { "completion_length": 458.5, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 458.5, "completions/mean_terminated_length": 458.5, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 0.4787234042553192, "frac_reward_zero_std": 0.0, "grad_norm": 0.13199543952941895, "kl": 0.0, "learning_rate": 4.492063492063493e-06, "loss": 0.0, "num_tokens": 2170384.0, "reward": 9.475000381469727, "reward_std": 0.12583079934120178, "rewards/reward_model_wrapper/mean": 9.475000381469727, "rewards/reward_model_wrapper/std": 0.12583062052726746, "step": 135 }, { "completion_length": 2191.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2674.0, "completions/max_terminated_length": 2674.0, "completions/mean_length": 2191.0, "completions/mean_terminated_length": 2191.0, "completions/min_length": 1770.0, "completions/min_terminated_length": 1770.0, "epoch": 0.48226950354609927, "frac_reward_zero_std": 0.0, "grad_norm": 0.17606620490550995, "kl": 0.0, "learning_rate": 4.484126984126984e-06, "loss": -0.0, "num_tokens": 2188344.0, "reward": 6.775217056274414, "reward_std": 0.47128474712371826, "rewards/reward_model_wrapper/mean": 6.775217056274414, "rewards/reward_model_wrapper/std": 0.4712846875190735, "step": 136 }, { "completion_length": 407.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 407.0, "completions/mean_terminated_length": 407.0, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.4858156028368794, "frac_reward_zero_std": 0.0, "grad_norm": 0.6999905109405518, "kl": 0.0, "learning_rate": 4.476190476190477e-06, "loss": 0.0, "num_tokens": 2199324.0, "reward": 8.574999809265137, "reward_std": 0.75, "rewards/reward_model_wrapper/mean": 8.574999809265137, "rewards/reward_model_wrapper/std": 0.75, "step": 137 }, { "completion_length": 2310.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2604.0, "completions/max_terminated_length": 2604.0, "completions/mean_length": 2310.5, "completions/mean_terminated_length": 2310.5, "completions/min_length": 2172.0, "completions/min_terminated_length": 2172.0, "epoch": 0.48936170212765956, "frac_reward_zero_std": 0.0, "grad_norm": 0.596656322479248, "kl": 0.0, "learning_rate": 4.468253968253969e-06, "loss": 0.0, "num_tokens": 2218090.0, "reward": 5.798462390899658, "reward_std": 1.5374131202697754, "rewards/reward_model_wrapper/mean": 5.798462390899658, "rewards/reward_model_wrapper/std": 1.537413239479065, "step": 138 }, { "completion_length": 434.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 434.0, "completions/mean_terminated_length": 434.0, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 0.4929078014184397, "frac_reward_zero_std": 0.0, "grad_norm": 0.11086136847734451, "kl": 0.0, "learning_rate": 4.460317460317461e-06, "loss": 0.0, "num_tokens": 2228846.0, "reward": 9.5, "reward_std": 0.11547049134969711, "rewards/reward_model_wrapper/mean": 9.5, "rewards/reward_model_wrapper/std": 0.11547049134969711, "step": 139 }, { "completion_length": 2451.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2940.0, "completions/max_terminated_length": 2940.0, "completions/mean_length": 2451.0, "completions/mean_terminated_length": 2451.0, "completions/min_length": 1791.0, "completions/min_terminated_length": 1791.0, "epoch": 0.49645390070921985, "frac_reward_zero_std": 0.0, "grad_norm": 0.15072345733642578, "kl": 0.0, "learning_rate": 4.4523809523809525e-06, "loss": -0.0, "num_tokens": 2248082.0, "reward": 8.040382385253906, "reward_std": 0.4516172409057617, "rewards/reward_model_wrapper/mean": 8.040382385253906, "rewards/reward_model_wrapper/std": 0.4516172409057617, "step": 140 }, { "epoch": 0.49645390070921985, "eval_completion_length": 1696.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1946.6666666666667, "eval_completions/max_terminated_length": 1946.6666666666667, "eval_completions/mean_length": 1696.0, "eval_completions/mean_terminated_length": 1696.0, "eval_completions/min_length": 1540.0, "eval_completions/min_terminated_length": 1540.0, "eval_frac_reward_zero_std": 0.0, "eval_kl": 0.0, "eval_loss": -2.611144509501173e-07, "eval_num_tokens": 2248082.0, "eval_reward": 8.40399201711019, "eval_reward_std": 0.9402754306793213, "eval_rewards/reward_model_wrapper/mean": 8.40399201711019, "eval_rewards/reward_model_wrapper/std": 0.9402754505475363, "eval_runtime": 267.0355, "eval_samples_per_second": 0.011, "eval_steps_per_second": 0.004, "step": 140 }, { "completion_length": 415.5, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 415.5, "completions/mean_terminated_length": 415.5, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 0.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.444444444444444e-06, "loss": 0.0, "num_tokens": 2258268.0, "reward": 10.0, "reward_std": 0.0, "rewards/reward_model_wrapper/mean": 10.0, "rewards/reward_model_wrapper/std": 0.0, "step": 141 }, { "completion_length": 1965.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2049.0, "completions/max_terminated_length": 2049.0, "completions/mean_length": 1965.75, "completions/mean_terminated_length": 1965.75, "completions/min_length": 1858.0, "completions/min_terminated_length": 1858.0, "epoch": 0.5035460992907801, "frac_reward_zero_std": 0.0, "grad_norm": 0.5515777468681335, "kl": 0.0, "learning_rate": 4.436507936507937e-06, "loss": -0.0, "num_tokens": 2275739.0, "reward": 6.6948747634887695, "reward_std": 1.5562154054641724, "rewards/reward_model_wrapper/mean": 6.6948747634887695, "rewards/reward_model_wrapper/std": 1.5562152862548828, "step": 142 }, { "completion_length": 2718.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3123.0, "completions/max_terminated_length": 3123.0, "completions/mean_length": 2718.5, "completions/mean_terminated_length": 2718.5, "completions/min_length": 2370.0, "completions/min_terminated_length": 2370.0, "epoch": 0.5070921985815603, "frac_reward_zero_std": 0.0, "grad_norm": 0.5304810404777527, "kl": 0.0, "learning_rate": 4.428571428571429e-06, "loss": -0.0, "num_tokens": 2295597.0, "reward": 7.202945232391357, "reward_std": 1.8583394289016724, "rewards/reward_model_wrapper/mean": 7.202945232391357, "rewards/reward_model_wrapper/std": 1.8583396673202515, "step": 143 }, { "completion_length": 2204.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2381.0, "completions/max_terminated_length": 2381.0, "completions/mean_length": 2204.0, "completions/mean_terminated_length": 2204.0, "completions/min_length": 2034.0, "completions/min_terminated_length": 2034.0, "epoch": 0.5106382978723404, "frac_reward_zero_std": 0.0, "grad_norm": 0.32715511322021484, "kl": 0.0, "learning_rate": 4.420634920634921e-06, "loss": 0.0, "num_tokens": 2312825.0, "reward": 6.755857944488525, "reward_std": 0.9909467101097107, "rewards/reward_model_wrapper/mean": 6.755857944488525, "rewards/reward_model_wrapper/std": 0.9909466505050659, "step": 144 }, { "completion_length": 427.75, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 427.75, "completions/mean_terminated_length": 427.75, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 0.5141843971631206, "frac_reward_zero_std": 0.0, "grad_norm": 1.0862383842468262, "kl": 0.0, "learning_rate": 4.412698412698413e-06, "loss": -0.0, "num_tokens": 2323776.0, "reward": 8.899999618530273, "reward_std": 1.1860297918319702, "rewards/reward_model_wrapper/mean": 8.899999618530273, "rewards/reward_model_wrapper/std": 1.1860297918319702, "step": 145 }, { "completion_length": 2072.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2188.0, "completions/max_terminated_length": 2188.0, "completions/mean_length": 2072.25, "completions/mean_terminated_length": 2072.25, "completions/min_length": 1969.0, "completions/min_terminated_length": 1969.0, "epoch": 0.5177304964539007, "frac_reward_zero_std": 0.0, "grad_norm": 0.3771529495716095, "kl": 0.0, "learning_rate": 4.404761904761905e-06, "loss": -0.0, "num_tokens": 2340677.0, "reward": 5.547879219055176, "reward_std": 1.1149834394454956, "rewards/reward_model_wrapper/mean": 5.547879219055176, "rewards/reward_model_wrapper/std": 1.1149834394454956, "step": 146 }, { "completion_length": 2414.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2994.0, "completions/max_terminated_length": 2994.0, "completions/mean_length": 2414.75, "completions/mean_terminated_length": 2414.75, "completions/min_length": 1993.0, "completions/min_terminated_length": 1993.0, "epoch": 0.5212765957446809, "frac_reward_zero_std": 0.0, "grad_norm": 0.24231797456741333, "kl": 0.0, "learning_rate": 4.396825396825397e-06, "loss": 0.0, "num_tokens": 2359688.0, "reward": 8.2395658493042, "reward_std": 0.7291795611381531, "rewards/reward_model_wrapper/mean": 8.2395658493042, "rewards/reward_model_wrapper/std": 0.7291796207427979, "step": 147 }, { "completion_length": 2271.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2521.0, "completions/max_terminated_length": 2521.0, "completions/mean_length": 2271.0, "completions/mean_terminated_length": 2271.0, "completions/min_length": 2098.0, "completions/min_terminated_length": 2098.0, "epoch": 0.524822695035461, "frac_reward_zero_std": 0.0, "grad_norm": 0.4925796687602997, "kl": 0.0, "learning_rate": 4.388888888888889e-06, "loss": -0.0, "num_tokens": 2376952.0, "reward": 5.887813568115234, "reward_std": 1.301381230354309, "rewards/reward_model_wrapper/mean": 5.887813568115234, "rewards/reward_model_wrapper/std": 1.301381230354309, "step": 148 }, { "completion_length": 446.25, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 446.25, "completions/mean_terminated_length": 446.25, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 0.5283687943262412, "frac_reward_zero_std": 0.0, "grad_norm": 0.8594765067100525, "kl": 0.0, "learning_rate": 4.3809523809523815e-06, "loss": -0.0, "num_tokens": 2389389.0, "reward": 8.399999618530273, "reward_std": 0.8640987873077393, "rewards/reward_model_wrapper/mean": 8.399999618530273, "rewards/reward_model_wrapper/std": 0.864098846912384, "step": 149 }, { "completion_length": 511.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 511.0, "completions/mean_terminated_length": 511.0, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 0.5319148936170213, "frac_reward_zero_std": 0.0, "grad_norm": 1.6069363355636597, "kl": 0.0, "learning_rate": 4.373015873015873e-06, "loss": -0.0, "num_tokens": 2401277.0, "reward": 7.09999942779541, "reward_std": 1.6431673765182495, "rewards/reward_model_wrapper/mean": 7.09999942779541, "rewards/reward_model_wrapper/std": 1.643167495727539, "step": 150 }, { "completion_length": 2341.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2706.0, "completions/max_terminated_length": 2706.0, "completions/mean_length": 2341.25, "completions/mean_terminated_length": 2341.25, "completions/min_length": 2006.0, "completions/min_terminated_length": 2006.0, "epoch": 0.5354609929078015, "frac_reward_zero_std": 0.0, "grad_norm": 0.7177795767784119, "kl": 0.0, "learning_rate": 4.365079365079366e-06, "loss": 0.0, "num_tokens": 2420294.0, "reward": 6.7753424644470215, "reward_std": 1.9033061265945435, "rewards/reward_model_wrapper/mean": 6.7753424644470215, "rewards/reward_model_wrapper/std": 1.9033061265945435, "step": 151 }, { "completion_length": 2215.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2455.0, "completions/max_terminated_length": 2455.0, "completions/mean_length": 2215.0, "completions/mean_terminated_length": 2215.0, "completions/min_length": 1949.0, "completions/min_terminated_length": 1949.0, "epoch": 0.5390070921985816, "frac_reward_zero_std": 0.0, "grad_norm": 0.1539050042629242, "kl": 0.0, "learning_rate": 4.357142857142857e-06, "loss": 0.0, "num_tokens": 2438286.0, "reward": 7.745049476623535, "reward_std": 0.44775378704071045, "rewards/reward_model_wrapper/mean": 7.745049476623535, "rewards/reward_model_wrapper/std": 0.44775390625, "step": 152 }, { "completion_length": 2623.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3280.0, "completions/max_terminated_length": 3280.0, "completions/mean_length": 2623.25, "completions/mean_terminated_length": 2623.25, "completions/min_length": 2180.0, "completions/min_terminated_length": 2180.0, "epoch": 0.5425531914893617, "frac_reward_zero_std": 0.0, "grad_norm": 0.3442251980304718, "kl": 0.0, "learning_rate": 4.34920634920635e-06, "loss": -0.0, "num_tokens": 2457555.0, "reward": 7.0105366706848145, "reward_std": 1.0002729892730713, "rewards/reward_model_wrapper/mean": 7.0105366706848145, "rewards/reward_model_wrapper/std": 1.0002731084823608, "step": 153 }, { "completion_length": 413.75, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 413.75, "completions/mean_terminated_length": 413.75, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.5460992907801419, "frac_reward_zero_std": 0.0, "grad_norm": 0.37713876366615295, "kl": 0.0, "learning_rate": 4.3412698412698414e-06, "loss": -0.0, "num_tokens": 2470106.0, "reward": 8.899999618530273, "reward_std": 0.29439181089401245, "rewards/reward_model_wrapper/mean": 8.899999618530273, "rewards/reward_model_wrapper/std": 0.2943919897079468, "step": 154 }, { "completion_length": 435.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 435.0, "completions/mean_terminated_length": 435.0, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 0.549645390070922, "frac_reward_zero_std": 0.0, "grad_norm": 0.11632431298494339, "kl": 0.0, "learning_rate": 4.333333333333334e-06, "loss": 0.0, "num_tokens": 2480490.0, "reward": 9.25, "reward_std": 0.12909895181655884, "rewards/reward_model_wrapper/mean": 9.25, "rewards/reward_model_wrapper/std": 0.12909920513629913, "step": 155 }, { "completion_length": 608.0, "completions/clipped_ratio": 0.0, "completions/max_length": 812.0, "completions/max_terminated_length": 812.0, "completions/mean_length": 608.0, "completions/mean_terminated_length": 608.0, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 0.5531914893617021, "frac_reward_zero_std": 0.0, "grad_norm": 1.2606799602508545, "kl": 0.0, "learning_rate": 4.3253968253968256e-06, "loss": 0.0, "num_tokens": 2492634.0, "reward": 6.350000381469727, "reward_std": 1.5459628105163574, "rewards/reward_model_wrapper/mean": 6.350000381469727, "rewards/reward_model_wrapper/std": 1.5459626913070679, "step": 156 }, { "completion_length": 2481.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2709.0, "completions/max_terminated_length": 2709.0, "completions/mean_length": 2481.5, "completions/mean_terminated_length": 2481.5, "completions/min_length": 2280.0, "completions/min_terminated_length": 2280.0, "epoch": 0.5567375886524822, "frac_reward_zero_std": 0.0, "grad_norm": 0.17594365775585175, "kl": 0.0, "learning_rate": 4.317460317460318e-06, "loss": -0.0, "num_tokens": 2511924.0, "reward": 8.550357818603516, "reward_std": 0.49009615182876587, "rewards/reward_model_wrapper/mean": 8.550357818603516, "rewards/reward_model_wrapper/std": 0.4900960922241211, "step": 157 }, { "completion_length": 428.75, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 428.75, "completions/mean_terminated_length": 428.75, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 0.5602836879432624, "frac_reward_zero_std": 0.0, "grad_norm": 0.9969485998153687, "kl": 0.0, "learning_rate": 4.30952380952381e-06, "loss": -0.0, "num_tokens": 2522135.0, "reward": 8.899999618530273, "reward_std": 0.9486833810806274, "rewards/reward_model_wrapper/mean": 8.899999618530273, "rewards/reward_model_wrapper/std": 0.9486834406852722, "step": 158 }, { "completion_length": 1805.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1990.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 1805.75, "completions/mean_terminated_length": 1805.75, "completions/min_length": 1605.0, "completions/min_terminated_length": 1605.0, "epoch": 0.5638297872340425, "frac_reward_zero_std": 0.0, "grad_norm": 0.3487490713596344, "kl": 0.0, "learning_rate": 4.301587301587302e-06, "loss": -0.0, "num_tokens": 2537554.0, "reward": 8.5611572265625, "reward_std": 0.9626208543777466, "rewards/reward_model_wrapper/mean": 8.5611572265625, "rewards/reward_model_wrapper/std": 0.9626208543777466, "step": 159 }, { "completion_length": 388.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 388.0, "completions/mean_terminated_length": 388.0, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.5673758865248227, "frac_reward_zero_std": 0.0, "grad_norm": 0.1445457935333252, "kl": 0.0, "learning_rate": 4.293650793650794e-06, "loss": -0.0, "num_tokens": 2547398.0, "reward": 9.524999618530273, "reward_std": 0.15000014007091522, "rewards/reward_model_wrapper/mean": 9.524999618530273, "rewards/reward_model_wrapper/std": 0.15000019967556, "step": 160 }, { "epoch": 0.5673758865248227, "eval_completion_length": 1964.1666666666667, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 2536.0, "eval_completions/max_terminated_length": 2536.0, "eval_completions/mean_length": 1964.1666666666667, "eval_completions/mean_terminated_length": 1964.1666666666667, "eval_completions/min_length": 1504.6666666666667, "eval_completions/min_terminated_length": 1504.6666666666667, "eval_frac_reward_zero_std": 0.0, "eval_kl": 0.0, "eval_loss": 4.387926821891597e-08, "eval_num_tokens": 2547398.0, "eval_reward": 7.452057520548503, "eval_reward_std": 1.4210925896962483, "eval_rewards/reward_model_wrapper/mean": 7.452057520548503, "eval_rewards/reward_model_wrapper/std": 1.4210926294326782, "eval_runtime": 326.6969, "eval_samples_per_second": 0.009, "eval_steps_per_second": 0.003, "step": 160 }, { "completion_length": 2549.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2777.0, "completions/max_terminated_length": 2777.0, "completions/mean_length": 2549.5, "completions/mean_terminated_length": 2549.5, "completions/min_length": 2229.0, "completions/min_terminated_length": 2229.0, "epoch": 0.5709219858156028, "frac_reward_zero_std": 0.0, "grad_norm": 0.21853336691856384, "kl": 0.0, "learning_rate": 4.2857142857142855e-06, "loss": -0.0, "num_tokens": 2565836.0, "reward": 7.349999904632568, "reward_std": 0.6350852847099304, "rewards/reward_model_wrapper/mean": 7.349999904632568, "rewards/reward_model_wrapper/std": 0.6350853443145752, "step": 161 }, { "completion_length": 1857.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2258.0, "completions/max_terminated_length": 2258.0, "completions/mean_length": 1857.0, "completions/mean_terminated_length": 1857.0, "completions/min_length": 1672.0, "completions/min_terminated_length": 1672.0, "epoch": 0.574468085106383, "frac_reward_zero_std": 0.0, "grad_norm": 0.4788321852684021, "kl": 0.0, "learning_rate": 4.277777777777778e-06, "loss": -0.0, "num_tokens": 2581548.0, "reward": 7.949999809265137, "reward_std": 1.554563283920288, "rewards/reward_model_wrapper/mean": 7.949999809265137, "rewards/reward_model_wrapper/std": 1.5545634031295776, "step": 162 }, { "completion_length": 2134.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2314.0, "completions/max_terminated_length": 2314.0, "completions/mean_length": 2134.5, "completions/mean_terminated_length": 2134.5, "completions/min_length": 2017.0, "completions/min_terminated_length": 2017.0, "epoch": 0.5780141843971631, "frac_reward_zero_std": 0.0, "grad_norm": 0.2382994145154953, "kl": 0.0, "learning_rate": 4.26984126984127e-06, "loss": 0.0, "num_tokens": 2599174.0, "reward": 8.125, "reward_std": 0.6512807607650757, "rewards/reward_model_wrapper/mean": 8.125, "rewards/reward_model_wrapper/std": 0.6512807607650757, "step": 163 }, { "completion_length": 359.25, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 359.25, "completions/mean_terminated_length": 359.25, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.5815602836879432, "frac_reward_zero_std": 0.0, "grad_norm": 0.9030126333236694, "kl": 0.0, "learning_rate": 4.261904761904762e-06, "loss": -0.0, "num_tokens": 2610883.0, "reward": 8.049999237060547, "reward_std": 0.9327378273010254, "rewards/reward_model_wrapper/mean": 8.049999237060547, "rewards/reward_model_wrapper/std": 0.9327380061149597, "step": 164 }, { "completion_length": 2477.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2761.0, "completions/max_terminated_length": 2761.0, "completions/mean_length": 2477.0, "completions/mean_terminated_length": 2477.0, "completions/min_length": 2241.0, "completions/min_terminated_length": 2241.0, "epoch": 0.5851063829787234, "frac_reward_zero_std": 0.0, "grad_norm": 0.5839810967445374, "kl": 0.0, "learning_rate": 4.2539682539682546e-06, "loss": -0.0, "num_tokens": 2631011.0, "reward": 6.209723949432373, "reward_std": 1.5474190711975098, "rewards/reward_model_wrapper/mean": 6.209723949432373, "rewards/reward_model_wrapper/std": 1.5474191904067993, "step": 165 }, { "completion_length": 3044.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4130.0, "completions/max_terminated_length": 4130.0, "completions/mean_length": 3044.5, "completions/mean_terminated_length": 3044.5, "completions/min_length": 2181.0, "completions/min_terminated_length": 2181.0, "epoch": 0.5886524822695035, "frac_reward_zero_std": 0.0, "grad_norm": 0.26197847723960876, "kl": 0.0, "learning_rate": 4.246031746031746e-06, "loss": -0.0, "num_tokens": 2652753.0, "reward": 7.354349613189697, "reward_std": 0.9007920622825623, "rewards/reward_model_wrapper/mean": 7.354349613189697, "rewards/reward_model_wrapper/std": 0.900792121887207, "step": 166 }, { "completion_length": 2385.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2823.0, "completions/max_terminated_length": 2823.0, "completions/mean_length": 2385.5, "completions/mean_terminated_length": 2385.5, "completions/min_length": 1973.0, "completions/min_terminated_length": 1973.0, "epoch": 0.5921985815602837, "frac_reward_zero_std": 0.0, "grad_norm": 1.3769402503967285, "kl": 0.0, "learning_rate": 4.238095238095239e-06, "loss": -0.0, "num_tokens": 2673227.0, "reward": 5.75, "reward_std": 4.174925327301025, "rewards/reward_model_wrapper/mean": 5.75, "rewards/reward_model_wrapper/std": 4.174925327301025, "step": 167 }, { "completion_length": 2343.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2652.0, "completions/max_terminated_length": 2652.0, "completions/mean_length": 2343.75, "completions/mean_terminated_length": 2343.75, "completions/min_length": 2024.0, "completions/min_terminated_length": 2024.0, "epoch": 0.5957446808510638, "frac_reward_zero_std": 0.0, "grad_norm": 0.15502206981182098, "kl": 0.0, "learning_rate": 4.23015873015873e-06, "loss": -0.0, "num_tokens": 2691030.0, "reward": 6.625, "reward_std": 0.3862210214138031, "rewards/reward_model_wrapper/mean": 6.625, "rewards/reward_model_wrapper/std": 0.38622087240219116, "step": 168 }, { "completion_length": 2801.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3281.0, "completions/max_terminated_length": 3281.0, "completions/mean_length": 2801.5, "completions/mean_terminated_length": 2801.5, "completions/min_length": 2206.0, "completions/min_terminated_length": 2206.0, "epoch": 0.599290780141844, "frac_reward_zero_std": 0.0, "grad_norm": 0.26676297187805176, "kl": 0.0, "learning_rate": 4.222222222222223e-06, "loss": 0.0, "num_tokens": 2711780.0, "reward": 7.480178356170654, "reward_std": 0.8363959789276123, "rewards/reward_model_wrapper/mean": 7.480178356170654, "rewards/reward_model_wrapper/std": 0.8363959789276123, "step": 169 }, { "completion_length": 377.25, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 377.25, "completions/mean_terminated_length": 377.25, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.6028368794326241, "frac_reward_zero_std": 0.0, "grad_norm": 0.983574390411377, "kl": 0.0, "learning_rate": 4.2142857142857145e-06, "loss": 0.0, "num_tokens": 2721953.0, "reward": 8.200000762939453, "reward_std": 0.9831921458244324, "rewards/reward_model_wrapper/mean": 8.200000762939453, "rewards/reward_model_wrapper/std": 0.9831920862197876, "step": 170 }, { "completion_length": 391.75, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 391.75, "completions/mean_terminated_length": 391.75, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.6063829787234043, "frac_reward_zero_std": 0.0, "grad_norm": 8.445734024047852, "kl": 0.0, "learning_rate": 4.206349206349207e-06, "loss": -0.0, "num_tokens": 2732512.0, "reward": 6.199999809265137, "reward_std": 4.146484375, "rewards/reward_model_wrapper/mean": 6.199999809265137, "rewards/reward_model_wrapper/std": 4.146484375, "step": 171 }, { "completion_length": 2505.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2582.0, "completions/max_terminated_length": 2582.0, "completions/mean_length": 2505.0, "completions/mean_terminated_length": 2505.0, "completions/min_length": 2431.0, "completions/min_terminated_length": 2431.0, "epoch": 0.6099290780141844, "frac_reward_zero_std": 0.0, "grad_norm": 0.4046436846256256, "kl": 0.0, "learning_rate": 4.198412698412699e-06, "loss": 0.0, "num_tokens": 2751756.0, "reward": 7.406938076019287, "reward_std": 1.3873201608657837, "rewards/reward_model_wrapper/mean": 7.406938076019287, "rewards/reward_model_wrapper/std": 1.3873201608657837, "step": 172 }, { "completion_length": 2869.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3142.0, "completions/max_terminated_length": 3142.0, "completions/mean_length": 2869.75, "completions/mean_terminated_length": 2869.75, "completions/min_length": 2239.0, "completions/min_terminated_length": 2239.0, "epoch": 0.6134751773049646, "frac_reward_zero_std": 0.0, "grad_norm": 0.352485328912735, "kl": 0.0, "learning_rate": 4.190476190476191e-06, "loss": 0.0, "num_tokens": 2772219.0, "reward": 7.518502712249756, "reward_std": 1.1285885572433472, "rewards/reward_model_wrapper/mean": 7.518502712249756, "rewards/reward_model_wrapper/std": 1.1285885572433472, "step": 173 }, { "completion_length": 2158.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2430.0, "completions/max_terminated_length": 2430.0, "completions/mean_length": 2158.5, "completions/mean_terminated_length": 2158.5, "completions/min_length": 1832.0, "completions/min_terminated_length": 1832.0, "epoch": 0.6170212765957447, "frac_reward_zero_std": 0.0, "grad_norm": 0.5254389643669128, "kl": 0.0, "learning_rate": 4.182539682539683e-06, "loss": 0.0, "num_tokens": 2790069.0, "reward": 7.197629928588867, "reward_std": 1.7431215047836304, "rewards/reward_model_wrapper/mean": 7.197629928588867, "rewards/reward_model_wrapper/std": 1.7431215047836304, "step": 174 }, { "completion_length": 2206.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2605.0, "completions/max_terminated_length": 2605.0, "completions/mean_length": 2206.25, "completions/mean_terminated_length": 2206.25, "completions/min_length": 1884.0, "completions/min_terminated_length": 1884.0, "epoch": 0.6205673758865248, "frac_reward_zero_std": 0.0, "grad_norm": 0.6022999286651611, "kl": 0.0, "learning_rate": 4.174603174603175e-06, "loss": -0.0, "num_tokens": 2808782.0, "reward": 7.5173258781433105, "reward_std": 1.575560212135315, "rewards/reward_model_wrapper/mean": 7.5173258781433105, "rewards/reward_model_wrapper/std": 1.575560212135315, "step": 175 }, { "completion_length": 464.25, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 464.25, "completions/mean_terminated_length": 464.25, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 0.624113475177305, "frac_reward_zero_std": 0.0, "grad_norm": 0.30153781175613403, "kl": 0.0, "learning_rate": 4.166666666666667e-06, "loss": 0.0, "num_tokens": 2819607.0, "reward": 9.27500057220459, "reward_std": 0.3403429388999939, "rewards/reward_model_wrapper/mean": 9.27500057220459, "rewards/reward_model_wrapper/std": 0.3403429687023163, "step": 176 }, { "completion_length": 394.75, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 394.75, "completions/mean_terminated_length": 394.75, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.6276595744680851, "frac_reward_zero_std": 0.0, "grad_norm": 0.9257121086120605, "kl": 0.0, "learning_rate": 4.158730158730159e-06, "loss": -0.0, "num_tokens": 2830874.0, "reward": 8.699999809265137, "reward_std": 0.8981465101242065, "rewards/reward_model_wrapper/mean": 8.699999809265137, "rewards/reward_model_wrapper/std": 0.8981465697288513, "step": 177 }, { "completion_length": 2291.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2521.0, "completions/max_terminated_length": 2521.0, "completions/mean_length": 2291.75, "completions/mean_terminated_length": 2291.75, "completions/min_length": 2168.0, "completions/min_terminated_length": 2168.0, "epoch": 0.6312056737588653, "frac_reward_zero_std": 0.0, "grad_norm": 0.45953604578971863, "kl": 0.0, "learning_rate": 4.150793650793651e-06, "loss": 0.0, "num_tokens": 2848409.0, "reward": 7.071104049682617, "reward_std": 1.2701901197433472, "rewards/reward_model_wrapper/mean": 7.071104049682617, "rewards/reward_model_wrapper/std": 1.2701901197433472, "step": 178 }, { "completion_length": 602.25, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/max_terminated_length": 674.0, "completions/mean_length": 602.25, "completions/mean_terminated_length": 602.25, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "epoch": 0.6347517730496454, "frac_reward_zero_std": 0.0, "grad_norm": 0.5169432163238525, "kl": 0.0, "learning_rate": 4.1428571428571435e-06, "loss": -0.0, "num_tokens": 2860234.0, "reward": 9.024999618530273, "reward_std": 0.6344289779663086, "rewards/reward_model_wrapper/mean": 9.024999618530273, "rewards/reward_model_wrapper/std": 0.6344287991523743, "step": 179 }, { "completion_length": 402.5, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 402.5, "completions/mean_terminated_length": 402.5, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 0.6382978723404256, "frac_reward_zero_std": 0.0, "grad_norm": 0.2915506958961487, "kl": 0.0, "learning_rate": 4.134920634920635e-06, "loss": -0.0, "num_tokens": 2871380.0, "reward": 9.25, "reward_std": 0.3316624164581299, "rewards/reward_model_wrapper/mean": 9.25, "rewards/reward_model_wrapper/std": 0.3316623866558075, "step": 180 }, { "epoch": 0.6382978723404256, "eval_completion_length": 1742.1666666666667, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1969.0, "eval_completions/max_terminated_length": 1969.0, "eval_completions/mean_length": 1742.1666666666667, "eval_completions/mean_terminated_length": 1742.1666666666667, "eval_completions/min_length": 1561.3333333333333, "eval_completions/min_terminated_length": 1561.3333333333333, "eval_frac_reward_zero_std": 0.0, "eval_kl": 0.0, "eval_loss": 1.82764154033066e-07, "eval_num_tokens": 2871380.0, "eval_reward": 6.6593254407246905, "eval_reward_std": 2.179860273996989, "eval_rewards/reward_model_wrapper/mean": 6.6593254407246905, "eval_rewards/reward_model_wrapper/std": 2.1798601150512695, "eval_runtime": 267.8341, "eval_samples_per_second": 0.011, "eval_steps_per_second": 0.004, "step": 180 }, { "completion_length": 443.75, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 443.75, "completions/mean_terminated_length": 443.75, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "epoch": 0.6418439716312057, "frac_reward_zero_std": 0.0, "grad_norm": 0.19920244812965393, "kl": 0.0, "learning_rate": 4.126984126984127e-06, "loss": -0.0, "num_tokens": 2883307.0, "reward": 9.674999237060547, "reward_std": 0.20615528523921967, "rewards/reward_model_wrapper/mean": 9.674999237060547, "rewards/reward_model_wrapper/std": 0.20615531504154205, "step": 181 }, { "completion_length": 444.25, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 444.25, "completions/mean_terminated_length": 444.25, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "epoch": 0.6453900709219859, "frac_reward_zero_std": 0.0, "grad_norm": 0.3695997893810272, "kl": 0.0, "learning_rate": 4.119047619047619e-06, "loss": 0.0, "num_tokens": 2896020.0, "reward": 9.625, "reward_std": 0.4271998405456543, "rewards/reward_model_wrapper/mean": 9.625, "rewards/reward_model_wrapper/std": 0.4272000193595886, "step": 182 }, { "completion_length": 396.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 396.0, "completions/mean_terminated_length": 396.0, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 0.648936170212766, "frac_reward_zero_std": 0.0, "grad_norm": 0.18736840784549713, "kl": 0.0, "learning_rate": 4.111111111111111e-06, "loss": -0.0, "num_tokens": 2907248.0, "reward": 8.649999618530273, "reward_std": 0.17320546507835388, "rewards/reward_model_wrapper/mean": 8.649999618530273, "rewards/reward_model_wrapper/std": 0.17320528626441956, "step": 183 }, { "completion_length": 423.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 423.0, "completions/mean_terminated_length": 423.0, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.6524822695035462, "frac_reward_zero_std": 0.0, "grad_norm": 1.356783151626587, "kl": 0.0, "learning_rate": 4.103174603174603e-06, "loss": 0.0, "num_tokens": 2918452.0, "reward": 8.575000762939453, "reward_std": 1.195477843284607, "rewards/reward_model_wrapper/mean": 8.575000762939453, "rewards/reward_model_wrapper/std": 1.1954777240753174, "step": 184 }, { "completion_length": 399.75, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 399.75, "completions/mean_terminated_length": 399.75, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.6560283687943262, "frac_reward_zero_std": 0.0, "grad_norm": 0.5133328437805176, "kl": 0.0, "learning_rate": 4.095238095238096e-06, "loss": -0.0, "num_tokens": 2929823.0, "reward": 8.899999618530273, "reward_std": 0.5477226376533508, "rewards/reward_model_wrapper/mean": 8.899999618530273, "rewards/reward_model_wrapper/std": 0.5477226972579956, "step": 185 }, { "completion_length": 2169.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2227.0, "completions/max_terminated_length": 2227.0, "completions/mean_length": 2169.25, "completions/mean_terminated_length": 2169.25, "completions/min_length": 2111.0, "completions/min_terminated_length": 2111.0, "epoch": 0.6595744680851063, "frac_reward_zero_std": 0.0, "grad_norm": 0.5709662437438965, "kl": 0.0, "learning_rate": 4.0873015873015875e-06, "loss": 0.0, "num_tokens": 2947184.0, "reward": 5.337080955505371, "reward_std": 1.4199196100234985, "rewards/reward_model_wrapper/mean": 5.337080955505371, "rewards/reward_model_wrapper/std": 1.419919490814209, "step": 186 }, { "completion_length": 2204.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2563.0, "completions/max_terminated_length": 2563.0, "completions/mean_length": 2204.0, "completions/mean_terminated_length": 2204.0, "completions/min_length": 1709.0, "completions/min_terminated_length": 1709.0, "epoch": 0.6631205673758865, "frac_reward_zero_std": 0.0, "grad_norm": 0.12867388129234314, "kl": 0.0, "learning_rate": 4.07936507936508e-06, "loss": -0.0, "num_tokens": 2966324.0, "reward": 8.314237594604492, "reward_std": 0.4207301139831543, "rewards/reward_model_wrapper/mean": 8.314237594604492, "rewards/reward_model_wrapper/std": 0.42073023319244385, "step": 187 }, { "completion_length": 2251.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2367.0, "completions/max_terminated_length": 2367.0, "completions/mean_length": 2251.5, "completions/mean_terminated_length": 2251.5, "completions/min_length": 2151.0, "completions/min_terminated_length": 2151.0, "epoch": 0.6666666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.3119218945503235, "kl": 0.0, "learning_rate": 4.071428571428572e-06, "loss": 0.0, "num_tokens": 2985558.0, "reward": 7.891966819763184, "reward_std": 1.0035362243652344, "rewards/reward_model_wrapper/mean": 7.891966819763184, "rewards/reward_model_wrapper/std": 1.0035362243652344, "step": 188 }, { "completion_length": 439.75, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 439.75, "completions/mean_terminated_length": 439.75, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 0.6702127659574468, "frac_reward_zero_std": 0.0, "grad_norm": 0.04007119685411453, "kl": 0.0, "learning_rate": 4.063492063492064e-06, "loss": -0.0, "num_tokens": 2995753.0, "reward": 9.625, "reward_std": 0.049999553710222244, "rewards/reward_model_wrapper/mean": 9.625, "rewards/reward_model_wrapper/std": 0.04999971389770508, "step": 189 }, { "completion_length": 446.5, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 446.5, "completions/mean_terminated_length": 446.5, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 0.6737588652482269, "frac_reward_zero_std": 0.0, "grad_norm": 2.2511539459228516, "kl": 0.0, "learning_rate": 4.055555555555556e-06, "loss": 0.0, "num_tokens": 3006767.0, "reward": 7.100000381469727, "reward_std": 1.937352180480957, "rewards/reward_model_wrapper/mean": 7.100000381469727, "rewards/reward_model_wrapper/std": 1.937352180480957, "step": 190 }, { "completion_length": 2545.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3139.0, "completions/max_terminated_length": 3139.0, "completions/mean_length": 2545.25, "completions/mean_terminated_length": 2545.25, "completions/min_length": 1991.0, "completions/min_terminated_length": 1991.0, "epoch": 0.6773049645390071, "frac_reward_zero_std": 0.0, "grad_norm": 0.3206155002117157, "kl": 0.0, "learning_rate": 4.047619047619048e-06, "loss": -0.0, "num_tokens": 3025596.0, "reward": 5.511622905731201, "reward_std": 0.9387763142585754, "rewards/reward_model_wrapper/mean": 5.511622905731201, "rewards/reward_model_wrapper/std": 0.9387764930725098, "step": 191 }, { "completion_length": 416.75, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 416.75, "completions/mean_terminated_length": 416.75, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.6808510638297872, "frac_reward_zero_std": 0.0, "grad_norm": 1.0994503498077393, "kl": 0.0, "learning_rate": 4.03968253968254e-06, "loss": 0.0, "num_tokens": 3038151.0, "reward": 9.125, "reward_std": 1.0307766199111938, "rewards/reward_model_wrapper/mean": 9.125, "rewards/reward_model_wrapper/std": 1.0307763814926147, "step": 192 }, { "completion_length": 2273.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2812.0, "completions/max_terminated_length": 2812.0, "completions/mean_length": 2273.75, "completions/mean_terminated_length": 2273.75, "completions/min_length": 1848.0, "completions/min_terminated_length": 1848.0, "epoch": 0.6843971631205674, "frac_reward_zero_std": 0.0, "grad_norm": 0.8076251745223999, "kl": 0.0, "learning_rate": 4.031746031746032e-06, "loss": 0.0, "num_tokens": 3056594.0, "reward": 6.537005424499512, "reward_std": 2.0383059978485107, "rewards/reward_model_wrapper/mean": 6.537005424499512, "rewards/reward_model_wrapper/std": 2.0383059978485107, "step": 193 }, { "completion_length": 436.25, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 436.25, "completions/mean_terminated_length": 436.25, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 0.6879432624113475, "frac_reward_zero_std": 0.0, "grad_norm": 0.05064529925584793, "kl": 0.0, "learning_rate": 4.023809523809524e-06, "loss": 0.0, "num_tokens": 3068503.0, "reward": 9.550000190734863, "reward_std": 0.057735245674848557, "rewards/reward_model_wrapper/mean": 9.550000190734863, "rewards/reward_model_wrapper/std": 0.057735245674848557, "step": 194 }, { "completion_length": 2770.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2984.0, "completions/max_terminated_length": 2984.0, "completions/mean_length": 2770.25, "completions/mean_terminated_length": 2770.25, "completions/min_length": 2568.0, "completions/min_terminated_length": 2568.0, "epoch": 0.6914893617021277, "frac_reward_zero_std": 0.0, "grad_norm": 0.29459747672080994, "kl": 0.0, "learning_rate": 4.0158730158730165e-06, "loss": -0.0, "num_tokens": 3090512.0, "reward": 6.714430332183838, "reward_std": 0.9508627653121948, "rewards/reward_model_wrapper/mean": 6.714430332183838, "rewards/reward_model_wrapper/std": 0.9508628845214844, "step": 195 }, { "completion_length": 2266.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2555.0, "completions/max_terminated_length": 2555.0, "completions/mean_length": 2266.5, "completions/mean_terminated_length": 2266.5, "completions/min_length": 2012.0, "completions/min_terminated_length": 2012.0, "epoch": 0.6950354609929078, "frac_reward_zero_std": 0.0, "grad_norm": 0.42624586820602417, "kl": 0.0, "learning_rate": 4.007936507936508e-06, "loss": 0.0, "num_tokens": 3108894.0, "reward": 5.371957778930664, "reward_std": 1.4476152658462524, "rewards/reward_model_wrapper/mean": 5.371957778930664, "rewards/reward_model_wrapper/std": 1.447615385055542, "step": 196 }, { "completion_length": 2950.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3937.0, "completions/max_terminated_length": 3937.0, "completions/mean_length": 2950.25, "completions/mean_terminated_length": 2950.25, "completions/min_length": 2578.0, "completions/min_terminated_length": 2578.0, "epoch": 0.6985815602836879, "frac_reward_zero_std": 0.0, "grad_norm": 0.7941462397575378, "kl": 0.0, "learning_rate": 4.000000000000001e-06, "loss": -0.0, "num_tokens": 3129795.0, "reward": 1.0044989585876465, "reward_std": 2.008997917175293, "rewards/reward_model_wrapper/mean": 1.0044989585876465, "rewards/reward_model_wrapper/std": 2.008997917175293, "step": 197 }, { "completion_length": 3095.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3888.0, "completions/max_terminated_length": 3888.0, "completions/mean_length": 3095.25, "completions/mean_terminated_length": 3095.25, "completions/min_length": 2517.0, "completions/min_terminated_length": 2517.0, "epoch": 0.7021276595744681, "frac_reward_zero_std": 0.0, "grad_norm": 0.3224506974220276, "kl": 0.0, "learning_rate": 3.992063492063492e-06, "loss": 0.0, "num_tokens": 3152844.0, "reward": 7.919301986694336, "reward_std": 1.1731617450714111, "rewards/reward_model_wrapper/mean": 7.919301986694336, "rewards/reward_model_wrapper/std": 1.173161506652832, "step": 198 }, { "completion_length": 1723.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1753.0, "completions/max_terminated_length": 1753.0, "completions/mean_length": 1723.0, "completions/mean_terminated_length": 1723.0, "completions/min_length": 1687.0, "completions/min_terminated_length": 1687.0, "epoch": 0.7056737588652482, "frac_reward_zero_std": 0.0, "grad_norm": 1.9725698232650757, "kl": 0.0, "learning_rate": 3.984126984126984e-06, "loss": -0.0, "num_tokens": 3168216.0, "reward": 6.199999809265137, "reward_std": 4.178516387939453, "rewards/reward_model_wrapper/mean": 6.199999809265137, "rewards/reward_model_wrapper/std": 4.178516387939453, "step": 199 }, { "completion_length": 3004.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3290.0, "completions/max_terminated_length": 3290.0, "completions/mean_length": 3004.75, "completions/mean_terminated_length": 3004.75, "completions/min_length": 2707.0, "completions/min_terminated_length": 2707.0, "epoch": 0.7092198581560284, "frac_reward_zero_std": 0.0, "grad_norm": 0.26858848333358765, "kl": 0.0, "learning_rate": 3.9761904761904764e-06, "loss": -0.0, "num_tokens": 3189067.0, "reward": 6.081937789916992, "reward_std": 0.7559127807617188, "rewards/reward_model_wrapper/mean": 6.081937789916992, "rewards/reward_model_wrapper/std": 0.7559127807617188, "step": 200 }, { "epoch": 0.7092198581560284, "eval_completion_length": 1624.6666666666667, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1762.3333333333333, "eval_completions/max_terminated_length": 1762.3333333333333, "eval_completions/mean_length": 1624.6666666666667, "eval_completions/mean_terminated_length": 1624.6666666666667, "eval_completions/min_length": 1513.0, "eval_completions/min_terminated_length": 1513.0, "eval_frac_reward_zero_std": 0.0, "eval_kl": 0.0, "eval_loss": 5.256267598952036e-08, "eval_num_tokens": 3189067.0, "eval_reward": 8.021337509155273, "eval_reward_std": 0.823111762603124, "eval_rewards/reward_model_wrapper/mean": 8.021337509155273, "eval_rewards/reward_model_wrapper/std": 0.8231118122736613, "eval_runtime": 254.6428, "eval_samples_per_second": 0.012, "eval_steps_per_second": 0.004, "step": 200 }, { "completion_length": 2172.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2292.0, "completions/max_terminated_length": 2292.0, "completions/mean_length": 2172.0, "completions/mean_terminated_length": 2172.0, "completions/min_length": 1957.0, "completions/min_terminated_length": 1957.0, "epoch": 0.7127659574468085, "frac_reward_zero_std": 0.0, "grad_norm": 0.6713287830352783, "kl": 0.0, "learning_rate": 3.968253968253968e-06, "loss": -0.0, "num_tokens": 3206715.0, "reward": 5.999256610870361, "reward_std": 1.9605343341827393, "rewards/reward_model_wrapper/mean": 5.999256610870361, "rewards/reward_model_wrapper/std": 1.9605345726013184, "step": 201 }, { "completion_length": 2593.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2784.0, "completions/max_terminated_length": 2784.0, "completions/mean_length": 2593.25, "completions/mean_terminated_length": 2593.25, "completions/min_length": 2278.0, "completions/min_terminated_length": 2278.0, "epoch": 0.7163120567375887, "frac_reward_zero_std": 0.0, "grad_norm": 0.6334704160690308, "kl": 0.0, "learning_rate": 3.9603174603174606e-06, "loss": -0.0, "num_tokens": 3226084.0, "reward": 7.339071750640869, "reward_std": 2.0284106731414795, "rewards/reward_model_wrapper/mean": 7.339071750640869, "rewards/reward_model_wrapper/std": 2.0284106731414795, "step": 202 }, { "completion_length": 2229.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2783.0, "completions/max_terminated_length": 2783.0, "completions/mean_length": 2229.25, "completions/mean_terminated_length": 2229.25, "completions/min_length": 1903.0, "completions/min_terminated_length": 1903.0, "epoch": 0.7198581560283688, "frac_reward_zero_std": 0.0, "grad_norm": 0.4829287528991699, "kl": 0.0, "learning_rate": 3.952380952380952e-06, "loss": -0.0, "num_tokens": 3244245.0, "reward": 5.921452522277832, "reward_std": 1.3154124021530151, "rewards/reward_model_wrapper/mean": 5.921452522277832, "rewards/reward_model_wrapper/std": 1.3154124021530151, "step": 203 }, { "completion_length": 519.25, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 519.25, "completions/mean_terminated_length": 519.25, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 0.723404255319149, "frac_reward_zero_std": 0.0, "grad_norm": 1.829824447631836, "kl": 0.0, "learning_rate": 3.944444444444445e-06, "loss": 0.0, "num_tokens": 3254738.0, "reward": 7.175000190734863, "reward_std": 1.9551212787628174, "rewards/reward_model_wrapper/mean": 7.175000190734863, "rewards/reward_model_wrapper/std": 1.9551215171813965, "step": 204 }, { "completion_length": 2626.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3518.0, "completions/max_terminated_length": 3518.0, "completions/mean_length": 2626.75, "completions/mean_terminated_length": 2626.75, "completions/min_length": 2212.0, "completions/min_terminated_length": 2212.0, "epoch": 0.7269503546099291, "frac_reward_zero_std": 0.0, "grad_norm": 1.5580024719238281, "kl": 0.0, "learning_rate": 3.936507936507936e-06, "loss": -0.0, "num_tokens": 3273521.0, "reward": 5.818401336669922, "reward_std": 3.966958999633789, "rewards/reward_model_wrapper/mean": 5.818401336669922, "rewards/reward_model_wrapper/std": 3.966959238052368, "step": 205 }, { "completion_length": 2995.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3464.0, "completions/max_terminated_length": 3464.0, "completions/mean_length": 2995.0, "completions/mean_terminated_length": 2995.0, "completions/min_length": 2642.0, "completions/min_terminated_length": 2642.0, "epoch": 0.7304964539007093, "frac_reward_zero_std": 0.0, "grad_norm": 0.42423638701438904, "kl": 0.0, "learning_rate": 3.928571428571429e-06, "loss": -0.0, "num_tokens": 3294773.0, "reward": 6.130029678344727, "reward_std": 1.3333604335784912, "rewards/reward_model_wrapper/mean": 6.130029678344727, "rewards/reward_model_wrapper/std": 1.3333604335784912, "step": 206 }, { "completion_length": 2556.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2912.0, "completions/max_terminated_length": 2912.0, "completions/mean_length": 2556.75, "completions/mean_terminated_length": 2556.75, "completions/min_length": 2292.0, "completions/min_terminated_length": 2292.0, "epoch": 0.7340425531914894, "frac_reward_zero_std": 0.0, "grad_norm": 0.18910595774650574, "kl": 0.0, "learning_rate": 3.920634920634921e-06, "loss": 0.0, "num_tokens": 3314004.0, "reward": 7.490004539489746, "reward_std": 0.525921642780304, "rewards/reward_model_wrapper/mean": 7.490004539489746, "rewards/reward_model_wrapper/std": 0.5259217619895935, "step": 207 }, { "completion_length": 1951.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2374.0, "completions/max_terminated_length": 2374.0, "completions/mean_length": 1951.0, "completions/mean_terminated_length": 1951.0, "completions/min_length": 1449.0, "completions/min_terminated_length": 1449.0, "epoch": 0.7375886524822695, "frac_reward_zero_std": 0.0, "grad_norm": 0.28988587856292725, "kl": 0.0, "learning_rate": 3.912698412698413e-06, "loss": -0.0, "num_tokens": 3330032.0, "reward": 5.540345191955566, "reward_std": 0.7441600561141968, "rewards/reward_model_wrapper/mean": 5.540345191955566, "rewards/reward_model_wrapper/std": 0.7441601157188416, "step": 208 }, { "completion_length": 501.75, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 501.75, "completions/mean_terminated_length": 501.75, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 0.7411347517730497, "frac_reward_zero_std": 0.0, "grad_norm": 1.4627591371536255, "kl": 0.0, "learning_rate": 3.9047619047619055e-06, "loss": 0.0, "num_tokens": 3340883.0, "reward": 8.825000762939453, "reward_std": 1.7036725282669067, "rewards/reward_model_wrapper/mean": 8.825000762939453, "rewards/reward_model_wrapper/std": 1.7036724090576172, "step": 209 }, { "completion_length": 2266.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2576.0, "completions/max_terminated_length": 2576.0, "completions/mean_length": 2266.5, "completions/mean_terminated_length": 2266.5, "completions/min_length": 1854.0, "completions/min_terminated_length": 1854.0, "epoch": 0.7446808510638298, "frac_reward_zero_std": 0.0, "grad_norm": 1.1594327688217163, "kl": 0.0, "learning_rate": 3.896825396825397e-06, "loss": -0.0, "num_tokens": 3359453.0, "reward": 4.622861862182617, "reward_std": 3.4257824420928955, "rewards/reward_model_wrapper/mean": 4.622861862182617, "rewards/reward_model_wrapper/std": 3.4257826805114746, "step": 210 }, { "completion_length": 2849.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3518.0, "completions/max_terminated_length": 3518.0, "completions/mean_length": 2849.5, "completions/mean_terminated_length": 2849.5, "completions/min_length": 2498.0, "completions/min_terminated_length": 2498.0, "epoch": 0.74822695035461, "frac_reward_zero_std": 0.0, "grad_norm": 0.17068040370941162, "kl": 0.0, "learning_rate": 3.88888888888889e-06, "loss": 0.0, "num_tokens": 3380831.0, "reward": 5.320184707641602, "reward_std": 0.49733904004096985, "rewards/reward_model_wrapper/mean": 5.320184707641602, "rewards/reward_model_wrapper/std": 0.49733904004096985, "step": 211 }, { "completion_length": 2342.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2654.0, "completions/max_terminated_length": 2654.0, "completions/mean_length": 2342.25, "completions/mean_terminated_length": 2342.25, "completions/min_length": 1999.0, "completions/min_terminated_length": 1999.0, "epoch": 0.75177304964539, "frac_reward_zero_std": 0.0, "grad_norm": 0.19651064276695251, "kl": 0.0, "learning_rate": 3.880952380952381e-06, "loss": 0.0, "num_tokens": 3399800.0, "reward": 8.825199127197266, "reward_std": 0.7178740501403809, "rewards/reward_model_wrapper/mean": 8.825199127197266, "rewards/reward_model_wrapper/std": 0.7178743481636047, "step": 212 }, { "completion_length": 2504.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2941.0, "completions/max_terminated_length": 2941.0, "completions/mean_length": 2504.0, "completions/mean_terminated_length": 2504.0, "completions/min_length": 2107.0, "completions/min_terminated_length": 2107.0, "epoch": 0.7553191489361702, "frac_reward_zero_std": 0.0, "grad_norm": 0.1505124419927597, "kl": 0.0, "learning_rate": 3.873015873015874e-06, "loss": 0.0, "num_tokens": 3418996.0, "reward": 6.223034858703613, "reward_std": 0.4169750213623047, "rewards/reward_model_wrapper/mean": 6.223034858703613, "rewards/reward_model_wrapper/std": 0.4169751703739166, "step": 213 }, { "completion_length": 466.5, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 466.5, "completions/mean_terminated_length": 466.5, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 0.7588652482269503, "frac_reward_zero_std": 0.0, "grad_norm": 0.07978856563568115, "kl": 0.0, "learning_rate": 3.865079365079365e-06, "loss": -0.0, "num_tokens": 3429430.0, "reward": 9.424999237060547, "reward_std": 0.09574273973703384, "rewards/reward_model_wrapper/mean": 9.424999237060547, "rewards/reward_model_wrapper/std": 0.09574265778064728, "step": 214 }, { "completion_length": 580.75, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 580.75, "completions/mean_terminated_length": 580.75, "completions/min_length": 541.0, "completions/min_terminated_length": 541.0, "epoch": 0.7624113475177305, "frac_reward_zero_std": 0.0, "grad_norm": 0.4256956875324249, "kl": 0.0, "learning_rate": 3.857142857142858e-06, "loss": -0.0, "num_tokens": 3441349.0, "reward": 8.299999237060547, "reward_std": 0.5830946564674377, "rewards/reward_model_wrapper/mean": 8.299999237060547, "rewards/reward_model_wrapper/std": 0.5830948948860168, "step": 215 }, { "completion_length": 2314.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2753.0, "completions/max_terminated_length": 2753.0, "completions/mean_length": 2314.5, "completions/mean_terminated_length": 2314.5, "completions/min_length": 1951.0, "completions/min_terminated_length": 1951.0, "epoch": 0.7659574468085106, "frac_reward_zero_std": 0.0, "grad_norm": 0.3771781921386719, "kl": 0.0, "learning_rate": 3.8492063492063495e-06, "loss": -0.0, "num_tokens": 3460251.0, "reward": 5.869539260864258, "reward_std": 1.0737770795822144, "rewards/reward_model_wrapper/mean": 5.869539260864258, "rewards/reward_model_wrapper/std": 1.0737770795822144, "step": 216 }, { "completion_length": 2125.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2299.0, "completions/max_terminated_length": 2299.0, "completions/mean_length": 2125.25, "completions/mean_terminated_length": 2125.25, "completions/min_length": 1941.0, "completions/min_terminated_length": 1941.0, "epoch": 0.7695035460992907, "frac_reward_zero_std": 0.0, "grad_norm": 0.23137274384498596, "kl": 0.0, "learning_rate": 3.841269841269842e-06, "loss": -0.0, "num_tokens": 3478324.0, "reward": 7.833771705627441, "reward_std": 0.8009502291679382, "rewards/reward_model_wrapper/mean": 7.833771705627441, "rewards/reward_model_wrapper/std": 0.800950288772583, "step": 217 }, { "completion_length": 2377.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2705.0, "completions/max_terminated_length": 2705.0, "completions/mean_length": 2377.25, "completions/mean_terminated_length": 2377.25, "completions/min_length": 1974.0, "completions/min_terminated_length": 1974.0, "epoch": 0.7730496453900709, "frac_reward_zero_std": 0.0, "grad_norm": 0.38140520453453064, "kl": 0.0, "learning_rate": 3.833333333333334e-06, "loss": 0.0, "num_tokens": 3497925.0, "reward": 8.141302108764648, "reward_std": 1.1214826107025146, "rewards/reward_model_wrapper/mean": 8.141302108764648, "rewards/reward_model_wrapper/std": 1.1214826107025146, "step": 218 }, { "completion_length": 461.5, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 461.5, "completions/mean_terminated_length": 461.5, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 0.776595744680851, "frac_reward_zero_std": 0.0, "grad_norm": 0.9963136911392212, "kl": 0.0, "learning_rate": 3.825396825396825e-06, "loss": -0.0, "num_tokens": 3509387.0, "reward": 8.674999237060547, "reward_std": 1.081279993057251, "rewards/reward_model_wrapper/mean": 8.674999237060547, "rewards/reward_model_wrapper/std": 1.0812801122665405, "step": 219 }, { "completion_length": 2151.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2411.0, "completions/max_terminated_length": 2411.0, "completions/mean_length": 2151.25, "completions/mean_terminated_length": 2151.25, "completions/min_length": 1867.0, "completions/min_terminated_length": 1867.0, "epoch": 0.7801418439716312, "frac_reward_zero_std": 0.0, "grad_norm": 0.3634147346019745, "kl": 0.0, "learning_rate": 3.817460317460318e-06, "loss": -0.0, "num_tokens": 3526636.0, "reward": 7.816840171813965, "reward_std": 0.972815990447998, "rewards/reward_model_wrapper/mean": 7.816840171813965, "rewards/reward_model_wrapper/std": 0.9728158116340637, "step": 220 }, { "epoch": 0.7801418439716312, "eval_completion_length": 1700.5, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1929.3333333333333, "eval_completions/max_terminated_length": 1929.3333333333333, "eval_completions/mean_length": 1700.5, "eval_completions/mean_terminated_length": 1700.5, "eval_completions/min_length": 1465.6666666666667, "eval_completions/min_terminated_length": 1465.6666666666667, "eval_frac_reward_zero_std": 0.0, "eval_kl": 0.0, "eval_loss": 1.60025280138143e-07, "eval_num_tokens": 3526636.0, "eval_reward": 8.269582271575928, "eval_reward_std": 0.7187795241673788, "eval_rewards/reward_model_wrapper/mean": 8.269582271575928, "eval_rewards/reward_model_wrapper/std": 0.7187795639038086, "eval_runtime": 270.3263, "eval_samples_per_second": 0.011, "eval_steps_per_second": 0.004, "step": 220 }, { "completion_length": 2181.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2421.0, "completions/max_terminated_length": 2421.0, "completions/mean_length": 2181.5, "completions/mean_terminated_length": 2181.5, "completions/min_length": 1920.0, "completions/min_terminated_length": 1920.0, "epoch": 0.7836879432624113, "frac_reward_zero_std": 0.0, "grad_norm": 0.2662002742290497, "kl": 0.0, "learning_rate": 3.80952380952381e-06, "loss": -0.0, "num_tokens": 3545498.0, "reward": 8.167633056640625, "reward_std": 0.7070407867431641, "rewards/reward_model_wrapper/mean": 8.167633056640625, "rewards/reward_model_wrapper/std": 0.7070406079292297, "step": 221 }, { "completion_length": 2464.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3024.0, "completions/max_terminated_length": 3024.0, "completions/mean_length": 2464.5, "completions/mean_terminated_length": 2464.5, "completions/min_length": 1985.0, "completions/min_terminated_length": 1985.0, "epoch": 0.7872340425531915, "frac_reward_zero_std": 0.0, "grad_norm": 0.5100277066230774, "kl": 0.0, "learning_rate": 3.801587301587302e-06, "loss": 0.0, "num_tokens": 3565240.0, "reward": 7.073288917541504, "reward_std": 1.5862873792648315, "rewards/reward_model_wrapper/mean": 7.073288917541504, "rewards/reward_model_wrapper/std": 1.586287260055542, "step": 222 }, { "completion_length": 2444.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3531.0, "completions/max_terminated_length": 3531.0, "completions/mean_length": 2444.75, "completions/mean_terminated_length": 2444.75, "completions/min_length": 1846.0, "completions/min_terminated_length": 1846.0, "epoch": 0.7907801418439716, "frac_reward_zero_std": 0.0, "grad_norm": 0.4494321942329407, "kl": 0.0, "learning_rate": 3.793650793650794e-06, "loss": 0.0, "num_tokens": 3584095.0, "reward": 7.215969085693359, "reward_std": 0.8860815167427063, "rewards/reward_model_wrapper/mean": 7.215969085693359, "rewards/reward_model_wrapper/std": 0.8860815167427063, "step": 223 }, { "completion_length": 437.5, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 437.5, "completions/mean_terminated_length": 437.5, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 0.7943262411347518, "frac_reward_zero_std": 0.0, "grad_norm": 0.9318215250968933, "kl": 0.0, "learning_rate": 3.785714285714286e-06, "loss": -0.0, "num_tokens": 3596965.0, "reward": 8.174999237060547, "reward_std": 0.8770217299461365, "rewards/reward_model_wrapper/mean": 8.174999237060547, "rewards/reward_model_wrapper/std": 0.8770217299461365, "step": 224 }, { "completion_length": 442.5, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 442.5, "completions/mean_terminated_length": 442.5, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 0.7978723404255319, "frac_reward_zero_std": 0.0, "grad_norm": 0.5678954720497131, "kl": 0.0, "learning_rate": 3.777777777777778e-06, "loss": -0.0, "num_tokens": 3607295.0, "reward": 9.074999809265137, "reward_std": 0.6184656620025635, "rewards/reward_model_wrapper/mean": 9.074999809265137, "rewards/reward_model_wrapper/std": 0.6184658408164978, "step": 225 }, { "completion_length": 2560.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2894.0, "completions/max_terminated_length": 2894.0, "completions/mean_length": 2560.75, "completions/mean_terminated_length": 2560.75, "completions/min_length": 2129.0, "completions/min_terminated_length": 2129.0, "epoch": 0.8014184397163121, "frac_reward_zero_std": 0.0, "grad_norm": 0.3085116744041443, "kl": 0.0, "learning_rate": 3.76984126984127e-06, "loss": 0.0, "num_tokens": 3627750.0, "reward": 7.259758949279785, "reward_std": 0.9428579211235046, "rewards/reward_model_wrapper/mean": 7.259758949279785, "rewards/reward_model_wrapper/std": 0.9428579807281494, "step": 226 }, { "completion_length": 394.5, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 394.5, "completions/mean_terminated_length": 394.5, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.8049645390070922, "frac_reward_zero_std": 0.0, "grad_norm": 4.370586395263672, "kl": 0.0, "learning_rate": 3.761904761904762e-06, "loss": -0.0, "num_tokens": 3637888.0, "reward": 7.724999904632568, "reward_std": 2.7825348377227783, "rewards/reward_model_wrapper/mean": 7.724999904632568, "rewards/reward_model_wrapper/std": 2.7825350761413574, "step": 227 }, { "completion_length": 2683.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3206.0, "completions/max_terminated_length": 3206.0, "completions/mean_length": 2683.0, "completions/mean_terminated_length": 2683.0, "completions/min_length": 2177.0, "completions/min_terminated_length": 2177.0, "epoch": 0.8085106382978723, "frac_reward_zero_std": 0.0, "grad_norm": 0.36320358514785767, "kl": 0.0, "learning_rate": 3.7539682539682543e-06, "loss": 0.0, "num_tokens": 3657540.0, "reward": 6.841793060302734, "reward_std": 1.0341066122055054, "rewards/reward_model_wrapper/mean": 6.841793060302734, "rewards/reward_model_wrapper/std": 1.034106731414795, "step": 228 }, { "completion_length": 2131.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2304.0, "completions/max_terminated_length": 2304.0, "completions/mean_length": 2131.5, "completions/mean_terminated_length": 2131.5, "completions/min_length": 1951.0, "completions/min_terminated_length": 1951.0, "epoch": 0.8120567375886525, "frac_reward_zero_std": 0.0, "grad_norm": 0.2548607587814331, "kl": 0.0, "learning_rate": 3.7460317460317463e-06, "loss": 0.0, "num_tokens": 3675994.0, "reward": 8.199789047241211, "reward_std": 0.735230028629303, "rewards/reward_model_wrapper/mean": 8.199789047241211, "rewards/reward_model_wrapper/std": 0.7352300882339478, "step": 229 }, { "completion_length": 2488.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2793.0, "completions/max_terminated_length": 2793.0, "completions/mean_length": 2488.5, "completions/mean_terminated_length": 2488.5, "completions/min_length": 2298.0, "completions/min_terminated_length": 2298.0, "epoch": 0.8156028368794326, "frac_reward_zero_std": 0.0, "grad_norm": 0.23277688026428223, "kl": 0.0, "learning_rate": 3.7380952380952384e-06, "loss": -0.0, "num_tokens": 3695924.0, "reward": 5.918699264526367, "reward_std": 0.7628692984580994, "rewards/reward_model_wrapper/mean": 5.918699264526367, "rewards/reward_model_wrapper/std": 0.7628693580627441, "step": 230 }, { "completion_length": 1829.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2033.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1829.75, "completions/mean_terminated_length": 1829.75, "completions/min_length": 1647.0, "completions/min_terminated_length": 1647.0, "epoch": 0.8191489361702128, "frac_reward_zero_std": 0.0, "grad_norm": 0.3287937641143799, "kl": 0.0, "learning_rate": 3.7301587301587305e-06, "loss": -0.0, "num_tokens": 3712307.0, "reward": 8.149999618530273, "reward_std": 0.9327379465103149, "rewards/reward_model_wrapper/mean": 8.149999618530273, "rewards/reward_model_wrapper/std": 0.9327378273010254, "step": 231 }, { "completion_length": 2555.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2944.0, "completions/max_terminated_length": 2944.0, "completions/mean_length": 2555.75, "completions/mean_terminated_length": 2555.75, "completions/min_length": 2357.0, "completions/min_terminated_length": 2357.0, "epoch": 0.8226950354609929, "frac_reward_zero_std": 0.0, "grad_norm": 0.06587518751621246, "kl": 0.0, "learning_rate": 3.7222222222222225e-06, "loss": 0.0, "num_tokens": 3731118.0, "reward": 7.631564140319824, "reward_std": 0.21653956174850464, "rewards/reward_model_wrapper/mean": 7.631564140319824, "rewards/reward_model_wrapper/std": 0.2165394127368927, "step": 232 }, { "completion_length": 2465.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2808.0, "completions/max_terminated_length": 2808.0, "completions/mean_length": 2465.0, "completions/mean_terminated_length": 2465.0, "completions/min_length": 2005.0, "completions/min_terminated_length": 2005.0, "epoch": 0.8262411347517731, "frac_reward_zero_std": 0.0, "grad_norm": 0.21043255925178528, "kl": 0.0, "learning_rate": 3.7142857142857146e-06, "loss": -0.0, "num_tokens": 3750446.0, "reward": 8.523218154907227, "reward_std": 0.5699461698532104, "rewards/reward_model_wrapper/mean": 8.523218154907227, "rewards/reward_model_wrapper/std": 0.5699459910392761, "step": 233 }, { "completion_length": 2371.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2864.0, "completions/max_terminated_length": 2864.0, "completions/mean_length": 2371.75, "completions/mean_terminated_length": 2371.75, "completions/min_length": 1901.0, "completions/min_terminated_length": 1901.0, "epoch": 0.8297872340425532, "frac_reward_zero_std": 0.0, "grad_norm": 0.46852922439575195, "kl": 0.0, "learning_rate": 3.7063492063492067e-06, "loss": -0.0, "num_tokens": 3769701.0, "reward": 6.899999618530273, "reward_std": 1.3515422344207764, "rewards/reward_model_wrapper/mean": 6.899999618530273, "rewards/reward_model_wrapper/std": 1.351542353630066, "step": 234 }, { "completion_length": 2296.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2927.0, "completions/max_terminated_length": 2927.0, "completions/mean_length": 2296.75, "completions/mean_terminated_length": 2296.75, "completions/min_length": 1946.0, "completions/min_terminated_length": 1946.0, "epoch": 0.8333333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.7595881819725037, "kl": 0.0, "learning_rate": 3.6984126984126987e-06, "loss": 0.0, "num_tokens": 3788152.0, "reward": 7.1598711013793945, "reward_std": 2.1529154777526855, "rewards/reward_model_wrapper/mean": 7.1598711013793945, "rewards/reward_model_wrapper/std": 2.1529154777526855, "step": 235 }, { "completion_length": 2237.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2645.0, "completions/max_terminated_length": 2645.0, "completions/mean_length": 2237.25, "completions/mean_terminated_length": 2237.25, "completions/min_length": 1708.0, "completions/min_terminated_length": 1708.0, "epoch": 0.8368794326241135, "frac_reward_zero_std": 0.0, "grad_norm": 0.44726115465164185, "kl": 0.0, "learning_rate": 3.690476190476191e-06, "loss": -0.0, "num_tokens": 3805877.0, "reward": 6.849999904632568, "reward_std": 1.1818065643310547, "rewards/reward_model_wrapper/mean": 6.849999904632568, "rewards/reward_model_wrapper/std": 1.1818066835403442, "step": 236 }, { "completion_length": 2314.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2744.0, "completions/max_terminated_length": 2744.0, "completions/mean_length": 2314.0, "completions/mean_terminated_length": 2314.0, "completions/min_length": 1916.0, "completions/min_terminated_length": 1916.0, "epoch": 0.8404255319148937, "frac_reward_zero_std": 0.0, "grad_norm": 0.27029305696487427, "kl": 0.0, "learning_rate": 3.6825396825396833e-06, "loss": -0.0, "num_tokens": 3825265.0, "reward": 7.849999904632568, "reward_std": 0.7325754761695862, "rewards/reward_model_wrapper/mean": 7.849999904632568, "rewards/reward_model_wrapper/std": 0.7325754761695862, "step": 237 }, { "completion_length": 3291.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3944.0, "completions/max_terminated_length": 3944.0, "completions/mean_length": 3291.25, "completions/mean_terminated_length": 3291.25, "completions/min_length": 2831.0, "completions/min_terminated_length": 2831.0, "epoch": 0.8439716312056738, "frac_reward_zero_std": 0.0, "grad_norm": 0.3391379714012146, "kl": 0.0, "learning_rate": 3.6746031746031754e-06, "loss": 0.0, "num_tokens": 3848066.0, "reward": 6.427823543548584, "reward_std": 1.3842003345489502, "rewards/reward_model_wrapper/mean": 6.427823543548584, "rewards/reward_model_wrapper/std": 1.3842002153396606, "step": 238 }, { "completion_length": 2430.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2983.0, "completions/max_terminated_length": 2983.0, "completions/mean_length": 2430.0, "completions/mean_terminated_length": 2430.0, "completions/min_length": 2103.0, "completions/min_terminated_length": 2103.0, "epoch": 0.8475177304964538, "frac_reward_zero_std": 0.0, "grad_norm": 0.1822362095117569, "kl": 0.0, "learning_rate": 3.6666666666666666e-06, "loss": 0.0, "num_tokens": 3867150.0, "reward": 6.440606117248535, "reward_std": 0.4660487174987793, "rewards/reward_model_wrapper/mean": 6.440606117248535, "rewards/reward_model_wrapper/std": 0.46604883670806885, "step": 239 }, { "completion_length": 2250.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2382.0, "completions/max_terminated_length": 2382.0, "completions/mean_length": 2250.25, "completions/mean_terminated_length": 2250.25, "completions/min_length": 2136.0, "completions/min_terminated_length": 2136.0, "epoch": 0.851063829787234, "frac_reward_zero_std": 0.0, "grad_norm": 0.3650239109992981, "kl": 0.0, "learning_rate": 3.6587301587301586e-06, "loss": 0.0, "num_tokens": 3885383.0, "reward": 7.214082717895508, "reward_std": 1.329743504524231, "rewards/reward_model_wrapper/mean": 7.214082717895508, "rewards/reward_model_wrapper/std": 1.329743504524231, "step": 240 }, { "epoch": 0.851063829787234, "eval_completion_length": 1742.0833333333333, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1991.0, "eval_completions/max_terminated_length": 1991.0, "eval_completions/mean_length": 1742.0833333333333, "eval_completions/mean_terminated_length": 1742.0833333333333, "eval_completions/min_length": 1426.3333333333333, "eval_completions/min_terminated_length": 1426.3333333333333, "eval_frac_reward_zero_std": 0.0, "eval_kl": 0.0, "eval_loss": 1.8621513220296038e-07, "eval_num_tokens": 3885383.0, "eval_reward": 7.765734672546387, "eval_reward_std": 1.238718052705129, "eval_rewards/reward_model_wrapper/mean": 7.765734672546387, "eval_rewards/reward_model_wrapper/std": 1.2387180129686992, "eval_runtime": 277.8408, "eval_samples_per_second": 0.011, "eval_steps_per_second": 0.004, "step": 240 }, { "completion_length": 2203.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2320.0, "completions/max_terminated_length": 2320.0, "completions/mean_length": 2203.25, "completions/mean_terminated_length": 2203.25, "completions/min_length": 2013.0, "completions/min_terminated_length": 2013.0, "epoch": 0.8546099290780141, "frac_reward_zero_std": 0.0, "grad_norm": 0.2700336277484894, "kl": 0.0, "learning_rate": 3.6507936507936507e-06, "loss": -0.0, "num_tokens": 3903240.0, "reward": 7.924366474151611, "reward_std": 0.8551243543624878, "rewards/reward_model_wrapper/mean": 7.924366474151611, "rewards/reward_model_wrapper/std": 0.8551242351531982, "step": 241 }, { "completion_length": 2301.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2659.0, "completions/max_terminated_length": 2659.0, "completions/mean_length": 2301.0, "completions/mean_terminated_length": 2301.0, "completions/min_length": 2071.0, "completions/min_terminated_length": 2071.0, "epoch": 0.8581560283687943, "frac_reward_zero_std": 0.0, "grad_norm": 0.7171684503555298, "kl": 0.0, "learning_rate": 3.642857142857143e-06, "loss": 0.0, "num_tokens": 3921616.0, "reward": 6.969796180725098, "reward_std": 2.449939012527466, "rewards/reward_model_wrapper/mean": 6.969796180725098, "rewards/reward_model_wrapper/std": 2.4499387741088867, "step": 242 }, { "completion_length": 1923.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2163.0, "completions/max_terminated_length": 2163.0, "completions/mean_length": 1923.0, "completions/mean_terminated_length": 1923.0, "completions/min_length": 1809.0, "completions/min_terminated_length": 1809.0, "epoch": 0.8617021276595744, "frac_reward_zero_std": 0.0, "grad_norm": 0.3681443929672241, "kl": 0.0, "learning_rate": 3.6349206349206353e-06, "loss": 0.0, "num_tokens": 3937780.0, "reward": 7.050696849822998, "reward_std": 1.001990795135498, "rewards/reward_model_wrapper/mean": 7.050696849822998, "rewards/reward_model_wrapper/std": 1.001990795135498, "step": 243 }, { "completion_length": 481.75, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 481.75, "completions/mean_terminated_length": 481.75, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 0.8652482269503546, "frac_reward_zero_std": 0.0, "grad_norm": 0.28024864196777344, "kl": 0.0, "learning_rate": 3.6269841269841273e-06, "loss": 0.0, "num_tokens": 3948547.0, "reward": 9.175000190734863, "reward_std": 0.3304038941860199, "rewards/reward_model_wrapper/mean": 9.175000190734863, "rewards/reward_model_wrapper/std": 0.3304038643836975, "step": 244 }, { "completion_length": 1747.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2226.0, "completions/max_terminated_length": 2226.0, "completions/mean_length": 1747.0, "completions/mean_terminated_length": 1747.0, "completions/min_length": 1417.0, "completions/min_terminated_length": 1417.0, "epoch": 0.8687943262411347, "frac_reward_zero_std": 0.0, "grad_norm": 0.3701102137565613, "kl": 0.0, "learning_rate": 3.6190476190476194e-06, "loss": -0.0, "num_tokens": 3964263.0, "reward": 8.274999618530273, "reward_std": 0.9920516610145569, "rewards/reward_model_wrapper/mean": 8.274999618530273, "rewards/reward_model_wrapper/std": 0.9920517206192017, "step": 245 }, { "completion_length": 352.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 352.0, "completions/mean_terminated_length": 352.0, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.8723404255319149, "frac_reward_zero_std": 0.0, "grad_norm": 0.042087722569704056, "kl": 0.0, "learning_rate": 3.6111111111111115e-06, "loss": -0.0, "num_tokens": 3974991.0, "reward": 8.774999618530273, "reward_std": 0.05000019073486328, "rewards/reward_model_wrapper/mean": 8.774999618530273, "rewards/reward_model_wrapper/std": 0.05000019073486328, "step": 246 }, { "completion_length": 417.25, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 417.25, "completions/mean_terminated_length": 417.25, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 0.875886524822695, "frac_reward_zero_std": 0.0, "grad_norm": 2.3400096893310547, "kl": 0.0, "learning_rate": 3.6031746031746035e-06, "loss": 0.0, "num_tokens": 3985256.0, "reward": 8.875, "reward_std": 1.649999737739563, "rewards/reward_model_wrapper/mean": 8.875, "rewards/reward_model_wrapper/std": 1.6499998569488525, "step": 247 }, { "completion_length": 2763.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3755.0, "completions/max_terminated_length": 3755.0, "completions/mean_length": 2763.25, "completions/mean_terminated_length": 2763.25, "completions/min_length": 2108.0, "completions/min_terminated_length": 2108.0, "epoch": 0.8794326241134752, "frac_reward_zero_std": 0.0, "grad_norm": 0.33081990480422974, "kl": 0.0, "learning_rate": 3.5952380952380956e-06, "loss": 0.0, "num_tokens": 4005593.0, "reward": 7.138833045959473, "reward_std": 0.9288495779037476, "rewards/reward_model_wrapper/mean": 7.138833045959473, "rewards/reward_model_wrapper/std": 0.9288495182991028, "step": 248 }, { "completion_length": 2560.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3048.0, "completions/max_terminated_length": 3048.0, "completions/mean_length": 2560.75, "completions/mean_terminated_length": 2560.75, "completions/min_length": 2229.0, "completions/min_terminated_length": 2229.0, "epoch": 0.8829787234042553, "frac_reward_zero_std": 0.0, "grad_norm": 0.38322874903678894, "kl": 0.0, "learning_rate": 3.5873015873015877e-06, "loss": 0.0, "num_tokens": 4026160.0, "reward": 8.062934875488281, "reward_std": 1.2949014902114868, "rewards/reward_model_wrapper/mean": 8.062934875488281, "rewards/reward_model_wrapper/std": 1.2949014902114868, "step": 249 }, { "completion_length": 2156.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2381.0, "completions/max_terminated_length": 2381.0, "completions/mean_length": 2156.25, "completions/mean_terminated_length": 2156.25, "completions/min_length": 1985.0, "completions/min_terminated_length": 1985.0, "epoch": 0.8865248226950354, "frac_reward_zero_std": 0.0, "grad_norm": 0.5524158477783203, "kl": 0.0, "learning_rate": 3.5793650793650797e-06, "loss": 0.0, "num_tokens": 4045745.0, "reward": 7.398092269897461, "reward_std": 1.5885765552520752, "rewards/reward_model_wrapper/mean": 7.398092269897461, "rewards/reward_model_wrapper/std": 1.5885766744613647, "step": 250 }, { "completion_length": 2402.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2738.0, "completions/max_terminated_length": 2738.0, "completions/mean_length": 2402.5, "completions/mean_terminated_length": 2402.5, "completions/min_length": 2148.0, "completions/min_terminated_length": 2148.0, "epoch": 0.8900709219858156, "frac_reward_zero_std": 0.0, "grad_norm": 0.16791951656341553, "kl": 0.0, "learning_rate": 3.5714285714285718e-06, "loss": -0.0, "num_tokens": 4064975.0, "reward": 8.131983757019043, "reward_std": 0.6082563400268555, "rewards/reward_model_wrapper/mean": 8.131983757019043, "rewards/reward_model_wrapper/std": 0.6082562208175659, "step": 251 }, { "completion_length": 2740.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3692.0, "completions/max_terminated_length": 3692.0, "completions/mean_length": 2740.75, "completions/mean_terminated_length": 2740.75, "completions/min_length": 2273.0, "completions/min_terminated_length": 2273.0, "epoch": 0.8936170212765957, "frac_reward_zero_std": 0.0, "grad_norm": 1.0261527299880981, "kl": 0.0, "learning_rate": 3.563492063492064e-06, "loss": -0.0, "num_tokens": 4086018.0, "reward": 4.149862766265869, "reward_std": 2.9782512187957764, "rewards/reward_model_wrapper/mean": 4.149862766265869, "rewards/reward_model_wrapper/std": 2.9782514572143555, "step": 252 }, { "completion_length": 2379.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2658.0, "completions/max_terminated_length": 2658.0, "completions/mean_length": 2379.0, "completions/mean_terminated_length": 2379.0, "completions/min_length": 1982.0, "completions/min_terminated_length": 1982.0, "epoch": 0.8971631205673759, "frac_reward_zero_std": 0.0, "grad_norm": 0.066538006067276, "kl": 0.0, "learning_rate": 3.555555555555556e-06, "loss": -0.0, "num_tokens": 4104726.0, "reward": 5.6199541091918945, "reward_std": 0.19530585408210754, "rewards/reward_model_wrapper/mean": 5.6199541091918945, "rewards/reward_model_wrapper/std": 0.19530577957630157, "step": 253 }, { "completion_length": 2667.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3297.0, "completions/max_terminated_length": 3297.0, "completions/mean_length": 2667.0, "completions/mean_terminated_length": 2667.0, "completions/min_length": 2219.0, "completions/min_terminated_length": 2219.0, "epoch": 0.900709219858156, "frac_reward_zero_std": 0.0, "grad_norm": 0.503246009349823, "kl": 0.0, "learning_rate": 3.547619047619048e-06, "loss": -0.0, "num_tokens": 4125194.0, "reward": 6.623642921447754, "reward_std": 1.673454999923706, "rewards/reward_model_wrapper/mean": 6.623642921447754, "rewards/reward_model_wrapper/std": 1.6734548807144165, "step": 254 }, { "completion_length": 1986.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2085.0, "completions/max_terminated_length": 2085.0, "completions/mean_length": 1986.0, "completions/mean_terminated_length": 1986.0, "completions/min_length": 1804.0, "completions/min_terminated_length": 1804.0, "epoch": 0.9042553191489362, "frac_reward_zero_std": 0.0, "grad_norm": 0.32030004262924194, "kl": 0.0, "learning_rate": 3.53968253968254e-06, "loss": -0.0, "num_tokens": 4142470.0, "reward": 7.0, "reward_std": 0.9831922054290771, "rewards/reward_model_wrapper/mean": 7.0, "rewards/reward_model_wrapper/std": 0.9831922054290771, "step": 255 }, { "completion_length": 2150.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2731.0, "completions/max_terminated_length": 2731.0, "completions/mean_length": 2150.75, "completions/mean_terminated_length": 2150.75, "completions/min_length": 1687.0, "completions/min_terminated_length": 1687.0, "epoch": 0.9078014184397163, "frac_reward_zero_std": 0.0, "grad_norm": 0.3175884187221527, "kl": 0.0, "learning_rate": 3.531746031746032e-06, "loss": 0.0, "num_tokens": 4160617.0, "reward": 6.7586669921875, "reward_std": 0.7603438496589661, "rewards/reward_model_wrapper/mean": 6.7586669921875, "rewards/reward_model_wrapper/std": 0.7603438496589661, "step": 256 }, { "completion_length": 2174.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2409.0, "completions/max_terminated_length": 2409.0, "completions/mean_length": 2174.5, "completions/mean_terminated_length": 2174.5, "completions/min_length": 1927.0, "completions/min_terminated_length": 1927.0, "epoch": 0.9113475177304965, "frac_reward_zero_std": 0.0, "grad_norm": 0.19280906021595, "kl": 0.0, "learning_rate": 3.523809523809524e-06, "loss": 0.0, "num_tokens": 4180139.0, "reward": 8.675000190734863, "reward_std": 0.639661431312561, "rewards/reward_model_wrapper/mean": 8.675000190734863, "rewards/reward_model_wrapper/std": 0.639661431312561, "step": 257 }, { "completion_length": 2240.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2395.0, "completions/max_terminated_length": 2395.0, "completions/mean_length": 2240.5, "completions/mean_terminated_length": 2240.5, "completions/min_length": 2080.0, "completions/min_terminated_length": 2080.0, "epoch": 0.9148936170212766, "frac_reward_zero_std": 0.0, "grad_norm": 0.3732958734035492, "kl": 0.0, "learning_rate": 3.5158730158730162e-06, "loss": 0.0, "num_tokens": 4199761.0, "reward": 5.200000286102295, "reward_std": 1.2027746438980103, "rewards/reward_model_wrapper/mean": 5.200000286102295, "rewards/reward_model_wrapper/std": 1.2027746438980103, "step": 258 }, { "completion_length": 2612.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2854.0, "completions/max_terminated_length": 2854.0, "completions/mean_length": 2612.25, "completions/mean_terminated_length": 2612.25, "completions/min_length": 2268.0, "completions/min_terminated_length": 2268.0, "epoch": 0.9184397163120568, "frac_reward_zero_std": 0.0, "grad_norm": 0.3945389688014984, "kl": 0.0, "learning_rate": 3.507936507936508e-06, "loss": -0.0, "num_tokens": 4220310.0, "reward": 7.820770263671875, "reward_std": 1.3257137537002563, "rewards/reward_model_wrapper/mean": 7.820770263671875, "rewards/reward_model_wrapper/std": 1.3257135152816772, "step": 259 }, { "completion_length": 2599.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3212.0, "completions/max_terminated_length": 3212.0, "completions/mean_length": 2599.25, "completions/mean_terminated_length": 2599.25, "completions/min_length": 2191.0, "completions/min_terminated_length": 2191.0, "epoch": 0.9219858156028369, "frac_reward_zero_std": 0.0, "grad_norm": 0.45063745975494385, "kl": 0.0, "learning_rate": 3.5e-06, "loss": 0.0, "num_tokens": 4240415.0, "reward": 8.128239631652832, "reward_std": 1.3656197786331177, "rewards/reward_model_wrapper/mean": 8.128239631652832, "rewards/reward_model_wrapper/std": 1.3656197786331177, "step": 260 }, { "epoch": 0.9219858156028369, "eval_completion_length": 1784.0833333333333, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 2059.3333333333335, "eval_completions/max_terminated_length": 2059.3333333333335, "eval_completions/mean_length": 1784.0833333333333, "eval_completions/mean_terminated_length": 1784.0833333333333, "eval_completions/min_length": 1547.0, "eval_completions/min_terminated_length": 1547.0, "eval_frac_reward_zero_std": 0.0, "eval_kl": 0.0, "eval_loss": 2.83996996586211e-07, "eval_num_tokens": 4240415.0, "eval_reward": 8.205649375915527, "eval_reward_std": 1.0581791400909424, "eval_rewards/reward_model_wrapper/mean": 8.205649375915527, "eval_rewards/reward_model_wrapper/std": 1.0581791996955872, "eval_runtime": 282.2721, "eval_samples_per_second": 0.011, "eval_steps_per_second": 0.004, "step": 260 }, { "completion_length": 430.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 430.0, "completions/mean_terminated_length": 430.0, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 0.925531914893617, "frac_reward_zero_std": 0.0, "grad_norm": 1.498737096786499, "kl": 0.0, "learning_rate": 3.492063492063492e-06, "loss": -0.0, "num_tokens": 4251111.0, "reward": 9.199999809265137, "reward_std": 1.1343135833740234, "rewards/reward_model_wrapper/mean": 9.199999809265137, "rewards/reward_model_wrapper/std": 1.1343133449554443, "step": 261 }, { "completion_length": 2256.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2552.0, "completions/max_terminated_length": 2552.0, "completions/mean_length": 2256.0, "completions/mean_terminated_length": 2256.0, "completions/min_length": 1943.0, "completions/min_terminated_length": 1943.0, "epoch": 0.9290780141843972, "frac_reward_zero_std": 0.0, "grad_norm": 0.11556282639503479, "kl": 0.0, "learning_rate": 3.484126984126984e-06, "loss": 0.0, "num_tokens": 4270279.0, "reward": 8.925000190734863, "reward_std": 0.4500000476837158, "rewards/reward_model_wrapper/mean": 8.925000190734863, "rewards/reward_model_wrapper/std": 0.44999995827674866, "step": 262 }, { "completion_length": 461.5, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 461.5, "completions/mean_terminated_length": 461.5, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 0.9326241134751773, "frac_reward_zero_std": 0.0, "grad_norm": 0.33811303973197937, "kl": 0.0, "learning_rate": 3.476190476190476e-06, "loss": 0.0, "num_tokens": 4282545.0, "reward": 8.125, "reward_std": 0.3593975603580475, "rewards/reward_model_wrapper/mean": 8.125, "rewards/reward_model_wrapper/std": 0.35939761996269226, "step": 263 }, { "completion_length": 2174.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2383.0, "completions/max_terminated_length": 2383.0, "completions/mean_length": 2174.5, "completions/mean_terminated_length": 2174.5, "completions/min_length": 1837.0, "completions/min_terminated_length": 1837.0, "epoch": 0.9361702127659575, "frac_reward_zero_std": 0.0, "grad_norm": 0.3310074210166931, "kl": 0.0, "learning_rate": 3.4682539682539686e-06, "loss": 0.0, "num_tokens": 4300899.0, "reward": 6.850000381469727, "reward_std": 0.9746794700622559, "rewards/reward_model_wrapper/mean": 6.850000381469727, "rewards/reward_model_wrapper/std": 0.9746794700622559, "step": 264 }, { "completion_length": 421.75, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 421.75, "completions/mean_terminated_length": 421.75, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.9397163120567376, "frac_reward_zero_std": 0.0, "grad_norm": 1.3118574619293213, "kl": 0.0, "learning_rate": 3.4603174603174607e-06, "loss": -0.0, "num_tokens": 4313418.0, "reward": 7.75, "reward_std": 1.3428826332092285, "rewards/reward_model_wrapper/mean": 7.75, "rewards/reward_model_wrapper/std": 1.3428823947906494, "step": 265 }, { "completion_length": 399.5, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 399.5, "completions/mean_terminated_length": 399.5, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.9432624113475178, "frac_reward_zero_std": 0.0, "grad_norm": 0.7195640802383423, "kl": 0.0, "learning_rate": 3.4523809523809528e-06, "loss": -0.0, "num_tokens": 4324096.0, "reward": 8.524999618530273, "reward_std": 0.6849573254585266, "rewards/reward_model_wrapper/mean": 8.524999618530273, "rewards/reward_model_wrapper/std": 0.6849573850631714, "step": 266 }, { "completion_length": 2450.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2811.0, "completions/max_terminated_length": 2811.0, "completions/mean_length": 2450.0, "completions/mean_terminated_length": 2450.0, "completions/min_length": 2159.0, "completions/min_terminated_length": 2159.0, "epoch": 0.9468085106382979, "frac_reward_zero_std": 0.0, "grad_norm": 0.11030207574367523, "kl": 0.0, "learning_rate": 3.444444444444445e-06, "loss": -0.0, "num_tokens": 4344468.0, "reward": 7.459270000457764, "reward_std": 0.30939337611198425, "rewards/reward_model_wrapper/mean": 7.459270000457764, "rewards/reward_model_wrapper/std": 0.30939337611198425, "step": 267 }, { "completion_length": 396.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 396.0, "completions/mean_terminated_length": 396.0, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.950354609929078, "frac_reward_zero_std": 0.0, "grad_norm": 1.0424106121063232, "kl": 0.0, "learning_rate": 3.436507936507937e-06, "loss": -0.0, "num_tokens": 4355592.0, "reward": 8.924999237060547, "reward_std": 0.9499998092651367, "rewards/reward_model_wrapper/mean": 8.924999237060547, "rewards/reward_model_wrapper/std": 0.9499998092651367, "step": 268 }, { "completion_length": 398.75, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 398.75, "completions/mean_terminated_length": 398.75, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 0.9539007092198581, "frac_reward_zero_std": 0.0, "grad_norm": 0.30330970883369446, "kl": 0.0, "learning_rate": 3.428571428571429e-06, "loss": -0.0, "num_tokens": 4365331.0, "reward": 8.625, "reward_std": 0.29860779643058777, "rewards/reward_model_wrapper/mean": 8.625, "rewards/reward_model_wrapper/std": 0.29860782623291016, "step": 269 }, { "completion_length": 2222.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2305.0, "completions/max_terminated_length": 2305.0, "completions/mean_length": 2222.75, "completions/mean_terminated_length": 2222.75, "completions/min_length": 2164.0, "completions/min_terminated_length": 2164.0, "epoch": 0.9574468085106383, "frac_reward_zero_std": 0.0, "grad_norm": 0.4794080853462219, "kl": 0.0, "learning_rate": 3.420634920634921e-06, "loss": 0.0, "num_tokens": 4383434.0, "reward": 6.735689163208008, "reward_std": 1.273897409439087, "rewards/reward_model_wrapper/mean": 6.735689163208008, "rewards/reward_model_wrapper/std": 1.273897409439087, "step": 270 }, { "completion_length": 1884.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2099.0, "completions/max_terminated_length": 2099.0, "completions/mean_length": 1884.5, "completions/mean_terminated_length": 1884.5, "completions/min_length": 1743.0, "completions/min_terminated_length": 1743.0, "epoch": 0.9609929078014184, "frac_reward_zero_std": 0.0, "grad_norm": 0.1837020367383957, "kl": 0.0, "learning_rate": 3.412698412698413e-06, "loss": -0.0, "num_tokens": 4402380.0, "reward": 8.375, "reward_std": 0.5909034609794617, "rewards/reward_model_wrapper/mean": 8.375, "rewards/reward_model_wrapper/std": 0.5909034013748169, "step": 271 }, { "completion_length": 2945.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3572.0, "completions/max_terminated_length": 3572.0, "completions/mean_length": 2945.0, "completions/mean_terminated_length": 2945.0, "completions/min_length": 2595.0, "completions/min_terminated_length": 2595.0, "epoch": 0.9645390070921985, "frac_reward_zero_std": 0.0, "grad_norm": 0.10882791131734848, "kl": 0.0, "learning_rate": 3.404761904761905e-06, "loss": 0.0, "num_tokens": 4424620.0, "reward": 6.517223358154297, "reward_std": 0.3936096727848053, "rewards/reward_model_wrapper/mean": 6.517223358154297, "rewards/reward_model_wrapper/std": 0.3936096727848053, "step": 272 }, { "completion_length": 1988.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2540.0, "completions/max_terminated_length": 2540.0, "completions/mean_length": 1988.75, "completions/mean_terminated_length": 1988.75, "completions/min_length": 1694.0, "completions/min_terminated_length": 1694.0, "epoch": 0.9680851063829787, "frac_reward_zero_std": 0.0, "grad_norm": 0.5360626578330994, "kl": 0.0, "learning_rate": 3.3968253968253972e-06, "loss": -0.0, "num_tokens": 4443475.0, "reward": 7.75, "reward_std": 1.4253655672073364, "rewards/reward_model_wrapper/mean": 7.75, "rewards/reward_model_wrapper/std": 1.4253654479980469, "step": 273 }, { "completion_length": 2082.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2607.0, "completions/max_terminated_length": 2607.0, "completions/mean_length": 2082.5, "completions/mean_terminated_length": 2082.5, "completions/min_length": 1733.0, "completions/min_terminated_length": 1733.0, "epoch": 0.9716312056737588, "frac_reward_zero_std": 0.0, "grad_norm": 0.4412393569946289, "kl": 0.0, "learning_rate": 3.3888888888888893e-06, "loss": -0.0, "num_tokens": 4460165.0, "reward": 6.421736717224121, "reward_std": 1.11923086643219, "rewards/reward_model_wrapper/mean": 6.421736717224121, "rewards/reward_model_wrapper/std": 1.11923086643219, "step": 274 }, { "completion_length": 1679.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1938.0, "completions/max_terminated_length": 1938.0, "completions/mean_length": 1679.25, "completions/mean_terminated_length": 1679.25, "completions/min_length": 1498.0, "completions/min_terminated_length": 1498.0, "epoch": 0.975177304964539, "frac_reward_zero_std": 0.0, "grad_norm": 0.5789185762405396, "kl": 0.0, "learning_rate": 3.3809523809523814e-06, "loss": 0.0, "num_tokens": 4476626.0, "reward": 7.0497565269470215, "reward_std": 1.3477293252944946, "rewards/reward_model_wrapper/mean": 7.0497565269470215, "rewards/reward_model_wrapper/std": 1.3477294445037842, "step": 275 }, { "completion_length": 2249.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2561.0, "completions/max_terminated_length": 2561.0, "completions/mean_length": 2249.0, "completions/mean_terminated_length": 2249.0, "completions/min_length": 2081.0, "completions/min_terminated_length": 2081.0, "epoch": 0.9787234042553191, "frac_reward_zero_std": 0.0, "grad_norm": 0.2676711678504944, "kl": 0.0, "learning_rate": 3.3730158730158734e-06, "loss": -0.0, "num_tokens": 4495162.0, "reward": 8.524999618530273, "reward_std": 0.8539124131202698, "rewards/reward_model_wrapper/mean": 8.524999618530273, "rewards/reward_model_wrapper/std": 0.8539124131202698, "step": 276 }, { "completion_length": 2095.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2319.0, "completions/max_terminated_length": 2319.0, "completions/mean_length": 2095.25, "completions/mean_terminated_length": 2095.25, "completions/min_length": 1862.0, "completions/min_terminated_length": 1862.0, "epoch": 0.9822695035460993, "frac_reward_zero_std": 0.0, "grad_norm": 0.3678964376449585, "kl": 0.0, "learning_rate": 3.3650793650793655e-06, "loss": 0.0, "num_tokens": 4513651.0, "reward": 5.679318428039551, "reward_std": 0.9281534552574158, "rewards/reward_model_wrapper/mean": 5.679318428039551, "rewards/reward_model_wrapper/std": 0.9281535148620605, "step": 277 }, { "completion_length": 445.5, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 445.5, "completions/mean_terminated_length": 445.5, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 0.9858156028368794, "frac_reward_zero_std": 0.0, "grad_norm": 0.21169635653495789, "kl": 0.0, "learning_rate": 3.357142857142857e-06, "loss": 0.0, "num_tokens": 4524873.0, "reward": 9.175000190734863, "reward_std": 0.25, "rewards/reward_model_wrapper/mean": 9.175000190734863, "rewards/reward_model_wrapper/std": 0.25, "step": 278 }, { "completion_length": 2036.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2530.0, "completions/max_terminated_length": 2530.0, "completions/mean_length": 2036.25, "completions/mean_terminated_length": 2036.25, "completions/min_length": 1622.0, "completions/min_terminated_length": 1622.0, "epoch": 0.9893617021276596, "frac_reward_zero_std": 0.0, "grad_norm": 0.5899212956428528, "kl": 0.0, "learning_rate": 3.349206349206349e-06, "loss": 0.0, "num_tokens": 4542086.0, "reward": 5.653779983520508, "reward_std": 1.484867811203003, "rewards/reward_model_wrapper/mean": 5.653779983520508, "rewards/reward_model_wrapper/std": 1.484867811203003, "step": 279 }, { "completion_length": 2291.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2364.0, "completions/max_terminated_length": 2364.0, "completions/mean_length": 2291.75, "completions/mean_terminated_length": 2291.75, "completions/min_length": 2210.0, "completions/min_terminated_length": 2210.0, "epoch": 0.9929078014184397, "frac_reward_zero_std": 0.0, "grad_norm": 0.4923723042011261, "kl": 0.0, "learning_rate": 3.3412698412698413e-06, "loss": -0.0, "num_tokens": 4560609.0, "reward": 8.379796981811523, "reward_std": 1.5744445323944092, "rewards/reward_model_wrapper/mean": 8.379796981811523, "rewards/reward_model_wrapper/std": 1.5744445323944092, "step": 280 }, { "epoch": 0.9929078014184397, "eval_completion_length": 1579.9166666666667, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1786.3333333333333, "eval_completions/max_terminated_length": 1786.3333333333333, "eval_completions/mean_length": 1579.9166666666667, "eval_completions/mean_terminated_length": 1579.9166666666667, "eval_completions/min_length": 1379.0, "eval_completions/min_terminated_length": 1379.0, "eval_frac_reward_zero_std": 0.0, "eval_kl": 0.0, "eval_loss": 2.6844459988240033e-09, "eval_num_tokens": 4560609.0, "eval_reward": 7.866105715433757, "eval_reward_std": 0.8902884523073832, "eval_rewards/reward_model_wrapper/mean": 7.866105715433757, "eval_rewards/reward_model_wrapper/std": 0.8902884920438131, "eval_runtime": 278.0301, "eval_samples_per_second": 0.011, "eval_steps_per_second": 0.004, "step": 280 }, { "completion_length": 2375.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2881.0, "completions/max_terminated_length": 2881.0, "completions/mean_length": 2375.75, "completions/mean_terminated_length": 2375.75, "completions/min_length": 2020.0, "completions/min_terminated_length": 2020.0, "epoch": 0.9964539007092199, "frac_reward_zero_std": 0.0, "grad_norm": 0.7555868029594421, "kl": 0.0, "learning_rate": 3.3333333333333333e-06, "loss": -0.0, "num_tokens": 4580156.0, "reward": 5.947710037231445, "reward_std": 1.8693984746932983, "rewards/reward_model_wrapper/mean": 5.947710037231445, "rewards/reward_model_wrapper/std": 1.8693984746932983, "step": 281 }, { "completion_length": 377.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 377.0, "completions/mean_terminated_length": 377.0, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 1.0, "frac_reward_zero_std": 0.0, "grad_norm": 0.9177780151367188, "kl": 0.0, "learning_rate": 3.3253968253968254e-06, "loss": -0.0, "num_tokens": 4590908.0, "reward": 8.25, "reward_std": 0.9848859310150146, "rewards/reward_model_wrapper/mean": 8.25, "rewards/reward_model_wrapper/std": 0.9848859310150146, "step": 282 }, { "completion_length": 428.25, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 428.25, "completions/mean_terminated_length": 428.25, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 1.00354609929078, "frac_reward_zero_std": 0.0, "grad_norm": 0.04547256976366043, "kl": 0.0, "learning_rate": 3.3174603174603175e-06, "loss": -0.0, "num_tokens": 4601613.0, "reward": 8.625, "reward_std": 0.049999553710222244, "rewards/reward_model_wrapper/mean": 8.625, "rewards/reward_model_wrapper/std": 0.04999971389770508, "step": 283 }, { "completion_length": 425.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 425.0, "completions/mean_terminated_length": 425.0, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 1.0070921985815602, "frac_reward_zero_std": 0.0, "grad_norm": 0.12378483265638351, "kl": 0.0, "learning_rate": 3.3095238095238095e-06, "loss": 0.0, "num_tokens": 4612921.0, "reward": 9.675000190734863, "reward_std": 0.12583042681217194, "rewards/reward_model_wrapper/mean": 9.675000190734863, "rewards/reward_model_wrapper/std": 0.12583062052726746, "step": 284 }, { "completion_length": 2083.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2440.0, "completions/max_terminated_length": 2440.0, "completions/mean_length": 2083.5, "completions/mean_terminated_length": 2083.5, "completions/min_length": 1846.0, "completions/min_terminated_length": 1846.0, "epoch": 1.0106382978723405, "frac_reward_zero_std": 0.0, "grad_norm": 0.2415478229522705, "kl": 0.0, "learning_rate": 3.3015873015873016e-06, "loss": 0.0, "num_tokens": 4630723.0, "reward": 7.861741542816162, "reward_std": 0.6652541756629944, "rewards/reward_model_wrapper/mean": 7.861741542816162, "rewards/reward_model_wrapper/std": 0.6652541756629944, "step": 285 }, { "completion_length": 424.75, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 424.75, "completions/mean_terminated_length": 424.75, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 1.0141843971631206, "frac_reward_zero_std": 0.0, "grad_norm": 0.14524802565574646, "kl": 0.0, "learning_rate": 3.293650793650794e-06, "loss": 0.0, "num_tokens": 4643358.0, "reward": 9.725000381469727, "reward_std": 0.1499999314546585, "rewards/reward_model_wrapper/mean": 9.725000381469727, "rewards/reward_model_wrapper/std": 0.1499996781349182, "step": 286 }, { "completion_length": 2237.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2601.0, "completions/max_terminated_length": 2601.0, "completions/mean_length": 2237.75, "completions/mean_terminated_length": 2237.75, "completions/min_length": 1925.0, "completions/min_terminated_length": 1925.0, "epoch": 1.0177304964539007, "frac_reward_zero_std": 0.0, "grad_norm": 0.41686731576919556, "kl": 0.0, "learning_rate": 3.285714285714286e-06, "loss": -0.0, "num_tokens": 4660953.0, "reward": 6.248117446899414, "reward_std": 1.2846812009811401, "rewards/reward_model_wrapper/mean": 6.248117446899414, "rewards/reward_model_wrapper/std": 1.2846813201904297, "step": 287 }, { "completion_length": 476.5, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 476.5, "completions/mean_terminated_length": 476.5, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 1.0212765957446808, "frac_reward_zero_std": 0.0, "grad_norm": 1.3313820362091064, "kl": 0.0, "learning_rate": 3.277777777777778e-06, "loss": -0.0, "num_tokens": 4672843.0, "reward": 8.175000190734863, "reward_std": 1.2737739086151123, "rewards/reward_model_wrapper/mean": 8.175000190734863, "rewards/reward_model_wrapper/std": 1.2737740278244019, "step": 288 }, { "completion_length": 2604.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3184.0, "completions/max_terminated_length": 3184.0, "completions/mean_length": 2604.75, "completions/mean_terminated_length": 2604.75, "completions/min_length": 1973.0, "completions/min_terminated_length": 1973.0, "epoch": 1.024822695035461, "frac_reward_zero_std": 0.0, "grad_norm": 0.2032010555267334, "kl": 0.0, "learning_rate": 3.2698412698412703e-06, "loss": -0.0, "num_tokens": 4691630.0, "reward": 6.390228271484375, "reward_std": 0.5616719722747803, "rewards/reward_model_wrapper/mean": 6.390228271484375, "rewards/reward_model_wrapper/std": 0.561672031879425, "step": 289 }, { "completion_length": 2243.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2563.0, "completions/max_terminated_length": 2563.0, "completions/mean_length": 2243.5, "completions/mean_terminated_length": 2243.5, "completions/min_length": 1901.0, "completions/min_terminated_length": 1901.0, "epoch": 1.0283687943262412, "frac_reward_zero_std": 0.0, "grad_norm": 0.4730672240257263, "kl": 0.0, "learning_rate": 3.2619047619047623e-06, "loss": 0.0, "num_tokens": 4710144.0, "reward": 7.735315799713135, "reward_std": 1.7455083131790161, "rewards/reward_model_wrapper/mean": 7.735315799713135, "rewards/reward_model_wrapper/std": 1.7455084323883057, "step": 290 }, { "completion_length": 2243.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2668.0, "completions/max_terminated_length": 2668.0, "completions/mean_length": 2243.25, "completions/mean_terminated_length": 2243.25, "completions/min_length": 1972.0, "completions/min_terminated_length": 1972.0, "epoch": 1.0319148936170213, "frac_reward_zero_std": 0.0, "grad_norm": 1.5938162803649902, "kl": 0.0, "learning_rate": 3.2539682539682544e-06, "loss": -0.0, "num_tokens": 4727969.0, "reward": 6.275000095367432, "reward_std": 4.1883769035339355, "rewards/reward_model_wrapper/mean": 6.275000095367432, "rewards/reward_model_wrapper/std": 4.1883769035339355, "step": 291 }, { "completion_length": 2696.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2914.0, "completions/max_terminated_length": 2914.0, "completions/mean_length": 2696.0, "completions/mean_terminated_length": 2696.0, "completions/min_length": 2387.0, "completions/min_terminated_length": 2387.0, "epoch": 1.0354609929078014, "frac_reward_zero_std": 0.0, "grad_norm": 0.3788889944553375, "kl": 0.0, "learning_rate": 3.2460317460317465e-06, "loss": 0.0, "num_tokens": 4747737.0, "reward": 7.019140720367432, "reward_std": 1.4419182538986206, "rewards/reward_model_wrapper/mean": 7.019140720367432, "rewards/reward_model_wrapper/std": 1.4419183731079102, "step": 292 }, { "completion_length": 2427.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2626.0, "completions/max_terminated_length": 2626.0, "completions/mean_length": 2427.25, "completions/mean_terminated_length": 2427.25, "completions/min_length": 2196.0, "completions/min_terminated_length": 2196.0, "epoch": 1.0390070921985815, "frac_reward_zero_std": 0.0, "grad_norm": 0.3122571110725403, "kl": 0.0, "learning_rate": 3.2380952380952385e-06, "loss": -0.0, "num_tokens": 4766730.0, "reward": 6.707321643829346, "reward_std": 0.8175406455993652, "rewards/reward_model_wrapper/mean": 6.707321643829346, "rewards/reward_model_wrapper/std": 0.8175406455993652, "step": 293 }, { "completion_length": 465.5, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 465.5, "completions/mean_terminated_length": 465.5, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 1.0425531914893618, "frac_reward_zero_std": 0.0, "grad_norm": 1.5147337913513184, "kl": 0.0, "learning_rate": 3.2301587301587306e-06, "loss": -0.0, "num_tokens": 4777424.0, "reward": 8.274999618530273, "reward_std": 1.4384597539901733, "rewards/reward_model_wrapper/mean": 8.274999618530273, "rewards/reward_model_wrapper/std": 1.438459873199463, "step": 294 }, { "completion_length": 2529.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3131.0, "completions/max_terminated_length": 3131.0, "completions/mean_length": 2529.0, "completions/mean_terminated_length": 2529.0, "completions/min_length": 2281.0, "completions/min_terminated_length": 2281.0, "epoch": 1.0460992907801419, "frac_reward_zero_std": 0.0, "grad_norm": 0.49145251512527466, "kl": 0.0, "learning_rate": 3.2222222222222227e-06, "loss": 0.0, "num_tokens": 4796316.0, "reward": 5.6014509201049805, "reward_std": 1.4516032934188843, "rewards/reward_model_wrapper/mean": 5.6014509201049805, "rewards/reward_model_wrapper/std": 1.4516034126281738, "step": 295 }, { "completion_length": 2461.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2884.0, "completions/max_terminated_length": 2884.0, "completions/mean_length": 2461.5, "completions/mean_terminated_length": 2461.5, "completions/min_length": 1901.0, "completions/min_terminated_length": 1901.0, "epoch": 1.049645390070922, "frac_reward_zero_std": 0.0, "grad_norm": 0.4098215699195862, "kl": 0.0, "learning_rate": 3.2142857142857147e-06, "loss": 0.0, "num_tokens": 4816270.0, "reward": 5.590328693389893, "reward_std": 1.1708478927612305, "rewards/reward_model_wrapper/mean": 5.590328693389893, "rewards/reward_model_wrapper/std": 1.1708478927612305, "step": 296 }, { "completion_length": 1900.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2680.0, "completions/max_terminated_length": 2680.0, "completions/mean_length": 1900.5, "completions/mean_terminated_length": 1900.5, "completions/min_length": 1490.0, "completions/min_terminated_length": 1490.0, "epoch": 1.053191489361702, "frac_reward_zero_std": 0.0, "grad_norm": 0.520626425743103, "kl": 0.0, "learning_rate": 3.206349206349207e-06, "loss": 0.0, "num_tokens": 4832940.0, "reward": 6.371167182922363, "reward_std": 1.4172254800796509, "rewards/reward_model_wrapper/mean": 6.371167182922363, "rewards/reward_model_wrapper/std": 1.4172253608703613, "step": 297 }, { "completion_length": 461.75, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 461.75, "completions/mean_terminated_length": 461.75, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 1.0567375886524824, "frac_reward_zero_std": 0.0, "grad_norm": 0.4798383414745331, "kl": 0.0, "learning_rate": 3.1984126984126984e-06, "loss": -0.0, "num_tokens": 4843671.0, "reward": 8.649999618530273, "reward_std": 0.5196153521537781, "rewards/reward_model_wrapper/mean": 8.649999618530273, "rewards/reward_model_wrapper/std": 0.5196153521537781, "step": 298 }, { "completion_length": 2237.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2832.0, "completions/max_terminated_length": 2832.0, "completions/mean_length": 2237.75, "completions/mean_terminated_length": 2237.75, "completions/min_length": 2005.0, "completions/min_terminated_length": 2005.0, "epoch": 1.0602836879432624, "frac_reward_zero_std": 0.0, "grad_norm": 0.5069819092750549, "kl": 0.0, "learning_rate": 3.1904761904761905e-06, "loss": -0.0, "num_tokens": 4862390.0, "reward": 6.881079196929932, "reward_std": 1.2724530696868896, "rewards/reward_model_wrapper/mean": 6.881079196929932, "rewards/reward_model_wrapper/std": 1.2724530696868896, "step": 299 }, { "completion_length": 2700.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2808.0, "completions/max_terminated_length": 2808.0, "completions/mean_length": 2700.75, "completions/mean_terminated_length": 2700.75, "completions/min_length": 2644.0, "completions/min_terminated_length": 2644.0, "epoch": 1.0638297872340425, "frac_reward_zero_std": 0.0, "grad_norm": 0.16641661524772644, "kl": 0.0, "learning_rate": 3.1825396825396826e-06, "loss": -0.0, "num_tokens": 4882177.0, "reward": 6.25, "reward_std": 0.4654746651649475, "rewards/reward_model_wrapper/mean": 6.25, "rewards/reward_model_wrapper/std": 0.4654746651649475, "step": 300 }, { "epoch": 1.0638297872340425, "eval_completion_length": 1623.5, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1819.0, "eval_completions/max_terminated_length": 1819.0, "eval_completions/mean_length": 1623.5, "eval_completions/mean_terminated_length": 1623.5, "eval_completions/min_length": 1471.6666666666667, "eval_completions/min_terminated_length": 1471.6666666666667, "eval_frac_reward_zero_std": 0.0, "eval_kl": 0.0, "eval_loss": 2.477212603935186e-07, "eval_num_tokens": 4882177.0, "eval_reward": 8.327227592468262, "eval_reward_std": 0.7619317372639974, "eval_rewards/reward_model_wrapper/mean": 8.327227592468262, "eval_rewards/reward_model_wrapper/std": 0.7619316180547079, "eval_runtime": 257.57, "eval_samples_per_second": 0.012, "eval_steps_per_second": 0.004, "step": 300 }, { "completion_length": 508.75, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 508.75, "completions/mean_terminated_length": 508.75, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 1.0673758865248226, "frac_reward_zero_std": 0.0, "grad_norm": 1.671441912651062, "kl": 0.0, "learning_rate": 3.1746031746031746e-06, "loss": -0.0, "num_tokens": 4893924.0, "reward": 7.5, "reward_std": 1.9646884202957153, "rewards/reward_model_wrapper/mean": 7.5, "rewards/reward_model_wrapper/std": 1.9646884202957153, "step": 301 }, { "completion_length": 2401.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2715.0, "completions/max_terminated_length": 2715.0, "completions/mean_length": 2401.25, "completions/mean_terminated_length": 2401.25, "completions/min_length": 2115.0, "completions/min_terminated_length": 2115.0, "epoch": 1.070921985815603, "frac_reward_zero_std": 0.0, "grad_norm": 0.20759686827659607, "kl": 0.0, "learning_rate": 3.1666666666666667e-06, "loss": 0.0, "num_tokens": 4912721.0, "reward": 6.458129405975342, "reward_std": 0.5831118226051331, "rewards/reward_model_wrapper/mean": 6.458129405975342, "rewards/reward_model_wrapper/std": 0.5831118822097778, "step": 302 }, { "completion_length": 2333.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2581.0, "completions/max_terminated_length": 2581.0, "completions/mean_length": 2333.0, "completions/mean_terminated_length": 2333.0, "completions/min_length": 2040.0, "completions/min_terminated_length": 2040.0, "epoch": 1.074468085106383, "frac_reward_zero_std": 0.0, "grad_norm": 0.4077896475791931, "kl": 0.0, "learning_rate": 3.1587301587301588e-06, "loss": -0.0, "num_tokens": 4931697.0, "reward": 5.941802024841309, "reward_std": 1.3979264497756958, "rewards/reward_model_wrapper/mean": 5.941802024841309, "rewards/reward_model_wrapper/std": 1.3979264497756958, "step": 303 }, { "completion_length": 481.75, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 481.75, "completions/mean_terminated_length": 481.75, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 1.0780141843971631, "frac_reward_zero_std": 0.0, "grad_norm": 1.6444014310836792, "kl": 0.0, "learning_rate": 3.150793650793651e-06, "loss": 0.0, "num_tokens": 4944044.0, "reward": 7.875, "reward_std": 2.1639084815979004, "rewards/reward_model_wrapper/mean": 7.875, "rewards/reward_model_wrapper/std": 2.1639087200164795, "step": 304 }, { "completion_length": 1568.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1710.0, "completions/max_terminated_length": 1710.0, "completions/mean_length": 1568.25, "completions/mean_terminated_length": 1568.25, "completions/min_length": 1359.0, "completions/min_terminated_length": 1359.0, "epoch": 1.0815602836879432, "frac_reward_zero_std": 0.0, "grad_norm": 0.6878366470336914, "kl": 0.0, "learning_rate": 3.142857142857143e-06, "loss": -0.0, "num_tokens": 4960057.0, "reward": 7.921150207519531, "reward_std": 1.4859638214111328, "rewards/reward_model_wrapper/mean": 7.921150207519531, "rewards/reward_model_wrapper/std": 1.4859638214111328, "step": 305 }, { "completion_length": 2457.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2931.0, "completions/max_terminated_length": 2931.0, "completions/mean_length": 2457.75, "completions/mean_terminated_length": 2457.75, "completions/min_length": 2183.0, "completions/min_terminated_length": 2183.0, "epoch": 1.0851063829787233, "frac_reward_zero_std": 0.0, "grad_norm": 1.057698130607605, "kl": 0.0, "learning_rate": 3.134920634920635e-06, "loss": -0.0, "num_tokens": 4980108.0, "reward": 6.449999809265137, "reward_std": 4.340122699737549, "rewards/reward_model_wrapper/mean": 6.449999809265137, "rewards/reward_model_wrapper/std": 4.340122699737549, "step": 306 }, { "completion_length": 410.25, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 410.25, "completions/mean_terminated_length": 410.25, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 1.0886524822695036, "frac_reward_zero_std": 0.0, "grad_norm": 0.16985005140304565, "kl": 0.0, "learning_rate": 3.126984126984127e-06, "loss": -0.0, "num_tokens": 4991913.0, "reward": 9.5, "reward_std": 0.16329915821552277, "rewards/reward_model_wrapper/mean": 9.5, "rewards/reward_model_wrapper/std": 0.16329915821552277, "step": 307 }, { "completion_length": 2522.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2687.0, "completions/max_terminated_length": 2687.0, "completions/mean_length": 2522.5, "completions/mean_terminated_length": 2522.5, "completions/min_length": 2369.0, "completions/min_terminated_length": 2369.0, "epoch": 1.0921985815602837, "frac_reward_zero_std": 0.0, "grad_norm": 0.12848211824893951, "kl": 0.0, "learning_rate": 3.1190476190476195e-06, "loss": -0.0, "num_tokens": 5011939.0, "reward": 6.375, "reward_std": 0.4500000476837158, "rewards/reward_model_wrapper/mean": 6.375, "rewards/reward_model_wrapper/std": 0.4500000476837158, "step": 308 }, { "completion_length": 2072.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2354.0, "completions/max_terminated_length": 2354.0, "completions/mean_length": 2072.75, "completions/mean_terminated_length": 2072.75, "completions/min_length": 1802.0, "completions/min_terminated_length": 1802.0, "epoch": 1.0957446808510638, "frac_reward_zero_std": 0.0, "grad_norm": 0.2590576708316803, "kl": 0.0, "learning_rate": 3.1111111111111116e-06, "loss": -0.0, "num_tokens": 5028818.0, "reward": 8.524999618530273, "reward_std": 0.7331439256668091, "rewards/reward_model_wrapper/mean": 8.524999618530273, "rewards/reward_model_wrapper/std": 0.7331439256668091, "step": 309 }, { "completion_length": 3009.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3530.0, "completions/max_terminated_length": 3530.0, "completions/mean_length": 3009.5, "completions/mean_terminated_length": 3009.5, "completions/min_length": 2479.0, "completions/min_terminated_length": 2479.0, "epoch": 1.099290780141844, "frac_reward_zero_std": 0.0, "grad_norm": 0.5278863906860352, "kl": 0.0, "learning_rate": 3.1031746031746036e-06, "loss": -0.0, "num_tokens": 5050656.0, "reward": 6.700944423675537, "reward_std": 1.8621336221694946, "rewards/reward_model_wrapper/mean": 6.700944423675537, "rewards/reward_model_wrapper/std": 1.8621336221694946, "step": 310 }, { "completion_length": 2218.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2617.0, "completions/max_terminated_length": 2617.0, "completions/mean_length": 2218.25, "completions/mean_terminated_length": 2218.25, "completions/min_length": 1863.0, "completions/min_terminated_length": 1863.0, "epoch": 1.1028368794326242, "frac_reward_zero_std": 0.0, "grad_norm": 0.08763570338487625, "kl": 0.0, "learning_rate": 3.0952380952380957e-06, "loss": -0.0, "num_tokens": 5069757.0, "reward": 6.099999904632568, "reward_std": 0.2828429937362671, "rewards/reward_model_wrapper/mean": 6.099999904632568, "rewards/reward_model_wrapper/std": 0.2828429043292999, "step": 311 }, { "completion_length": 2140.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2552.0, "completions/max_terminated_length": 2552.0, "completions/mean_length": 2140.0, "completions/mean_terminated_length": 2140.0, "completions/min_length": 1816.0, "completions/min_terminated_length": 1816.0, "epoch": 1.1063829787234043, "frac_reward_zero_std": 0.0, "grad_norm": 0.4348257780075073, "kl": 0.0, "learning_rate": 3.0873015873015878e-06, "loss": -0.0, "num_tokens": 5087861.0, "reward": 7.437117576599121, "reward_std": 1.028058409690857, "rewards/reward_model_wrapper/mean": 7.437117576599121, "rewards/reward_model_wrapper/std": 1.0280582904815674, "step": 312 }, { "completion_length": 2190.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2404.0, "completions/max_terminated_length": 2404.0, "completions/mean_length": 2190.25, "completions/mean_terminated_length": 2190.25, "completions/min_length": 1945.0, "completions/min_terminated_length": 1945.0, "epoch": 1.1099290780141844, "frac_reward_zero_std": 0.0, "grad_norm": 0.22907429933547974, "kl": 0.0, "learning_rate": 3.07936507936508e-06, "loss": 0.0, "num_tokens": 5105294.0, "reward": 8.950000762939453, "reward_std": 0.7895144820213318, "rewards/reward_model_wrapper/mean": 8.950000762939453, "rewards/reward_model_wrapper/std": 0.7895146012306213, "step": 313 }, { "completion_length": 424.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 424.0, "completions/mean_terminated_length": 424.0, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 1.1134751773049645, "frac_reward_zero_std": 0.0, "grad_norm": 0.04182127118110657, "kl": 0.0, "learning_rate": 3.071428571428572e-06, "loss": -0.0, "num_tokens": 5115798.0, "reward": 9.524999618530273, "reward_std": 0.05000019073486328, "rewards/reward_model_wrapper/mean": 9.524999618530273, "rewards/reward_model_wrapper/std": 0.05000019073486328, "step": 314 }, { "completion_length": 413.5, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 413.5, "completions/mean_terminated_length": 413.5, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 1.1170212765957448, "frac_reward_zero_std": 0.0, "grad_norm": 0.3655354380607605, "kl": 0.0, "learning_rate": 3.063492063492064e-06, "loss": 0.0, "num_tokens": 5126116.0, "reward": 8.975000381469727, "reward_std": 0.3774918019771576, "rewards/reward_model_wrapper/mean": 8.975000381469727, "rewards/reward_model_wrapper/std": 0.37749183177948, "step": 315 }, { "completion_length": 2274.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2673.0, "completions/max_terminated_length": 2673.0, "completions/mean_length": 2274.75, "completions/mean_terminated_length": 2274.75, "completions/min_length": 1945.0, "completions/min_terminated_length": 1945.0, "epoch": 1.1205673758865249, "frac_reward_zero_std": 0.0, "grad_norm": 0.2596956193447113, "kl": 0.0, "learning_rate": 3.055555555555556e-06, "loss": -0.0, "num_tokens": 5145103.0, "reward": 8.215713500976562, "reward_std": 0.7091834545135498, "rewards/reward_model_wrapper/mean": 8.215713500976562, "rewards/reward_model_wrapper/std": 0.7091834545135498, "step": 316 }, { "completion_length": 2531.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2807.0, "completions/max_terminated_length": 2807.0, "completions/mean_length": 2531.75, "completions/mean_terminated_length": 2531.75, "completions/min_length": 2341.0, "completions/min_terminated_length": 2341.0, "epoch": 1.124113475177305, "frac_reward_zero_std": 0.0, "grad_norm": 0.6331945657730103, "kl": 0.0, "learning_rate": 3.047619047619048e-06, "loss": 0.0, "num_tokens": 5165274.0, "reward": 6.849999904632568, "reward_std": 1.6051998138427734, "rewards/reward_model_wrapper/mean": 6.849999904632568, "rewards/reward_model_wrapper/std": 1.6051998138427734, "step": 317 }, { "completion_length": 450.75, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 450.75, "completions/mean_terminated_length": 450.75, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 1.127659574468085, "frac_reward_zero_std": 0.0, "grad_norm": 0.18532048165798187, "kl": 0.0, "learning_rate": 3.0396825396825397e-06, "loss": -0.0, "num_tokens": 5177973.0, "reward": 9.449999809265137, "reward_std": 0.19148598611354828, "rewards/reward_model_wrapper/mean": 9.449999809265137, "rewards/reward_model_wrapper/std": 0.19148573279380798, "step": 318 }, { "completion_length": 2468.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2615.0, "completions/max_terminated_length": 2615.0, "completions/mean_length": 2468.0, "completions/mean_terminated_length": 2468.0, "completions/min_length": 2350.0, "completions/min_terminated_length": 2350.0, "epoch": 1.1312056737588652, "frac_reward_zero_std": 0.0, "grad_norm": 0.6079407334327698, "kl": 0.0, "learning_rate": 3.031746031746032e-06, "loss": -0.0, "num_tokens": 5197465.0, "reward": 7.699999809265137, "reward_std": 2.0944368839263916, "rewards/reward_model_wrapper/mean": 7.699999809265137, "rewards/reward_model_wrapper/std": 2.0944371223449707, "step": 319 }, { "completion_length": 411.5, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 411.5, "completions/mean_terminated_length": 411.5, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 1.1347517730496455, "frac_reward_zero_std": 0.0, "grad_norm": 0.08739888668060303, "kl": 0.0, "learning_rate": 3.023809523809524e-06, "loss": -0.0, "num_tokens": 5207471.0, "reward": 8.524999618530273, "reward_std": 0.0957430750131607, "rewards/reward_model_wrapper/mean": 8.524999618530273, "rewards/reward_model_wrapper/std": 0.0957430750131607, "step": 320 }, { "epoch": 1.1347517730496455, "eval_completion_length": 1735.25, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1911.0, "eval_completions/max_terminated_length": 1911.0, "eval_completions/mean_length": 1735.25, "eval_completions/mean_terminated_length": 1735.25, "eval_completions/min_length": 1635.0, "eval_completions/min_terminated_length": 1635.0, "eval_frac_reward_zero_std": 0.0, "eval_kl": 0.0, "eval_loss": 2.157198224495005e-07, "eval_num_tokens": 5207471.0, "eval_reward": 7.787632306416829, "eval_reward_std": 1.1270970702171326, "eval_rewards/reward_model_wrapper/mean": 7.787632306416829, "eval_rewards/reward_model_wrapper/std": 1.1270970900853474, "eval_runtime": 559.3938, "eval_samples_per_second": 0.005, "eval_steps_per_second": 0.002, "step": 320 } ], "logging_steps": 1, "max_steps": 700, "num_input_tokens_seen": 5207471, "num_train_epochs": 3, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }