{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9946714031971581, "eval_steps": 500, "global_step": 175, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3742.0, "completions/max_terminated_length": 3742.0, "completions/mean_length": 1092.838623046875, "completions/mean_terminated_length": 1092.838623046875, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.005683836589698046, "grad_norm": 0.13580130434100188, "kl": 0.0012049674987792969, "learning_rate": 0.0, "loss": 0.0074, "num_tokens": 954428.0, "reward": 0.6531250476837158, "reward_std": 0.16057492792606354, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_accuracy_reward/mean": 0.453125, "rewards/mcq_accuracy_reward/std": 0.49812230467796326, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5508.0, "completions/max_terminated_length": 5508.0, "completions/mean_length": 1069.8079528808594, "completions/mean_terminated_length": 1069.8079528808594, "completions/min_length": 178.75, "completions/min_terminated_length": 178.75, "epoch": 0.028419182948490232, "grad_norm": 0.14511271553818444, "kl": 0.0012257099151611328, "learning_rate": 1.111111111111111e-06, "loss": 0.002, "num_tokens": 4707030.0, "reward": 0.6582194864749908, "reward_std": 0.16218429803848267, "rewards/format_reward/mean": 0.9993489682674408, "rewards/format_reward/std": 0.018042195588350296, "rewards/mcq_accuracy_reward/mean": 0.4583333358168602, "rewards/mcq_accuracy_reward/std": 0.49666526168584824, "rewards/tag_count_reward/mean": 0.99951171875, "rewards/tag_count_reward/std": 0.013531646691262722, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4794.4, "completions/max_terminated_length": 4794.4, "completions/mean_length": 1049.7560302734375, "completions/mean_terminated_length": 1049.7560302734375, "completions/min_length": 121.8, "completions/min_terminated_length": 121.8, "epoch": 0.056838365896980464, "grad_norm": 0.1398931239536535, "kl": 0.0014247894287109375, "learning_rate": 2.5e-06, "loss": 0.004, "num_tokens": 9314069.0, "reward": 0.6792318463325501, "reward_std": 0.166354176402092, "rewards/format_reward/mean": 0.99921875, "rewards/format_reward/std": 0.01741643026471138, "rewards/mcq_accuracy_reward/mean": 0.47942708134651185, "rewards/mcq_accuracy_reward/std": 0.4984712541103363, "rewards/tag_count_reward/mean": 0.998828113079071, "rewards/tag_count_reward/std": 0.023616421967744827, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4637.0, "completions/max_terminated_length": 4637.0, "completions/mean_length": 1036.5583740234374, "completions/mean_terminated_length": 1036.5583740234374, "completions/min_length": 227.6, "completions/min_terminated_length": 227.6, "epoch": 0.0852575488454707, "grad_norm": 0.15112174245508642, "kl": 0.0023210525512695314, "learning_rate": 3.88888888888889e-06, "loss": 0.0125, "num_tokens": 13874013.0, "reward": 0.6991276741027832, "reward_std": 0.16582457721233368, "rewards/format_reward/mean": 0.9994791746139526, "rewards/format_reward/std": 0.014433756470680237, "rewards/mcq_accuracy_reward/mean": 0.49921875, "rewards/mcq_accuracy_reward/std": 0.49765779376029967, "rewards/tag_count_reward/mean": 0.999609375, "rewards/tag_count_reward/std": 0.009444911777973176, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5373.4, "completions/max_terminated_length": 5373.4, "completions/mean_length": 1068.2205932617187, "completions/mean_terminated_length": 1068.2205932617187, "completions/min_length": 213.8, "completions/min_terminated_length": 213.8, "epoch": 0.11367673179396093, "grad_norm": 0.1748099542859131, "kl": 0.005224609375, "learning_rate": 4.999499509357132e-06, "loss": -0.0021, "num_tokens": 18546748.0, "reward": 0.6684310674667359, "reward_std": 0.1836001008749008, "rewards/format_reward/mean": 0.9997395873069763, "rewards/format_reward/std": 0.007216878235340118, "rewards/mcq_accuracy_reward/mean": 0.46848957538604735, "rewards/mcq_accuracy_reward/std": 0.49776612520217894, "rewards/tag_count_reward/mean": 0.9996744751930237, "rewards/tag_count_reward/std": 0.007962546683847905, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5827.2, "completions/max_terminated_length": 5827.2, "completions/mean_length": 1014.5508178710937, "completions/mean_terminated_length": 1014.5508178710937, "completions/min_length": 239.6, "completions/min_terminated_length": 239.6, "epoch": 0.14209591474245115, "grad_norm": 0.17738027139049306, "kl": 0.00942840576171875, "learning_rate": 4.982003369106287e-06, "loss": 0.0057, "num_tokens": 23021767.0, "reward": 0.6767448544502258, "reward_std": 0.17676816284656524, "rewards/format_reward/mean": 0.9994791626930237, "rewards/format_reward/std": 0.010199552029371261, "rewards/mcq_accuracy_reward/mean": 0.4768229126930237, "rewards/mcq_accuracy_reward/std": 0.49895642399787904, "rewards/tag_count_reward/mean": 0.9997395873069763, "rewards/tag_count_reward/std": 0.003601375222206116, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5485.4, "completions/max_terminated_length": 5485.4, "completions/mean_length": 1067.72373046875, "completions/mean_terminated_length": 1067.72373046875, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.1705150976909414, "grad_norm": 0.1559592819101095, "kl": 0.01161041259765625, "learning_rate": 4.939682729058839e-06, "loss": 0.0027, "num_tokens": 27697786.0, "reward": 0.6473893880844116, "reward_std": 0.16071836650371552, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_accuracy_reward/mean": 0.44739583134651184, "rewards/mcq_accuracy_reward/std": 0.49608185291290285, "rewards/tag_count_reward/mean": 0.9999348998069764, "rewards/tag_count_reward/std": 0.0018042195588350296, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5728.2, "completions/max_terminated_length": 5728.2, "completions/mean_length": 1163.4677490234376, "completions/mean_terminated_length": 1163.4677490234376, "completions/min_length": 255.2, "completions/min_terminated_length": 255.2, "epoch": 0.1989342806394316, "grad_norm": 0.15172870795054275, "kl": 0.01438446044921875, "learning_rate": 4.872960871766826e-06, "loss": -0.0006, "num_tokens": 32744310.0, "reward": 0.6907943248748779, "reward_std": 0.1814502149820328, "rewards/format_reward/mean": 0.9997395873069763, "rewards/format_reward/std": 0.007216878235340118, "rewards/mcq_accuracy_reward/mean": 0.49088541269302366, "rewards/mcq_accuracy_reward/std": 0.4999137282371521, "rewards/tag_count_reward/mean": 0.9993489503860473, "rewards/tag_count_reward/std": 0.010171890631318093, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0002604166666666741, "completions/max_length": 5812.4, "completions/max_terminated_length": 5298.8, "completions/mean_length": 1077.8024047851563, "completions/mean_terminated_length": 1075.9230712890626, "completions/min_length": 250.6, "completions/min_terminated_length": 250.6, "epoch": 0.22735346358792186, "grad_norm": 0.1564558072390042, "kl": 0.015766143798828125, "learning_rate": 4.782505135862176e-06, "loss": 0.0012, "num_tokens": 37463479.0, "reward": 0.6952149271965027, "reward_std": 0.1775230199098587, "rewards/format_reward/mean": 0.9997395873069763, "rewards/format_reward/std": 0.007216878235340118, "rewards/mcq_accuracy_reward/mean": 0.4953125059604645, "rewards/mcq_accuracy_reward/std": 0.4984812021255493, "rewards/tag_count_reward/mean": 0.9992838621139526, "rewards/tag_count_reward/std": 0.01261383220553398, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0002604166666666741, "completions/max_length": 4943.8, "completions/max_terminated_length": 4187.4, "completions/mean_length": 992.1836181640625, "completions/mean_terminated_length": 990.3313232421875, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.2557726465364121, "grad_norm": 0.1740869664959226, "kl": 0.01738433837890625, "learning_rate": 4.669220241469573e-06, "loss": 0.0129, "num_tokens": 41846552.0, "reward": 0.6533008456230164, "reward_std": 0.17375805079936982, "rewards/format_reward/mean": 0.9994791746139526, "rewards/format_reward/std": 0.014433756470680237, "rewards/mcq_accuracy_reward/mean": 0.4533854126930237, "rewards/mcq_accuracy_reward/std": 0.49718082547187803, "rewards/tag_count_reward/mean": 0.9996744871139527, "rewards/tag_count_reward/std": 0.009021097794175148, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6464.0, "completions/max_terminated_length": 6464.0, "completions/mean_length": 1109.7568115234376, "completions/mean_terminated_length": 1109.7568115234376, "completions/min_length": 233.2, "completions/min_terminated_length": 233.2, "epoch": 0.2841918294849023, "grad_norm": 0.15589282758117567, "kl": 0.0189788818359375, "learning_rate": 4.534239241377266e-06, "loss": 0.001, "num_tokens": 46687962.0, "reward": 0.676927137374878, "reward_std": 0.17989677786827088, "rewards/format_reward/mean": 0.99921875, "rewards/format_reward/std": 0.01741643026471138, "rewards/mcq_accuracy_reward/mean": 0.47708333730697633, "rewards/mcq_accuracy_reward/std": 0.4982982099056244, "rewards/tag_count_reward/mean": 0.99921875, "rewards/tag_count_reward/std": 0.016611021012067795, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5222.0, "completions/max_terminated_length": 5222.0, "completions/mean_length": 1135.95576171875, "completions/mean_terminated_length": 1135.95576171875, "completions/min_length": 249.8, "completions/min_terminated_length": 249.8, "epoch": 0.31261101243339257, "grad_norm": 0.13369568955391997, "kl": 0.0205780029296875, "learning_rate": 4.378912188470374e-06, "loss": 0.0046, "num_tokens": 51627616.0, "reward": 0.6671224713325501, "reward_std": 0.160793936252594, "rewards/format_reward/mean": 0.9997395873069763, "rewards/format_reward/std": 0.007216878235340118, "rewards/mcq_accuracy_reward/mean": 0.4671874940395355, "rewards/mcq_accuracy_reward/std": 0.4979245364665985, "rewards/tag_count_reward/mean": 0.999609375, "rewards/tag_count_reward/std": 0.008064506202936172, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0002604166666666741, "completions/max_length": 6132.4, "completions/max_terminated_length": 5533.2, "completions/mean_length": 1087.3320434570312, "completions/mean_terminated_length": 1085.4773315429688, "completions/min_length": 263.6, "completions/min_terminated_length": 263.6, "epoch": 0.3410301953818828, "grad_norm": 0.1477789943624076, "kl": 0.020989990234375, "learning_rate": 4.204792632772754e-06, "loss": -0.0003, "num_tokens": 56378771.0, "reward": 0.6983789801597595, "reward_std": 0.17249701023101807, "rewards/format_reward/mean": 0.9997395873069763, "rewards/format_reward/std": 0.007216878235340118, "rewards/mcq_accuracy_reward/mean": 0.4984375059604645, "rewards/mcq_accuracy_reward/std": 0.4989359140396118, "rewards/tag_count_reward/mean": 0.9996744871139527, "rewards/tag_count_reward/std": 0.007507431134581566, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4786.4, "completions/max_terminated_length": 4786.4, "completions/mean_length": 1063.5450805664063, "completions/mean_terminated_length": 1063.5450805664063, "completions/min_length": 225.6, "completions/min_terminated_length": 225.6, "epoch": 0.369449378330373, "grad_norm": 0.14015972377677116, "kl": 0.0214569091796875, "learning_rate": 4.0136220831513205e-06, "loss": -0.0, "num_tokens": 61039224.0, "reward": 0.6768229961395263, "reward_std": 0.1578069359064102, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_accuracy_reward/mean": 0.47682292461395265, "rewards/mcq_accuracy_reward/std": 0.4989953935146332, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5206.6, "completions/max_terminated_length": 5206.6, "completions/mean_length": 1109.176611328125, "completions/mean_terminated_length": 1109.176611328125, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.3978685612788632, "grad_norm": 0.16531093494639468, "kl": 95.62167663574219, "learning_rate": 3.807312589093701e-06, "loss": 3.8239, "num_tokens": 65872382.0, "reward": 0.6960807919502259, "reward_std": 0.16992745101451873, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_accuracy_reward/mean": 0.49609375596046446, "rewards/mcq_accuracy_reward/std": 0.4989615023136139, "rewards/tag_count_reward/mean": 0.9998697876930237, "rewards/tag_count_reward/std": 0.0025498881936073304, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5165.0, "completions/max_terminated_length": 5165.0, "completions/mean_length": 1079.5161743164062, "completions/mean_terminated_length": 1079.5161743164062, "completions/min_length": 254.6, "completions/min_terminated_length": 254.6, "epoch": 0.42628774422735344, "grad_norm": 0.15112401029602476, "kl": 0.0230438232421875, "learning_rate": 3.5879276167728343e-06, "loss": 0.0125, "num_tokens": 70595996.0, "reward": 0.6843750596046447, "reward_std": 0.18003216683864592, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_accuracy_reward/mean": 0.484375, "rewards/mcq_accuracy_reward/std": 0.4970580399036407, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0002604166666666741, "completions/max_length": 5178.6, "completions/max_terminated_length": 4963.4, "completions/mean_length": 1061.828955078125, "completions/mean_terminated_length": 1059.984521484375, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.4547069271758437, "grad_norm": 0.15923626474140243, "kl": 0.02301025390625, "learning_rate": 3.3576614106722473e-06, "loss": 0.0028, "num_tokens": 75247475.0, "reward": 0.7145378351211548, "reward_std": 0.1841348797082901, "rewards/format_reward/mean": 0.9997395873069763, "rewards/format_reward/std": 0.007216878235340118, "rewards/mcq_accuracy_reward/mean": 0.5145833313465118, "rewards/mcq_accuracy_reward/std": 0.49900742769241335, "rewards/tag_count_reward/mean": 0.9998046875, "rewards/tag_count_reward/std": 0.005412658676505089, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5323.8, "completions/max_terminated_length": 5323.8, "completions/mean_length": 1048.9911743164062, "completions/mean_terminated_length": 1048.9911743164062, "completions/min_length": 247.6, "completions/min_terminated_length": 247.6, "epoch": 0.48312611012433393, "grad_norm": 0.16115740042560586, "kl": 0.02386932373046875, "learning_rate": 3.118817047192907e-06, "loss": 0.0024, "num_tokens": 79853433.0, "reward": 0.6648047566413879, "reward_std": 0.189658322930336, "rewards/format_reward/mean": 0.9997395873069763, "rewards/format_reward/std": 0.007216878235340118, "rewards/mcq_accuracy_reward/mean": 0.46484375596046446, "rewards/mcq_accuracy_reward/std": 0.4943417370319366, "rewards/tag_count_reward/mean": 0.9998697996139526, "rewards/tag_count_reward/std": 0.003608439117670059, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6016.2, "completions/max_terminated_length": 6016.2, "completions/mean_length": 1056.59873046875, "completions/mean_terminated_length": 1056.59873046875, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.5115452930728241, "grad_norm": 0.21552374412647782, "kl": 0.02350616455078125, "learning_rate": 2.8737833997450658e-06, "loss": 0.0042, "num_tokens": 84486748.0, "reward": 0.6861914873123169, "reward_std": 0.16900237798690795, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_accuracy_reward/mean": 0.4861979126930237, "rewards/mcq_accuracy_reward/std": 0.4990255832672119, "rewards/tag_count_reward/mean": 0.9999348998069764, "rewards/tag_count_reward/std": 0.0018042195588350296, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333482, "completions/max_length": 6964.4, "completions/max_terminated_length": 6690.8, "completions/mean_length": 1151.6203369140626, "completions/mean_terminated_length": 1142.4264404296875, "completions/min_length": 236.8, "completions/min_terminated_length": 236.8, "epoch": 0.5399644760213144, "grad_norm": 0.14156099172275863, "kl": 0.022979736328125, "learning_rate": 2.6250112457156296e-06, "loss": 0.0076, "num_tokens": 89489882.0, "reward": 0.6658268809318543, "reward_std": 0.16832855641841887, "rewards/format_reward/mean": 0.9981770873069763, "rewards/format_reward/std": 0.03182193115353584, "rewards/mcq_accuracy_reward/mean": 0.46614583134651183, "rewards/mcq_accuracy_reward/std": 0.4980164170265198, "rewards/tag_count_reward/mean": 0.9986328125, "rewards/tag_count_reward/std": 0.023866448923945426, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5372.6, "completions/max_terminated_length": 5372.6, "completions/mean_length": 1119.63466796875, "completions/mean_terminated_length": 1119.63466796875, "completions/min_length": 220.8, "completions/min_terminated_length": 220.8, "epoch": 0.5683836589698046, "grad_norm": 0.13456208913450085, "kl": 0.02239227294921875, "learning_rate": 2.374988754284371e-06, "loss": 0.0074, "num_tokens": 94365687.0, "reward": 0.6567123174667359, "reward_std": 0.1713259369134903, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_accuracy_reward/mean": 0.4567708373069763, "rewards/mcq_accuracy_reward/std": 0.4970240592956543, "rewards/tag_count_reward/mean": 0.9994140625, "rewards/tag_count_reward/std": 0.011108423396945, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0002604166666666741, "completions/max_length": 5585.0, "completions/max_terminated_length": 5486.6, "completions/mean_length": 1100.5560302734375, "completions/mean_terminated_length": 1098.734423828125, "completions/min_length": 252.2, "completions/min_terminated_length": 252.2, "epoch": 0.5968028419182948, "grad_norm": 0.11958484360378753, "kl": 0.02285919189453125, "learning_rate": 2.1262166002549346e-06, "loss": 0.004, "num_tokens": 99170286.0, "reward": 0.664778733253479, "reward_std": 0.14817258715629578, "rewards/format_reward/mean": 0.9997395873069763, "rewards/format_reward/std": 0.007216878235340118, "rewards/mcq_accuracy_reward/mean": 0.46484375596046446, "rewards/mcq_accuracy_reward/std": 0.49491878151893615, "rewards/tag_count_reward/mean": 0.9996093869209289, "rewards/tag_count_reward/std": 0.010825317353010178, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4669.0, "completions/max_terminated_length": 4669.0, "completions/mean_length": 1052.46826171875, "completions/mean_terminated_length": 1052.46826171875, "completions/min_length": 245.8, "completions/min_terminated_length": 245.8, "epoch": 0.6252220248667851, "grad_norm": 0.14676711793529598, "kl": 19608371.221609496, "learning_rate": 1.8811829528070935e-06, "loss": 778784.15, "num_tokens": 103796876.0, "reward": 0.6645313143730164, "reward_std": 0.18562877476215361, "rewards/format_reward/mean": 0.9997395873069763, "rewards/format_reward/std": 0.007216878235340118, "rewards/mcq_accuracy_reward/mean": 0.46458333134651186, "rewards/mcq_accuracy_reward/std": 0.49767143130302427, "rewards/tag_count_reward/mean": 0.9997395873069763, "rewards/tag_count_reward/std": 0.007216878235340118, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0002604166666666741, "completions/max_length": 5137.8, "completions/max_terminated_length": 4382.8, "completions/mean_length": 1032.596630859375, "completions/mean_terminated_length": 1030.736181640625, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.6536412078152753, "grad_norm": 0.1802276243424991, "kl": 0.02183837890625, "learning_rate": 1.6423385893277537e-06, "loss": 0.0108, "num_tokens": 108338703.0, "reward": 0.6494206309318542, "reward_std": 0.17672575116157532, "rewards/format_reward/mean": 0.9997395873069763, "rewards/format_reward/std": 0.007216878235340118, "rewards/mcq_accuracy_reward/mean": 0.4494791626930237, "rewards/mcq_accuracy_reward/std": 0.49625040888786315, "rewards/tag_count_reward/mean": 0.9996744751930237, "rewards/tag_count_reward/std": 0.006501290947198868, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0002604166666666741, "completions/max_length": 5606.2, "completions/max_terminated_length": 5407.0, "completions/mean_length": 1034.7218994140626, "completions/mean_terminated_length": 1032.8922119140625, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.6820603907637656, "grad_norm": 0.23206072060115124, "kl": 0.02138214111328125, "learning_rate": 1.4120723832271665e-06, "loss": 0.0068, "num_tokens": 112897891.0, "reward": 0.7249414920806885, "reward_std": 0.1745230883359909, "rewards/format_reward/mean": 0.9997395873069763, "rewards/format_reward/std": 0.007216878235340118, "rewards/mcq_accuracy_reward/mean": 0.524999988079071, "rewards/mcq_accuracy_reward/std": 0.49913292527198794, "rewards/tag_count_reward/mean": 0.9996744751930237, "rewards/tag_count_reward/std": 0.009021097794175148, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0002604166666666741, "completions/max_length": 5944.4, "completions/max_terminated_length": 5786.4, "completions/mean_length": 1096.26279296875, "completions/mean_terminated_length": 1094.3953369140625, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.7104795737122558, "grad_norm": 0.12807422733812146, "kl": 0.020963287353515624, "learning_rate": 1.1926874109063e-06, "loss": 0.0023, "num_tokens": 117693212.0, "reward": 0.6751953840255738, "reward_std": 0.17198645174503327, "rewards/format_reward/mean": 0.9997395873069763, "rewards/format_reward/std": 0.007216878235340118, "rewards/mcq_accuracy_reward/mean": 0.4752604246139526, "rewards/mcq_accuracy_reward/std": 0.49931321144104, "rewards/tag_count_reward/mean": 0.999609375, "rewards/tag_count_reward/std": 0.009444911777973176, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4637.4, "completions/max_terminated_length": 4637.4, "completions/mean_length": 1052.527099609375, "completions/mean_terminated_length": 1052.527099609375, "completions/min_length": 261.8, "completions/min_terminated_length": 261.8, "epoch": 0.738898756660746, "grad_norm": 0.1465611288853758, "kl": 0.021143341064453126, "learning_rate": 9.863779168486797e-07, "loss": 0.0095, "num_tokens": 122313876.0, "reward": 0.6882617950439454, "reward_std": 0.16661006212234497, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_accuracy_reward/mean": 0.48828125596046446, "rewards/mcq_accuracy_reward/std": 0.4994440317153931, "rewards/tag_count_reward/mean": 0.9998046875, "rewards/tag_count_reward/std": 0.004354107566177845, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5632.6, "completions/max_terminated_length": 5632.6, "completions/mean_length": 1087.99873046875, "completions/mean_terminated_length": 1087.99873046875, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.7673179396092362, "grad_norm": 0.1600247679302984, "kl": 0.019998931884765626, "learning_rate": 7.952073672272464e-07, "loss": 0.0036, "num_tokens": 127064943.0, "reward": 0.6473828911781311, "reward_std": 0.18547057807445527, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_accuracy_reward/mean": 0.4473958373069763, "rewards/mcq_accuracy_reward/std": 0.49676424264907837, "rewards/tag_count_reward/mean": 0.9998697996139526, "rewards/tag_count_reward/std": 0.003608439117670059, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333259, "completions/max_length": 6002.8, "completions/max_terminated_length": 5989.2, "completions/mean_length": 1068.9239868164063, "completions/mean_terminated_length": 1059.5496948242187, "completions/min_length": 238.2, "completions/min_terminated_length": 238.2, "epoch": 0.7957371225577264, "grad_norm": 0.14840743240817333, "kl": 0.0195098876953125, "learning_rate": 6.210878115296267e-07, "loss": -0.0015, "num_tokens": 131744587.0, "reward": 0.6677019000053406, "reward_std": 0.1643421858549118, "rewards/format_reward/mean": 0.9984375, "rewards/format_reward/std": 0.023312175273895265, "rewards/mcq_accuracy_reward/mean": 0.46796875, "rewards/mcq_accuracy_reward/std": 0.49840737581253053, "rewards/tag_count_reward/mean": 0.9988932251930237, "rewards/tag_count_reward/std": 0.01567991152405739, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5690.0, "completions/max_terminated_length": 5690.0, "completions/mean_length": 1053.6370361328125, "completions/mean_terminated_length": 1053.6370361328125, "completions/min_length": 236.4, "completions/min_terminated_length": 236.4, "epoch": 0.8241563055062167, "grad_norm": 0.16563780810558218, "kl": 0.01880035400390625, "learning_rate": 4.6576075862273445e-07, "loss": 0.0026, "num_tokens": 136367097.0, "reward": 0.6843424916267395, "reward_std": 0.17991138994693756, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_accuracy_reward/mean": 0.48437499403953554, "rewards/mcq_accuracy_reward/std": 0.4988525092601776, "rewards/tag_count_reward/mean": 0.9996744871139527, "rewards/tag_count_reward/std": 0.009021097794175148, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4469.6, "completions/max_terminated_length": 4469.6, "completions/mean_length": 1041.7265869140624, "completions/mean_terminated_length": 1041.7265869140624, "completions/min_length": 230.8, "completions/min_terminated_length": 230.8, "epoch": 0.8525754884547069, "grad_norm": 0.14692099930627384, "kl": 0.017971038818359375, "learning_rate": 3.3077975853042704e-07, "loss": 0.0001, "num_tokens": 140943367.0, "reward": 0.6906185746192932, "reward_std": 0.1725286066532135, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_accuracy_reward/mean": 0.49062501192092894, "rewards/mcq_accuracy_reward/std": 0.4994231522083282, "rewards/tag_count_reward/mean": 0.9999348998069764, "rewards/tag_count_reward/std": 0.0018042195588350296, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4531.2, "completions/max_terminated_length": 4531.2, "completions/mean_length": 1035.3060302734375, "completions/mean_terminated_length": 1035.3060302734375, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.8809946714031972, "grad_norm": 0.15437157339648844, "kl": 0.018267059326171876, "learning_rate": 2.174948641378244e-07, "loss": 0.0147, "num_tokens": 145497102.0, "reward": 0.7106380820274353, "reward_std": 0.17128031849861144, "rewards/format_reward/mean": 0.9997395873069763, "rewards/format_reward/std": 0.007216878235340118, "rewards/mcq_accuracy_reward/mean": 0.5106770873069764, "rewards/mcq_accuracy_reward/std": 0.4973214030265808, "rewards/tag_count_reward/mean": 0.9998697996139526, "rewards/tag_count_reward/std": 0.003608439117670059, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4793.6, "completions/max_terminated_length": 4793.6, "completions/mean_length": 1055.2679809570313, "completions/mean_terminated_length": 1055.2679809570313, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.9094138543516874, "grad_norm": 0.15275141096890196, "kl": 0.0176239013671875, "learning_rate": 1.27039128233174e-07, "loss": 0.0121, "num_tokens": 150127331.0, "reward": 0.7130144119262696, "reward_std": 0.18018046617507935, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_accuracy_reward/mean": 0.5130208253860473, "rewards/mcq_accuracy_reward/std": 0.49882700443267824, "rewards/tag_count_reward/mean": 0.9999348998069764, "rewards/tag_count_reward/std": 0.0018042195588350296, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0002604166666666741, "completions/max_length": 5728.6, "completions/max_terminated_length": 5411.0, "completions/mean_length": 1094.807080078125, "completions/mean_terminated_length": 1092.956884765625, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.9378330373001776, "grad_norm": 0.1354642339017401, "kl": 0.0183624267578125, "learning_rate": 6.031727094116174e-08, "loss": 0.0067, "num_tokens": 154910862.0, "reward": 0.6801628351211548, "reward_std": 0.1546088457107544, "rewards/format_reward/mean": 0.9997395873069763, "rewards/format_reward/std": 0.007216878235340118, "rewards/mcq_accuracy_reward/mean": 0.48020832538604735, "rewards/mcq_accuracy_reward/std": 0.498971688747406, "rewards/tag_count_reward/mean": 0.9998046875, "rewards/tag_count_reward/std": 0.005412658676505089, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4582.0, "completions/max_terminated_length": 4582.0, "completions/mean_length": 1007.6719116210937, "completions/mean_terminated_length": 1007.6719116210937, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.9662522202486679, "grad_norm": 0.13334679057257176, "kl": 0.018311309814453124, "learning_rate": 1.7996630893712675e-08, "loss": 0.0029, "num_tokens": 159352714.0, "reward": 0.7252474665641785, "reward_std": 0.1664626866579056, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_accuracy_reward/mean": 0.5252604126930237, "rewards/mcq_accuracy_reward/std": 0.4992336809635162, "rewards/tag_count_reward/mean": 0.9998697876930237, "rewards/tag_count_reward/std": 0.003608439117670059, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0002604166666666741, "completions/max_length": 5149.6, "completions/max_terminated_length": 4739.2, "completions/mean_length": 1035.9330444335938, "completions/mean_terminated_length": 1034.0750610351563, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.9946714031971581, "grad_norm": 0.14564780944336908, "kl": 0.018206024169921876, "learning_rate": 5.004906428685008e-10, "loss": 0.0067, "num_tokens": 163907551.0, "reward": 0.6884701490402222, "reward_std": 0.1888585329055786, "rewards/format_reward/mean": 0.9997395873069763, "rewards/format_reward/std": 0.007216878235340118, "rewards/mcq_accuracy_reward/mean": 0.48854167461395265, "rewards/mcq_accuracy_reward/std": 0.49964500069618223, "rewards/tag_count_reward/mean": 0.9995442748069763, "rewards/tag_count_reward/std": 0.012629536911845207, "step": 175 }, { "epoch": 0.9946714031971581, "step": 175, "total_flos": 0.0, "train_loss": 22251.08962759373, "train_runtime": 39419.1987, "train_samples_per_second": 0.428, "train_steps_per_second": 0.004 } ], "logging_steps": 5, "max_steps": 175, "num_input_tokens_seen": 163907551, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }