{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9946714031971581,
  "eval_steps": 500,
  "global_step": 175,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3742.0,
      "completions/max_terminated_length": 3742.0,
      "completions/mean_length": 1092.838623046875,
      "completions/mean_terminated_length": 1092.838623046875,
      "completions/min_length": 250.0,
      "completions/min_terminated_length": 250.0,
      "epoch": 0.005683836589698046,
      "grad_norm": 0.13580130434100188,
      "kl": 0.0012049674987792969,
      "learning_rate": 0.0,
      "loss": 0.0074,
      "num_tokens": 954428.0,
      "reward": 0.6531250476837158,
      "reward_std": 0.16057492792606354,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_accuracy_reward/mean": 0.453125,
      "rewards/mcq_accuracy_reward/std": 0.49812230467796326,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5508.0,
      "completions/max_terminated_length": 5508.0,
      "completions/mean_length": 1069.8079528808594,
      "completions/mean_terminated_length": 1069.8079528808594,
      "completions/min_length": 178.75,
      "completions/min_terminated_length": 178.75,
      "epoch": 0.028419182948490232,
      "grad_norm": 0.14511271553818444,
      "kl": 0.0012257099151611328,
      "learning_rate": 1.111111111111111e-06,
      "loss": 0.002,
      "num_tokens": 4707030.0,
      "reward": 0.6582194864749908,
      "reward_std": 0.16218429803848267,
      "rewards/format_reward/mean": 0.9993489682674408,
      "rewards/format_reward/std": 0.018042195588350296,
      "rewards/mcq_accuracy_reward/mean": 0.4583333358168602,
      "rewards/mcq_accuracy_reward/std": 0.49666526168584824,
      "rewards/tag_count_reward/mean": 0.99951171875,
      "rewards/tag_count_reward/std": 0.013531646691262722,
      "step": 5
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4794.4,
      "completions/max_terminated_length": 4794.4,
      "completions/mean_length": 1049.7560302734375,
      "completions/mean_terminated_length": 1049.7560302734375,
      "completions/min_length": 121.8,
      "completions/min_terminated_length": 121.8,
      "epoch": 0.056838365896980464,
      "grad_norm": 0.1398931239536535,
      "kl": 0.0014247894287109375,
      "learning_rate": 2.5e-06,
      "loss": 0.004,
      "num_tokens": 9314069.0,
      "reward": 0.6792318463325501,
      "reward_std": 0.166354176402092,
      "rewards/format_reward/mean": 0.99921875,
      "rewards/format_reward/std": 0.01741643026471138,
      "rewards/mcq_accuracy_reward/mean": 0.47942708134651185,
      "rewards/mcq_accuracy_reward/std": 0.4984712541103363,
      "rewards/tag_count_reward/mean": 0.998828113079071,
      "rewards/tag_count_reward/std": 0.023616421967744827,
      "step": 10
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4637.0,
      "completions/max_terminated_length": 4637.0,
      "completions/mean_length": 1036.5583740234374,
      "completions/mean_terminated_length": 1036.5583740234374,
      "completions/min_length": 227.6,
      "completions/min_terminated_length": 227.6,
      "epoch": 0.0852575488454707,
      "grad_norm": 0.15112174245508642,
      "kl": 0.0023210525512695314,
      "learning_rate": 3.88888888888889e-06,
      "loss": 0.0125,
      "num_tokens": 13874013.0,
      "reward": 0.6991276741027832,
      "reward_std": 0.16582457721233368,
      "rewards/format_reward/mean": 0.9994791746139526,
      "rewards/format_reward/std": 0.014433756470680237,
      "rewards/mcq_accuracy_reward/mean": 0.49921875,
      "rewards/mcq_accuracy_reward/std": 0.49765779376029967,
      "rewards/tag_count_reward/mean": 0.999609375,
      "rewards/tag_count_reward/std": 0.009444911777973176,
      "step": 15
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5373.4,
      "completions/max_terminated_length": 5373.4,
      "completions/mean_length": 1068.2205932617187,
      "completions/mean_terminated_length": 1068.2205932617187,
      "completions/min_length": 213.8,
      "completions/min_terminated_length": 213.8,
      "epoch": 0.11367673179396093,
      "grad_norm": 0.1748099542859131,
      "kl": 0.005224609375,
      "learning_rate": 4.999499509357132e-06,
      "loss": -0.0021,
      "num_tokens": 18546748.0,
      "reward": 0.6684310674667359,
      "reward_std": 0.1836001008749008,
      "rewards/format_reward/mean": 0.9997395873069763,
      "rewards/format_reward/std": 0.007216878235340118,
      "rewards/mcq_accuracy_reward/mean": 0.46848957538604735,
      "rewards/mcq_accuracy_reward/std": 0.49776612520217894,
      "rewards/tag_count_reward/mean": 0.9996744751930237,
      "rewards/tag_count_reward/std": 0.007962546683847905,
      "step": 20
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5827.2,
      "completions/max_terminated_length": 5827.2,
      "completions/mean_length": 1014.5508178710937,
      "completions/mean_terminated_length": 1014.5508178710937,
      "completions/min_length": 239.6,
      "completions/min_terminated_length": 239.6,
      "epoch": 0.14209591474245115,
      "grad_norm": 0.17738027139049306,
      "kl": 0.00942840576171875,
      "learning_rate": 4.982003369106287e-06,
      "loss": 0.0057,
      "num_tokens": 23021767.0,
      "reward": 0.6767448544502258,
      "reward_std": 0.17676816284656524,
      "rewards/format_reward/mean": 0.9994791626930237,
      "rewards/format_reward/std": 0.010199552029371261,
      "rewards/mcq_accuracy_reward/mean": 0.4768229126930237,
      "rewards/mcq_accuracy_reward/std": 0.49895642399787904,
      "rewards/tag_count_reward/mean": 0.9997395873069763,
      "rewards/tag_count_reward/std": 0.003601375222206116,
      "step": 25
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5485.4,
      "completions/max_terminated_length": 5485.4,
      "completions/mean_length": 1067.72373046875,
      "completions/mean_terminated_length": 1067.72373046875,
      "completions/min_length": 248.0,
      "completions/min_terminated_length": 248.0,
      "epoch": 0.1705150976909414,
      "grad_norm": 0.1559592819101095,
      "kl": 0.01161041259765625,
      "learning_rate": 4.939682729058839e-06,
      "loss": 0.0027,
      "num_tokens": 27697786.0,
      "reward": 0.6473893880844116,
      "reward_std": 0.16071836650371552,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_accuracy_reward/mean": 0.44739583134651184,
      "rewards/mcq_accuracy_reward/std": 0.49608185291290285,
      "rewards/tag_count_reward/mean": 0.9999348998069764,
      "rewards/tag_count_reward/std": 0.0018042195588350296,
      "step": 30
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5728.2,
      "completions/max_terminated_length": 5728.2,
      "completions/mean_length": 1163.4677490234376,
      "completions/mean_terminated_length": 1163.4677490234376,
      "completions/min_length": 255.2,
      "completions/min_terminated_length": 255.2,
      "epoch": 0.1989342806394316,
      "grad_norm": 0.15172870795054275,
      "kl": 0.01438446044921875,
      "learning_rate": 4.872960871766826e-06,
      "loss": -0.0006,
      "num_tokens": 32744310.0,
      "reward": 0.6907943248748779,
      "reward_std": 0.1814502149820328,
      "rewards/format_reward/mean": 0.9997395873069763,
      "rewards/format_reward/std": 0.007216878235340118,
      "rewards/mcq_accuracy_reward/mean": 0.49088541269302366,
      "rewards/mcq_accuracy_reward/std": 0.4999137282371521,
      "rewards/tag_count_reward/mean": 0.9993489503860473,
      "rewards/tag_count_reward/std": 0.010171890631318093,
      "step": 35
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0002604166666666741,
      "completions/max_length": 5812.4,
      "completions/max_terminated_length": 5298.8,
      "completions/mean_length": 1077.8024047851563,
      "completions/mean_terminated_length": 1075.9230712890626,
      "completions/min_length": 250.6,
      "completions/min_terminated_length": 250.6,
      "epoch": 0.22735346358792186,
      "grad_norm": 0.1564558072390042,
      "kl": 0.015766143798828125,
      "learning_rate": 4.782505135862176e-06,
      "loss": 0.0012,
      "num_tokens": 37463479.0,
      "reward": 0.6952149271965027,
      "reward_std": 0.1775230199098587,
      "rewards/format_reward/mean": 0.9997395873069763,
      "rewards/format_reward/std": 0.007216878235340118,
      "rewards/mcq_accuracy_reward/mean": 0.4953125059604645,
      "rewards/mcq_accuracy_reward/std": 0.4984812021255493,
      "rewards/tag_count_reward/mean": 0.9992838621139526,
      "rewards/tag_count_reward/std": 0.01261383220553398,
      "step": 40
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0002604166666666741,
      "completions/max_length": 4943.8,
      "completions/max_terminated_length": 4187.4,
      "completions/mean_length": 992.1836181640625,
      "completions/mean_terminated_length": 990.3313232421875,
      "completions/min_length": 230.0,
      "completions/min_terminated_length": 230.0,
      "epoch": 0.2557726465364121,
      "grad_norm": 0.1740869664959226,
      "kl": 0.01738433837890625,
      "learning_rate": 4.669220241469573e-06,
      "loss": 0.0129,
      "num_tokens": 41846552.0,
      "reward": 0.6533008456230164,
      "reward_std": 0.17375805079936982,
      "rewards/format_reward/mean": 0.9994791746139526,
      "rewards/format_reward/std": 0.014433756470680237,
      "rewards/mcq_accuracy_reward/mean": 0.4533854126930237,
      "rewards/mcq_accuracy_reward/std": 0.49718082547187803,
      "rewards/tag_count_reward/mean": 0.9996744871139527,
      "rewards/tag_count_reward/std": 0.009021097794175148,
      "step": 45
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6464.0,
      "completions/max_terminated_length": 6464.0,
      "completions/mean_length": 1109.7568115234376,
      "completions/mean_terminated_length": 1109.7568115234376,
      "completions/min_length": 233.2,
      "completions/min_terminated_length": 233.2,
      "epoch": 0.2841918294849023,
      "grad_norm": 0.15589282758117567,
      "kl": 0.0189788818359375,
      "learning_rate": 4.534239241377266e-06,
      "loss": 0.001,
      "num_tokens": 46687962.0,
      "reward": 0.676927137374878,
      "reward_std": 0.17989677786827088,
      "rewards/format_reward/mean": 0.99921875,
      "rewards/format_reward/std": 0.01741643026471138,
      "rewards/mcq_accuracy_reward/mean": 0.47708333730697633,
      "rewards/mcq_accuracy_reward/std": 0.4982982099056244,
      "rewards/tag_count_reward/mean": 0.99921875,
      "rewards/tag_count_reward/std": 0.016611021012067795,
      "step": 50
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5222.0,
      "completions/max_terminated_length": 5222.0,
      "completions/mean_length": 1135.95576171875,
      "completions/mean_terminated_length": 1135.95576171875,
      "completions/min_length": 249.8,
      "completions/min_terminated_length": 249.8,
      "epoch": 0.31261101243339257,
      "grad_norm": 0.13369568955391997,
      "kl": 0.0205780029296875,
      "learning_rate": 4.378912188470374e-06,
      "loss": 0.0046,
      "num_tokens": 51627616.0,
      "reward": 0.6671224713325501,
      "reward_std": 0.160793936252594,
      "rewards/format_reward/mean": 0.9997395873069763,
      "rewards/format_reward/std": 0.007216878235340118,
      "rewards/mcq_accuracy_reward/mean": 0.4671874940395355,
      "rewards/mcq_accuracy_reward/std": 0.4979245364665985,
      "rewards/tag_count_reward/mean": 0.999609375,
      "rewards/tag_count_reward/std": 0.008064506202936172,
      "step": 55
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0002604166666666741,
      "completions/max_length": 6132.4,
      "completions/max_terminated_length": 5533.2,
      "completions/mean_length": 1087.3320434570312,
      "completions/mean_terminated_length": 1085.4773315429688,
      "completions/min_length": 263.6,
      "completions/min_terminated_length": 263.6,
      "epoch": 0.3410301953818828,
      "grad_norm": 0.1477789943624076,
      "kl": 0.020989990234375,
      "learning_rate": 4.204792632772754e-06,
      "loss": -0.0003,
      "num_tokens": 56378771.0,
      "reward": 0.6983789801597595,
      "reward_std": 0.17249701023101807,
      "rewards/format_reward/mean": 0.9997395873069763,
      "rewards/format_reward/std": 0.007216878235340118,
      "rewards/mcq_accuracy_reward/mean": 0.4984375059604645,
      "rewards/mcq_accuracy_reward/std": 0.4989359140396118,
      "rewards/tag_count_reward/mean": 0.9996744871139527,
      "rewards/tag_count_reward/std": 0.007507431134581566,
      "step": 60
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4786.4,
      "completions/max_terminated_length": 4786.4,
      "completions/mean_length": 1063.5450805664063,
      "completions/mean_terminated_length": 1063.5450805664063,
      "completions/min_length": 225.6,
      "completions/min_terminated_length": 225.6,
      "epoch": 0.369449378330373,
      "grad_norm": 0.14015972377677116,
      "kl": 0.0214569091796875,
      "learning_rate": 4.0136220831513205e-06,
      "loss": -0.0,
      "num_tokens": 61039224.0,
      "reward": 0.6768229961395263,
      "reward_std": 0.1578069359064102,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_accuracy_reward/mean": 0.47682292461395265,
      "rewards/mcq_accuracy_reward/std": 0.4989953935146332,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 65
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5206.6,
      "completions/max_terminated_length": 5206.6,
      "completions/mean_length": 1109.176611328125,
      "completions/mean_terminated_length": 1109.176611328125,
      "completions/min_length": 246.0,
      "completions/min_terminated_length": 246.0,
      "epoch": 0.3978685612788632,
      "grad_norm": 0.16531093494639468,
      "kl": 95.62167663574219,
      "learning_rate": 3.807312589093701e-06,
      "loss": 3.8239,
      "num_tokens": 65872382.0,
      "reward": 0.6960807919502259,
      "reward_std": 0.16992745101451873,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_accuracy_reward/mean": 0.49609375596046446,
      "rewards/mcq_accuracy_reward/std": 0.4989615023136139,
      "rewards/tag_count_reward/mean": 0.9998697876930237,
      "rewards/tag_count_reward/std": 0.0025498881936073304,
      "step": 70
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5165.0,
      "completions/max_terminated_length": 5165.0,
      "completions/mean_length": 1079.5161743164062,
      "completions/mean_terminated_length": 1079.5161743164062,
      "completions/min_length": 254.6,
      "completions/min_terminated_length": 254.6,
      "epoch": 0.42628774422735344,
      "grad_norm": 0.15112401029602476,
      "kl": 0.0230438232421875,
      "learning_rate": 3.5879276167728343e-06,
      "loss": 0.0125,
      "num_tokens": 70595996.0,
      "reward": 0.6843750596046447,
      "reward_std": 0.18003216683864592,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_accuracy_reward/mean": 0.484375,
      "rewards/mcq_accuracy_reward/std": 0.4970580399036407,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 75
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0002604166666666741,
      "completions/max_length": 5178.6,
      "completions/max_terminated_length": 4963.4,
      "completions/mean_length": 1061.828955078125,
      "completions/mean_terminated_length": 1059.984521484375,
      "completions/min_length": 231.0,
      "completions/min_terminated_length": 231.0,
      "epoch": 0.4547069271758437,
      "grad_norm": 0.15923626474140243,
      "kl": 0.02301025390625,
      "learning_rate": 3.3576614106722473e-06,
      "loss": 0.0028,
      "num_tokens": 75247475.0,
      "reward": 0.7145378351211548,
      "reward_std": 0.1841348797082901,
      "rewards/format_reward/mean": 0.9997395873069763,
      "rewards/format_reward/std": 0.007216878235340118,
      "rewards/mcq_accuracy_reward/mean": 0.5145833313465118,
      "rewards/mcq_accuracy_reward/std": 0.49900742769241335,
      "rewards/tag_count_reward/mean": 0.9998046875,
      "rewards/tag_count_reward/std": 0.005412658676505089,
      "step": 80
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5323.8,
      "completions/max_terminated_length": 5323.8,
      "completions/mean_length": 1048.9911743164062,
      "completions/mean_terminated_length": 1048.9911743164062,
      "completions/min_length": 247.6,
      "completions/min_terminated_length": 247.6,
      "epoch": 0.48312611012433393,
      "grad_norm": 0.16115740042560586,
      "kl": 0.02386932373046875,
      "learning_rate": 3.118817047192907e-06,
      "loss": 0.0024,
      "num_tokens": 79853433.0,
      "reward": 0.6648047566413879,
      "reward_std": 0.189658322930336,
      "rewards/format_reward/mean": 0.9997395873069763,
      "rewards/format_reward/std": 0.007216878235340118,
      "rewards/mcq_accuracy_reward/mean": 0.46484375596046446,
      "rewards/mcq_accuracy_reward/std": 0.4943417370319366,
      "rewards/tag_count_reward/mean": 0.9998697996139526,
      "rewards/tag_count_reward/std": 0.003608439117670059,
      "step": 85
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6016.2,
      "completions/max_terminated_length": 6016.2,
      "completions/mean_length": 1056.59873046875,
      "completions/mean_terminated_length": 1056.59873046875,
      "completions/min_length": 238.0,
      "completions/min_terminated_length": 238.0,
      "epoch": 0.5115452930728241,
      "grad_norm": 0.21552374412647782,
      "kl": 0.02350616455078125,
      "learning_rate": 2.8737833997450658e-06,
      "loss": 0.0042,
      "num_tokens": 84486748.0,
      "reward": 0.6861914873123169,
      "reward_std": 0.16900237798690795,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_accuracy_reward/mean": 0.4861979126930237,
      "rewards/mcq_accuracy_reward/std": 0.4990255832672119,
      "rewards/tag_count_reward/mean": 0.9999348998069764,
      "rewards/tag_count_reward/std": 0.0018042195588350296,
      "step": 90
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0013020833333333482,
      "completions/max_length": 6964.4,
      "completions/max_terminated_length": 6690.8,
      "completions/mean_length": 1151.6203369140626,
      "completions/mean_terminated_length": 1142.4264404296875,
      "completions/min_length": 236.8,
      "completions/min_terminated_length": 236.8,
      "epoch": 0.5399644760213144,
      "grad_norm": 0.14156099172275863,
      "kl": 0.022979736328125,
      "learning_rate": 2.6250112457156296e-06,
      "loss": 0.0076,
      "num_tokens": 89489882.0,
      "reward": 0.6658268809318543,
      "reward_std": 0.16832855641841887,
      "rewards/format_reward/mean": 0.9981770873069763,
      "rewards/format_reward/std": 0.03182193115353584,
      "rewards/mcq_accuracy_reward/mean": 0.46614583134651183,
      "rewards/mcq_accuracy_reward/std": 0.4980164170265198,
      "rewards/tag_count_reward/mean": 0.9986328125,
      "rewards/tag_count_reward/std": 0.023866448923945426,
      "step": 95
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5372.6,
      "completions/max_terminated_length": 5372.6,
      "completions/mean_length": 1119.63466796875,
      "completions/mean_terminated_length": 1119.63466796875,
      "completions/min_length": 220.8,
      "completions/min_terminated_length": 220.8,
      "epoch": 0.5683836589698046,
      "grad_norm": 0.13456208913450085,
      "kl": 0.02239227294921875,
      "learning_rate": 2.374988754284371e-06,
      "loss": 0.0074,
      "num_tokens": 94365687.0,
      "reward": 0.6567123174667359,
      "reward_std": 0.1713259369134903,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_accuracy_reward/mean": 0.4567708373069763,
      "rewards/mcq_accuracy_reward/std": 0.4970240592956543,
      "rewards/tag_count_reward/mean": 0.9994140625,
      "rewards/tag_count_reward/std": 0.011108423396945,
      "step": 100
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0002604166666666741,
      "completions/max_length": 5585.0,
      "completions/max_terminated_length": 5486.6,
      "completions/mean_length": 1100.5560302734375,
      "completions/mean_terminated_length": 1098.734423828125,
      "completions/min_length": 252.2,
      "completions/min_terminated_length": 252.2,
      "epoch": 0.5968028419182948,
      "grad_norm": 0.11958484360378753,
      "kl": 0.02285919189453125,
      "learning_rate": 2.1262166002549346e-06,
      "loss": 0.004,
      "num_tokens": 99170286.0,
      "reward": 0.664778733253479,
      "reward_std": 0.14817258715629578,
      "rewards/format_reward/mean": 0.9997395873069763,
      "rewards/format_reward/std": 0.007216878235340118,
      "rewards/mcq_accuracy_reward/mean": 0.46484375596046446,
      "rewards/mcq_accuracy_reward/std": 0.49491878151893615,
      "rewards/tag_count_reward/mean": 0.9996093869209289,
      "rewards/tag_count_reward/std": 0.010825317353010178,
      "step": 105
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4669.0,
      "completions/max_terminated_length": 4669.0,
      "completions/mean_length": 1052.46826171875,
      "completions/mean_terminated_length": 1052.46826171875,
      "completions/min_length": 245.8,
      "completions/min_terminated_length": 245.8,
      "epoch": 0.6252220248667851,
      "grad_norm": 0.14676711793529598,
      "kl": 19608371.221609496,
      "learning_rate": 1.8811829528070935e-06,
      "loss": 778784.15,
      "num_tokens": 103796876.0,
      "reward": 0.6645313143730164,
      "reward_std": 0.18562877476215361,
      "rewards/format_reward/mean": 0.9997395873069763,
      "rewards/format_reward/std": 0.007216878235340118,
      "rewards/mcq_accuracy_reward/mean": 0.46458333134651186,
      "rewards/mcq_accuracy_reward/std": 0.49767143130302427,
      "rewards/tag_count_reward/mean": 0.9997395873069763,
      "rewards/tag_count_reward/std": 0.007216878235340118,
      "step": 110
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0002604166666666741,
      "completions/max_length": 5137.8,
      "completions/max_terminated_length": 4382.8,
      "completions/mean_length": 1032.596630859375,
      "completions/mean_terminated_length": 1030.736181640625,
      "completions/min_length": 224.0,
      "completions/min_terminated_length": 224.0,
      "epoch": 0.6536412078152753,
      "grad_norm": 0.1802276243424991,
      "kl": 0.02183837890625,
      "learning_rate": 1.6423385893277537e-06,
      "loss": 0.0108,
      "num_tokens": 108338703.0,
      "reward": 0.6494206309318542,
      "reward_std": 0.17672575116157532,
      "rewards/format_reward/mean": 0.9997395873069763,
      "rewards/format_reward/std": 0.007216878235340118,
      "rewards/mcq_accuracy_reward/mean": 0.4494791626930237,
      "rewards/mcq_accuracy_reward/std": 0.49625040888786315,
      "rewards/tag_count_reward/mean": 0.9996744751930237,
      "rewards/tag_count_reward/std": 0.006501290947198868,
      "step": 115
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0002604166666666741,
      "completions/max_length": 5606.2,
      "completions/max_terminated_length": 5407.0,
      "completions/mean_length": 1034.7218994140626,
      "completions/mean_terminated_length": 1032.8922119140625,
      "completions/min_length": 242.0,
      "completions/min_terminated_length": 242.0,
      "epoch": 0.6820603907637656,
      "grad_norm": 0.23206072060115124,
      "kl": 0.02138214111328125,
      "learning_rate": 1.4120723832271665e-06,
      "loss": 0.0068,
      "num_tokens": 112897891.0,
      "reward": 0.7249414920806885,
      "reward_std": 0.1745230883359909,
      "rewards/format_reward/mean": 0.9997395873069763,
      "rewards/format_reward/std": 0.007216878235340118,
      "rewards/mcq_accuracy_reward/mean": 0.524999988079071,
      "rewards/mcq_accuracy_reward/std": 0.49913292527198794,
      "rewards/tag_count_reward/mean": 0.9996744751930237,
      "rewards/tag_count_reward/std": 0.009021097794175148,
      "step": 120
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0002604166666666741,
      "completions/max_length": 5944.4,
      "completions/max_terminated_length": 5786.4,
      "completions/mean_length": 1096.26279296875,
      "completions/mean_terminated_length": 1094.3953369140625,
      "completions/min_length": 231.0,
      "completions/min_terminated_length": 231.0,
      "epoch": 0.7104795737122558,
      "grad_norm": 0.12807422733812146,
      "kl": 0.020963287353515624,
      "learning_rate": 1.1926874109063e-06,
      "loss": 0.0023,
      "num_tokens": 117693212.0,
      "reward": 0.6751953840255738,
      "reward_std": 0.17198645174503327,
      "rewards/format_reward/mean": 0.9997395873069763,
      "rewards/format_reward/std": 0.007216878235340118,
      "rewards/mcq_accuracy_reward/mean": 0.4752604246139526,
      "rewards/mcq_accuracy_reward/std": 0.49931321144104,
      "rewards/tag_count_reward/mean": 0.999609375,
      "rewards/tag_count_reward/std": 0.009444911777973176,
      "step": 125
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4637.4,
      "completions/max_terminated_length": 4637.4,
      "completions/mean_length": 1052.527099609375,
      "completions/mean_terminated_length": 1052.527099609375,
      "completions/min_length": 261.8,
      "completions/min_terminated_length": 261.8,
      "epoch": 0.738898756660746,
      "grad_norm": 0.1465611288853758,
      "kl": 0.021143341064453126,
      "learning_rate": 9.863779168486797e-07,
      "loss": 0.0095,
      "num_tokens": 122313876.0,
      "reward": 0.6882617950439454,
      "reward_std": 0.16661006212234497,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_accuracy_reward/mean": 0.48828125596046446,
      "rewards/mcq_accuracy_reward/std": 0.4994440317153931,
      "rewards/tag_count_reward/mean": 0.9998046875,
      "rewards/tag_count_reward/std": 0.004354107566177845,
      "step": 130
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5632.6,
      "completions/max_terminated_length": 5632.6,
      "completions/mean_length": 1087.99873046875,
      "completions/mean_terminated_length": 1087.99873046875,
      "completions/min_length": 252.0,
      "completions/min_terminated_length": 252.0,
      "epoch": 0.7673179396092362,
      "grad_norm": 0.1600247679302984,
      "kl": 0.019998931884765626,
      "learning_rate": 7.952073672272464e-07,
      "loss": 0.0036,
      "num_tokens": 127064943.0,
      "reward": 0.6473828911781311,
      "reward_std": 0.18547057807445527,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_accuracy_reward/mean": 0.4473958373069763,
      "rewards/mcq_accuracy_reward/std": 0.49676424264907837,
      "rewards/tag_count_reward/mean": 0.9998697996139526,
      "rewards/tag_count_reward/std": 0.003608439117670059,
      "step": 135
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0013020833333333259,
      "completions/max_length": 6002.8,
      "completions/max_terminated_length": 5989.2,
      "completions/mean_length": 1068.9239868164063,
      "completions/mean_terminated_length": 1059.5496948242187,
      "completions/min_length": 238.2,
      "completions/min_terminated_length": 238.2,
      "epoch": 0.7957371225577264,
      "grad_norm": 0.14840743240817333,
      "kl": 0.0195098876953125,
      "learning_rate": 6.210878115296267e-07,
      "loss": -0.0015,
      "num_tokens": 131744587.0,
      "reward": 0.6677019000053406,
      "reward_std": 0.1643421858549118,
      "rewards/format_reward/mean": 0.9984375,
      "rewards/format_reward/std": 0.023312175273895265,
      "rewards/mcq_accuracy_reward/mean": 0.46796875,
      "rewards/mcq_accuracy_reward/std": 0.49840737581253053,
      "rewards/tag_count_reward/mean": 0.9988932251930237,
      "rewards/tag_count_reward/std": 0.01567991152405739,
      "step": 140
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5690.0,
      "completions/max_terminated_length": 5690.0,
      "completions/mean_length": 1053.6370361328125,
      "completions/mean_terminated_length": 1053.6370361328125,
      "completions/min_length": 236.4,
      "completions/min_terminated_length": 236.4,
      "epoch": 0.8241563055062167,
      "grad_norm": 0.16563780810558218,
      "kl": 0.01880035400390625,
      "learning_rate": 4.6576075862273445e-07,
      "loss": 0.0026,
      "num_tokens": 136367097.0,
      "reward": 0.6843424916267395,
      "reward_std": 0.17991138994693756,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_accuracy_reward/mean": 0.48437499403953554,
      "rewards/mcq_accuracy_reward/std": 0.4988525092601776,
      "rewards/tag_count_reward/mean": 0.9996744871139527,
      "rewards/tag_count_reward/std": 0.009021097794175148,
      "step": 145
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4469.6,
      "completions/max_terminated_length": 4469.6,
      "completions/mean_length": 1041.7265869140624,
      "completions/mean_terminated_length": 1041.7265869140624,
      "completions/min_length": 230.8,
      "completions/min_terminated_length": 230.8,
      "epoch": 0.8525754884547069,
      "grad_norm": 0.14692099930627384,
      "kl": 0.017971038818359375,
      "learning_rate": 3.3077975853042704e-07,
      "loss": 0.0001,
      "num_tokens": 140943367.0,
      "reward": 0.6906185746192932,
      "reward_std": 0.1725286066532135,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_accuracy_reward/mean": 0.49062501192092894,
      "rewards/mcq_accuracy_reward/std": 0.4994231522083282,
      "rewards/tag_count_reward/mean": 0.9999348998069764,
      "rewards/tag_count_reward/std": 0.0018042195588350296,
      "step": 150
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4531.2,
      "completions/max_terminated_length": 4531.2,
      "completions/mean_length": 1035.3060302734375,
      "completions/mean_terminated_length": 1035.3060302734375,
      "completions/min_length": 257.0,
      "completions/min_terminated_length": 257.0,
      "epoch": 0.8809946714031972,
      "grad_norm": 0.15437157339648844,
      "kl": 0.018267059326171876,
      "learning_rate": 2.174948641378244e-07,
      "loss": 0.0147,
      "num_tokens": 145497102.0,
      "reward": 0.7106380820274353,
      "reward_std": 0.17128031849861144,
      "rewards/format_reward/mean": 0.9997395873069763,
      "rewards/format_reward/std": 0.007216878235340118,
      "rewards/mcq_accuracy_reward/mean": 0.5106770873069764,
      "rewards/mcq_accuracy_reward/std": 0.4973214030265808,
      "rewards/tag_count_reward/mean": 0.9998697996139526,
      "rewards/tag_count_reward/std": 0.003608439117670059,
      "step": 155
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4793.6,
      "completions/max_terminated_length": 4793.6,
      "completions/mean_length": 1055.2679809570313,
      "completions/mean_terminated_length": 1055.2679809570313,
      "completions/min_length": 241.0,
      "completions/min_terminated_length": 241.0,
      "epoch": 0.9094138543516874,
      "grad_norm": 0.15275141096890196,
      "kl": 0.0176239013671875,
      "learning_rate": 1.27039128233174e-07,
      "loss": 0.0121,
      "num_tokens": 150127331.0,
      "reward": 0.7130144119262696,
      "reward_std": 0.18018046617507935,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_accuracy_reward/mean": 0.5130208253860473,
      "rewards/mcq_accuracy_reward/std": 0.49882700443267824,
      "rewards/tag_count_reward/mean": 0.9999348998069764,
      "rewards/tag_count_reward/std": 0.0018042195588350296,
      "step": 160
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0002604166666666741,
      "completions/max_length": 5728.6,
      "completions/max_terminated_length": 5411.0,
      "completions/mean_length": 1094.807080078125,
      "completions/mean_terminated_length": 1092.956884765625,
      "completions/min_length": 244.0,
      "completions/min_terminated_length": 244.0,
      "epoch": 0.9378330373001776,
      "grad_norm": 0.1354642339017401,
      "kl": 0.0183624267578125,
      "learning_rate": 6.031727094116174e-08,
      "loss": 0.0067,
      "num_tokens": 154910862.0,
      "reward": 0.6801628351211548,
      "reward_std": 0.1546088457107544,
      "rewards/format_reward/mean": 0.9997395873069763,
      "rewards/format_reward/std": 0.007216878235340118,
      "rewards/mcq_accuracy_reward/mean": 0.48020832538604735,
      "rewards/mcq_accuracy_reward/std": 0.498971688747406,
      "rewards/tag_count_reward/mean": 0.9998046875,
      "rewards/tag_count_reward/std": 0.005412658676505089,
      "step": 165
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4582.0,
      "completions/max_terminated_length": 4582.0,
      "completions/mean_length": 1007.6719116210937,
      "completions/mean_terminated_length": 1007.6719116210937,
      "completions/min_length": 239.0,
      "completions/min_terminated_length": 239.0,
      "epoch": 0.9662522202486679,
      "grad_norm": 0.13334679057257176,
      "kl": 0.018311309814453124,
      "learning_rate": 1.7996630893712675e-08,
      "loss": 0.0029,
      "num_tokens": 159352714.0,
      "reward": 0.7252474665641785,
      "reward_std": 0.1664626866579056,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_accuracy_reward/mean": 0.5252604126930237,
      "rewards/mcq_accuracy_reward/std": 0.4992336809635162,
      "rewards/tag_count_reward/mean": 0.9998697876930237,
      "rewards/tag_count_reward/std": 0.003608439117670059,
      "step": 170
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0002604166666666741,
      "completions/max_length": 5149.6,
      "completions/max_terminated_length": 4739.2,
      "completions/mean_length": 1035.9330444335938,
      "completions/mean_terminated_length": 1034.0750610351563,
      "completions/min_length": 250.0,
      "completions/min_terminated_length": 250.0,
      "epoch": 0.9946714031971581,
      "grad_norm": 0.14564780944336908,
      "kl": 0.018206024169921876,
      "learning_rate": 5.004906428685008e-10,
      "loss": 0.0067,
      "num_tokens": 163907551.0,
      "reward": 0.6884701490402222,
      "reward_std": 0.1888585329055786,
      "rewards/format_reward/mean": 0.9997395873069763,
      "rewards/format_reward/std": 0.007216878235340118,
      "rewards/mcq_accuracy_reward/mean": 0.48854167461395265,
      "rewards/mcq_accuracy_reward/std": 0.49964500069618223,
      "rewards/tag_count_reward/mean": 0.9995442748069763,
      "rewards/tag_count_reward/std": 0.012629536911845207,
      "step": 175
    },
    {
      "epoch": 0.9946714031971581,
      "step": 175,
      "total_flos": 0.0,
      "train_loss": 22251.08962759373,
      "train_runtime": 39419.1987,
      "train_samples_per_second": 0.428,
      "train_steps_per_second": 0.004
    }
  ],
  "logging_steps": 5,
  "max_steps": 175,
  "num_input_tokens_seen": 163907551,
  "num_train_epochs": 1,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}