{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 234.375, "epoch": 0.04, "grad_norm": 7.176087856292725, "kl": 0.0, "learning_rate": 5e-07, "loss": -0.0, "reward": 4.112200040370226, "reward_std": 0.6570726247227867, "rewards/concensus_correctness_reward_func": 1.0191874988377094, "rewards/consensus_reward_func": 1.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.8291376009583473, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.234375, "rewards/xmlcount_reward_func": 0.9044999973848462, "step": 2 }, { "completion_length": 145.59375, "epoch": 0.08, "grad_norm": 3.5170514583587646, "kl": 0.009267363173421472, "learning_rate": 4.978612153434526e-07, "loss": 0.0, "reward": 6.426187425851822, "reward_std": 0.165728148072958, "rewards/concensus_correctness_reward_func": 1.7933749929070473, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 1.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 1.2421875, "step": 4 }, { "completion_length": 132.40625, "epoch": 0.12, "grad_norm": 12.279341697692871, "kl": 0.028428875797544606, "learning_rate": 4.91481456572267e-07, "loss": 0.0, "reward": 6.86219134926796, "reward_std": 0.08118852600455284, "rewards/concensus_correctness_reward_func": 2.0462500005960464, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9409414567053318, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 6 }, { "completion_length": 153.75, "epoch": 0.16, "grad_norm": 0.1743835210800171, "kl": 0.13935918442439288, "learning_rate": 4.809698831278217e-07, "loss": 0.0001, "reward": 6.861589133739471, "reward_std": 0.07264312717597932, "rewards/concensus_correctness_reward_func": 2.038124978542328, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9992454163730145, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.23046875, "step": 8 }, { "completion_length": 147.34375, "epoch": 0.2, "grad_norm": 15.341836929321289, "kl": 0.13160601945128292, "learning_rate": 4.6650635094610966e-07, "loss": 0.0001, "reward": 6.5379141718149185, "reward_std": 0.39141652490070555, "rewards/concensus_correctness_reward_func": 1.8202499886974692, "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1875, "rewards/question_recreation_reward_func": 0.9950079470872879, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.23828125, "step": 10 }, { "completion_length": 158.90625, "epoch": 0.24, "grad_norm": 7.824904918670654, "kl": 0.43980390403885394, "learning_rate": 4.483383350728088e-07, "loss": 0.0004, "reward": 6.499552339315414, "reward_std": 0.23187985000913613, "rewards/concensus_correctness_reward_func": 1.854249980300665, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.9968648962676525, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.2421875, "step": 12 }, { "completion_length": 142.1875, "epoch": 0.28, "grad_norm": 9.656704902648926, "kl": 0.3203001730144024, "learning_rate": 4.2677669529663686e-07, "loss": 0.0003, "reward": 6.893820106983185, "reward_std": 0.03330435510724783, "rewards/concensus_correctness_reward_func": 2.042624980211258, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9918201602995396, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.25, "step": 14 }, { "completion_length": 145.59375, "epoch": 0.32, "grad_norm": 9.023086547851562, "kl": 0.36359297251328826, "learning_rate": 4.0219035725218013e-07, "loss": 0.0004, "reward": 6.579638808965683, "reward_std": 0.12814350612461567, "rewards/concensus_correctness_reward_func": 1.9202499836683273, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.9966076016426086, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 1.2252812534570694, "step": 16 }, { "completion_length": 134.1875, "epoch": 0.36, "grad_norm": 12.842790603637695, "kl": 156.88406436191872, "learning_rate": 3.75e-07, "loss": 0.1569, "reward": 6.7304129004478455, "reward_std": 0.2677630423568189, "rewards/concensus_correctness_reward_func": 1.9847500063478947, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9730379395186901, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.2413749992847443, "step": 18 }, { "completion_length": 156.84375, "epoch": 0.4, "grad_norm": 23.726106643676758, "kl": 1.424283139873296, "learning_rate": 3.4567085809127245e-07, "loss": 0.0014, "reward": 6.993578672409058, "reward_std": 0.05592688778415322, "rewards/concensus_correctness_reward_func": 2.0643749982118607, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.9961412325501442, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 1.245562493801117, "step": 20 }, { "completion_length": 126.125, "epoch": 0.44, "grad_norm": 10.510144233703613, "kl": 0.25376387825235724, "learning_rate": 3.147047612756302e-07, "loss": 0.0003, "reward": 7.145433336496353, "reward_std": 0.03509601688710973, "rewards/concensus_correctness_reward_func": 2.170249991118908, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.9998708665370941, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.240937501192093, "step": 22 }, { "completion_length": 138.71875, "epoch": 0.48, "grad_norm": 8.906339645385742, "kl": 2.7483674963004887, "learning_rate": 2.826315480550129e-07, "loss": 0.0027, "reward": 6.512812465429306, "reward_std": 0.17332954704761505, "rewards/concensus_correctness_reward_func": 1.8253124915063381, "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 1.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 24 }, { "completion_length": 154.875, "epoch": 0.52, "grad_norm": 5.817593097686768, "kl": 0.23101894953288138, "learning_rate": 2.5e-07, "loss": 0.0002, "reward": 6.779243886470795, "reward_std": 0.0016465720254927874, "rewards/concensus_correctness_reward_func": 1.9372499957680702, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.998243909329176, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.25, "step": 26 }, { "completion_length": 139.375, "epoch": 0.56, "grad_norm": 6.956367015838623, "kl": 0.1609617037465796, "learning_rate": 2.1736845194498716e-07, "loss": 0.0002, "reward": 6.7925144135952, "reward_std": 0.0029847975820302963, "rewards/concensus_correctness_reward_func": 2.0446249917149544, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9978894330561161, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 28 }, { "completion_length": 129.84375, "epoch": 0.6, "grad_norm": 7.940965175628662, "kl": 0.6511943477671593, "learning_rate": 1.8529523872436977e-07, "loss": 0.0007, "reward": 6.6300894767045975, "reward_std": 0.3586949845266645, "rewards/concensus_correctness_reward_func": 1.9808749817311764, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.9670269638299942, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.2134375013411045, "step": 30 }, { "completion_length": 131.96875, "epoch": 0.64, "grad_norm": 10.983510971069336, "kl": 11.676874205470085, "learning_rate": 1.5432914190872756e-07, "loss": 0.0117, "reward": 6.615825533866882, "reward_std": 0.07520009062136523, "rewards/concensus_correctness_reward_func": 1.9189999774098396, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.9958880096673965, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.2321875020861626, "step": 32 }, { "completion_length": 138.0, "epoch": 0.68, "grad_norm": 6.4254279136657715, "kl": 0.3256819383241236, "learning_rate": 1.2500000000000005e-07, "loss": 0.0003, "reward": 6.7897875010967255, "reward_std": 0.007017956115305424, "rewards/concensus_correctness_reward_func": 2.044749990105629, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9950375556945801, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 34 }, { "completion_length": 145.1875, "epoch": 0.72, "grad_norm": 9.378355979919434, "kl": 16.052306916331872, "learning_rate": 9.780964274781983e-08, "loss": 0.0161, "reward": 6.93294321000576, "reward_std": 0.14636386698111892, "rewards/concensus_correctness_reward_func": 2.1079999916255474, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.9929119944572449, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.23828125, "step": 36 }, { "completion_length": 154.125, "epoch": 0.76, "grad_norm": 6.797495365142822, "kl": 0.40698248730041087, "learning_rate": 7.322330470336313e-08, "loss": 0.0004, "reward": 6.721958696842194, "reward_std": 0.06882934272289276, "rewards/concensus_correctness_reward_func": 1.8992499895393848, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9633338004350662, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.25, "step": 38 }, { "completion_length": 143.96875, "epoch": 0.8, "grad_norm": 14.162485122680664, "kl": 1.0181477402802557, "learning_rate": 5.166166492719124e-08, "loss": 0.001, "reward": 6.521054327487946, "reward_std": 0.20339208524819696, "rewards/concensus_correctness_reward_func": 1.8601249791681767, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.9968668967485428, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.2421875, "step": 40 }, { "completion_length": 134.84375, "epoch": 0.84, "grad_norm": 9.491962432861328, "kl": 0.6755225867964327, "learning_rate": 3.349364905389032e-08, "loss": 0.0007, "reward": 7.223803788423538, "reward_std": 0.09353631362318993, "rewards/concensus_correctness_reward_func": 2.2955000177025795, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.9889288172125816, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.2206249982118607, "step": 42 }, { "completion_length": 133.71875, "epoch": 0.88, "grad_norm": 5.122064590454102, "kl": 0.1922820881009102, "learning_rate": 1.9030116872178314e-08, "loss": 0.0002, "reward": 6.633125603199005, "reward_std": 0.04931969312019646, "rewards/concensus_correctness_reward_func": 1.9179999828338623, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.997813206166029, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.2329374998807907, "step": 44 }, { "completion_length": 135.4375, "epoch": 0.92, "grad_norm": 25.408519744873047, "kl": 1.0368469340028241, "learning_rate": 8.518543427732949e-09, "loss": 0.001, "reward": 6.295926123857498, "reward_std": 0.2810273655341007, "rewards/concensus_correctness_reward_func": 1.7611249797046185, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.9683636538684368, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 1.1914375014603138, "step": 46 }, { "completion_length": 125.15625, "epoch": 0.96, "grad_norm": 11.458877563476562, "kl": 0.21031903591938317, "learning_rate": 2.1387846565474044e-09, "loss": 0.0002, "reward": 6.404887020587921, "reward_std": 0.02461348520591855, "rewards/concensus_correctness_reward_func": 1.8002499863505363, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.9952620603144169, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.25, "step": 48 }, { "completion_length": 146.34375, "epoch": 1.0, "grad_norm": 7.924446105957031, "kl": 0.3799045365303755, "learning_rate": 0.0, "loss": 0.0004, "reward": 6.737992107868195, "reward_std": 0.07663538253109436, "rewards/concensus_correctness_reward_func": 2.043999992311001, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9908671341836452, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.234375, "step": 50 }, { "epoch": 1.0, "step": 50, "total_flos": 0.0, "train_loss": 0.007829376330410014, "train_runtime": 290.1349, "train_samples_per_second": 2.757, "train_steps_per_second": 0.172 } ], "logging_steps": 2, "max_steps": 50, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }