| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 500, | |
| "global_step": 20, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 280.6875, | |
| "epoch": 0.1, | |
| "grad_norm": 53.05036163330078, | |
| "kl": 0.0, | |
| "learning_rate": 4.965903258506806e-07, | |
| "loss": -0.0, | |
| "reward": 3.6584795396775007, | |
| "reward_std": 0.5753003612917382, | |
| "rewards/concensus_correctness_reward_func": 1.011624988168478, | |
| "rewards/consensus_reward_func": 1.0625, | |
| "rewards/cumulative_reward_2": 0.0, | |
| "rewards/final_correctness_reward_func": 0.0, | |
| "rewards/question_recreation_reward_func": 0.656604582327418, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.171875, | |
| "rewards/xmlcount_reward_func": 0.7558750007301569, | |
| "step": 2 | |
| }, | |
| { | |
| "completion_length": 171.03125, | |
| "epoch": 0.2, | |
| "grad_norm": 169.1420135498047, | |
| "kl": 10.232354334380943, | |
| "learning_rate": 4.698684378016222e-07, | |
| "loss": 0.0102, | |
| "reward": 6.453376889228821, | |
| "reward_std": 0.14471963373944163, | |
| "rewards/concensus_correctness_reward_func": 1.8348749801516533, | |
| "rewards/consensus_reward_func": 2.0, | |
| "rewards/cumulative_reward_2": 0.0, | |
| "rewards/final_correctness_reward_func": 0.125, | |
| "rewards/question_recreation_reward_func": 0.9452207088470459, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.34375, | |
| "rewards/xmlcount_reward_func": 1.2045312523841858, | |
| "step": 4 | |
| }, | |
| { | |
| "completion_length": 167.6875, | |
| "epoch": 0.3, | |
| "grad_norm": 273064.90625, | |
| "kl": 468753.35245687794, | |
| "learning_rate": 4.193203929064353e-07, | |
| "loss": 468.7533, | |
| "reward": 6.525232225656509, | |
| "reward_std": 0.3649999493500218, | |
| "rewards/concensus_correctness_reward_func": 2.006375003606081, | |
| "rewards/consensus_reward_func": 1.8125, | |
| "rewards/cumulative_reward_2": 0.0, | |
| "rewards/final_correctness_reward_func": 0.25, | |
| "rewards/question_recreation_reward_func": 0.8971072547137737, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.359375, | |
| "rewards/xmlcount_reward_func": 1.1998750008642673, | |
| "step": 6 | |
| }, | |
| { | |
| "completion_length": 160.8125, | |
| "epoch": 0.4, | |
| "grad_norm": 167036304.0, | |
| "kl": 3727724.481420556, | |
| "learning_rate": 3.5042385616324236e-07, | |
| "loss": 3727.7244, | |
| "reward": 7.081181287765503, | |
| "reward_std": 0.10486791450239252, | |
| "rewards/concensus_correctness_reward_func": 2.170249997638166, | |
| "rewards/consensus_reward_func": 2.0, | |
| "rewards/cumulative_reward_2": 0.0, | |
| "rewards/final_correctness_reward_func": 0.375, | |
| "rewards/question_recreation_reward_func": 0.9093687888234854, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.40625, | |
| "rewards/xmlcount_reward_func": 1.2203124985098839, | |
| "step": 8 | |
| }, | |
| { | |
| "completion_length": 147.0625, | |
| "epoch": 0.5, | |
| "grad_norm": 4871.97412109375, | |
| "kl": 539.2818314363249, | |
| "learning_rate": 2.706448363680831e-07, | |
| "loss": 0.5393, | |
| "reward": 6.877287954092026, | |
| "reward_std": 0.44364456451148726, | |
| "rewards/concensus_correctness_reward_func": 2.1652499809861183, | |
| "rewards/consensus_reward_func": 1.9375, | |
| "rewards/cumulative_reward_2": 0.0, | |
| "rewards/final_correctness_reward_func": 0.375, | |
| "rewards/question_recreation_reward_func": 0.8499442553147674, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.328125, | |
| "rewards/xmlcount_reward_func": 1.2214687392115593, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 175.53125, | |
| "epoch": 0.6, | |
| "grad_norm": 50525.3671875, | |
| "kl": 2825.989122052677, | |
| "learning_rate": 1.886286282148002e-07, | |
| "loss": 2.826, | |
| "reward": 6.0079357624053955, | |
| "reward_std": 0.43696475913748145, | |
| "rewards/concensus_correctness_reward_func": 1.7586249820888042, | |
| "rewards/consensus_reward_func": 1.9375, | |
| "rewards/cumulative_reward_2": 0.0, | |
| "rewards/final_correctness_reward_func": 0.0, | |
| "rewards/question_recreation_reward_func": 0.7420295961201191, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.359375, | |
| "rewards/xmlcount_reward_func": 1.210406243801117, | |
| "step": 12 | |
| }, | |
| { | |
| "completion_length": 177.8125, | |
| "epoch": 0.7, | |
| "grad_norm": 505.6527404785156, | |
| "kl": 1061.56522446312, | |
| "learning_rate": 1.1326296046939333e-07, | |
| "loss": 1.0616, | |
| "reward": 6.597704261541367, | |
| "reward_std": 0.4343329582188744, | |
| "rewards/concensus_correctness_reward_func": 2.0284374840557575, | |
| "rewards/consensus_reward_func": 1.8125, | |
| "rewards/cumulative_reward_2": 0.0, | |
| "rewards/final_correctness_reward_func": 0.25, | |
| "rewards/question_recreation_reward_func": 0.8877355121076107, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.375, | |
| "rewards/xmlcount_reward_func": 1.2440312504768372, | |
| "step": 14 | |
| }, | |
| { | |
| "completion_length": 144.71875, | |
| "epoch": 0.8, | |
| "grad_norm": 49.680084228515625, | |
| "kl": 12.081052286550403, | |
| "learning_rate": 5.271487265090163e-08, | |
| "loss": 0.0121, | |
| "reward": 6.665501236915588, | |
| "reward_std": 0.5225161880371161, | |
| "rewards/concensus_correctness_reward_func": 2.029374983161688, | |
| "rewards/consensus_reward_func": 1.9375, | |
| "rewards/cumulative_reward_2": 0.0, | |
| "rewards/final_correctness_reward_func": 0.1875, | |
| "rewards/question_recreation_reward_func": 0.8862824384123087, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.390625, | |
| "rewards/xmlcount_reward_func": 1.234218753874302, | |
| "step": 16 | |
| }, | |
| { | |
| "completion_length": 147.0625, | |
| "epoch": 0.9, | |
| "grad_norm": 730.200927734375, | |
| "kl": 86.71727948682383, | |
| "learning_rate": 1.3545689574841341e-08, | |
| "loss": 0.0867, | |
| "reward": 6.578728884458542, | |
| "reward_std": 0.26941974822693737, | |
| "rewards/concensus_correctness_reward_func": 2.021124981343746, | |
| "rewards/consensus_reward_func": 1.75, | |
| "rewards/cumulative_reward_2": 0.0, | |
| "rewards/final_correctness_reward_func": 0.3125, | |
| "rewards/question_recreation_reward_func": 0.9322289191186428, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.359375, | |
| "rewards/xmlcount_reward_func": 1.2034999914467335, | |
| "step": 18 | |
| }, | |
| { | |
| "completion_length": 142.25, | |
| "epoch": 1.0, | |
| "grad_norm": 13.476627349853516, | |
| "kl": 7.980941329384223, | |
| "learning_rate": 0.0, | |
| "loss": 0.008, | |
| "reward": 6.966006487607956, | |
| "reward_std": 0.08144115762843285, | |
| "rewards/concensus_correctness_reward_func": 2.1567499935626984, | |
| "rewards/consensus_reward_func": 2.0, | |
| "rewards/cumulative_reward_2": 0.0, | |
| "rewards/final_correctness_reward_func": 0.25, | |
| "rewards/question_recreation_reward_func": 0.9232877704780549, | |
| "rewards/soft_format_reward_func": 0.0, | |
| "rewards/strict_format_reward_func": 0.421875, | |
| "rewards/xmlcount_reward_func": 1.2140937522053719, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "step": 20, | |
| "total_flos": 0.0, | |
| "train_loss": 420.1021536417305, | |
| "train_runtime": 134.2834, | |
| "train_samples_per_second": 2.383, | |
| "train_steps_per_second": 0.149 | |
| } | |
| ], | |
| "logging_steps": 2, | |
| "max_steps": 20, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 25, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |