{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 186, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.7306758515536785, "epoch": 0.21621621621621623, "grad_norm": 3.109375, "learning_rate": 5e-06, "loss": 0.9965, "mean_token_accuracy": 0.779215482622385, "num_tokens": 6826187.0, "step": 20 }, { "entropy": 0.5489349599927664, "epoch": 0.43243243243243246, "grad_norm": 0.58203125, "learning_rate": 9.998873580873848e-06, "loss": 0.5535, "mean_token_accuracy": 0.8538637027144432, "num_tokens": 13653383.0, "step": 40 }, { "entropy": 0.3483663786202669, "epoch": 0.6486486486486487, "grad_norm": 0.33984375, "learning_rate": 9.511402217292927e-06, "loss": 0.351, "mean_token_accuracy": 0.8950387261807918, "num_tokens": 20485861.0, "step": 60 }, { "entropy": 0.32447177954018114, "epoch": 0.8648648648648649, "grad_norm": 0.3125, "learning_rate": 8.222962883121196e-06, "loss": 0.3235, "mean_token_accuracy": 0.90108412951231, "num_tokens": 27314991.0, "step": 80 }, { "entropy": 0.3140074060513423, "epoch": 1.0756756756756758, "grad_norm": 0.3203125, "learning_rate": 6.3623089878341146e-06, "loss": 0.3125, "mean_token_accuracy": 0.9037734896708758, "num_tokens": 33973895.0, "step": 100 }, { "entropy": 0.3088852509856224, "epoch": 1.291891891891892, "grad_norm": 0.30078125, "learning_rate": 4.259786641731344e-06, "loss": 0.3071, "mean_token_accuracy": 0.9051578566431999, "num_tokens": 40804802.0, "step": 120 }, { "entropy": 0.30427912399172785, "epoch": 1.5081081081081082, "grad_norm": 0.298828125, "learning_rate": 2.288684010125325e-06, "loss": 0.3034, "mean_token_accuracy": 0.9061512276530266, "num_tokens": 47638527.0, "step": 140 }, { "entropy": 0.3052296478301287, "epoch": 1.7243243243243245, "grad_norm": 0.310546875, "learning_rate": 7.989566054312286e-07, "loss": 0.3036, "mean_token_accuracy": 0.9059888675808907, "num_tokens": 54467102.0, "step": 160 }, { "entropy": 0.3061117686331272, "epoch": 1.9405405405405407, "grad_norm": 0.298828125, "learning_rate": 5.509512889877333e-08, "loss": 0.3049, "mean_token_accuracy": 0.9056108377873897, "num_tokens": 61294894.0, "step": 180 }, { "entropy": 0.3062901334329085, "epoch": 2.0, "mean_token_accuracy": 0.9058145718141035, "num_tokens": 63165740.0, "step": 186, "total_flos": 1.38382400484488e+18, "train_loss": 0.41364410038917293, "train_runtime": 6390.138, "train_samples_per_second": 9.726, "train_steps_per_second": 0.029 } ], "logging_steps": 20, "max_steps": 186, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.38382400484488e+18, "train_batch_size": 84, "trial_name": null, "trial_params": null }