{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 290, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.7135222218930721, "epoch": 0.1386481802426343, "grad_norm": 2.765625, "learning_rate": 3.2758620689655175e-06, "loss": 1.0037, "mean_token_accuracy": 0.766643451154232, "num_tokens": 6832690.0, "step": 20 }, { "entropy": 0.6983387872576714, "epoch": 0.2772963604852686, "grad_norm": 1.171875, "learning_rate": 6.724137931034484e-06, "loss": 0.7837, "mean_token_accuracy": 0.7993580244481564, "num_tokens": 13664933.0, "step": 40 }, { "entropy": 0.5203136764466763, "epoch": 0.41594454072790293, "grad_norm": 0.44140625, "learning_rate": 9.999541586764836e-06, "loss": 0.5293, "mean_token_accuracy": 0.8474333696067333, "num_tokens": 20500452.0, "step": 60 }, { "entropy": 0.4632941197603941, "epoch": 0.5545927209705372, "grad_norm": 0.34765625, "learning_rate": 9.799195340909569e-06, "loss": 0.4664, "mean_token_accuracy": 0.8605481564998627, "num_tokens": 27342335.0, "step": 80 }, { "entropy": 0.44385356418788435, "epoch": 0.6932409012131716, "grad_norm": 0.384765625, "learning_rate": 9.248987682898576e-06, "loss": 0.4448, "mean_token_accuracy": 0.8655192881822587, "num_tokens": 34182590.0, "step": 100 }, { "entropy": 0.44103220105171204, "epoch": 0.8318890814558059, "grad_norm": 0.341796875, "learning_rate": 8.389028759232816e-06, "loss": 0.4425, "mean_token_accuracy": 0.8660077638924122, "num_tokens": 41024570.0, "step": 120 }, { "entropy": 0.4328003875911236, "epoch": 0.9705372616984402, "grad_norm": 0.318359375, "learning_rate": 7.2820095883138456e-06, "loss": 0.4334, "mean_token_accuracy": 0.8682045668363572, "num_tokens": 47861377.0, "step": 140 }, { "entropy": 0.4284165660282234, "epoch": 1.1039861351819757, "grad_norm": 0.326171875, "learning_rate": 6.008631884264387e-06, "loss": 0.4289, "mean_token_accuracy": 0.868948469688366, "num_tokens": 54391813.0, "step": 160 }, { "entropy": 0.4236792534589767, "epoch": 1.24263431542461, "grad_norm": 0.341796875, "learning_rate": 4.661724900761355e-06, "loss": 0.4239, "mean_token_accuracy": 0.8704770557582379, "num_tokens": 61227970.0, "step": 180 }, { "entropy": 0.42442810237407685, "epoch": 1.3812824956672443, "grad_norm": 0.33984375, "learning_rate": 3.3394781770539406e-06, "loss": 0.4245, "mean_token_accuracy": 0.8702129699289799, "num_tokens": 68065726.0, "step": 200 }, { "entropy": 0.42482112273573874, "epoch": 1.5199306759098787, "grad_norm": 0.318359375, "learning_rate": 2.138283519083281e-06, "loss": 0.4249, "mean_token_accuracy": 0.8700512439012528, "num_tokens": 74903041.0, "step": 220 }, { "entropy": 0.4213219854980707, "epoch": 1.658578856152513, "grad_norm": 0.32421875, "learning_rate": 1.145708035387177e-06, "loss": 0.4219, "mean_token_accuracy": 0.8707552805542946, "num_tokens": 81743295.0, "step": 240 }, { "entropy": 0.422089908644557, "epoch": 1.7972270363951472, "grad_norm": 0.322265625, "learning_rate": 4.341104935775442e-07, "loss": 0.4229, "mean_token_accuracy": 0.8708024740219116, "num_tokens": 88577789.0, "step": 260 }, { "entropy": 0.4224289160221815, "epoch": 1.9358752166377817, "grad_norm": 0.322265625, "learning_rate": 5.536636509891225e-08, "loss": 0.4232, "mean_token_accuracy": 0.8705480195581913, "num_tokens": 95417722.0, "step": 280 }, { "entropy": 0.4213579764237275, "epoch": 2.0, "mean_token_accuracy": 0.8710838395196039, "num_tokens": 98528536.0, "step": 290, "total_flos": 2.1561577524323942e+18, "train_loss": 0.5024350297862086, "train_runtime": 9862.6956, "train_samples_per_second": 9.819, "train_steps_per_second": 0.029 } ], "logging_steps": 20, "max_steps": 290, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.1561577524323942e+18, "train_batch_size": 84, "trial_name": null, "trial_params": null }