{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 32.0,
  "eval_steps": 500,
  "global_step": 128,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.25,
      "grad_norm": 5.432454113508084,
      "learning_rate": 1e-05,
      "loss": 0.5341,
      "mean_token_accuracy": 0.8456295132637024,
      "num_tokens": 39650.0,
      "step": 1
    },
    {
      "epoch": 2.5,
      "grad_norm": 1.6916586628500958,
      "learning_rate": 1e-05,
      "loss": 0.3018,
      "mean_token_accuracy": 0.9054504566722446,
      "num_tokens": 403007.0,
      "step": 10
    },
    {
      "epoch": 5.0,
      "grad_norm": 1.0267171905460786,
      "learning_rate": 1e-05,
      "loss": 0.0492,
      "mean_token_accuracy": 0.9846615970134736,
      "num_tokens": 807665.0,
      "step": 20
    },
    {
      "epoch": 7.5,
      "grad_norm": 0.494232528957775,
      "learning_rate": 1e-05,
      "loss": 0.0051,
      "mean_token_accuracy": 0.9987698495388031,
      "num_tokens": 1211294.0,
      "step": 30
    },
    {
      "epoch": 10.0,
      "grad_norm": 0.1640580142572581,
      "learning_rate": 1e-05,
      "loss": 0.0011,
      "mean_token_accuracy": 0.9997386157512664,
      "num_tokens": 1615330.0,
      "step": 40
    },
    {
      "epoch": 12.5,
      "grad_norm": 0.08988346242125522,
      "learning_rate": 1e-05,
      "loss": 0.0005,
      "mean_token_accuracy": 0.9999325215816498,
      "num_tokens": 2020196.0,
      "step": 50
    },
    {
      "epoch": 15.0,
      "grad_norm": 0.02414932533649237,
      "learning_rate": 1e-05,
      "loss": 0.0003,
      "mean_token_accuracy": 0.9999617099761963,
      "num_tokens": 2422995.0,
      "step": 60
    },
    {
      "epoch": 17.5,
      "grad_norm": 0.009971544544364334,
      "learning_rate": 1e-05,
      "loss": 0.0003,
      "mean_token_accuracy": 0.9999676525592804,
      "num_tokens": 2827840.0,
      "step": 70
    },
    {
      "epoch": 20.0,
      "grad_norm": 0.050124298249891744,
      "learning_rate": 1e-05,
      "loss": 0.0001,
      "mean_token_accuracy": 0.9999841094017029,
      "num_tokens": 3230660.0,
      "step": 80
    },
    {
      "epoch": 22.5,
      "grad_norm": 0.07029586320018262,
      "learning_rate": 1e-05,
      "loss": 0.0002,
      "mean_token_accuracy": 0.9999815821647644,
      "num_tokens": 3634201.0,
      "step": 90
    },
    {
      "epoch": 25.0,
      "grad_norm": 0.006717185180549847,
      "learning_rate": 1e-05,
      "loss": 0.0001,
      "mean_token_accuracy": 0.9999877035617828,
      "num_tokens": 4038325.0,
      "step": 100
    },
    {
      "epoch": 27.5,
      "grad_norm": 0.03058218632284509,
      "learning_rate": 1e-05,
      "loss": 0.0001,
      "mean_token_accuracy": 0.9999842464923858,
      "num_tokens": 4441308.0,
      "step": 110
    },
    {
      "epoch": 30.0,
      "grad_norm": 0.17009886624050463,
      "learning_rate": 1e-05,
      "loss": 0.0001,
      "mean_token_accuracy": 0.9999807775020599,
      "num_tokens": 4845990.0,
      "step": 120
    }
  ],
  "logging_steps": 10,
  "max_steps": 256,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 64,
  "save_steps": 64,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 11450386022400.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}