{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 7.0, "eval_steps": 500, "global_step": 105, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.3463203463203463, "grad_norm": 9.76123928801732, "learning_rate": 7.272727272727273e-06, "loss": 1.7913, "mean_token_accuracy": 0.6742337845265866, "num_tokens": 3541090.0, "step": 5 }, { "epoch": 0.6926406926406926, "grad_norm": 1.7292229461629567, "learning_rate": 1.6363636363636366e-05, "loss": 1.1842, "mean_token_accuracy": 0.7258017435669899, "num_tokens": 6936595.0, "step": 10 }, { "epoch": 1.0, "grad_norm": 2.958735263554765, "learning_rate": 1.994977815088504e-05, "loss": 0.6688, "mean_token_accuracy": 0.8406233997412131, "num_tokens": 9820463.0, "step": 15 }, { "epoch": 1.3463203463203464, "grad_norm": 0.6172390521588779, "learning_rate": 1.964469175054377e-05, "loss": 0.5533, "mean_token_accuracy": 0.8736346215009689, "num_tokens": 13203490.0, "step": 20 }, { "epoch": 1.6926406926406927, "grad_norm": 0.1996301013239303, "learning_rate": 1.907090913734341e-05, "loss": 0.5113, "mean_token_accuracy": 0.8854866914451123, "num_tokens": 16677343.0, "step": 25 }, { "epoch": 2.0, "grad_norm": 0.4762125617018596, "learning_rate": 1.8244415603417603e-05, "loss": 0.557, "mean_token_accuracy": 0.8772783690774945, "num_tokens": 19657455.0, "step": 30 }, { "epoch": 2.346320346320346, "grad_norm": 0.17061055880750653, "learning_rate": 1.7188236838779297e-05, "loss": 0.4643, "mean_token_accuracy": 0.8905399434268475, "num_tokens": 23134171.0, "step": 35 }, { "epoch": 2.6926406926406927, "grad_norm": 0.17654122424112745, "learning_rate": 1.5931797447293553e-05, "loss": 0.4953, "mean_token_accuracy": 0.8895446695387363, "num_tokens": 26513134.0, "step": 40 }, { "epoch": 3.0, "grad_norm": 0.4198550622011698, "learning_rate": 1.451010119216102e-05, "loss": 0.5039, "mean_token_accuracy": 0.8886955749820655, "num_tokens": 29483157.0, "step": 45 }, { "epoch": 3.346320346320346, "grad_norm": 0.18028115485047716, "learning_rate": 1.2962755808856341e-05, "loss": 0.4518, "mean_token_accuracy": 0.9004578970372676, "num_tokens": 32726053.0, "step": 50 }, { "epoch": 3.6926406926406927, "grad_norm": 0.18706797363551947, "learning_rate": 1.133286955373779e-05, "loss": 0.46, "mean_token_accuracy": 0.8930317558348179, "num_tokens": 36258177.0, "step": 55 }, { "epoch": 4.0, "grad_norm": 0.15856177406249233, "learning_rate": 9.665850229923258e-06, "loss": 0.4212, "mean_token_accuracy": 0.8997706342750872, "num_tokens": 39301701.0, "step": 60 }, { "epoch": 4.346320346320346, "grad_norm": 0.17718136029490175, "learning_rate": 8.008140148961642e-06, "loss": 0.4743, "mean_token_accuracy": 0.8929261632263661, "num_tokens": 42943792.0, "step": 65 }, { "epoch": 4.692640692640692, "grad_norm": 0.17848902736372635, "learning_rate": 6.405922271624874e-06, "loss": 0.4136, "mean_token_accuracy": 0.9024837173521518, "num_tokens": 46472250.0, "step": 70 }, { "epoch": 5.0, "grad_norm": 0.2562768112814367, "learning_rate": 4.903833574080825e-06, "loss": 0.3746, "mean_token_accuracy": 0.9082856144703609, "num_tokens": 49176637.0, "step": 75 }, { "epoch": 5.346320346320346, "grad_norm": 0.17829278326006745, "learning_rate": 3.543721484411976e-06, "loss": 0.4164, "mean_token_accuracy": 0.9040168270468711, "num_tokens": 52544587.0, "step": 80 }, { "epoch": 5.692640692640692, "grad_norm": 0.159769199751552, "learning_rate": 2.3634780345266805e-06, "loss": 0.4005, "mean_token_accuracy": 0.9068072281777859, "num_tokens": 56073263.0, "step": 85 }, { "epoch": 6.0, "grad_norm": 0.17497872253097468, "learning_rate": 1.3959842073986085e-06, "loss": 0.3873, "mean_token_accuracy": 0.907542069193343, "num_tokens": 59025335.0, "step": 90 }, { "epoch": 6.346320346320346, "grad_norm": 0.16214766397656102, "learning_rate": 6.681938895839746e-07, "loss": 0.421, "mean_token_accuracy": 0.9028209887444973, "num_tokens": 62643177.0, "step": 95 }, { "epoch": 6.692640692640692, "grad_norm": 0.17433731599885693, "learning_rate": 2.0038294963413251e-07, "loss": 0.3946, "mean_token_accuracy": 0.9086535051465034, "num_tokens": 66032974.0, "step": 100 }, { "epoch": 7.0, "grad_norm": 0.26625567760062313, "learning_rate": 5.584362697453882e-09, "loss": 0.3765, "mean_token_accuracy": 0.910740121149681, "num_tokens": 68862598.0, "step": 105 }, { "epoch": 7.0, "step": 105, "total_flos": 100971315986432.0, "train_loss": 0.5581429311207362, "train_runtime": 2344.1731, "train_samples_per_second": 5.509, "train_steps_per_second": 0.045 } ], "logging_steps": 5, "max_steps": 105, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 100971315986432.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }