{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.056338028169014, "eval_steps": 500, "global_step": 1200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0880281690140845, "grad_norm": 1525.1461181640625, "learning_rate": 4.166666666666667e-06, "loss": 134.6262, "step": 100 }, { "epoch": 0.176056338028169, "grad_norm": 2871.249755859375, "learning_rate": 4.62962962962963e-06, "loss": 106.7172, "step": 200 }, { "epoch": 0.2640845070422535, "grad_norm": 492.1271667480469, "learning_rate": 4.166666666666667e-06, "loss": 102.1769, "step": 300 }, { "epoch": 0.352112676056338, "grad_norm": 1749.6070556640625, "learning_rate": 3.7037037037037037e-06, "loss": 106.4214, "step": 400 }, { "epoch": 0.44014084507042256, "grad_norm": 853.5518798828125, "learning_rate": 3.240740740740741e-06, "loss": 92.0699, "step": 500 }, { "epoch": 0.528169014084507, "grad_norm": 2185.8486328125, "learning_rate": 2.7777777777777783e-06, "loss": 90.0462, "step": 600 }, { "epoch": 0.6161971830985915, "grad_norm": 411.7020263671875, "learning_rate": 2.314814814814815e-06, "loss": 93.0463, "step": 700 }, { "epoch": 0.704225352112676, "grad_norm": 764.4384155273438, "learning_rate": 1.8518518518518519e-06, "loss": 86.6424, "step": 800 }, { "epoch": 0.7922535211267606, "grad_norm": 3891.708251953125, "learning_rate": 1.3888888888888892e-06, "loss": 78.9075, "step": 900 }, { "epoch": 0.8802816901408451, "grad_norm": 1514.9061279296875, "learning_rate": 9.259259259259259e-07, "loss": 79.3982, "step": 1000 }, { "epoch": 0.9683098591549296, "grad_norm": 1521.738037109375, "learning_rate": 4.6296296296296297e-07, "loss": 82.1088, "step": 1100 }, { "epoch": 1.0, "eval_loss": 197.31683349609375, "eval_runtime": 13.0547, "eval_samples_per_second": 77.367, "eval_steps_per_second": 9.728, "step": 1136 }, { "epoch": 1.056338028169014, "grad_norm": 260.0885925292969, "learning_rate": 0.0, "loss": 75.2076, "step": 1200 } ], "logging_steps": 100, "max_steps": 1200, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 600, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }