{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 10, "global_step": 90, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05578800557880056, "grad_norm": 23.749076225013393, "learning_rate": 8.888888888888888e-07, "loss": 0.3953, "mean_token_accuracy": 0.8888508647680282, "num_tokens": 387743.0, "step": 5 }, { "epoch": 0.11157601115760112, "grad_norm": 2.3167233631586432, "learning_rate": 2e-06, "loss": 0.1526, "mean_token_accuracy": 0.9280575379729271, "num_tokens": 771256.0, "step": 10 }, { "epoch": 0.11157601115760112, "eval_loss": 0.13449975848197937, "eval_mean_token_accuracy": 0.9239859104156494, "eval_num_tokens": 771256.0, "eval_runtime": 18.2782, "eval_samples_per_second": 13.13, "eval_steps_per_second": 0.821, "step": 10 }, { "epoch": 0.16736401673640167, "grad_norm": 0.7338062527338681, "learning_rate": 1.9812553106273845e-06, "loss": 0.1264, "mean_token_accuracy": 0.9322391495108604, "num_tokens": 1160980.0, "step": 15 }, { "epoch": 0.22315202231520223, "grad_norm": 0.3879503394325157, "learning_rate": 1.9257239692688907e-06, "loss": 0.1212, "mean_token_accuracy": 0.9351256564259529, "num_tokens": 1550715.0, "step": 20 }, { "epoch": 0.22315202231520223, "eval_loss": 0.12052115052938461, "eval_mean_token_accuracy": 0.9348681171735128, "eval_num_tokens": 1550715.0, "eval_runtime": 18.2138, "eval_samples_per_second": 13.177, "eval_steps_per_second": 0.824, "step": 20 }, { "epoch": 0.2789400278940028, "grad_norm": 0.5297134638859874, "learning_rate": 1.8354878114129364e-06, "loss": 0.1193, "mean_token_accuracy": 0.9359175354242325, "num_tokens": 1941126.0, "step": 25 }, { "epoch": 0.33472803347280333, "grad_norm": 0.47179195608480895, "learning_rate": 1.7139297345578992e-06, "loss": 0.1169, "mean_token_accuracy": 0.9379747390747071, "num_tokens": 2329529.0, "step": 30 }, { "epoch": 0.33472803347280333, "eval_loss": 0.11508604884147644, "eval_mean_token_accuracy": 0.9393269936243693, "eval_num_tokens": 2329529.0, "eval_runtime": 18.2459, "eval_samples_per_second": 13.154, "eval_steps_per_second": 0.822, "step": 30 }, { "epoch": 0.3905160390516039, "grad_norm": 0.37176199793362424, "learning_rate": 1.5656068754865386e-06, "loss": 0.1142, "mean_token_accuracy": 0.939566570520401, "num_tokens": 2717665.0, "step": 35 }, { "epoch": 0.44630404463040446, "grad_norm": 0.3499240896523672, "learning_rate": 1.3960797660391568e-06, "loss": 0.1129, "mean_token_accuracy": 0.9405360326170922, "num_tokens": 3103315.0, "step": 40 }, { "epoch": 0.44630404463040446, "eval_loss": 0.11198202520608902, "eval_mean_token_accuracy": 0.9417382756868998, "eval_num_tokens": 3103315.0, "eval_runtime": 18.2284, "eval_samples_per_second": 13.166, "eval_steps_per_second": 0.823, "step": 40 }, { "epoch": 0.502092050209205, "grad_norm": 0.2957821687268926, "learning_rate": 1.2117038722294109e-06, "loss": 0.1115, "mean_token_accuracy": 0.9413685530424118, "num_tokens": 3495146.0, "step": 45 }, { "epoch": 0.5578800557880056, "grad_norm": 0.46882385041167457, "learning_rate": 1.0193913317718244e-06, "loss": 0.11, "mean_token_accuracy": 0.9428008124232292, "num_tokens": 3881790.0, "step": 50 }, { "epoch": 0.5578800557880056, "eval_loss": 0.10956163704395294, "eval_mean_token_accuracy": 0.9426836887995402, "eval_num_tokens": 3881790.0, "eval_runtime": 18.215, "eval_samples_per_second": 13.176, "eval_steps_per_second": 0.823, "step": 50 }, { "epoch": 0.6136680613668062, "grad_norm": 0.4984955096633512, "learning_rate": 8.263518223330696e-07, "loss": 0.1095, "mean_token_accuracy": 0.942644701898098, "num_tokens": 4268488.0, "step": 55 }, { "epoch": 0.6694560669456067, "grad_norm": 0.4556490483768333, "learning_rate": 6.398222751952897e-07, "loss": 0.1086, "mean_token_accuracy": 0.9422503650188446, "num_tokens": 4655213.0, "step": 60 }, { "epoch": 0.6694560669456067, "eval_loss": 0.10714904963970184, "eval_mean_token_accuracy": 0.9433511217435201, "eval_num_tokens": 4655213.0, "eval_runtime": 18.1639, "eval_samples_per_second": 13.213, "eval_steps_per_second": 0.826, "step": 60 }, { "epoch": 0.7252440725244073, "grad_norm": 0.4314372816991459, "learning_rate": 4.667955671983089e-07, "loss": 0.1073, "mean_token_accuracy": 0.9441392824053765, "num_tokens": 5045504.0, "step": 65 }, { "epoch": 0.7810320781032078, "grad_norm": 0.354993884475141, "learning_rate": 3.137583621312665e-07, "loss": 0.1066, "mean_token_accuracy": 0.9438454762101174, "num_tokens": 5430975.0, "step": 70 }, { "epoch": 0.7810320781032078, "eval_loss": 0.10607162117958069, "eval_mean_token_accuracy": 0.9440902670224508, "eval_num_tokens": 5430975.0, "eval_runtime": 18.21, "eval_samples_per_second": 13.18, "eval_steps_per_second": 0.824, "step": 70 }, { "epoch": 0.8368200836820083, "grad_norm": 0.32832697342800965, "learning_rate": 1.864479297370325e-07, "loss": 0.1056, "mean_token_accuracy": 0.9443844795227051, "num_tokens": 5817145.0, "step": 75 }, { "epoch": 0.8926080892608089, "grad_norm": 0.35886626129455507, "learning_rate": 8.963705903385343e-08, "loss": 0.1056, "mean_token_accuracy": 0.9448912084102631, "num_tokens": 6205189.0, "step": 80 }, { "epoch": 0.8926080892608089, "eval_loss": 0.10541822016239166, "eval_mean_token_accuracy": 0.9442455053329468, "eval_num_tokens": 6205189.0, "eval_runtime": 18.2066, "eval_samples_per_second": 13.182, "eval_steps_per_second": 0.824, "step": 80 }, { "epoch": 0.9483960948396095, "grad_norm": 0.40190877094501176, "learning_rate": 2.6955129420176193e-08, "loss": 0.1057, "mean_token_accuracy": 0.9443619295954704, "num_tokens": 6593589.0, "step": 85 }, { "epoch": 1.0, "grad_norm": 0.5222516935507332, "learning_rate": 7.520474957699585e-10, "loss": 0.106, "mean_token_accuracy": 0.9445104276811754, "num_tokens": 6926562.0, "step": 90 }, { "epoch": 1.0, "eval_loss": 0.10521233826875687, "eval_mean_token_accuracy": 0.9448104937871297, "eval_num_tokens": 6926562.0, "eval_runtime": 18.1993, "eval_samples_per_second": 13.187, "eval_steps_per_second": 0.824, "step": 90 }, { "epoch": 1.0, "step": 90, "total_flos": 27072329023488.0, "train_loss": 0.12974326941702102, "train_runtime": 2960.6052, "train_samples_per_second": 3.871, "train_steps_per_second": 0.03 } ], "logging_steps": 5, "max_steps": 90, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 27072329023488.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }