{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 7.0, "eval_steps": 500, "global_step": 112, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.32653061224489793, "grad_norm": 3.1376605701360365, "learning_rate": 1.3333333333333333e-05, "loss": 1.4086, "mean_token_accuracy": 0.6991279818117618, "num_tokens": 3032788.0, "step": 5 }, { "epoch": 0.6530612244897959, "grad_norm": 2.886401259073607, "learning_rate": 3.0000000000000004e-05, "loss": 0.7103, "mean_token_accuracy": 0.8366268374025821, "num_tokens": 6108050.0, "step": 10 }, { "epoch": 0.9795918367346939, "grad_norm": 3.8085762872209985, "learning_rate": 3.9960534568565436e-05, "loss": 0.4951, "mean_token_accuracy": 0.8857219524681568, "num_tokens": 9153481.0, "step": 15 }, { "epoch": 1.2612244897959184, "grad_norm": 0.7657669831409691, "learning_rate": 3.951833523877495e-05, "loss": 0.4221, "mean_token_accuracy": 0.8977213052735813, "num_tokens": 11636574.0, "step": 20 }, { "epoch": 1.5877551020408163, "grad_norm": 0.2877956756473211, "learning_rate": 3.859552971776503e-05, "loss": 0.4363, "mean_token_accuracy": 0.895506302267313, "num_tokens": 14619147.0, "step": 25 }, { "epoch": 1.9142857142857141, "grad_norm": 0.28656112123732563, "learning_rate": 3.721484054007888e-05, "loss": 0.3936, "mean_token_accuracy": 0.9049416035413742, "num_tokens": 17710131.0, "step": 30 }, { "epoch": 2.195918367346939, "grad_norm": 0.2823103236835904, "learning_rate": 3.541026485551579e-05, "loss": 0.3158, "mean_token_accuracy": 0.9200559068417203, "num_tokens": 20112186.0, "step": 35 }, { "epoch": 2.522448979591837, "grad_norm": 0.2655359257585452, "learning_rate": 3.322623730647304e-05, "loss": 0.3448, "mean_token_accuracy": 0.9147586159408092, "num_tokens": 23165264.0, "step": 40 }, { "epoch": 2.8489795918367347, "grad_norm": 0.20795518587914283, "learning_rate": 3.0716535899579936e-05, "loss": 0.2919, "mean_token_accuracy": 0.9269049897789955, "num_tokens": 26259224.0, "step": 45 }, { "epoch": 3.130612244897959, "grad_norm": 0.2884355032278423, "learning_rate": 2.7942957812695613e-05, "loss": 0.282, "mean_token_accuracy": 0.9283535532329393, "num_tokens": 28739676.0, "step": 50 }, { "epoch": 3.4571428571428573, "grad_norm": 0.21897940738440588, "learning_rate": 2.4973797743297103e-05, "loss": 0.2199, "mean_token_accuracy": 0.9427422039210797, "num_tokens": 31756738.0, "step": 55 }, { "epoch": 3.783673469387755, "grad_norm": 0.23860764801694623, "learning_rate": 2.1882166266370292e-05, "loss": 0.2215, "mean_token_accuracy": 0.9424595959484577, "num_tokens": 34883372.0, "step": 60 }, { "epoch": 4.0653061224489795, "grad_norm": 0.46858487289456846, "learning_rate": 1.8744189609413733e-05, "loss": 0.1972, "mean_token_accuracy": 0.9491587445355844, "num_tokens": 37296561.0, "step": 65 }, { "epoch": 4.391836734693878, "grad_norm": 0.2511271428576979, "learning_rate": 1.5637135172069155e-05, "loss": 0.1837, "mean_token_accuracy": 0.9537528082728386, "num_tokens": 40373787.0, "step": 70 }, { "epoch": 4.718367346938775, "grad_norm": 0.2967198621367196, "learning_rate": 1.2637508946306443e-05, "loss": 0.165, "mean_token_accuracy": 0.9580980874598026, "num_tokens": 43363161.0, "step": 75 }, { "epoch": 5.0, "grad_norm": 0.23363519084392373, "learning_rate": 9.819171684992575e-06, "loss": 0.1312, "mean_token_accuracy": 0.9628054084985153, "num_tokens": 45860256.0, "step": 80 }, { "epoch": 5.326530612244898, "grad_norm": 0.23534750602445398, "learning_rate": 7.251520205026206e-06, "loss": 0.1269, "mean_token_accuracy": 0.967383312433958, "num_tokens": 48879333.0, "step": 85 }, { "epoch": 5.653061224489796, "grad_norm": 0.2799375041862015, "learning_rate": 4.997778607390809e-06, "loss": 0.1221, "mean_token_accuracy": 0.9683701202273369, "num_tokens": 51946379.0, "step": 90 }, { "epoch": 5.979591836734694, "grad_norm": 0.1875365707782748, "learning_rate": 3.1134414899597033e-06, "loss": 0.1279, "mean_token_accuracy": 0.9681350864470005, "num_tokens": 54995950.0, "step": 95 }, { "epoch": 6.261224489795918, "grad_norm": 0.2033347485797425, "learning_rate": 1.6449074863203773e-06, "loss": 0.1222, "mean_token_accuracy": 0.972901335660962, "num_tokens": 57369628.0, "step": 100 }, { "epoch": 6.587755102040816, "grad_norm": 0.1697720175472823, "learning_rate": 6.283367774273785e-07, "loss": 0.105, "mean_token_accuracy": 0.9732269406318664, "num_tokens": 60481208.0, "step": 105 }, { "epoch": 6.914285714285715, "grad_norm": 0.18026706851340263, "learning_rate": 8.876070793840008e-08, "loss": 0.0986, "mean_token_accuracy": 0.9752129040658474, "num_tokens": 63509425.0, "step": 110 }, { "epoch": 7.0, "mean_token_accuracy": 0.9745120831898281, "num_tokens": 64131339.0, "step": 112, "total_flos": 130497026588672.0, "train_loss": 0.3111057321407965, "train_runtime": 2546.477, "train_samples_per_second": 5.369, "train_steps_per_second": 0.044 } ], "logging_steps": 5, "max_steps": 112, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 130497026588672.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }