{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 7.0,
  "eval_steps": 500,
  "global_step": 112,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.32653061224489793,
      "grad_norm": 3.1376605701360365,
      "learning_rate": 1.3333333333333333e-05,
      "loss": 1.4086,
      "mean_token_accuracy": 0.6991279818117618,
      "num_tokens": 3032788.0,
      "step": 5
    },
    {
      "epoch": 0.6530612244897959,
      "grad_norm": 2.886401259073607,
      "learning_rate": 3.0000000000000004e-05,
      "loss": 0.7103,
      "mean_token_accuracy": 0.8366268374025821,
      "num_tokens": 6108050.0,
      "step": 10
    },
    {
      "epoch": 0.9795918367346939,
      "grad_norm": 3.8085762872209985,
      "learning_rate": 3.9960534568565436e-05,
      "loss": 0.4951,
      "mean_token_accuracy": 0.8857219524681568,
      "num_tokens": 9153481.0,
      "step": 15
    },
    {
      "epoch": 1.2612244897959184,
      "grad_norm": 0.7657669831409691,
      "learning_rate": 3.951833523877495e-05,
      "loss": 0.4221,
      "mean_token_accuracy": 0.8977213052735813,
      "num_tokens": 11636574.0,
      "step": 20
    },
    {
      "epoch": 1.5877551020408163,
      "grad_norm": 0.2877956756473211,
      "learning_rate": 3.859552971776503e-05,
      "loss": 0.4363,
      "mean_token_accuracy": 0.895506302267313,
      "num_tokens": 14619147.0,
      "step": 25
    },
    {
      "epoch": 1.9142857142857141,
      "grad_norm": 0.28656112123732563,
      "learning_rate": 3.721484054007888e-05,
      "loss": 0.3936,
      "mean_token_accuracy": 0.9049416035413742,
      "num_tokens": 17710131.0,
      "step": 30
    },
    {
      "epoch": 2.195918367346939,
      "grad_norm": 0.2823103236835904,
      "learning_rate": 3.541026485551579e-05,
      "loss": 0.3158,
      "mean_token_accuracy": 0.9200559068417203,
      "num_tokens": 20112186.0,
      "step": 35
    },
    {
      "epoch": 2.522448979591837,
      "grad_norm": 0.2655359257585452,
      "learning_rate": 3.322623730647304e-05,
      "loss": 0.3448,
      "mean_token_accuracy": 0.9147586159408092,
      "num_tokens": 23165264.0,
      "step": 40
    },
    {
      "epoch": 2.8489795918367347,
      "grad_norm": 0.20795518587914283,
      "learning_rate": 3.0716535899579936e-05,
      "loss": 0.2919,
      "mean_token_accuracy": 0.9269049897789955,
      "num_tokens": 26259224.0,
      "step": 45
    },
    {
      "epoch": 3.130612244897959,
      "grad_norm": 0.2884355032278423,
      "learning_rate": 2.7942957812695613e-05,
      "loss": 0.282,
      "mean_token_accuracy": 0.9283535532329393,
      "num_tokens": 28739676.0,
      "step": 50
    },
    {
      "epoch": 3.4571428571428573,
      "grad_norm": 0.21897940738440588,
      "learning_rate": 2.4973797743297103e-05,
      "loss": 0.2199,
      "mean_token_accuracy": 0.9427422039210797,
      "num_tokens": 31756738.0,
      "step": 55
    },
    {
      "epoch": 3.783673469387755,
      "grad_norm": 0.23860764801694623,
      "learning_rate": 2.1882166266370292e-05,
      "loss": 0.2215,
      "mean_token_accuracy": 0.9424595959484577,
      "num_tokens": 34883372.0,
      "step": 60
    },
    {
      "epoch": 4.0653061224489795,
      "grad_norm": 0.46858487289456846,
      "learning_rate": 1.8744189609413733e-05,
      "loss": 0.1972,
      "mean_token_accuracy": 0.9491587445355844,
      "num_tokens": 37296561.0,
      "step": 65
    },
    {
      "epoch": 4.391836734693878,
      "grad_norm": 0.2511271428576979,
      "learning_rate": 1.5637135172069155e-05,
      "loss": 0.1837,
      "mean_token_accuracy": 0.9537528082728386,
      "num_tokens": 40373787.0,
      "step": 70
    },
    {
      "epoch": 4.718367346938775,
      "grad_norm": 0.2967198621367196,
      "learning_rate": 1.2637508946306443e-05,
      "loss": 0.165,
      "mean_token_accuracy": 0.9580980874598026,
      "num_tokens": 43363161.0,
      "step": 75
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.23363519084392373,
      "learning_rate": 9.819171684992575e-06,
      "loss": 0.1312,
      "mean_token_accuracy": 0.9628054084985153,
      "num_tokens": 45860256.0,
      "step": 80
    },
    {
      "epoch": 5.326530612244898,
      "grad_norm": 0.23534750602445398,
      "learning_rate": 7.251520205026206e-06,
      "loss": 0.1269,
      "mean_token_accuracy": 0.967383312433958,
      "num_tokens": 48879333.0,
      "step": 85
    },
    {
      "epoch": 5.653061224489796,
      "grad_norm": 0.2799375041862015,
      "learning_rate": 4.997778607390809e-06,
      "loss": 0.1221,
      "mean_token_accuracy": 0.9683701202273369,
      "num_tokens": 51946379.0,
      "step": 90
    },
    {
      "epoch": 5.979591836734694,
      "grad_norm": 0.1875365707782748,
      "learning_rate": 3.1134414899597033e-06,
      "loss": 0.1279,
      "mean_token_accuracy": 0.9681350864470005,
      "num_tokens": 54995950.0,
      "step": 95
    },
    {
      "epoch": 6.261224489795918,
      "grad_norm": 0.2033347485797425,
      "learning_rate": 1.6449074863203773e-06,
      "loss": 0.1222,
      "mean_token_accuracy": 0.972901335660962,
      "num_tokens": 57369628.0,
      "step": 100
    },
    {
      "epoch": 6.587755102040816,
      "grad_norm": 0.1697720175472823,
      "learning_rate": 6.283367774273785e-07,
      "loss": 0.105,
      "mean_token_accuracy": 0.9732269406318664,
      "num_tokens": 60481208.0,
      "step": 105
    },
    {
      "epoch": 6.914285714285715,
      "grad_norm": 0.18026706851340263,
      "learning_rate": 8.876070793840008e-08,
      "loss": 0.0986,
      "mean_token_accuracy": 0.9752129040658474,
      "num_tokens": 63509425.0,
      "step": 110
    },
    {
      "epoch": 7.0,
      "mean_token_accuracy": 0.9745120831898281,
      "num_tokens": 64131339.0,
      "step": 112,
      "total_flos": 130497026588672.0,
      "train_loss": 0.3111057321407965,
      "train_runtime": 2546.477,
      "train_samples_per_second": 5.369,
      "train_steps_per_second": 0.044
    }
  ],
  "logging_steps": 5,
  "max_steps": 112,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 7,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 130497026588672.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}