{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0321455648218345,
  "eval_steps": 500,
  "global_step": 5000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.12130401819560273,
      "grad_norm": 1.3961113691329956,
      "learning_rate": 7.960000000000001e-05,
      "loss": 2.5572,
      "mean_token_accuracy": 0.5595218382775784,
      "num_tokens": 57035.0,
      "step": 200
    },
    {
      "epoch": 0.24260803639120546,
      "grad_norm": 1.2175310850143433,
      "learning_rate": 9.686315789473685e-05,
      "loss": 1.2853,
      "mean_token_accuracy": 0.745141789726913,
      "num_tokens": 112951.0,
      "step": 400
    },
    {
      "epoch": 0.3639120545868082,
      "grad_norm": 1.0547959804534912,
      "learning_rate": 9.265263157894736e-05,
      "loss": 1.1237,
      "mean_token_accuracy": 0.773681266605854,
      "num_tokens": 169775.0,
      "step": 600
    },
    {
      "epoch": 0.4852160727824109,
      "grad_norm": 1.3259146213531494,
      "learning_rate": 8.844210526315791e-05,
      "loss": 0.9634,
      "mean_token_accuracy": 0.8042044347897171,
      "num_tokens": 226409.0,
      "step": 800
    },
    {
      "epoch": 0.6065200909780136,
      "grad_norm": 1.0659151077270508,
      "learning_rate": 8.423157894736843e-05,
      "loss": 0.8763,
      "mean_token_accuracy": 0.8207557284086943,
      "num_tokens": 283376.0,
      "step": 1000
    },
    {
      "epoch": 0.7278241091736164,
      "grad_norm": 0.9711537957191467,
      "learning_rate": 8.002105263157896e-05,
      "loss": 0.7978,
      "mean_token_accuracy": 0.8341990520060062,
      "num_tokens": 339940.0,
      "step": 1200
    },
    {
      "epoch": 0.849128127369219,
      "grad_norm": 0.7150431275367737,
      "learning_rate": 7.581052631578947e-05,
      "loss": 0.758,
      "mean_token_accuracy": 0.840606311634183,
      "num_tokens": 397189.0,
      "step": 1400
    },
    {
      "epoch": 0.9704321455648218,
      "grad_norm": 1.0585309267044067,
      "learning_rate": 7.16e-05,
      "loss": 0.6963,
      "mean_token_accuracy": 0.8530387983471155,
      "num_tokens": 453903.0,
      "step": 1600
    },
    {
      "epoch": 1.0915845337376802,
      "grad_norm": 0.7791032791137695,
      "learning_rate": 6.738947368421052e-05,
      "loss": 0.6373,
      "mean_token_accuracy": 0.8606961415019889,
      "num_tokens": 510914.0,
      "step": 1800
    },
    {
      "epoch": 1.2128885519332828,
      "grad_norm": 0.8289880752563477,
      "learning_rate": 6.317894736842105e-05,
      "loss": 0.6328,
      "mean_token_accuracy": 0.8607474016025662,
      "num_tokens": 567262.0,
      "step": 2000
    },
    {
      "epoch": 1.3341925701288855,
      "grad_norm": 0.9581061601638794,
      "learning_rate": 5.8968421052631585e-05,
      "loss": 0.5861,
      "mean_token_accuracy": 0.8699268980324268,
      "num_tokens": 622785.0,
      "step": 2200
    },
    {
      "epoch": 1.4554965883244884,
      "grad_norm": 0.7406579852104187,
      "learning_rate": 5.475789473684211e-05,
      "loss": 0.5775,
      "mean_token_accuracy": 0.8712259716168046,
      "num_tokens": 678802.0,
      "step": 2400
    },
    {
      "epoch": 1.576800606520091,
      "grad_norm": 0.9062987565994263,
      "learning_rate": 5.054736842105263e-05,
      "loss": 0.5801,
      "mean_token_accuracy": 0.8699949059635401,
      "num_tokens": 735965.0,
      "step": 2600
    },
    {
      "epoch": 1.6981046247156937,
      "grad_norm": 1.3214945793151855,
      "learning_rate": 4.6336842105263164e-05,
      "loss": 0.5514,
      "mean_token_accuracy": 0.875830891802907,
      "num_tokens": 793098.0,
      "step": 2800
    },
    {
      "epoch": 1.8194086429112963,
      "grad_norm": 0.99712735414505,
      "learning_rate": 4.212631578947369e-05,
      "loss": 0.5216,
      "mean_token_accuracy": 0.8817874767631292,
      "num_tokens": 849394.0,
      "step": 3000
    },
    {
      "epoch": 1.9407126611068992,
      "grad_norm": 0.968995213508606,
      "learning_rate": 3.791578947368421e-05,
      "loss": 0.5068,
      "mean_token_accuracy": 0.8846843484044075,
      "num_tokens": 906768.0,
      "step": 3200
    },
    {
      "epoch": 2.0618650492797572,
      "grad_norm": 0.8303135633468628,
      "learning_rate": 3.370526315789474e-05,
      "loss": 0.4856,
      "mean_token_accuracy": 0.8882180935748677,
      "num_tokens": 964437.0,
      "step": 3400
    },
    {
      "epoch": 2.1831690674753603,
      "grad_norm": 1.1362717151641846,
      "learning_rate": 2.9494736842105264e-05,
      "loss": 0.4485,
      "mean_token_accuracy": 0.8932469403743744,
      "num_tokens": 1021649.0,
      "step": 3600
    },
    {
      "epoch": 2.304473085670963,
      "grad_norm": 0.8629305362701416,
      "learning_rate": 2.528421052631579e-05,
      "loss": 0.4515,
      "mean_token_accuracy": 0.8930351422727107,
      "num_tokens": 1078206.0,
      "step": 3800
    },
    {
      "epoch": 2.4257771038665656,
      "grad_norm": 1.0377343893051147,
      "learning_rate": 2.1073684210526316e-05,
      "loss": 0.4472,
      "mean_token_accuracy": 0.8934314188361168,
      "num_tokens": 1135187.0,
      "step": 4000
    },
    {
      "epoch": 2.5470811220621683,
      "grad_norm": 1.061483383178711,
      "learning_rate": 1.686315789473684e-05,
      "loss": 0.4325,
      "mean_token_accuracy": 0.8970406632125377,
      "num_tokens": 1192547.0,
      "step": 4200
    },
    {
      "epoch": 2.668385140257771,
      "grad_norm": 0.8136045336723328,
      "learning_rate": 1.2652631578947368e-05,
      "loss": 0.4435,
      "mean_token_accuracy": 0.8965679884701967,
      "num_tokens": 1249416.0,
      "step": 4400
    },
    {
      "epoch": 2.7896891584533736,
      "grad_norm": 1.3490817546844482,
      "learning_rate": 8.463157894736843e-06,
      "loss": 0.4244,
      "mean_token_accuracy": 0.9009497262537479,
      "num_tokens": 1304893.0,
      "step": 4600
    },
    {
      "epoch": 2.9109931766489767,
      "grad_norm": 0.9819340109825134,
      "learning_rate": 4.252631578947369e-06,
      "loss": 0.4261,
      "mean_token_accuracy": 0.8986875934898854,
      "num_tokens": 1361211.0,
      "step": 4800
    },
    {
      "epoch": 3.0321455648218345,
      "grad_norm": 0.7312269806861877,
      "learning_rate": 4.210526315789474e-08,
      "loss": 0.4095,
      "mean_token_accuracy": 0.9022044740869047,
      "num_tokens": 1418483.0,
      "step": 5000
    }
  ],
  "logging_steps": 200,
  "max_steps": 5000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 8666093203722240.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}