{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.0,
  "eval_steps": 500,
  "global_step": 27270,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.18335166850018336,
      "grad_norm": 3.5391387939453125,
      "learning_rate": 4.908324165749908e-05,
      "loss": 2.4276,
      "num_input_tokens_seen": 1750768,
      "step": 500
    },
    {
      "epoch": 0.3667033370003667,
      "grad_norm": 3.4214749336242676,
      "learning_rate": 4.816648331499817e-05,
      "loss": 2.2532,
      "num_input_tokens_seen": 3485632,
      "step": 1000
    },
    {
      "epoch": 0.5500550055005501,
      "grad_norm": 3.533691167831421,
      "learning_rate": 4.724972497249725e-05,
      "loss": 2.1894,
      "num_input_tokens_seen": 5230688,
      "step": 1500
    },
    {
      "epoch": 0.7334066740007334,
      "grad_norm": 3.7089884281158447,
      "learning_rate": 4.633296662999633e-05,
      "loss": 2.1511,
      "num_input_tokens_seen": 6971344,
      "step": 2000
    },
    {
      "epoch": 0.9167583425009168,
      "grad_norm": 4.088582515716553,
      "learning_rate": 4.541620828749542e-05,
      "loss": 2.1089,
      "num_input_tokens_seen": 8738136,
      "step": 2500
    },
    {
      "epoch": 1.1001100110011002,
      "grad_norm": 4.8249077796936035,
      "learning_rate": 4.449944994499451e-05,
      "loss": 2.0594,
      "num_input_tokens_seen": 10466350,
      "step": 3000
    },
    {
      "epoch": 1.2834616795012834,
      "grad_norm": 3.9551169872283936,
      "learning_rate": 4.358269160249359e-05,
      "loss": 2.0194,
      "num_input_tokens_seen": 12222070,
      "step": 3500
    },
    {
      "epoch": 1.466813348001467,
      "grad_norm": 3.0416815280914307,
      "learning_rate": 4.266593325999267e-05,
      "loss": 2.0019,
      "num_input_tokens_seen": 13976918,
      "step": 4000
    },
    {
      "epoch": 1.6501650165016502,
      "grad_norm": 3.295426607131958,
      "learning_rate": 4.174917491749175e-05,
      "loss": 2.0024,
      "num_input_tokens_seen": 15721702,
      "step": 4500
    },
    {
      "epoch": 1.8335166850018334,
      "grad_norm": 4.8525309562683105,
      "learning_rate": 4.0832416574990836e-05,
      "loss": 1.9935,
      "num_input_tokens_seen": 17458590,
      "step": 5000
    },
    {
      "epoch": 2.0168683535020167,
      "grad_norm": 4.256695747375488,
      "learning_rate": 3.991565823248992e-05,
      "loss": 1.9769,
      "num_input_tokens_seen": 19188010,
      "step": 5500
    },
    {
      "epoch": 2.2002200220022003,
      "grad_norm": 4.129441738128662,
      "learning_rate": 3.8998899889989e-05,
      "loss": 1.9108,
      "num_input_tokens_seen": 20932210,
      "step": 6000
    },
    {
      "epoch": 2.3835716905023836,
      "grad_norm": 2.544461250305176,
      "learning_rate": 3.808214154748808e-05,
      "loss": 1.9047,
      "num_input_tokens_seen": 22658578,
      "step": 6500
    },
    {
      "epoch": 2.566923359002567,
      "grad_norm": 4.752838611602783,
      "learning_rate": 3.716538320498717e-05,
      "loss": 1.9119,
      "num_input_tokens_seen": 24411482,
      "step": 7000
    },
    {
      "epoch": 2.7502750275027505,
      "grad_norm": 4.965038776397705,
      "learning_rate": 3.624862486248625e-05,
      "loss": 1.8986,
      "num_input_tokens_seen": 26157770,
      "step": 7500
    },
    {
      "epoch": 2.933626696002934,
      "grad_norm": 4.416258335113525,
      "learning_rate": 3.5331866519985334e-05,
      "loss": 1.9086,
      "num_input_tokens_seen": 27912394,
      "step": 8000
    },
    {
      "epoch": 3.116978364503117,
      "grad_norm": 3.501598596572876,
      "learning_rate": 3.4415108177484414e-05,
      "loss": 1.868,
      "num_input_tokens_seen": 29671328,
      "step": 8500
    },
    {
      "epoch": 3.3003300330033003,
      "grad_norm": 3.8959696292877197,
      "learning_rate": 3.34983498349835e-05,
      "loss": 1.8465,
      "num_input_tokens_seen": 31405544,
      "step": 9000
    },
    {
      "epoch": 3.4836817015034836,
      "grad_norm": 3.5625758171081543,
      "learning_rate": 3.258159149248258e-05,
      "loss": 1.8463,
      "num_input_tokens_seen": 33146784,
      "step": 9500
    },
    {
      "epoch": 3.667033370003667,
      "grad_norm": 3.303110122680664,
      "learning_rate": 3.166483314998166e-05,
      "loss": 1.8394,
      "num_input_tokens_seen": 34888072,
      "step": 10000
    },
    {
      "epoch": 3.8503850385038505,
      "grad_norm": 3.5172908306121826,
      "learning_rate": 3.074807480748075e-05,
      "loss": 1.8379,
      "num_input_tokens_seen": 36645960,
      "step": 10500
    },
    {
      "epoch": 4.033736707004033,
      "grad_norm": 4.386786460876465,
      "learning_rate": 2.983131646497983e-05,
      "loss": 1.8245,
      "num_input_tokens_seen": 38388631,
      "step": 11000
    },
    {
      "epoch": 4.2170883755042174,
      "grad_norm": 3.2586567401885986,
      "learning_rate": 2.891455812247892e-05,
      "loss": 1.8029,
      "num_input_tokens_seen": 40139079,
      "step": 11500
    },
    {
      "epoch": 4.400440044004401,
      "grad_norm": 3.6384007930755615,
      "learning_rate": 2.7997799779978003e-05,
      "loss": 1.7909,
      "num_input_tokens_seen": 41872751,
      "step": 12000
    },
    {
      "epoch": 4.583791712504584,
      "grad_norm": 4.475183486938477,
      "learning_rate": 2.7081041437477084e-05,
      "loss": 1.791,
      "num_input_tokens_seen": 43618911,
      "step": 12500
    },
    {
      "epoch": 4.767143381004767,
      "grad_norm": 4.72713041305542,
      "learning_rate": 2.6164283094976168e-05,
      "loss": 1.7745,
      "num_input_tokens_seen": 45373143,
      "step": 13000
    },
    {
      "epoch": 4.9504950495049505,
      "grad_norm": 3.3076839447021484,
      "learning_rate": 2.5247524752475248e-05,
      "loss": 1.7968,
      "num_input_tokens_seen": 47112151,
      "step": 13500
    },
    {
      "epoch": 5.133846718005134,
      "grad_norm": 4.046383857727051,
      "learning_rate": 2.4330766409974332e-05,
      "loss": 1.7611,
      "num_input_tokens_seen": 48852751,
      "step": 14000
    },
    {
      "epoch": 5.317198386505317,
      "grad_norm": 3.291144609451294,
      "learning_rate": 2.3414008067473413e-05,
      "loss": 1.7363,
      "num_input_tokens_seen": 50602567,
      "step": 14500
    },
    {
      "epoch": 5.5005500550055,
      "grad_norm": 4.23388671875,
      "learning_rate": 2.24972497249725e-05,
      "loss": 1.7814,
      "num_input_tokens_seen": 52369863,
      "step": 15000
    },
    {
      "epoch": 5.683901723505684,
      "grad_norm": 3.1835505962371826,
      "learning_rate": 2.158049138247158e-05,
      "loss": 1.751,
      "num_input_tokens_seen": 54115983,
      "step": 15500
    },
    {
      "epoch": 5.867253392005868,
      "grad_norm": 3.593493938446045,
      "learning_rate": 2.0663733039970665e-05,
      "loss": 1.7481,
      "num_input_tokens_seen": 55853919,
      "step": 16000
    },
    {
      "epoch": 6.050605060506051,
      "grad_norm": 4.3933258056640625,
      "learning_rate": 1.9746974697469746e-05,
      "loss": 1.7506,
      "num_input_tokens_seen": 57581239,
      "step": 16500
    },
    {
      "epoch": 6.233956729006234,
      "grad_norm": 3.6081910133361816,
      "learning_rate": 1.883021635496883e-05,
      "loss": 1.7294,
      "num_input_tokens_seen": 59313735,
      "step": 17000
    },
    {
      "epoch": 6.417308397506417,
      "grad_norm": 3.7784392833709717,
      "learning_rate": 1.7913458012467914e-05,
      "loss": 1.719,
      "num_input_tokens_seen": 61061911,
      "step": 17500
    },
    {
      "epoch": 6.600660066006601,
      "grad_norm": 3.5482571125030518,
      "learning_rate": 1.6996699669966998e-05,
      "loss": 1.7184,
      "num_input_tokens_seen": 62802279,
      "step": 18000
    },
    {
      "epoch": 6.784011734506784,
      "grad_norm": 3.797348737716675,
      "learning_rate": 1.6079941327466082e-05,
      "loss": 1.7101,
      "num_input_tokens_seen": 64536303,
      "step": 18500
    },
    {
      "epoch": 6.967363403006967,
      "grad_norm": 3.9275312423706055,
      "learning_rate": 1.5163182984965163e-05,
      "loss": 1.7153,
      "num_input_tokens_seen": 66282967,
      "step": 19000
    },
    {
      "epoch": 7.15071507150715,
      "grad_norm": 3.65077805519104,
      "learning_rate": 1.4246424642464248e-05,
      "loss": 1.7181,
      "num_input_tokens_seen": 68030296,
      "step": 19500
    },
    {
      "epoch": 7.334066740007334,
      "grad_norm": 4.696651458740234,
      "learning_rate": 1.3329666299963331e-05,
      "loss": 1.6992,
      "num_input_tokens_seen": 69767824,
      "step": 20000
    },
    {
      "epoch": 7.517418408507518,
      "grad_norm": 5.405508518218994,
      "learning_rate": 1.2412907957462413e-05,
      "loss": 1.6903,
      "num_input_tokens_seen": 71509128,
      "step": 20500
    },
    {
      "epoch": 7.700770077007701,
      "grad_norm": 3.7343809604644775,
      "learning_rate": 1.1496149614961496e-05,
      "loss": 1.7019,
      "num_input_tokens_seen": 73255224,
      "step": 21000
    },
    {
      "epoch": 7.884121745507884,
      "grad_norm": 4.133444786071777,
      "learning_rate": 1.057939127246058e-05,
      "loss": 1.6959,
      "num_input_tokens_seen": 75002496,
      "step": 21500
    },
    {
      "epoch": 8.067473414008067,
      "grad_norm": 4.398416996002197,
      "learning_rate": 9.662632929959662e-06,
      "loss": 1.7018,
      "num_input_tokens_seen": 76756073,
      "step": 22000
    },
    {
      "epoch": 8.250825082508252,
      "grad_norm": 4.565046310424805,
      "learning_rate": 8.745874587458746e-06,
      "loss": 1.6837,
      "num_input_tokens_seen": 78483465,
      "step": 22500
    },
    {
      "epoch": 8.434176751008435,
      "grad_norm": 3.950497627258301,
      "learning_rate": 7.829116244957828e-06,
      "loss": 1.6913,
      "num_input_tokens_seen": 80220865,
      "step": 23000
    },
    {
      "epoch": 8.617528419508618,
      "grad_norm": 3.9700405597686768,
      "learning_rate": 6.912357902456913e-06,
      "loss": 1.6814,
      "num_input_tokens_seen": 81964649,
      "step": 23500
    },
    {
      "epoch": 8.800880088008801,
      "grad_norm": 3.21114444732666,
      "learning_rate": 5.995599559955996e-06,
      "loss": 1.689,
      "num_input_tokens_seen": 83718889,
      "step": 24000
    },
    {
      "epoch": 8.984231756508985,
      "grad_norm": 3.5966849327087402,
      "learning_rate": 5.078841217455079e-06,
      "loss": 1.6734,
      "num_input_tokens_seen": 85471529,
      "step": 24500
    },
    {
      "epoch": 9.167583425009168,
      "grad_norm": 3.4596688747406006,
      "learning_rate": 4.162082874954162e-06,
      "loss": 1.6792,
      "num_input_tokens_seen": 87214771,
      "step": 25000
    },
    {
      "epoch": 9.350935093509351,
      "grad_norm": 3.9838054180145264,
      "learning_rate": 3.2453245324532458e-06,
      "loss": 1.6583,
      "num_input_tokens_seen": 88949475,
      "step": 25500
    },
    {
      "epoch": 9.534286762009534,
      "grad_norm": 3.389430522918701,
      "learning_rate": 2.3285661899523286e-06,
      "loss": 1.6836,
      "num_input_tokens_seen": 90694267,
      "step": 26000
    },
    {
      "epoch": 9.717638430509718,
      "grad_norm": 4.560466289520264,
      "learning_rate": 1.411807847451412e-06,
      "loss": 1.6804,
      "num_input_tokens_seen": 92441267,
      "step": 26500
    },
    {
      "epoch": 9.900990099009901,
      "grad_norm": 4.484193325042725,
      "learning_rate": 4.950495049504951e-07,
      "loss": 1.6876,
      "num_input_tokens_seen": 94186835,
      "step": 27000
    },
    {
      "epoch": 10.0,
      "num_input_tokens_seen": 95128823,
      "step": 27270,
      "total_flos": 3.4538173670639616e+16,
      "train_loss": 1.8293144167322006,
      "train_runtime": 2454.5506,
      "train_samples_per_second": 88.859,
      "train_steps_per_second": 11.11,
      "train_tokens_per_second": 38762.215
    }
  ],
  "logging_steps": 500,
  "max_steps": 27270,
  "num_input_tokens_seen": 95128823,
  "num_train_epochs": 10,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.4538173670639616e+16,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}