| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 5.254170497832654, | |
| "eval_steps": 500, | |
| "global_step": 10000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.052541704978326546, | |
| "grad_norm": 0.9754809141159058, | |
| "learning_rate": 5e-06, | |
| "loss": 1.2583, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.10508340995665309, | |
| "grad_norm": 0.6179457902908325, | |
| "learning_rate": 1e-05, | |
| "loss": 1.0217, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.15762511493497963, | |
| "grad_norm": 0.6895671486854553, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8382, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.21016681991330619, | |
| "grad_norm": 1.0476884841918945, | |
| "learning_rate": 2e-05, | |
| "loss": 0.7851, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.2627085248916327, | |
| "grad_norm": 0.8948929905891418, | |
| "learning_rate": 2.5e-05, | |
| "loss": 0.7133, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.2627085248916327, | |
| "eval_loss": 0.6265314221382141, | |
| "eval_runtime": 1.2508, | |
| "eval_samples_per_second": 7.995, | |
| "eval_steps_per_second": 3.997, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.31525022986995926, | |
| "grad_norm": 1.1585056781768799, | |
| "learning_rate": 3e-05, | |
| "loss": 0.6821, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.3677919348482858, | |
| "grad_norm": 0.838334321975708, | |
| "learning_rate": 3.5e-05, | |
| "loss": 0.6574, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.42033363982661237, | |
| "grad_norm": 1.056951642036438, | |
| "learning_rate": 4e-05, | |
| "loss": 0.6456, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.4728753448049389, | |
| "grad_norm": 1.043925404548645, | |
| "learning_rate": 4.5e-05, | |
| "loss": 0.6396, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.5254170497832654, | |
| "grad_norm": 1.2812925577163696, | |
| "learning_rate": 5e-05, | |
| "loss": 0.6351, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.5254170497832654, | |
| "eval_loss": 0.5569401979446411, | |
| "eval_runtime": 1.2148, | |
| "eval_samples_per_second": 8.232, | |
| "eval_steps_per_second": 4.116, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.577958754761592, | |
| "grad_norm": 1.3376405239105225, | |
| "learning_rate": 4.9444444444444446e-05, | |
| "loss": 0.6051, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.6305004597399185, | |
| "grad_norm": 1.3639925718307495, | |
| "learning_rate": 4.888888888888889e-05, | |
| "loss": 0.6182, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.6830421647182451, | |
| "grad_norm": 1.4436575174331665, | |
| "learning_rate": 4.8333333333333334e-05, | |
| "loss": 0.6145, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.7355838696965716, | |
| "grad_norm": 2.0778963565826416, | |
| "learning_rate": 4.7777777777777784e-05, | |
| "loss": 0.5879, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.7881255746748982, | |
| "grad_norm": 1.6049619913101196, | |
| "learning_rate": 4.722222222222222e-05, | |
| "loss": 0.5851, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.7881255746748982, | |
| "eval_loss": 0.5357321500778198, | |
| "eval_runtime": 1.2208, | |
| "eval_samples_per_second": 8.192, | |
| "eval_steps_per_second": 4.096, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.8406672796532247, | |
| "grad_norm": 1.5120118856430054, | |
| "learning_rate": 4.666666666666667e-05, | |
| "loss": 0.5693, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.8932089846315513, | |
| "grad_norm": 3.5921452045440674, | |
| "learning_rate": 4.6111111111111115e-05, | |
| "loss": 0.579, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.9457506896098778, | |
| "grad_norm": 0.9748013615608215, | |
| "learning_rate": 4.555555555555556e-05, | |
| "loss": 0.5686, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.9982923945882044, | |
| "grad_norm": 0.8585628271102905, | |
| "learning_rate": 4.5e-05, | |
| "loss": 0.5688, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.0508340995665308, | |
| "grad_norm": 3.2674453258514404, | |
| "learning_rate": 4.4444444444444447e-05, | |
| "loss": 0.5521, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.0508340995665308, | |
| "eval_loss": 0.5279057621955872, | |
| "eval_runtime": 1.2064, | |
| "eval_samples_per_second": 8.289, | |
| "eval_steps_per_second": 4.145, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.1033758045448574, | |
| "grad_norm": 1.5048810243606567, | |
| "learning_rate": 4.388888888888889e-05, | |
| "loss": 0.5315, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 1.155917509523184, | |
| "grad_norm": 1.1342097520828247, | |
| "learning_rate": 4.3333333333333334e-05, | |
| "loss": 0.53, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.2084592145015105, | |
| "grad_norm": 1.418641448020935, | |
| "learning_rate": 4.277777777777778e-05, | |
| "loss": 0.5345, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 1.261000919479837, | |
| "grad_norm": 1.1525490283966064, | |
| "learning_rate": 4.222222222222222e-05, | |
| "loss": 0.5364, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 1.3135426244581636, | |
| "grad_norm": 1.1001209020614624, | |
| "learning_rate": 4.166666666666667e-05, | |
| "loss": 0.5365, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 1.3135426244581636, | |
| "eval_loss": 0.5235944986343384, | |
| "eval_runtime": 1.2217, | |
| "eval_samples_per_second": 8.186, | |
| "eval_steps_per_second": 4.093, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 1.3660843294364902, | |
| "grad_norm": 1.8745806217193604, | |
| "learning_rate": 4.111111111111111e-05, | |
| "loss": 0.5228, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 1.4186260344148167, | |
| "grad_norm": 1.1493146419525146, | |
| "learning_rate": 4.055555555555556e-05, | |
| "loss": 0.5244, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 1.4711677393931433, | |
| "grad_norm": 2.0443015098571777, | |
| "learning_rate": 4e-05, | |
| "loss": 0.5321, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 1.5237094443714698, | |
| "grad_norm": 2.053090810775757, | |
| "learning_rate": 3.944444444444445e-05, | |
| "loss": 0.5221, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 1.5762511493497964, | |
| "grad_norm": 2.0530471801757812, | |
| "learning_rate": 3.888888888888889e-05, | |
| "loss": 0.5192, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.5762511493497964, | |
| "eval_loss": 0.5113556981086731, | |
| "eval_runtime": 1.221, | |
| "eval_samples_per_second": 8.19, | |
| "eval_steps_per_second": 4.095, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.628792854328123, | |
| "grad_norm": 1.469416856765747, | |
| "learning_rate": 3.8333333333333334e-05, | |
| "loss": 0.5312, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 1.6813345593064495, | |
| "grad_norm": 2.854994535446167, | |
| "learning_rate": 3.777777777777778e-05, | |
| "loss": 0.5223, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.733876264284776, | |
| "grad_norm": 1.2286680936813354, | |
| "learning_rate": 3.722222222222222e-05, | |
| "loss": 0.5235, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 1.7864179692631026, | |
| "grad_norm": 1.3007181882858276, | |
| "learning_rate": 3.6666666666666666e-05, | |
| "loss": 0.5086, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.8389596742414291, | |
| "grad_norm": 2.007951498031616, | |
| "learning_rate": 3.611111111111111e-05, | |
| "loss": 0.5241, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.8389596742414291, | |
| "eval_loss": 0.5189292430877686, | |
| "eval_runtime": 1.2074, | |
| "eval_samples_per_second": 8.282, | |
| "eval_steps_per_second": 4.141, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.8915013792197557, | |
| "grad_norm": 1.0643136501312256, | |
| "learning_rate": 3.555555555555556e-05, | |
| "loss": 0.504, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.9440430841980822, | |
| "grad_norm": 1.4835437536239624, | |
| "learning_rate": 3.5e-05, | |
| "loss": 0.5161, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.9965847891764088, | |
| "grad_norm": 1.4298534393310547, | |
| "learning_rate": 3.444444444444445e-05, | |
| "loss": 0.5095, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 2.049126494154735, | |
| "grad_norm": 1.0783276557922363, | |
| "learning_rate": 3.388888888888889e-05, | |
| "loss": 0.4978, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 2.1016681991330617, | |
| "grad_norm": 2.3203086853027344, | |
| "learning_rate": 3.3333333333333335e-05, | |
| "loss": 0.4854, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 2.1016681991330617, | |
| "eval_loss": 0.5047374963760376, | |
| "eval_runtime": 1.2234, | |
| "eval_samples_per_second": 8.174, | |
| "eval_steps_per_second": 4.087, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 2.1542099041113882, | |
| "grad_norm": 1.2583634853363037, | |
| "learning_rate": 3.277777777777778e-05, | |
| "loss": 0.4934, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 2.206751609089715, | |
| "grad_norm": 2.2736079692840576, | |
| "learning_rate": 3.222222222222223e-05, | |
| "loss": 0.4895, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 2.2592933140680413, | |
| "grad_norm": 2.931607484817505, | |
| "learning_rate": 3.1666666666666666e-05, | |
| "loss": 0.489, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 2.311835019046368, | |
| "grad_norm": 1.2253628969192505, | |
| "learning_rate": 3.111111111111111e-05, | |
| "loss": 0.4868, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 2.3643767240246945, | |
| "grad_norm": 1.2433133125305176, | |
| "learning_rate": 3.055555555555556e-05, | |
| "loss": 0.4926, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 2.3643767240246945, | |
| "eval_loss": 0.5110575556755066, | |
| "eval_runtime": 1.2952, | |
| "eval_samples_per_second": 7.721, | |
| "eval_steps_per_second": 3.86, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 2.416918429003021, | |
| "grad_norm": 1.1074525117874146, | |
| "learning_rate": 3e-05, | |
| "loss": 0.4918, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 2.4694601339813476, | |
| "grad_norm": 1.3761866092681885, | |
| "learning_rate": 2.9444444444444448e-05, | |
| "loss": 0.4915, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 2.522001838959674, | |
| "grad_norm": 2.135338068008423, | |
| "learning_rate": 2.8888888888888888e-05, | |
| "loss": 0.483, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 2.5745435439380007, | |
| "grad_norm": 1.0095371007919312, | |
| "learning_rate": 2.8333333333333335e-05, | |
| "loss": 0.4918, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 2.627085248916327, | |
| "grad_norm": 0.9606950879096985, | |
| "learning_rate": 2.777777777777778e-05, | |
| "loss": 0.486, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 2.627085248916327, | |
| "eval_loss": 0.5079401731491089, | |
| "eval_runtime": 1.2665, | |
| "eval_samples_per_second": 7.896, | |
| "eval_steps_per_second": 3.948, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 2.6796269538946538, | |
| "grad_norm": 1.0660576820373535, | |
| "learning_rate": 2.7222222222222223e-05, | |
| "loss": 0.4922, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 2.7321686588729803, | |
| "grad_norm": 2.1245779991149902, | |
| "learning_rate": 2.6666666666666667e-05, | |
| "loss": 0.4933, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 2.784710363851307, | |
| "grad_norm": 1.7832878828048706, | |
| "learning_rate": 2.6111111111111114e-05, | |
| "loss": 0.4794, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 2.8372520688296334, | |
| "grad_norm": 2.381094455718994, | |
| "learning_rate": 2.5555555555555554e-05, | |
| "loss": 0.4852, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 2.88979377380796, | |
| "grad_norm": 1.01780104637146, | |
| "learning_rate": 2.5e-05, | |
| "loss": 0.4775, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 2.88979377380796, | |
| "eval_loss": 0.49784645438194275, | |
| "eval_runtime": 1.2033, | |
| "eval_samples_per_second": 8.31, | |
| "eval_steps_per_second": 4.155, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 2.9423354787862865, | |
| "grad_norm": 2.5046234130859375, | |
| "learning_rate": 2.4444444444444445e-05, | |
| "loss": 0.4786, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 2.994877183764613, | |
| "grad_norm": 2.6457679271698, | |
| "learning_rate": 2.3888888888888892e-05, | |
| "loss": 0.479, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 3.0474188887429396, | |
| "grad_norm": 1.8499191999435425, | |
| "learning_rate": 2.3333333333333336e-05, | |
| "loss": 0.4741, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 3.099960593721266, | |
| "grad_norm": 1.0821927785873413, | |
| "learning_rate": 2.277777777777778e-05, | |
| "loss": 0.4647, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 3.1525022986995928, | |
| "grad_norm": 1.5793670415878296, | |
| "learning_rate": 2.2222222222222223e-05, | |
| "loss": 0.483, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 3.1525022986995928, | |
| "eval_loss": 0.49618691205978394, | |
| "eval_runtime": 1.2172, | |
| "eval_samples_per_second": 8.216, | |
| "eval_steps_per_second": 4.108, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 3.2050440036779193, | |
| "grad_norm": 1.7674349546432495, | |
| "learning_rate": 2.1666666666666667e-05, | |
| "loss": 0.4682, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 3.257585708656246, | |
| "grad_norm": 1.8406447172164917, | |
| "learning_rate": 2.111111111111111e-05, | |
| "loss": 0.4658, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 3.3101274136345724, | |
| "grad_norm": 1.0926892757415771, | |
| "learning_rate": 2.0555555555555555e-05, | |
| "loss": 0.4676, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 3.362669118612899, | |
| "grad_norm": 2.300992488861084, | |
| "learning_rate": 2e-05, | |
| "loss": 0.4595, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 3.4152108235912255, | |
| "grad_norm": 1.148647665977478, | |
| "learning_rate": 1.9444444444444445e-05, | |
| "loss": 0.4674, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 3.4152108235912255, | |
| "eval_loss": 0.4925019145011902, | |
| "eval_runtime": 1.2002, | |
| "eval_samples_per_second": 8.332, | |
| "eval_steps_per_second": 4.166, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 3.467752528569552, | |
| "grad_norm": 1.1826932430267334, | |
| "learning_rate": 1.888888888888889e-05, | |
| "loss": 0.4563, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 3.5202942335478786, | |
| "grad_norm": 1.0678240060806274, | |
| "learning_rate": 1.8333333333333333e-05, | |
| "loss": 0.47, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 3.572835938526205, | |
| "grad_norm": 2.3626327514648438, | |
| "learning_rate": 1.777777777777778e-05, | |
| "loss": 0.4563, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 3.6253776435045317, | |
| "grad_norm": 1.1958198547363281, | |
| "learning_rate": 1.7222222222222224e-05, | |
| "loss": 0.4621, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 3.6779193484828583, | |
| "grad_norm": 1.61029052734375, | |
| "learning_rate": 1.6666666666666667e-05, | |
| "loss": 0.4629, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 3.6779193484828583, | |
| "eval_loss": 0.4930470585823059, | |
| "eval_runtime": 1.2074, | |
| "eval_samples_per_second": 8.282, | |
| "eval_steps_per_second": 4.141, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 3.730461053461185, | |
| "grad_norm": 1.0514099597930908, | |
| "learning_rate": 1.6111111111111115e-05, | |
| "loss": 0.467, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 3.7830027584395114, | |
| "grad_norm": 1.5292298793792725, | |
| "learning_rate": 1.5555555555555555e-05, | |
| "loss": 0.4657, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 3.835544463417838, | |
| "grad_norm": 1.8738545179367065, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.4741, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 3.8880861683961645, | |
| "grad_norm": 1.5150409936904907, | |
| "learning_rate": 1.4444444444444444e-05, | |
| "loss": 0.4598, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 3.940627873374491, | |
| "grad_norm": 1.3109909296035767, | |
| "learning_rate": 1.388888888888889e-05, | |
| "loss": 0.4682, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 3.940627873374491, | |
| "eval_loss": 0.49467793107032776, | |
| "eval_runtime": 1.241, | |
| "eval_samples_per_second": 8.058, | |
| "eval_steps_per_second": 4.029, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 3.9931695783528176, | |
| "grad_norm": 1.4036318063735962, | |
| "learning_rate": 1.3333333333333333e-05, | |
| "loss": 0.4652, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 4.045711283331144, | |
| "grad_norm": 1.308784008026123, | |
| "learning_rate": 1.2777777777777777e-05, | |
| "loss": 0.4535, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 4.09825298830947, | |
| "grad_norm": 1.1499841213226318, | |
| "learning_rate": 1.2222222222222222e-05, | |
| "loss": 0.4554, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 4.150794693287797, | |
| "grad_norm": 1.1128175258636475, | |
| "learning_rate": 1.1666666666666668e-05, | |
| "loss": 0.4574, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 4.203336398266123, | |
| "grad_norm": 1.2397664785385132, | |
| "learning_rate": 1.1111111111111112e-05, | |
| "loss": 0.4456, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 4.203336398266123, | |
| "eval_loss": 0.49332427978515625, | |
| "eval_runtime": 1.2363, | |
| "eval_samples_per_second": 8.089, | |
| "eval_steps_per_second": 4.044, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 4.25587810324445, | |
| "grad_norm": 1.3388928174972534, | |
| "learning_rate": 1.0555555555555555e-05, | |
| "loss": 0.4619, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 4.3084198082227765, | |
| "grad_norm": 0.97515869140625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4623, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 4.3609615132011035, | |
| "grad_norm": 1.0904083251953125, | |
| "learning_rate": 9.444444444444445e-06, | |
| "loss": 0.4456, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 4.41350321817943, | |
| "grad_norm": 1.3429654836654663, | |
| "learning_rate": 8.88888888888889e-06, | |
| "loss": 0.4501, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 4.466044923157757, | |
| "grad_norm": 1.919039249420166, | |
| "learning_rate": 8.333333333333334e-06, | |
| "loss": 0.4486, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 4.466044923157757, | |
| "eval_loss": 0.4914402365684509, | |
| "eval_runtime": 1.2335, | |
| "eval_samples_per_second": 8.107, | |
| "eval_steps_per_second": 4.053, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 4.518586628136083, | |
| "grad_norm": 2.0423989295959473, | |
| "learning_rate": 7.777777777777777e-06, | |
| "loss": 0.4571, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 4.57112833311441, | |
| "grad_norm": 1.3525835275650024, | |
| "learning_rate": 7.222222222222222e-06, | |
| "loss": 0.46, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 4.623670038092736, | |
| "grad_norm": 1.1251217126846313, | |
| "learning_rate": 6.666666666666667e-06, | |
| "loss": 0.4522, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 4.676211743071063, | |
| "grad_norm": 1.8645511865615845, | |
| "learning_rate": 6.111111111111111e-06, | |
| "loss": 0.4605, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 4.728753448049389, | |
| "grad_norm": 1.1539450883865356, | |
| "learning_rate": 5.555555555555556e-06, | |
| "loss": 0.4544, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 4.728753448049389, | |
| "eval_loss": 0.490975946187973, | |
| "eval_runtime": 1.2066, | |
| "eval_samples_per_second": 8.288, | |
| "eval_steps_per_second": 4.144, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 4.781295153027716, | |
| "grad_norm": 1.1662914752960205, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4521, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 4.833836858006042, | |
| "grad_norm": 0.9396342635154724, | |
| "learning_rate": 4.444444444444445e-06, | |
| "loss": 0.4643, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 4.886378562984369, | |
| "grad_norm": 2.0316038131713867, | |
| "learning_rate": 3.888888888888889e-06, | |
| "loss": 0.4632, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 4.938920267962695, | |
| "grad_norm": 1.1537178754806519, | |
| "learning_rate": 3.3333333333333333e-06, | |
| "loss": 0.442, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 4.991461972941022, | |
| "grad_norm": 1.3441674709320068, | |
| "learning_rate": 2.777777777777778e-06, | |
| "loss": 0.4531, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 4.991461972941022, | |
| "eval_loss": 0.4903333783149719, | |
| "eval_runtime": 1.2203, | |
| "eval_samples_per_second": 8.194, | |
| "eval_steps_per_second": 4.097, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 5.044003677919348, | |
| "grad_norm": 1.1727863550186157, | |
| "learning_rate": 2.2222222222222225e-06, | |
| "loss": 0.4564, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 5.096545382897675, | |
| "grad_norm": 1.2154302597045898, | |
| "learning_rate": 1.6666666666666667e-06, | |
| "loss": 0.4369, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 5.149087087876001, | |
| "grad_norm": 0.9445755481719971, | |
| "learning_rate": 1.1111111111111112e-06, | |
| "loss": 0.4377, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 5.201628792854328, | |
| "grad_norm": 1.6093214750289917, | |
| "learning_rate": 5.555555555555556e-07, | |
| "loss": 0.4503, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 5.254170497832654, | |
| "grad_norm": 1.092114806175232, | |
| "learning_rate": 0.0, | |
| "loss": 0.4549, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 5.254170497832654, | |
| "eval_loss": 0.4912105202674866, | |
| "eval_runtime": 1.2188, | |
| "eval_samples_per_second": 8.205, | |
| "eval_steps_per_second": 4.102, | |
| "step": 10000 | |
| } | |
| ], | |
| "logging_steps": 100, | |
| "max_steps": 10000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 6, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.7932422620461716e+18, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |