{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 2950, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03389830508474576, "grad_norm": 7.922511100769043, "learning_rate": 6.081081081081082e-06, "loss": 1.3016, "step": 10 }, { "epoch": 0.06779661016949153, "grad_norm": 1.777317762374878, "learning_rate": 1.2837837837837838e-05, "loss": 0.6145, "step": 20 }, { "epoch": 0.1016949152542373, "grad_norm": 2.9913039207458496, "learning_rate": 1.9594594594594595e-05, "loss": 0.2696, "step": 30 }, { "epoch": 0.13559322033898305, "grad_norm": 1.0988717079162598, "learning_rate": 2.635135135135135e-05, "loss": 0.1885, "step": 40 }, { "epoch": 0.1694915254237288, "grad_norm": 1.2467349767684937, "learning_rate": 3.310810810810811e-05, "loss": 0.1516, "step": 50 }, { "epoch": 0.2033898305084746, "grad_norm": 1.4213334321975708, "learning_rate": 3.986486486486487e-05, "loss": 0.1406, "step": 60 }, { "epoch": 0.23728813559322035, "grad_norm": 1.0713083744049072, "learning_rate": 4.662162162162162e-05, "loss": 0.1316, "step": 70 }, { "epoch": 0.2711864406779661, "grad_norm": 1.506921410560608, "learning_rate": 5.337837837837838e-05, "loss": 0.1413, "step": 80 }, { "epoch": 0.3050847457627119, "grad_norm": 0.9623873233795166, "learning_rate": 6.013513513513514e-05, "loss": 0.1204, "step": 90 }, { "epoch": 0.3389830508474576, "grad_norm": 1.5513378381729126, "learning_rate": 6.68918918918919e-05, "loss": 0.1194, "step": 100 }, { "epoch": 0.3728813559322034, "grad_norm": 1.0387235879898071, "learning_rate": 7.364864864864865e-05, "loss": 0.1111, "step": 110 }, { "epoch": 0.4067796610169492, "grad_norm": 1.3981765508651733, "learning_rate": 8.040540540540541e-05, "loss": 0.119, "step": 120 }, { "epoch": 0.4406779661016949, "grad_norm": 1.1575907468795776, "learning_rate": 8.716216216216216e-05, "loss": 0.0932, "step": 130 }, { "epoch": 0.4745762711864407, "grad_norm": 0.8324375748634338, "learning_rate": 9.391891891891892e-05, "loss": 0.0997, "step": 140 }, { "epoch": 0.5084745762711864, "grad_norm": 1.0497596263885498, "learning_rate": 9.99999685729623e-05, "loss": 0.0795, "step": 150 }, { "epoch": 0.5423728813559322, "grad_norm": 0.9868571758270264, "learning_rate": 9.999619737623951e-05, "loss": 0.0759, "step": 160 }, { "epoch": 0.576271186440678, "grad_norm": 0.6950494050979614, "learning_rate": 9.99861413151757e-05, "loss": 0.0647, "step": 170 }, { "epoch": 0.6101694915254238, "grad_norm": 0.9549587965011597, "learning_rate": 9.99698016538866e-05, "loss": 0.0708, "step": 180 }, { "epoch": 0.6440677966101694, "grad_norm": 0.6263039112091064, "learning_rate": 9.994718044637956e-05, "loss": 0.0711, "step": 190 }, { "epoch": 0.6779661016949152, "grad_norm": 0.6637316346168518, "learning_rate": 9.991828053629519e-05, "loss": 0.0635, "step": 200 }, { "epoch": 0.711864406779661, "grad_norm": 0.8776123523712158, "learning_rate": 9.988310555655009e-05, "loss": 0.0609, "step": 210 }, { "epoch": 0.7457627118644068, "grad_norm": 0.9095442891120911, "learning_rate": 9.984165992888009e-05, "loss": 0.0532, "step": 220 }, { "epoch": 0.7796610169491526, "grad_norm": 0.8120779395103455, "learning_rate": 9.979394886328434e-05, "loss": 0.0522, "step": 230 }, { "epoch": 0.8135593220338984, "grad_norm": 0.7972075939178467, "learning_rate": 9.973997835737049e-05, "loss": 0.0564, "step": 240 }, { "epoch": 0.847457627118644, "grad_norm": 0.8452187776565552, "learning_rate": 9.967975519560067e-05, "loss": 0.0522, "step": 250 }, { "epoch": 0.8813559322033898, "grad_norm": 0.8456110954284668, "learning_rate": 9.961328694843865e-05, "loss": 0.049, "step": 260 }, { "epoch": 0.9152542372881356, "grad_norm": 0.47546839714050293, "learning_rate": 9.954058197139824e-05, "loss": 0.0503, "step": 270 }, { "epoch": 0.9491525423728814, "grad_norm": 0.7819595336914062, "learning_rate": 9.946164940399288e-05, "loss": 0.0515, "step": 280 }, { "epoch": 0.9830508474576272, "grad_norm": 0.7775952219963074, "learning_rate": 9.937649916858673e-05, "loss": 0.05, "step": 290 }, { "epoch": 1.0169491525423728, "grad_norm": 0.7178511619567871, "learning_rate": 9.928514196914744e-05, "loss": 0.0529, "step": 300 }, { "epoch": 1.0508474576271187, "grad_norm": 0.6638413667678833, "learning_rate": 9.91875892899005e-05, "loss": 0.053, "step": 310 }, { "epoch": 1.0847457627118644, "grad_norm": 0.6948956251144409, "learning_rate": 9.908385339388564e-05, "loss": 0.0484, "step": 320 }, { "epoch": 1.11864406779661, "grad_norm": 0.47976964712142944, "learning_rate": 9.897394732141531e-05, "loss": 0.0477, "step": 330 }, { "epoch": 1.152542372881356, "grad_norm": 0.786380410194397, "learning_rate": 9.885788488843535e-05, "loss": 0.0464, "step": 340 }, { "epoch": 1.1864406779661016, "grad_norm": 0.7638571858406067, "learning_rate": 9.873568068478826e-05, "loss": 0.0493, "step": 350 }, { "epoch": 1.2203389830508475, "grad_norm": 0.8229843974113464, "learning_rate": 9.860735007237922e-05, "loss": 0.0488, "step": 360 }, { "epoch": 1.2542372881355932, "grad_norm": 0.6507530808448792, "learning_rate": 9.847290918324487e-05, "loss": 0.0442, "step": 370 }, { "epoch": 1.288135593220339, "grad_norm": 0.6550291180610657, "learning_rate": 9.833237491752557e-05, "loss": 0.0422, "step": 380 }, { "epoch": 1.3220338983050848, "grad_norm": 0.4493582844734192, "learning_rate": 9.818576494134071e-05, "loss": 0.0436, "step": 390 }, { "epoch": 1.3559322033898304, "grad_norm": 0.4366756081581116, "learning_rate": 9.803309768456819e-05, "loss": 0.0483, "step": 400 }, { "epoch": 1.3898305084745763, "grad_norm": 0.518522322177887, "learning_rate": 9.787439233852752e-05, "loss": 0.04, "step": 410 }, { "epoch": 1.423728813559322, "grad_norm": 0.5709708333015442, "learning_rate": 9.770966885356736e-05, "loss": 0.0389, "step": 420 }, { "epoch": 1.457627118644068, "grad_norm": 0.44815006852149963, "learning_rate": 9.753894793655767e-05, "loss": 0.0414, "step": 430 }, { "epoch": 1.4915254237288136, "grad_norm": 0.6205833554267883, "learning_rate": 9.736225104828666e-05, "loss": 0.0488, "step": 440 }, { "epoch": 1.5254237288135593, "grad_norm": 0.38849759101867676, "learning_rate": 9.717960040076305e-05, "loss": 0.0419, "step": 450 }, { "epoch": 1.559322033898305, "grad_norm": 0.3880470395088196, "learning_rate": 9.699101895442388e-05, "loss": 0.0438, "step": 460 }, { "epoch": 1.5932203389830508, "grad_norm": 0.7112048268318176, "learning_rate": 9.679653041524818e-05, "loss": 0.0424, "step": 470 }, { "epoch": 1.6271186440677967, "grad_norm": 0.8996948003768921, "learning_rate": 9.659615923177702e-05, "loss": 0.0441, "step": 480 }, { "epoch": 1.6610169491525424, "grad_norm": 0.6175360083580017, "learning_rate": 9.638993059204016e-05, "loss": 0.0429, "step": 490 }, { "epoch": 1.694915254237288, "grad_norm": 0.39414605498313904, "learning_rate": 9.617787042038964e-05, "loss": 0.0377, "step": 500 }, { "epoch": 1.7288135593220337, "grad_norm": 0.5390213131904602, "learning_rate": 9.596000537424107e-05, "loss": 0.0407, "step": 510 }, { "epoch": 1.7627118644067796, "grad_norm": 0.4397003948688507, "learning_rate": 9.573636284072252e-05, "loss": 0.0386, "step": 520 }, { "epoch": 1.7966101694915255, "grad_norm": 0.6200686097145081, "learning_rate": 9.550697093323173e-05, "loss": 0.0392, "step": 530 }, { "epoch": 1.8305084745762712, "grad_norm": 0.5365989804267883, "learning_rate": 9.527185848790216e-05, "loss": 0.0355, "step": 540 }, { "epoch": 1.8644067796610169, "grad_norm": 0.4744288921356201, "learning_rate": 9.503105505997805e-05, "loss": 0.0387, "step": 550 }, { "epoch": 1.8983050847457628, "grad_norm": 0.5821052193641663, "learning_rate": 9.478459092009909e-05, "loss": 0.0392, "step": 560 }, { "epoch": 1.9322033898305084, "grad_norm": 0.5957028269767761, "learning_rate": 9.453249705049523e-05, "loss": 0.0392, "step": 570 }, { "epoch": 1.9661016949152543, "grad_norm": 0.54462730884552, "learning_rate": 9.427480514109202e-05, "loss": 0.0355, "step": 580 }, { "epoch": 2.0, "grad_norm": 1.2646660804748535, "learning_rate": 9.401154758552691e-05, "loss": 0.045, "step": 590 }, { "epoch": 2.0338983050847457, "grad_norm": 0.4895438551902771, "learning_rate": 9.37427574770772e-05, "loss": 0.0394, "step": 600 }, { "epoch": 2.0677966101694913, "grad_norm": 0.7774738669395447, "learning_rate": 9.346846860449996e-05, "loss": 0.0412, "step": 610 }, { "epoch": 2.1016949152542375, "grad_norm": 0.7907915115356445, "learning_rate": 9.318871544778457e-05, "loss": 0.0434, "step": 620 }, { "epoch": 2.135593220338983, "grad_norm": 0.5634192228317261, "learning_rate": 9.290353317381834e-05, "loss": 0.0445, "step": 630 }, { "epoch": 2.169491525423729, "grad_norm": 0.524152398109436, "learning_rate": 9.261295763196579e-05, "loss": 0.0347, "step": 640 }, { "epoch": 2.2033898305084745, "grad_norm": 0.48940563201904297, "learning_rate": 9.231702534956213e-05, "loss": 0.0387, "step": 650 }, { "epoch": 2.23728813559322, "grad_norm": 0.4866064488887787, "learning_rate": 9.201577352732157e-05, "loss": 0.0386, "step": 660 }, { "epoch": 2.2711864406779663, "grad_norm": 0.379842609167099, "learning_rate": 9.170924003466086e-05, "loss": 0.0341, "step": 670 }, { "epoch": 2.305084745762712, "grad_norm": 0.41267237067222595, "learning_rate": 9.13974634049389e-05, "loss": 0.0379, "step": 680 }, { "epoch": 2.3389830508474576, "grad_norm": 0.6153244376182556, "learning_rate": 9.108048283061283e-05, "loss": 0.0331, "step": 690 }, { "epoch": 2.3728813559322033, "grad_norm": 0.48824194073677063, "learning_rate": 9.075833815831128e-05, "loss": 0.0448, "step": 700 }, { "epoch": 2.406779661016949, "grad_norm": 0.47475725412368774, "learning_rate": 9.043106988382526e-05, "loss": 0.0373, "step": 710 }, { "epoch": 2.440677966101695, "grad_norm": 0.44530612230300903, "learning_rate": 9.009871914701779e-05, "loss": 0.036, "step": 720 }, { "epoch": 2.4745762711864407, "grad_norm": 0.704857349395752, "learning_rate": 8.976132772665211e-05, "loss": 0.0405, "step": 730 }, { "epoch": 2.5084745762711864, "grad_norm": 0.5021880269050598, "learning_rate": 8.941893803513994e-05, "loss": 0.039, "step": 740 }, { "epoch": 2.542372881355932, "grad_norm": 0.43946319818496704, "learning_rate": 8.907159311320985e-05, "loss": 0.0377, "step": 750 }, { "epoch": 2.576271186440678, "grad_norm": 0.4081028997898102, "learning_rate": 8.87193366244969e-05, "loss": 0.0337, "step": 760 }, { "epoch": 2.610169491525424, "grad_norm": 0.42831340432167053, "learning_rate": 8.836221285005364e-05, "loss": 0.0331, "step": 770 }, { "epoch": 2.6440677966101696, "grad_norm": 0.5115790963172913, "learning_rate": 8.800026668278378e-05, "loss": 0.0383, "step": 780 }, { "epoch": 2.6779661016949152, "grad_norm": 0.7489039897918701, "learning_rate": 8.763354362179887e-05, "loss": 0.037, "step": 790 }, { "epoch": 2.711864406779661, "grad_norm": 0.47204285860061646, "learning_rate": 8.726208976669868e-05, "loss": 0.0366, "step": 800 }, { "epoch": 2.7457627118644066, "grad_norm": 0.3575058579444885, "learning_rate": 8.688595181177617e-05, "loss": 0.03, "step": 810 }, { "epoch": 2.7796610169491527, "grad_norm": 0.3760571777820587, "learning_rate": 8.650517704014773e-05, "loss": 0.0376, "step": 820 }, { "epoch": 2.8135593220338984, "grad_norm": 0.3222888708114624, "learning_rate": 8.611981331780942e-05, "loss": 0.0373, "step": 830 }, { "epoch": 2.847457627118644, "grad_norm": 0.3633357286453247, "learning_rate": 8.572990908761974e-05, "loss": 0.0358, "step": 840 }, { "epoch": 2.8813559322033897, "grad_norm": 0.3399844765663147, "learning_rate": 8.533551336321015e-05, "loss": 0.0359, "step": 850 }, { "epoch": 2.915254237288136, "grad_norm": 0.7424901127815247, "learning_rate": 8.493667572282371e-05, "loss": 0.0338, "step": 860 }, { "epoch": 2.9491525423728815, "grad_norm": 0.4179316759109497, "learning_rate": 8.453344630308269e-05, "loss": 0.0358, "step": 870 }, { "epoch": 2.983050847457627, "grad_norm": 0.4843442738056183, "learning_rate": 8.412587579268613e-05, "loss": 0.0353, "step": 880 }, { "epoch": 3.016949152542373, "grad_norm": 0.5782058835029602, "learning_rate": 8.371401542603791e-05, "loss": 0.0373, "step": 890 }, { "epoch": 3.0508474576271185, "grad_norm": 0.5099455714225769, "learning_rate": 8.329791697680612e-05, "loss": 0.0336, "step": 900 }, { "epoch": 3.084745762711864, "grad_norm": 0.31090185046195984, "learning_rate": 8.287763275141496e-05, "loss": 0.0326, "step": 910 }, { "epoch": 3.1186440677966103, "grad_norm": 0.22995054721832275, "learning_rate": 8.245321558246928e-05, "loss": 0.0342, "step": 920 }, { "epoch": 3.152542372881356, "grad_norm": 0.46085304021835327, "learning_rate": 8.202471882211327e-05, "loss": 0.0314, "step": 930 }, { "epoch": 3.1864406779661016, "grad_norm": 0.47579386830329895, "learning_rate": 8.159219633532368e-05, "loss": 0.0335, "step": 940 }, { "epoch": 3.2203389830508473, "grad_norm": 0.3809256851673126, "learning_rate": 8.115570249313864e-05, "loss": 0.0322, "step": 950 }, { "epoch": 3.2542372881355934, "grad_norm": 0.38978174328804016, "learning_rate": 8.071529216582284e-05, "loss": 0.0313, "step": 960 }, { "epoch": 3.288135593220339, "grad_norm": 0.3661576211452484, "learning_rate": 8.027102071596999e-05, "loss": 0.0378, "step": 970 }, { "epoch": 3.3220338983050848, "grad_norm": 0.28337371349334717, "learning_rate": 7.982294399154331e-05, "loss": 0.0338, "step": 980 }, { "epoch": 3.3559322033898304, "grad_norm": 0.48508763313293457, "learning_rate": 7.93711183188551e-05, "loss": 0.0374, "step": 990 }, { "epoch": 3.389830508474576, "grad_norm": 0.4197925925254822, "learning_rate": 7.891560049548614e-05, "loss": 0.0336, "step": 1000 }, { "epoch": 3.423728813559322, "grad_norm": 0.4868261516094208, "learning_rate": 7.845644778314583e-05, "loss": 0.0396, "step": 1010 }, { "epoch": 3.457627118644068, "grad_norm": 0.264763206243515, "learning_rate": 7.799371790047395e-05, "loss": 0.0317, "step": 1020 }, { "epoch": 3.4915254237288136, "grad_norm": 0.2507479190826416, "learning_rate": 7.752746901578517e-05, "loss": 0.0308, "step": 1030 }, { "epoch": 3.5254237288135593, "grad_norm": 0.40973591804504395, "learning_rate": 7.705775973975675e-05, "loss": 0.035, "step": 1040 }, { "epoch": 3.559322033898305, "grad_norm": 0.34160512685775757, "learning_rate": 7.658464911806087e-05, "loss": 0.0287, "step": 1050 }, { "epoch": 3.593220338983051, "grad_norm": 0.3527885973453522, "learning_rate": 7.61081966239421e-05, "loss": 0.026, "step": 1060 }, { "epoch": 3.6271186440677967, "grad_norm": 0.4634409248828888, "learning_rate": 7.562846215074129e-05, "loss": 0.03, "step": 1070 }, { "epoch": 3.6610169491525424, "grad_norm": 0.3833574652671814, "learning_rate": 7.514550600436657e-05, "loss": 0.032, "step": 1080 }, { "epoch": 3.694915254237288, "grad_norm": 0.4073950946331024, "learning_rate": 7.465938889571232e-05, "loss": 0.0268, "step": 1090 }, { "epoch": 3.7288135593220337, "grad_norm": 0.590704083442688, "learning_rate": 7.417017193302769e-05, "loss": 0.0298, "step": 1100 }, { "epoch": 3.7627118644067794, "grad_norm": 0.3835419714450836, "learning_rate": 7.367791661423461e-05, "loss": 0.0305, "step": 1110 }, { "epoch": 3.7966101694915255, "grad_norm": 0.4307284951210022, "learning_rate": 7.318268481919716e-05, "loss": 0.0312, "step": 1120 }, { "epoch": 3.830508474576271, "grad_norm": 0.5295498371124268, "learning_rate": 7.268453880194292e-05, "loss": 0.0329, "step": 1130 }, { "epoch": 3.864406779661017, "grad_norm": 0.5060334801673889, "learning_rate": 7.218354118283715e-05, "loss": 0.0346, "step": 1140 }, { "epoch": 3.898305084745763, "grad_norm": 0.3661726415157318, "learning_rate": 7.167975494071093e-05, "loss": 0.034, "step": 1150 }, { "epoch": 3.9322033898305087, "grad_norm": 0.382707804441452, "learning_rate": 7.11732434049444e-05, "loss": 0.0314, "step": 1160 }, { "epoch": 3.9661016949152543, "grad_norm": 0.44602781534194946, "learning_rate": 7.066407024750578e-05, "loss": 0.0346, "step": 1170 }, { "epoch": 4.0, "grad_norm": 0.8751994371414185, "learning_rate": 7.015229947494734e-05, "loss": 0.0321, "step": 1180 }, { "epoch": 4.033898305084746, "grad_norm": 0.31756699085235596, "learning_rate": 6.963799542035932e-05, "loss": 0.0319, "step": 1190 }, { "epoch": 4.067796610169491, "grad_norm": 0.336243599653244, "learning_rate": 6.912122273528291e-05, "loss": 0.0296, "step": 1200 }, { "epoch": 4.101694915254237, "grad_norm": 0.3985385298728943, "learning_rate": 6.860204638158305e-05, "loss": 0.028, "step": 1210 }, { "epoch": 4.135593220338983, "grad_norm": 0.5406593084335327, "learning_rate": 6.808053162328227e-05, "loss": 0.034, "step": 1220 }, { "epoch": 4.169491525423728, "grad_norm": 0.3683408200740814, "learning_rate": 6.755674401835657e-05, "loss": 0.0312, "step": 1230 }, { "epoch": 4.203389830508475, "grad_norm": 0.5468569993972778, "learning_rate": 6.703074941049431e-05, "loss": 0.0286, "step": 1240 }, { "epoch": 4.237288135593221, "grad_norm": 0.5490782260894775, "learning_rate": 6.650261392081929e-05, "loss": 0.0322, "step": 1250 }, { "epoch": 4.271186440677966, "grad_norm": 0.2997434437274933, "learning_rate": 6.597240393957875e-05, "loss": 0.0299, "step": 1260 }, { "epoch": 4.305084745762712, "grad_norm": 0.43356242775917053, "learning_rate": 6.544018611779778e-05, "loss": 0.0331, "step": 1270 }, { "epoch": 4.338983050847458, "grad_norm": 0.33595308661460876, "learning_rate": 6.490602735890071e-05, "loss": 0.0275, "step": 1280 }, { "epoch": 4.372881355932203, "grad_norm": 0.4883277714252472, "learning_rate": 6.436999481030104e-05, "loss": 0.0321, "step": 1290 }, { "epoch": 4.406779661016949, "grad_norm": 0.25386956334114075, "learning_rate": 6.383215585496037e-05, "loss": 0.0322, "step": 1300 }, { "epoch": 4.440677966101695, "grad_norm": 0.30296579003334045, "learning_rate": 6.329257810291806e-05, "loss": 0.023, "step": 1310 }, { "epoch": 4.47457627118644, "grad_norm": 0.47214221954345703, "learning_rate": 6.275132938279213e-05, "loss": 0.0248, "step": 1320 }, { "epoch": 4.508474576271187, "grad_norm": 0.40692755579948425, "learning_rate": 6.220847773325268e-05, "loss": 0.0325, "step": 1330 }, { "epoch": 4.5423728813559325, "grad_norm": 0.26799410581588745, "learning_rate": 6.166409139446903e-05, "loss": 0.0305, "step": 1340 }, { "epoch": 4.576271186440678, "grad_norm": 0.2558389902114868, "learning_rate": 6.111823879953145e-05, "loss": 0.0265, "step": 1350 }, { "epoch": 4.610169491525424, "grad_norm": 0.28272953629493713, "learning_rate": 6.0570988565848685e-05, "loss": 0.025, "step": 1360 }, { "epoch": 4.6440677966101696, "grad_norm": 0.4986218214035034, "learning_rate": 6.002240948652218e-05, "loss": 0.0251, "step": 1370 }, { "epoch": 4.677966101694915, "grad_norm": 0.5608692169189453, "learning_rate": 5.94725705216984e-05, "loss": 0.0271, "step": 1380 }, { "epoch": 4.711864406779661, "grad_norm": 0.5304986834526062, "learning_rate": 5.8921540789900064e-05, "loss": 0.0268, "step": 1390 }, { "epoch": 4.745762711864407, "grad_norm": 0.45950591564178467, "learning_rate": 5.836938955933743e-05, "loss": 0.0272, "step": 1400 }, { "epoch": 4.779661016949152, "grad_norm": 0.31691136956214905, "learning_rate": 5.781618623920081e-05, "loss": 0.0312, "step": 1410 }, { "epoch": 4.813559322033898, "grad_norm": 0.33825352787971497, "learning_rate": 5.726200037093542e-05, "loss": 0.0289, "step": 1420 }, { "epoch": 4.847457627118644, "grad_norm": 0.31809505820274353, "learning_rate": 5.670690161949952e-05, "loss": 0.0269, "step": 1430 }, { "epoch": 4.88135593220339, "grad_norm": 0.27987194061279297, "learning_rate": 5.6150959764606944e-05, "loss": 0.026, "step": 1440 }, { "epoch": 4.915254237288136, "grad_norm": 0.33458834886550903, "learning_rate": 5.559424469195551e-05, "loss": 0.0256, "step": 1450 }, { "epoch": 4.9491525423728815, "grad_norm": 0.5636442303657532, "learning_rate": 5.503682638444173e-05, "loss": 0.0279, "step": 1460 }, { "epoch": 4.983050847457627, "grad_norm": 0.400234192609787, "learning_rate": 5.4478774913363596e-05, "loss": 0.0273, "step": 1470 }, { "epoch": 5.016949152542373, "grad_norm": 0.5593984127044678, "learning_rate": 5.392016042961205e-05, "loss": 0.0299, "step": 1480 }, { "epoch": 5.0508474576271185, "grad_norm": 0.42886337637901306, "learning_rate": 5.336105315485264e-05, "loss": 0.0282, "step": 1490 }, { "epoch": 5.084745762711864, "grad_norm": 0.4065008759498596, "learning_rate": 5.280152337269807e-05, "loss": 0.0282, "step": 1500 }, { "epoch": 5.11864406779661, "grad_norm": 0.46473270654678345, "learning_rate": 5.224164141987313e-05, "loss": 0.027, "step": 1510 }, { "epoch": 5.1525423728813555, "grad_norm": 0.35750046372413635, "learning_rate": 5.168147767737289e-05, "loss": 0.0288, "step": 1520 }, { "epoch": 5.186440677966102, "grad_norm": 0.3990558385848999, "learning_rate": 5.1121102561615295e-05, "loss": 0.0294, "step": 1530 }, { "epoch": 5.220338983050848, "grad_norm": 0.32793498039245605, "learning_rate": 5.056058651558936e-05, "loss": 0.028, "step": 1540 }, { "epoch": 5.254237288135593, "grad_norm": 0.6837785243988037, "learning_rate": 5e-05, "loss": 0.0256, "step": 1550 }, { "epoch": 5.288135593220339, "grad_norm": 0.49780961871147156, "learning_rate": 4.943941348441065e-05, "loss": 0.0258, "step": 1560 }, { "epoch": 5.322033898305085, "grad_norm": 0.529035210609436, "learning_rate": 4.8878897438384716e-05, "loss": 0.0281, "step": 1570 }, { "epoch": 5.3559322033898304, "grad_norm": 0.27274975180625916, "learning_rate": 4.831852232262713e-05, "loss": 0.0233, "step": 1580 }, { "epoch": 5.389830508474576, "grad_norm": 0.5764862895011902, "learning_rate": 4.775835858012689e-05, "loss": 0.0235, "step": 1590 }, { "epoch": 5.423728813559322, "grad_norm": 0.3733164370059967, "learning_rate": 4.7198476627301955e-05, "loss": 0.0269, "step": 1600 }, { "epoch": 5.4576271186440675, "grad_norm": 0.4069574177265167, "learning_rate": 4.663894684514737e-05, "loss": 0.0267, "step": 1610 }, { "epoch": 5.491525423728813, "grad_norm": 0.37277278304100037, "learning_rate": 4.6079839570387954e-05, "loss": 0.0239, "step": 1620 }, { "epoch": 5.52542372881356, "grad_norm": 0.43740156292915344, "learning_rate": 4.552122508663641e-05, "loss": 0.0218, "step": 1630 }, { "epoch": 5.559322033898305, "grad_norm": 0.4995250105857849, "learning_rate": 4.496317361555828e-05, "loss": 0.0275, "step": 1640 }, { "epoch": 5.593220338983051, "grad_norm": 0.3874273896217346, "learning_rate": 4.44057553080445e-05, "loss": 0.0218, "step": 1650 }, { "epoch": 5.627118644067797, "grad_norm": 0.3819812536239624, "learning_rate": 4.384904023539306e-05, "loss": 0.0217, "step": 1660 }, { "epoch": 5.661016949152542, "grad_norm": 0.44215914607048035, "learning_rate": 4.32930983805005e-05, "loss": 0.028, "step": 1670 }, { "epoch": 5.694915254237288, "grad_norm": 0.25850358605384827, "learning_rate": 4.273799962906459e-05, "loss": 0.0255, "step": 1680 }, { "epoch": 5.728813559322034, "grad_norm": 0.3987925052642822, "learning_rate": 4.218381376079921e-05, "loss": 0.0267, "step": 1690 }, { "epoch": 5.762711864406779, "grad_norm": 0.25257599353790283, "learning_rate": 4.16306104406626e-05, "loss": 0.0244, "step": 1700 }, { "epoch": 5.796610169491525, "grad_norm": 0.33424732089042664, "learning_rate": 4.1078459210099954e-05, "loss": 0.0254, "step": 1710 }, { "epoch": 5.830508474576272, "grad_norm": 0.22258403897285461, "learning_rate": 4.0527429478301596e-05, "loss": 0.0236, "step": 1720 }, { "epoch": 5.864406779661017, "grad_norm": 0.34541794657707214, "learning_rate": 3.997759051347782e-05, "loss": 0.0206, "step": 1730 }, { "epoch": 5.898305084745763, "grad_norm": 0.38338717818260193, "learning_rate": 3.9429011434151306e-05, "loss": 0.031, "step": 1740 }, { "epoch": 5.932203389830509, "grad_norm": 0.5410785675048828, "learning_rate": 3.8881761200468546e-05, "loss": 0.0269, "step": 1750 }, { "epoch": 5.966101694915254, "grad_norm": 0.3703366219997406, "learning_rate": 3.833590860553098e-05, "loss": 0.0265, "step": 1760 }, { "epoch": 6.0, "grad_norm": 1.3290477991104126, "learning_rate": 3.779152226674733e-05, "loss": 0.0231, "step": 1770 }, { "epoch": 6.033898305084746, "grad_norm": 0.3305765986442566, "learning_rate": 3.724867061720787e-05, "loss": 0.0237, "step": 1780 }, { "epoch": 6.067796610169491, "grad_norm": 0.2830105125904083, "learning_rate": 3.6707421897081947e-05, "loss": 0.0253, "step": 1790 }, { "epoch": 6.101694915254237, "grad_norm": 0.4226081073284149, "learning_rate": 3.6167844145039644e-05, "loss": 0.021, "step": 1800 }, { "epoch": 6.135593220338983, "grad_norm": 0.3365839123725891, "learning_rate": 3.563000518969898e-05, "loss": 0.0236, "step": 1810 }, { "epoch": 6.169491525423728, "grad_norm": 0.3108862042427063, "learning_rate": 3.509397264109929e-05, "loss": 0.0214, "step": 1820 }, { "epoch": 6.203389830508475, "grad_norm": 0.22683779895305634, "learning_rate": 3.455981388220223e-05, "loss": 0.0194, "step": 1830 }, { "epoch": 6.237288135593221, "grad_norm": 0.38497164845466614, "learning_rate": 3.4027596060421254e-05, "loss": 0.0241, "step": 1840 }, { "epoch": 6.271186440677966, "grad_norm": 0.22206397354602814, "learning_rate": 3.3497386079180725e-05, "loss": 0.019, "step": 1850 }, { "epoch": 6.305084745762712, "grad_norm": 0.3224073648452759, "learning_rate": 3.29692505895057e-05, "loss": 0.0231, "step": 1860 }, { "epoch": 6.338983050847458, "grad_norm": 0.28747010231018066, "learning_rate": 3.244325598164344e-05, "loss": 0.0206, "step": 1870 }, { "epoch": 6.372881355932203, "grad_norm": 0.3315770924091339, "learning_rate": 3.191946837671773e-05, "loss": 0.0192, "step": 1880 }, { "epoch": 6.406779661016949, "grad_norm": 0.38061845302581787, "learning_rate": 3.139795361841696e-05, "loss": 0.0179, "step": 1890 }, { "epoch": 6.440677966101695, "grad_norm": 0.2752900719642639, "learning_rate": 3.08787772647171e-05, "loss": 0.0217, "step": 1900 }, { "epoch": 6.47457627118644, "grad_norm": 0.270625501871109, "learning_rate": 3.03620045796407e-05, "loss": 0.0246, "step": 1910 }, { "epoch": 6.508474576271187, "grad_norm": 0.2529767155647278, "learning_rate": 2.9847700525052687e-05, "loss": 0.0192, "step": 1920 }, { "epoch": 6.5423728813559325, "grad_norm": 0.25423118472099304, "learning_rate": 2.933592975249423e-05, "loss": 0.0209, "step": 1930 }, { "epoch": 6.576271186440678, "grad_norm": 0.20597638189792633, "learning_rate": 2.8826756595055593e-05, "loss": 0.024, "step": 1940 }, { "epoch": 6.610169491525424, "grad_norm": 0.2644325792789459, "learning_rate": 2.8320245059289084e-05, "loss": 0.0249, "step": 1950 }, { "epoch": 6.6440677966101696, "grad_norm": 0.3659607768058777, "learning_rate": 2.781645881716286e-05, "loss": 0.0189, "step": 1960 }, { "epoch": 6.677966101694915, "grad_norm": 0.2306346893310547, "learning_rate": 2.731546119805708e-05, "loss": 0.0224, "step": 1970 }, { "epoch": 6.711864406779661, "grad_norm": 0.30413874983787537, "learning_rate": 2.6817315180802848e-05, "loss": 0.023, "step": 1980 }, { "epoch": 6.745762711864407, "grad_norm": 0.2451293170452118, "learning_rate": 2.63220833857654e-05, "loss": 0.0189, "step": 1990 }, { "epoch": 6.779661016949152, "grad_norm": 0.3022925555706024, "learning_rate": 2.5829828066972318e-05, "loss": 0.0217, "step": 2000 }, { "epoch": 6.813559322033898, "grad_norm": 0.2744346261024475, "learning_rate": 2.5340611104287682e-05, "loss": 0.0215, "step": 2010 }, { "epoch": 6.847457627118644, "grad_norm": 0.38032010197639465, "learning_rate": 2.4854493995633465e-05, "loss": 0.0194, "step": 2020 }, { "epoch": 6.88135593220339, "grad_norm": 0.3329201340675354, "learning_rate": 2.437153784925873e-05, "loss": 0.0217, "step": 2030 }, { "epoch": 6.915254237288136, "grad_norm": 0.3519807457923889, "learning_rate": 2.3891803376057914e-05, "loss": 0.0229, "step": 2040 }, { "epoch": 6.9491525423728815, "grad_norm": 0.3398759961128235, "learning_rate": 2.3415350881939134e-05, "loss": 0.0201, "step": 2050 }, { "epoch": 6.983050847457627, "grad_norm": 0.5052980780601501, "learning_rate": 2.2942240260243248e-05, "loss": 0.0212, "step": 2060 }, { "epoch": 7.016949152542373, "grad_norm": 0.26014694571495056, "learning_rate": 2.2472530984214823e-05, "loss": 0.02, "step": 2070 }, { "epoch": 7.0508474576271185, "grad_norm": 0.20524539053440094, "learning_rate": 2.2006282099526053e-05, "loss": 0.0203, "step": 2080 }, { "epoch": 7.084745762711864, "grad_norm": 0.26466068625450134, "learning_rate": 2.15435522168542e-05, "loss": 0.0194, "step": 2090 }, { "epoch": 7.11864406779661, "grad_norm": 0.5399367809295654, "learning_rate": 2.108439950451387e-05, "loss": 0.0216, "step": 2100 }, { "epoch": 7.1525423728813555, "grad_norm": 0.39390575885772705, "learning_rate": 2.0628881681144912e-05, "loss": 0.0199, "step": 2110 }, { "epoch": 7.186440677966102, "grad_norm": 0.17156942188739777, "learning_rate": 2.0177056008456702e-05, "loss": 0.0177, "step": 2120 }, { "epoch": 7.220338983050848, "grad_norm": 0.337688684463501, "learning_rate": 1.9728979284030025e-05, "loss": 0.0204, "step": 2130 }, { "epoch": 7.254237288135593, "grad_norm": 0.4261310398578644, "learning_rate": 1.928470783417718e-05, "loss": 0.0221, "step": 2140 }, { "epoch": 7.288135593220339, "grad_norm": 0.3683595359325409, "learning_rate": 1.8844297506861375e-05, "loss": 0.018, "step": 2150 }, { "epoch": 7.322033898305085, "grad_norm": 0.31089678406715393, "learning_rate": 1.840780366467632e-05, "loss": 0.0182, "step": 2160 }, { "epoch": 7.3559322033898304, "grad_norm": 0.3290519118309021, "learning_rate": 1.797528117788674e-05, "loss": 0.0173, "step": 2170 }, { "epoch": 7.389830508474576, "grad_norm": 0.21714557707309723, "learning_rate": 1.754678441753072e-05, "loss": 0.0181, "step": 2180 }, { "epoch": 7.423728813559322, "grad_norm": 0.2562347948551178, "learning_rate": 1.7122367248585048e-05, "loss": 0.0192, "step": 2190 }, { "epoch": 7.4576271186440675, "grad_norm": 0.22622661292552948, "learning_rate": 1.6702083023193887e-05, "loss": 0.018, "step": 2200 }, { "epoch": 7.491525423728813, "grad_norm": 0.2597718834877014, "learning_rate": 1.6285984573962098e-05, "loss": 0.0238, "step": 2210 }, { "epoch": 7.52542372881356, "grad_norm": 0.32258540391921997, "learning_rate": 1.587412420731387e-05, "loss": 0.0207, "step": 2220 }, { "epoch": 7.559322033898305, "grad_norm": 0.27688607573509216, "learning_rate": 1.5466553696917323e-05, "loss": 0.0206, "step": 2230 }, { "epoch": 7.593220338983051, "grad_norm": 0.326158344745636, "learning_rate": 1.5063324277176305e-05, "loss": 0.0187, "step": 2240 }, { "epoch": 7.627118644067797, "grad_norm": 0.22293926775455475, "learning_rate": 1.4664486636789871e-05, "loss": 0.0181, "step": 2250 }, { "epoch": 7.661016949152542, "grad_norm": 0.5270984172821045, "learning_rate": 1.4270090912380274e-05, "loss": 0.0206, "step": 2260 }, { "epoch": 7.694915254237288, "grad_norm": 0.2740112245082855, "learning_rate": 1.3880186682190583e-05, "loss": 0.0188, "step": 2270 }, { "epoch": 7.728813559322034, "grad_norm": 0.22564932703971863, "learning_rate": 1.3494822959852271e-05, "loss": 0.0165, "step": 2280 }, { "epoch": 7.762711864406779, "grad_norm": 0.22905248403549194, "learning_rate": 1.3114048188223838e-05, "loss": 0.0205, "step": 2290 }, { "epoch": 7.796610169491525, "grad_norm": 0.3510766923427582, "learning_rate": 1.2737910233301332e-05, "loss": 0.0167, "step": 2300 }, { "epoch": 7.830508474576272, "grad_norm": 0.25208619236946106, "learning_rate": 1.2366456378201135e-05, "loss": 0.0178, "step": 2310 }, { "epoch": 7.864406779661017, "grad_norm": 0.27848222851753235, "learning_rate": 1.1999733317216222e-05, "loss": 0.0208, "step": 2320 }, { "epoch": 7.898305084745763, "grad_norm": 0.2933652997016907, "learning_rate": 1.1637787149946377e-05, "loss": 0.0171, "step": 2330 }, { "epoch": 7.932203389830509, "grad_norm": 0.28821244835853577, "learning_rate": 1.128066337550312e-05, "loss": 0.0204, "step": 2340 }, { "epoch": 7.966101694915254, "grad_norm": 0.3546352684497833, "learning_rate": 1.0928406886790155e-05, "loss": 0.0192, "step": 2350 }, { "epoch": 8.0, "grad_norm": 0.7775828242301941, "learning_rate": 1.0581061964860085e-05, "loss": 0.018, "step": 2360 }, { "epoch": 8.033898305084746, "grad_norm": 0.3042224943637848, "learning_rate": 1.0238672273347894e-05, "loss": 0.0163, "step": 2370 }, { "epoch": 8.067796610169491, "grad_norm": 0.24432986974716187, "learning_rate": 9.901280852982203e-06, "loss": 0.0215, "step": 2380 }, { "epoch": 8.101694915254237, "grad_norm": 0.25356370210647583, "learning_rate": 9.568930116174734e-06, "loss": 0.0198, "step": 2390 }, { "epoch": 8.135593220338983, "grad_norm": 0.2504992187023163, "learning_rate": 9.241661841688732e-06, "loss": 0.0141, "step": 2400 }, { "epoch": 8.169491525423728, "grad_norm": 0.33074647188186646, "learning_rate": 8.919517169387165e-06, "loss": 0.0205, "step": 2410 }, { "epoch": 8.203389830508474, "grad_norm": 0.336225301027298, "learning_rate": 8.602536595061111e-06, "loss": 0.0203, "step": 2420 }, { "epoch": 8.23728813559322, "grad_norm": 0.17460206151008606, "learning_rate": 8.290759965339145e-06, "loss": 0.0199, "step": 2430 }, { "epoch": 8.271186440677965, "grad_norm": 0.26632779836654663, "learning_rate": 7.98422647267844e-06, "loss": 0.0159, "step": 2440 }, { "epoch": 8.305084745762711, "grad_norm": 0.22440572082996368, "learning_rate": 7.682974650437879e-06, "loss": 0.0193, "step": 2450 }, { "epoch": 8.338983050847457, "grad_norm": 0.30192360281944275, "learning_rate": 7.387042368034219e-06, "loss": 0.0183, "step": 2460 }, { "epoch": 8.372881355932204, "grad_norm": 0.2981124818325043, "learning_rate": 7.096466826181669e-06, "loss": 0.0203, "step": 2470 }, { "epoch": 8.40677966101695, "grad_norm": 0.1849370151758194, "learning_rate": 6.811284552215435e-06, "loss": 0.0155, "step": 2480 }, { "epoch": 8.440677966101696, "grad_norm": 0.2095867246389389, "learning_rate": 6.531531395500046e-06, "loss": 0.0181, "step": 2490 }, { "epoch": 8.474576271186441, "grad_norm": 0.3721981644630432, "learning_rate": 6.257242522922813e-06, "loss": 0.0186, "step": 2500 }, { "epoch": 8.508474576271187, "grad_norm": 0.19706839323043823, "learning_rate": 5.988452414473106e-06, "loss": 0.0184, "step": 2510 }, { "epoch": 8.542372881355933, "grad_norm": 0.2606714367866516, "learning_rate": 5.725194858907989e-06, "loss": 0.0172, "step": 2520 }, { "epoch": 8.576271186440678, "grad_norm": 0.1873464286327362, "learning_rate": 5.467502949504783e-06, "loss": 0.0181, "step": 2530 }, { "epoch": 8.610169491525424, "grad_norm": 0.2513989806175232, "learning_rate": 5.215409079900923e-06, "loss": 0.0171, "step": 2540 }, { "epoch": 8.64406779661017, "grad_norm": 0.3158385157585144, "learning_rate": 4.9689449400219635e-06, "loss": 0.0173, "step": 2550 }, { "epoch": 8.677966101694915, "grad_norm": 0.22146488726139069, "learning_rate": 4.7281415120978504e-06, "loss": 0.0162, "step": 2560 }, { "epoch": 8.711864406779661, "grad_norm": 0.23577848076820374, "learning_rate": 4.4930290667682786e-06, "loss": 0.0172, "step": 2570 }, { "epoch": 8.745762711864407, "grad_norm": 0.41719919443130493, "learning_rate": 4.263637159277495e-06, "loss": 0.0173, "step": 2580 }, { "epoch": 8.779661016949152, "grad_norm": 0.3312488794326782, "learning_rate": 4.039994625758925e-06, "loss": 0.0182, "step": 2590 }, { "epoch": 8.813559322033898, "grad_norm": 0.2829158306121826, "learning_rate": 3.822129579610356e-06, "loss": 0.0181, "step": 2600 }, { "epoch": 8.847457627118644, "grad_norm": 0.24710486829280853, "learning_rate": 3.6100694079598485e-06, "loss": 0.0166, "step": 2610 }, { "epoch": 8.88135593220339, "grad_norm": 0.1250915676355362, "learning_rate": 3.4038407682229768e-06, "loss": 0.0153, "step": 2620 }, { "epoch": 8.915254237288135, "grad_norm": 0.1759193390607834, "learning_rate": 3.2034695847518227e-06, "loss": 0.0161, "step": 2630 }, { "epoch": 8.94915254237288, "grad_norm": 0.23805756866931915, "learning_rate": 3.0089810455761316e-06, "loss": 0.0165, "step": 2640 }, { "epoch": 8.983050847457626, "grad_norm": 0.19876998662948608, "learning_rate": 2.8203995992369503e-06, "loss": 0.0156, "step": 2650 }, { "epoch": 9.016949152542374, "grad_norm": 0.21514378488063812, "learning_rate": 2.637748951713348e-06, "loss": 0.0199, "step": 2660 }, { "epoch": 9.05084745762712, "grad_norm": 0.29321616888046265, "learning_rate": 2.4610520634423374e-06, "loss": 0.0158, "step": 2670 }, { "epoch": 9.084745762711865, "grad_norm": 0.23713384568691254, "learning_rate": 2.290331146432645e-06, "loss": 0.018, "step": 2680 }, { "epoch": 9.11864406779661, "grad_norm": 0.40180814266204834, "learning_rate": 2.125607661472495e-06, "loss": 0.017, "step": 2690 }, { "epoch": 9.152542372881356, "grad_norm": 0.2338988482952118, "learning_rate": 1.9669023154318235e-06, "loss": 0.0172, "step": 2700 }, { "epoch": 9.186440677966102, "grad_norm": 0.3559688329696655, "learning_rate": 1.814235058659297e-06, "loss": 0.016, "step": 2710 }, { "epoch": 9.220338983050848, "grad_norm": 0.191647008061409, "learning_rate": 1.6676250824744444e-06, "loss": 0.0161, "step": 2720 }, { "epoch": 9.254237288135593, "grad_norm": 0.20268139243125916, "learning_rate": 1.5270908167551223e-06, "loss": 0.0165, "step": 2730 }, { "epoch": 9.288135593220339, "grad_norm": 0.1706833839416504, "learning_rate": 1.3926499276207872e-06, "loss": 0.0191, "step": 2740 }, { "epoch": 9.322033898305085, "grad_norm": 0.1586904525756836, "learning_rate": 1.2643193152117428e-06, "loss": 0.0148, "step": 2750 }, { "epoch": 9.35593220338983, "grad_norm": 0.18658769130706787, "learning_rate": 1.1421151115646621e-06, "loss": 0.0142, "step": 2760 }, { "epoch": 9.389830508474576, "grad_norm": 0.3142736852169037, "learning_rate": 1.0260526785846936e-06, "loss": 0.0146, "step": 2770 }, { "epoch": 9.423728813559322, "grad_norm": 0.35053491592407227, "learning_rate": 9.161466061143653e-07, "loss": 0.0167, "step": 2780 }, { "epoch": 9.457627118644067, "grad_norm": 0.28281068801879883, "learning_rate": 8.124107100995093e-07, "loss": 0.0185, "step": 2790 }, { "epoch": 9.491525423728813, "grad_norm": 0.23286263644695282, "learning_rate": 7.14858030852572e-07, "loss": 0.0142, "step": 2800 }, { "epoch": 9.525423728813559, "grad_norm": 0.2240029126405716, "learning_rate": 6.235008314132762e-07, "loss": 0.0162, "step": 2810 }, { "epoch": 9.559322033898304, "grad_norm": 0.17313669621944427, "learning_rate": 5.383505960071256e-07, "loss": 0.0131, "step": 2820 }, { "epoch": 9.59322033898305, "grad_norm": 0.2640604078769684, "learning_rate": 4.5941802860176017e-07, "loss": 0.0159, "step": 2830 }, { "epoch": 9.627118644067796, "grad_norm": 0.26407647132873535, "learning_rate": 3.8671305156135484e-07, "loss": 0.0128, "step": 2840 }, { "epoch": 9.661016949152543, "grad_norm": 0.19562020897865295, "learning_rate": 3.2024480439934445e-07, "loss": 0.0145, "step": 2850 }, { "epoch": 9.694915254237289, "grad_norm": 0.1296786069869995, "learning_rate": 2.6002164262951546e-07, "loss": 0.0153, "step": 2860 }, { "epoch": 9.728813559322035, "grad_norm": 0.14787475764751434, "learning_rate": 2.06051136715657e-07, "loss": 0.0141, "step": 2870 }, { "epoch": 9.76271186440678, "grad_norm": 0.20423457026481628, "learning_rate": 1.5834007111991122e-07, "loss": 0.0188, "step": 2880 }, { "epoch": 9.796610169491526, "grad_norm": 0.2300773411989212, "learning_rate": 1.168944434499053e-07, "loss": 0.0158, "step": 2890 }, { "epoch": 9.830508474576272, "grad_norm": 0.39848440885543823, "learning_rate": 8.171946370481576e-08, "loss": 0.0168, "step": 2900 }, { "epoch": 9.864406779661017, "grad_norm": 0.1742592751979828, "learning_rate": 5.281955362045343e-08, "loss": 0.0191, "step": 2910 }, { "epoch": 9.898305084745763, "grad_norm": 0.21877504885196686, "learning_rate": 3.019834611339145e-08, "loss": 0.0146, "step": 2920 }, { "epoch": 9.932203389830509, "grad_norm": 0.19842731952667236, "learning_rate": 1.38586848243083e-08, "loss": 0.0144, "step": 2930 }, { "epoch": 9.966101694915254, "grad_norm": 0.1726018637418747, "learning_rate": 3.802623760501556e-09, "loss": 0.017, "step": 2940 }, { "epoch": 10.0, "grad_norm": 1.1511099338531494, "learning_rate": 3.1427037711084796e-11, "loss": 0.0179, "step": 2950 }, { "epoch": 10.0, "step": 2950, "total_flos": 0.0, "train_loss": 0.03957842677326526, "train_runtime": 3172.5088, "train_samples_per_second": 45.431, "train_steps_per_second": 0.93 } ], "logging_steps": 10, "max_steps": 2950, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 49, "trial_name": null, "trial_params": null }