{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 34.89855072463768,
  "eval_steps": 500,
  "global_step": 1505,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.11594202898550725,
      "grad_norm": 7.532149791717529,
      "learning_rate": 4.999863832700438e-05,
      "loss": 3.8116,
      "num_input_tokens_seen": 106929,
      "step": 5
    },
    {
      "epoch": 0.2318840579710145,
      "grad_norm": 4.141716480255127,
      "learning_rate": 4.999455345634978e-05,
      "loss": 3.6928,
      "num_input_tokens_seen": 225964,
      "step": 10
    },
    {
      "epoch": 0.34782608695652173,
      "grad_norm": 3.470097780227661,
      "learning_rate": 4.9987745833016855e-05,
      "loss": 3.6227,
      "num_input_tokens_seen": 362264,
      "step": 15
    },
    {
      "epoch": 0.463768115942029,
      "grad_norm": 3.4544646739959717,
      "learning_rate": 4.9978216198586135e-05,
      "loss": 3.601,
      "num_input_tokens_seen": 477807,
      "step": 20
    },
    {
      "epoch": 0.5797101449275363,
      "grad_norm": 3.249224901199341,
      "learning_rate": 4.996596559115731e-05,
      "loss": 3.539,
      "num_input_tokens_seen": 588900,
      "step": 25
    },
    {
      "epoch": 0.6956521739130435,
      "grad_norm": 3.395056962966919,
      "learning_rate": 4.995099534523607e-05,
      "loss": 3.4956,
      "num_input_tokens_seen": 706077,
      "step": 30
    },
    {
      "epoch": 0.8115942028985508,
      "grad_norm": 3.997875213623047,
      "learning_rate": 4.9933307091588796e-05,
      "loss": 3.5044,
      "num_input_tokens_seen": 853504,
      "step": 35
    },
    {
      "epoch": 0.927536231884058,
      "grad_norm": 3.5168681144714355,
      "learning_rate": 4.991290275706486e-05,
      "loss": 3.4324,
      "num_input_tokens_seen": 990472,
      "step": 40
    },
    {
      "epoch": 1.0434782608695652,
      "grad_norm": 7.144646167755127,
      "learning_rate": 4.988978456438678e-05,
      "loss": 3.2542,
      "num_input_tokens_seen": 1125870,
      "step": 45
    },
    {
      "epoch": 1.1594202898550725,
      "grad_norm": 3.257103681564331,
      "learning_rate": 4.986395503190805e-05,
      "loss": 2.9024,
      "num_input_tokens_seen": 1249877,
      "step": 50
    },
    {
      "epoch": 1.2753623188405796,
      "grad_norm": 3.3208603858947754,
      "learning_rate": 4.983541697333881e-05,
      "loss": 2.8069,
      "num_input_tokens_seen": 1375193,
      "step": 55
    },
    {
      "epoch": 1.391304347826087,
      "grad_norm": 4.378167629241943,
      "learning_rate": 4.980417349743936e-05,
      "loss": 2.75,
      "num_input_tokens_seen": 1489716,
      "step": 60
    },
    {
      "epoch": 1.5072463768115942,
      "grad_norm": 4.321849822998047,
      "learning_rate": 4.9770228007681494e-05,
      "loss": 2.7329,
      "num_input_tokens_seen": 1600483,
      "step": 65
    },
    {
      "epoch": 1.6231884057971016,
      "grad_norm": 3.6365067958831787,
      "learning_rate": 4.973358420187776e-05,
      "loss": 2.8212,
      "num_input_tokens_seen": 1731315,
      "step": 70
    },
    {
      "epoch": 1.7391304347826086,
      "grad_norm": 3.74035906791687,
      "learning_rate": 4.9694246071778604e-05,
      "loss": 2.7935,
      "num_input_tokens_seen": 1858269,
      "step": 75
    },
    {
      "epoch": 1.855072463768116,
      "grad_norm": 23.26426124572754,
      "learning_rate": 4.9652217902637596e-05,
      "loss": 2.7305,
      "num_input_tokens_seen": 1984587,
      "step": 80
    },
    {
      "epoch": 1.971014492753623,
      "grad_norm": 4.870578289031982,
      "learning_rate": 4.9607504272744575e-05,
      "loss": 2.6482,
      "num_input_tokens_seen": 2109391,
      "step": 85
    },
    {
      "epoch": 2.0869565217391304,
      "grad_norm": 4.419096946716309,
      "learning_rate": 4.956011005292692e-05,
      "loss": 2.4292,
      "num_input_tokens_seen": 2246413,
      "step": 90
    },
    {
      "epoch": 2.2028985507246377,
      "grad_norm": 18.216915130615234,
      "learning_rate": 4.951004040601898e-05,
      "loss": 2.1416,
      "num_input_tokens_seen": 2386890,
      "step": 95
    },
    {
      "epoch": 2.318840579710145,
      "grad_norm": 4.4814581871032715,
      "learning_rate": 4.945730078629964e-05,
      "loss": 2.2847,
      "num_input_tokens_seen": 2522302,
      "step": 100
    },
    {
      "epoch": 2.4347826086956523,
      "grad_norm": 406.9701232910156,
      "learning_rate": 4.9401896938898185e-05,
      "loss": 2.0944,
      "num_input_tokens_seen": 2642208,
      "step": 105
    },
    {
      "epoch": 2.550724637681159,
      "grad_norm": 3.5890581607818604,
      "learning_rate": 4.934383489916843e-05,
      "loss": 2.2862,
      "num_input_tokens_seen": 2780587,
      "step": 110
    },
    {
      "epoch": 2.6666666666666665,
      "grad_norm": 5.334541320800781,
      "learning_rate": 4.928312099203131e-05,
      "loss": 2.105,
      "num_input_tokens_seen": 2885320,
      "step": 115
    },
    {
      "epoch": 2.782608695652174,
      "grad_norm": 5.464664936065674,
      "learning_rate": 4.921976183128585e-05,
      "loss": 2.0287,
      "num_input_tokens_seen": 2996923,
      "step": 120
    },
    {
      "epoch": 2.898550724637681,
      "grad_norm": 4.113780975341797,
      "learning_rate": 4.9153764318888706e-05,
      "loss": 2.0162,
      "num_input_tokens_seen": 3102391,
      "step": 125
    },
    {
      "epoch": 3.0144927536231885,
      "grad_norm": 6.009971618652344,
      "learning_rate": 4.908513564420231e-05,
      "loss": 2.2464,
      "num_input_tokens_seen": 3233443,
      "step": 130
    },
    {
      "epoch": 3.130434782608696,
      "grad_norm": 10.397327423095703,
      "learning_rate": 4.90138832832117e-05,
      "loss": 1.6561,
      "num_input_tokens_seen": 3358966,
      "step": 135
    },
    {
      "epoch": 3.246376811594203,
      "grad_norm": 4.9139556884765625,
      "learning_rate": 4.894001499771015e-05,
      "loss": 1.6113,
      "num_input_tokens_seen": 3490069,
      "step": 140
    },
    {
      "epoch": 3.36231884057971,
      "grad_norm": 4.146034240722656,
      "learning_rate": 4.886353883445363e-05,
      "loss": 1.6235,
      "num_input_tokens_seen": 3609842,
      "step": 145
    },
    {
      "epoch": 3.4782608695652173,
      "grad_norm": 4.301880359649658,
      "learning_rate": 4.878446312428424e-05,
      "loss": 1.7873,
      "num_input_tokens_seen": 3751570,
      "step": 150
    },
    {
      "epoch": 3.5942028985507246,
      "grad_norm": 3.9485158920288086,
      "learning_rate": 4.8702796481222714e-05,
      "loss": 1.3723,
      "num_input_tokens_seen": 3865303,
      "step": 155
    },
    {
      "epoch": 3.710144927536232,
      "grad_norm": 4.183668613433838,
      "learning_rate": 4.861854780153004e-05,
      "loss": 1.6512,
      "num_input_tokens_seen": 3991347,
      "step": 160
    },
    {
      "epoch": 3.8260869565217392,
      "grad_norm": 5.1000471115112305,
      "learning_rate": 4.853172626273841e-05,
      "loss": 1.5524,
      "num_input_tokens_seen": 4113654,
      "step": 165
    },
    {
      "epoch": 3.942028985507246,
      "grad_norm": 4.142239570617676,
      "learning_rate": 4.8442341322651385e-05,
      "loss": 1.5954,
      "num_input_tokens_seen": 4236348,
      "step": 170
    },
    {
      "epoch": 4.057971014492754,
      "grad_norm": 3.8976669311523438,
      "learning_rate": 4.83504027183137e-05,
      "loss": 1.1652,
      "num_input_tokens_seen": 4340378,
      "step": 175
    },
    {
      "epoch": 4.173913043478261,
      "grad_norm": 5.923389911651611,
      "learning_rate": 4.825592046495054e-05,
      "loss": 1.1995,
      "num_input_tokens_seen": 4473601,
      "step": 180
    },
    {
      "epoch": 4.2898550724637685,
      "grad_norm": 4.220530033111572,
      "learning_rate": 4.8158904854876555e-05,
      "loss": 0.9431,
      "num_input_tokens_seen": 4586911,
      "step": 185
    },
    {
      "epoch": 4.405797101449275,
      "grad_norm": 5.896139144897461,
      "learning_rate": 4.805936645637463e-05,
      "loss": 1.1136,
      "num_input_tokens_seen": 4702445,
      "step": 190
    },
    {
      "epoch": 4.521739130434782,
      "grad_norm": 4.467094421386719,
      "learning_rate": 4.795731611254473e-05,
      "loss": 1.1509,
      "num_input_tokens_seen": 4831301,
      "step": 195
    },
    {
      "epoch": 4.63768115942029,
      "grad_norm": 4.232386112213135,
      "learning_rate": 4.785276494012263e-05,
      "loss": 0.9962,
      "num_input_tokens_seen": 4941656,
      "step": 200
    },
    {
      "epoch": 4.753623188405797,
      "grad_norm": 4.829892635345459,
      "learning_rate": 4.7745724328269e-05,
      "loss": 1.2377,
      "num_input_tokens_seen": 5088437,
      "step": 205
    },
    {
      "epoch": 4.869565217391305,
      "grad_norm": 4.1343913078308105,
      "learning_rate": 4.763620593732867e-05,
      "loss": 1.234,
      "num_input_tokens_seen": 5219806,
      "step": 210
    },
    {
      "epoch": 4.9855072463768115,
      "grad_norm": 4.9217729568481445,
      "learning_rate": 4.752422169756048e-05,
      "loss": 1.1453,
      "num_input_tokens_seen": 5340222,
      "step": 215
    },
    {
      "epoch": 5.101449275362318,
      "grad_norm": 4.4605865478515625,
      "learning_rate": 4.740978380783765e-05,
      "loss": 0.9056,
      "num_input_tokens_seen": 5476315,
      "step": 220
    },
    {
      "epoch": 5.217391304347826,
      "grad_norm": 4.396484375,
      "learning_rate": 4.7292904734318924e-05,
      "loss": 0.7349,
      "num_input_tokens_seen": 5589951,
      "step": 225
    },
    {
      "epoch": 5.333333333333333,
      "grad_norm": 4.053436279296875,
      "learning_rate": 4.7173597209090534e-05,
      "loss": 0.6968,
      "num_input_tokens_seen": 5711449,
      "step": 230
    },
    {
      "epoch": 5.449275362318841,
      "grad_norm": 5.303736209869385,
      "learning_rate": 4.70518742287793e-05,
      "loss": 0.851,
      "num_input_tokens_seen": 5852650,
      "step": 235
    },
    {
      "epoch": 5.565217391304348,
      "grad_norm": 3.70810866355896,
      "learning_rate": 4.6927749053136866e-05,
      "loss": 0.716,
      "num_input_tokens_seen": 5972289,
      "step": 240
    },
    {
      "epoch": 5.681159420289855,
      "grad_norm": 3.9204599857330322,
      "learning_rate": 4.6801235203595195e-05,
      "loss": 0.6384,
      "num_input_tokens_seen": 6088707,
      "step": 245
    },
    {
      "epoch": 5.797101449275362,
      "grad_norm": 4.06931209564209,
      "learning_rate": 4.667234646179368e-05,
      "loss": 0.7799,
      "num_input_tokens_seen": 6215471,
      "step": 250
    },
    {
      "epoch": 5.913043478260869,
      "grad_norm": 4.283618450164795,
      "learning_rate": 4.654109686807787e-05,
      "loss": 0.7923,
      "num_input_tokens_seen": 6335935,
      "step": 255
    },
    {
      "epoch": 6.028985507246377,
      "grad_norm": 4.719886302947998,
      "learning_rate": 4.640750071996995e-05,
      "loss": 0.7452,
      "num_input_tokens_seen": 6463689,
      "step": 260
    },
    {
      "epoch": 6.144927536231884,
      "grad_norm": 3.8415334224700928,
      "learning_rate": 4.6271572570611296e-05,
      "loss": 0.4085,
      "num_input_tokens_seen": 6576954,
      "step": 265
    },
    {
      "epoch": 6.260869565217392,
      "grad_norm": 4.19309663772583,
      "learning_rate": 4.613332722717714e-05,
      "loss": 0.5777,
      "num_input_tokens_seen": 6714404,
      "step": 270
    },
    {
      "epoch": 6.3768115942028984,
      "grad_norm": 5.686235427856445,
      "learning_rate": 4.5992779749263546e-05,
      "loss": 0.4718,
      "num_input_tokens_seen": 6840385,
      "step": 275
    },
    {
      "epoch": 6.492753623188406,
      "grad_norm": 3.2365808486938477,
      "learning_rate": 4.584994544724695e-05,
      "loss": 0.3723,
      "num_input_tokens_seen": 6954269,
      "step": 280
    },
    {
      "epoch": 6.608695652173913,
      "grad_norm": 3.530801296234131,
      "learning_rate": 4.5704839880616296e-05,
      "loss": 0.4453,
      "num_input_tokens_seen": 7076143,
      "step": 285
    },
    {
      "epoch": 6.72463768115942,
      "grad_norm": 3.2134931087493896,
      "learning_rate": 4.5557478856278114e-05,
      "loss": 0.5742,
      "num_input_tokens_seen": 7201833,
      "step": 290
    },
    {
      "epoch": 6.840579710144928,
      "grad_norm": 6.281985282897949,
      "learning_rate": 4.5407878426834596e-05,
      "loss": 0.5291,
      "num_input_tokens_seen": 7330479,
      "step": 295
    },
    {
      "epoch": 6.956521739130435,
      "grad_norm": 17.072542190551758,
      "learning_rate": 4.5256054888834934e-05,
      "loss": 0.4968,
      "num_input_tokens_seen": 7449244,
      "step": 300
    },
    {
      "epoch": 7.072463768115942,
      "grad_norm": 3.737456798553467,
      "learning_rate": 4.5102024781000077e-05,
      "loss": 0.421,
      "num_input_tokens_seen": 7578947,
      "step": 305
    },
    {
      "epoch": 7.188405797101449,
      "grad_norm": 2.6586523056030273,
      "learning_rate": 4.4945804882421086e-05,
      "loss": 0.2767,
      "num_input_tokens_seen": 7691948,
      "step": 310
    },
    {
      "epoch": 7.304347826086957,
      "grad_norm": 2.027702808380127,
      "learning_rate": 4.478741221073136e-05,
      "loss": 0.2922,
      "num_input_tokens_seen": 7815786,
      "step": 315
    },
    {
      "epoch": 7.420289855072464,
      "grad_norm": 3.4651787281036377,
      "learning_rate": 4.4626864020252774e-05,
      "loss": 0.2768,
      "num_input_tokens_seen": 7925106,
      "step": 320
    },
    {
      "epoch": 7.536231884057971,
      "grad_norm": 4.559577941894531,
      "learning_rate": 4.446417780011618e-05,
      "loss": 0.3281,
      "num_input_tokens_seen": 8057202,
      "step": 325
    },
    {
      "epoch": 7.6521739130434785,
      "grad_norm": 2.5885751247406006,
      "learning_rate": 4.42993712723562e-05,
      "loss": 0.3374,
      "num_input_tokens_seen": 8187865,
      "step": 330
    },
    {
      "epoch": 7.768115942028985,
      "grad_norm": 2.792222023010254,
      "learning_rate": 4.413246238998069e-05,
      "loss": 0.2491,
      "num_input_tokens_seen": 8304605,
      "step": 335
    },
    {
      "epoch": 7.884057971014493,
      "grad_norm": 3.610206127166748,
      "learning_rate": 4.3963469335015085e-05,
      "loss": 0.3893,
      "num_input_tokens_seen": 8437319,
      "step": 340
    },
    {
      "epoch": 8.0,
      "grad_norm": 4.31643533706665,
      "learning_rate": 4.379241051652174e-05,
      "loss": 0.3761,
      "num_input_tokens_seen": 8573080,
      "step": 345
    },
    {
      "epoch": 8.115942028985508,
      "grad_norm": 2.2834160327911377,
      "learning_rate": 4.361930456859456e-05,
      "loss": 0.236,
      "num_input_tokens_seen": 8707741,
      "step": 350
    },
    {
      "epoch": 8.231884057971014,
      "grad_norm": 2.6929121017456055,
      "learning_rate": 4.34441703483291e-05,
      "loss": 0.1584,
      "num_input_tokens_seen": 8825774,
      "step": 355
    },
    {
      "epoch": 8.347826086956522,
      "grad_norm": 3.8095011711120605,
      "learning_rate": 4.326702693376844e-05,
      "loss": 0.1481,
      "num_input_tokens_seen": 8932249,
      "step": 360
    },
    {
      "epoch": 8.46376811594203,
      "grad_norm": 2.6493489742279053,
      "learning_rate": 4.308789362182492e-05,
      "loss": 0.1743,
      "num_input_tokens_seen": 9051548,
      "step": 365
    },
    {
      "epoch": 8.579710144927537,
      "grad_norm": 30.796459197998047,
      "learning_rate": 4.290678992617798e-05,
      "loss": 0.3162,
      "num_input_tokens_seen": 9197232,
      "step": 370
    },
    {
      "epoch": 8.695652173913043,
      "grad_norm": 3.3164985179901123,
      "learning_rate": 4.272373557514858e-05,
      "loss": 0.2235,
      "num_input_tokens_seen": 9317650,
      "step": 375
    },
    {
      "epoch": 8.81159420289855,
      "grad_norm": 3.1515417098999023,
      "learning_rate": 4.2538750509550054e-05,
      "loss": 0.2504,
      "num_input_tokens_seen": 9450765,
      "step": 380
    },
    {
      "epoch": 8.927536231884059,
      "grad_norm": 3.3926901817321777,
      "learning_rate": 4.235185488051585e-05,
      "loss": 0.2136,
      "num_input_tokens_seen": 9582961,
      "step": 385
    },
    {
      "epoch": 9.043478260869565,
      "grad_norm": 4.670753002166748,
      "learning_rate": 4.216306904730447e-05,
      "loss": 0.1047,
      "num_input_tokens_seen": 9678616,
      "step": 390
    },
    {
      "epoch": 9.159420289855072,
      "grad_norm": 2.166652202606201,
      "learning_rate": 4.1972413575081595e-05,
      "loss": 0.1015,
      "num_input_tokens_seen": 9788512,
      "step": 395
    },
    {
      "epoch": 9.27536231884058,
      "grad_norm": 2.1161272525787354,
      "learning_rate": 4.177990923267986e-05,
      "loss": 0.1505,
      "num_input_tokens_seen": 9916229,
      "step": 400
    },
    {
      "epoch": 9.391304347826088,
      "grad_norm": 2.378105401992798,
      "learning_rate": 4.158557699033644e-05,
      "loss": 0.1135,
      "num_input_tokens_seen": 10042697,
      "step": 405
    },
    {
      "epoch": 9.507246376811594,
      "grad_norm": 2.5567331314086914,
      "learning_rate": 4.138943801740865e-05,
      "loss": 0.1832,
      "num_input_tokens_seen": 10171849,
      "step": 410
    },
    {
      "epoch": 9.623188405797102,
      "grad_norm": 2.022610902786255,
      "learning_rate": 4.119151368006793e-05,
      "loss": 0.1178,
      "num_input_tokens_seen": 10281924,
      "step": 415
    },
    {
      "epoch": 9.73913043478261,
      "grad_norm": 2.5578079223632812,
      "learning_rate": 4.099182553897229e-05,
      "loss": 0.1426,
      "num_input_tokens_seen": 10418758,
      "step": 420
    },
    {
      "epoch": 9.855072463768115,
      "grad_norm": 2.7287228107452393,
      "learning_rate": 4.079039534691767e-05,
      "loss": 0.1603,
      "num_input_tokens_seen": 10558322,
      "step": 425
    },
    {
      "epoch": 9.971014492753623,
      "grad_norm": 2.361532688140869,
      "learning_rate": 4.058724504646834e-05,
      "loss": 0.1548,
      "num_input_tokens_seen": 10679536,
      "step": 430
    },
    {
      "epoch": 10.08695652173913,
      "grad_norm": 1.8757002353668213,
      "learning_rate": 4.0382396767566536e-05,
      "loss": 0.1407,
      "num_input_tokens_seen": 10821076,
      "step": 435
    },
    {
      "epoch": 10.202898550724637,
      "grad_norm": 2.352725028991699,
      "learning_rate": 4.017587282512181e-05,
      "loss": 0.0791,
      "num_input_tokens_seen": 10949771,
      "step": 440
    },
    {
      "epoch": 10.318840579710145,
      "grad_norm": 1.7948939800262451,
      "learning_rate": 3.9967695716580224e-05,
      "loss": 0.0722,
      "num_input_tokens_seen": 11072044,
      "step": 445
    },
    {
      "epoch": 10.434782608695652,
      "grad_norm": 1.954727292060852,
      "learning_rate": 3.975788811947351e-05,
      "loss": 0.0655,
      "num_input_tokens_seen": 11182627,
      "step": 450
    },
    {
      "epoch": 10.55072463768116,
      "grad_norm": 2.143941640853882,
      "learning_rate": 3.954647288894883e-05,
      "loss": 0.0723,
      "num_input_tokens_seen": 11303028,
      "step": 455
    },
    {
      "epoch": 10.666666666666666,
      "grad_norm": 2.0527164936065674,
      "learning_rate": 3.933347305527898e-05,
      "loss": 0.0655,
      "num_input_tokens_seen": 11415868,
      "step": 460
    },
    {
      "epoch": 10.782608695652174,
      "grad_norm": 1.6390535831451416,
      "learning_rate": 3.911891182135371e-05,
      "loss": 0.1534,
      "num_input_tokens_seen": 11555653,
      "step": 465
    },
    {
      "epoch": 10.898550724637682,
      "grad_norm": 2.3848719596862793,
      "learning_rate": 3.8902812560152066e-05,
      "loss": 0.0947,
      "num_input_tokens_seen": 11681065,
      "step": 470
    },
    {
      "epoch": 11.014492753623188,
      "grad_norm": 2.2094757556915283,
      "learning_rate": 3.868519881219631e-05,
      "loss": 0.0868,
      "num_input_tokens_seen": 11809957,
      "step": 475
    },
    {
      "epoch": 11.130434782608695,
      "grad_norm": 4.137216567993164,
      "learning_rate": 3.846609428298757e-05,
      "loss": 0.0467,
      "num_input_tokens_seen": 11937881,
      "step": 480
    },
    {
      "epoch": 11.246376811594203,
      "grad_norm": 1.6658189296722412,
      "learning_rate": 3.824552284042351e-05,
      "loss": 0.0521,
      "num_input_tokens_seen": 12048905,
      "step": 485
    },
    {
      "epoch": 11.36231884057971,
      "grad_norm": 1.5732171535491943,
      "learning_rate": 3.8023508512198256e-05,
      "loss": 0.051,
      "num_input_tokens_seen": 12185453,
      "step": 490
    },
    {
      "epoch": 11.478260869565217,
      "grad_norm": 1.8459701538085938,
      "learning_rate": 3.780007548318507e-05,
      "loss": 0.0753,
      "num_input_tokens_seen": 12310911,
      "step": 495
    },
    {
      "epoch": 11.594202898550725,
      "grad_norm": 1.4724109172821045,
      "learning_rate": 3.7575248092801686e-05,
      "loss": 0.0601,
      "num_input_tokens_seen": 12439708,
      "step": 500
    },
    {
      "epoch": 11.710144927536232,
      "grad_norm": 2.4690322875976562,
      "learning_rate": 3.734905083235901e-05,
      "loss": 0.0533,
      "num_input_tokens_seen": 12554467,
      "step": 505
    },
    {
      "epoch": 11.826086956521738,
      "grad_norm": 2.369218111038208,
      "learning_rate": 3.712150834239313e-05,
      "loss": 0.064,
      "num_input_tokens_seen": 12682329,
      "step": 510
    },
    {
      "epoch": 11.942028985507246,
      "grad_norm": 1.6901100873947144,
      "learning_rate": 3.689264540998116e-05,
      "loss": 0.0755,
      "num_input_tokens_seen": 12800852,
      "step": 515
    },
    {
      "epoch": 12.057971014492754,
      "grad_norm": 1.303114414215088,
      "learning_rate": 3.66624869660411e-05,
      "loss": 0.0553,
      "num_input_tokens_seen": 12917527,
      "step": 520
    },
    {
      "epoch": 12.173913043478262,
      "grad_norm": 1.1986353397369385,
      "learning_rate": 3.6431058082615964e-05,
      "loss": 0.0355,
      "num_input_tokens_seen": 13044774,
      "step": 525
    },
    {
      "epoch": 12.289855072463768,
      "grad_norm": 1.5653026103973389,
      "learning_rate": 3.619838397014263e-05,
      "loss": 0.0413,
      "num_input_tokens_seen": 13175692,
      "step": 530
    },
    {
      "epoch": 12.405797101449275,
      "grad_norm": 1.0767664909362793,
      "learning_rate": 3.5964489974705553e-05,
      "loss": 0.0596,
      "num_input_tokens_seen": 13293164,
      "step": 535
    },
    {
      "epoch": 12.521739130434783,
      "grad_norm": 1.6005312204360962,
      "learning_rate": 3.572940157527572e-05,
      "loss": 0.0479,
      "num_input_tokens_seen": 13417894,
      "step": 540
    },
    {
      "epoch": 12.63768115942029,
      "grad_norm": 1.627121925354004,
      "learning_rate": 3.549314438093515e-05,
      "loss": 0.047,
      "num_input_tokens_seen": 13551913,
      "step": 545
    },
    {
      "epoch": 12.753623188405797,
      "grad_norm": 2.239276647567749,
      "learning_rate": 3.525574412808717e-05,
      "loss": 0.0492,
      "num_input_tokens_seen": 13675309,
      "step": 550
    },
    {
      "epoch": 12.869565217391305,
      "grad_norm": 1.5702998638153076,
      "learning_rate": 3.501722667765286e-05,
      "loss": 0.0471,
      "num_input_tokens_seen": 13797691,
      "step": 555
    },
    {
      "epoch": 12.985507246376812,
      "grad_norm": 1.8216972351074219,
      "learning_rate": 3.47776180122539e-05,
      "loss": 0.1041,
      "num_input_tokens_seen": 13919770,
      "step": 560
    },
    {
      "epoch": 13.101449275362318,
      "grad_norm": 0.9026144742965698,
      "learning_rate": 3.453694423338225e-05,
      "loss": 0.0282,
      "num_input_tokens_seen": 14037673,
      "step": 565
    },
    {
      "epoch": 13.217391304347826,
      "grad_norm": 1.4504765272140503,
      "learning_rate": 3.4295231558556715e-05,
      "loss": 0.0272,
      "num_input_tokens_seen": 14167090,
      "step": 570
    },
    {
      "epoch": 13.333333333333334,
      "grad_norm": 1.4278969764709473,
      "learning_rate": 3.4052506318467084e-05,
      "loss": 0.0342,
      "num_input_tokens_seen": 14311710,
      "step": 575
    },
    {
      "epoch": 13.44927536231884,
      "grad_norm": 1.1284997463226318,
      "learning_rate": 3.3808794954105716e-05,
      "loss": 0.0855,
      "num_input_tokens_seen": 14404322,
      "step": 580
    },
    {
      "epoch": 13.565217391304348,
      "grad_norm": 1.4915614128112793,
      "learning_rate": 3.356412401388732e-05,
      "loss": 0.0378,
      "num_input_tokens_seen": 14530794,
      "step": 585
    },
    {
      "epoch": 13.681159420289855,
      "grad_norm": 1.372157096862793,
      "learning_rate": 3.3318520150756846e-05,
      "loss": 0.0457,
      "num_input_tokens_seen": 14637342,
      "step": 590
    },
    {
      "epoch": 13.797101449275363,
      "grad_norm": 1.6492116451263428,
      "learning_rate": 3.307201011928616e-05,
      "loss": 0.0453,
      "num_input_tokens_seen": 14787534,
      "step": 595
    },
    {
      "epoch": 13.91304347826087,
      "grad_norm": 1.3583859205245972,
      "learning_rate": 3.282462077275947e-05,
      "loss": 0.0378,
      "num_input_tokens_seen": 14909175,
      "step": 600
    },
    {
      "epoch": 14.028985507246377,
      "grad_norm": 1.0751795768737793,
      "learning_rate": 3.257637906024822e-05,
      "loss": 0.0296,
      "num_input_tokens_seen": 15030530,
      "step": 605
    },
    {
      "epoch": 14.144927536231885,
      "grad_norm": 1.474602222442627,
      "learning_rate": 3.2327312023675287e-05,
      "loss": 0.0216,
      "num_input_tokens_seen": 15148359,
      "step": 610
    },
    {
      "epoch": 14.26086956521739,
      "grad_norm": 1.0749961137771606,
      "learning_rate": 3.2077446794869295e-05,
      "loss": 0.0299,
      "num_input_tokens_seen": 15280749,
      "step": 615
    },
    {
      "epoch": 14.376811594202898,
      "grad_norm": 1.4042794704437256,
      "learning_rate": 3.1826810592609036e-05,
      "loss": 0.0247,
      "num_input_tokens_seen": 15397167,
      "step": 620
    },
    {
      "epoch": 14.492753623188406,
      "grad_norm": 1.2280118465423584,
      "learning_rate": 3.157543071965835e-05,
      "loss": 0.0455,
      "num_input_tokens_seen": 15522794,
      "step": 625
    },
    {
      "epoch": 14.608695652173914,
      "grad_norm": 1.2819784879684448,
      "learning_rate": 3.132333455979202e-05,
      "loss": 0.0262,
      "num_input_tokens_seen": 15637987,
      "step": 630
    },
    {
      "epoch": 14.72463768115942,
      "grad_norm": 1.2691748142242432,
      "learning_rate": 3.107054957481271e-05,
      "loss": 0.0281,
      "num_input_tokens_seen": 15773163,
      "step": 635
    },
    {
      "epoch": 14.840579710144928,
      "grad_norm": 1.2752504348754883,
      "learning_rate": 3.081710330155942e-05,
      "loss": 0.0294,
      "num_input_tokens_seen": 15892659,
      "step": 640
    },
    {
      "epoch": 14.956521739130435,
      "grad_norm": 1.3479197025299072,
      "learning_rate": 3.056302334890786e-05,
      "loss": 0.0291,
      "num_input_tokens_seen": 16024576,
      "step": 645
    },
    {
      "epoch": 15.072463768115941,
      "grad_norm": 1.3151382207870483,
      "learning_rate": 3.030833739476285e-05,
      "loss": 0.0216,
      "num_input_tokens_seen": 16151987,
      "step": 650
    },
    {
      "epoch": 15.18840579710145,
      "grad_norm": 2.3882877826690674,
      "learning_rate": 3.0053073183043256e-05,
      "loss": 0.0218,
      "num_input_tokens_seen": 16278639,
      "step": 655
    },
    {
      "epoch": 15.304347826086957,
      "grad_norm": 0.9794278144836426,
      "learning_rate": 2.979725852065981e-05,
      "loss": 0.0283,
      "num_input_tokens_seen": 16414743,
      "step": 660
    },
    {
      "epoch": 15.420289855072463,
      "grad_norm": 0.8964869976043701,
      "learning_rate": 2.954092127448591e-05,
      "loss": 0.0259,
      "num_input_tokens_seen": 16529298,
      "step": 665
    },
    {
      "epoch": 15.53623188405797,
      "grad_norm": 1.1441810131072998,
      "learning_rate": 2.9284089368322045e-05,
      "loss": 0.0716,
      "num_input_tokens_seen": 16655909,
      "step": 670
    },
    {
      "epoch": 15.652173913043478,
      "grad_norm": 1.0959213972091675,
      "learning_rate": 2.9026790779853874e-05,
      "loss": 0.025,
      "num_input_tokens_seen": 16798263,
      "step": 675
    },
    {
      "epoch": 15.768115942028986,
      "grad_norm": 1.0119343996047974,
      "learning_rate": 2.876905353760459e-05,
      "loss": 0.0218,
      "num_input_tokens_seen": 16916827,
      "step": 680
    },
    {
      "epoch": 15.884057971014492,
      "grad_norm": 1.1373978853225708,
      "learning_rate": 2.8510905717881614e-05,
      "loss": 0.0231,
      "num_input_tokens_seen": 17040247,
      "step": 685
    },
    {
      "epoch": 16.0,
      "grad_norm": 1.2512497901916504,
      "learning_rate": 2.8252375441718137e-05,
      "loss": 0.0228,
      "num_input_tokens_seen": 17146160,
      "step": 690
    },
    {
      "epoch": 16.115942028985508,
      "grad_norm": 0.7410117387771606,
      "learning_rate": 2.7993490871809808e-05,
      "loss": 0.029,
      "num_input_tokens_seen": 17284643,
      "step": 695
    },
    {
      "epoch": 16.231884057971016,
      "grad_norm": 1.0934263467788696,
      "learning_rate": 2.7734280209446865e-05,
      "loss": 0.0199,
      "num_input_tokens_seen": 17426644,
      "step": 700
    },
    {
      "epoch": 16.347826086956523,
      "grad_norm": 1.0034395456314087,
      "learning_rate": 2.7474771691442018e-05,
      "loss": 0.0259,
      "num_input_tokens_seen": 17541812,
      "step": 705
    },
    {
      "epoch": 16.463768115942027,
      "grad_norm": 1.4287781715393066,
      "learning_rate": 2.721499358705458e-05,
      "loss": 0.021,
      "num_input_tokens_seen": 17667755,
      "step": 710
    },
    {
      "epoch": 16.579710144927535,
      "grad_norm": 1.0989606380462646,
      "learning_rate": 2.6954974194910888e-05,
      "loss": 0.0199,
      "num_input_tokens_seen": 17788162,
      "step": 715
    },
    {
      "epoch": 16.695652173913043,
      "grad_norm": 0.9687130451202393,
      "learning_rate": 2.6694741839921732e-05,
      "loss": 0.0189,
      "num_input_tokens_seen": 17911718,
      "step": 720
    },
    {
      "epoch": 16.81159420289855,
      "grad_norm": 1.143617033958435,
      "learning_rate": 2.6434324870196748e-05,
      "loss": 0.0169,
      "num_input_tokens_seen": 18018729,
      "step": 725
    },
    {
      "epoch": 16.92753623188406,
      "grad_norm": 1.1395140886306763,
      "learning_rate": 2.617375165395634e-05,
      "loss": 0.0209,
      "num_input_tokens_seen": 18139681,
      "step": 730
    },
    {
      "epoch": 17.043478260869566,
      "grad_norm": 0.881986677646637,
      "learning_rate": 2.5913050576441477e-05,
      "loss": 0.0201,
      "num_input_tokens_seen": 18278544,
      "step": 735
    },
    {
      "epoch": 17.159420289855074,
      "grad_norm": 0.8654409050941467,
      "learning_rate": 2.5652250036821523e-05,
      "loss": 0.017,
      "num_input_tokens_seen": 18396700,
      "step": 740
    },
    {
      "epoch": 17.27536231884058,
      "grad_norm": 0.9699842929840088,
      "learning_rate": 2.5391378445100644e-05,
      "loss": 0.0187,
      "num_input_tokens_seen": 18506229,
      "step": 745
    },
    {
      "epoch": 17.391304347826086,
      "grad_norm": 0.8799194693565369,
      "learning_rate": 2.5130464219022992e-05,
      "loss": 0.0242,
      "num_input_tokens_seen": 18621580,
      "step": 750
    },
    {
      "epoch": 17.507246376811594,
      "grad_norm": 0.9715821146965027,
      "learning_rate": 2.486953578097702e-05,
      "loss": 0.0153,
      "num_input_tokens_seen": 18748382,
      "step": 755
    },
    {
      "epoch": 17.6231884057971,
      "grad_norm": 0.8819458484649658,
      "learning_rate": 2.4608621554899362e-05,
      "loss": 0.0182,
      "num_input_tokens_seen": 18884730,
      "step": 760
    },
    {
      "epoch": 17.73913043478261,
      "grad_norm": 0.8835431933403015,
      "learning_rate": 2.4347749963178486e-05,
      "loss": 0.0143,
      "num_input_tokens_seen": 19003589,
      "step": 765
    },
    {
      "epoch": 17.855072463768117,
      "grad_norm": 0.780754566192627,
      "learning_rate": 2.4086949423558526e-05,
      "loss": 0.0164,
      "num_input_tokens_seen": 19136411,
      "step": 770
    },
    {
      "epoch": 17.971014492753625,
      "grad_norm": 0.7591371536254883,
      "learning_rate": 2.3826248346043663e-05,
      "loss": 0.0157,
      "num_input_tokens_seen": 19260436,
      "step": 775
    },
    {
      "epoch": 18.08695652173913,
      "grad_norm": 0.673797070980072,
      "learning_rate": 2.356567512980326e-05,
      "loss": 0.0304,
      "num_input_tokens_seen": 19388733,
      "step": 780
    },
    {
      "epoch": 18.202898550724637,
      "grad_norm": 0.4008718729019165,
      "learning_rate": 2.3305258160078274e-05,
      "loss": 0.009,
      "num_input_tokens_seen": 19531204,
      "step": 785
    },
    {
      "epoch": 18.318840579710145,
      "grad_norm": 0.6676005125045776,
      "learning_rate": 2.3045025805089118e-05,
      "loss": 0.0105,
      "num_input_tokens_seen": 19624608,
      "step": 790
    },
    {
      "epoch": 18.434782608695652,
      "grad_norm": 0.6956990957260132,
      "learning_rate": 2.278500641294543e-05,
      "loss": 0.0104,
      "num_input_tokens_seen": 19751062,
      "step": 795
    },
    {
      "epoch": 18.55072463768116,
      "grad_norm": 0.80479896068573,
      "learning_rate": 2.252522830855798e-05,
      "loss": 0.0103,
      "num_input_tokens_seen": 19879837,
      "step": 800
    },
    {
      "epoch": 18.666666666666668,
      "grad_norm": 0.7206840515136719,
      "learning_rate": 2.2265719790553147e-05,
      "loss": 0.0107,
      "num_input_tokens_seen": 20019385,
      "step": 805
    },
    {
      "epoch": 18.782608695652176,
      "grad_norm": 0.6994977593421936,
      "learning_rate": 2.2006509128190195e-05,
      "loss": 0.0269,
      "num_input_tokens_seen": 20138003,
      "step": 810
    },
    {
      "epoch": 18.89855072463768,
      "grad_norm": 0.5642988681793213,
      "learning_rate": 2.174762455828187e-05,
      "loss": 0.0086,
      "num_input_tokens_seen": 20260523,
      "step": 815
    },
    {
      "epoch": 19.014492753623188,
      "grad_norm": 0.5547834038734436,
      "learning_rate": 2.1489094282118395e-05,
      "loss": 0.0133,
      "num_input_tokens_seen": 20375322,
      "step": 820
    },
    {
      "epoch": 19.130434782608695,
      "grad_norm": 0.48678871989250183,
      "learning_rate": 2.123094646239541e-05,
      "loss": 0.0114,
      "num_input_tokens_seen": 20477407,
      "step": 825
    },
    {
      "epoch": 19.246376811594203,
      "grad_norm": 0.4791460633277893,
      "learning_rate": 2.0973209220146135e-05,
      "loss": 0.007,
      "num_input_tokens_seen": 20605728,
      "step": 830
    },
    {
      "epoch": 19.36231884057971,
      "grad_norm": 1.1198338270187378,
      "learning_rate": 2.0715910631677968e-05,
      "loss": 0.0088,
      "num_input_tokens_seen": 20725799,
      "step": 835
    },
    {
      "epoch": 19.47826086956522,
      "grad_norm": 0.6645247936248779,
      "learning_rate": 2.0459078725514092e-05,
      "loss": 0.007,
      "num_input_tokens_seen": 20865534,
      "step": 840
    },
    {
      "epoch": 19.594202898550726,
      "grad_norm": 0.5324479341506958,
      "learning_rate": 2.020274147934019e-05,
      "loss": 0.0059,
      "num_input_tokens_seen": 20977913,
      "step": 845
    },
    {
      "epoch": 19.71014492753623,
      "grad_norm": 0.6183504462242126,
      "learning_rate": 1.9946926816956743e-05,
      "loss": 0.0069,
      "num_input_tokens_seen": 21102848,
      "step": 850
    },
    {
      "epoch": 19.82608695652174,
      "grad_norm": 0.6665703058242798,
      "learning_rate": 1.9691662605237166e-05,
      "loss": 0.008,
      "num_input_tokens_seen": 21243679,
      "step": 855
    },
    {
      "epoch": 19.942028985507246,
      "grad_norm": 0.3298584222793579,
      "learning_rate": 1.9436976651092144e-05,
      "loss": 0.0127,
      "num_input_tokens_seen": 21364202,
      "step": 860
    },
    {
      "epoch": 20.057971014492754,
      "grad_norm": 0.2818591296672821,
      "learning_rate": 1.9182896698440584e-05,
      "loss": 0.0059,
      "num_input_tokens_seen": 21496089,
      "step": 865
    },
    {
      "epoch": 20.17391304347826,
      "grad_norm": 0.6906440258026123,
      "learning_rate": 1.89294504251873e-05,
      "loss": 0.0046,
      "num_input_tokens_seen": 21603193,
      "step": 870
    },
    {
      "epoch": 20.28985507246377,
      "grad_norm": 0.33482542634010315,
      "learning_rate": 1.867666544020798e-05,
      "loss": 0.0058,
      "num_input_tokens_seen": 21742062,
      "step": 875
    },
    {
      "epoch": 20.405797101449274,
      "grad_norm": 2.443847417831421,
      "learning_rate": 1.8424569280341653e-05,
      "loss": 0.0082,
      "num_input_tokens_seen": 21869307,
      "step": 880
    },
    {
      "epoch": 20.52173913043478,
      "grad_norm": 0.43886587023735046,
      "learning_rate": 1.817318940739098e-05,
      "loss": 0.0148,
      "num_input_tokens_seen": 21992573,
      "step": 885
    },
    {
      "epoch": 20.63768115942029,
      "grad_norm": 0.93570876121521,
      "learning_rate": 1.7922553205130707e-05,
      "loss": 0.0064,
      "num_input_tokens_seen": 22101845,
      "step": 890
    },
    {
      "epoch": 20.753623188405797,
      "grad_norm": 1176.9595947265625,
      "learning_rate": 1.767268797632472e-05,
      "loss": 0.008,
      "num_input_tokens_seen": 22230253,
      "step": 895
    },
    {
      "epoch": 20.869565217391305,
      "grad_norm": 0.35642215609550476,
      "learning_rate": 1.7423620939751788e-05,
      "loss": 0.0053,
      "num_input_tokens_seen": 22373454,
      "step": 900
    },
    {
      "epoch": 20.985507246376812,
      "grad_norm": 0.39736178517341614,
      "learning_rate": 1.7175379227240523e-05,
      "loss": 0.0054,
      "num_input_tokens_seen": 22493123,
      "step": 905
    },
    {
      "epoch": 21.10144927536232,
      "grad_norm": 0.5092463493347168,
      "learning_rate": 1.692798988071385e-05,
      "loss": 0.0044,
      "num_input_tokens_seen": 22629005,
      "step": 910
    },
    {
      "epoch": 21.217391304347824,
      "grad_norm": 0.26361697912216187,
      "learning_rate": 1.6681479849243153e-05,
      "loss": 0.0043,
      "num_input_tokens_seen": 22752358,
      "step": 915
    },
    {
      "epoch": 21.333333333333332,
      "grad_norm": 0.19933666288852692,
      "learning_rate": 1.6435875986112685e-05,
      "loss": 0.0035,
      "num_input_tokens_seen": 22880349,
      "step": 920
    },
    {
      "epoch": 21.44927536231884,
      "grad_norm": 0.22622954845428467,
      "learning_rate": 1.6191205045894283e-05,
      "loss": 0.0044,
      "num_input_tokens_seen": 22987343,
      "step": 925
    },
    {
      "epoch": 21.565217391304348,
      "grad_norm": 0.30199098587036133,
      "learning_rate": 1.594749368153292e-05,
      "loss": 0.0178,
      "num_input_tokens_seen": 23113462,
      "step": 930
    },
    {
      "epoch": 21.681159420289855,
      "grad_norm": 0.9627483487129211,
      "learning_rate": 1.570476844144329e-05,
      "loss": 0.0089,
      "num_input_tokens_seen": 23221714,
      "step": 935
    },
    {
      "epoch": 21.797101449275363,
      "grad_norm": 0.27791452407836914,
      "learning_rate": 1.546305576661776e-05,
      "loss": 0.004,
      "num_input_tokens_seen": 23368857,
      "step": 940
    },
    {
      "epoch": 21.91304347826087,
      "grad_norm": 0.3269965648651123,
      "learning_rate": 1.5222381987746104e-05,
      "loss": 0.004,
      "num_input_tokens_seen": 23494483,
      "step": 945
    },
    {
      "epoch": 22.028985507246375,
      "grad_norm": 0.15966826677322388,
      "learning_rate": 1.4982773322347144e-05,
      "loss": 0.0034,
      "num_input_tokens_seen": 23605463,
      "step": 950
    },
    {
      "epoch": 22.144927536231883,
      "grad_norm": 0.3009255826473236,
      "learning_rate": 1.4744255871912823e-05,
      "loss": 0.0066,
      "num_input_tokens_seen": 23715776,
      "step": 955
    },
    {
      "epoch": 22.26086956521739,
      "grad_norm": 0.4215935170650482,
      "learning_rate": 1.4506855619064846e-05,
      "loss": 0.0034,
      "num_input_tokens_seen": 23841669,
      "step": 960
    },
    {
      "epoch": 22.3768115942029,
      "grad_norm": 0.20214155316352844,
      "learning_rate": 1.4270598424724292e-05,
      "loss": 0.0032,
      "num_input_tokens_seen": 23960567,
      "step": 965
    },
    {
      "epoch": 22.492753623188406,
      "grad_norm": 7.0683207511901855,
      "learning_rate": 1.4035510025294462e-05,
      "loss": 0.0124,
      "num_input_tokens_seen": 24074628,
      "step": 970
    },
    {
      "epoch": 22.608695652173914,
      "grad_norm": 0.20178793370723724,
      "learning_rate": 1.3801616029857378e-05,
      "loss": 0.0027,
      "num_input_tokens_seen": 24214324,
      "step": 975
    },
    {
      "epoch": 22.72463768115942,
      "grad_norm": 1.3855236768722534,
      "learning_rate": 1.3568941917384036e-05,
      "loss": 0.0037,
      "num_input_tokens_seen": 24326727,
      "step": 980
    },
    {
      "epoch": 22.840579710144926,
      "grad_norm": 0.18420317769050598,
      "learning_rate": 1.3337513033958904e-05,
      "loss": 0.0029,
      "num_input_tokens_seen": 24456961,
      "step": 985
    },
    {
      "epoch": 22.956521739130434,
      "grad_norm": 0.15907694399356842,
      "learning_rate": 1.310735459001884e-05,
      "loss": 0.0035,
      "num_input_tokens_seen": 24606652,
      "step": 990
    },
    {
      "epoch": 23.07246376811594,
      "grad_norm": 0.2548115849494934,
      "learning_rate": 1.2878491657606872e-05,
      "loss": 0.002,
      "num_input_tokens_seen": 24710410,
      "step": 995
    },
    {
      "epoch": 23.18840579710145,
      "grad_norm": 0.36587971448898315,
      "learning_rate": 1.2650949167640993e-05,
      "loss": 0.0023,
      "num_input_tokens_seen": 24831908,
      "step": 1000
    },
    {
      "epoch": 23.304347826086957,
      "grad_norm": 0.13662408292293549,
      "learning_rate": 1.2424751907198312e-05,
      "loss": 0.0031,
      "num_input_tokens_seen": 24951342,
      "step": 1005
    },
    {
      "epoch": 23.420289855072465,
      "grad_norm": 0.19979843497276306,
      "learning_rate": 1.2199924516814939e-05,
      "loss": 0.0027,
      "num_input_tokens_seen": 25088309,
      "step": 1010
    },
    {
      "epoch": 23.536231884057973,
      "grad_norm": 0.14170995354652405,
      "learning_rate": 1.1976491487801748e-05,
      "loss": 0.0124,
      "num_input_tokens_seen": 25216080,
      "step": 1015
    },
    {
      "epoch": 23.652173913043477,
      "grad_norm": 0.06863216310739517,
      "learning_rate": 1.1754477159576499e-05,
      "loss": 0.0023,
      "num_input_tokens_seen": 25326581,
      "step": 1020
    },
    {
      "epoch": 23.768115942028984,
      "grad_norm": 0.25133436918258667,
      "learning_rate": 1.1533905717012428e-05,
      "loss": 0.0027,
      "num_input_tokens_seen": 25477500,
      "step": 1025
    },
    {
      "epoch": 23.884057971014492,
      "grad_norm": 0.28348398208618164,
      "learning_rate": 1.1314801187803686e-05,
      "loss": 0.0041,
      "num_input_tokens_seen": 25601354,
      "step": 1030
    },
    {
      "epoch": 24.0,
      "grad_norm": 0.5024954676628113,
      "learning_rate": 1.1097187439847939e-05,
      "loss": 0.0021,
      "num_input_tokens_seen": 25719240,
      "step": 1035
    },
    {
      "epoch": 24.115942028985508,
      "grad_norm": 0.1774568408727646,
      "learning_rate": 1.088108817864629e-05,
      "loss": 0.0039,
      "num_input_tokens_seen": 25834910,
      "step": 1040
    },
    {
      "epoch": 24.231884057971016,
      "grad_norm": 0.08105342090129852,
      "learning_rate": 1.0666526944721016e-05,
      "loss": 0.0025,
      "num_input_tokens_seen": 25974530,
      "step": 1045
    },
    {
      "epoch": 24.347826086956523,
      "grad_norm": 0.13048779964447021,
      "learning_rate": 1.0453527111051184e-05,
      "loss": 0.002,
      "num_input_tokens_seen": 26104464,
      "step": 1050
    },
    {
      "epoch": 24.463768115942027,
      "grad_norm": 0.10774020105600357,
      "learning_rate": 1.0242111880526495e-05,
      "loss": 0.0024,
      "num_input_tokens_seen": 26251334,
      "step": 1055
    },
    {
      "epoch": 24.579710144927535,
      "grad_norm": 0.7494776248931885,
      "learning_rate": 1.003230428341979e-05,
      "loss": 0.0031,
      "num_input_tokens_seen": 26366561,
      "step": 1060
    },
    {
      "epoch": 24.695652173913043,
      "grad_norm": 0.3580308258533478,
      "learning_rate": 9.824127174878195e-06,
      "loss": 0.0022,
      "num_input_tokens_seen": 26486437,
      "step": 1065
    },
    {
      "epoch": 24.81159420289855,
      "grad_norm": 0.1473228931427002,
      "learning_rate": 9.617603232433475e-06,
      "loss": 0.0022,
      "num_input_tokens_seen": 26601526,
      "step": 1070
    },
    {
      "epoch": 24.92753623188406,
      "grad_norm": 0.11716706305742264,
      "learning_rate": 9.412754953531663e-06,
      "loss": 0.0109,
      "num_input_tokens_seen": 26727922,
      "step": 1075
    },
    {
      "epoch": 25.043478260869566,
      "grad_norm": 0.12043190747499466,
      "learning_rate": 9.209604653082326e-06,
      "loss": 0.0019,
      "num_input_tokens_seen": 26835621,
      "step": 1080
    },
    {
      "epoch": 25.159420289855074,
      "grad_norm": 0.1277165412902832,
      "learning_rate": 9.008174461027724e-06,
      "loss": 0.0016,
      "num_input_tokens_seen": 26955101,
      "step": 1085
    },
    {
      "epoch": 25.27536231884058,
      "grad_norm": 0.08892516791820526,
      "learning_rate": 8.808486319932083e-06,
      "loss": 0.002,
      "num_input_tokens_seen": 27077833,
      "step": 1090
    },
    {
      "epoch": 25.391304347826086,
      "grad_norm": 0.30754807591438293,
      "learning_rate": 8.610561982591357e-06,
      "loss": 0.0018,
      "num_input_tokens_seen": 27192758,
      "step": 1095
    },
    {
      "epoch": 25.507246376811594,
      "grad_norm": 0.7194050550460815,
      "learning_rate": 8.414423009663563e-06,
      "loss": 0.0028,
      "num_input_tokens_seen": 27324970,
      "step": 1100
    },
    {
      "epoch": 25.6231884057971,
      "grad_norm": 4777.61328125,
      "learning_rate": 8.220090767320137e-06,
      "loss": 0.0021,
      "num_input_tokens_seen": 27477531,
      "step": 1105
    },
    {
      "epoch": 25.73913043478261,
      "grad_norm": 2.280327081680298,
      "learning_rate": 8.027586424918412e-06,
      "loss": 0.0057,
      "num_input_tokens_seen": 27592035,
      "step": 1110
    },
    {
      "epoch": 25.855072463768117,
      "grad_norm": 0.13882993161678314,
      "learning_rate": 7.836930952695533e-06,
      "loss": 0.0067,
      "num_input_tokens_seen": 27712377,
      "step": 1115
    },
    {
      "epoch": 25.971014492753625,
      "grad_norm": 0.20987676084041595,
      "learning_rate": 7.648145119484153e-06,
      "loss": 0.002,
      "num_input_tokens_seen": 27834613,
      "step": 1120
    },
    {
      "epoch": 26.08695652173913,
      "grad_norm": 0.09795770049095154,
      "learning_rate": 7.461249490449954e-06,
      "loss": 0.0021,
      "num_input_tokens_seen": 27966996,
      "step": 1125
    },
    {
      "epoch": 26.202898550724637,
      "grad_norm": 0.14506971836090088,
      "learning_rate": 7.276264424851423e-06,
      "loss": 0.002,
      "num_input_tokens_seen": 28093538,
      "step": 1130
    },
    {
      "epoch": 26.318840579710145,
      "grad_norm": 0.08091314136981964,
      "learning_rate": 7.0932100738220265e-06,
      "loss": 0.0017,
      "num_input_tokens_seen": 28215579,
      "step": 1135
    },
    {
      "epoch": 26.434782608695652,
      "grad_norm": 0.22550061345100403,
      "learning_rate": 6.912106378175098e-06,
      "loss": 0.0014,
      "num_input_tokens_seen": 28344144,
      "step": 1140
    },
    {
      "epoch": 26.55072463768116,
      "grad_norm": 0.23987355828285217,
      "learning_rate": 6.732973066231563e-06,
      "loss": 0.0022,
      "num_input_tokens_seen": 28478650,
      "step": 1145
    },
    {
      "epoch": 26.666666666666668,
      "grad_norm": 0.1993756741285324,
      "learning_rate": 6.555829651670911e-06,
      "loss": 0.0023,
      "num_input_tokens_seen": 28593004,
      "step": 1150
    },
    {
      "epoch": 26.782608695652176,
      "grad_norm": 0.7184757590293884,
      "learning_rate": 6.380695431405456e-06,
      "loss": 0.0028,
      "num_input_tokens_seen": 28707392,
      "step": 1155
    },
    {
      "epoch": 26.89855072463768,
      "grad_norm": 0.06247011199593544,
      "learning_rate": 6.207589483478266e-06,
      "loss": 0.006,
      "num_input_tokens_seen": 28834902,
      "step": 1160
    },
    {
      "epoch": 27.014492753623188,
      "grad_norm": 0.11046591401100159,
      "learning_rate": 6.0365306649849214e-06,
      "loss": 0.0045,
      "num_input_tokens_seen": 28948812,
      "step": 1165
    },
    {
      "epoch": 27.130434782608695,
      "grad_norm": 0.12309098988771439,
      "learning_rate": 5.867537610019317e-06,
      "loss": 0.0019,
      "num_input_tokens_seen": 29078309,
      "step": 1170
    },
    {
      "epoch": 27.246376811594203,
      "grad_norm": 0.11428932845592499,
      "learning_rate": 5.700628727643806e-06,
      "loss": 0.002,
      "num_input_tokens_seen": 29211503,
      "step": 1175
    },
    {
      "epoch": 27.36231884057971,
      "grad_norm": 0.1093268170952797,
      "learning_rate": 5.53582219988382e-06,
      "loss": 0.0019,
      "num_input_tokens_seen": 29344489,
      "step": 1180
    },
    {
      "epoch": 27.47826086956522,
      "grad_norm": 0.2166384607553482,
      "learning_rate": 5.373135979747227e-06,
      "loss": 0.006,
      "num_input_tokens_seen": 29464082,
      "step": 1185
    },
    {
      "epoch": 27.594202898550726,
      "grad_norm": 0.15387850999832153,
      "learning_rate": 5.2125877892686496e-06,
      "loss": 0.0043,
      "num_input_tokens_seen": 29581124,
      "step": 1190
    },
    {
      "epoch": 27.71014492753623,
      "grad_norm": 0.11962082982063293,
      "learning_rate": 5.054195117578914e-06,
      "loss": 0.0019,
      "num_input_tokens_seen": 29696346,
      "step": 1195
    },
    {
      "epoch": 27.82608695652174,
      "grad_norm": 0.18724732100963593,
      "learning_rate": 4.897975218999926e-06,
      "loss": 0.002,
      "num_input_tokens_seen": 29815117,
      "step": 1200
    },
    {
      "epoch": 27.942028985507246,
      "grad_norm": 0.09917350113391876,
      "learning_rate": 4.743945111165068e-06,
      "loss": 0.0022,
      "num_input_tokens_seen": 29939175,
      "step": 1205
    },
    {
      "epoch": 28.057971014492754,
      "grad_norm": 0.08235369622707367,
      "learning_rate": 4.592121573165414e-06,
      "loss": 0.0016,
      "num_input_tokens_seen": 30079840,
      "step": 1210
    },
    {
      "epoch": 28.17391304347826,
      "grad_norm": 0.20488996803760529,
      "learning_rate": 4.442521143721892e-06,
      "loss": 0.0033,
      "num_input_tokens_seen": 30192219,
      "step": 1215
    },
    {
      "epoch": 28.28985507246377,
      "grad_norm": 0.05383768677711487,
      "learning_rate": 4.295160119383712e-06,
      "loss": 0.0018,
      "num_input_tokens_seen": 30330969,
      "step": 1220
    },
    {
      "epoch": 28.405797101449274,
      "grad_norm": 0.14237363636493683,
      "learning_rate": 4.150054552753055e-06,
      "loss": 0.0018,
      "num_input_tokens_seen": 30453302,
      "step": 1225
    },
    {
      "epoch": 28.52173913043478,
      "grad_norm": 0.12487669289112091,
      "learning_rate": 4.007220250736454e-06,
      "loss": 0.0078,
      "num_input_tokens_seen": 30568943,
      "step": 1230
    },
    {
      "epoch": 28.63768115942029,
      "grad_norm": 0.1423855572938919,
      "learning_rate": 3.866672772822863e-06,
      "loss": 0.0019,
      "num_input_tokens_seen": 30696057,
      "step": 1235
    },
    {
      "epoch": 28.753623188405797,
      "grad_norm": 0.1543101817369461,
      "learning_rate": 3.7284274293887115e-06,
      "loss": 0.0019,
      "num_input_tokens_seen": 30815506,
      "step": 1240
    },
    {
      "epoch": 28.869565217391305,
      "grad_norm": 0.1402539610862732,
      "learning_rate": 3.592499280030057e-06,
      "loss": 0.0027,
      "num_input_tokens_seen": 30916446,
      "step": 1245
    },
    {
      "epoch": 28.985507246376812,
      "grad_norm": 0.26191645860671997,
      "learning_rate": 3.458903131922134e-06,
      "loss": 0.0023,
      "num_input_tokens_seen": 31054242,
      "step": 1250
    },
    {
      "epoch": 29.10144927536232,
      "grad_norm": 0.09874732792377472,
      "learning_rate": 3.3276535382063213e-06,
      "loss": 0.0029,
      "num_input_tokens_seen": 31189078,
      "step": 1255
    },
    {
      "epoch": 29.217391304347824,
      "grad_norm": 0.11677820980548859,
      "learning_rate": 3.198764796404807e-06,
      "loss": 0.0018,
      "num_input_tokens_seen": 31311374,
      "step": 1260
    },
    {
      "epoch": 29.333333333333332,
      "grad_norm": 0.05459802597761154,
      "learning_rate": 3.0722509468631392e-06,
      "loss": 0.0018,
      "num_input_tokens_seen": 31444681,
      "step": 1265
    },
    {
      "epoch": 29.44927536231884,
      "grad_norm": 0.1113714948296547,
      "learning_rate": 2.948125771220697e-06,
      "loss": 0.0018,
      "num_input_tokens_seen": 31567569,
      "step": 1270
    },
    {
      "epoch": 29.565217391304348,
      "grad_norm": 0.1816156655550003,
      "learning_rate": 2.8264027909094715e-06,
      "loss": 0.0019,
      "num_input_tokens_seen": 31697338,
      "step": 1275
    },
    {
      "epoch": 29.681159420289855,
      "grad_norm": 0.13639949262142181,
      "learning_rate": 2.707095265681081e-06,
      "loss": 0.0018,
      "num_input_tokens_seen": 31826661,
      "step": 1280
    },
    {
      "epoch": 29.797101449275363,
      "grad_norm": 0.05292365327477455,
      "learning_rate": 2.5902161921623454e-06,
      "loss": 0.0023,
      "num_input_tokens_seen": 31944680,
      "step": 1285
    },
    {
      "epoch": 29.91304347826087,
      "grad_norm": 0.16608740389347076,
      "learning_rate": 2.475778302439524e-06,
      "loss": 0.0078,
      "num_input_tokens_seen": 32067106,
      "step": 1290
    },
    {
      "epoch": 30.028985507246375,
      "grad_norm": 0.09277443587779999,
      "learning_rate": 2.3637940626713346e-06,
      "loss": 0.0018,
      "num_input_tokens_seen": 32184526,
      "step": 1295
    },
    {
      "epoch": 30.144927536231883,
      "grad_norm": 0.18832191824913025,
      "learning_rate": 2.254275671731007e-06,
      "loss": 0.0017,
      "num_input_tokens_seen": 32309423,
      "step": 1300
    },
    {
      "epoch": 30.26086956521739,
      "grad_norm": 0.1828456073999405,
      "learning_rate": 2.14723505987737e-06,
      "loss": 0.0071,
      "num_input_tokens_seen": 32429445,
      "step": 1305
    },
    {
      "epoch": 30.3768115942029,
      "grad_norm": 0.07503814995288849,
      "learning_rate": 2.0426838874552714e-06,
      "loss": 0.0016,
      "num_input_tokens_seen": 32540571,
      "step": 1310
    },
    {
      "epoch": 30.492753623188406,
      "grad_norm": 0.19047732651233673,
      "learning_rate": 1.9406335436253724e-06,
      "loss": 0.0018,
      "num_input_tokens_seen": 32665528,
      "step": 1315
    },
    {
      "epoch": 30.608695652173914,
      "grad_norm": 0.17791509628295898,
      "learning_rate": 1.8410951451234533e-06,
      "loss": 0.0017,
      "num_input_tokens_seen": 32800773,
      "step": 1320
    },
    {
      "epoch": 30.72463768115942,
      "grad_norm": 0.10698456317186356,
      "learning_rate": 1.7440795350494588e-06,
      "loss": 0.0017,
      "num_input_tokens_seen": 32928397,
      "step": 1325
    },
    {
      "epoch": 30.840579710144926,
      "grad_norm": 0.0963551327586174,
      "learning_rate": 1.649597281686302e-06,
      "loss": 0.0019,
      "num_input_tokens_seen": 33054819,
      "step": 1330
    },
    {
      "epoch": 30.956521739130434,
      "grad_norm": 0.24703514575958252,
      "learning_rate": 1.5576586773486195e-06,
      "loss": 0.0018,
      "num_input_tokens_seen": 33180616,
      "step": 1335
    },
    {
      "epoch": 31.07246376811594,
      "grad_norm": 0.12497910857200623,
      "learning_rate": 1.4682737372615967e-06,
      "loss": 0.0038,
      "num_input_tokens_seen": 33298041,
      "step": 1340
    },
    {
      "epoch": 31.18840579710145,
      "grad_norm": 0.18260960280895233,
      "learning_rate": 1.3814521984699596e-06,
      "loss": 0.0052,
      "num_input_tokens_seen": 33408343,
      "step": 1345
    },
    {
      "epoch": 31.304347826086957,
      "grad_norm": 0.13422255218029022,
      "learning_rate": 1.297203518777293e-06,
      "loss": 0.0018,
      "num_input_tokens_seen": 33545364,
      "step": 1350
    },
    {
      "epoch": 31.420289855072465,
      "grad_norm": 0.1285027116537094,
      "learning_rate": 1.2155368757157643e-06,
      "loss": 0.0019,
      "num_input_tokens_seen": 33652900,
      "step": 1355
    },
    {
      "epoch": 31.536231884057973,
      "grad_norm": 0.12832242250442505,
      "learning_rate": 1.1364611655463736e-06,
      "loss": 0.0019,
      "num_input_tokens_seen": 33768791,
      "step": 1360
    },
    {
      "epoch": 31.652173913043477,
      "grad_norm": 0.12093157321214676,
      "learning_rate": 1.0599850022898539e-06,
      "loss": 0.0017,
      "num_input_tokens_seen": 33892837,
      "step": 1365
    },
    {
      "epoch": 31.768115942028984,
      "grad_norm": 0.7227018475532532,
      "learning_rate": 9.861167167883046e-07,
      "loss": 0.0022,
      "num_input_tokens_seen": 34015288,
      "step": 1370
    },
    {
      "epoch": 31.884057971014492,
      "grad_norm": 2.143653631210327,
      "learning_rate": 9.148643557976955e-07,
      "loss": 0.0037,
      "num_input_tokens_seen": 34154884,
      "step": 1375
    },
    {
      "epoch": 32.0,
      "grad_norm": 0.17518474161624908,
      "learning_rate": 8.462356811112987e-07,
      "loss": 0.0019,
      "num_input_tokens_seen": 34292320,
      "step": 1380
    },
    {
      "epoch": 32.11594202898551,
      "grad_norm": 0.1274159997701645,
      "learning_rate": 7.802381687141535e-07,
      "loss": 0.0017,
      "num_input_tokens_seen": 34413850,
| "step": 1385 | |
| }, | |
| { | |
| "epoch": 32.231884057971016, | |
| "grad_norm": 0.11443401873111725, | |
| "learning_rate": 7.168790079686932e-07, | |
| "loss": 0.0018, | |
| "num_input_tokens_seen": 34547127, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 32.34782608695652, | |
| "grad_norm": 0.08239752799272537, | |
| "learning_rate": 6.561651008315738e-07, | |
| "loss": 0.0035, | |
| "num_input_tokens_seen": 34685112, | |
| "step": 1395 | |
| }, | |
| { | |
| "epoch": 32.46376811594203, | |
| "grad_norm": 0.7361220717430115, | |
| "learning_rate": 5.981030611018234e-07, | |
| "loss": 0.0063, | |
| "num_input_tokens_seen": 34810484, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 32.57971014492754, | |
| "grad_norm": 0.20323431491851807, | |
| "learning_rate": 5.426992137003622e-07, | |
| "loss": 0.0018, | |
| "num_input_tokens_seen": 34920531, | |
| "step": 1405 | |
| }, | |
| { | |
| "epoch": 32.69565217391305, | |
| "grad_norm": 0.11165229231119156, | |
| "learning_rate": 4.899595939810236e-07, | |
| "loss": 0.002, | |
| "num_input_tokens_seen": 35035657, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 32.81159420289855, | |
| "grad_norm": 0.15023387968540192, | |
| "learning_rate": 4.398899470730827e-07, | |
| "loss": 0.0017, | |
| "num_input_tokens_seen": 35167466, | |
| "step": 1415 | |
| }, | |
| { | |
| "epoch": 32.927536231884055, | |
| "grad_norm": 0.18479810655117035, | |
| "learning_rate": 3.9249572725543196e-07, | |
| "loss": 0.0016, | |
| "num_input_tokens_seen": 35296818, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 33.04347826086956, | |
| "grad_norm": 0.08527754247188568, | |
| "learning_rate": 3.477820973624063e-07, | |
| "loss": 0.0015, | |
| "num_input_tokens_seen": 35430399, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 33.15942028985507, | |
| "grad_norm": 0.16888481378555298, | |
| "learning_rate": 3.0575392822139726e-07, | |
| "loss": 0.0057, | |
| "num_input_tokens_seen": 35551540, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 33.27536231884058, | |
| "grad_norm": 0.18187086284160614, | |
| "learning_rate": 2.664157981222437e-07, | |
| "loss": 0.0016, | |
| "num_input_tokens_seen": 35676077, | |
| "step": 1435 | |
| }, | |
| { | |
| "epoch": 33.391304347826086, | |
| "grad_norm": 0.15047162771224976, | |
| "learning_rate": 2.297719923185032e-07, | |
| "loss": 0.0016, | |
| "num_input_tokens_seen": 35785127, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 33.507246376811594, | |
| "grad_norm": 0.12288761883974075, | |
| "learning_rate": 1.9582650256064205e-07, | |
| "loss": 0.0019, | |
| "num_input_tokens_seen": 35911682, | |
| "step": 1445 | |
| }, | |
| { | |
| "epoch": 33.6231884057971, | |
| "grad_norm": 0.22509098052978516, | |
| "learning_rate": 1.645830266611914e-07, | |
| "loss": 0.0017, | |
| "num_input_tokens_seen": 36030754, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 33.73913043478261, | |
| "grad_norm": 2.0408618450164795, | |
| "learning_rate": 1.3604496809195288e-07, | |
| "loss": 0.0042, | |
| "num_input_tokens_seen": 36146749, | |
| "step": 1455 | |
| }, | |
| { | |
| "epoch": 33.85507246376812, | |
| "grad_norm": 0.10705255717039108, | |
| "learning_rate": 1.1021543561322012e-07, | |
| "loss": 0.0017, | |
| "num_input_tokens_seen": 36278454, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 33.971014492753625, | |
| "grad_norm": 1876.0384521484375, | |
| "learning_rate": 8.709724293513854e-08, | |
| "loss": 0.0017, | |
| "num_input_tokens_seen": 36408834, | |
| "step": 1465 | |
| }, | |
| { | |
| "epoch": 34.08695652173913, | |
| "grad_norm": 0.1927630454301834, | |
| "learning_rate": 6.66929084112089e-08, | |
| "loss": 0.0015, | |
| "num_input_tokens_seen": 36550538, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 34.20289855072464, | |
| "grad_norm": 0.1668202131986618, | |
| "learning_rate": 4.900465476393168e-08, | |
| "loss": 0.0018, | |
| "num_input_tokens_seen": 36647436, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 34.31884057971015, | |
| "grad_norm": 0.7123565673828125, | |
| "learning_rate": 3.403440884269526e-08, | |
| "loss": 0.0024, | |
| "num_input_tokens_seen": 36785387, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 34.43478260869565, | |
| "grad_norm": 0.16973845660686493, | |
| "learning_rate": 2.1783801413866046e-08, | |
| "loss": 0.0021, | |
| "num_input_tokens_seen": 36915606, | |
| "step": 1485 | |
| }, | |
| { | |
| "epoch": 34.55072463768116, | |
| "grad_norm": 2.034724473953247, | |
| "learning_rate": 1.2254166983152737e-08, | |
| "loss": 0.0035, | |
| "num_input_tokens_seen": 37036117, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 34.666666666666664, | |
| "grad_norm": 0.155415877699852, | |
| "learning_rate": 5.446543650219904e-09, | |
| "loss": 0.0016, | |
| "num_input_tokens_seen": 37165587, | |
| "step": 1495 | |
| }, | |
| { | |
| "epoch": 34.78260869565217, | |
| "grad_norm": 0.10199662297964096, | |
| "learning_rate": 1.3616729956228425e-09, | |
| "loss": 0.0015, | |
| "num_input_tokens_seen": 37290827, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 34.89855072463768, | |
| "grad_norm": 0.14740267395973206, | |
| "learning_rate": 0.0, | |
| "loss": 0.0053, | |
| "num_input_tokens_seen": 37412688, | |
| "step": 1505 | |
| }, | |
| { | |
| "epoch": 34.89855072463768, | |
| "num_input_tokens_seen": 37412688, | |
| "step": 1505, | |
| "total_flos": 8.033958240027034e+16, | |
| "train_loss": 0.3889684765070578, | |
| "train_runtime": 37510.9602, | |
| "train_samples_per_second": 0.322, | |
| "train_steps_per_second": 0.04 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 1505, | |
| "num_input_tokens_seen": 37412688, | |
| "num_train_epochs": 35, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 8.033958240027034e+16, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
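
The file above is the `trainer_state.json` that the Hugging Face `Trainer` writes alongside its checkpoints. Below is a minimal sketch for inspecting it, assuming it has been saved locally under that name (the filename and path are assumptions, not given by the log itself). It relies only on fields that actually appear above: the per-step records in `log_history`, the end-of-run summary entry, and the top-level `logging_steps`.

```python
# Minimal sketch: summarize a Trainer state file like the one above.
# Assumes the JSON is saved as "trainer_state.json" (filename is an assumption).
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Per-step records carry a "loss" key; the final entry in log_history is the
# end-of-training summary (train_loss, train_runtime, ...) and does not.
steps = [e for e in state["log_history"] if "loss" in e]
summary = state["log_history"][-1]

first, last = steps[0], steps[-1]
print(f"logged steps : {len(steps)} (every {state['logging_steps']} steps)")
print(f"loss         : {first['loss']:.4f} -> {last['loss']:.4f}")
print(f"mean loss    : {summary['train_loss']:.4f}")

# Rough whole-run token throughput from the summary entry.
tokens = summary["num_input_tokens_seen"]
seconds = summary["train_runtime"]
print(f"throughput   : {tokens / seconds:,.0f} tokens/s over {seconds / 3600:.1f} h")
```

On this log the sketch would report a loss falling from 3.8116 to 0.0053 over 301 logged steps, a mean training loss of 0.3890, and roughly 997 tokens/s across the ~10.4 h run (37,412,688 tokens in 37,510.96 s).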