{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.4666234607906675,
  "eval_steps": 300,
  "global_step": 1800,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0012961762799740765,
      "grad_norm": 5.6396965980529785,
      "learning_rate": 1.724137931034483e-06,
      "loss": 6.6733,
      "step": 5
    },
    {
      "epoch": 0.002592352559948153,
      "grad_norm": 5.946774482727051,
      "learning_rate": 3.8793103448275865e-06,
      "loss": 6.8433,
      "step": 10
    },
    {
      "epoch": 0.0038885288399222295,
      "grad_norm": 6.051335334777832,
      "learning_rate": 6.03448275862069e-06,
      "loss": 6.6488,
      "step": 15
    },
    {
      "epoch": 0.005184705119896306,
      "grad_norm": 5.994152069091797,
      "learning_rate": 8.189655172413793e-06,
      "loss": 6.766,
      "step": 20
    },
    {
      "epoch": 0.0064808813998703824,
      "grad_norm": 6.127562522888184,
      "learning_rate": 1.0344827586206897e-05,
      "loss": 6.8587,
      "step": 25
    },
    {
      "epoch": 0.007777057679844459,
      "grad_norm": 6.231839179992676,
      "learning_rate": 1.25e-05,
      "loss": 6.6363,
      "step": 30
    },
    {
      "epoch": 0.009073233959818535,
      "grad_norm": 6.233114242553711,
      "learning_rate": 1.4655172413793103e-05,
      "loss": 7.1256,
      "step": 35
    },
    {
      "epoch": 0.010369410239792612,
      "grad_norm": 6.063745021820068,
      "learning_rate": 1.6810344827586207e-05,
      "loss": 7.089,
      "step": 40
    },
    {
      "epoch": 0.011665586519766688,
      "grad_norm": 7.45819616317749,
      "learning_rate": 1.896551724137931e-05,
      "loss": 6.9478,
      "step": 45
    },
    {
      "epoch": 0.012961762799740765,
      "grad_norm": 5.544084548950195,
      "learning_rate": 2.1120689655172415e-05,
      "loss": 7.0204,
      "step": 50
    },
    {
      "epoch": 0.014257939079714841,
      "grad_norm": 5.735494613647461,
      "learning_rate": 2.327586206896552e-05,
      "loss": 6.8533,
      "step": 55
    },
    {
      "epoch": 0.015554115359688918,
      "grad_norm": 5.821224212646484,
      "learning_rate": 2.543103448275862e-05,
      "loss": 6.8601,
      "step": 60
    },
    {
      "epoch": 0.016850291639662993,
      "grad_norm": 6.087413311004639,
      "learning_rate": 2.7586206896551727e-05,
      "loss": 6.8345,
      "step": 65
    },
    {
      "epoch": 0.01814646791963707,
      "grad_norm": 6.119109153747559,
      "learning_rate": 2.974137931034483e-05,
      "loss": 6.5241,
      "step": 70
    },
    {
      "epoch": 0.019442644199611146,
      "grad_norm": 5.87671422958374,
      "learning_rate": 3.1896551724137935e-05,
      "loss": 6.9227,
      "step": 75
    },
    {
      "epoch": 0.020738820479585224,
      "grad_norm": 5.842630386352539,
      "learning_rate": 3.405172413793103e-05,
      "loss": 7.7281,
      "step": 80
    },
    {
      "epoch": 0.0220349967595593,
      "grad_norm": 5.891020774841309,
      "learning_rate": 3.620689655172414e-05,
      "loss": 7.1023,
      "step": 85
    },
    {
      "epoch": 0.023331173039533377,
      "grad_norm": 6.341122627258301,
      "learning_rate": 3.8362068965517246e-05,
      "loss": 6.7766,
      "step": 90
    },
    {
      "epoch": 0.02462734931950745,
      "grad_norm": 6.294985294342041,
      "learning_rate": 4.0517241379310344e-05,
      "loss": 6.9056,
      "step": 95
    },
    {
      "epoch": 0.02592352559948153,
      "grad_norm": 6.357933521270752,
      "learning_rate": 4.267241379310345e-05,
      "loss": 6.8075,
      "step": 100
    },
    {
      "epoch": 0.027219701879455604,
      "grad_norm": 5.870952606201172,
      "learning_rate": 4.482758620689655e-05,
      "loss": 6.6911,
      "step": 105
    },
    {
      "epoch": 0.028515878159429683,
      "grad_norm": 15.302526473999023,
      "learning_rate": 4.698275862068966e-05,
      "loss": 6.4926,
      "step": 110
    },
    {
      "epoch": 0.029812054439403757,
      "grad_norm": 6.0892014503479,
      "learning_rate": 4.913793103448276e-05,
      "loss": 6.4598,
      "step": 115
    },
    {
      "epoch": 0.031108230719377836,
      "grad_norm": 5.548859119415283,
      "learning_rate": 5.129310344827587e-05,
      "loss": 6.4285,
      "step": 120
    },
    {
      "epoch": 0.03240440699935191,
      "grad_norm": 5.682961940765381,
      "learning_rate": 5.344827586206896e-05,
      "loss": 6.2057,
      "step": 125
    },
    {
      "epoch": 0.033700583279325985,
      "grad_norm": 5.919161319732666,
      "learning_rate": 5.560344827586207e-05,
      "loss": 6.1522,
      "step": 130
    },
    {
      "epoch": 0.03499675955930007,
      "grad_norm": 6.013058185577393,
      "learning_rate": 5.7758620689655175e-05,
      "loss": 6.7591,
      "step": 135
    },
    {
      "epoch": 0.03629293583927414,
      "grad_norm": 5.664581298828125,
      "learning_rate": 5.991379310344828e-05,
      "loss": 6.6685,
      "step": 140
    },
    {
      "epoch": 0.037589112119248216,
      "grad_norm": 6.199220180511475,
      "learning_rate": 6.206896551724138e-05,
      "loss": 6.6676,
      "step": 145
    },
    {
      "epoch": 0.03888528839922229,
      "grad_norm": 5.9385552406311035,
      "learning_rate": 6.422413793103449e-05,
      "loss": 6.423,
      "step": 150
    },
    {
      "epoch": 0.04018146467919637,
      "grad_norm": 5.6819939613342285,
      "learning_rate": 6.637931034482759e-05,
      "loss": 6.2384,
      "step": 155
    },
    {
      "epoch": 0.04147764095917045,
      "grad_norm": 5.781296730041504,
      "learning_rate": 6.85344827586207e-05,
      "loss": 6.4166,
      "step": 160
    },
    {
      "epoch": 0.04277381723914452,
      "grad_norm": 6.319295883178711,
      "learning_rate": 7.06896551724138e-05,
      "loss": 6.7247,
      "step": 165
    },
    {
      "epoch": 0.0440699935191186,
      "grad_norm": 5.825390815734863,
      "learning_rate": 7.28448275862069e-05,
      "loss": 6.4742,
      "step": 170
    },
    {
      "epoch": 0.04536616979909268,
      "grad_norm": 5.79691743850708,
      "learning_rate": 7.500000000000001e-05,
      "loss": 6.4885,
      "step": 175
    },
    {
      "epoch": 0.046662346079066754,
      "grad_norm": 5.771387100219727,
      "learning_rate": 7.715517241379311e-05,
      "loss": 6.6513,
      "step": 180
    },
    {
      "epoch": 0.04795852235904083,
      "grad_norm": 5.620362281799316,
      "learning_rate": 7.931034482758621e-05,
      "loss": 6.8207,
      "step": 185
    },
    {
      "epoch": 0.0492546986390149,
      "grad_norm": 5.468255996704102,
      "learning_rate": 8.146551724137932e-05,
      "loss": 6.2821,
      "step": 190
    },
    {
      "epoch": 0.050550874918988985,
      "grad_norm": 5.637963771820068,
      "learning_rate": 8.362068965517241e-05,
      "loss": 6.5699,
      "step": 195
    },
    {
      "epoch": 0.05184705119896306,
      "grad_norm": 6.304792881011963,
      "learning_rate": 8.577586206896551e-05,
      "loss": 6.8814,
      "step": 200
    },
    {
      "epoch": 0.053143227478937134,
      "grad_norm": 5.914714813232422,
      "learning_rate": 8.793103448275862e-05,
      "loss": 6.5254,
      "step": 205
    },
    {
      "epoch": 0.05443940375891121,
      "grad_norm": 6.677070140838623,
      "learning_rate": 9.008620689655173e-05,
      "loss": 6.2366,
      "step": 210
    },
    {
      "epoch": 0.05573558003888529,
      "grad_norm": 6.566666603088379,
      "learning_rate": 9.224137931034484e-05,
      "loss": 6.9259,
      "step": 215
    },
    {
      "epoch": 0.057031756318859365,
      "grad_norm": 6.206492900848389,
      "learning_rate": 9.439655172413794e-05,
      "loss": 6.2938,
      "step": 220
    },
    {
      "epoch": 0.05832793259883344,
      "grad_norm": 6.607194423675537,
      "learning_rate": 9.655172413793105e-05,
      "loss": 6.4837,
      "step": 225
    },
    {
      "epoch": 0.059624108878807515,
      "grad_norm": 6.002255439758301,
      "learning_rate": 9.870689655172414e-05,
      "loss": 6.2093,
      "step": 230
    },
    {
      "epoch": 0.0609202851587816,
      "grad_norm": 5.8744425773620605,
      "learning_rate": 9.999992493386817e-05,
      "loss": 6.2799,
      "step": 235
    },
    {
      "epoch": 0.06221646143875567,
      "grad_norm": 5.867527961730957,
      "learning_rate": 9.999908044247358e-05,
      "loss": 6.0516,
      "step": 240
    },
    {
      "epoch": 0.06351263771872975,
      "grad_norm": 6.7269721031188965,
      "learning_rate": 9.999729764292059e-05,
      "loss": 6.4098,
      "step": 245
    },
    {
      "epoch": 0.06480881399870382,
      "grad_norm": 6.160502910614014,
      "learning_rate": 9.999457656866613e-05,
      "loss": 5.8759,
      "step": 250
    },
    {
      "epoch": 0.0661049902786779,
      "grad_norm": 7.1613640785217285,
      "learning_rate": 9.999091727077524e-05,
      "loss": 6.2836,
      "step": 255
    },
    {
      "epoch": 0.06740116655865197,
      "grad_norm": 5.804195404052734,
      "learning_rate": 9.99863198179202e-05,
      "loss": 6.6131,
      "step": 260
    },
    {
      "epoch": 0.06869734283862605,
      "grad_norm": 5.900219440460205,
      "learning_rate": 9.99807842963791e-05,
      "loss": 6.4623,
      "step": 265
    },
    {
      "epoch": 0.06999351911860013,
      "grad_norm": 5.594264984130859,
      "learning_rate": 9.99743108100344e-05,
      "loss": 6.1754,
      "step": 270
    },
    {
      "epoch": 0.0712896953985742,
      "grad_norm": 6.203033447265625,
      "learning_rate": 9.99668994803708e-05,
      "loss": 6.1605,
      "step": 275
    },
    {
      "epoch": 0.07258587167854828,
      "grad_norm": 6.414889335632324,
      "learning_rate": 9.99585504464731e-05,
      "loss": 6.1736,
      "step": 280
    },
    {
      "epoch": 0.07388204795852236,
      "grad_norm": 6.240243911743164,
      "learning_rate": 9.99492638650235e-05,
      "loss": 6.177,
      "step": 285
    },
    {
      "epoch": 0.07517822423849643,
      "grad_norm": 6.264848232269287,
      "learning_rate": 9.993903991029873e-05,
      "loss": 5.9035,
      "step": 290
    },
    {
      "epoch": 0.07647440051847051,
      "grad_norm": 6.6530537605285645,
      "learning_rate": 9.99278787741667e-05,
      "loss": 6.5645,
      "step": 295
    },
    {
      "epoch": 0.07777057679844458,
      "grad_norm": 5.987668514251709,
      "learning_rate": 9.991578066608296e-05,
      "loss": 5.9072,
      "step": 300
    },
    {
      "epoch": 0.07777057679844458,
      "eval_loss": 1.7322468757629395,
      "eval_runtime": 353.2957,
      "eval_samples_per_second": 9.601,
      "eval_steps_per_second": 1.2,
      "step": 300
    },
    {
      "epoch": 0.07906675307841866,
      "grad_norm": 6.055402755737305,
      "learning_rate": 9.990274581308676e-05,
      "loss": 6.1636,
      "step": 305
    },
    {
      "epoch": 0.08036292935839275,
      "grad_norm": 5.985538959503174,
      "learning_rate": 9.98887744597968e-05,
      "loss": 5.9484,
      "step": 310
    },
    {
      "epoch": 0.08165910563836681,
      "grad_norm": 6.564467430114746,
      "learning_rate": 9.987386686840658e-05,
      "loss": 6.046,
      "step": 315
    },
    {
      "epoch": 0.0829552819183409,
      "grad_norm": 6.496596336364746,
      "learning_rate": 9.985802331867953e-05,
      "loss": 5.9602,
      "step": 320
    },
    {
      "epoch": 0.08425145819831498,
      "grad_norm": 6.17959451675415,
      "learning_rate": 9.984124410794376e-05,
      "loss": 5.9331,
      "step": 325
    },
    {
      "epoch": 0.08554763447828904,
      "grad_norm": 6.264903545379639,
      "learning_rate": 9.982352955108648e-05,
      "loss": 6.3204,
      "step": 330
    },
    {
      "epoch": 0.08684381075826313,
      "grad_norm": 6.16652250289917,
      "learning_rate": 9.980487998054806e-05,
      "loss": 6.4928,
      "step": 335
    },
    {
      "epoch": 0.0881399870382372,
      "grad_norm": 5.843173027038574,
      "learning_rate": 9.978529574631583e-05,
      "loss": 5.6959,
      "step": 340
    },
    {
      "epoch": 0.08943616331821128,
      "grad_norm": 6.269702911376953,
      "learning_rate": 9.976477721591745e-05,
      "loss": 6.2319,
      "step": 345
    },
    {
      "epoch": 0.09073233959818536,
      "grad_norm": 6.384511470794678,
      "learning_rate": 9.974332477441415e-05,
      "loss": 5.7345,
      "step": 350
    },
    {
      "epoch": 0.09202851587815943,
      "grad_norm": 6.084228515625,
      "learning_rate": 9.972093882439331e-05,
      "loss": 6.1189,
      "step": 355
    },
    {
      "epoch": 0.09332469215813351,
      "grad_norm": 6.038543701171875,
      "learning_rate": 9.969761978596104e-05,
      "loss": 6.2143,
      "step": 360
    },
    {
      "epoch": 0.09462086843810759,
      "grad_norm": 5.236361503601074,
      "learning_rate": 9.96733680967343e-05,
      "loss": 5.8919,
      "step": 365
    },
    {
      "epoch": 0.09591704471808166,
      "grad_norm": 6.045313358306885,
      "learning_rate": 9.96481842118326e-05,
      "loss": 6.0698,
      "step": 370
    },
    {
      "epoch": 0.09721322099805574,
      "grad_norm": 6.191497325897217,
      "learning_rate": 9.962206860386952e-05,
      "loss": 6.0906,
      "step": 375
    },
    {
      "epoch": 0.0985093972780298,
      "grad_norm": 6.468414306640625,
      "learning_rate": 9.959502176294383e-05,
      "loss": 5.6146,
      "step": 380
    },
    {
      "epoch": 0.09980557355800389,
      "grad_norm": 5.91257381439209,
      "learning_rate": 9.956704419663034e-05,
      "loss": 6.2094,
      "step": 385
    },
    {
      "epoch": 0.10110174983797797,
      "grad_norm": 6.700929641723633,
      "learning_rate": 9.953813642997023e-05,
      "loss": 6.0105,
      "step": 390
    },
    {
      "epoch": 0.10239792611795204,
      "grad_norm": 5.680608749389648,
      "learning_rate": 9.950829900546135e-05,
      "loss": 5.6779,
      "step": 395
    },
    {
      "epoch": 0.10369410239792612,
      "grad_norm": 7.64698600769043,
      "learning_rate": 9.947753248304798e-05,
      "loss": 5.6944,
      "step": 400
    },
    {
      "epoch": 0.1049902786779002,
      "grad_norm": 6.112722873687744,
      "learning_rate": 9.944583744011035e-05,
      "loss": 5.2823,
      "step": 405
    },
    {
      "epoch": 0.10628645495787427,
      "grad_norm": 5.932455539703369,
      "learning_rate": 9.941321447145369e-05,
      "loss": 5.6419,
      "step": 410
    },
    {
      "epoch": 0.10758263123784835,
      "grad_norm": 6.098362445831299,
      "learning_rate": 9.937966418929726e-05,
      "loss": 5.555,
      "step": 415
    },
    {
      "epoch": 0.10887880751782242,
      "grad_norm": 6.430552005767822,
      "learning_rate": 9.934518722326268e-05,
      "loss": 5.9416,
      "step": 420
    },
    {
      "epoch": 0.1101749837977965,
      "grad_norm": 6.295275688171387,
      "learning_rate": 9.930978422036224e-05,
      "loss": 5.7914,
      "step": 425
    },
    {
      "epoch": 0.11147116007777058,
      "grad_norm": 6.376883029937744,
      "learning_rate": 9.927345584498666e-05,
      "loss": 5.6276,
      "step": 430
    },
    {
      "epoch": 0.11276733635774465,
      "grad_norm": 6.90930700302124,
      "learning_rate": 9.923620277889271e-05,
      "loss": 5.5259,
      "step": 435
    },
    {
      "epoch": 0.11406351263771873,
      "grad_norm": 6.884796142578125,
      "learning_rate": 9.91980257211904e-05,
      "loss": 5.8534,
      "step": 440
    },
    {
      "epoch": 0.11535968891769281,
      "grad_norm": 6.210943698883057,
      "learning_rate": 9.915892538832975e-05,
      "loss": 6.2987,
      "step": 445
    },
    {
      "epoch": 0.11665586519766688,
      "grad_norm": 6.3871235847473145,
      "learning_rate": 9.911890251408751e-05,
      "loss": 5.6077,
      "step": 450
    },
    {
      "epoch": 0.11795204147764096,
      "grad_norm": 6.339592933654785,
      "learning_rate": 9.907795784955327e-05,
      "loss": 5.3609,
      "step": 455
    },
    {
      "epoch": 0.11924821775761503,
      "grad_norm": 8.121675491333008,
      "learning_rate": 9.903609216311543e-05,
      "loss": 5.3018,
      "step": 460
    },
    {
      "epoch": 0.12054439403758911,
      "grad_norm": 5.988659858703613,
      "learning_rate": 9.899330624044672e-05,
      "loss": 5.6771,
      "step": 465
    },
    {
      "epoch": 0.1218405703175632,
      "grad_norm": 5.641611576080322,
      "learning_rate": 9.894960088448952e-05,
      "loss": 5.4788,
      "step": 470
    },
    {
      "epoch": 0.12313674659753726,
      "grad_norm": 6.305113315582275,
      "learning_rate": 9.890497691544078e-05,
      "loss": 5.8398,
      "step": 475
    },
    {
      "epoch": 0.12443292287751134,
      "grad_norm": 7.6531476974487305,
      "learning_rate": 9.885943517073656e-05,
      "loss": 5.9084,
      "step": 480
    },
    {
      "epoch": 0.12572909915748542,
      "grad_norm": 6.197567462921143,
      "learning_rate": 9.881297650503641e-05,
      "loss": 6.1301,
      "step": 485
    },
    {
      "epoch": 0.1270252754374595,
      "grad_norm": 6.230889797210693,
      "learning_rate": 9.876560179020723e-05,
      "loss": 5.357,
      "step": 490
    },
    {
      "epoch": 0.12832145171743356,
      "grad_norm": 5.8952155113220215,
      "learning_rate": 9.871731191530703e-05,
      "loss": 5.4557,
      "step": 495
    },
    {
      "epoch": 0.12961762799740764,
      "grad_norm": 5.878301620483398,
      "learning_rate": 9.866810778656815e-05,
      "loss": 5.6862,
      "step": 500
    },
    {
      "epoch": 0.13091380427738172,
      "grad_norm": 5.700610637664795,
      "learning_rate": 9.861799032738026e-05,
      "loss": 5.73,
      "step": 505
    },
    {
      "epoch": 0.1322099805573558,
      "grad_norm": 5.974181175231934,
      "learning_rate": 9.856696047827309e-05,
      "loss": 5.9917,
      "step": 510
    },
    {
      "epoch": 0.1335061568373299,
      "grad_norm": 6.263741493225098,
      "learning_rate": 9.851501919689872e-05,
      "loss": 6.5588,
      "step": 515
    },
    {
      "epoch": 0.13480233311730394,
      "grad_norm": 5.810044765472412,
      "learning_rate": 9.846216745801365e-05,
      "loss": 5.4013,
      "step": 520
    },
    {
      "epoch": 0.13609850939727802,
      "grad_norm": 6.2864089012146,
      "learning_rate": 9.840840625346046e-05,
      "loss": 5.7395,
      "step": 525
    },
    {
      "epoch": 0.1373946856772521,
      "grad_norm": 6.56425142288208,
      "learning_rate": 9.835373659214925e-05,
      "loss": 5.8189,
      "step": 530
    },
    {
      "epoch": 0.13869086195722619,
      "grad_norm": 5.685390472412109,
      "learning_rate": 9.829815950003869e-05,
      "loss": 5.0689,
      "step": 535
    },
    {
      "epoch": 0.13998703823720027,
      "grad_norm": 6.452259063720703,
      "learning_rate": 9.824167602011671e-05,
      "loss": 6.3463,
      "step": 540
    },
    {
      "epoch": 0.14128321451717435,
      "grad_norm": 6.589794158935547,
      "learning_rate": 9.818428721238101e-05,
      "loss": 5.5236,
      "step": 545
    },
    {
      "epoch": 0.1425793907971484,
      "grad_norm": 6.536722183227539,
      "learning_rate": 9.812599415381916e-05,
      "loss": 5.7223,
      "step": 550
    },
    {
      "epoch": 0.14387556707712248,
      "grad_norm": 6.562587261199951,
      "learning_rate": 9.806679793838829e-05,
      "loss": 5.6417,
      "step": 555
    },
    {
      "epoch": 0.14517174335709657,
      "grad_norm": 6.284836769104004,
      "learning_rate": 9.800669967699467e-05,
      "loss": 5.9385,
      "step": 560
    },
    {
      "epoch": 0.14646791963707065,
      "grad_norm": 6.164417743682861,
      "learning_rate": 9.794570049747285e-05,
      "loss": 5.5899,
      "step": 565
    },
    {
      "epoch": 0.14776409591704473,
      "grad_norm": 6.173615455627441,
      "learning_rate": 9.788380154456443e-05,
      "loss": 5.6792,
      "step": 570
    },
    {
      "epoch": 0.14906027219701878,
      "grad_norm": 5.460556983947754,
      "learning_rate": 9.78210039798966e-05,
      "loss": 5.3521,
      "step": 575
    },
    {
      "epoch": 0.15035644847699287,
      "grad_norm": 5.926529407501221,
      "learning_rate": 9.775730898196038e-05,
      "loss": 5.3681,
      "step": 580
    },
    {
      "epoch": 0.15165262475696695,
      "grad_norm": 7.669357776641846,
      "learning_rate": 9.769271774608853e-05,
      "loss": 6.0213,
      "step": 585
    },
    {
      "epoch": 0.15294880103694103,
      "grad_norm": 5.901175022125244,
      "learning_rate": 9.762723148443296e-05,
      "loss": 5.5721,
      "step": 590
    },
    {
      "epoch": 0.1542449773169151,
      "grad_norm": 5.736074447631836,
      "learning_rate": 9.756085142594215e-05,
      "loss": 5.5985,
      "step": 595
    },
    {
      "epoch": 0.15554115359688916,
      "grad_norm": 6.197432041168213,
      "learning_rate": 9.749357881633805e-05,
      "loss": 5.6776,
      "step": 600
    },
    {
      "epoch": 0.15554115359688916,
      "eval_loss": 1.759212613105774,
      "eval_runtime": 353.1926,
      "eval_samples_per_second": 9.604,
      "eval_steps_per_second": 1.2,
      "step": 600
    },
    {
      "epoch": 0.15683732987686325,
      "grad_norm": 6.798159122467041,
      "learning_rate": 9.742541491809261e-05,
      "loss": 5.3871,
      "step": 605
    },
    {
      "epoch": 0.15813350615683733,
      "grad_norm": 6.722763538360596,
      "learning_rate": 9.735636101040422e-05,
      "loss": 5.7196,
      "step": 610
    },
    {
      "epoch": 0.1594296824368114,
      "grad_norm": 6.259701728820801,
      "learning_rate": 9.72864183891736e-05,
      "loss": 5.3804,
      "step": 615
    },
    {
      "epoch": 0.1607258587167855,
      "grad_norm": 5.556440830230713,
      "learning_rate": 9.721558836697952e-05,
      "loss": 5.3889,
      "step": 620
    },
    {
      "epoch": 0.16202203499675957,
      "grad_norm": 6.141296863555908,
      "learning_rate": 9.714387227305422e-05,
      "loss": 5.7832,
      "step": 625
    },
    {
      "epoch": 0.16331821127673363,
      "grad_norm": 5.994726181030273,
      "learning_rate": 9.707127145325833e-05,
      "loss": 5.6706,
      "step": 630
    },
    {
      "epoch": 0.1646143875567077,
      "grad_norm": 5.489011287689209,
      "learning_rate": 9.699778727005575e-05,
      "loss": 5.0538,
      "step": 635
    },
    {
      "epoch": 0.1659105638366818,
      "grad_norm": 5.828495025634766,
      "learning_rate": 9.692342110248802e-05,
      "loss": 5.3216,
      "step": 640
    },
    {
      "epoch": 0.16720674011665587,
      "grad_norm": 6.654528617858887,
      "learning_rate": 9.684817434614844e-05,
      "loss": 5.5388,
      "step": 645
    },
    {
      "epoch": 0.16850291639662995,
      "grad_norm": 6.706836223602295,
      "learning_rate": 9.67720484131559e-05,
      "loss": 5.2992,
      "step": 650
    },
    {
      "epoch": 0.169799092676604,
      "grad_norm": 6.2614264488220215,
      "learning_rate": 9.669504473212834e-05,
      "loss": 6.1578,
      "step": 655
    },
    {
      "epoch": 0.1710952689565781,
      "grad_norm": 5.715453624725342,
      "learning_rate": 9.661716474815597e-05,
      "loss": 5.172,
      "step": 660
    },
    {
      "epoch": 0.17239144523655217,
      "grad_norm": 7.11665678024292,
      "learning_rate": 9.653840992277417e-05,
      "loss": 5.6336,
      "step": 665
    },
    {
      "epoch": 0.17368762151652625,
      "grad_norm": 6.611268520355225,
      "learning_rate": 9.645878173393601e-05,
      "loss": 5.6544,
      "step": 670
    },
    {
      "epoch": 0.17498379779650033,
      "grad_norm": 6.485696792602539,
      "learning_rate": 9.637828167598457e-05,
      "loss": 5.952,
      "step": 675
    },
    {
      "epoch": 0.1762799740764744,
      "grad_norm": 5.934052467346191,
      "learning_rate": 9.629691125962487e-05,
      "loss": 5.7921,
      "step": 680
    },
    {
      "epoch": 0.17757615035644847,
      "grad_norm": 6.288647174835205,
      "learning_rate": 9.62146720118955e-05,
      "loss": 5.9811,
      "step": 685
    },
    {
      "epoch": 0.17887232663642255,
      "grad_norm": 6.341413497924805,
      "learning_rate": 9.613156547613994e-05,
      "loss": 5.7378,
      "step": 690
    },
    {
      "epoch": 0.18016850291639663,
      "grad_norm": 5.738951206207275,
      "learning_rate": 9.604759321197773e-05,
      "loss": 5.2245,
      "step": 695
    },
    {
      "epoch": 0.18146467919637072,
      "grad_norm": 6.080374240875244,
      "learning_rate": 9.596275679527506e-05,
      "loss": 5.4993,
      "step": 700
    },
    {
      "epoch": 0.18276085547634477,
      "grad_norm": 5.326103210449219,
      "learning_rate": 9.587705781811524e-05,
      "loss": 5.7679,
      "step": 705
    },
    {
      "epoch": 0.18405703175631885,
      "grad_norm": 6.497161388397217,
      "learning_rate": 9.579049788876883e-05,
      "loss": 5.3233,
      "step": 710
    },
    {
      "epoch": 0.18535320803629293,
      "grad_norm": 5.9565887451171875,
      "learning_rate": 9.570307863166347e-05,
      "loss": 5.8357,
      "step": 715
    },
    {
      "epoch": 0.18664938431626701,
      "grad_norm": 6.489195823669434,
      "learning_rate": 9.561480168735337e-05,
      "loss": 5.8847,
      "step": 720
    },
    {
      "epoch": 0.1879455605962411,
      "grad_norm": 6.582181930541992,
      "learning_rate": 9.552566871248854e-05,
      "loss": 5.6037,
      "step": 725
    },
    {
      "epoch": 0.18924173687621518,
      "grad_norm": 6.493152141571045,
      "learning_rate": 9.543568137978372e-05,
      "loss": 5.6814,
      "step": 730
    },
    {
      "epoch": 0.19053791315618923,
      "grad_norm": 6.5842180252075195,
      "learning_rate": 9.53448413779869e-05,
      "loss": 5.6718,
      "step": 735
    },
    {
      "epoch": 0.1918340894361633,
      "grad_norm": 6.549362659454346,
      "learning_rate": 9.525315041184772e-05,
      "loss": 5.3291,
      "step": 740
    },
    {
      "epoch": 0.1931302657161374,
      "grad_norm": 6.025957107543945,
      "learning_rate": 9.516061020208549e-05,
      "loss": 5.124,
      "step": 745
    },
    {
      "epoch": 0.19442644199611148,
      "grad_norm": 6.822085380554199,
      "learning_rate": 9.506722248535683e-05,
      "loss": 5.3759,
      "step": 750
    },
    {
      "epoch": 0.19572261827608556,
      "grad_norm": 6.566873550415039,
      "learning_rate": 9.497298901422307e-05,
      "loss": 5.5954,
      "step": 755
    },
    {
      "epoch": 0.1970187945560596,
      "grad_norm": 6.094099521636963,
      "learning_rate": 9.487791155711745e-05,
      "loss": 5.2167,
      "step": 760
    },
    {
      "epoch": 0.1983149708360337,
      "grad_norm": 8.192927360534668,
      "learning_rate": 9.478199189831183e-05,
      "loss": 5.6828,
      "step": 765
    },
    {
      "epoch": 0.19961114711600778,
      "grad_norm": 5.992204666137695,
      "learning_rate": 9.468523183788333e-05,
      "loss": 5.6504,
      "step": 770
    },
    {
      "epoch": 0.20090732339598186,
      "grad_norm": 6.572239875793457,
      "learning_rate": 9.45876331916804e-05,
      "loss": 5.9306,
      "step": 775
    },
    {
      "epoch": 0.20220349967595594,
      "grad_norm": 6.623505592346191,
      "learning_rate": 9.448919779128884e-05,
      "loss": 5.2466,
      "step": 780
    },
    {
      "epoch": 0.20349967595593,
      "grad_norm": 6.26369571685791,
      "learning_rate": 9.438992748399742e-05,
      "loss": 5.3634,
      "step": 785
    },
    {
      "epoch": 0.20479585223590407,
      "grad_norm": 6.356987953186035,
      "learning_rate": 9.428982413276318e-05,
      "loss": 5.1097,
      "step": 790
    },
    {
      "epoch": 0.20609202851587816,
      "grad_norm": 5.846899032592773,
      "learning_rate": 9.41888896161765e-05,
      "loss": 5.2561,
      "step": 795
    },
    {
      "epoch": 0.20738820479585224,
      "grad_norm": 6.218850135803223,
      "learning_rate": 9.408712582842583e-05,
      "loss": 5.3602,
      "step": 800
    },
    {
      "epoch": 0.20868438107582632,
      "grad_norm": 6.571227550506592,
      "learning_rate": 9.39845346792621e-05,
      "loss": 5.4462,
      "step": 805
    },
    {
      "epoch": 0.2099805573558004,
      "grad_norm": 6.540446758270264,
      "learning_rate": 9.3881118093963e-05,
      "loss": 5.3208,
      "step": 810
    },
    {
      "epoch": 0.21127673363577446,
      "grad_norm": 6.814016342163086,
      "learning_rate": 9.377687801329674e-05,
      "loss": 5.7917,
      "step": 815
    },
    {
      "epoch": 0.21257290991574854,
      "grad_norm": 7.118403434753418,
      "learning_rate": 9.367181639348564e-05,
      "loss": 6.0615,
      "step": 820
    },
    {
      "epoch": 0.21386908619572262,
      "grad_norm": 6.125025749206543,
      "learning_rate": 9.356593520616948e-05,
      "loss": 5.8048,
      "step": 825
    },
    {
      "epoch": 0.2151652624756967,
      "grad_norm": 5.778207778930664,
      "learning_rate": 9.34592364383684e-05,
      "loss": 5.3762,
      "step": 830
    },
    {
      "epoch": 0.21646143875567078,
      "grad_norm": 6.2385382652282715,
      "learning_rate": 9.335172209244575e-05,
      "loss": 5.8005,
      "step": 835
    },
    {
      "epoch": 0.21775761503564484,
      "grad_norm": 5.940319538116455,
      "learning_rate": 9.324339418607041e-05,
      "loss": 5.1152,
      "step": 840
    },
    {
      "epoch": 0.21905379131561892,
      "grad_norm": 6.526686191558838,
      "learning_rate": 9.31342547521789e-05,
      "loss": 5.3928,
      "step": 845
    },
    {
      "epoch": 0.220349967595593,
      "grad_norm": 6.4096221923828125,
      "learning_rate": 9.302430583893731e-05,
      "loss": 5.1502,
      "step": 850
    },
    {
      "epoch": 0.22164614387556708,
      "grad_norm": 5.7976603507995605,
      "learning_rate": 9.291354950970286e-05,
      "loss": 5.2698,
      "step": 855
    },
    {
      "epoch": 0.22294232015554116,
      "grad_norm": 6.290134906768799,
      "learning_rate": 9.28019878429851e-05,
      "loss": 5.2121,
      "step": 860
    },
    {
      "epoch": 0.22423849643551522,
      "grad_norm": 9.13962173461914,
      "learning_rate": 9.268962293240701e-05,
      "loss": 5.3281,
      "step": 865
    },
    {
      "epoch": 0.2255346727154893,
      "grad_norm": 6.373178482055664,
      "learning_rate": 9.257645688666556e-05,
      "loss": 5.1852,
      "step": 870
    },
    {
      "epoch": 0.22683084899546338,
      "grad_norm": 5.77469539642334,
      "learning_rate": 9.246249182949233e-05,
      "loss": 5.5195,
      "step": 875
    },
    {
      "epoch": 0.22812702527543746,
      "grad_norm": 6.406745433807373,
      "learning_rate": 9.234772989961352e-05,
      "loss": 5.1316,
      "step": 880
    },
    {
      "epoch": 0.22942320155541154,
      "grad_norm": 6.724541187286377,
      "learning_rate": 9.22321732507098e-05,
      "loss": 5.308,
      "step": 885
    },
    {
      "epoch": 0.23071937783538563,
      "grad_norm": 6.221187591552734,
      "learning_rate": 9.211582405137603e-05,
      "loss": 5.167,
      "step": 890
    },
    {
      "epoch": 0.23201555411535968,
      "grad_norm": 6.091421604156494,
      "learning_rate": 9.199868448508037e-05,
      "loss": 5.5671,
      "step": 895
    },
    {
      "epoch": 0.23331173039533376,
      "grad_norm": 6.427068710327148,
      "learning_rate": 9.188075675012351e-05,
      "loss": 5.1115,
      "step": 900
    },
    {
      "epoch": 0.23331173039533376,
      "eval_loss": 1.747745156288147,
      "eval_runtime": 353.516,
      "eval_samples_per_second": 9.595,
      "eval_steps_per_second": 1.199,
      "step": 900
    },
    {
      "epoch": 0.23460790667530784,
      "grad_norm": 7.048336982727051,
      "learning_rate": 9.176204305959726e-05,
      "loss": 5.455,
      "step": 905
    },
    {
      "epoch": 0.23590408295528192,
      "grad_norm": 43.31245803833008,
      "learning_rate": 9.164254564134305e-05,
      "loss": 5.8,
      "step": 910
    },
    {
      "epoch": 0.237200259235256,
      "grad_norm": 6.042270660400391,
      "learning_rate": 9.15222667379102e-05,
      "loss": 5.7257,
      "step": 915
    },
    {
      "epoch": 0.23849643551523006,
      "grad_norm": 6.102289199829102,
      "learning_rate": 9.140120860651374e-05,
      "loss": 5.6096,
      "step": 920
    },
    {
      "epoch": 0.23979261179520414,
      "grad_norm": 6.190640449523926,
      "learning_rate": 9.127937351899211e-05,
      "loss": 5.2916,
      "step": 925
    },
    {
      "epoch": 0.24108878807517822,
      "grad_norm": 6.64203405380249,
      "learning_rate": 9.115676376176448e-05,
      "loss": 5.4412,
      "step": 930
    },
    {
      "epoch": 0.2423849643551523,
      "grad_norm": 6.799017429351807,
      "learning_rate": 9.103338163578787e-05,
      "loss": 6.0282,
      "step": 935
    },
    {
      "epoch": 0.2436811406351264,
      "grad_norm": 6.019811153411865,
      "learning_rate": 9.090922945651399e-05,
      "loss": 5.1639,
      "step": 940
    },
    {
      "epoch": 0.24497731691510044,
      "grad_norm": 6.229288578033447,
      "learning_rate": 9.078430955384572e-05,
      "loss": 5.4325,
      "step": 945
    },
    {
      "epoch": 0.24627349319507452,
      "grad_norm": 6.305727005004883,
      "learning_rate": 9.065862427209349e-05,
      "loss": 5.3317,
      "step": 950
    },
    {
      "epoch": 0.2475696694750486,
      "grad_norm": 6.441644191741943,
      "learning_rate": 9.053217596993114e-05,
      "loss": 5.232,
      "step": 955
    },
    {
      "epoch": 0.24886584575502269,
      "grad_norm": 5.868955135345459,
      "learning_rate": 9.040496702035181e-05,
      "loss": 5.9553,
      "step": 960
    },
    {
      "epoch": 0.25016202203499677,
      "grad_norm": 6.898869037628174,
      "learning_rate": 9.027699981062332e-05,
      "loss": 5.355,
      "step": 965
    },
    {
      "epoch": 0.25145819831497085,
      "grad_norm": 6.511346817016602,
      "learning_rate": 9.014827674224333e-05,
      "loss": 5.2525,
      "step": 970
    },
    {
      "epoch": 0.25275437459494493,
      "grad_norm": 6.207486629486084,
      "learning_rate": 9.001880023089441e-05,
      "loss": 5.1674,
      "step": 975
    },
    {
      "epoch": 0.254050550874919,
      "grad_norm": 6.383389949798584,
      "learning_rate": 8.988857270639857e-05,
      "loss": 5.2571,
      "step": 980
    },
    {
      "epoch": 0.25534672715489304,
      "grad_norm": 6.2587103843688965,
      "learning_rate": 8.975759661267173e-05,
      "loss": 5.3479,
      "step": 985
    },
    {
      "epoch": 0.2566429034348671,
      "grad_norm": 7.096968173980713,
      "learning_rate": 8.962587440767787e-05,
      "loss": 5.22,
      "step": 990
    },
    {
      "epoch": 0.2579390797148412,
      "grad_norm": 6.323131561279297,
      "learning_rate": 8.94934085633828e-05,
      "loss": 5.5058,
      "step": 995
    },
    {
      "epoch": 0.2592352559948153,
      "grad_norm": 6.179114818572998,
      "learning_rate": 8.93602015657079e-05,
      "loss": 5.6379,
      "step": 1000
    },
    {
      "epoch": 0.26053143227478937,
      "grad_norm": 5.8347601890563965,
      "learning_rate": 8.922625591448341e-05,
      "loss": 5.1801,
      "step": 1005
    },
    {
      "epoch": 0.26182760855476345,
      "grad_norm": 6.349084854125977,
      "learning_rate": 8.90915741234015e-05,
      "loss": 5.2985,
      "step": 1010
    },
    {
      "epoch": 0.26312378483473753,
      "grad_norm": 6.430611610412598,
      "learning_rate": 8.895615871996911e-05,
      "loss": 5.4722,
      "step": 1015
    },
    {
      "epoch": 0.2644199611147116,
      "grad_norm": 7.19846248626709,
      "learning_rate": 8.882001224546057e-05,
      "loss": 5.2468,
      "step": 1020
    },
    {
      "epoch": 0.2657161373946857,
      "grad_norm": 6.426541805267334,
      "learning_rate": 8.868313725486979e-05,
      "loss": 4.756,
      "step": 1025
    },
    {
      "epoch": 0.2670123136746598,
      "grad_norm": 6.313275337219238,
      "learning_rate": 8.854553631686241e-05,
      "loss": 5.3424,
      "step": 1030
    },
    {
      "epoch": 0.26830848995463386,
      "grad_norm": 6.315749168395996,
      "learning_rate": 8.84072120137276e-05,
      "loss": 5.6435,
      "step": 1035
    },
    {
      "epoch": 0.2696046662346079,
      "grad_norm": 6.214425086975098,
      "learning_rate": 8.826816694132955e-05,
      "loss": 5.5317,
      "step": 1040
    },
    {
      "epoch": 0.27090084251458196,
      "grad_norm": 6.459805488586426,
      "learning_rate": 8.812840370905873e-05,
      "loss": 5.3225,
      "step": 1045
    },
    {
      "epoch": 0.27219701879455604,
      "grad_norm": 6.470264911651611,
      "learning_rate": 8.798792493978305e-05,
      "loss": 5.948,
      "step": 1050
    },
    {
      "epoch": 0.2734931950745301,
      "grad_norm": 7.647356033325195,
      "learning_rate": 8.784673326979844e-05,
      "loss": 5.611,
      "step": 1055
    },
    {
      "epoch": 0.2747893713545042,
      "grad_norm": 6.665130615234375,
      "learning_rate": 8.77048313487796e-05,
      "loss": 5.6659,
      "step": 1060
    },
    {
      "epoch": 0.2760855476344783,
      "grad_norm": 6.436264514923096,
      "learning_rate": 8.756222183973008e-05,
      "loss": 4.9427,
      "step": 1065
    },
    {
      "epoch": 0.27738172391445237,
      "grad_norm": 6.688732147216797,
      "learning_rate": 8.741890741893244e-05,
      "loss": 4.7955,
      "step": 1070
    },
    {
      "epoch": 0.27867790019442645,
      "grad_norm": 6.054559707641602,
      "learning_rate": 8.727489077589793e-05,
      "loss": 5.2091,
      "step": 1075
    },
    {
      "epoch": 0.27997407647440054,
      "grad_norm": 6.237517356872559,
      "learning_rate": 8.713017461331608e-05,
      "loss": 5.6823,
      "step": 1080
    },
    {
      "epoch": 0.2812702527543746,
      "grad_norm": 6.290626525878906,
      "learning_rate": 8.698476164700395e-05,
      "loss": 5.2632,
      "step": 1085
    },
    {
      "epoch": 0.2825664290343487,
      "grad_norm": 6.412489891052246,
      "learning_rate": 8.683865460585518e-05,
      "loss": 5.3348,
      "step": 1090
    },
    {
      "epoch": 0.2838626053143227,
      "grad_norm": 6.4528632164001465,
      "learning_rate": 8.669185623178879e-05,
      "loss": 5.1606,
      "step": 1095
    },
    {
      "epoch": 0.2851587815942968,
      "grad_norm": 7.906050682067871,
      "learning_rate": 8.654436927969767e-05,
      "loss": 5.5195,
      "step": 1100
    },
    {
      "epoch": 0.2864549578742709,
      "grad_norm": 6.647706031799316,
      "learning_rate": 8.639619651739694e-05,
      "loss": 4.9791,
      "step": 1105
    },
    {
      "epoch": 0.28775113415424497,
      "grad_norm": 5.931995391845703,
      "learning_rate": 8.624734072557199e-05,
      "loss": 5.915,
      "step": 1110
    },
    {
      "epoch": 0.28904731043421905,
      "grad_norm": 5.751828193664551,
      "learning_rate": 8.609780469772623e-05,
      "loss": 5.1009,
      "step": 1115
    },
    {
      "epoch": 0.29034348671419313,
      "grad_norm": 6.851621627807617,
      "learning_rate": 8.59475912401288e-05,
      "loss": 5.3921,
      "step": 1120
    },
    {
      "epoch": 0.2916396629941672,
      "grad_norm": 6.673628330230713,
      "learning_rate": 8.579670317176179e-05,
      "loss": 5.3902,
      "step": 1125
    },
    {
      "epoch": 0.2929358392741413,
      "grad_norm": 6.149777412414551,
      "learning_rate": 8.564514332426741e-05,
      "loss": 5.479,
      "step": 1130
    },
    {
      "epoch": 0.2942320155541154,
      "grad_norm": 5.980111122131348,
      "learning_rate": 8.549291454189477e-05,
      "loss": 5.154,
      "step": 1135
    },
    {
      "epoch": 0.29552819183408946,
      "grad_norm": 6.860113143920898,
      "learning_rate": 8.534001968144656e-05,
      "loss": 5.5186,
      "step": 1140
    },
    {
      "epoch": 0.2968243681140635,
      "grad_norm": 6.145979404449463,
      "learning_rate": 8.51864616122255e-05,
      "loss": 5.0269,
      "step": 1145
    },
    {
      "epoch": 0.29812054439403757,
      "grad_norm": 5.662299633026123,
      "learning_rate": 8.503224321598035e-05,
      "loss": 5.5617,
      "step": 1150
    },
    {
      "epoch": 0.29941672067401165,
      "grad_norm": 6.544498920440674,
      "learning_rate": 8.48773673868519e-05,
      "loss": 5.2969,
      "step": 1155
    },
    {
      "epoch": 0.30071289695398573,
      "grad_norm": 6.1995391845703125,
      "learning_rate": 8.472183703131873e-05,
      "loss": 5.2686,
      "step": 1160
    },
    {
      "epoch": 0.3020090732339598,
      "grad_norm": 6.684445381164551,
      "learning_rate": 8.456565506814251e-05,
      "loss": 5.3912,
      "step": 1165
    },
    {
      "epoch": 0.3033052495139339,
      "grad_norm": 6.4725260734558105,
      "learning_rate": 8.440882442831336e-05,
      "loss": 5.1365,
      "step": 1170
    },
    {
      "epoch": 0.304601425793908,
      "grad_norm": 6.407034397125244,
      "learning_rate": 8.42513480549948e-05,
      "loss": 5.3766,
      "step": 1175
    },
    {
      "epoch": 0.30589760207388206,
      "grad_norm": 7.576113224029541,
      "learning_rate": 8.409322890346847e-05,
      "loss": 5.5042,
      "step": 1180
    },
    {
      "epoch": 0.30719377835385614,
      "grad_norm": 6.397856712341309,
      "learning_rate": 8.393446994107877e-05,
      "loss": 5.2347,
      "step": 1185
    },
    {
      "epoch": 0.3084899546338302,
      "grad_norm": 6.017226696014404,
      "learning_rate": 8.377507414717706e-05,
      "loss": 4.9617,
      "step": 1190
    },
    {
      "epoch": 0.3097861309138043,
      "grad_norm": 7.199647903442383,
      "learning_rate": 8.361504451306585e-05,
      "loss": 5.1948,
      "step": 1195
    },
    {
      "epoch": 0.31108230719377833,
      "grad_norm": 6.387392520904541,
      "learning_rate": 8.345438404194259e-05,
      "loss": 5.0078,
      "step": 1200
    },
    {
      "epoch": 0.31108230719377833,
      "eval_loss": 1.7383368015289307,
      "eval_runtime": 353.0434,
      "eval_samples_per_second": 9.608,
      "eval_steps_per_second": 1.201,
      "step": 1200
    },
    {
      "epoch": 0.3123784834737524,
      "grad_norm": 6.844247817993164,
      "learning_rate": 8.329309574884335e-05,
      "loss": 5.4026,
      "step": 1205
    },
    {
      "epoch": 0.3136746597537265,
      "grad_norm": 6.19937801361084,
      "learning_rate": 8.313118266058619e-05,
      "loss": 4.951,
      "step": 1210
    },
    {
      "epoch": 0.3149708360337006,
      "grad_norm": 6.4328742027282715,
      "learning_rate": 8.296864781571448e-05,
      "loss": 5.263,
      "step": 1215
    },
    {
      "epoch": 0.31626701231367466,
      "grad_norm": 6.944987773895264,
      "learning_rate": 8.28054942644397e-05,
      "loss": 5.4176,
      "step": 1220
    },
    {
      "epoch": 0.31756318859364874,
      "grad_norm": 5.801463603973389,
      "learning_rate": 8.264172506858434e-05,
      "loss": 5.3857,
      "step": 1225
    },
    {
      "epoch": 0.3188593648736228,
      "grad_norm": 6.501681804656982,
      "learning_rate": 8.247734330152436e-05,
      "loss": 5.3497,
      "step": 1230
    },
    {
      "epoch": 0.3201555411535969,
      "grad_norm": 6.634169578552246,
      "learning_rate": 8.231235204813157e-05,
      "loss": 5.232,
      "step": 1235
    },
    {
      "epoch": 0.321451717433571,
      "grad_norm": 6.158379554748535,
      "learning_rate": 8.21467544047157e-05,
      "loss": 5.0522,
      "step": 1240
    },
    {
      "epoch": 0.32274789371354506,
      "grad_norm": 5.85274076461792,
      "learning_rate": 8.19805534789663e-05,
      "loss": 5.0862,
      "step": 1245
    },
    {
      "epoch": 0.32404406999351915,
      "grad_norm": 6.4909348487854,
      "learning_rate": 8.181375238989438e-05,
      "loss": 5.1882,
      "step": 1250
    },
    {
      "epoch": 0.32534024627349317,
      "grad_norm": 6.870463848114014,
      "learning_rate": 8.164635426777404e-05,
      "loss": 5.31,
      "step": 1255
    },
    {
      "epoch": 0.32663642255346725,
      "grad_norm": 6.946374893188477,
      "learning_rate": 8.147836225408347e-05,
      "loss": 5.3501,
      "step": 1260
    },
    {
      "epoch": 0.32793259883344134,
      "grad_norm": 6.639915943145752,
      "learning_rate": 8.130977950144621e-05,
      "loss": 5.0554,
      "step": 1265
    },
    {
      "epoch": 0.3292287751134154,
      "grad_norm": 5.978764057159424,
      "learning_rate": 8.11406091735719e-05,
      "loss": 4.9968,
      "step": 1270
    },
    {
      "epoch": 0.3305249513933895,
      "grad_norm": 6.2093706130981445,
      "learning_rate": 8.097085444519688e-05,
      "loss": 5.2527,
      "step": 1275
    },
    {
      "epoch": 0.3318211276733636,
      "grad_norm": 6.899875164031982,
      "learning_rate": 8.080051850202468e-05,
      "loss": 5.0845,
      "step": 1280
    },
    {
      "epoch": 0.33311730395333766,
      "grad_norm": 6.9142656326293945,
      "learning_rate": 8.062960454066619e-05,
      "loss": 5.4892,
      "step": 1285
    },
    {
      "epoch": 0.33441348023331174,
      "grad_norm": 7.387802600860596,
      "learning_rate": 8.04581157685797e-05,
      "loss": 5.49,
      "step": 1290
    },
    {
      "epoch": 0.3357096565132858,
      "grad_norm": 6.345734119415283,
      "learning_rate": 8.028605540401065e-05,
      "loss": 5.2043,
      "step": 1295
    },
    {
      "epoch": 0.3370058327932599,
      "grad_norm": 6.245737552642822,
      "learning_rate": 8.011342667593132e-05,
      "loss": 4.9001,
      "step": 1300
    },
    {
      "epoch": 0.33830200907323393,
      "grad_norm": 6.332602500915527,
      "learning_rate": 7.994023282398017e-05,
      "loss": 5.4487,
      "step": 1305
    },
    {
      "epoch": 0.339598185353208,
      "grad_norm": 6.836564064025879,
      "learning_rate": 7.976647709840104e-05,
      "loss": 5.1026,
      "step": 1310
    },
    {
      "epoch": 0.3408943616331821,
      "grad_norm": 6.576712608337402,
      "learning_rate": 7.959216275998223e-05,
      "loss": 5.413,
      "step": 1315
    },
    {
      "epoch": 0.3421905379131562,
      "grad_norm": 6.615803241729736,
      "learning_rate": 7.94172930799952e-05,
      "loss": 5.2105,
      "step": 1320
    },
    {
      "epoch": 0.34348671419313026,
      "grad_norm": 6.192866325378418,
      "learning_rate": 7.924187134013323e-05,
      "loss": 4.9693,
      "step": 1325
    },
    {
      "epoch": 0.34478289047310434,
      "grad_norm": 7.869561195373535,
      "learning_rate": 7.906590083244991e-05,
      "loss": 5.1879,
      "step": 1330
    },
    {
      "epoch": 0.3460790667530784,
      "grad_norm": 6.713994979858398,
      "learning_rate": 7.888938485929718e-05,
      "loss": 5.12,
      "step": 1335
    },
    {
      "epoch": 0.3473752430330525,
      "grad_norm": 6.530569076538086,
      "learning_rate": 7.871232673326356e-05,
      "loss": 5.3551,
      "step": 1340
    },
    {
      "epoch": 0.3486714193130266,
      "grad_norm": 6.053219318389893,
      "learning_rate": 7.853472977711183e-05,
      "loss": 4.961,
      "step": 1345
    },
    {
      "epoch": 0.34996759559300067,
      "grad_norm": 7.1648664474487305,
      "learning_rate": 7.835659732371671e-05,
      "loss": 5.2817,
      "step": 1350
    },
    {
      "epoch": 0.35126377187297475,
      "grad_norm": 8.36543083190918,
      "learning_rate": 7.817793271600242e-05,
      "loss": 5.1267,
      "step": 1355
    },
    {
      "epoch": 0.3525599481529488,
      "grad_norm": 6.720150470733643,
      "learning_rate": 7.799873930687978e-05,
      "loss": 5.3077,
      "step": 1360
    },
    {
      "epoch": 0.35385612443292286,
      "grad_norm": 7.16845178604126,
      "learning_rate": 7.781902045918337e-05,
      "loss": 5.0731,
      "step": 1365
    },
    {
      "epoch": 0.35515230071289694,
      "grad_norm": 6.205852508544922,
      "learning_rate": 7.763877954560848e-05,
      "loss": 5.0185,
      "step": 1370
    },
    {
      "epoch": 0.356448476992871,
      "grad_norm": 6.484716892242432,
      "learning_rate": 7.745801994864766e-05,
      "loss": 4.8446,
      "step": 1375
    },
    {
      "epoch": 0.3577446532728451,
      "grad_norm": 6.074382781982422,
      "learning_rate": 7.727674506052743e-05,
      "loss": 5.4281,
      "step": 1380
    },
    {
      "epoch": 0.3590408295528192,
      "grad_norm": 6.475484371185303,
      "learning_rate": 7.709495828314448e-05,
      "loss": 5.2827,
      "step": 1385
    },
    {
      "epoch": 0.36033700583279327,
      "grad_norm": 6.075077056884766,
      "learning_rate": 7.691266302800186e-05,
      "loss": 5.2338,
      "step": 1390
    },
    {
      "epoch": 0.36163318211276735,
      "grad_norm": 6.620121955871582,
      "learning_rate": 7.6729862716145e-05,
      "loss": 5.1154,
      "step": 1395
    },
    {
      "epoch": 0.36292935839274143,
      "grad_norm": 6.225570201873779,
      "learning_rate": 7.654656077809747e-05,
      "loss": 5.1745,
      "step": 1400
    },
    {
      "epoch": 0.3642255346727155,
      "grad_norm": 6.469938278198242,
      "learning_rate": 7.63627606537966e-05,
      "loss": 5.2555,
      "step": 1405
    },
    {
      "epoch": 0.36552171095268954,
      "grad_norm": 6.713720321655273,
      "learning_rate": 7.617846579252897e-05,
      "loss": 5.2147,
      "step": 1410
    },
    {
      "epoch": 0.3668178872326636,
      "grad_norm": 5.663670063018799,
      "learning_rate": 7.599367965286559e-05,
      "loss": 4.6885,
      "step": 1415
    },
    {
      "epoch": 0.3681140635126377,
      "grad_norm": 6.744603157043457,
      "learning_rate": 7.580840570259713e-05,
      "loss": 5.572,
      "step": 1420
    },
    {
      "epoch": 0.3694102397926118,
      "grad_norm": 6.041903018951416,
      "learning_rate": 7.562264741866869e-05,
      "loss": 4.9152,
      "step": 1425
    },
    {
      "epoch": 0.37070641607258586,
      "grad_norm": 7.129952907562256,
      "learning_rate": 7.543640828711466e-05,
      "loss": 4.7115,
      "step": 1430
    },
    {
      "epoch": 0.37200259235255995,
      "grad_norm": 6.357120990753174,
      "learning_rate": 7.524969180299325e-05,
      "loss": 5.0026,
      "step": 1435
    },
    {
      "epoch": 0.37329876863253403,
      "grad_norm": 5.915050506591797,
      "learning_rate": 7.506250147032088e-05,
      "loss": 5.3111,
      "step": 1440
    },
    {
      "epoch": 0.3745949449125081,
      "grad_norm": 6.719303607940674,
      "learning_rate": 7.487484080200653e-05,
      "loss": 5.6598,
      "step": 1445
    },
    {
      "epoch": 0.3758911211924822,
      "grad_norm": 7.025637626647949,
      "learning_rate": 7.468671331978567e-05,
      "loss": 5.3481,
      "step": 1450
    },
    {
      "epoch": 0.3771872974724563,
      "grad_norm": 7.205336570739746,
      "learning_rate": 7.449812255415423e-05,
      "loss": 5.3031,
      "step": 1455
    },
    {
      "epoch": 0.37848347375243035,
      "grad_norm": 5.23866081237793,
      "learning_rate": 7.430907204430242e-05,
      "loss": 4.8644,
      "step": 1460
    },
    {
      "epoch": 0.3797796500324044,
      "grad_norm": 6.661409854888916,
      "learning_rate": 7.411956533804818e-05,
      "loss": 5.0625,
      "step": 1465
    },
    {
      "epoch": 0.38107582631237846,
      "grad_norm": 7.062594890594482,
      "learning_rate": 7.39296059917707e-05,
      "loss": 5.1213,
      "step": 1470
    },
    {
      "epoch": 0.38237200259235254,
      "grad_norm": 6.861457347869873,
      "learning_rate": 7.373919757034362e-05,
      "loss": 5.4799,
      "step": 1475
    },
    {
      "epoch": 0.3836681788723266,
      "grad_norm": 6.35822057723999,
      "learning_rate": 7.354834364706818e-05,
      "loss": 5.4426,
      "step": 1480
    },
    {
      "epoch": 0.3849643551523007,
      "grad_norm": 6.43895959854126,
      "learning_rate": 7.335704780360608e-05,
      "loss": 4.9054,
      "step": 1485
    },
    {
      "epoch": 0.3862605314322748,
      "grad_norm": 6.686366558074951,
      "learning_rate": 7.316531362991239e-05,
      "loss": 4.7958,
      "step": 1490
    },
    {
      "epoch": 0.38755670771224887,
      "grad_norm": 5.39023494720459,
      "learning_rate": 7.297314472416805e-05,
      "loss": 4.9293,
      "step": 1495
    },
    {
      "epoch": 0.38885288399222295,
      "grad_norm": 6.572624206542969,
      "learning_rate": 7.278054469271245e-05,
      "loss": 5.1606,
      "step": 1500
    },
    {
      "epoch": 0.38885288399222295,
      "eval_loss": 1.7062296867370605,
      "eval_runtime": 353.1925,
      "eval_samples_per_second": 9.604,
      "eval_steps_per_second": 1.2,
      "step": 1500
    },
    {
      "epoch": 0.39014906027219703,
      "grad_norm": 5.829883098602295,
      "learning_rate": 7.258751714997568e-05,
      "loss": 5.0312,
      "step": 1505
    },
    {
      "epoch": 0.3914452365521711,
      "grad_norm": 6.756894588470459,
      "learning_rate": 7.239406571841068e-05,
      "loss": 5.0417,
      "step": 1510
    },
    {
      "epoch": 0.3927414128321452,
      "grad_norm": 7.765864372253418,
      "learning_rate": 7.22001940284254e-05,
      "loss": 4.7524,
      "step": 1515
    },
    {
      "epoch": 0.3940375891121192,
      "grad_norm": 7.117984294891357,
      "learning_rate": 7.200590571831447e-05,
      "loss": 5.276,
      "step": 1520
    },
    {
      "epoch": 0.3953337653920933,
      "grad_norm": 6.583863735198975,
      "learning_rate": 7.181120443419113e-05,
      "loss": 4.4836,
      "step": 1525
    },
    {
      "epoch": 0.3966299416720674,
      "grad_norm": 6.2899651527404785,
      "learning_rate": 7.161609382991861e-05,
      "loss": 4.8327,
      "step": 1530
    },
    {
      "epoch": 0.39792611795204147,
      "grad_norm": 10.40349292755127,
      "learning_rate": 7.142057756704168e-05,
      "loss": 5.2758,
      "step": 1535
    },
    {
      "epoch": 0.39922229423201555,
      "grad_norm": 6.652336120605469,
      "learning_rate": 7.122465931471794e-05,
      "loss": 5.19,
      "step": 1540
    },
    {
      "epoch": 0.40051847051198963,
      "grad_norm": 6.063689708709717,
      "learning_rate": 7.102834274964889e-05,
      "loss": 5.4008,
      "step": 1545
    },
    {
      "epoch": 0.4018146467919637,
      "grad_norm": 6.221471309661865,
      "learning_rate": 7.083163155601097e-05,
      "loss": 4.8305,
      "step": 1550
    },
    {
      "epoch": 0.4031108230719378,
      "grad_norm": 6.177482604980469,
      "learning_rate": 7.063452942538644e-05,
      "loss": 5.0628,
      "step": 1555
    },
    {
      "epoch": 0.4044069993519119,
      "grad_norm": 5.943436622619629,
      "learning_rate": 7.043704005669405e-05,
      "loss": 5.0516,
      "step": 1560
    },
    {
      "epoch": 0.40570317563188596,
      "grad_norm": 6.601740837097168,
      "learning_rate": 7.023916715611969e-05,
      "loss": 4.8331,
      "step": 1565
    },
    {
      "epoch": 0.40699935191186,
      "grad_norm": 6.374989986419678,
      "learning_rate": 7.004091443704681e-05,
      "loss": 5.3084,
      "step": 1570
    },
    {
      "epoch": 0.40829552819183407,
      "grad_norm": 6.466265678405762,
      "learning_rate": 6.984228561998669e-05,
      "loss": 5.2695,
      "step": 1575
    },
    {
      "epoch": 0.40959170447180815,
      "grad_norm": 6.188666343688965,
      "learning_rate": 6.964328443250867e-05,
      "loss": 4.8792,
      "step": 1580
    },
    {
      "epoch": 0.41088788075178223,
      "grad_norm": 5.995561122894287,
      "learning_rate": 6.944391460917021e-05,
      "loss": 4.8856,
      "step": 1585
    },
    {
      "epoch": 0.4121840570317563,
      "grad_norm": 6.418734073638916,
      "learning_rate": 6.924417989144674e-05,
      "loss": 5.2133,
      "step": 1590
    },
    {
      "epoch": 0.4134802333117304,
      "grad_norm": 6.242722034454346,
      "learning_rate": 6.90440840276615e-05,
      "loss": 5.1887,
      "step": 1595
    },
    {
      "epoch": 0.4147764095917045,
      "grad_norm": 6.374722003936768,
      "learning_rate": 6.884363077291517e-05,
      "loss": 5.5929,
      "step": 1600
    },
    {
      "epoch": 0.41607258587167856,
      "grad_norm": 6.672064781188965,
      "learning_rate": 6.864282388901544e-05,
      "loss": 5.5616,
      "step": 1605
    },
    {
      "epoch": 0.41736876215165264,
      "grad_norm": 6.3545732498168945,
      "learning_rate": 6.844166714440635e-05,
      "loss": 5.1473,
      "step": 1610
    },
    {
      "epoch": 0.4186649384316267,
      "grad_norm": 8.2235107421875,
      "learning_rate": 6.824016431409762e-05,
      "loss": 4.7892,
      "step": 1615
    },
    {
      "epoch": 0.4199611147116008,
      "grad_norm": 6.904574394226074,
      "learning_rate": 6.803831917959381e-05,
      "loss": 5.3336,
      "step": 1620
    },
    {
      "epoch": 0.42125729099157483,
      "grad_norm": 6.573385715484619,
      "learning_rate": 6.783613552882329e-05,
      "loss": 5.1198,
      "step": 1625
    },
    {
      "epoch": 0.4225534672715489,
      "grad_norm": 6.856905460357666,
      "learning_rate": 6.763361715606723e-05,
      "loss": 5.2217,
      "step": 1630
    },
    {
      "epoch": 0.423849643551523,
      "grad_norm": 6.580875396728516,
      "learning_rate": 6.743076786188833e-05,
      "loss": 5.2903,
      "step": 1635
    },
    {
      "epoch": 0.4251458198314971,
      "grad_norm": 6.890357971191406,
      "learning_rate": 6.72275914530596e-05,
      "loss": 4.811,
      "step": 1640
    },
    {
      "epoch": 0.42644199611147116,
      "grad_norm": 6.392624378204346,
      "learning_rate": 6.702409174249275e-05,
      "loss": 5.2851,
      "step": 1645
    },
    {
      "epoch": 0.42773817239144524,
      "grad_norm": 5.905599117279053,
      "learning_rate": 6.682027254916686e-05,
      "loss": 4.8389,
      "step": 1650
    },
    {
      "epoch": 0.4290343486714193,
      "grad_norm": 6.269746780395508,
      "learning_rate": 6.661613769805644e-05,
      "loss": 5.1311,
      "step": 1655
    },
    {
      "epoch": 0.4303305249513934,
      "grad_norm": 6.146518230438232,
      "learning_rate": 6.641169102005991e-05,
      "loss": 4.8601,
      "step": 1660
    },
    {
      "epoch": 0.4316267012313675,
      "grad_norm": 6.379298686981201,
      "learning_rate": 6.620693635192754e-05,
      "loss": 5.3775,
      "step": 1665
    },
    {
      "epoch": 0.43292287751134156,
      "grad_norm": 6.051974773406982,
      "learning_rate": 6.600187753618951e-05,
      "loss": 5.0664,
      "step": 1670
    },
    {
| "epoch": 0.43421905379131565, | |
| "grad_norm": 6.480034828186035, | |
| "learning_rate": 6.57965184210838e-05, | |
| "loss": 5.0112, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 0.43551523007128967, | |
| "grad_norm": 5.577784538269043, | |
| "learning_rate": 6.559086286048394e-05, | |
| "loss": 4.684, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.43681140635126375, | |
| "grad_norm": 6.112408638000488, | |
| "learning_rate": 6.53849147138267e-05, | |
| "loss": 4.7755, | |
| "step": 1685 | |
| }, | |
| { | |
| "epoch": 0.43810758263123784, | |
| "grad_norm": 6.673699378967285, | |
| "learning_rate": 6.517867784603972e-05, | |
| "loss": 5.4472, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.4394037589112119, | |
| "grad_norm": 5.8759446144104, | |
| "learning_rate": 6.497215612746886e-05, | |
| "loss": 4.9053, | |
| "step": 1695 | |
| }, | |
| { | |
| "epoch": 0.440699935191186, | |
| "grad_norm": 6.411087989807129, | |
| "learning_rate": 6.47653534338057e-05, | |
| "loss": 4.6784, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.4419961114711601, | |
| "grad_norm": 5.656676769256592, | |
| "learning_rate": 6.455827364601468e-05, | |
| "loss": 4.7073, | |
| "step": 1705 | |
| }, | |
| { | |
| "epoch": 0.44329228775113416, | |
| "grad_norm": 7.167407035827637, | |
| "learning_rate": 6.435092065026035e-05, | |
| "loss": 4.815, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.44458846403110824, | |
| "grad_norm": 7.390071868896484, | |
| "learning_rate": 6.414329833783446e-05, | |
| "loss": 5.2979, | |
| "step": 1715 | |
| }, | |
| { | |
| "epoch": 0.4458846403110823, | |
| "grad_norm": 6.286187171936035, | |
| "learning_rate": 6.393541060508283e-05, | |
| "loss": 4.428, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.4471808165910564, | |
| "grad_norm": 6.020079135894775, | |
| "learning_rate": 6.372726135333234e-05, | |
| "loss": 5.1399, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 0.44847699287103043, | |
| "grad_norm": 6.759265899658203, | |
| "learning_rate": 6.351885448881765e-05, | |
| "loss": 5.0373, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.4497731691510045, | |
| "grad_norm": 6.868116855621338, | |
| "learning_rate": 6.331019392260791e-05, | |
| "loss": 5.334, | |
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 0.4510693454309786, | |
| "grad_norm": 6.369646072387695, | |
| "learning_rate": 6.310128357053339e-05, | |
| "loss": 4.9139, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.4523655217109527, | |
| "grad_norm": 5.470962047576904, | |
| "learning_rate": 6.28921273531119e-05, | |
| "loss": 4.8604, | |
| "step": 1745 | |
| }, | |
| { | |
| "epoch": 0.45366169799092676, | |
| "grad_norm": 6.49376916885376, | |
| "learning_rate": 6.268272919547537e-05, | |
| "loss": 5.0324, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.45495787427090084, | |
| "grad_norm": 6.682295799255371, | |
| "learning_rate": 6.247309302729607e-05, | |
| "loss": 4.9259, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 0.4562540505508749, | |
| "grad_norm": 6.105667591094971, | |
| "learning_rate": 6.226322278271286e-05, | |
| "loss": 5.2015, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.457550226830849, | |
| "grad_norm": 6.069522380828857, | |
| "learning_rate": 6.205312240025745e-05, | |
| "loss": 4.7538, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 0.4588464031108231, | |
| "grad_norm": 6.145866394042969, | |
| "learning_rate": 6.184279582278039e-05, | |
| "loss": 4.8113, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.46014257939079717, | |
| "grad_norm": 6.4372358322143555, | |
| "learning_rate": 6.163224699737718e-05, | |
| "loss": 5.0577, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 0.46143875567077125, | |
| "grad_norm": 6.565794944763184, | |
| "learning_rate": 6.142147987531407e-05, | |
| "loss": 5.0166, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.4627349319507453, | |
| "grad_norm": 6.475300312042236, | |
| "learning_rate": 6.121049841195402e-05, | |
| "loss": 4.7242, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 0.46403110823071936, | |
| "grad_norm": 6.883950710296631, | |
| "learning_rate": 6.099930656668241e-05, | |
| "loss": 5.2766, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.46532728451069344, | |
| "grad_norm": 6.661801338195801, | |
| "learning_rate": 6.078790830283276e-05, | |
| "loss": 5.1597, | |
| "step": 1795 | |
| }, | |
| { | |
| "epoch": 0.4666234607906675, | |
| "grad_norm": 6.135336875915527, | |
| "learning_rate": 6.0576307587612347e-05, | |
| "loss": 4.6236, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.4666234607906675, | |
| "eval_loss": 1.6923874616622925, | |
| "eval_runtime": 353.1926, | |
| "eval_samples_per_second": 9.604, | |
| "eval_steps_per_second": 1.2, | |
| "step": 1800 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 3858, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 300, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4.140322113650688e+17, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
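
The object above is a transformers Trainer state snapshot (trainer_state.json), cut at global step 1800 of 3858. As a minimal sketch of how one might inspect it, the Python below loads log_history and plots the per-5-step training loss against the eval loss logged every 300 steps; the checkpoint path and output filename are assumptions for illustration, not values taken from this file.

import json

import matplotlib.pyplot as plt

# Hypothetical path; point this at the real checkpoint directory.
STATE_PATH = "checkpoint-1800/trainer_state.json"

with open(STATE_PATH) as f:
    state = json.load(f)

# Training entries carry "loss" (logged every logging_steps=5 steps);
# evaluation entries carry "eval_loss" instead (every 300 steps here).
train = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

plt.plot(*zip(*train), label="train loss")
plt.plot(*zip(*evals), marker="o", label="eval loss")
plt.xlabel("global step")
plt.ylabel("loss")
plt.legend()
plt.savefig("loss_curves.png")  # hypothetical output name

Read this way, the eval loss moves from 1.7062 at step 1500 to 1.6924 at step 1800 while the raw training loss oscillates between roughly 4.4 and 5.6, so the sparser eval curve is the steadier progress signal over this window.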