gemma-base-2-3-1800 / trainer_state.json
rmdhirr's picture
Upload folder using huggingface_hub
dfb61b0 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.4666234607906675,
"eval_steps": 300,
"global_step": 1800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0012961762799740765,
"grad_norm": 5.6396965980529785,
"learning_rate": 1.724137931034483e-06,
"loss": 6.6733,
"step": 5
},
{
"epoch": 0.002592352559948153,
"grad_norm": 5.946774482727051,
"learning_rate": 3.8793103448275865e-06,
"loss": 6.8433,
"step": 10
},
{
"epoch": 0.0038885288399222295,
"grad_norm": 6.051335334777832,
"learning_rate": 6.03448275862069e-06,
"loss": 6.6488,
"step": 15
},
{
"epoch": 0.005184705119896306,
"grad_norm": 5.994152069091797,
"learning_rate": 8.189655172413793e-06,
"loss": 6.766,
"step": 20
},
{
"epoch": 0.0064808813998703824,
"grad_norm": 6.127562522888184,
"learning_rate": 1.0344827586206897e-05,
"loss": 6.8587,
"step": 25
},
{
"epoch": 0.007777057679844459,
"grad_norm": 6.231839179992676,
"learning_rate": 1.25e-05,
"loss": 6.6363,
"step": 30
},
{
"epoch": 0.009073233959818535,
"grad_norm": 6.233114242553711,
"learning_rate": 1.4655172413793103e-05,
"loss": 7.1256,
"step": 35
},
{
"epoch": 0.010369410239792612,
"grad_norm": 6.063745021820068,
"learning_rate": 1.6810344827586207e-05,
"loss": 7.089,
"step": 40
},
{
"epoch": 0.011665586519766688,
"grad_norm": 7.45819616317749,
"learning_rate": 1.896551724137931e-05,
"loss": 6.9478,
"step": 45
},
{
"epoch": 0.012961762799740765,
"grad_norm": 5.544084548950195,
"learning_rate": 2.1120689655172415e-05,
"loss": 7.0204,
"step": 50
},
{
"epoch": 0.014257939079714841,
"grad_norm": 5.735494613647461,
"learning_rate": 2.327586206896552e-05,
"loss": 6.8533,
"step": 55
},
{
"epoch": 0.015554115359688918,
"grad_norm": 5.821224212646484,
"learning_rate": 2.543103448275862e-05,
"loss": 6.8601,
"step": 60
},
{
"epoch": 0.016850291639662993,
"grad_norm": 6.087413311004639,
"learning_rate": 2.7586206896551727e-05,
"loss": 6.8345,
"step": 65
},
{
"epoch": 0.01814646791963707,
"grad_norm": 6.119109153747559,
"learning_rate": 2.974137931034483e-05,
"loss": 6.5241,
"step": 70
},
{
"epoch": 0.019442644199611146,
"grad_norm": 5.87671422958374,
"learning_rate": 3.1896551724137935e-05,
"loss": 6.9227,
"step": 75
},
{
"epoch": 0.020738820479585224,
"grad_norm": 5.842630386352539,
"learning_rate": 3.405172413793103e-05,
"loss": 7.7281,
"step": 80
},
{
"epoch": 0.0220349967595593,
"grad_norm": 5.891020774841309,
"learning_rate": 3.620689655172414e-05,
"loss": 7.1023,
"step": 85
},
{
"epoch": 0.023331173039533377,
"grad_norm": 6.341122627258301,
"learning_rate": 3.8362068965517246e-05,
"loss": 6.7766,
"step": 90
},
{
"epoch": 0.02462734931950745,
"grad_norm": 6.294985294342041,
"learning_rate": 4.0517241379310344e-05,
"loss": 6.9056,
"step": 95
},
{
"epoch": 0.02592352559948153,
"grad_norm": 6.357933521270752,
"learning_rate": 4.267241379310345e-05,
"loss": 6.8075,
"step": 100
},
{
"epoch": 0.027219701879455604,
"grad_norm": 5.870952606201172,
"learning_rate": 4.482758620689655e-05,
"loss": 6.6911,
"step": 105
},
{
"epoch": 0.028515878159429683,
"grad_norm": 15.302526473999023,
"learning_rate": 4.698275862068966e-05,
"loss": 6.4926,
"step": 110
},
{
"epoch": 0.029812054439403757,
"grad_norm": 6.0892014503479,
"learning_rate": 4.913793103448276e-05,
"loss": 6.4598,
"step": 115
},
{
"epoch": 0.031108230719377836,
"grad_norm": 5.548859119415283,
"learning_rate": 5.129310344827587e-05,
"loss": 6.4285,
"step": 120
},
{
"epoch": 0.03240440699935191,
"grad_norm": 5.682961940765381,
"learning_rate": 5.344827586206896e-05,
"loss": 6.2057,
"step": 125
},
{
"epoch": 0.033700583279325985,
"grad_norm": 5.919161319732666,
"learning_rate": 5.560344827586207e-05,
"loss": 6.1522,
"step": 130
},
{
"epoch": 0.03499675955930007,
"grad_norm": 6.013058185577393,
"learning_rate": 5.7758620689655175e-05,
"loss": 6.7591,
"step": 135
},
{
"epoch": 0.03629293583927414,
"grad_norm": 5.664581298828125,
"learning_rate": 5.991379310344828e-05,
"loss": 6.6685,
"step": 140
},
{
"epoch": 0.037589112119248216,
"grad_norm": 6.199220180511475,
"learning_rate": 6.206896551724138e-05,
"loss": 6.6676,
"step": 145
},
{
"epoch": 0.03888528839922229,
"grad_norm": 5.9385552406311035,
"learning_rate": 6.422413793103449e-05,
"loss": 6.423,
"step": 150
},
{
"epoch": 0.04018146467919637,
"grad_norm": 5.6819939613342285,
"learning_rate": 6.637931034482759e-05,
"loss": 6.2384,
"step": 155
},
{
"epoch": 0.04147764095917045,
"grad_norm": 5.781296730041504,
"learning_rate": 6.85344827586207e-05,
"loss": 6.4166,
"step": 160
},
{
"epoch": 0.04277381723914452,
"grad_norm": 6.319295883178711,
"learning_rate": 7.06896551724138e-05,
"loss": 6.7247,
"step": 165
},
{
"epoch": 0.0440699935191186,
"grad_norm": 5.825390815734863,
"learning_rate": 7.28448275862069e-05,
"loss": 6.4742,
"step": 170
},
{
"epoch": 0.04536616979909268,
"grad_norm": 5.79691743850708,
"learning_rate": 7.500000000000001e-05,
"loss": 6.4885,
"step": 175
},
{
"epoch": 0.046662346079066754,
"grad_norm": 5.771387100219727,
"learning_rate": 7.715517241379311e-05,
"loss": 6.6513,
"step": 180
},
{
"epoch": 0.04795852235904083,
"grad_norm": 5.620362281799316,
"learning_rate": 7.931034482758621e-05,
"loss": 6.8207,
"step": 185
},
{
"epoch": 0.0492546986390149,
"grad_norm": 5.468255996704102,
"learning_rate": 8.146551724137932e-05,
"loss": 6.2821,
"step": 190
},
{
"epoch": 0.050550874918988985,
"grad_norm": 5.637963771820068,
"learning_rate": 8.362068965517241e-05,
"loss": 6.5699,
"step": 195
},
{
"epoch": 0.05184705119896306,
"grad_norm": 6.304792881011963,
"learning_rate": 8.577586206896551e-05,
"loss": 6.8814,
"step": 200
},
{
"epoch": 0.053143227478937134,
"grad_norm": 5.914714813232422,
"learning_rate": 8.793103448275862e-05,
"loss": 6.5254,
"step": 205
},
{
"epoch": 0.05443940375891121,
"grad_norm": 6.677070140838623,
"learning_rate": 9.008620689655173e-05,
"loss": 6.2366,
"step": 210
},
{
"epoch": 0.05573558003888529,
"grad_norm": 6.566666603088379,
"learning_rate": 9.224137931034484e-05,
"loss": 6.9259,
"step": 215
},
{
"epoch": 0.057031756318859365,
"grad_norm": 6.206492900848389,
"learning_rate": 9.439655172413794e-05,
"loss": 6.2938,
"step": 220
},
{
"epoch": 0.05832793259883344,
"grad_norm": 6.607194423675537,
"learning_rate": 9.655172413793105e-05,
"loss": 6.4837,
"step": 225
},
{
"epoch": 0.059624108878807515,
"grad_norm": 6.002255439758301,
"learning_rate": 9.870689655172414e-05,
"loss": 6.2093,
"step": 230
},
{
"epoch": 0.0609202851587816,
"grad_norm": 5.8744425773620605,
"learning_rate": 9.999992493386817e-05,
"loss": 6.2799,
"step": 235
},
{
"epoch": 0.06221646143875567,
"grad_norm": 5.867527961730957,
"learning_rate": 9.999908044247358e-05,
"loss": 6.0516,
"step": 240
},
{
"epoch": 0.06351263771872975,
"grad_norm": 6.7269721031188965,
"learning_rate": 9.999729764292059e-05,
"loss": 6.4098,
"step": 245
},
{
"epoch": 0.06480881399870382,
"grad_norm": 6.160502910614014,
"learning_rate": 9.999457656866613e-05,
"loss": 5.8759,
"step": 250
},
{
"epoch": 0.0661049902786779,
"grad_norm": 7.1613640785217285,
"learning_rate": 9.999091727077524e-05,
"loss": 6.2836,
"step": 255
},
{
"epoch": 0.06740116655865197,
"grad_norm": 5.804195404052734,
"learning_rate": 9.99863198179202e-05,
"loss": 6.6131,
"step": 260
},
{
"epoch": 0.06869734283862605,
"grad_norm": 5.900219440460205,
"learning_rate": 9.99807842963791e-05,
"loss": 6.4623,
"step": 265
},
{
"epoch": 0.06999351911860013,
"grad_norm": 5.594264984130859,
"learning_rate": 9.99743108100344e-05,
"loss": 6.1754,
"step": 270
},
{
"epoch": 0.0712896953985742,
"grad_norm": 6.203033447265625,
"learning_rate": 9.99668994803708e-05,
"loss": 6.1605,
"step": 275
},
{
"epoch": 0.07258587167854828,
"grad_norm": 6.414889335632324,
"learning_rate": 9.99585504464731e-05,
"loss": 6.1736,
"step": 280
},
{
"epoch": 0.07388204795852236,
"grad_norm": 6.240243911743164,
"learning_rate": 9.99492638650235e-05,
"loss": 6.177,
"step": 285
},
{
"epoch": 0.07517822423849643,
"grad_norm": 6.264848232269287,
"learning_rate": 9.993903991029873e-05,
"loss": 5.9035,
"step": 290
},
{
"epoch": 0.07647440051847051,
"grad_norm": 6.6530537605285645,
"learning_rate": 9.99278787741667e-05,
"loss": 6.5645,
"step": 295
},
{
"epoch": 0.07777057679844458,
"grad_norm": 5.987668514251709,
"learning_rate": 9.991578066608296e-05,
"loss": 5.9072,
"step": 300
},
{
"epoch": 0.07777057679844458,
"eval_loss": 1.7322468757629395,
"eval_runtime": 353.2957,
"eval_samples_per_second": 9.601,
"eval_steps_per_second": 1.2,
"step": 300
},
{
"epoch": 0.07906675307841866,
"grad_norm": 6.055402755737305,
"learning_rate": 9.990274581308676e-05,
"loss": 6.1636,
"step": 305
},
{
"epoch": 0.08036292935839275,
"grad_norm": 5.985538959503174,
"learning_rate": 9.98887744597968e-05,
"loss": 5.9484,
"step": 310
},
{
"epoch": 0.08165910563836681,
"grad_norm": 6.564467430114746,
"learning_rate": 9.987386686840658e-05,
"loss": 6.046,
"step": 315
},
{
"epoch": 0.0829552819183409,
"grad_norm": 6.496596336364746,
"learning_rate": 9.985802331867953e-05,
"loss": 5.9602,
"step": 320
},
{
"epoch": 0.08425145819831498,
"grad_norm": 6.17959451675415,
"learning_rate": 9.984124410794376e-05,
"loss": 5.9331,
"step": 325
},
{
"epoch": 0.08554763447828904,
"grad_norm": 6.264903545379639,
"learning_rate": 9.982352955108648e-05,
"loss": 6.3204,
"step": 330
},
{
"epoch": 0.08684381075826313,
"grad_norm": 6.16652250289917,
"learning_rate": 9.980487998054806e-05,
"loss": 6.4928,
"step": 335
},
{
"epoch": 0.0881399870382372,
"grad_norm": 5.843173027038574,
"learning_rate": 9.978529574631583e-05,
"loss": 5.6959,
"step": 340
},
{
"epoch": 0.08943616331821128,
"grad_norm": 6.269702911376953,
"learning_rate": 9.976477721591745e-05,
"loss": 6.2319,
"step": 345
},
{
"epoch": 0.09073233959818536,
"grad_norm": 6.384511470794678,
"learning_rate": 9.974332477441415e-05,
"loss": 5.7345,
"step": 350
},
{
"epoch": 0.09202851587815943,
"grad_norm": 6.084228515625,
"learning_rate": 9.972093882439331e-05,
"loss": 6.1189,
"step": 355
},
{
"epoch": 0.09332469215813351,
"grad_norm": 6.038543701171875,
"learning_rate": 9.969761978596104e-05,
"loss": 6.2143,
"step": 360
},
{
"epoch": 0.09462086843810759,
"grad_norm": 5.236361503601074,
"learning_rate": 9.96733680967343e-05,
"loss": 5.8919,
"step": 365
},
{
"epoch": 0.09591704471808166,
"grad_norm": 6.045313358306885,
"learning_rate": 9.96481842118326e-05,
"loss": 6.0698,
"step": 370
},
{
"epoch": 0.09721322099805574,
"grad_norm": 6.191497325897217,
"learning_rate": 9.962206860386952e-05,
"loss": 6.0906,
"step": 375
},
{
"epoch": 0.0985093972780298,
"grad_norm": 6.468414306640625,
"learning_rate": 9.959502176294383e-05,
"loss": 5.6146,
"step": 380
},
{
"epoch": 0.09980557355800389,
"grad_norm": 5.91257381439209,
"learning_rate": 9.956704419663034e-05,
"loss": 6.2094,
"step": 385
},
{
"epoch": 0.10110174983797797,
"grad_norm": 6.700929641723633,
"learning_rate": 9.953813642997023e-05,
"loss": 6.0105,
"step": 390
},
{
"epoch": 0.10239792611795204,
"grad_norm": 5.680608749389648,
"learning_rate": 9.950829900546135e-05,
"loss": 5.6779,
"step": 395
},
{
"epoch": 0.10369410239792612,
"grad_norm": 7.64698600769043,
"learning_rate": 9.947753248304798e-05,
"loss": 5.6944,
"step": 400
},
{
"epoch": 0.1049902786779002,
"grad_norm": 6.112722873687744,
"learning_rate": 9.944583744011035e-05,
"loss": 5.2823,
"step": 405
},
{
"epoch": 0.10628645495787427,
"grad_norm": 5.932455539703369,
"learning_rate": 9.941321447145369e-05,
"loss": 5.6419,
"step": 410
},
{
"epoch": 0.10758263123784835,
"grad_norm": 6.098362445831299,
"learning_rate": 9.937966418929726e-05,
"loss": 5.555,
"step": 415
},
{
"epoch": 0.10887880751782242,
"grad_norm": 6.430552005767822,
"learning_rate": 9.934518722326268e-05,
"loss": 5.9416,
"step": 420
},
{
"epoch": 0.1101749837977965,
"grad_norm": 6.295275688171387,
"learning_rate": 9.930978422036224e-05,
"loss": 5.7914,
"step": 425
},
{
"epoch": 0.11147116007777058,
"grad_norm": 6.376883029937744,
"learning_rate": 9.927345584498666e-05,
"loss": 5.6276,
"step": 430
},
{
"epoch": 0.11276733635774465,
"grad_norm": 6.90930700302124,
"learning_rate": 9.923620277889271e-05,
"loss": 5.5259,
"step": 435
},
{
"epoch": 0.11406351263771873,
"grad_norm": 6.884796142578125,
"learning_rate": 9.91980257211904e-05,
"loss": 5.8534,
"step": 440
},
{
"epoch": 0.11535968891769281,
"grad_norm": 6.210943698883057,
"learning_rate": 9.915892538832975e-05,
"loss": 6.2987,
"step": 445
},
{
"epoch": 0.11665586519766688,
"grad_norm": 6.3871235847473145,
"learning_rate": 9.911890251408751e-05,
"loss": 5.6077,
"step": 450
},
{
"epoch": 0.11795204147764096,
"grad_norm": 6.339592933654785,
"learning_rate": 9.907795784955327e-05,
"loss": 5.3609,
"step": 455
},
{
"epoch": 0.11924821775761503,
"grad_norm": 8.121675491333008,
"learning_rate": 9.903609216311543e-05,
"loss": 5.3018,
"step": 460
},
{
"epoch": 0.12054439403758911,
"grad_norm": 5.988659858703613,
"learning_rate": 9.899330624044672e-05,
"loss": 5.6771,
"step": 465
},
{
"epoch": 0.1218405703175632,
"grad_norm": 5.641611576080322,
"learning_rate": 9.894960088448952e-05,
"loss": 5.4788,
"step": 470
},
{
"epoch": 0.12313674659753726,
"grad_norm": 6.305113315582275,
"learning_rate": 9.890497691544078e-05,
"loss": 5.8398,
"step": 475
},
{
"epoch": 0.12443292287751134,
"grad_norm": 7.6531476974487305,
"learning_rate": 9.885943517073656e-05,
"loss": 5.9084,
"step": 480
},
{
"epoch": 0.12572909915748542,
"grad_norm": 6.197567462921143,
"learning_rate": 9.881297650503641e-05,
"loss": 6.1301,
"step": 485
},
{
"epoch": 0.1270252754374595,
"grad_norm": 6.230889797210693,
"learning_rate": 9.876560179020723e-05,
"loss": 5.357,
"step": 490
},
{
"epoch": 0.12832145171743356,
"grad_norm": 5.8952155113220215,
"learning_rate": 9.871731191530703e-05,
"loss": 5.4557,
"step": 495
},
{
"epoch": 0.12961762799740764,
"grad_norm": 5.878301620483398,
"learning_rate": 9.866810778656815e-05,
"loss": 5.6862,
"step": 500
},
{
"epoch": 0.13091380427738172,
"grad_norm": 5.700610637664795,
"learning_rate": 9.861799032738026e-05,
"loss": 5.73,
"step": 505
},
{
"epoch": 0.1322099805573558,
"grad_norm": 5.974181175231934,
"learning_rate": 9.856696047827309e-05,
"loss": 5.9917,
"step": 510
},
{
"epoch": 0.1335061568373299,
"grad_norm": 6.263741493225098,
"learning_rate": 9.851501919689872e-05,
"loss": 6.5588,
"step": 515
},
{
"epoch": 0.13480233311730394,
"grad_norm": 5.810044765472412,
"learning_rate": 9.846216745801365e-05,
"loss": 5.4013,
"step": 520
},
{
"epoch": 0.13609850939727802,
"grad_norm": 6.2864089012146,
"learning_rate": 9.840840625346046e-05,
"loss": 5.7395,
"step": 525
},
{
"epoch": 0.1373946856772521,
"grad_norm": 6.56425142288208,
"learning_rate": 9.835373659214925e-05,
"loss": 5.8189,
"step": 530
},
{
"epoch": 0.13869086195722619,
"grad_norm": 5.685390472412109,
"learning_rate": 9.829815950003869e-05,
"loss": 5.0689,
"step": 535
},
{
"epoch": 0.13998703823720027,
"grad_norm": 6.452259063720703,
"learning_rate": 9.824167602011671e-05,
"loss": 6.3463,
"step": 540
},
{
"epoch": 0.14128321451717435,
"grad_norm": 6.589794158935547,
"learning_rate": 9.818428721238101e-05,
"loss": 5.5236,
"step": 545
},
{
"epoch": 0.1425793907971484,
"grad_norm": 6.536722183227539,
"learning_rate": 9.812599415381916e-05,
"loss": 5.7223,
"step": 550
},
{
"epoch": 0.14387556707712248,
"grad_norm": 6.562587261199951,
"learning_rate": 9.806679793838829e-05,
"loss": 5.6417,
"step": 555
},
{
"epoch": 0.14517174335709657,
"grad_norm": 6.284836769104004,
"learning_rate": 9.800669967699467e-05,
"loss": 5.9385,
"step": 560
},
{
"epoch": 0.14646791963707065,
"grad_norm": 6.164417743682861,
"learning_rate": 9.794570049747285e-05,
"loss": 5.5899,
"step": 565
},
{
"epoch": 0.14776409591704473,
"grad_norm": 6.173615455627441,
"learning_rate": 9.788380154456443e-05,
"loss": 5.6792,
"step": 570
},
{
"epoch": 0.14906027219701878,
"grad_norm": 5.460556983947754,
"learning_rate": 9.78210039798966e-05,
"loss": 5.3521,
"step": 575
},
{
"epoch": 0.15035644847699287,
"grad_norm": 5.926529407501221,
"learning_rate": 9.775730898196038e-05,
"loss": 5.3681,
"step": 580
},
{
"epoch": 0.15165262475696695,
"grad_norm": 7.669357776641846,
"learning_rate": 9.769271774608853e-05,
"loss": 6.0213,
"step": 585
},
{
"epoch": 0.15294880103694103,
"grad_norm": 5.901175022125244,
"learning_rate": 9.762723148443296e-05,
"loss": 5.5721,
"step": 590
},
{
"epoch": 0.1542449773169151,
"grad_norm": 5.736074447631836,
"learning_rate": 9.756085142594215e-05,
"loss": 5.5985,
"step": 595
},
{
"epoch": 0.15554115359688916,
"grad_norm": 6.197432041168213,
"learning_rate": 9.749357881633805e-05,
"loss": 5.6776,
"step": 600
},
{
"epoch": 0.15554115359688916,
"eval_loss": 1.759212613105774,
"eval_runtime": 353.1926,
"eval_samples_per_second": 9.604,
"eval_steps_per_second": 1.2,
"step": 600
},
{
"epoch": 0.15683732987686325,
"grad_norm": 6.798159122467041,
"learning_rate": 9.742541491809261e-05,
"loss": 5.3871,
"step": 605
},
{
"epoch": 0.15813350615683733,
"grad_norm": 6.722763538360596,
"learning_rate": 9.735636101040422e-05,
"loss": 5.7196,
"step": 610
},
{
"epoch": 0.1594296824368114,
"grad_norm": 6.259701728820801,
"learning_rate": 9.72864183891736e-05,
"loss": 5.3804,
"step": 615
},
{
"epoch": 0.1607258587167855,
"grad_norm": 5.556440830230713,
"learning_rate": 9.721558836697952e-05,
"loss": 5.3889,
"step": 620
},
{
"epoch": 0.16202203499675957,
"grad_norm": 6.141296863555908,
"learning_rate": 9.714387227305422e-05,
"loss": 5.7832,
"step": 625
},
{
"epoch": 0.16331821127673363,
"grad_norm": 5.994726181030273,
"learning_rate": 9.707127145325833e-05,
"loss": 5.6706,
"step": 630
},
{
"epoch": 0.1646143875567077,
"grad_norm": 5.489011287689209,
"learning_rate": 9.699778727005575e-05,
"loss": 5.0538,
"step": 635
},
{
"epoch": 0.1659105638366818,
"grad_norm": 5.828495025634766,
"learning_rate": 9.692342110248802e-05,
"loss": 5.3216,
"step": 640
},
{
"epoch": 0.16720674011665587,
"grad_norm": 6.654528617858887,
"learning_rate": 9.684817434614844e-05,
"loss": 5.5388,
"step": 645
},
{
"epoch": 0.16850291639662995,
"grad_norm": 6.706836223602295,
"learning_rate": 9.67720484131559e-05,
"loss": 5.2992,
"step": 650
},
{
"epoch": 0.169799092676604,
"grad_norm": 6.2614264488220215,
"learning_rate": 9.669504473212834e-05,
"loss": 6.1578,
"step": 655
},
{
"epoch": 0.1710952689565781,
"grad_norm": 5.715453624725342,
"learning_rate": 9.661716474815597e-05,
"loss": 5.172,
"step": 660
},
{
"epoch": 0.17239144523655217,
"grad_norm": 7.11665678024292,
"learning_rate": 9.653840992277417e-05,
"loss": 5.6336,
"step": 665
},
{
"epoch": 0.17368762151652625,
"grad_norm": 6.611268520355225,
"learning_rate": 9.645878173393601e-05,
"loss": 5.6544,
"step": 670
},
{
"epoch": 0.17498379779650033,
"grad_norm": 6.485696792602539,
"learning_rate": 9.637828167598457e-05,
"loss": 5.952,
"step": 675
},
{
"epoch": 0.1762799740764744,
"grad_norm": 5.934052467346191,
"learning_rate": 9.629691125962487e-05,
"loss": 5.7921,
"step": 680
},
{
"epoch": 0.17757615035644847,
"grad_norm": 6.288647174835205,
"learning_rate": 9.62146720118955e-05,
"loss": 5.9811,
"step": 685
},
{
"epoch": 0.17887232663642255,
"grad_norm": 6.341413497924805,
"learning_rate": 9.613156547613994e-05,
"loss": 5.7378,
"step": 690
},
{
"epoch": 0.18016850291639663,
"grad_norm": 5.738951206207275,
"learning_rate": 9.604759321197773e-05,
"loss": 5.2245,
"step": 695
},
{
"epoch": 0.18146467919637072,
"grad_norm": 6.080374240875244,
"learning_rate": 9.596275679527506e-05,
"loss": 5.4993,
"step": 700
},
{
"epoch": 0.18276085547634477,
"grad_norm": 5.326103210449219,
"learning_rate": 9.587705781811524e-05,
"loss": 5.7679,
"step": 705
},
{
"epoch": 0.18405703175631885,
"grad_norm": 6.497161388397217,
"learning_rate": 9.579049788876883e-05,
"loss": 5.3233,
"step": 710
},
{
"epoch": 0.18535320803629293,
"grad_norm": 5.9565887451171875,
"learning_rate": 9.570307863166347e-05,
"loss": 5.8357,
"step": 715
},
{
"epoch": 0.18664938431626701,
"grad_norm": 6.489195823669434,
"learning_rate": 9.561480168735337e-05,
"loss": 5.8847,
"step": 720
},
{
"epoch": 0.1879455605962411,
"grad_norm": 6.582181930541992,
"learning_rate": 9.552566871248854e-05,
"loss": 5.6037,
"step": 725
},
{
"epoch": 0.18924173687621518,
"grad_norm": 6.493152141571045,
"learning_rate": 9.543568137978372e-05,
"loss": 5.6814,
"step": 730
},
{
"epoch": 0.19053791315618923,
"grad_norm": 6.5842180252075195,
"learning_rate": 9.53448413779869e-05,
"loss": 5.6718,
"step": 735
},
{
"epoch": 0.1918340894361633,
"grad_norm": 6.549362659454346,
"learning_rate": 9.525315041184772e-05,
"loss": 5.3291,
"step": 740
},
{
"epoch": 0.1931302657161374,
"grad_norm": 6.025957107543945,
"learning_rate": 9.516061020208549e-05,
"loss": 5.124,
"step": 745
},
{
"epoch": 0.19442644199611148,
"grad_norm": 6.822085380554199,
"learning_rate": 9.506722248535683e-05,
"loss": 5.3759,
"step": 750
},
{
"epoch": 0.19572261827608556,
"grad_norm": 6.566873550415039,
"learning_rate": 9.497298901422307e-05,
"loss": 5.5954,
"step": 755
},
{
"epoch": 0.1970187945560596,
"grad_norm": 6.094099521636963,
"learning_rate": 9.487791155711745e-05,
"loss": 5.2167,
"step": 760
},
{
"epoch": 0.1983149708360337,
"grad_norm": 8.192927360534668,
"learning_rate": 9.478199189831183e-05,
"loss": 5.6828,
"step": 765
},
{
"epoch": 0.19961114711600778,
"grad_norm": 5.992204666137695,
"learning_rate": 9.468523183788333e-05,
"loss": 5.6504,
"step": 770
},
{
"epoch": 0.20090732339598186,
"grad_norm": 6.572239875793457,
"learning_rate": 9.45876331916804e-05,
"loss": 5.9306,
"step": 775
},
{
"epoch": 0.20220349967595594,
"grad_norm": 6.623505592346191,
"learning_rate": 9.448919779128884e-05,
"loss": 5.2466,
"step": 780
},
{
"epoch": 0.20349967595593,
"grad_norm": 6.26369571685791,
"learning_rate": 9.438992748399742e-05,
"loss": 5.3634,
"step": 785
},
{
"epoch": 0.20479585223590407,
"grad_norm": 6.356987953186035,
"learning_rate": 9.428982413276318e-05,
"loss": 5.1097,
"step": 790
},
{
"epoch": 0.20609202851587816,
"grad_norm": 5.846899032592773,
"learning_rate": 9.41888896161765e-05,
"loss": 5.2561,
"step": 795
},
{
"epoch": 0.20738820479585224,
"grad_norm": 6.218850135803223,
"learning_rate": 9.408712582842583e-05,
"loss": 5.3602,
"step": 800
},
{
"epoch": 0.20868438107582632,
"grad_norm": 6.571227550506592,
"learning_rate": 9.39845346792621e-05,
"loss": 5.4462,
"step": 805
},
{
"epoch": 0.2099805573558004,
"grad_norm": 6.540446758270264,
"learning_rate": 9.3881118093963e-05,
"loss": 5.3208,
"step": 810
},
{
"epoch": 0.21127673363577446,
"grad_norm": 6.814016342163086,
"learning_rate": 9.377687801329674e-05,
"loss": 5.7917,
"step": 815
},
{
"epoch": 0.21257290991574854,
"grad_norm": 7.118403434753418,
"learning_rate": 9.367181639348564e-05,
"loss": 6.0615,
"step": 820
},
{
"epoch": 0.21386908619572262,
"grad_norm": 6.125025749206543,
"learning_rate": 9.356593520616948e-05,
"loss": 5.8048,
"step": 825
},
{
"epoch": 0.2151652624756967,
"grad_norm": 5.778207778930664,
"learning_rate": 9.34592364383684e-05,
"loss": 5.3762,
"step": 830
},
{
"epoch": 0.21646143875567078,
"grad_norm": 6.2385382652282715,
"learning_rate": 9.335172209244575e-05,
"loss": 5.8005,
"step": 835
},
{
"epoch": 0.21775761503564484,
"grad_norm": 5.940319538116455,
"learning_rate": 9.324339418607041e-05,
"loss": 5.1152,
"step": 840
},
{
"epoch": 0.21905379131561892,
"grad_norm": 6.526686191558838,
"learning_rate": 9.31342547521789e-05,
"loss": 5.3928,
"step": 845
},
{
"epoch": 0.220349967595593,
"grad_norm": 6.4096221923828125,
"learning_rate": 9.302430583893731e-05,
"loss": 5.1502,
"step": 850
},
{
"epoch": 0.22164614387556708,
"grad_norm": 5.7976603507995605,
"learning_rate": 9.291354950970286e-05,
"loss": 5.2698,
"step": 855
},
{
"epoch": 0.22294232015554116,
"grad_norm": 6.290134906768799,
"learning_rate": 9.28019878429851e-05,
"loss": 5.2121,
"step": 860
},
{
"epoch": 0.22423849643551522,
"grad_norm": 9.13962173461914,
"learning_rate": 9.268962293240701e-05,
"loss": 5.3281,
"step": 865
},
{
"epoch": 0.2255346727154893,
"grad_norm": 6.373178482055664,
"learning_rate": 9.257645688666556e-05,
"loss": 5.1852,
"step": 870
},
{
"epoch": 0.22683084899546338,
"grad_norm": 5.77469539642334,
"learning_rate": 9.246249182949233e-05,
"loss": 5.5195,
"step": 875
},
{
"epoch": 0.22812702527543746,
"grad_norm": 6.406745433807373,
"learning_rate": 9.234772989961352e-05,
"loss": 5.1316,
"step": 880
},
{
"epoch": 0.22942320155541154,
"grad_norm": 6.724541187286377,
"learning_rate": 9.22321732507098e-05,
"loss": 5.308,
"step": 885
},
{
"epoch": 0.23071937783538563,
"grad_norm": 6.221187591552734,
"learning_rate": 9.211582405137603e-05,
"loss": 5.167,
"step": 890
},
{
"epoch": 0.23201555411535968,
"grad_norm": 6.091421604156494,
"learning_rate": 9.199868448508037e-05,
"loss": 5.5671,
"step": 895
},
{
"epoch": 0.23331173039533376,
"grad_norm": 6.427068710327148,
"learning_rate": 9.188075675012351e-05,
"loss": 5.1115,
"step": 900
},
{
"epoch": 0.23331173039533376,
"eval_loss": 1.747745156288147,
"eval_runtime": 353.516,
"eval_samples_per_second": 9.595,
"eval_steps_per_second": 1.199,
"step": 900
},
{
"epoch": 0.23460790667530784,
"grad_norm": 7.048336982727051,
"learning_rate": 9.176204305959726e-05,
"loss": 5.455,
"step": 905
},
{
"epoch": 0.23590408295528192,
"grad_norm": 43.31245803833008,
"learning_rate": 9.164254564134305e-05,
"loss": 5.8,
"step": 910
},
{
"epoch": 0.237200259235256,
"grad_norm": 6.042270660400391,
"learning_rate": 9.15222667379102e-05,
"loss": 5.7257,
"step": 915
},
{
"epoch": 0.23849643551523006,
"grad_norm": 6.102289199829102,
"learning_rate": 9.140120860651374e-05,
"loss": 5.6096,
"step": 920
},
{
"epoch": 0.23979261179520414,
"grad_norm": 6.190640449523926,
"learning_rate": 9.127937351899211e-05,
"loss": 5.2916,
"step": 925
},
{
"epoch": 0.24108878807517822,
"grad_norm": 6.64203405380249,
"learning_rate": 9.115676376176448e-05,
"loss": 5.4412,
"step": 930
},
{
"epoch": 0.2423849643551523,
"grad_norm": 6.799017429351807,
"learning_rate": 9.103338163578787e-05,
"loss": 6.0282,
"step": 935
},
{
"epoch": 0.2436811406351264,
"grad_norm": 6.019811153411865,
"learning_rate": 9.090922945651399e-05,
"loss": 5.1639,
"step": 940
},
{
"epoch": 0.24497731691510044,
"grad_norm": 6.229288578033447,
"learning_rate": 9.078430955384572e-05,
"loss": 5.4325,
"step": 945
},
{
"epoch": 0.24627349319507452,
"grad_norm": 6.305727005004883,
"learning_rate": 9.065862427209349e-05,
"loss": 5.3317,
"step": 950
},
{
"epoch": 0.2475696694750486,
"grad_norm": 6.441644191741943,
"learning_rate": 9.053217596993114e-05,
"loss": 5.232,
"step": 955
},
{
"epoch": 0.24886584575502269,
"grad_norm": 5.868955135345459,
"learning_rate": 9.040496702035181e-05,
"loss": 5.9553,
"step": 960
},
{
"epoch": 0.25016202203499677,
"grad_norm": 6.898869037628174,
"learning_rate": 9.027699981062332e-05,
"loss": 5.355,
"step": 965
},
{
"epoch": 0.25145819831497085,
"grad_norm": 6.511346817016602,
"learning_rate": 9.014827674224333e-05,
"loss": 5.2525,
"step": 970
},
{
"epoch": 0.25275437459494493,
"grad_norm": 6.207486629486084,
"learning_rate": 9.001880023089441e-05,
"loss": 5.1674,
"step": 975
},
{
"epoch": 0.254050550874919,
"grad_norm": 6.383389949798584,
"learning_rate": 8.988857270639857e-05,
"loss": 5.2571,
"step": 980
},
{
"epoch": 0.25534672715489304,
"grad_norm": 6.2587103843688965,
"learning_rate": 8.975759661267173e-05,
"loss": 5.3479,
"step": 985
},
{
"epoch": 0.2566429034348671,
"grad_norm": 7.096968173980713,
"learning_rate": 8.962587440767787e-05,
"loss": 5.22,
"step": 990
},
{
"epoch": 0.2579390797148412,
"grad_norm": 6.323131561279297,
"learning_rate": 8.94934085633828e-05,
"loss": 5.5058,
"step": 995
},
{
"epoch": 0.2592352559948153,
"grad_norm": 6.179114818572998,
"learning_rate": 8.93602015657079e-05,
"loss": 5.6379,
"step": 1000
},
{
"epoch": 0.26053143227478937,
"grad_norm": 5.8347601890563965,
"learning_rate": 8.922625591448341e-05,
"loss": 5.1801,
"step": 1005
},
{
"epoch": 0.26182760855476345,
"grad_norm": 6.349084854125977,
"learning_rate": 8.90915741234015e-05,
"loss": 5.2985,
"step": 1010
},
{
"epoch": 0.26312378483473753,
"grad_norm": 6.430611610412598,
"learning_rate": 8.895615871996911e-05,
"loss": 5.4722,
"step": 1015
},
{
"epoch": 0.2644199611147116,
"grad_norm": 7.19846248626709,
"learning_rate": 8.882001224546057e-05,
"loss": 5.2468,
"step": 1020
},
{
"epoch": 0.2657161373946857,
"grad_norm": 6.426541805267334,
"learning_rate": 8.868313725486979e-05,
"loss": 4.756,
"step": 1025
},
{
"epoch": 0.2670123136746598,
"grad_norm": 6.313275337219238,
"learning_rate": 8.854553631686241e-05,
"loss": 5.3424,
"step": 1030
},
{
"epoch": 0.26830848995463386,
"grad_norm": 6.315749168395996,
"learning_rate": 8.84072120137276e-05,
"loss": 5.6435,
"step": 1035
},
{
"epoch": 0.2696046662346079,
"grad_norm": 6.214425086975098,
"learning_rate": 8.826816694132955e-05,
"loss": 5.5317,
"step": 1040
},
{
"epoch": 0.27090084251458196,
"grad_norm": 6.459805488586426,
"learning_rate": 8.812840370905873e-05,
"loss": 5.3225,
"step": 1045
},
{
"epoch": 0.27219701879455604,
"grad_norm": 6.470264911651611,
"learning_rate": 8.798792493978305e-05,
"loss": 5.948,
"step": 1050
},
{
"epoch": 0.2734931950745301,
"grad_norm": 7.647356033325195,
"learning_rate": 8.784673326979844e-05,
"loss": 5.611,
"step": 1055
},
{
"epoch": 0.2747893713545042,
"grad_norm": 6.665130615234375,
"learning_rate": 8.77048313487796e-05,
"loss": 5.6659,
"step": 1060
},
{
"epoch": 0.2760855476344783,
"grad_norm": 6.436264514923096,
"learning_rate": 8.756222183973008e-05,
"loss": 4.9427,
"step": 1065
},
{
"epoch": 0.27738172391445237,
"grad_norm": 6.688732147216797,
"learning_rate": 8.741890741893244e-05,
"loss": 4.7955,
"step": 1070
},
{
"epoch": 0.27867790019442645,
"grad_norm": 6.054559707641602,
"learning_rate": 8.727489077589793e-05,
"loss": 5.2091,
"step": 1075
},
{
"epoch": 0.27997407647440054,
"grad_norm": 6.237517356872559,
"learning_rate": 8.713017461331608e-05,
"loss": 5.6823,
"step": 1080
},
{
"epoch": 0.2812702527543746,
"grad_norm": 6.290626525878906,
"learning_rate": 8.698476164700395e-05,
"loss": 5.2632,
"step": 1085
},
{
"epoch": 0.2825664290343487,
"grad_norm": 6.412489891052246,
"learning_rate": 8.683865460585518e-05,
"loss": 5.3348,
"step": 1090
},
{
"epoch": 0.2838626053143227,
"grad_norm": 6.4528632164001465,
"learning_rate": 8.669185623178879e-05,
"loss": 5.1606,
"step": 1095
},
{
"epoch": 0.2851587815942968,
"grad_norm": 7.906050682067871,
"learning_rate": 8.654436927969767e-05,
"loss": 5.5195,
"step": 1100
},
{
"epoch": 0.2864549578742709,
"grad_norm": 6.647706031799316,
"learning_rate": 8.639619651739694e-05,
"loss": 4.9791,
"step": 1105
},
{
"epoch": 0.28775113415424497,
"grad_norm": 5.931995391845703,
"learning_rate": 8.624734072557199e-05,
"loss": 5.915,
"step": 1110
},
{
"epoch": 0.28904731043421905,
"grad_norm": 5.751828193664551,
"learning_rate": 8.609780469772623e-05,
"loss": 5.1009,
"step": 1115
},
{
"epoch": 0.29034348671419313,
"grad_norm": 6.851621627807617,
"learning_rate": 8.59475912401288e-05,
"loss": 5.3921,
"step": 1120
},
{
"epoch": 0.2916396629941672,
"grad_norm": 6.673628330230713,
"learning_rate": 8.579670317176179e-05,
"loss": 5.3902,
"step": 1125
},
{
"epoch": 0.2929358392741413,
"grad_norm": 6.149777412414551,
"learning_rate": 8.564514332426741e-05,
"loss": 5.479,
"step": 1130
},
{
"epoch": 0.2942320155541154,
"grad_norm": 5.980111122131348,
"learning_rate": 8.549291454189477e-05,
"loss": 5.154,
"step": 1135
},
{
"epoch": 0.29552819183408946,
"grad_norm": 6.860113143920898,
"learning_rate": 8.534001968144656e-05,
"loss": 5.5186,
"step": 1140
},
{
"epoch": 0.2968243681140635,
"grad_norm": 6.145979404449463,
"learning_rate": 8.51864616122255e-05,
"loss": 5.0269,
"step": 1145
},
{
"epoch": 0.29812054439403757,
"grad_norm": 5.662299633026123,
"learning_rate": 8.503224321598035e-05,
"loss": 5.5617,
"step": 1150
},
{
"epoch": 0.29941672067401165,
"grad_norm": 6.544498920440674,
"learning_rate": 8.48773673868519e-05,
"loss": 5.2969,
"step": 1155
},
{
"epoch": 0.30071289695398573,
"grad_norm": 6.1995391845703125,
"learning_rate": 8.472183703131873e-05,
"loss": 5.2686,
"step": 1160
},
{
"epoch": 0.3020090732339598,
"grad_norm": 6.684445381164551,
"learning_rate": 8.456565506814251e-05,
"loss": 5.3912,
"step": 1165
},
{
"epoch": 0.3033052495139339,
"grad_norm": 6.4725260734558105,
"learning_rate": 8.440882442831336e-05,
"loss": 5.1365,
"step": 1170
},
{
"epoch": 0.304601425793908,
"grad_norm": 6.407034397125244,
"learning_rate": 8.42513480549948e-05,
"loss": 5.3766,
"step": 1175
},
{
"epoch": 0.30589760207388206,
"grad_norm": 7.576113224029541,
"learning_rate": 8.409322890346847e-05,
"loss": 5.5042,
"step": 1180
},
{
"epoch": 0.30719377835385614,
"grad_norm": 6.397856712341309,
"learning_rate": 8.393446994107877e-05,
"loss": 5.2347,
"step": 1185
},
{
"epoch": 0.3084899546338302,
"grad_norm": 6.017226696014404,
"learning_rate": 8.377507414717706e-05,
"loss": 4.9617,
"step": 1190
},
{
"epoch": 0.3097861309138043,
"grad_norm": 7.199647903442383,
"learning_rate": 8.361504451306585e-05,
"loss": 5.1948,
"step": 1195
},
{
"epoch": 0.31108230719377833,
"grad_norm": 6.387392520904541,
"learning_rate": 8.345438404194259e-05,
"loss": 5.0078,
"step": 1200
},
{
"epoch": 0.31108230719377833,
"eval_loss": 1.7383368015289307,
"eval_runtime": 353.0434,
"eval_samples_per_second": 9.608,
"eval_steps_per_second": 1.201,
"step": 1200
},
{
"epoch": 0.3123784834737524,
"grad_norm": 6.844247817993164,
"learning_rate": 8.329309574884335e-05,
"loss": 5.4026,
"step": 1205
},
{
"epoch": 0.3136746597537265,
"grad_norm": 6.19937801361084,
"learning_rate": 8.313118266058619e-05,
"loss": 4.951,
"step": 1210
},
{
"epoch": 0.3149708360337006,
"grad_norm": 6.4328742027282715,
"learning_rate": 8.296864781571448e-05,
"loss": 5.263,
"step": 1215
},
{
"epoch": 0.31626701231367466,
"grad_norm": 6.944987773895264,
"learning_rate": 8.28054942644397e-05,
"loss": 5.4176,
"step": 1220
},
{
"epoch": 0.31756318859364874,
"grad_norm": 5.801463603973389,
"learning_rate": 8.264172506858434e-05,
"loss": 5.3857,
"step": 1225
},
{
"epoch": 0.3188593648736228,
"grad_norm": 6.501681804656982,
"learning_rate": 8.247734330152436e-05,
"loss": 5.3497,
"step": 1230
},
{
"epoch": 0.3201555411535969,
"grad_norm": 6.634169578552246,
"learning_rate": 8.231235204813157e-05,
"loss": 5.232,
"step": 1235
},
{
"epoch": 0.321451717433571,
"grad_norm": 6.158379554748535,
"learning_rate": 8.21467544047157e-05,
"loss": 5.0522,
"step": 1240
},
{
"epoch": 0.32274789371354506,
"grad_norm": 5.85274076461792,
"learning_rate": 8.19805534789663e-05,
"loss": 5.0862,
"step": 1245
},
{
"epoch": 0.32404406999351915,
"grad_norm": 6.4909348487854,
"learning_rate": 8.181375238989438e-05,
"loss": 5.1882,
"step": 1250
},
{
"epoch": 0.32534024627349317,
"grad_norm": 6.870463848114014,
"learning_rate": 8.164635426777404e-05,
"loss": 5.31,
"step": 1255
},
{
"epoch": 0.32663642255346725,
"grad_norm": 6.946374893188477,
"learning_rate": 8.147836225408347e-05,
"loss": 5.3501,
"step": 1260
},
{
"epoch": 0.32793259883344134,
"grad_norm": 6.639915943145752,
"learning_rate": 8.130977950144621e-05,
"loss": 5.0554,
"step": 1265
},
{
"epoch": 0.3292287751134154,
"grad_norm": 5.978764057159424,
"learning_rate": 8.11406091735719e-05,
"loss": 4.9968,
"step": 1270
},
{
"epoch": 0.3305249513933895,
"grad_norm": 6.2093706130981445,
"learning_rate": 8.097085444519688e-05,
"loss": 5.2527,
"step": 1275
},
{
"epoch": 0.3318211276733636,
"grad_norm": 6.899875164031982,
"learning_rate": 8.080051850202468e-05,
"loss": 5.0845,
"step": 1280
},
{
"epoch": 0.33311730395333766,
"grad_norm": 6.9142656326293945,
"learning_rate": 8.062960454066619e-05,
"loss": 5.4892,
"step": 1285
},
{
"epoch": 0.33441348023331174,
"grad_norm": 7.387802600860596,
"learning_rate": 8.04581157685797e-05,
"loss": 5.49,
"step": 1290
},
{
"epoch": 0.3357096565132858,
"grad_norm": 6.345734119415283,
"learning_rate": 8.028605540401065e-05,
"loss": 5.2043,
"step": 1295
},
{
"epoch": 0.3370058327932599,
"grad_norm": 6.245737552642822,
"learning_rate": 8.011342667593132e-05,
"loss": 4.9001,
"step": 1300
},
{
"epoch": 0.33830200907323393,
"grad_norm": 6.332602500915527,
"learning_rate": 7.994023282398017e-05,
"loss": 5.4487,
"step": 1305
},
{
"epoch": 0.339598185353208,
"grad_norm": 6.836564064025879,
"learning_rate": 7.976647709840104e-05,
"loss": 5.1026,
"step": 1310
},
{
"epoch": 0.3408943616331821,
"grad_norm": 6.576712608337402,
"learning_rate": 7.959216275998223e-05,
"loss": 5.413,
"step": 1315
},
{
"epoch": 0.3421905379131562,
"grad_norm": 6.615803241729736,
"learning_rate": 7.94172930799952e-05,
"loss": 5.2105,
"step": 1320
},
{
"epoch": 0.34348671419313026,
"grad_norm": 6.192866325378418,
"learning_rate": 7.924187134013323e-05,
"loss": 4.9693,
"step": 1325
},
{
"epoch": 0.34478289047310434,
"grad_norm": 7.869561195373535,
"learning_rate": 7.906590083244991e-05,
"loss": 5.1879,
"step": 1330
},
{
"epoch": 0.3460790667530784,
"grad_norm": 6.713994979858398,
"learning_rate": 7.888938485929718e-05,
"loss": 5.12,
"step": 1335
},
{
"epoch": 0.3473752430330525,
"grad_norm": 6.530569076538086,
"learning_rate": 7.871232673326356e-05,
"loss": 5.3551,
"step": 1340
},
{
"epoch": 0.3486714193130266,
"grad_norm": 6.053219318389893,
"learning_rate": 7.853472977711183e-05,
"loss": 4.961,
"step": 1345
},
{
"epoch": 0.34996759559300067,
"grad_norm": 7.1648664474487305,
"learning_rate": 7.835659732371671e-05,
"loss": 5.2817,
"step": 1350
},
{
"epoch": 0.35126377187297475,
"grad_norm": 8.36543083190918,
"learning_rate": 7.817793271600242e-05,
"loss": 5.1267,
"step": 1355
},
{
"epoch": 0.3525599481529488,
"grad_norm": 6.720150470733643,
"learning_rate": 7.799873930687978e-05,
"loss": 5.3077,
"step": 1360
},
{
"epoch": 0.35385612443292286,
"grad_norm": 7.16845178604126,
"learning_rate": 7.781902045918337e-05,
"loss": 5.0731,
"step": 1365
},
{
"epoch": 0.35515230071289694,
"grad_norm": 6.205852508544922,
"learning_rate": 7.763877954560848e-05,
"loss": 5.0185,
"step": 1370
},
{
"epoch": 0.356448476992871,
"grad_norm": 6.484716892242432,
"learning_rate": 7.745801994864766e-05,
"loss": 4.8446,
"step": 1375
},
{
"epoch": 0.3577446532728451,
"grad_norm": 6.074382781982422,
"learning_rate": 7.727674506052743e-05,
"loss": 5.4281,
"step": 1380
},
{
"epoch": 0.3590408295528192,
"grad_norm": 6.475484371185303,
"learning_rate": 7.709495828314448e-05,
"loss": 5.2827,
"step": 1385
},
{
"epoch": 0.36033700583279327,
"grad_norm": 6.075077056884766,
"learning_rate": 7.691266302800186e-05,
"loss": 5.2338,
"step": 1390
},
{
"epoch": 0.36163318211276735,
"grad_norm": 6.620121955871582,
"learning_rate": 7.6729862716145e-05,
"loss": 5.1154,
"step": 1395
},
{
"epoch": 0.36292935839274143,
"grad_norm": 6.225570201873779,
"learning_rate": 7.654656077809747e-05,
"loss": 5.1745,
"step": 1400
},
{
"epoch": 0.3642255346727155,
"grad_norm": 6.469938278198242,
"learning_rate": 7.63627606537966e-05,
"loss": 5.2555,
"step": 1405
},
{
"epoch": 0.36552171095268954,
"grad_norm": 6.713720321655273,
"learning_rate": 7.617846579252897e-05,
"loss": 5.2147,
"step": 1410
},
{
"epoch": 0.3668178872326636,
"grad_norm": 5.663670063018799,
"learning_rate": 7.599367965286559e-05,
"loss": 4.6885,
"step": 1415
},
{
"epoch": 0.3681140635126377,
"grad_norm": 6.744603157043457,
"learning_rate": 7.580840570259713e-05,
"loss": 5.572,
"step": 1420
},
{
"epoch": 0.3694102397926118,
"grad_norm": 6.041903018951416,
"learning_rate": 7.562264741866869e-05,
"loss": 4.9152,
"step": 1425
},
{
"epoch": 0.37070641607258586,
"grad_norm": 7.129952907562256,
"learning_rate": 7.543640828711466e-05,
"loss": 4.7115,
"step": 1430
},
{
"epoch": 0.37200259235255995,
"grad_norm": 6.357120990753174,
"learning_rate": 7.524969180299325e-05,
"loss": 5.0026,
"step": 1435
},
{
"epoch": 0.37329876863253403,
"grad_norm": 5.915050506591797,
"learning_rate": 7.506250147032088e-05,
"loss": 5.3111,
"step": 1440
},
{
"epoch": 0.3745949449125081,
"grad_norm": 6.719303607940674,
"learning_rate": 7.487484080200653e-05,
"loss": 5.6598,
"step": 1445
},
{
"epoch": 0.3758911211924822,
"grad_norm": 7.025637626647949,
"learning_rate": 7.468671331978567e-05,
"loss": 5.3481,
"step": 1450
},
{
"epoch": 0.3771872974724563,
"grad_norm": 7.205336570739746,
"learning_rate": 7.449812255415423e-05,
"loss": 5.3031,
"step": 1455
},
{
"epoch": 0.37848347375243035,
"grad_norm": 5.23866081237793,
"learning_rate": 7.430907204430242e-05,
"loss": 4.8644,
"step": 1460
},
{
"epoch": 0.3797796500324044,
"grad_norm": 6.661409854888916,
"learning_rate": 7.411956533804818e-05,
"loss": 5.0625,
"step": 1465
},
{
"epoch": 0.38107582631237846,
"grad_norm": 7.062594890594482,
"learning_rate": 7.39296059917707e-05,
"loss": 5.1213,
"step": 1470
},
{
"epoch": 0.38237200259235254,
"grad_norm": 6.861457347869873,
"learning_rate": 7.373919757034362e-05,
"loss": 5.4799,
"step": 1475
},
{
"epoch": 0.3836681788723266,
"grad_norm": 6.35822057723999,
"learning_rate": 7.354834364706818e-05,
"loss": 5.4426,
"step": 1480
},
{
"epoch": 0.3849643551523007,
"grad_norm": 6.43895959854126,
"learning_rate": 7.335704780360608e-05,
"loss": 4.9054,
"step": 1485
},
{
"epoch": 0.3862605314322748,
"grad_norm": 6.686366558074951,
"learning_rate": 7.316531362991239e-05,
"loss": 4.7958,
"step": 1490
},
{
"epoch": 0.38755670771224887,
"grad_norm": 5.39023494720459,
"learning_rate": 7.297314472416805e-05,
"loss": 4.9293,
"step": 1495
},
{
"epoch": 0.38885288399222295,
"grad_norm": 6.572624206542969,
"learning_rate": 7.278054469271245e-05,
"loss": 5.1606,
"step": 1500
},
{
"epoch": 0.38885288399222295,
"eval_loss": 1.7062296867370605,
"eval_runtime": 353.1925,
"eval_samples_per_second": 9.604,
"eval_steps_per_second": 1.2,
"step": 1500
},
{
"epoch": 0.39014906027219703,
"grad_norm": 5.829883098602295,
"learning_rate": 7.258751714997568e-05,
"loss": 5.0312,
"step": 1505
},
{
"epoch": 0.3914452365521711,
"grad_norm": 6.756894588470459,
"learning_rate": 7.239406571841068e-05,
"loss": 5.0417,
"step": 1510
},
{
"epoch": 0.3927414128321452,
"grad_norm": 7.765864372253418,
"learning_rate": 7.22001940284254e-05,
"loss": 4.7524,
"step": 1515
},
{
"epoch": 0.3940375891121192,
"grad_norm": 7.117984294891357,
"learning_rate": 7.200590571831447e-05,
"loss": 5.276,
"step": 1520
},
{
"epoch": 0.3953337653920933,
"grad_norm": 6.583863735198975,
"learning_rate": 7.181120443419113e-05,
"loss": 4.4836,
"step": 1525
},
{
"epoch": 0.3966299416720674,
"grad_norm": 6.2899651527404785,
"learning_rate": 7.161609382991861e-05,
"loss": 4.8327,
"step": 1530
},
{
"epoch": 0.39792611795204147,
"grad_norm": 10.40349292755127,
"learning_rate": 7.142057756704168e-05,
"loss": 5.2758,
"step": 1535
},
{
"epoch": 0.39922229423201555,
"grad_norm": 6.652336120605469,
"learning_rate": 7.122465931471794e-05,
"loss": 5.19,
"step": 1540
},
{
"epoch": 0.40051847051198963,
"grad_norm": 6.063689708709717,
"learning_rate": 7.102834274964889e-05,
"loss": 5.4008,
"step": 1545
},
{
"epoch": 0.4018146467919637,
"grad_norm": 6.221471309661865,
"learning_rate": 7.083163155601097e-05,
"loss": 4.8305,
"step": 1550
},
{
"epoch": 0.4031108230719378,
"grad_norm": 6.177482604980469,
"learning_rate": 7.063452942538644e-05,
"loss": 5.0628,
"step": 1555
},
{
"epoch": 0.4044069993519119,
"grad_norm": 5.943436622619629,
"learning_rate": 7.043704005669405e-05,
"loss": 5.0516,
"step": 1560
},
{
"epoch": 0.40570317563188596,
"grad_norm": 6.601740837097168,
"learning_rate": 7.023916715611969e-05,
"loss": 4.8331,
"step": 1565
},
{
"epoch": 0.40699935191186,
"grad_norm": 6.374989986419678,
"learning_rate": 7.004091443704681e-05,
"loss": 5.3084,
"step": 1570
},
{
"epoch": 0.40829552819183407,
"grad_norm": 6.466265678405762,
"learning_rate": 6.984228561998669e-05,
"loss": 5.2695,
"step": 1575
},
{
"epoch": 0.40959170447180815,
"grad_norm": 6.188666343688965,
"learning_rate": 6.964328443250867e-05,
"loss": 4.8792,
"step": 1580
},
{
"epoch": 0.41088788075178223,
"grad_norm": 5.995561122894287,
"learning_rate": 6.944391460917021e-05,
"loss": 4.8856,
"step": 1585
},
{
"epoch": 0.4121840570317563,
"grad_norm": 6.418734073638916,
"learning_rate": 6.924417989144674e-05,
"loss": 5.2133,
"step": 1590
},
{
"epoch": 0.4134802333117304,
"grad_norm": 6.242722034454346,
"learning_rate": 6.90440840276615e-05,
"loss": 5.1887,
"step": 1595
},
{
"epoch": 0.4147764095917045,
"grad_norm": 6.374722003936768,
"learning_rate": 6.884363077291517e-05,
"loss": 5.5929,
"step": 1600
},
{
"epoch": 0.41607258587167856,
"grad_norm": 6.672064781188965,
"learning_rate": 6.864282388901544e-05,
"loss": 5.5616,
"step": 1605
},
{
"epoch": 0.41736876215165264,
"grad_norm": 6.3545732498168945,
"learning_rate": 6.844166714440635e-05,
"loss": 5.1473,
"step": 1610
},
{
"epoch": 0.4186649384316267,
"grad_norm": 8.2235107421875,
"learning_rate": 6.824016431409762e-05,
"loss": 4.7892,
"step": 1615
},
{
"epoch": 0.4199611147116008,
"grad_norm": 6.904574394226074,
"learning_rate": 6.803831917959381e-05,
"loss": 5.3336,
"step": 1620
},
{
"epoch": 0.42125729099157483,
"grad_norm": 6.573385715484619,
"learning_rate": 6.783613552882329e-05,
"loss": 5.1198,
"step": 1625
},
{
"epoch": 0.4225534672715489,
"grad_norm": 6.856905460357666,
"learning_rate": 6.763361715606723e-05,
"loss": 5.2217,
"step": 1630
},
{
"epoch": 0.423849643551523,
"grad_norm": 6.580875396728516,
"learning_rate": 6.743076786188833e-05,
"loss": 5.2903,
"step": 1635
},
{
"epoch": 0.4251458198314971,
"grad_norm": 6.890357971191406,
"learning_rate": 6.72275914530596e-05,
"loss": 4.811,
"step": 1640
},
{
"epoch": 0.42644199611147116,
"grad_norm": 6.392624378204346,
"learning_rate": 6.702409174249275e-05,
"loss": 5.2851,
"step": 1645
},
{
"epoch": 0.42773817239144524,
"grad_norm": 5.905599117279053,
"learning_rate": 6.682027254916686e-05,
"loss": 4.8389,
"step": 1650
},
{
"epoch": 0.4290343486714193,
"grad_norm": 6.269746780395508,
"learning_rate": 6.661613769805644e-05,
"loss": 5.1311,
"step": 1655
},
{
"epoch": 0.4303305249513934,
"grad_norm": 6.146518230438232,
"learning_rate": 6.641169102005991e-05,
"loss": 4.8601,
"step": 1660
},
{
"epoch": 0.4316267012313675,
"grad_norm": 6.379298686981201,
"learning_rate": 6.620693635192754e-05,
"loss": 5.3775,
"step": 1665
},
{
"epoch": 0.43292287751134156,
"grad_norm": 6.051974773406982,
"learning_rate": 6.600187753618951e-05,
"loss": 5.0664,
"step": 1670
},
{
"epoch": 0.43421905379131565,
"grad_norm": 6.480034828186035,
"learning_rate": 6.57965184210838e-05,
"loss": 5.0112,
"step": 1675
},
{
"epoch": 0.43551523007128967,
"grad_norm": 5.577784538269043,
"learning_rate": 6.559086286048394e-05,
"loss": 4.684,
"step": 1680
},
{
"epoch": 0.43681140635126375,
"grad_norm": 6.112408638000488,
"learning_rate": 6.53849147138267e-05,
"loss": 4.7755,
"step": 1685
},
{
"epoch": 0.43810758263123784,
"grad_norm": 6.673699378967285,
"learning_rate": 6.517867784603972e-05,
"loss": 5.4472,
"step": 1690
},
{
"epoch": 0.4394037589112119,
"grad_norm": 5.8759446144104,
"learning_rate": 6.497215612746886e-05,
"loss": 4.9053,
"step": 1695
},
{
"epoch": 0.440699935191186,
"grad_norm": 6.411087989807129,
"learning_rate": 6.47653534338057e-05,
"loss": 4.6784,
"step": 1700
},
{
"epoch": 0.4419961114711601,
"grad_norm": 5.656676769256592,
"learning_rate": 6.455827364601468e-05,
"loss": 4.7073,
"step": 1705
},
{
"epoch": 0.44329228775113416,
"grad_norm": 7.167407035827637,
"learning_rate": 6.435092065026035e-05,
"loss": 4.815,
"step": 1710
},
{
"epoch": 0.44458846403110824,
"grad_norm": 7.390071868896484,
"learning_rate": 6.414329833783446e-05,
"loss": 5.2979,
"step": 1715
},
{
"epoch": 0.4458846403110823,
"grad_norm": 6.286187171936035,
"learning_rate": 6.393541060508283e-05,
"loss": 4.428,
"step": 1720
},
{
"epoch": 0.4471808165910564,
"grad_norm": 6.020079135894775,
"learning_rate": 6.372726135333234e-05,
"loss": 5.1399,
"step": 1725
},
{
"epoch": 0.44847699287103043,
"grad_norm": 6.759265899658203,
"learning_rate": 6.351885448881765e-05,
"loss": 5.0373,
"step": 1730
},
{
"epoch": 0.4497731691510045,
"grad_norm": 6.868116855621338,
"learning_rate": 6.331019392260791e-05,
"loss": 5.334,
"step": 1735
},
{
"epoch": 0.4510693454309786,
"grad_norm": 6.369646072387695,
"learning_rate": 6.310128357053339e-05,
"loss": 4.9139,
"step": 1740
},
{
"epoch": 0.4523655217109527,
"grad_norm": 5.470962047576904,
"learning_rate": 6.28921273531119e-05,
"loss": 4.8604,
"step": 1745
},
{
"epoch": 0.45366169799092676,
"grad_norm": 6.49376916885376,
"learning_rate": 6.268272919547537e-05,
"loss": 5.0324,
"step": 1750
},
{
"epoch": 0.45495787427090084,
"grad_norm": 6.682295799255371,
"learning_rate": 6.247309302729607e-05,
"loss": 4.9259,
"step": 1755
},
{
"epoch": 0.4562540505508749,
"grad_norm": 6.105667591094971,
"learning_rate": 6.226322278271286e-05,
"loss": 5.2015,
"step": 1760
},
{
"epoch": 0.457550226830849,
"grad_norm": 6.069522380828857,
"learning_rate": 6.205312240025745e-05,
"loss": 4.7538,
"step": 1765
},
{
"epoch": 0.4588464031108231,
"grad_norm": 6.145866394042969,
"learning_rate": 6.184279582278039e-05,
"loss": 4.8113,
"step": 1770
},
{
"epoch": 0.46014257939079717,
"grad_norm": 6.4372358322143555,
"learning_rate": 6.163224699737718e-05,
"loss": 5.0577,
"step": 1775
},
{
"epoch": 0.46143875567077125,
"grad_norm": 6.565794944763184,
"learning_rate": 6.142147987531407e-05,
"loss": 5.0166,
"step": 1780
},
{
"epoch": 0.4627349319507453,
"grad_norm": 6.475300312042236,
"learning_rate": 6.121049841195402e-05,
"loss": 4.7242,
"step": 1785
},
{
"epoch": 0.46403110823071936,
"grad_norm": 6.883950710296631,
"learning_rate": 6.099930656668241e-05,
"loss": 5.2766,
"step": 1790
},
{
"epoch": 0.46532728451069344,
"grad_norm": 6.661801338195801,
"learning_rate": 6.078790830283276e-05,
"loss": 5.1597,
"step": 1795
},
{
"epoch": 0.4666234607906675,
"grad_norm": 6.135336875915527,
"learning_rate": 6.0576307587612347e-05,
"loss": 4.6236,
"step": 1800
},
{
"epoch": 0.4666234607906675,
"eval_loss": 1.6923874616622925,
"eval_runtime": 353.1926,
"eval_samples_per_second": 9.604,
"eval_steps_per_second": 1.2,
"step": 1800
}
],
"logging_steps": 5,
"max_steps": 3858,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.140322113650688e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}