{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.961240310077519,
"eval_steps": 500,
"global_step": 240,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.020671834625323,
"grad_norm": 2.9160615895696287,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.8883,
"step": 1
},
{
"epoch": 0.041343669250646,
"grad_norm": 2.8580123177883006,
"learning_rate": 6.666666666666667e-06,
"loss": 0.8848,
"step": 2
},
{
"epoch": 0.06201550387596899,
"grad_norm": 2.7122432751897176,
"learning_rate": 1e-05,
"loss": 0.8795,
"step": 3
},
{
"epoch": 0.082687338501292,
"grad_norm": 1.513030205425536,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.8245,
"step": 4
},
{
"epoch": 0.10335917312661498,
"grad_norm": 1.3443713492280498,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.7893,
"step": 5
},
{
"epoch": 0.12403100775193798,
"grad_norm": 1.1804286091298655,
"learning_rate": 2e-05,
"loss": 0.7386,
"step": 6
},
{
"epoch": 0.14470284237726097,
"grad_norm": 1.4145959109105715,
"learning_rate": 2.3333333333333336e-05,
"loss": 0.7205,
"step": 7
},
{
"epoch": 0.165374677002584,
"grad_norm": 1.2357655230641855,
"learning_rate": 2.6666666666666667e-05,
"loss": 0.6846,
"step": 8
},
{
"epoch": 0.18604651162790697,
"grad_norm": 1.0044529380434546,
"learning_rate": 3.0000000000000004e-05,
"loss": 0.6703,
"step": 9
},
{
"epoch": 0.20671834625322996,
"grad_norm": 0.7736903323340503,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.6625,
"step": 10
},
{
"epoch": 0.22739018087855298,
"grad_norm": 0.8691083811443499,
"learning_rate": 3.6666666666666666e-05,
"loss": 0.639,
"step": 11
},
{
"epoch": 0.24806201550387597,
"grad_norm": 0.7248603690109244,
"learning_rate": 4e-05,
"loss": 0.6387,
"step": 12
},
{
"epoch": 0.268733850129199,
"grad_norm": 0.8294236796702565,
"learning_rate": 4.3333333333333334e-05,
"loss": 0.6344,
"step": 13
},
{
"epoch": 0.28940568475452194,
"grad_norm": 0.5760472222889121,
"learning_rate": 4.666666666666667e-05,
"loss": 0.6157,
"step": 14
},
{
"epoch": 0.31007751937984496,
"grad_norm": 0.5595513798990738,
"learning_rate": 5e-05,
"loss": 0.6132,
"step": 15
},
{
"epoch": 0.330749354005168,
"grad_norm": 0.6094355123731108,
"learning_rate": 5.333333333333333e-05,
"loss": 0.607,
"step": 16
},
{
"epoch": 0.35142118863049093,
"grad_norm": 0.4550744555441343,
"learning_rate": 5.666666666666668e-05,
"loss": 0.597,
"step": 17
},
{
"epoch": 0.37209302325581395,
"grad_norm": 0.5503338541742518,
"learning_rate": 6.000000000000001e-05,
"loss": 0.5883,
"step": 18
},
{
"epoch": 0.39276485788113696,
"grad_norm": 0.38787592050569225,
"learning_rate": 6.333333333333333e-05,
"loss": 0.585,
"step": 19
},
{
"epoch": 0.4134366925064599,
"grad_norm": 0.5311180440664859,
"learning_rate": 6.666666666666667e-05,
"loss": 0.5823,
"step": 20
},
{
"epoch": 0.43410852713178294,
"grad_norm": 0.5233803608939733,
"learning_rate": 7.000000000000001e-05,
"loss": 0.5826,
"step": 21
},
{
"epoch": 0.45478036175710596,
"grad_norm": 0.5080450227129891,
"learning_rate": 7.333333333333333e-05,
"loss": 0.5725,
"step": 22
},
{
"epoch": 0.4754521963824289,
"grad_norm": 0.6170489561809843,
"learning_rate": 7.666666666666668e-05,
"loss": 0.5794,
"step": 23
},
{
"epoch": 0.49612403100775193,
"grad_norm": 0.8492871852161974,
"learning_rate": 8e-05,
"loss": 0.5739,
"step": 24
},
{
"epoch": 0.5167958656330749,
"grad_norm": 1.0957049612040843,
"learning_rate": 7.99957692770843e-05,
"loss": 0.5738,
"step": 25
},
{
"epoch": 0.537467700258398,
"grad_norm": 1.1467346259917117,
"learning_rate": 7.998307800328803e-05,
"loss": 0.5676,
"step": 26
},
{
"epoch": 0.5581395348837209,
"grad_norm": 0.9479677758413664,
"learning_rate": 7.996192886327432e-05,
"loss": 0.5626,
"step": 27
},
{
"epoch": 0.5788113695090439,
"grad_norm": 0.645380904520113,
"learning_rate": 7.993232633085074e-05,
"loss": 0.5577,
"step": 28
},
{
"epoch": 0.599483204134367,
"grad_norm": 0.7182744075304529,
"learning_rate": 7.98942766680229e-05,
"loss": 0.5588,
"step": 29
},
{
"epoch": 0.6201550387596899,
"grad_norm": 0.5127412291089614,
"learning_rate": 7.984778792366983e-05,
"loss": 0.5576,
"step": 30
},
{
"epoch": 0.6408268733850129,
"grad_norm": 0.5717315410000132,
"learning_rate": 7.979286993184134e-05,
"loss": 0.5543,
"step": 31
},
{
"epoch": 0.661498708010336,
"grad_norm": 0.574802710774332,
"learning_rate": 7.972953430967773e-05,
"loss": 0.5546,
"step": 32
},
{
"epoch": 0.6821705426356589,
"grad_norm": 0.38071050250313326,
"learning_rate": 7.965779445495243e-05,
"loss": 0.5423,
"step": 33
},
{
"epoch": 0.7028423772609819,
"grad_norm": 0.560940638806937,
"learning_rate": 7.957766554323778e-05,
"loss": 0.5549,
"step": 34
},
{
"epoch": 0.7235142118863049,
"grad_norm": 0.3195456558063643,
"learning_rate": 7.948916452469497e-05,
"loss": 0.5381,
"step": 35
},
{
"epoch": 0.7441860465116279,
"grad_norm": 0.5120309870626497,
"learning_rate": 7.939231012048833e-05,
"loss": 0.5334,
"step": 36
},
{
"epoch": 0.7648578811369509,
"grad_norm": 0.41200122065705974,
"learning_rate": 7.928712281882523e-05,
"loss": 0.546,
"step": 37
},
{
"epoch": 0.7855297157622739,
"grad_norm": 0.34801915896030955,
"learning_rate": 7.917362487062207e-05,
"loss": 0.538,
"step": 38
},
{
"epoch": 0.8062015503875969,
"grad_norm": 0.4814496611227264,
"learning_rate": 7.905184028479734e-05,
"loss": 0.5395,
"step": 39
},
{
"epoch": 0.8268733850129198,
"grad_norm": 0.343357625146377,
"learning_rate": 7.892179482319297e-05,
"loss": 0.5351,
"step": 40
},
{
"epoch": 0.8475452196382429,
"grad_norm": 0.3732374596704409,
"learning_rate": 7.878351599512465e-05,
"loss": 0.5332,
"step": 41
},
{
"epoch": 0.8682170542635659,
"grad_norm": 0.35877511208535684,
"learning_rate": 7.863703305156273e-05,
"loss": 0.5255,
"step": 42
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.2718043315180321,
"learning_rate": 7.848237697894453e-05,
"loss": 0.5289,
"step": 43
},
{
"epoch": 0.9095607235142119,
"grad_norm": 0.3538773839345982,
"learning_rate": 7.831958049261956e-05,
"loss": 0.53,
"step": 44
},
{
"epoch": 0.9302325581395349,
"grad_norm": 0.23197832751573458,
"learning_rate": 7.814867802992907e-05,
"loss": 0.528,
"step": 45
},
{
"epoch": 0.9509043927648578,
"grad_norm": 0.3210740150407701,
"learning_rate": 7.796970574292136e-05,
"loss": 0.5281,
"step": 46
},
{
"epoch": 0.9715762273901809,
"grad_norm": 0.3700032416130845,
"learning_rate": 7.77827014907042e-05,
"loss": 0.5214,
"step": 47
},
{
"epoch": 0.9922480620155039,
"grad_norm": 0.3009183027050577,
"learning_rate": 7.758770483143634e-05,
"loss": 0.5269,
"step": 48
},
{
"epoch": 1.0129198966408268,
"grad_norm": 0.3896828923021403,
"learning_rate": 7.738475701395955e-05,
"loss": 0.5093,
"step": 49
},
{
"epoch": 1.0335917312661498,
"grad_norm": 0.543202361441145,
"learning_rate": 7.71739009690729e-05,
"loss": 0.4955,
"step": 50
},
{
"epoch": 1.054263565891473,
"grad_norm": 0.7625591010952377,
"learning_rate": 7.695518130045147e-05,
"loss": 0.4969,
"step": 51
},
{
"epoch": 1.074935400516796,
"grad_norm": 1.193784060003013,
"learning_rate": 7.672864427521097e-05,
"loss": 0.5087,
"step": 52
},
{
"epoch": 1.0956072351421189,
"grad_norm": 0.7752291239959456,
"learning_rate": 7.649433781412058e-05,
"loss": 0.5012,
"step": 53
},
{
"epoch": 1.1162790697674418,
"grad_norm": 0.6474587717802546,
"learning_rate": 7.625231148146601e-05,
"loss": 0.4947,
"step": 54
},
{
"epoch": 1.1369509043927648,
"grad_norm": 0.7738304727686638,
"learning_rate": 7.600261647456485e-05,
"loss": 0.501,
"step": 55
},
{
"epoch": 1.1576227390180878,
"grad_norm": 0.6980515832191755,
"learning_rate": 7.57453056129365e-05,
"loss": 0.4909,
"step": 56
},
{
"epoch": 1.178294573643411,
"grad_norm": 0.6004371658326068,
"learning_rate": 7.548043332712887e-05,
"loss": 0.4889,
"step": 57
},
{
"epoch": 1.198966408268734,
"grad_norm": 0.6165348115304269,
"learning_rate": 7.520805564720444e-05,
"loss": 0.4916,
"step": 58
},
{
"epoch": 1.2196382428940569,
"grad_norm": 0.6291962094809461,
"learning_rate": 7.492823019088785e-05,
"loss": 0.4876,
"step": 59
},
{
"epoch": 1.2403100775193798,
"grad_norm": 0.5151425388987184,
"learning_rate": 7.464101615137756e-05,
"loss": 0.4903,
"step": 60
},
{
"epoch": 1.2609819121447028,
"grad_norm": 0.6152043162282845,
"learning_rate": 7.434647428482453e-05,
"loss": 0.4852,
"step": 61
},
{
"epoch": 1.2816537467700257,
"grad_norm": 0.4108530991008805,
"learning_rate": 7.404466689747999e-05,
"loss": 0.4824,
"step": 62
},
{
"epoch": 1.302325581395349,
"grad_norm": 0.5619333094656168,
"learning_rate": 7.373565783251544e-05,
"loss": 0.4862,
"step": 63
},
{
"epoch": 1.322997416020672,
"grad_norm": 0.36736616896387303,
"learning_rate": 7.341951245651747e-05,
"loss": 0.4816,
"step": 64
},
{
"epoch": 1.3436692506459949,
"grad_norm": 0.5007899896930137,
"learning_rate": 7.309629764566042e-05,
"loss": 0.4849,
"step": 65
},
{
"epoch": 1.3643410852713178,
"grad_norm": 0.3567245531783105,
"learning_rate": 7.276608177155968e-05,
"loss": 0.4806,
"step": 66
},
{
"epoch": 1.3850129198966408,
"grad_norm": 0.4042585004384195,
"learning_rate": 7.242893468680849e-05,
"loss": 0.479,
"step": 67
},
{
"epoch": 1.405684754521964,
"grad_norm": 0.3071251372629263,
"learning_rate": 7.208492771020176e-05,
"loss": 0.4708,
"step": 68
},
{
"epoch": 1.4263565891472867,
"grad_norm": 0.3178098545746679,
"learning_rate": 7.173413361164941e-05,
"loss": 0.4743,
"step": 69
},
{
"epoch": 1.4470284237726099,
"grad_norm": 0.32791656769788197,
"learning_rate": 7.137662659678303e-05,
"loss": 0.479,
"step": 70
},
{
"epoch": 1.4677002583979328,
"grad_norm": 0.31073048837177464,
"learning_rate": 7.101248229125864e-05,
"loss": 0.4748,
"step": 71
},
{
"epoch": 1.4883720930232558,
"grad_norm": 0.23092670638656684,
"learning_rate": 7.064177772475912e-05,
"loss": 0.483,
"step": 72
},
{
"epoch": 1.509043927648579,
"grad_norm": 0.29202717623796903,
"learning_rate": 7.026459131469972e-05,
"loss": 0.4806,
"step": 73
},
{
"epoch": 1.5297157622739017,
"grad_norm": 0.24757644213273783,
"learning_rate": 6.988100284963985e-05,
"loss": 0.4758,
"step": 74
},
{
"epoch": 1.550387596899225,
"grad_norm": 0.21393996054013467,
"learning_rate": 6.949109347240496e-05,
"loss": 0.478,
"step": 75
},
{
"epoch": 1.5710594315245479,
"grad_norm": 0.21940059670661766,
"learning_rate": 6.909494566292195e-05,
"loss": 0.4794,
"step": 76
},
{
"epoch": 1.5917312661498708,
"grad_norm": 0.25120510290065323,
"learning_rate": 6.869264322077158e-05,
"loss": 0.4741,
"step": 77
},
{
"epoch": 1.6124031007751938,
"grad_norm": 0.26922385323839254,
"learning_rate": 6.828427124746191e-05,
"loss": 0.4719,
"step": 78
},
{
"epoch": 1.6330749354005167,
"grad_norm": 0.20344576961977537,
"learning_rate": 6.786991612842621e-05,
"loss": 0.4778,
"step": 79
},
{
"epoch": 1.65374677002584,
"grad_norm": 0.22604786354074988,
"learning_rate": 6.744966551474936e-05,
"loss": 0.4761,
"step": 80
},
{
"epoch": 1.6744186046511627,
"grad_norm": 0.18625757521916195,
"learning_rate": 6.702360830462642e-05,
"loss": 0.475,
"step": 81
},
{
"epoch": 1.6950904392764858,
"grad_norm": 0.20462391670532523,
"learning_rate": 6.659183462455751e-05,
"loss": 0.4752,
"step": 82
},
{
"epoch": 1.7157622739018088,
"grad_norm": 0.18546909132461142,
"learning_rate": 6.615443581028279e-05,
"loss": 0.4773,
"step": 83
},
{
"epoch": 1.7364341085271318,
"grad_norm": 0.18510363335943614,
"learning_rate": 6.571150438746157e-05,
"loss": 0.4695,
"step": 84
},
{
"epoch": 1.757105943152455,
"grad_norm": 0.15034980115459998,
"learning_rate": 6.526313405209991e-05,
"loss": 0.4663,
"step": 85
},
{
"epoch": 1.7777777777777777,
"grad_norm": 0.19615925097698703,
"learning_rate": 6.480941965073041e-05,
"loss": 0.4749,
"step": 86
},
{
"epoch": 1.7984496124031009,
"grad_norm": 0.2654011493760909,
"learning_rate": 6.435045716034883e-05,
"loss": 0.4736,
"step": 87
},
{
"epoch": 1.8191214470284238,
"grad_norm": 0.3111773040871724,
"learning_rate": 6.388634366811146e-05,
"loss": 0.4708,
"step": 88
},
{
"epoch": 1.8397932816537468,
"grad_norm": 0.36927812675111354,
"learning_rate": 6.341717735079763e-05,
"loss": 0.4752,
"step": 89
},
{
"epoch": 1.8604651162790697,
"grad_norm": 0.4160789587496509,
"learning_rate": 6.294305745404185e-05,
"loss": 0.4754,
"step": 90
},
{
"epoch": 1.8811369509043927,
"grad_norm": 0.3977585119965555,
"learning_rate": 6.246408427133972e-05,
"loss": 0.4786,
"step": 91
},
{
"epoch": 1.9018087855297159,
"grad_norm": 0.3516911724442998,
"learning_rate": 6.198035912283225e-05,
"loss": 0.481,
"step": 92
},
{
"epoch": 1.9224806201550386,
"grad_norm": 0.293513841505873,
"learning_rate": 6.149198433387297e-05,
"loss": 0.4729,
"step": 93
},
{
"epoch": 1.9431524547803618,
"grad_norm": 0.2731022151882442,
"learning_rate": 6.099906321338241e-05,
"loss": 0.4741,
"step": 94
},
{
"epoch": 1.9638242894056848,
"grad_norm": 0.35410712723586696,
"learning_rate": 6.0501700031994613e-05,
"loss": 0.474,
"step": 95
},
{
"epoch": 1.9844961240310077,
"grad_norm": 0.4538676453325634,
"learning_rate": 6.000000000000001e-05,
"loss": 0.4739,
"step": 96
},
{
"epoch": 2.005167958656331,
"grad_norm": 0.46816594534216643,
"learning_rate": 5.94940692450897e-05,
"loss": 0.4659,
"step": 97
},
{
"epoch": 2.0258397932816536,
"grad_norm": 0.3490457312607184,
"learning_rate": 5.8984014789905625e-05,
"loss": 0.4321,
"step": 98
},
{
"epoch": 2.046511627906977,
"grad_norm": 0.3313437102668528,
"learning_rate": 5.846994452940137e-05,
"loss": 0.4324,
"step": 99
},
{
"epoch": 2.0671834625322996,
"grad_norm": 0.4268324278027606,
"learning_rate": 5.79519672080185e-05,
"loss": 0.4343,
"step": 100
},
{
"epoch": 2.0878552971576227,
"grad_norm": 0.46532373355417445,
"learning_rate": 5.743019239668318e-05,
"loss": 0.4348,
"step": 101
},
{
"epoch": 2.108527131782946,
"grad_norm": 0.3656350937833029,
"learning_rate": 5.6904730469627985e-05,
"loss": 0.4287,
"step": 102
},
{
"epoch": 2.1291989664082687,
"grad_norm": 0.341576849989435,
"learning_rate": 5.6375692581043705e-05,
"loss": 0.4256,
"step": 103
},
{
"epoch": 2.149870801033592,
"grad_norm": 0.3435636006449175,
"learning_rate": 5.584319064156628e-05,
"loss": 0.4255,
"step": 104
},
{
"epoch": 2.1705426356589146,
"grad_norm": 0.3674851087079681,
"learning_rate": 5.5307337294603595e-05,
"loss": 0.4297,
"step": 105
},
{
"epoch": 2.1912144702842378,
"grad_norm": 0.25757942003867124,
"learning_rate": 5.476824589250738e-05,
"loss": 0.4265,
"step": 106
},
{
"epoch": 2.2118863049095605,
"grad_norm": 0.29319082903907967,
"learning_rate": 5.4226030472595075e-05,
"loss": 0.4273,
"step": 107
},
{
"epoch": 2.2325581395348837,
"grad_norm": 0.30912735428671423,
"learning_rate": 5.368080573302676e-05,
"loss": 0.4274,
"step": 108
},
{
"epoch": 2.253229974160207,
"grad_norm": 0.2946112564630265,
"learning_rate": 5.3132687008542454e-05,
"loss": 0.4266,
"step": 109
},
{
"epoch": 2.2739018087855296,
"grad_norm": 0.24956667963282786,
"learning_rate": 5.258179024606455e-05,
"loss": 0.4233,
"step": 110
},
{
"epoch": 2.294573643410853,
"grad_norm": 0.24495972859727833,
"learning_rate": 5.202823198017092e-05,
"loss": 0.4201,
"step": 111
},
{
"epoch": 2.3152454780361755,
"grad_norm": 0.25189356412883607,
"learning_rate": 5.1472129308443616e-05,
"loss": 0.4281,
"step": 112
},
{
"epoch": 2.3359173126614987,
"grad_norm": 0.24882897366306153,
"learning_rate": 5.091359986669845e-05,
"loss": 0.4275,
"step": 113
},
{
"epoch": 2.356589147286822,
"grad_norm": 0.2505341488622872,
"learning_rate": 5.0352761804100835e-05,
"loss": 0.4244,
"step": 114
},
{
"epoch": 2.3772609819121446,
"grad_norm": 0.23709144675798577,
"learning_rate": 4.9789733758172956e-05,
"loss": 0.4233,
"step": 115
},
{
"epoch": 2.397932816537468,
"grad_norm": 0.22216591251943948,
"learning_rate": 4.922463482969761e-05,
"loss": 0.4257,
"step": 116
},
{
"epoch": 2.4186046511627906,
"grad_norm": 0.2485822812049657,
"learning_rate": 4.8657584557524116e-05,
"loss": 0.4238,
"step": 117
},
{
"epoch": 2.4392764857881137,
"grad_norm": 0.20581013393206954,
"learning_rate": 4.808870289328153e-05,
"loss": 0.4237,
"step": 118
},
{
"epoch": 2.459948320413437,
"grad_norm": 0.2683694988919976,
"learning_rate": 4.751811017600448e-05,
"loss": 0.428,
"step": 119
},
{
"epoch": 2.4806201550387597,
"grad_norm": 0.2732119282896145,
"learning_rate": 4.694592710667723e-05,
"loss": 0.4177,
"step": 120
},
{
"epoch": 2.501291989664083,
"grad_norm": 0.16985653465966316,
"learning_rate": 4.637227472270091e-05,
"loss": 0.4212,
"step": 121
},
{
"epoch": 2.5219638242894056,
"grad_norm": 0.24083993805158754,
"learning_rate": 4.579727437228987e-05,
"loss": 0.4259,
"step": 122
},
{
"epoch": 2.5426356589147288,
"grad_norm": 0.19049366411569346,
"learning_rate": 4.522104768880208e-05,
"loss": 0.4237,
"step": 123
},
{
"epoch": 2.5633074935400515,
"grad_norm": 0.18921694866288802,
"learning_rate": 4.464371656500921e-05,
"loss": 0.4162,
"step": 124
},
{
"epoch": 2.5839793281653747,
"grad_norm": 0.11515867618528139,
"learning_rate": 4.406540312731208e-05,
"loss": 0.4245,
"step": 125
},
{
"epoch": 2.604651162790698,
"grad_norm": 0.16805334717409715,
"learning_rate": 4.348622970990634e-05,
"loss": 0.4244,
"step": 126
},
{
"epoch": 2.6253229974160206,
"grad_norm": 0.14624332678173066,
"learning_rate": 4.290631882890443e-05,
"loss": 0.4206,
"step": 127
},
{
"epoch": 2.645994832041344,
"grad_norm": 0.1226966632599885,
"learning_rate": 4.2325793156419035e-05,
"loss": 0.422,
"step": 128
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.12200828138561001,
"learning_rate": 4.174477549461345e-05,
"loss": 0.4234,
"step": 129
},
{
"epoch": 2.6873385012919897,
"grad_norm": 0.13700653705541546,
"learning_rate": 4.116338874972446e-05,
"loss": 0.4232,
"step": 130
},
{
"epoch": 2.7080103359173124,
"grad_norm": 0.11557203582812532,
"learning_rate": 4.058175590606332e-05,
"loss": 0.4218,
"step": 131
},
{
"epoch": 2.7286821705426356,
"grad_norm": 0.11956605011466209,
"learning_rate": 4e-05,
"loss": 0.4252,
"step": 132
},
{
"epoch": 2.749354005167959,
"grad_norm": 0.10473648355726717,
"learning_rate": 3.9418244093936694e-05,
"loss": 0.4204,
"step": 133
},
{
"epoch": 2.7700258397932815,
"grad_norm": 0.1186616842930409,
"learning_rate": 3.8836611250275546e-05,
"loss": 0.4241,
"step": 134
},
{
"epoch": 2.7906976744186047,
"grad_norm": 0.10525513152325092,
"learning_rate": 3.825522450538657e-05,
"loss": 0.4275,
"step": 135
},
{
"epoch": 2.811369509043928,
"grad_norm": 0.11453121919183189,
"learning_rate": 3.767420684358097e-05,
"loss": 0.4213,
"step": 136
},
{
"epoch": 2.8320413436692506,
"grad_norm": 0.09206489385884514,
"learning_rate": 3.709368117109558e-05,
"loss": 0.4234,
"step": 137
},
{
"epoch": 2.8527131782945734,
"grad_norm": 0.118433918645971,
"learning_rate": 3.6513770290093674e-05,
"loss": 0.4242,
"step": 138
},
{
"epoch": 2.8733850129198966,
"grad_norm": 0.0943985454829677,
"learning_rate": 3.5934596872687924e-05,
"loss": 0.4225,
"step": 139
},
{
"epoch": 2.8940568475452197,
"grad_norm": 0.11050468674335369,
"learning_rate": 3.535628343499079e-05,
"loss": 0.4193,
"step": 140
},
{
"epoch": 2.9147286821705425,
"grad_norm": 0.09949452607563593,
"learning_rate": 3.477895231119795e-05,
"loss": 0.429,
"step": 141
},
{
"epoch": 2.9354005167958657,
"grad_norm": 0.1106090290134035,
"learning_rate": 3.4202725627710136e-05,
"loss": 0.4244,
"step": 142
},
{
"epoch": 2.956072351421189,
"grad_norm": 0.12048140894086738,
"learning_rate": 3.3627725277299103e-05,
"loss": 0.4178,
"step": 143
},
{
"epoch": 2.9767441860465116,
"grad_norm": 0.09941200224739741,
"learning_rate": 3.305407289332279e-05,
"loss": 0.4193,
"step": 144
},
{
"epoch": 2.9974160206718348,
"grad_norm": 0.11191324212004439,
"learning_rate": 3.248188982399553e-05,
"loss": 0.4268,
"step": 145
},
{
"epoch": 3.0180878552971575,
"grad_norm": 0.1927802146617283,
"learning_rate": 3.191129710671849e-05,
"loss": 0.3943,
"step": 146
},
{
"epoch": 3.0387596899224807,
"grad_norm": 0.13601769681186723,
"learning_rate": 3.134241544247589e-05,
"loss": 0.3843,
"step": 147
},
{
"epoch": 3.0594315245478034,
"grad_norm": 0.18128467277952806,
"learning_rate": 3.07753651703024e-05,
"loss": 0.3875,
"step": 148
},
{
"epoch": 3.0801033591731266,
"grad_norm": 0.22945154955579947,
"learning_rate": 3.0210266241827047e-05,
"loss": 0.3807,
"step": 149
},
{
"epoch": 3.10077519379845,
"grad_norm": 0.16654454526224097,
"learning_rate": 2.9647238195899168e-05,
"loss": 0.3838,
"step": 150
},
{
"epoch": 3.1214470284237725,
"grad_norm": 0.22750866213777649,
"learning_rate": 2.9086400133301573e-05,
"loss": 0.3751,
"step": 151
},
{
"epoch": 3.1421188630490957,
"grad_norm": 0.17973030546542432,
"learning_rate": 2.8527870691556404e-05,
"loss": 0.3826,
"step": 152
},
{
"epoch": 3.1627906976744184,
"grad_norm": 0.17029401369958916,
"learning_rate": 2.7971768019829083e-05,
"loss": 0.3804,
"step": 153
},
{
"epoch": 3.1834625322997416,
"grad_norm": 0.1852119301785162,
"learning_rate": 2.7418209753935464e-05,
"loss": 0.3806,
"step": 154
},
{
"epoch": 3.2041343669250644,
"grad_norm": 0.14136299318153245,
"learning_rate": 2.6867312991457563e-05,
"loss": 0.3806,
"step": 155
},
{
"epoch": 3.2248062015503876,
"grad_norm": 0.16072645831339827,
"learning_rate": 2.6319194266973256e-05,
"loss": 0.3813,
"step": 156
},
{
"epoch": 3.2454780361757107,
"grad_norm": 0.13278616146506314,
"learning_rate": 2.577396952740495e-05,
"loss": 0.3838,
"step": 157
},
{
"epoch": 3.2661498708010335,
"grad_norm": 0.14768004119760159,
"learning_rate": 2.523175410749263e-05,
"loss": 0.379,
"step": 158
},
{
"epoch": 3.2868217054263567,
"grad_norm": 0.12905452066326767,
"learning_rate": 2.4692662705396412e-05,
"loss": 0.3798,
"step": 159
},
{
"epoch": 3.3074935400516794,
"grad_norm": 0.12981775553641342,
"learning_rate": 2.4156809358433728e-05,
"loss": 0.3829,
"step": 160
},
{
"epoch": 3.3281653746770026,
"grad_norm": 0.11735923882875593,
"learning_rate": 2.3624307418956298e-05,
"loss": 0.3854,
"step": 161
},
{
"epoch": 3.3488372093023258,
"grad_norm": 0.1094304758715909,
"learning_rate": 2.3095269530372032e-05,
"loss": 0.3846,
"step": 162
},
{
"epoch": 3.3695090439276485,
"grad_norm": 0.12552108042358762,
"learning_rate": 2.2569807603316836e-05,
"loss": 0.3799,
"step": 163
},
{
"epoch": 3.3901808785529717,
"grad_norm": 0.09960480491775725,
"learning_rate": 2.2048032791981515e-05,
"loss": 0.3816,
"step": 164
},
{
"epoch": 3.4108527131782944,
"grad_norm": 0.13048633977534593,
"learning_rate": 2.1530055470598654e-05,
"loss": 0.3805,
"step": 165
},
{
"epoch": 3.4315245478036176,
"grad_norm": 0.10265331283268403,
"learning_rate": 2.1015985210094385e-05,
"loss": 0.3838,
"step": 166
},
{
"epoch": 3.452196382428941,
"grad_norm": 0.12636258686674082,
"learning_rate": 2.050593075491031e-05,
"loss": 0.3796,
"step": 167
},
{
"epoch": 3.4728682170542635,
"grad_norm": 0.09627358297975736,
"learning_rate": 2.0000000000000012e-05,
"loss": 0.382,
"step": 168
},
{
"epoch": 3.4935400516795867,
"grad_norm": 0.10144594851320451,
"learning_rate": 1.9498299968005393e-05,
"loss": 0.376,
"step": 169
},
{
"epoch": 3.5142118863049094,
"grad_norm": 0.09960892836011123,
"learning_rate": 1.90009367866176e-05,
"loss": 0.3828,
"step": 170
},
{
"epoch": 3.5348837209302326,
"grad_norm": 0.08649589297067169,
"learning_rate": 1.8508015666127043e-05,
"loss": 0.3827,
"step": 171
},
{
"epoch": 3.5555555555555554,
"grad_norm": 0.10384196159292765,
"learning_rate": 1.8019640877167763e-05,
"loss": 0.3839,
"step": 172
},
{
"epoch": 3.5762273901808785,
"grad_norm": 0.08656341678898698,
"learning_rate": 1.753591572866029e-05,
"loss": 0.375,
"step": 173
},
{
"epoch": 3.5968992248062017,
"grad_norm": 0.08511926948170766,
"learning_rate": 1.7056942545958167e-05,
"loss": 0.3804,
"step": 174
},
{
"epoch": 3.6175710594315245,
"grad_norm": 0.09259550643790064,
"learning_rate": 1.6582822649202382e-05,
"loss": 0.3835,
"step": 175
},
{
"epoch": 3.6382428940568476,
"grad_norm": 0.08328195157299345,
"learning_rate": 1.6113656331888563e-05,
"loss": 0.3807,
"step": 176
},
{
"epoch": 3.6589147286821704,
"grad_norm": 0.08877299434086175,
"learning_rate": 1.5649542839651175e-05,
"loss": 0.3764,
"step": 177
},
{
"epoch": 3.6795865633074936,
"grad_norm": 0.08380640318280198,
"learning_rate": 1.5190580349269604e-05,
"loss": 0.3804,
"step": 178
},
{
"epoch": 3.7002583979328163,
"grad_norm": 0.07699776373434924,
"learning_rate": 1.4736865947900106e-05,
"loss": 0.3796,
"step": 179
},
{
"epoch": 3.7209302325581395,
"grad_norm": 0.09256081817751462,
"learning_rate": 1.4288495612538427e-05,
"loss": 0.3797,
"step": 180
},
{
"epoch": 3.7416020671834627,
"grad_norm": 0.08359373379077649,
"learning_rate": 1.3845564189717218e-05,
"loss": 0.3745,
"step": 181
},
{
"epoch": 3.7622739018087854,
"grad_norm": 0.08520678172410531,
"learning_rate": 1.3408165375442486e-05,
"loss": 0.383,
"step": 182
},
{
"epoch": 3.7829457364341086,
"grad_norm": 0.09122883357806844,
"learning_rate": 1.297639169537359e-05,
"loss": 0.3795,
"step": 183
},
{
"epoch": 3.8036175710594318,
"grad_norm": 0.0779176471928561,
"learning_rate": 1.2550334485250661e-05,
"loss": 0.3825,
"step": 184
},
{
"epoch": 3.8242894056847545,
"grad_norm": 0.08154088851689886,
"learning_rate": 1.2130083871573812e-05,
"loss": 0.3814,
"step": 185
},
{
"epoch": 3.8449612403100772,
"grad_norm": 0.10166515233750176,
"learning_rate": 1.1715728752538103e-05,
"loss": 0.3763,
"step": 186
},
{
"epoch": 3.8656330749354004,
"grad_norm": 0.07978891816708118,
"learning_rate": 1.130735677922842e-05,
"loss": 0.382,
"step": 187
},
{
"epoch": 3.8863049095607236,
"grad_norm": 0.08247932494598004,
"learning_rate": 1.0905054337078051e-05,
"loss": 0.3816,
"step": 188
},
{
"epoch": 3.9069767441860463,
"grad_norm": 0.08617393492477488,
"learning_rate": 1.0508906527595042e-05,
"loss": 0.3758,
"step": 189
},
{
"epoch": 3.9276485788113695,
"grad_norm": 0.07076644296330604,
"learning_rate": 1.0118997150360169e-05,
"loss": 0.3822,
"step": 190
},
{
"epoch": 3.9483204134366927,
"grad_norm": 0.07065814361505379,
"learning_rate": 9.735408685300287e-06,
"loss": 0.3826,
"step": 191
},
{
"epoch": 3.9689922480620154,
"grad_norm": 0.08938154240197935,
"learning_rate": 9.358222275240884e-06,
"loss": 0.3828,
"step": 192
},
{
"epoch": 3.9896640826873386,
"grad_norm": 0.07941435941658158,
"learning_rate": 8.987517708741364e-06,
"loss": 0.378,
"step": 193
},
{
"epoch": 4.010335917312662,
"grad_norm": 0.09711457338727685,
"learning_rate": 8.623373403216972e-06,
"loss": 0.3677,
"step": 194
},
{
"epoch": 4.0310077519379846,
"grad_norm": 0.13105925666440368,
"learning_rate": 8.265866388350598e-06,
"loss": 0.3617,
"step": 195
},
{
"epoch": 4.051679586563307,
"grad_norm": 0.08933195191567478,
"learning_rate": 7.915072289798247e-06,
"loss": 0.3537,
"step": 196
},
{
"epoch": 4.072351421188631,
"grad_norm": 0.08419493114811873,
"learning_rate": 7.5710653131915125e-06,
"loss": 0.3627,
"step": 197
},
{
"epoch": 4.093023255813954,
"grad_norm": 0.09855757707090335,
"learning_rate": 7.233918228440324e-06,
"loss": 0.363,
"step": 198
},
{
"epoch": 4.113695090439276,
"grad_norm": 0.10644403332249518,
"learning_rate": 6.903702354339578e-06,
"loss": 0.3572,
"step": 199
},
{
"epoch": 4.134366925064599,
"grad_norm": 0.10474868764346248,
"learning_rate": 6.58048754348255e-06,
"loss": 0.3656,
"step": 200
},
{
"epoch": 4.155038759689923,
"grad_norm": 0.08998213945424546,
"learning_rate": 6.26434216748458e-06,
"loss": 0.357,
"step": 201
},
{
"epoch": 4.1757105943152455,
"grad_norm": 0.0874382734058534,
"learning_rate": 5.955333102520011e-06,
"loss": 0.3579,
"step": 202
},
{
"epoch": 4.196382428940568,
"grad_norm": 0.09229516478584097,
"learning_rate": 5.653525715175483e-06,
"loss": 0.3551,
"step": 203
},
{
"epoch": 4.217054263565892,
"grad_norm": 0.0899139620609084,
"learning_rate": 5.358983848622452e-06,
"loss": 0.3561,
"step": 204
},
{
"epoch": 4.237726098191215,
"grad_norm": 0.09480003016590481,
"learning_rate": 5.07176980911217e-06,
"loss": 0.3612,
"step": 205
},
{
"epoch": 4.258397932816537,
"grad_norm": 0.07955059469841694,
"learning_rate": 4.791944352795561e-06,
"loss": 0.3576,
"step": 206
},
{
"epoch": 4.27906976744186,
"grad_norm": 0.08281316418812414,
"learning_rate": 4.519566672871132e-06,
"loss": 0.3596,
"step": 207
},
{
"epoch": 4.299741602067184,
"grad_norm": 0.08360162648406322,
"learning_rate": 4.254694387063514e-06,
"loss": 0.358,
"step": 208
},
{
"epoch": 4.320413436692506,
"grad_norm": 0.07640740401473636,
"learning_rate": 3.997383525435154e-06,
"loss": 0.3532,
"step": 209
},
{
"epoch": 4.341085271317829,
"grad_norm": 0.07916200584935532,
"learning_rate": 3.747688518534003e-06,
"loss": 0.3608,
"step": 210
},
{
"epoch": 4.361757105943153,
"grad_norm": 0.06910995508249697,
"learning_rate": 3.5056621858794393e-06,
"loss": 0.3521,
"step": 211
},
{
"epoch": 4.3824289405684755,
"grad_norm": 0.07374366802772589,
"learning_rate": 3.2713557247890447e-06,
"loss": 0.3617,
"step": 212
},
{
"epoch": 4.403100775193798,
"grad_norm": 0.0710814968114632,
"learning_rate": 3.0448186995485307e-06,
"loss": 0.3601,
"step": 213
},
{
"epoch": 4.423772609819121,
"grad_norm": 0.06896603986098794,
"learning_rate": 2.8260990309270987e-06,
"loss": 0.3479,
"step": 214
},
{
"epoch": 4.444444444444445,
"grad_norm": 0.06185011564865765,
"learning_rate": 2.6152429860404647e-06,
"loss": 0.3614,
"step": 215
},
{
"epoch": 4.465116279069767,
"grad_norm": 0.06549697826461233,
"learning_rate": 2.4122951685636674e-06,
"loss": 0.3561,
"step": 216
},
{
"epoch": 4.48578811369509,
"grad_norm": 0.061438617551997,
"learning_rate": 2.217298509295813e-06,
"loss": 0.3555,
"step": 217
},
{
"epoch": 4.506459948320414,
"grad_norm": 0.060700891226855756,
"learning_rate": 2.0302942570786446e-06,
"loss": 0.3584,
"step": 218
},
{
"epoch": 4.5271317829457365,
"grad_norm": 0.06178488967139298,
"learning_rate": 1.8513219700709272e-06,
"loss": 0.3581,
"step": 219
},
{
"epoch": 4.547803617571059,
"grad_norm": 0.06183530674329798,
"learning_rate": 1.6804195073804442e-06,
"loss": 0.352,
"step": 220
},
{
"epoch": 4.568475452196383,
"grad_norm": 0.05866001427045164,
"learning_rate": 1.5176230210554744e-06,
"loss": 0.3509,
"step": 221
},
{
"epoch": 4.589147286821706,
"grad_norm": 0.05774735846339501,
"learning_rate": 1.3629669484372722e-06,
"loss": 0.3574,
"step": 222
},
{
"epoch": 4.609819121447028,
"grad_norm": 0.05601122893723008,
"learning_rate": 1.2164840048753602e-06,
"loss": 0.3569,
"step": 223
},
{
"epoch": 4.630490956072351,
"grad_norm": 0.05526767753253488,
"learning_rate": 1.0782051768070477e-06,
"loss": 0.3582,
"step": 224
},
{
"epoch": 4.651162790697675,
"grad_norm": 0.05616027399981306,
"learning_rate": 9.481597152026656e-07,
"loss": 0.3493,
"step": 225
},
{
"epoch": 4.671834625322997,
"grad_norm": 0.05467514890147951,
"learning_rate": 8.263751293779409e-07,
"loss": 0.3533,
"step": 226
},
{
"epoch": 4.69250645994832,
"grad_norm": 0.05486244376026512,
"learning_rate": 7.128771811747737e-07,
"loss": 0.353,
"step": 227
},
{
"epoch": 4.713178294573644,
"grad_norm": 0.053720925123237416,
"learning_rate": 6.076898795116792e-07,
"loss": 0.359,
"step": 228
},
{
"epoch": 4.7338501291989665,
"grad_norm": 0.054529190806937466,
"learning_rate": 5.108354753050381e-07,
"loss": 0.3548,
"step": 229
},
{
"epoch": 4.754521963824289,
"grad_norm": 0.054284156157904224,
"learning_rate": 4.223344567622212e-07,
"loss": 0.3509,
"step": 230
},
{
"epoch": 4.775193798449612,
"grad_norm": 0.05169076157313685,
"learning_rate": 3.4220554504758475e-07,
"loss": 0.3576,
"step": 231
},
{
"epoch": 4.795865633074936,
"grad_norm": 0.05516028856228327,
"learning_rate": 2.704656903222791e-07,
"loss": 0.3591,
"step": 232
},
{
"epoch": 4.816537467700258,
"grad_norm": 0.05350115446089016,
"learning_rate": 2.0713006815868075e-07,
"loss": 0.358,
"step": 233
},
{
"epoch": 4.837209302325581,
"grad_norm": 0.052581881517593826,
"learning_rate": 1.522120763301782e-07,
"loss": 0.3586,
"step": 234
},
{
"epoch": 4.857881136950905,
"grad_norm": 0.053948835043271054,
"learning_rate": 1.0572333197711005e-07,
"loss": 0.3543,
"step": 235
},
{
"epoch": 4.8785529715762275,
"grad_norm": 0.0546909910210749,
"learning_rate": 6.767366914927298e-08,
"loss": 0.3566,
"step": 236
},
{
"epoch": 4.89922480620155,
"grad_norm": 0.05406684942381343,
"learning_rate": 3.8071136725688074e-08,
"loss": 0.3561,
"step": 237
},
{
"epoch": 4.919896640826874,
"grad_norm": 0.05108092626758996,
"learning_rate": 1.6921996711976028e-08,
"loss": 0.3512,
"step": 238
},
{
"epoch": 4.940568475452197,
"grad_norm": 0.053315673178670926,
"learning_rate": 4.230722915701257e-09,
"loss": 0.3584,
"step": 239
},
{
"epoch": 4.961240310077519,
"grad_norm": 0.05179414324279462,
"learning_rate": 0.0,
"loss": 0.3541,
"step": 240
},
{
"epoch": 4.961240310077519,
"step": 240,
"total_flos": 8.242697655180853e+18,
"train_loss": 0.45099904040495553,
"train_runtime": 77492.3925,
"train_samples_per_second": 1.597,
"train_steps_per_second": 0.003
}
],
"logging_steps": 1,
"max_steps": 240,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.242697655180853e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}