{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9973935708079931,
"eval_steps": 500,
"global_step": 287,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0034752389226759338,
"grad_norm": 13.863444328308105,
"learning_rate": 2.222222222222222e-06,
"loss": 0.8359,
"step": 1
},
{
"epoch": 0.0069504778453518675,
"grad_norm": 12.374430656433105,
"learning_rate": 4.444444444444444e-06,
"loss": 0.8862,
"step": 2
},
{
"epoch": 0.010425716768027803,
"grad_norm": 11.986786842346191,
"learning_rate": 6.666666666666667e-06,
"loss": 0.7899,
"step": 3
},
{
"epoch": 0.013900955690703735,
"grad_norm": 6.093581199645996,
"learning_rate": 8.888888888888888e-06,
"loss": 0.7342,
"step": 4
},
{
"epoch": 0.01737619461337967,
"grad_norm": 2.471663475036621,
"learning_rate": 1.1111111111111113e-05,
"loss": 0.6759,
"step": 5
},
{
"epoch": 0.020851433536055605,
"grad_norm": 2.6024749279022217,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.6608,
"step": 6
},
{
"epoch": 0.02432667245873154,
"grad_norm": 2.211730718612671,
"learning_rate": 1.555555555555556e-05,
"loss": 0.5929,
"step": 7
},
{
"epoch": 0.02780191138140747,
"grad_norm": 2.3514232635498047,
"learning_rate": 1.7777777777777777e-05,
"loss": 0.6591,
"step": 8
},
{
"epoch": 0.03127715030408341,
"grad_norm": 2.4454445838928223,
"learning_rate": 2e-05,
"loss": 0.6129,
"step": 9
},
{
"epoch": 0.03475238922675934,
"grad_norm": 3.566322088241577,
"learning_rate": 1.9999361478484043e-05,
"loss": 0.6247,
"step": 10
},
{
"epoch": 0.038227628149435276,
"grad_norm": 2.7100212574005127,
"learning_rate": 1.999744599547812e-05,
"loss": 0.5985,
"step": 11
},
{
"epoch": 0.04170286707211121,
"grad_norm": 2.569108247756958,
"learning_rate": 1.999425379559765e-05,
"loss": 0.6277,
"step": 12
},
{
"epoch": 0.045178105994787145,
"grad_norm": 2.111241102218628,
"learning_rate": 1.9989785286500294e-05,
"loss": 0.6268,
"step": 13
},
{
"epoch": 0.04865334491746308,
"grad_norm": 2.4817757606506348,
"learning_rate": 1.99840410388339e-05,
"loss": 0.6465,
"step": 14
},
{
"epoch": 0.052128583840139006,
"grad_norm": 1.7389640808105469,
"learning_rate": 1.99770217861636e-05,
"loss": 0.6048,
"step": 15
},
{
"epoch": 0.05560382276281494,
"grad_norm": 1.7822411060333252,
"learning_rate": 1.9968728424878178e-05,
"loss": 0.6343,
"step": 16
},
{
"epoch": 0.059079061685490875,
"grad_norm": 1.7938050031661987,
"learning_rate": 1.9959162014075553e-05,
"loss": 0.6308,
"step": 17
},
{
"epoch": 0.06255430060816682,
"grad_norm": 1.7255632877349854,
"learning_rate": 1.994832377542755e-05,
"loss": 0.6382,
"step": 18
},
{
"epoch": 0.06602953953084274,
"grad_norm": 1.6459022760391235,
"learning_rate": 1.9936215093023884e-05,
"loss": 0.5864,
"step": 19
},
{
"epoch": 0.06950477845351868,
"grad_norm": 1.5717964172363281,
"learning_rate": 1.9922837513195406e-05,
"loss": 0.6014,
"step": 20
},
{
"epoch": 0.07298001737619461,
"grad_norm": 1.4626662731170654,
"learning_rate": 1.990819274431662e-05,
"loss": 0.5919,
"step": 21
},
{
"epoch": 0.07645525629887055,
"grad_norm": 1.5188833475112915,
"learning_rate": 1.989228265658754e-05,
"loss": 0.6285,
"step": 22
},
{
"epoch": 0.07993049522154648,
"grad_norm": 1.4063705205917358,
"learning_rate": 1.9875109281794828e-05,
"loss": 0.6498,
"step": 23
},
{
"epoch": 0.08340573414422242,
"grad_norm": 1.5133353471755981,
"learning_rate": 1.9856674813052345e-05,
"loss": 0.6267,
"step": 24
},
{
"epoch": 0.08688097306689835,
"grad_norm": 1.294826865196228,
"learning_rate": 1.9836981604521077e-05,
"loss": 0.6798,
"step": 25
},
{
"epoch": 0.09035621198957429,
"grad_norm": 1.4135065078735352,
"learning_rate": 1.98160321711085e-05,
"loss": 0.6478,
"step": 26
},
{
"epoch": 0.09383145091225022,
"grad_norm": 1.3046231269836426,
"learning_rate": 1.9793829188147406e-05,
"loss": 0.6136,
"step": 27
},
{
"epoch": 0.09730668983492616,
"grad_norm": 1.2951562404632568,
"learning_rate": 1.9770375491054264e-05,
"loss": 0.6365,
"step": 28
},
{
"epoch": 0.10078192875760209,
"grad_norm": 1.2141485214233398,
"learning_rate": 1.974567407496712e-05,
"loss": 0.6168,
"step": 29
},
{
"epoch": 0.10425716768027801,
"grad_norm": 1.267811894416809,
"learning_rate": 1.9719728094363103e-05,
"loss": 0.6212,
"step": 30
},
{
"epoch": 0.10773240660295395,
"grad_norm": 1.3066190481185913,
"learning_rate": 1.9692540862655587e-05,
"loss": 0.6246,
"step": 31
},
{
"epoch": 0.11120764552562988,
"grad_norm": 1.3221654891967773,
"learning_rate": 1.966411585177105e-05,
"loss": 0.585,
"step": 32
},
{
"epoch": 0.11468288444830582,
"grad_norm": 1.2844849824905396,
"learning_rate": 1.9634456691705705e-05,
"loss": 0.6416,
"step": 33
},
{
"epoch": 0.11815812337098175,
"grad_norm": 1.2656630277633667,
"learning_rate": 1.9603567170061918e-05,
"loss": 0.6112,
"step": 34
},
{
"epoch": 0.12163336229365769,
"grad_norm": 1.2453721761703491,
"learning_rate": 1.9571451231564523e-05,
"loss": 0.6193,
"step": 35
},
{
"epoch": 0.12510860121633363,
"grad_norm": 1.3584957122802734,
"learning_rate": 1.9538112977557077e-05,
"loss": 0.661,
"step": 36
},
{
"epoch": 0.12858384013900956,
"grad_norm": 1.2529661655426025,
"learning_rate": 1.9503556665478066e-05,
"loss": 0.626,
"step": 37
},
{
"epoch": 0.13205907906168549,
"grad_norm": 1.168838381767273,
"learning_rate": 1.9467786708317257e-05,
"loss": 0.6553,
"step": 38
},
{
"epoch": 0.1355343179843614,
"grad_norm": 1.266099452972412,
"learning_rate": 1.9430807674052092e-05,
"loss": 0.6105,
"step": 39
},
{
"epoch": 0.13900955690703737,
"grad_norm": 1.2240427732467651,
"learning_rate": 1.939262428506438e-05,
"loss": 0.6362,
"step": 40
},
{
"epoch": 0.1424847958297133,
"grad_norm": 1.181288480758667,
"learning_rate": 1.9353241417537216e-05,
"loss": 0.639,
"step": 41
},
{
"epoch": 0.14596003475238922,
"grad_norm": 1.1083927154541016,
"learning_rate": 1.9312664100832236e-05,
"loss": 0.6421,
"step": 42
},
{
"epoch": 0.14943527367506515,
"grad_norm": 1.2228623628616333,
"learning_rate": 1.9270897516847406e-05,
"loss": 0.5897,
"step": 43
},
{
"epoch": 0.1529105125977411,
"grad_norm": 1.215945839881897,
"learning_rate": 1.9227946999355226e-05,
"loss": 0.6122,
"step": 44
},
{
"epoch": 0.15638575152041703,
"grad_norm": 1.1623311042785645,
"learning_rate": 1.9183818033321612e-05,
"loss": 0.5546,
"step": 45
},
{
"epoch": 0.15986099044309296,
"grad_norm": 1.3293700218200684,
"learning_rate": 1.9138516254205416e-05,
"loss": 0.6268,
"step": 46
},
{
"epoch": 0.1633362293657689,
"grad_norm": 1.1984280347824097,
"learning_rate": 1.9092047447238775e-05,
"loss": 0.6081,
"step": 47
},
{
"epoch": 0.16681146828844484,
"grad_norm": 1.1898068189620972,
"learning_rate": 1.9044417546688295e-05,
"loss": 0.6007,
"step": 48
},
{
"epoch": 0.17028670721112077,
"grad_norm": 1.2403992414474487,
"learning_rate": 1.899563263509725e-05,
"loss": 0.6219,
"step": 49
},
{
"epoch": 0.1737619461337967,
"grad_norm": 1.3231101036071777,
"learning_rate": 1.894569894250877e-05,
"loss": 0.5834,
"step": 50
},
{
"epoch": 0.17723718505647262,
"grad_norm": 1.3285619020462036,
"learning_rate": 1.8894622845670282e-05,
"loss": 0.6568,
"step": 51
},
{
"epoch": 0.18071242397914858,
"grad_norm": 1.1555180549621582,
"learning_rate": 1.8842410867219137e-05,
"loss": 0.6269,
"step": 52
},
{
"epoch": 0.1841876629018245,
"grad_norm": 1.1796412467956543,
"learning_rate": 1.878906967484966e-05,
"loss": 0.5973,
"step": 53
},
{
"epoch": 0.18766290182450043,
"grad_norm": 1.144237756729126,
"learning_rate": 1.8734606080461657e-05,
"loss": 0.6367,
"step": 54
},
{
"epoch": 0.19113814074717636,
"grad_norm": 1.1937980651855469,
"learning_rate": 1.86790270392905e-05,
"loss": 0.6006,
"step": 55
},
{
"epoch": 0.19461337966985232,
"grad_norm": 1.0895384550094604,
"learning_rate": 1.8622339649018907e-05,
"loss": 0.579,
"step": 56
},
{
"epoch": 0.19808861859252824,
"grad_norm": 1.1548895835876465,
"learning_rate": 1.856455114887056e-05,
"loss": 0.5521,
"step": 57
},
{
"epoch": 0.20156385751520417,
"grad_norm": 1.0904297828674316,
"learning_rate": 1.8505668918685603e-05,
"loss": 0.6222,
"step": 58
},
{
"epoch": 0.2050390964378801,
"grad_norm": 1.2059494256973267,
"learning_rate": 1.8445700477978207e-05,
"loss": 0.6631,
"step": 59
},
{
"epoch": 0.20851433536055602,
"grad_norm": 1.1830602884292603,
"learning_rate": 1.8384653484976305e-05,
"loss": 0.5963,
"step": 60
},
{
"epoch": 0.21198957428323198,
"grad_norm": 1.2046791315078735,
"learning_rate": 1.8322535735643604e-05,
"loss": 0.5943,
"step": 61
},
{
"epoch": 0.2154648132059079,
"grad_norm": 1.1545571088790894,
"learning_rate": 1.8259355162684e-05,
"loss": 0.6301,
"step": 62
},
{
"epoch": 0.21894005212858383,
"grad_norm": 1.0760880708694458,
"learning_rate": 1.8195119834528535e-05,
"loss": 0.6366,
"step": 63
},
{
"epoch": 0.22241529105125976,
"grad_norm": 1.2411205768585205,
"learning_rate": 1.8129837954305033e-05,
"loss": 0.6156,
"step": 64
},
{
"epoch": 0.22589052997393572,
"grad_norm": 1.1241486072540283,
"learning_rate": 1.8063517858790517e-05,
"loss": 0.608,
"step": 65
},
{
"epoch": 0.22936576889661164,
"grad_norm": 1.0839852094650269,
"learning_rate": 1.799616801734657e-05,
"loss": 0.6334,
"step": 66
},
{
"epoch": 0.23284100781928757,
"grad_norm": 1.125570297241211,
"learning_rate": 1.792779703083777e-05,
"loss": 0.6139,
"step": 67
},
{
"epoch": 0.2363162467419635,
"grad_norm": 1.1417044401168823,
"learning_rate": 1.7858413630533305e-05,
"loss": 0.5897,
"step": 68
},
{
"epoch": 0.23979148566463945,
"grad_norm": 1.0312554836273193,
"learning_rate": 1.778802667699196e-05,
"loss": 0.606,
"step": 69
},
{
"epoch": 0.24326672458731538,
"grad_norm": 1.0954256057739258,
"learning_rate": 1.77166451589306e-05,
"loss": 0.5888,
"step": 70
},
{
"epoch": 0.2467419635099913,
"grad_norm": 1.031202793121338,
"learning_rate": 1.764427819207624e-05,
"loss": 0.6004,
"step": 71
},
{
"epoch": 0.25021720243266726,
"grad_norm": 1.1144808530807495,
"learning_rate": 1.757093501800196e-05,
"loss": 0.5699,
"step": 72
},
{
"epoch": 0.2536924413553432,
"grad_norm": 1.0327179431915283,
"learning_rate": 1.7496625002946702e-05,
"loss": 0.6051,
"step": 73
},
{
"epoch": 0.2571676802780191,
"grad_norm": 1.080093264579773,
"learning_rate": 1.7421357636619153e-05,
"loss": 0.6494,
"step": 74
},
{
"epoch": 0.26064291920069504,
"grad_norm": 1.2163845300674438,
"learning_rate": 1.734514253098589e-05,
"loss": 0.6033,
"step": 75
},
{
"epoch": 0.26411815812337097,
"grad_norm": 1.1358786821365356,
"learning_rate": 1.726798941904386e-05,
"loss": 0.596,
"step": 76
},
{
"epoch": 0.2675933970460469,
"grad_norm": 1.0965262651443481,
"learning_rate": 1.7189908153577473e-05,
"loss": 0.5669,
"step": 77
},
{
"epoch": 0.2710686359687228,
"grad_norm": 1.2162927389144897,
"learning_rate": 1.7110908705900322e-05,
"loss": 0.6133,
"step": 78
},
{
"epoch": 0.2745438748913988,
"grad_norm": 1.186356544494629,
"learning_rate": 1.7031001164581828e-05,
"loss": 0.6405,
"step": 79
},
{
"epoch": 0.27801911381407474,
"grad_norm": 1.0369867086410522,
"learning_rate": 1.6950195734158874e-05,
"loss": 0.579,
"step": 80
},
{
"epoch": 0.28149435273675066,
"grad_norm": 1.0901113748550415,
"learning_rate": 1.6868502733832647e-05,
"loss": 0.5825,
"step": 81
},
{
"epoch": 0.2849695916594266,
"grad_norm": 1.0874531269073486,
"learning_rate": 1.6785932596150827e-05,
"loss": 0.6038,
"step": 82
},
{
"epoch": 0.2884448305821025,
"grad_norm": 1.1928529739379883,
"learning_rate": 1.670249586567531e-05,
"loss": 0.5979,
"step": 83
},
{
"epoch": 0.29192006950477845,
"grad_norm": 1.0388442277908325,
"learning_rate": 1.6618203197635624e-05,
"loss": 0.5832,
"step": 84
},
{
"epoch": 0.2953953084274544,
"grad_norm": 1.2084934711456299,
"learning_rate": 1.6533065356568206e-05,
"loss": 0.5844,
"step": 85
},
{
"epoch": 0.2988705473501303,
"grad_norm": 1.223413348197937,
"learning_rate": 1.6447093214941727e-05,
"loss": 0.619,
"step": 86
},
{
"epoch": 0.3023457862728063,
"grad_norm": 1.0778778791427612,
"learning_rate": 1.636029775176862e-05,
"loss": 0.6573,
"step": 87
},
{
"epoch": 0.3058210251954822,
"grad_norm": 1.0900267362594604,
"learning_rate": 1.627269005120304e-05,
"loss": 0.5779,
"step": 88
},
{
"epoch": 0.30929626411815814,
"grad_norm": 1.0737788677215576,
"learning_rate": 1.618428130112533e-05,
"loss": 0.5994,
"step": 89
},
{
"epoch": 0.31277150304083406,
"grad_norm": 1.0811336040496826,
"learning_rate": 1.6095082791713322e-05,
"loss": 0.626,
"step": 90
},
{
"epoch": 0.31624674196351,
"grad_norm": 1.0799121856689453,
"learning_rate": 1.6005105914000508e-05,
"loss": 0.6313,
"step": 91
},
{
"epoch": 0.3197219808861859,
"grad_norm": 1.0824054479599,
"learning_rate": 1.5914362158421352e-05,
"loss": 0.636,
"step": 92
},
{
"epoch": 0.32319721980886185,
"grad_norm": 1.0406136512756348,
"learning_rate": 1.5822863113343934e-05,
"loss": 0.5972,
"step": 93
},
{
"epoch": 0.3266724587315378,
"grad_norm": 1.16518235206604,
"learning_rate": 1.5730620463590052e-05,
"loss": 0.5728,
"step": 94
},
{
"epoch": 0.3301476976542137,
"grad_norm": 1.0442129373550415,
"learning_rate": 1.5637645988943008e-05,
"loss": 0.6138,
"step": 95
},
{
"epoch": 0.3336229365768897,
"grad_norm": 1.1471384763717651,
"learning_rate": 1.554395156264331e-05,
"loss": 0.6123,
"step": 96
},
{
"epoch": 0.3370981754995656,
"grad_norm": 1.0176016092300415,
"learning_rate": 1.544954914987238e-05,
"loss": 0.6063,
"step": 97
},
{
"epoch": 0.34057341442224154,
"grad_norm": 1.062016487121582,
"learning_rate": 1.5354450806224553e-05,
"loss": 0.5842,
"step": 98
},
{
"epoch": 0.34404865334491747,
"grad_norm": 1.1640706062316895,
"learning_rate": 1.5258668676167548e-05,
"loss": 0.5938,
"step": 99
},
{
"epoch": 0.3475238922675934,
"grad_norm": 1.1660997867584229,
"learning_rate": 1.516221499149154e-05,
"loss": 0.6202,
"step": 100
},
{
"epoch": 0.3509991311902693,
"grad_norm": 1.156037449836731,
"learning_rate": 1.5065102069747117e-05,
"loss": 0.6216,
"step": 101
},
{
"epoch": 0.35447437011294525,
"grad_norm": 1.2295578718185425,
"learning_rate": 1.4967342312672283e-05,
"loss": 0.5649,
"step": 102
},
{
"epoch": 0.3579496090356212,
"grad_norm": 1.0619351863861084,
"learning_rate": 1.48689482046087e-05,
"loss": 0.5954,
"step": 103
},
{
"epoch": 0.36142484795829716,
"grad_norm": 1.266830325126648,
"learning_rate": 1.4769932310907372e-05,
"loss": 0.6362,
"step": 104
},
{
"epoch": 0.3649000868809731,
"grad_norm": 1.0485084056854248,
"learning_rate": 1.467030727632401e-05,
"loss": 0.5664,
"step": 105
},
{
"epoch": 0.368375325803649,
"grad_norm": 1.12933349609375,
"learning_rate": 1.4570085823404232e-05,
"loss": 0.586,
"step": 106
},
{
"epoch": 0.37185056472632494,
"grad_norm": 1.0767207145690918,
"learning_rate": 1.4469280750858854e-05,
"loss": 0.5773,
"step": 107
},
{
"epoch": 0.37532580364900087,
"grad_norm": 1.121981143951416,
"learning_rate": 1.4367904931929422e-05,
"loss": 0.585,
"step": 108
},
{
"epoch": 0.3788010425716768,
"grad_norm": 1.1495089530944824,
"learning_rate": 1.4265971312744252e-05,
"loss": 0.5698,
"step": 109
},
{
"epoch": 0.3822762814943527,
"grad_norm": 1.0745882987976074,
"learning_rate": 1.4163492910665153e-05,
"loss": 0.611,
"step": 110
},
{
"epoch": 0.38575152041702865,
"grad_norm": 1.0079902410507202,
"learning_rate": 1.4060482812625055e-05,
"loss": 0.6226,
"step": 111
},
{
"epoch": 0.38922675933970463,
"grad_norm": 1.0810552835464478,
"learning_rate": 1.395695417345675e-05,
"loss": 0.6257,
"step": 112
},
{
"epoch": 0.39270199826238056,
"grad_norm": 1.040677547454834,
"learning_rate": 1.3852920214212966e-05,
"loss": 0.5883,
"step": 113
},
{
"epoch": 0.3961772371850565,
"grad_norm": 1.0766505002975464,
"learning_rate": 1.3748394220477972e-05,
"loss": 0.5804,
"step": 114
},
{
"epoch": 0.3996524761077324,
"grad_norm": 1.0619298219680786,
"learning_rate": 1.3643389540670963e-05,
"loss": 0.6178,
"step": 115
},
{
"epoch": 0.40312771503040834,
"grad_norm": 1.019912838935852,
"learning_rate": 1.3537919584341413e-05,
"loss": 0.5959,
"step": 116
},
{
"epoch": 0.40660295395308427,
"grad_norm": 1.113372564315796,
"learning_rate": 1.3431997820456592e-05,
"loss": 0.6051,
"step": 117
},
{
"epoch": 0.4100781928757602,
"grad_norm": 1.0494154691696167,
"learning_rate": 1.3325637775681561e-05,
"loss": 0.599,
"step": 118
},
{
"epoch": 0.4135534317984361,
"grad_norm": 1.1271395683288574,
"learning_rate": 1.3218853032651719e-05,
"loss": 0.6322,
"step": 119
},
{
"epoch": 0.41702867072111205,
"grad_norm": 1.1663187742233276,
"learning_rate": 1.3111657228238263e-05,
"loss": 0.6209,
"step": 120
},
{
"epoch": 0.42050390964378803,
"grad_norm": 1.0917779207229614,
"learning_rate": 1.3004064051806712e-05,
"loss": 0.5832,
"step": 121
},
{
"epoch": 0.42397914856646396,
"grad_norm": 1.1407371759414673,
"learning_rate": 1.2896087243468673e-05,
"loss": 0.5645,
"step": 122
},
{
"epoch": 0.4274543874891399,
"grad_norm": 1.1023014783859253,
"learning_rate": 1.2787740592327232e-05,
"loss": 0.5921,
"step": 123
},
{
"epoch": 0.4309296264118158,
"grad_norm": 1.1019343137741089,
"learning_rate": 1.267903793471597e-05,
"loss": 0.5687,
"step": 124
},
{
"epoch": 0.43440486533449174,
"grad_norm": 1.0315438508987427,
"learning_rate": 1.2569993152432028e-05,
"loss": 0.5973,
"step": 125
},
{
"epoch": 0.43788010425716767,
"grad_norm": 1.1791157722473145,
"learning_rate": 1.2460620170963353e-05,
"loss": 0.6029,
"step": 126
},
{
"epoch": 0.4413553431798436,
"grad_norm": 1.1693004369735718,
"learning_rate": 1.2350932957710322e-05,
"loss": 0.5916,
"step": 127
},
{
"epoch": 0.4448305821025195,
"grad_norm": 1.023739218711853,
"learning_rate": 1.2240945520202079e-05,
"loss": 0.5557,
"step": 128
},
{
"epoch": 0.4483058210251955,
"grad_norm": 1.2074741125106812,
"learning_rate": 1.2130671904307692e-05,
"loss": 0.605,
"step": 129
},
{
"epoch": 0.45178105994787143,
"grad_norm": 1.0737248659133911,
"learning_rate": 1.202012619244243e-05,
"loss": 0.6198,
"step": 130
},
{
"epoch": 0.45525629887054736,
"grad_norm": 1.0830988883972168,
"learning_rate": 1.1909322501769407e-05,
"loss": 0.5757,
"step": 131
},
{
"epoch": 0.4587315377932233,
"grad_norm": 1.050476312637329,
"learning_rate": 1.1798274982396728e-05,
"loss": 0.5597,
"step": 132
},
{
"epoch": 0.4622067767158992,
"grad_norm": 0.9489808678627014,
"learning_rate": 1.1686997815570473e-05,
"loss": 0.5686,
"step": 133
},
{
"epoch": 0.46568201563857514,
"grad_norm": 1.009084701538086,
"learning_rate": 1.15755052118637e-05,
"loss": 0.6,
"step": 134
},
{
"epoch": 0.46915725456125107,
"grad_norm": 1.058876872062683,
"learning_rate": 1.1463811409361667e-05,
"loss": 0.5479,
"step": 135
},
{
"epoch": 0.472632493483927,
"grad_norm": 1.0262316465377808,
"learning_rate": 1.13519306718436e-05,
"loss": 0.6045,
"step": 136
},
{
"epoch": 0.476107732406603,
"grad_norm": 1.0066461563110352,
"learning_rate": 1.1239877286961123e-05,
"loss": 0.561,
"step": 137
},
{
"epoch": 0.4795829713292789,
"grad_norm": 0.9656773209571838,
"learning_rate": 1.112766556441367e-05,
"loss": 0.6139,
"step": 138
},
{
"epoch": 0.48305821025195483,
"grad_norm": 1.0848320722579956,
"learning_rate": 1.1015309834121083e-05,
"loss": 0.5811,
"step": 139
},
{
"epoch": 0.48653344917463076,
"grad_norm": 1.0625325441360474,
"learning_rate": 1.0902824444393602e-05,
"loss": 0.6299,
"step": 140
},
{
"epoch": 0.4900086880973067,
"grad_norm": 1.0536874532699585,
"learning_rate": 1.079022376009955e-05,
"loss": 0.5937,
"step": 141
},
{
"epoch": 0.4934839270199826,
"grad_norm": 1.0022127628326416,
"learning_rate": 1.067752216083085e-05,
"loss": 0.5718,
"step": 142
},
{
"epoch": 0.49695916594265854,
"grad_norm": 1.0636472702026367,
"learning_rate": 1.05647340390667e-05,
"loss": 0.6251,
"step": 143
},
{
"epoch": 0.5004344048653345,
"grad_norm": 1.0569506883621216,
"learning_rate": 1.0451873798335605e-05,
"loss": 0.5583,
"step": 144
},
{
"epoch": 0.5039096437880104,
"grad_norm": 1.04555344581604,
"learning_rate": 1.0338955851375962e-05,
"loss": 0.5893,
"step": 145
},
{
"epoch": 0.5073848827106864,
"grad_norm": 0.9304705858230591,
"learning_rate": 1.0225994618295507e-05,
"loss": 0.5414,
"step": 146
},
{
"epoch": 0.5108601216333623,
"grad_norm": 0.9779573082923889,
"learning_rate": 1.01130045247298e-05,
"loss": 0.5601,
"step": 147
},
{
"epoch": 0.5143353605560382,
"grad_norm": 1.0225908756256104,
"learning_rate": 1e-05,
"loss": 0.65,
"step": 148
},
{
"epoch": 0.5178105994787141,
"grad_norm": 0.9953768253326416,
"learning_rate": 9.886995475270205e-06,
"loss": 0.5905,
"step": 149
},
{
"epoch": 0.5212858384013901,
"grad_norm": 1.0968619585037231,
"learning_rate": 9.774005381704498e-06,
"loss": 0.5995,
"step": 150
},
{
"epoch": 0.5247610773240661,
"grad_norm": 0.9768278002738953,
"learning_rate": 9.661044148624038e-06,
"loss": 0.642,
"step": 151
},
{
"epoch": 0.5282363162467419,
"grad_norm": 1.0109375715255737,
"learning_rate": 9.548126201664398e-06,
"loss": 0.534,
"step": 152
},
{
"epoch": 0.5317115551694179,
"grad_norm": 1.0256866216659546,
"learning_rate": 9.435265960933304e-06,
"loss": 0.6327,
"step": 153
},
{
"epoch": 0.5351867940920938,
"grad_norm": 0.9121344089508057,
"learning_rate": 9.322477839169156e-06,
"loss": 0.5787,
"step": 154
},
{
"epoch": 0.5386620330147698,
"grad_norm": 0.9741325974464417,
"learning_rate": 9.209776239900453e-06,
"loss": 0.5954,
"step": 155
},
{
"epoch": 0.5421372719374457,
"grad_norm": 1.0003210306167603,
"learning_rate": 9.097175555606396e-06,
"loss": 0.5514,
"step": 156
},
{
"epoch": 0.5456125108601216,
"grad_norm": 1.0428141355514526,
"learning_rate": 8.98469016587892e-06,
"loss": 0.5798,
"step": 157
},
{
"epoch": 0.5490877497827976,
"grad_norm": 1.0282821655273438,
"learning_rate": 8.872334435586333e-06,
"loss": 0.5911,
"step": 158
},
{
"epoch": 0.5525629887054735,
"grad_norm": 0.9479502439498901,
"learning_rate": 8.76012271303888e-06,
"loss": 0.6158,
"step": 159
},
{
"epoch": 0.5560382276281495,
"grad_norm": 1.0172061920166016,
"learning_rate": 8.648069328156403e-06,
"loss": 0.5601,
"step": 160
},
{
"epoch": 0.5595134665508253,
"grad_norm": 1.0053855180740356,
"learning_rate": 8.536188590638334e-06,
"loss": 0.6152,
"step": 161
},
{
"epoch": 0.5629887054735013,
"grad_norm": 0.974694550037384,
"learning_rate": 8.424494788136303e-06,
"loss": 0.5666,
"step": 162
},
{
"epoch": 0.5664639443961772,
"grad_norm": 0.9912855625152588,
"learning_rate": 8.313002184429529e-06,
"loss": 0.5695,
"step": 163
},
{
"epoch": 0.5699391833188532,
"grad_norm": 0.962709367275238,
"learning_rate": 8.201725017603277e-06,
"loss": 0.5357,
"step": 164
},
{
"epoch": 0.573414422241529,
"grad_norm": 0.9394634962081909,
"learning_rate": 8.090677498230598e-06,
"loss": 0.564,
"step": 165
},
{
"epoch": 0.576889661164205,
"grad_norm": 0.9721453785896301,
"learning_rate": 7.97987380755757e-06,
"loss": 0.5933,
"step": 166
},
{
"epoch": 0.580364900086881,
"grad_norm": 1.0866742134094238,
"learning_rate": 7.869328095692313e-06,
"loss": 0.5811,
"step": 167
},
{
"epoch": 0.5838401390095569,
"grad_norm": 0.9584195613861084,
"learning_rate": 7.759054479797924e-06,
"loss": 0.5816,
"step": 168
},
{
"epoch": 0.5873153779322329,
"grad_norm": 1.0240747928619385,
"learning_rate": 7.649067042289681e-06,
"loss": 0.5688,
"step": 169
},
{
"epoch": 0.5907906168549087,
"grad_norm": 1.0707347393035889,
"learning_rate": 7.539379829036652e-06,
"loss": 0.5823,
"step": 170
},
{
"epoch": 0.5942658557775847,
"grad_norm": 0.9794528484344482,
"learning_rate": 7.430006847567972e-06,
"loss": 0.5933,
"step": 171
},
{
"epoch": 0.5977410947002606,
"grad_norm": 1.1342881917953491,
"learning_rate": 7.320962065284032e-06,
"loss": 0.5611,
"step": 172
},
{
"epoch": 0.6012163336229366,
"grad_norm": 1.0687241554260254,
"learning_rate": 7.2122594076727705e-06,
"loss": 0.6206,
"step": 173
},
{
"epoch": 0.6046915725456126,
"grad_norm": 1.1404447555541992,
"learning_rate": 7.1039127565313285e-06,
"loss": 0.619,
"step": 174
},
{
"epoch": 0.6081668114682884,
"grad_norm": 1.0000810623168945,
"learning_rate": 6.995935948193294e-06,
"loss": 0.5557,
"step": 175
},
{
"epoch": 0.6116420503909644,
"grad_norm": 0.9445539712905884,
"learning_rate": 6.888342771761737e-06,
"loss": 0.5392,
"step": 176
},
{
"epoch": 0.6151172893136403,
"grad_norm": 0.9872927069664001,
"learning_rate": 6.781146967348283e-06,
"loss": 0.557,
"step": 177
},
{
"epoch": 0.6185925282363163,
"grad_norm": 1.1482657194137573,
"learning_rate": 6.6743622243184405e-06,
"loss": 0.6229,
"step": 178
},
{
"epoch": 0.6220677671589921,
"grad_norm": 0.9911180138587952,
"learning_rate": 6.568002179543409e-06,
"loss": 0.5777,
"step": 179
},
{
"epoch": 0.6255430060816681,
"grad_norm": 1.0723944902420044,
"learning_rate": 6.462080415658591e-06,
"loss": 0.5546,
"step": 180
},
{
"epoch": 0.629018245004344,
"grad_norm": 1.0823560953140259,
"learning_rate": 6.356610459329038e-06,
"loss": 0.5811,
"step": 181
},
{
"epoch": 0.63249348392702,
"grad_norm": 1.0165990591049194,
"learning_rate": 6.251605779522032e-06,
"loss": 0.5626,
"step": 182
},
{
"epoch": 0.635968722849696,
"grad_norm": 0.9735214114189148,
"learning_rate": 6.147079785787038e-06,
"loss": 0.5326,
"step": 183
},
{
"epoch": 0.6394439617723718,
"grad_norm": 0.993291974067688,
"learning_rate": 6.043045826543254e-06,
"loss": 0.5609,
"step": 184
},
{
"epoch": 0.6429192006950478,
"grad_norm": 0.9579317569732666,
"learning_rate": 5.93951718737495e-06,
"loss": 0.5593,
"step": 185
},
{
"epoch": 0.6463944396177237,
"grad_norm": 0.8985302448272705,
"learning_rate": 5.836507089334849e-06,
"loss": 0.5391,
"step": 186
},
{
"epoch": 0.6498696785403997,
"grad_norm": 1.0520668029785156,
"learning_rate": 5.7340286872557515e-06,
"loss": 0.6108,
"step": 187
},
{
"epoch": 0.6533449174630755,
"grad_norm": 0.9960751533508301,
"learning_rate": 5.6320950680705826e-06,
"loss": 0.5867,
"step": 188
},
{
"epoch": 0.6568201563857515,
"grad_norm": 1.0133719444274902,
"learning_rate": 5.530719249141148e-06,
"loss": 0.5313,
"step": 189
},
{
"epoch": 0.6602953953084274,
"grad_norm": 0.9764801859855652,
"learning_rate": 5.429914176595772e-06,
"loss": 0.559,
"step": 190
},
{
"epoch": 0.6637706342311034,
"grad_norm": 1.0536527633666992,
"learning_rate": 5.329692723675994e-06,
"loss": 0.5625,
"step": 191
},
{
"epoch": 0.6672458731537794,
"grad_norm": 1.139304280281067,
"learning_rate": 5.230067689092629e-06,
"loss": 0.5941,
"step": 192
},
{
"epoch": 0.6707211120764552,
"grad_norm": 1.0412135124206543,
"learning_rate": 5.131051795391302e-06,
"loss": 0.6064,
"step": 193
},
{
"epoch": 0.6741963509991312,
"grad_norm": 1.0304534435272217,
"learning_rate": 5.03265768732772e-06,
"loss": 0.5576,
"step": 194
},
{
"epoch": 0.6776715899218071,
"grad_norm": 0.9016774892807007,
"learning_rate": 4.934897930252887e-06,
"loss": 0.59,
"step": 195
},
{
"epoch": 0.6811468288444831,
"grad_norm": 0.9751427173614502,
"learning_rate": 4.837785008508462e-06,
"loss": 0.587,
"step": 196
},
{
"epoch": 0.684622067767159,
"grad_norm": 0.9640844464302063,
"learning_rate": 4.7413313238324556e-06,
"loss": 0.5332,
"step": 197
},
{
"epoch": 0.6880973066898349,
"grad_norm": 1.0356625318527222,
"learning_rate": 4.645549193775452e-06,
"loss": 0.5798,
"step": 198
},
{
"epoch": 0.6915725456125109,
"grad_norm": 0.9761075377464294,
"learning_rate": 4.550450850127626e-06,
"loss": 0.5669,
"step": 199
},
{
"epoch": 0.6950477845351868,
"grad_norm": 0.9727676510810852,
"learning_rate": 4.4560484373566945e-06,
"loss": 0.5526,
"step": 200
},
{
"epoch": 0.6985230234578628,
"grad_norm": 0.9600728750228882,
"learning_rate": 4.3623540110569935e-06,
"loss": 0.5225,
"step": 201
},
{
"epoch": 0.7019982623805386,
"grad_norm": 1.075377345085144,
"learning_rate": 4.26937953640995e-06,
"loss": 0.6123,
"step": 202
},
{
"epoch": 0.7054735013032146,
"grad_norm": 0.9740473031997681,
"learning_rate": 4.177136886656067e-06,
"loss": 0.5718,
"step": 203
},
{
"epoch": 0.7089487402258905,
"grad_norm": 0.894684374332428,
"learning_rate": 4.085637841578652e-06,
"loss": 0.6006,
"step": 204
},
{
"epoch": 0.7124239791485665,
"grad_norm": 1.0192121267318726,
"learning_rate": 3.9948940859994964e-06,
"loss": 0.5745,
"step": 205
},
{
"epoch": 0.7158992180712423,
"grad_norm": 1.0569602251052856,
"learning_rate": 3.9049172082866786e-06,
"loss": 0.6085,
"step": 206
},
{
"epoch": 0.7193744569939183,
"grad_norm": 1.0002917051315308,
"learning_rate": 3.815718698874672e-06,
"loss": 0.5586,
"step": 207
},
{
"epoch": 0.7228496959165943,
"grad_norm": 0.9531834125518799,
"learning_rate": 3.727309948796963e-06,
"loss": 0.5927,
"step": 208
},
{
"epoch": 0.7263249348392702,
"grad_norm": 1.045615553855896,
"learning_rate": 3.6397022482313804e-06,
"loss": 0.5594,
"step": 209
},
{
"epoch": 0.7298001737619462,
"grad_norm": 0.9334245324134827,
"learning_rate": 3.552906785058278e-06,
"loss": 0.5364,
"step": 210
},
{
"epoch": 0.733275412684622,
"grad_norm": 0.9601573348045349,
"learning_rate": 3.466934643431795e-06,
"loss": 0.5571,
"step": 211
},
{
"epoch": 0.736750651607298,
"grad_norm": 0.95643150806427,
"learning_rate": 3.3817968023643766e-06,
"loss": 0.5717,
"step": 212
},
{
"epoch": 0.7402258905299739,
"grad_norm": 0.9626767039299011,
"learning_rate": 3.2975041343246937e-06,
"loss": 0.54,
"step": 213
},
{
"epoch": 0.7437011294526499,
"grad_norm": 0.9196009635925293,
"learning_rate": 3.214067403849179e-06,
"loss": 0.5807,
"step": 214
},
{
"epoch": 0.7471763683753258,
"grad_norm": 1.049148440361023,
"learning_rate": 3.1314972661673572e-06,
"loss": 0.6214,
"step": 215
},
{
"epoch": 0.7506516072980017,
"grad_norm": 0.9988502264022827,
"learning_rate": 3.0498042658411276e-06,
"loss": 0.5561,
"step": 216
},
{
"epoch": 0.7541268462206777,
"grad_norm": 1.0174543857574463,
"learning_rate": 2.9689988354181742e-06,
"loss": 0.5698,
"step": 217
},
{
"epoch": 0.7576020851433536,
"grad_norm": 1.0161805152893066,
"learning_rate": 2.8890912940996784e-06,
"loss": 0.5478,
"step": 218
},
{
"epoch": 0.7610773240660296,
"grad_norm": 0.9427905082702637,
"learning_rate": 2.8100918464225304e-06,
"loss": 0.5637,
"step": 219
},
{
"epoch": 0.7645525629887054,
"grad_norm": 1.0057843923568726,
"learning_rate": 2.7320105809561415e-06,
"loss": 0.5674,
"step": 220
},
{
"epoch": 0.7680278019113814,
"grad_norm": 1.013282299041748,
"learning_rate": 2.654857469014113e-06,
"loss": 0.5863,
"step": 221
},
{
"epoch": 0.7715030408340573,
"grad_norm": 0.980964720249176,
"learning_rate": 2.5786423633808487e-06,
"loss": 0.573,
"step": 222
},
{
"epoch": 0.7749782797567333,
"grad_norm": 0.9535639882087708,
"learning_rate": 2.5033749970533015e-06,
"loss": 0.5594,
"step": 223
},
{
"epoch": 0.7784535186794093,
"grad_norm": 0.9691473841667175,
"learning_rate": 2.4290649819980404e-06,
"loss": 0.5937,
"step": 224
},
{
"epoch": 0.7819287576020851,
"grad_norm": 0.9758164286613464,
"learning_rate": 2.3557218079237608e-06,
"loss": 0.5412,
"step": 225
},
{
"epoch": 0.7854039965247611,
"grad_norm": 0.9405572414398193,
"learning_rate": 2.283354841069403e-06,
"loss": 0.5655,
"step": 226
},
{
"epoch": 0.788879235447437,
"grad_norm": 0.8931716680526733,
"learning_rate": 2.211973323008041e-06,
"loss": 0.5258,
"step": 227
},
{
"epoch": 0.792354474370113,
"grad_norm": 0.9230597019195557,
"learning_rate": 2.1415863694666973e-06,
"loss": 0.5538,
"step": 228
},
{
"epoch": 0.7958297132927888,
"grad_norm": 0.9798585176467896,
"learning_rate": 2.072202969162234e-06,
"loss": 0.5556,
"step": 229
},
{
"epoch": 0.7993049522154648,
"grad_norm": 0.9468632340431213,
"learning_rate": 2.0038319826534312e-06,
"loss": 0.597,
"step": 230
},
{
"epoch": 0.8027801911381407,
"grad_norm": 0.9490894079208374,
"learning_rate": 1.936482141209486e-06,
"loss": 0.5645,
"step": 231
},
{
"epoch": 0.8062554300608167,
"grad_norm": 0.9810106754302979,
"learning_rate": 1.870162045694971e-06,
"loss": 0.5281,
"step": 232
},
{
"epoch": 0.8097306689834927,
"grad_norm": 0.9110345840454102,
"learning_rate": 1.8048801654714687e-06,
"loss": 0.5774,
"step": 233
},
{
"epoch": 0.8132059079061685,
"grad_norm": 0.9290068745613098,
"learning_rate": 1.7406448373160024e-06,
"loss": 0.6053,
"step": 234
},
{
"epoch": 0.8166811468288445,
"grad_norm": 0.9238559603691101,
"learning_rate": 1.6774642643563955e-06,
"loss": 0.5125,
"step": 235
},
{
"epoch": 0.8201563857515204,
"grad_norm": 1.0130783319473267,
"learning_rate": 1.615346515023698e-06,
"loss": 0.5873,
"step": 236
},
{
"epoch": 0.8236316246741964,
"grad_norm": 0.9619508981704712,
"learning_rate": 1.5542995220217961e-06,
"loss": 0.5979,
"step": 237
},
{
"epoch": 0.8271068635968722,
"grad_norm": 1.0197911262512207,
"learning_rate": 1.4943310813144006e-06,
"loss": 0.6156,
"step": 238
},
{
"epoch": 0.8305821025195482,
"grad_norm": 0.9349676370620728,
"learning_rate": 1.4354488511294418e-06,
"loss": 0.5689,
"step": 239
},
{
"epoch": 0.8340573414422241,
"grad_norm": 1.0270828008651733,
"learning_rate": 1.3776603509810938e-06,
"loss": 0.5397,
"step": 240
},
{
"epoch": 0.8375325803649001,
"grad_norm": 0.9381040334701538,
"learning_rate": 1.3209729607095022e-06,
"loss": 0.542,
"step": 241
},
{
"epoch": 0.8410078192875761,
"grad_norm": 1.0486862659454346,
"learning_rate": 1.2653939195383448e-06,
"loss": 0.5886,
"step": 242
},
{
"epoch": 0.8444830582102519,
"grad_norm": 0.9653035402297974,
"learning_rate": 1.2109303251503434e-06,
"loss": 0.5893,
"step": 243
},
{
"epoch": 0.8479582971329279,
"grad_norm": 0.9536635279655457,
"learning_rate": 1.1575891327808664e-06,
"loss": 0.5728,
"step": 244
},
{
"epoch": 0.8514335360556038,
"grad_norm": 0.9552822113037109,
"learning_rate": 1.1053771543297198e-06,
"loss": 0.5335,
"step": 245
},
{
"epoch": 0.8549087749782798,
"grad_norm": 0.9501549005508423,
"learning_rate": 1.0543010574912305e-06,
"loss": 0.5696,
"step": 246
},
{
"epoch": 0.8583840139009556,
"grad_norm": 0.9457242488861084,
"learning_rate": 1.0043673649027519e-06,
"loss": 0.5974,
"step": 247
},
{
"epoch": 0.8618592528236316,
"grad_norm": 0.9081954956054688,
"learning_rate": 9.555824533117064e-07,
"loss": 0.5984,
"step": 248
},
{
"epoch": 0.8653344917463076,
"grad_norm": 0.9678678512573242,
"learning_rate": 9.079525527612321e-07,
"loss": 0.5463,
"step": 249
},
{
"epoch": 0.8688097306689835,
"grad_norm": 0.9402781128883362,
"learning_rate": 8.614837457945868e-07,
"loss": 0.5831,
"step": 250
},
{
"epoch": 0.8722849695916595,
"grad_norm": 0.8864692449569702,
"learning_rate": 8.161819666783888e-07,
"loss": 0.6336,
"step": 251
},
{
"epoch": 0.8757602085143353,
"grad_norm": 0.9280177354812622,
"learning_rate": 7.720530006447735e-07,
"loss": 0.5792,
"step": 252
},
{
"epoch": 0.8792354474370113,
"grad_norm": 0.9325032830238342,
"learning_rate": 7.291024831525961e-07,
"loss": 0.5374,
"step": 253
},
{
"epoch": 0.8827106863596872,
"grad_norm": 0.8580663800239563,
"learning_rate": 6.87335899167767e-07,
"loss": 0.5319,
"step": 254
},
{
"epoch": 0.8861859252823632,
"grad_norm": 0.98850017786026,
"learning_rate": 6.467585824627886e-07,
"loss": 0.6334,
"step": 255
},
{
"epoch": 0.889661164205039,
"grad_norm": 0.9273893237113953,
"learning_rate": 6.073757149356185e-07,
"loss": 0.5404,
"step": 256
},
{
"epoch": 0.893136403127715,
"grad_norm": 0.9261394739151001,
"learning_rate": 5.691923259479093e-07,
"loss": 0.553,
"step": 257
},
{
"epoch": 0.896611642050391,
"grad_norm": 0.9248658418655396,
"learning_rate": 5.322132916827483e-07,
"loss": 0.5835,
"step": 258
},
{
"epoch": 0.9000868809730669,
"grad_norm": 0.9358659982681274,
"learning_rate": 4.964433345219354e-07,
"loss": 0.6004,
"step": 259
},
{
"epoch": 0.9035621198957429,
"grad_norm": 1.0171351432800293,
"learning_rate": 4.6188702244292614e-07,
"loss": 0.5684,
"step": 260
},
{
"epoch": 0.9070373588184187,
"grad_norm": 0.9322757720947266,
"learning_rate": 4.285487684354772e-07,
"loss": 0.5311,
"step": 261
},
{
"epoch": 0.9105125977410947,
"grad_norm": 0.9555125832557678,
"learning_rate": 3.96432829938086e-07,
"loss": 0.5627,
"step": 262
},
{
"epoch": 0.9139878366637706,
"grad_norm": 0.8531783223152161,
"learning_rate": 3.6554330829429716e-07,
"loss": 0.5249,
"step": 263
},
{
"epoch": 0.9174630755864466,
"grad_norm": 0.9125425815582275,
"learning_rate": 3.3588414822895097e-07,
"loss": 0.5257,
"step": 264
},
{
"epoch": 0.9209383145091226,
"grad_norm": 0.9245027899742126,
"learning_rate": 3.0745913734441357e-07,
"loss": 0.5328,
"step": 265
},
{
"epoch": 0.9244135534317984,
"grad_norm": 0.9753005504608154,
"learning_rate": 2.8027190563689745e-07,
"loss": 0.5431,
"step": 266
},
{
"epoch": 0.9278887923544744,
"grad_norm": 0.9659878611564636,
"learning_rate": 2.5432592503288e-07,
"loss": 0.5794,
"step": 267
},
{
"epoch": 0.9313640312771503,
"grad_norm": 1.0079331398010254,
"learning_rate": 2.2962450894573606e-07,
"loss": 0.5434,
"step": 268
},
{
"epoch": 0.9348392701998263,
"grad_norm": 0.9261694550514221,
"learning_rate": 2.0617081185259512e-07,
"loss": 0.5718,
"step": 269
},
{
"epoch": 0.9383145091225021,
"grad_norm": 0.8658697009086609,
"learning_rate": 1.8396782889150144e-07,
"loss": 0.5553,
"step": 270
},
{
"epoch": 0.9417897480451781,
"grad_norm": 0.9265699982643127,
"learning_rate": 1.630183954789233e-07,
"loss": 0.551,
"step": 271
},
{
"epoch": 0.945264986967854,
"grad_norm": 0.9904604554176331,
"learning_rate": 1.4332518694765708e-07,
"loss": 0.5113,
"step": 272
},
{
"epoch": 0.94874022589053,
"grad_norm": 0.9163973331451416,
"learning_rate": 1.2489071820517394e-07,
"loss": 0.587,
"step": 273
},
{
"epoch": 0.952215464813206,
"grad_norm": 0.8994106650352478,
"learning_rate": 1.0771734341246121e-07,
"loss": 0.5273,
"step": 274
},
{
"epoch": 0.9556907037358818,
"grad_norm": 0.9306617975234985,
"learning_rate": 9.180725568338045e-08,
"loss": 0.5598,
"step": 275
},
{
"epoch": 0.9591659426585578,
"grad_norm": 0.9544433355331421,
"learning_rate": 7.716248680459726e-08,
"loss": 0.5608,
"step": 276
},
{
"epoch": 0.9626411815812337,
"grad_norm": 0.9377245903015137,
"learning_rate": 6.378490697611761e-08,
"loss": 0.5477,
"step": 277
},
{
"epoch": 0.9661164205039097,
"grad_norm": 0.8245200514793396,
"learning_rate": 5.1676224572452246e-08,
"loss": 0.5014,
"step": 278
},
{
"epoch": 0.9695916594265855,
"grad_norm": 0.8969350457191467,
"learning_rate": 4.083798592444899e-08,
"loss": 0.5899,
"step": 279
},
{
"epoch": 0.9730668983492615,
"grad_norm": 0.9808463454246521,
"learning_rate": 3.127157512182288e-08,
"loss": 0.5642,
"step": 280
},
{
"epoch": 0.9765421372719374,
"grad_norm": 0.9421252608299255,
"learning_rate": 2.2978213836400974e-08,
"loss": 0.5448,
"step": 281
},
{
"epoch": 0.9800173761946134,
"grad_norm": 0.9303473830223083,
"learning_rate": 1.5958961166104847e-08,
"loss": 0.5947,
"step": 282
},
{
"epoch": 0.9834926151172894,
"grad_norm": 1.075130820274353,
"learning_rate": 1.0214713499706596e-08,
"loss": 0.5925,
"step": 283
},
{
"epoch": 0.9869678540399652,
"grad_norm": 0.9997183680534363,
"learning_rate": 5.7462044023515186e-09,
"loss": 0.6126,
"step": 284
},
{
"epoch": 0.9904430929626412,
"grad_norm": 0.8891317248344421,
"learning_rate": 2.5540045218819256e-09,
"loss": 0.5529,
"step": 285
},
{
"epoch": 0.9939183318853171,
"grad_norm": 0.8776668310165405,
"learning_rate": 6.385215159565583e-10,
"loss": 0.5456,
"step": 286
},
{
"epoch": 0.9973935708079931,
"grad_norm": 0.8966051340103149,
"learning_rate": 0.0,
"loss": 0.5303,
"step": 287
},
{
"epoch": 0.9973935708079931,
"step": 287,
"total_flos": 2.698045158024282e+18,
"train_loss": 0.5919382667707649,
"train_runtime": 4471.5794,
"train_samples_per_second": 16.469,
"train_steps_per_second": 0.064
}
],
"logging_steps": 1.0,
"max_steps": 287,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 400,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.698045158024282e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}