bloomz_lora_mn/last-checkpoint/trainer_state.json
Billyyy: Training in progress, step 5493, checkpoint (commit 2ca9fb7, verified)
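The JSON below is the Hugging Face Trainer state for this LoRA checkpoint: log_history records loss, grad_norm and learning_rate every 10 global steps, and an eval_loss entry is appended every 1000 steps (eval_steps). A minimal sketch of how one might plot the curves from this file is shown next; it assumes the file has been saved locally as trainer_state.json and that matplotlib is installed, neither of which is part of the checkpoint itself.

# Minimal sketch (assumptions: local copy named "trainer_state.json", matplotlib available).
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Training entries carry "loss"; evaluation entries carry "eval_loss".
train = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

plt.plot(*zip(*train), label="train loss")
plt.plot(*zip(*evals), marker="o", label="eval loss")
plt.xlabel("global step")
plt.ylabel("loss")
plt.legend()
plt.show()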
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 1000,
"global_step": 5493,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0018204988166757691,
"grad_norm": 900.346923828125,
"learning_rate": 2.0000000000000003e-06,
"loss": 20.2441,
"step": 10
},
{
"epoch": 0.0036409976333515383,
"grad_norm": 939.3712158203125,
"learning_rate": 4.000000000000001e-06,
"loss": 19.6193,
"step": 20
},
{
"epoch": 0.005461496450027308,
"grad_norm": 562.291015625,
"learning_rate": 6e-06,
"loss": 18.7898,
"step": 30
},
{
"epoch": 0.007281995266703077,
"grad_norm": 447.43060302734375,
"learning_rate": 8.000000000000001e-06,
"loss": 17.2234,
"step": 40
},
{
"epoch": 0.009102494083378846,
"grad_norm": 361.34100341796875,
"learning_rate": 1e-05,
"loss": 15.4955,
"step": 50
},
{
"epoch": 0.010922992900054615,
"grad_norm": 320.7839660644531,
"learning_rate": 1.2e-05,
"loss": 13.6977,
"step": 60
},
{
"epoch": 0.012743491716730384,
"grad_norm": 131.0957794189453,
"learning_rate": 1.4000000000000001e-05,
"loss": 11.4716,
"step": 70
},
{
"epoch": 0.014563990533406153,
"grad_norm": 75.75025177001953,
"learning_rate": 1.6000000000000003e-05,
"loss": 9.5351,
"step": 80
},
{
"epoch": 0.016384489350081924,
"grad_norm": 42.2626953125,
"learning_rate": 1.8e-05,
"loss": 8.1668,
"step": 90
},
{
"epoch": 0.018204988166757693,
"grad_norm": 25.88639259338379,
"learning_rate": 2e-05,
"loss": 7.3242,
"step": 100
},
{
"epoch": 0.02002548698343346,
"grad_norm": 16.043380737304688,
"learning_rate": 2.2000000000000003e-05,
"loss": 6.6513,
"step": 110
},
{
"epoch": 0.02184598580010923,
"grad_norm": 15.031912803649902,
"learning_rate": 2.4e-05,
"loss": 6.1476,
"step": 120
},
{
"epoch": 0.023666484616785,
"grad_norm": 8.564423561096191,
"learning_rate": 2.6000000000000002e-05,
"loss": 5.7499,
"step": 130
},
{
"epoch": 0.02548698343346077,
"grad_norm": 9.205732345581055,
"learning_rate": 2.8000000000000003e-05,
"loss": 5.4031,
"step": 140
},
{
"epoch": 0.027307482250136537,
"grad_norm": 7.424269199371338,
"learning_rate": 3e-05,
"loss": 5.1054,
"step": 150
},
{
"epoch": 0.029127981066812306,
"grad_norm": 7.239726543426514,
"learning_rate": 3.2000000000000005e-05,
"loss": 4.9637,
"step": 160
},
{
"epoch": 0.030948479883488075,
"grad_norm": 9.377843856811523,
"learning_rate": 3.4000000000000007e-05,
"loss": 4.7583,
"step": 170
},
{
"epoch": 0.03276897870016385,
"grad_norm": 5.893341064453125,
"learning_rate": 3.6e-05,
"loss": 4.7093,
"step": 180
},
{
"epoch": 0.03458947751683961,
"grad_norm": 5.311996936798096,
"learning_rate": 3.8e-05,
"loss": 4.5147,
"step": 190
},
{
"epoch": 0.036409976333515386,
"grad_norm": 5.311359882354736,
"learning_rate": 4e-05,
"loss": 4.5004,
"step": 200
},
{
"epoch": 0.03823047515019115,
"grad_norm": 8.67163372039795,
"learning_rate": 4.2e-05,
"loss": 4.3923,
"step": 210
},
{
"epoch": 0.04005097396686692,
"grad_norm": 5.317925930023193,
"learning_rate": 4.4000000000000006e-05,
"loss": 4.3169,
"step": 220
},
{
"epoch": 0.04187147278354269,
"grad_norm": 10.292462348937988,
"learning_rate": 4.600000000000001e-05,
"loss": 4.2887,
"step": 230
},
{
"epoch": 0.04369197160021846,
"grad_norm": 6.532808303833008,
"learning_rate": 4.8e-05,
"loss": 4.2458,
"step": 240
},
{
"epoch": 0.04551247041689423,
"grad_norm": 5.974935531616211,
"learning_rate": 5e-05,
"loss": 4.1716,
"step": 250
},
{
"epoch": 0.04733296923357,
"grad_norm": 10.58936595916748,
"learning_rate": 5.2000000000000004e-05,
"loss": 4.0606,
"step": 260
},
{
"epoch": 0.049153468050245765,
"grad_norm": 11.461891174316406,
"learning_rate": 5.4000000000000005e-05,
"loss": 4.1349,
"step": 270
},
{
"epoch": 0.05097396686692154,
"grad_norm": 8.389955520629883,
"learning_rate": 5.6000000000000006e-05,
"loss": 4.044,
"step": 280
},
{
"epoch": 0.0527944656835973,
"grad_norm": 5.730175018310547,
"learning_rate": 5.8e-05,
"loss": 4.0169,
"step": 290
},
{
"epoch": 0.054614964500273075,
"grad_norm": 8.236620903015137,
"learning_rate": 6e-05,
"loss": 3.8992,
"step": 300
},
{
"epoch": 0.05643546331694885,
"grad_norm": 9.377148628234863,
"learning_rate": 6.2e-05,
"loss": 3.9041,
"step": 310
},
{
"epoch": 0.05825596213362461,
"grad_norm": 8.225547790527344,
"learning_rate": 6.400000000000001e-05,
"loss": 3.8243,
"step": 320
},
{
"epoch": 0.060076460950300385,
"grad_norm": 6.899202823638916,
"learning_rate": 6.6e-05,
"loss": 3.8613,
"step": 330
},
{
"epoch": 0.06189695976697615,
"grad_norm": 7.693598747253418,
"learning_rate": 6.800000000000001e-05,
"loss": 3.7624,
"step": 340
},
{
"epoch": 0.06371745858365192,
"grad_norm": 6.594521522521973,
"learning_rate": 7e-05,
"loss": 3.7709,
"step": 350
},
{
"epoch": 0.0655379574003277,
"grad_norm": 7.71303129196167,
"learning_rate": 7.2e-05,
"loss": 3.699,
"step": 360
},
{
"epoch": 0.06735845621700345,
"grad_norm": 11.58485221862793,
"learning_rate": 7.4e-05,
"loss": 3.6711,
"step": 370
},
{
"epoch": 0.06917895503367923,
"grad_norm": 12.817239761352539,
"learning_rate": 7.6e-05,
"loss": 3.6647,
"step": 380
},
{
"epoch": 0.070999453850355,
"grad_norm": 11.794710159301758,
"learning_rate": 7.800000000000001e-05,
"loss": 3.6485,
"step": 390
},
{
"epoch": 0.07281995266703077,
"grad_norm": 7.5003509521484375,
"learning_rate": 8e-05,
"loss": 3.63,
"step": 400
},
{
"epoch": 0.07464045148370653,
"grad_norm": 7.507719993591309,
"learning_rate": 8.2e-05,
"loss": 3.5663,
"step": 410
},
{
"epoch": 0.0764609503003823,
"grad_norm": 10.392959594726562,
"learning_rate": 8.4e-05,
"loss": 3.5462,
"step": 420
},
{
"epoch": 0.07828144911705807,
"grad_norm": 10.792546272277832,
"learning_rate": 8.6e-05,
"loss": 3.489,
"step": 430
},
{
"epoch": 0.08010194793373385,
"grad_norm": 8.886263847351074,
"learning_rate": 8.800000000000001e-05,
"loss": 3.507,
"step": 440
},
{
"epoch": 0.0819224467504096,
"grad_norm": 10.633005142211914,
"learning_rate": 9e-05,
"loss": 3.4786,
"step": 450
},
{
"epoch": 0.08374294556708538,
"grad_norm": 8.927319526672363,
"learning_rate": 9.200000000000001e-05,
"loss": 3.4799,
"step": 460
},
{
"epoch": 0.08556344438376115,
"grad_norm": 7.764442443847656,
"learning_rate": 9.4e-05,
"loss": 3.4424,
"step": 470
},
{
"epoch": 0.08738394320043692,
"grad_norm": 7.294579982757568,
"learning_rate": 9.6e-05,
"loss": 3.3888,
"step": 480
},
{
"epoch": 0.0892044420171127,
"grad_norm": 12.751729965209961,
"learning_rate": 9.8e-05,
"loss": 3.4342,
"step": 490
},
{
"epoch": 0.09102494083378845,
"grad_norm": 12.688404083251953,
"learning_rate": 0.0001,
"loss": 3.412,
"step": 500
},
{
"epoch": 0.09284543965046423,
"grad_norm": 8.39454174041748,
"learning_rate": 9.99990102735217e-05,
"loss": 3.3787,
"step": 510
},
{
"epoch": 0.09466593846714,
"grad_norm": 8.661295890808105,
"learning_rate": 9.999604113326911e-05,
"loss": 3.3499,
"step": 520
},
{
"epoch": 0.09648643728381577,
"grad_norm": 11.745123863220215,
"learning_rate": 9.999109269678773e-05,
"loss": 3.3025,
"step": 530
},
{
"epoch": 0.09830693610049153,
"grad_norm": 11.857743263244629,
"learning_rate": 9.998416515998146e-05,
"loss": 3.3443,
"step": 540
},
{
"epoch": 0.1001274349171673,
"grad_norm": 10.025672912597656,
"learning_rate": 9.997525879710501e-05,
"loss": 3.2624,
"step": 550
},
{
"epoch": 0.10194793373384307,
"grad_norm": 11.870292663574219,
"learning_rate": 9.996437396075289e-05,
"loss": 3.3323,
"step": 560
},
{
"epoch": 0.10376843255051885,
"grad_norm": 11.637451171875,
"learning_rate": 9.995151108184551e-05,
"loss": 3.2349,
"step": 570
},
{
"epoch": 0.1055889313671946,
"grad_norm": 21.10885238647461,
"learning_rate": 9.993667066961219e-05,
"loss": 3.3025,
"step": 580
},
{
"epoch": 0.10740943018387038,
"grad_norm": 19.601778030395508,
"learning_rate": 9.991985331157083e-05,
"loss": 3.2509,
"step": 590
},
{
"epoch": 0.10922992900054615,
"grad_norm": 15.462264060974121,
"learning_rate": 9.990105967350486e-05,
"loss": 3.2197,
"step": 600
},
{
"epoch": 0.11105042781722192,
"grad_norm": 10.48941421508789,
"learning_rate": 9.98802904994367e-05,
"loss": 3.2523,
"step": 610
},
{
"epoch": 0.1128709266338977,
"grad_norm": 11.569725036621094,
"learning_rate": 9.985754661159844e-05,
"loss": 3.192,
"step": 620
},
{
"epoch": 0.11469142545057345,
"grad_norm": 7.362033843994141,
"learning_rate": 9.983282891039914e-05,
"loss": 3.2174,
"step": 630
},
{
"epoch": 0.11651192426724923,
"grad_norm": 7.256021022796631,
"learning_rate": 9.98061383743894e-05,
"loss": 3.1671,
"step": 640
},
{
"epoch": 0.118332423083925,
"grad_norm": 8.411303520202637,
"learning_rate": 9.97774760602224e-05,
"loss": 3.1793,
"step": 650
},
{
"epoch": 0.12015292190060077,
"grad_norm": 9.6874361038208,
"learning_rate": 9.97468431026122e-05,
"loss": 3.167,
"step": 660
},
{
"epoch": 0.12197342071727653,
"grad_norm": 9.121397972106934,
"learning_rate": 9.971424071428877e-05,
"loss": 3.1107,
"step": 670
},
{
"epoch": 0.1237939195339523,
"grad_norm": 13.291868209838867,
"learning_rate": 9.967967018594997e-05,
"loss": 3.1678,
"step": 680
},
{
"epoch": 0.12561441835062806,
"grad_norm": 10.143365859985352,
"learning_rate": 9.96431328862105e-05,
"loss": 3.1591,
"step": 690
},
{
"epoch": 0.12743491716730385,
"grad_norm": 9.821294784545898,
"learning_rate": 9.96046302615477e-05,
"loss": 3.1315,
"step": 700
},
{
"epoch": 0.1292554159839796,
"grad_norm": 12.868693351745605,
"learning_rate": 9.956416383624422e-05,
"loss": 3.0713,
"step": 710
},
{
"epoch": 0.1310759148006554,
"grad_norm": 10.408199310302734,
"learning_rate": 9.95217352123278e-05,
"loss": 3.1292,
"step": 720
},
{
"epoch": 0.13289641361733115,
"grad_norm": 8.829959869384766,
"learning_rate": 9.947734606950771e-05,
"loss": 3.111,
"step": 730
},
{
"epoch": 0.1347169124340069,
"grad_norm": 9.13364315032959,
"learning_rate": 9.943099816510836e-05,
"loss": 3.1011,
"step": 740
},
{
"epoch": 0.1365374112506827,
"grad_norm": 8.674768447875977,
"learning_rate": 9.93826933339997e-05,
"loss": 3.0991,
"step": 750
},
{
"epoch": 0.13835791006735845,
"grad_norm": 8.487624168395996,
"learning_rate": 9.933243348852451e-05,
"loss": 3.0915,
"step": 760
},
{
"epoch": 0.14017840888403424,
"grad_norm": 7.808052062988281,
"learning_rate": 9.928022061842282e-05,
"loss": 3.0694,
"step": 770
},
{
"epoch": 0.14199890770071,
"grad_norm": 12.615427017211914,
"learning_rate": 9.922605679075298e-05,
"loss": 3.0524,
"step": 780
},
{
"epoch": 0.14381940651738576,
"grad_norm": 8.977923393249512,
"learning_rate": 9.916994414981002e-05,
"loss": 2.9989,
"step": 790
},
{
"epoch": 0.14563990533406154,
"grad_norm": 8.723967552185059,
"learning_rate": 9.911188491704058e-05,
"loss": 3.0123,
"step": 800
},
{
"epoch": 0.1474604041507373,
"grad_norm": 8.96397590637207,
"learning_rate": 9.905188139095506e-05,
"loss": 3.0031,
"step": 810
},
{
"epoch": 0.14928090296741306,
"grad_norm": 21.173276901245117,
"learning_rate": 9.89899359470366e-05,
"loss": 3.0045,
"step": 820
},
{
"epoch": 0.15110140178408885,
"grad_norm": 9.059152603149414,
"learning_rate": 9.892605103764704e-05,
"loss": 2.972,
"step": 830
},
{
"epoch": 0.1529219006007646,
"grad_norm": 7.885227203369141,
"learning_rate": 9.886022919192985e-05,
"loss": 2.9822,
"step": 840
},
{
"epoch": 0.1547423994174404,
"grad_norm": 7.191554069519043,
"learning_rate": 9.879247301570995e-05,
"loss": 3.0297,
"step": 850
},
{
"epoch": 0.15656289823411615,
"grad_norm": 10.712031364440918,
"learning_rate": 9.872278519139062e-05,
"loss": 3.0149,
"step": 860
},
{
"epoch": 0.1583833970507919,
"grad_norm": 10.305954933166504,
"learning_rate": 9.865116847784726e-05,
"loss": 3.0046,
"step": 870
},
{
"epoch": 0.1602038958674677,
"grad_norm": 11.114262580871582,
"learning_rate": 9.857762571031818e-05,
"loss": 2.9784,
"step": 880
},
{
"epoch": 0.16202439468414345,
"grad_norm": 10.611502647399902,
"learning_rate": 9.850215980029234e-05,
"loss": 2.9992,
"step": 890
},
{
"epoch": 0.1638448935008192,
"grad_norm": 8.901230812072754,
"learning_rate": 9.842477373539412e-05,
"loss": 2.9712,
"step": 900
},
{
"epoch": 0.165665392317495,
"grad_norm": 7.750337600708008,
"learning_rate": 9.834547057926502e-05,
"loss": 2.9586,
"step": 910
},
{
"epoch": 0.16748589113417076,
"grad_norm": 8.038575172424316,
"learning_rate": 9.826425347144237e-05,
"loss": 2.9487,
"step": 920
},
{
"epoch": 0.16930638995084654,
"grad_norm": 8.837507247924805,
"learning_rate": 9.818112562723507e-05,
"loss": 2.9682,
"step": 930
},
{
"epoch": 0.1711268887675223,
"grad_norm": 8.412049293518066,
"learning_rate": 9.809609033759625e-05,
"loss": 2.9253,
"step": 940
},
{
"epoch": 0.17294738758419806,
"grad_norm": 8.611617088317871,
"learning_rate": 9.8009150968993e-05,
"loss": 2.9494,
"step": 950
},
{
"epoch": 0.17476788640087385,
"grad_norm": 8.870803833007812,
"learning_rate": 9.792031096327318e-05,
"loss": 2.9347,
"step": 960
},
{
"epoch": 0.1765883852175496,
"grad_norm": 6.085882186889648,
"learning_rate": 9.782957383752898e-05,
"loss": 2.8954,
"step": 970
},
{
"epoch": 0.1784088840342254,
"grad_norm": 53.74106979370117,
"learning_rate": 9.773694318395786e-05,
"loss": 2.9159,
"step": 980
},
{
"epoch": 0.18022938285090115,
"grad_norm": 8.432936668395996,
"learning_rate": 9.764242266972021e-05,
"loss": 2.9752,
"step": 990
},
{
"epoch": 0.1820498816675769,
"grad_norm": 6.479644298553467,
"learning_rate": 9.75460160367943e-05,
"loss": 2.938,
"step": 1000
},
{
"epoch": 0.1820498816675769,
"eval_loss": 2.901522159576416,
"eval_runtime": 1012.3411,
"eval_samples_per_second": 9.647,
"eval_steps_per_second": 1.206,
"step": 1000
},
{
"epoch": 0.1838703804842527,
"grad_norm": 8.284567832946777,
"learning_rate": 9.744772710182801e-05,
"loss": 2.899,
"step": 1010
},
{
"epoch": 0.18569087930092845,
"grad_norm": 6.623629570007324,
"learning_rate": 9.734755975598777e-05,
"loss": 2.9371,
"step": 1020
},
{
"epoch": 0.1875113781176042,
"grad_norm": 8.541956901550293,
"learning_rate": 9.724551796480459e-05,
"loss": 2.8807,
"step": 1030
},
{
"epoch": 0.18933187693428,
"grad_norm": 8.620600700378418,
"learning_rate": 9.714160576801696e-05,
"loss": 2.8888,
"step": 1040
},
{
"epoch": 0.19115237575095576,
"grad_norm": 8.644622802734375,
"learning_rate": 9.7035827279411e-05,
"loss": 2.8747,
"step": 1050
},
{
"epoch": 0.19297287456763154,
"grad_norm": 9.656100273132324,
"learning_rate": 9.692818668665752e-05,
"loss": 2.9203,
"step": 1060
},
{
"epoch": 0.1947933733843073,
"grad_norm": 10.529635429382324,
"learning_rate": 9.681868825114634e-05,
"loss": 2.9257,
"step": 1070
},
{
"epoch": 0.19661387220098306,
"grad_norm": 8.376754760742188,
"learning_rate": 9.670733630781747e-05,
"loss": 2.8864,
"step": 1080
},
{
"epoch": 0.19843437101765884,
"grad_norm": 8.018802642822266,
"learning_rate": 9.659413526498962e-05,
"loss": 2.8672,
"step": 1090
},
{
"epoch": 0.2002548698343346,
"grad_norm": 7.348598480224609,
"learning_rate": 9.647908960418553e-05,
"loss": 2.8528,
"step": 1100
},
{
"epoch": 0.2020753686510104,
"grad_norm": 7.87021017074585,
"learning_rate": 9.636220387995469e-05,
"loss": 2.8713,
"step": 1110
},
{
"epoch": 0.20389586746768615,
"grad_norm": 8.476405143737793,
"learning_rate": 9.624348271969295e-05,
"loss": 2.8667,
"step": 1120
},
{
"epoch": 0.2057163662843619,
"grad_norm": 8.64283561706543,
"learning_rate": 9.612293082345931e-05,
"loss": 2.8523,
"step": 1130
},
{
"epoch": 0.2075368651010377,
"grad_norm": 10.11330795288086,
"learning_rate": 9.600055296378995e-05,
"loss": 2.8375,
"step": 1140
},
{
"epoch": 0.20935736391771345,
"grad_norm": 8.217743873596191,
"learning_rate": 9.58763539855092e-05,
"loss": 2.8685,
"step": 1150
},
{
"epoch": 0.2111778627343892,
"grad_norm": 7.501378536224365,
"learning_rate": 9.575033880553774e-05,
"loss": 2.8349,
"step": 1160
},
{
"epoch": 0.212998361551065,
"grad_norm": 8.812211036682129,
"learning_rate": 9.562251241269798e-05,
"loss": 2.8384,
"step": 1170
},
{
"epoch": 0.21481886036774075,
"grad_norm": 7.964756011962891,
"learning_rate": 9.549287986751655e-05,
"loss": 2.8653,
"step": 1180
},
{
"epoch": 0.21663935918441654,
"grad_norm": 7.216350555419922,
"learning_rate": 9.536144630202395e-05,
"loss": 2.8276,
"step": 1190
},
{
"epoch": 0.2184598580010923,
"grad_norm": 7.890927314758301,
"learning_rate": 9.522821691955135e-05,
"loss": 2.7802,
"step": 1200
},
{
"epoch": 0.22028035681776806,
"grad_norm": 8.259157180786133,
"learning_rate": 9.509319699452469e-05,
"loss": 2.8407,
"step": 1210
},
{
"epoch": 0.22210085563444384,
"grad_norm": 7.810998916625977,
"learning_rate": 9.495639187225575e-05,
"loss": 2.8374,
"step": 1220
},
{
"epoch": 0.2239213544511196,
"grad_norm": 6.905944347381592,
"learning_rate": 9.481780696873059e-05,
"loss": 2.8342,
"step": 1230
},
{
"epoch": 0.2257418532677954,
"grad_norm": 8.832979202270508,
"learning_rate": 9.467744777039517e-05,
"loss": 2.7816,
"step": 1240
},
{
"epoch": 0.22756235208447115,
"grad_norm": 6.949944972991943,
"learning_rate": 9.453531983393809e-05,
"loss": 2.8104,
"step": 1250
},
{
"epoch": 0.2293828509011469,
"grad_norm": 11.183205604553223,
"learning_rate": 9.439142878607061e-05,
"loss": 2.8605,
"step": 1260
},
{
"epoch": 0.2312033497178227,
"grad_norm": 8.672426223754883,
"learning_rate": 9.424578032330398e-05,
"loss": 2.7866,
"step": 1270
},
{
"epoch": 0.23302384853449845,
"grad_norm": 8.570023536682129,
"learning_rate": 9.409838021172375e-05,
"loss": 2.7814,
"step": 1280
},
{
"epoch": 0.2348443473511742,
"grad_norm": 17.605865478515625,
"learning_rate": 9.394923428676168e-05,
"loss": 2.8896,
"step": 1290
},
{
"epoch": 0.23666484616785,
"grad_norm": 8.613877296447754,
"learning_rate": 9.379834845296463e-05,
"loss": 2.8474,
"step": 1300
},
{
"epoch": 0.23848534498452575,
"grad_norm": 9.39710807800293,
"learning_rate": 9.364572868376075e-05,
"loss": 2.7771,
"step": 1310
},
{
"epoch": 0.24030584380120154,
"grad_norm": 12.333969116210938,
"learning_rate": 9.349138102122316e-05,
"loss": 2.8079,
"step": 1320
},
{
"epoch": 0.2421263426178773,
"grad_norm": 10.491060256958008,
"learning_rate": 9.333531157583055e-05,
"loss": 2.7536,
"step": 1330
},
{
"epoch": 0.24394684143455306,
"grad_norm": 9.862618446350098,
"learning_rate": 9.317752652622547e-05,
"loss": 2.8011,
"step": 1340
},
{
"epoch": 0.24576734025122884,
"grad_norm": 11.95722484588623,
"learning_rate": 9.301803211896955e-05,
"loss": 2.8058,
"step": 1350
},
{
"epoch": 0.2475878390679046,
"grad_norm": 8.709095001220703,
"learning_rate": 9.28568346682963e-05,
"loss": 2.7922,
"step": 1360
},
{
"epoch": 0.2494083378845804,
"grad_norm": 6.32808256149292,
"learning_rate": 9.269394055586116e-05,
"loss": 2.7246,
"step": 1370
},
{
"epoch": 0.2512288367012561,
"grad_norm": 10.615900039672852,
"learning_rate": 9.252935623048875e-05,
"loss": 2.7993,
"step": 1380
},
{
"epoch": 0.2530493355179319,
"grad_norm": 10.374322891235352,
"learning_rate": 9.236308820791768e-05,
"loss": 2.7583,
"step": 1390
},
{
"epoch": 0.2548698343346077,
"grad_norm": 11.486263275146484,
"learning_rate": 9.219514307054251e-05,
"loss": 2.8258,
"step": 1400
},
{
"epoch": 0.2566903331512835,
"grad_norm": 9.840982437133789,
"learning_rate": 9.202552746715322e-05,
"loss": 2.8464,
"step": 1410
},
{
"epoch": 0.2585108319679592,
"grad_norm": 15.894274711608887,
"learning_rate": 9.185424811267199e-05,
"loss": 2.8465,
"step": 1420
},
{
"epoch": 0.260331330784635,
"grad_norm": 8.428662300109863,
"learning_rate": 9.168131178788726e-05,
"loss": 2.8095,
"step": 1430
},
{
"epoch": 0.2621518296013108,
"grad_norm": 17.082258224487305,
"learning_rate": 9.150672533918544e-05,
"loss": 2.7782,
"step": 1440
},
{
"epoch": 0.2639723284179865,
"grad_norm": 7.154361724853516,
"learning_rate": 9.133049567827982e-05,
"loss": 2.7773,
"step": 1450
},
{
"epoch": 0.2657928272346623,
"grad_norm": 6.119648456573486,
"learning_rate": 9.115262978193679e-05,
"loss": 2.7788,
"step": 1460
},
{
"epoch": 0.2676133260513381,
"grad_norm": 8.635058403015137,
"learning_rate": 9.097313469169988e-05,
"loss": 2.7703,
"step": 1470
},
{
"epoch": 0.2694338248680138,
"grad_norm": 12.325600624084473,
"learning_rate": 9.079201751361082e-05,
"loss": 2.7313,
"step": 1480
},
{
"epoch": 0.2712543236846896,
"grad_norm": 8.181892395019531,
"learning_rate": 9.06092854179283e-05,
"loss": 2.7795,
"step": 1490
},
{
"epoch": 0.2730748225013654,
"grad_norm": 14.719033241271973,
"learning_rate": 9.042494563884404e-05,
"loss": 2.8108,
"step": 1500
},
{
"epoch": 0.2748953213180411,
"grad_norm": 7.8658061027526855,
"learning_rate": 9.023900547419646e-05,
"loss": 2.7663,
"step": 1510
},
{
"epoch": 0.2767158201347169,
"grad_norm": 15.445107460021973,
"learning_rate": 9.005147228518174e-05,
"loss": 2.7878,
"step": 1520
},
{
"epoch": 0.2785363189513927,
"grad_norm": 12.650901794433594,
"learning_rate": 8.986235349606238e-05,
"loss": 2.8219,
"step": 1530
},
{
"epoch": 0.2803568177680685,
"grad_norm": 10.226774215698242,
"learning_rate": 8.967165659387331e-05,
"loss": 2.742,
"step": 1540
},
{
"epoch": 0.2821773165847442,
"grad_norm": 138.37210083007812,
"learning_rate": 8.947938912812548e-05,
"loss": 2.9524,
"step": 1550
},
{
"epoch": 0.28399781540142,
"grad_norm": 16.11450958251953,
"learning_rate": 8.928555871050693e-05,
"loss": 2.7966,
"step": 1560
},
{
"epoch": 0.2858183142180958,
"grad_norm": 10.201882362365723,
"learning_rate": 8.909017301458156e-05,
"loss": 2.8389,
"step": 1570
},
{
"epoch": 0.2876388130347715,
"grad_norm": 10.96867847442627,
"learning_rate": 8.889323977548521e-05,
"loss": 2.7495,
"step": 1580
},
{
"epoch": 0.2894593118514473,
"grad_norm": 10.814942359924316,
"learning_rate": 8.869476678961954e-05,
"loss": 2.7676,
"step": 1590
},
{
"epoch": 0.2912798106681231,
"grad_norm": 6.535337448120117,
"learning_rate": 8.849476191434334e-05,
"loss": 2.7589,
"step": 1600
},
{
"epoch": 0.2931003094847988,
"grad_norm": 7.036696434020996,
"learning_rate": 8.829323306766142e-05,
"loss": 2.7921,
"step": 1610
},
{
"epoch": 0.2949208083014746,
"grad_norm": 5.92086124420166,
"learning_rate": 8.809018822791121e-05,
"loss": 2.7267,
"step": 1620
},
{
"epoch": 0.2967413071181504,
"grad_norm": 6.186739921569824,
"learning_rate": 8.788563543344688e-05,
"loss": 2.795,
"step": 1630
},
{
"epoch": 0.2985618059348261,
"grad_norm": 8.154546737670898,
"learning_rate": 8.767958278232112e-05,
"loss": 2.7627,
"step": 1640
},
{
"epoch": 0.3003823047515019,
"grad_norm": 7.674529075622559,
"learning_rate": 8.74720384319645e-05,
"loss": 2.7996,
"step": 1650
},
{
"epoch": 0.3022028035681777,
"grad_norm": 6.348474025726318,
"learning_rate": 8.726301059886259e-05,
"loss": 2.7704,
"step": 1660
},
{
"epoch": 0.3040233023848534,
"grad_norm": 10.496267318725586,
"learning_rate": 8.705250755823064e-05,
"loss": 2.7591,
"step": 1670
},
{
"epoch": 0.3058438012015292,
"grad_norm": 102.05543518066406,
"learning_rate": 8.684053764368598e-05,
"loss": 2.8027,
"step": 1680
},
{
"epoch": 0.307664300018205,
"grad_norm": 8.403404235839844,
"learning_rate": 8.662710924691805e-05,
"loss": 2.8801,
"step": 1690
},
{
"epoch": 0.3094847988348808,
"grad_norm": 7.355569839477539,
"learning_rate": 8.64122308173563e-05,
"loss": 2.8346,
"step": 1700
},
{
"epoch": 0.3113052976515565,
"grad_norm": 12.551121711730957,
"learning_rate": 8.61959108618356e-05,
"loss": 2.8381,
"step": 1710
},
{
"epoch": 0.3131257964682323,
"grad_norm": 116.6989517211914,
"learning_rate": 8.597815794425943e-05,
"loss": 2.814,
"step": 1720
},
{
"epoch": 0.3149462952849081,
"grad_norm": 21.63788604736328,
"learning_rate": 8.575898068526093e-05,
"loss": 2.8389,
"step": 1730
},
{
"epoch": 0.3167667941015838,
"grad_norm": 143.42408752441406,
"learning_rate": 8.553838776186158e-05,
"loss": 2.8534,
"step": 1740
},
{
"epoch": 0.3185872929182596,
"grad_norm": 9.04028034210205,
"learning_rate": 8.531638790712765e-05,
"loss": 2.8186,
"step": 1750
},
{
"epoch": 0.3204077917349354,
"grad_norm": 11.659414291381836,
"learning_rate": 8.509298990982453e-05,
"loss": 2.8078,
"step": 1760
},
{
"epoch": 0.3222282905516111,
"grad_norm": 7.934113502502441,
"learning_rate": 8.486820261406873e-05,
"loss": 2.792,
"step": 1770
},
{
"epoch": 0.3240487893682869,
"grad_norm": 12.919567108154297,
"learning_rate": 8.464203491897779e-05,
"loss": 2.8111,
"step": 1780
},
{
"epoch": 0.3258692881849627,
"grad_norm": 13.67540454864502,
"learning_rate": 8.441449577831801e-05,
"loss": 2.8085,
"step": 1790
},
{
"epoch": 0.3276897870016384,
"grad_norm": 7.7655110359191895,
"learning_rate": 8.418559420014984e-05,
"loss": 2.7689,
"step": 1800
},
{
"epoch": 0.3295102858183142,
"grad_norm": 8.168259620666504,
"learning_rate": 8.395533924647141e-05,
"loss": 2.7534,
"step": 1810
},
{
"epoch": 0.33133078463499,
"grad_norm": 14.387748718261719,
"learning_rate": 8.372374003285968e-05,
"loss": 2.8353,
"step": 1820
},
{
"epoch": 0.3331512834516658,
"grad_norm": 9.209723472595215,
"learning_rate": 8.349080572810965e-05,
"loss": 2.7837,
"step": 1830
},
{
"epoch": 0.3349717822683415,
"grad_norm": 9.160303115844727,
"learning_rate": 8.325654555387123e-05,
"loss": 2.8186,
"step": 1840
},
{
"epoch": 0.3367922810850173,
"grad_norm": 20.171415328979492,
"learning_rate": 8.302096878428438e-05,
"loss": 2.8011,
"step": 1850
},
{
"epoch": 0.3386127799016931,
"grad_norm": 29.545217514038086,
"learning_rate": 8.278408474561169e-05,
"loss": 2.7971,
"step": 1860
},
{
"epoch": 0.3404332787183688,
"grad_norm": 19.314136505126953,
"learning_rate": 8.254590281586942e-05,
"loss": 2.7983,
"step": 1870
},
{
"epoch": 0.3422537775350446,
"grad_norm": 8.010175704956055,
"learning_rate": 8.230643242445605e-05,
"loss": 2.7921,
"step": 1880
},
{
"epoch": 0.3440742763517204,
"grad_norm": 24.649381637573242,
"learning_rate": 8.206568305177907e-05,
"loss": 2.7962,
"step": 1890
},
{
"epoch": 0.3458947751683961,
"grad_norm": 8.272650718688965,
"learning_rate": 8.182366422887964e-05,
"loss": 2.7439,
"step": 1900
},
{
"epoch": 0.3477152739850719,
"grad_norm": 7.553550720214844,
"learning_rate": 8.158038553705524e-05,
"loss": 2.7845,
"step": 1910
},
{
"epoch": 0.3495357728017477,
"grad_norm": 8.573986053466797,
"learning_rate": 8.13358566074804e-05,
"loss": 2.7003,
"step": 1920
},
{
"epoch": 0.3513562716184234,
"grad_norm": 10.316489219665527,
"learning_rate": 8.109008712082538e-05,
"loss": 2.7627,
"step": 1930
},
{
"epoch": 0.3531767704350992,
"grad_norm": 8.462483406066895,
"learning_rate": 8.084308680687287e-05,
"loss": 2.7281,
"step": 1940
},
{
"epoch": 0.354997269251775,
"grad_norm": 20.140274047851562,
"learning_rate": 8.059486544413298e-05,
"loss": 2.6906,
"step": 1950
},
{
"epoch": 0.3568177680684508,
"grad_norm": 7.473912239074707,
"learning_rate": 8.034543285945584e-05,
"loss": 2.8117,
"step": 1960
},
{
"epoch": 0.3586382668851265,
"grad_norm": 38.26898193359375,
"learning_rate": 8.009479892764284e-05,
"loss": 2.7456,
"step": 1970
},
{
"epoch": 0.3604587657018023,
"grad_norm": 48.63120651245117,
"learning_rate": 7.984297357105552e-05,
"loss": 2.7224,
"step": 1980
},
{
"epoch": 0.3622792645184781,
"grad_norm": 10.31283187866211,
"learning_rate": 7.95899667592228e-05,
"loss": 2.7108,
"step": 1990
},
{
"epoch": 0.3640997633351538,
"grad_norm": 6.465616703033447,
"learning_rate": 7.933578850844636e-05,
"loss": 2.6901,
"step": 2000
},
{
"epoch": 0.3640997633351538,
"eval_loss": 2.735260248184204,
"eval_runtime": 1011.986,
"eval_samples_per_second": 9.65,
"eval_steps_per_second": 1.207,
"step": 2000
},
{
"epoch": 0.3659202621518296,
"grad_norm": 7.348064422607422,
"learning_rate": 7.908044888140394e-05,
"loss": 2.7194,
"step": 2010
},
{
"epoch": 0.3677407609685054,
"grad_norm": 10.493142127990723,
"learning_rate": 7.882395798675115e-05,
"loss": 2.7374,
"step": 2020
},
{
"epoch": 0.3695612597851811,
"grad_norm": 9.447548866271973,
"learning_rate": 7.856632597872122e-05,
"loss": 2.7186,
"step": 2030
},
{
"epoch": 0.3713817586018569,
"grad_norm": 6.926711559295654,
"learning_rate": 7.83075630567229e-05,
"loss": 2.7369,
"step": 2040
},
{
"epoch": 0.3732022574185327,
"grad_norm": 19.662818908691406,
"learning_rate": 7.804767946493685e-05,
"loss": 2.71,
"step": 2050
},
{
"epoch": 0.3750227562352084,
"grad_norm": 37.960330963134766,
"learning_rate": 7.778668549190994e-05,
"loss": 2.7497,
"step": 2060
},
{
"epoch": 0.3768432550518842,
"grad_norm": 10.371471405029297,
"learning_rate": 7.7524591470148e-05,
"loss": 2.7329,
"step": 2070
},
{
"epoch": 0.37866375386856,
"grad_norm": 7.78175163269043,
"learning_rate": 7.726140777570675e-05,
"loss": 2.6866,
"step": 2080
},
{
"epoch": 0.3804842526852358,
"grad_norm": 63.65814971923828,
"learning_rate": 7.699714482778104e-05,
"loss": 2.6993,
"step": 2090
},
{
"epoch": 0.3823047515019115,
"grad_norm": 20.166156768798828,
"learning_rate": 7.673181308829233e-05,
"loss": 2.7455,
"step": 2100
},
{
"epoch": 0.3841252503185873,
"grad_norm": 10.943995475769043,
"learning_rate": 7.646542306147455e-05,
"loss": 2.7369,
"step": 2110
},
{
"epoch": 0.3859457491352631,
"grad_norm": 8.025435447692871,
"learning_rate": 7.619798529345825e-05,
"loss": 2.6558,
"step": 2120
},
{
"epoch": 0.3877662479519388,
"grad_norm": 7.745648384094238,
"learning_rate": 7.592951037185301e-05,
"loss": 2.7071,
"step": 2130
},
{
"epoch": 0.3895867467686146,
"grad_norm": 11.425312995910645,
"learning_rate": 7.566000892532838e-05,
"loss": 2.7322,
"step": 2140
},
{
"epoch": 0.3914072455852904,
"grad_norm": 21.18678855895996,
"learning_rate": 7.538949162319306e-05,
"loss": 2.6649,
"step": 2150
},
{
"epoch": 0.3932277444019661,
"grad_norm": 7.39724588394165,
"learning_rate": 7.511796917497255e-05,
"loss": 2.6632,
"step": 2160
},
{
"epoch": 0.3950482432186419,
"grad_norm": 26.170703887939453,
"learning_rate": 7.484545232998508e-05,
"loss": 2.7239,
"step": 2170
},
{
"epoch": 0.3968687420353177,
"grad_norm": 9.031233787536621,
"learning_rate": 7.457195187691614e-05,
"loss": 2.6995,
"step": 2180
},
{
"epoch": 0.3986892408519934,
"grad_norm": 9.091829299926758,
"learning_rate": 7.429747864339136e-05,
"loss": 2.6826,
"step": 2190
},
{
"epoch": 0.4005097396686692,
"grad_norm": 8.380509376525879,
"learning_rate": 7.40220434955478e-05,
"loss": 2.7026,
"step": 2200
},
{
"epoch": 0.402330238485345,
"grad_norm": 5.83329439163208,
"learning_rate": 7.374565733760379e-05,
"loss": 2.6523,
"step": 2210
},
{
"epoch": 0.4041507373020208,
"grad_norm": 6.2706475257873535,
"learning_rate": 7.346833111142735e-05,
"loss": 2.6854,
"step": 2220
},
{
"epoch": 0.4059712361186965,
"grad_norm": 6.717202663421631,
"learning_rate": 7.319007579610277e-05,
"loss": 2.6949,
"step": 2230
},
{
"epoch": 0.4077917349353723,
"grad_norm": 9.39269733428955,
"learning_rate": 7.291090240749621e-05,
"loss": 2.6903,
"step": 2240
},
{
"epoch": 0.4096122337520481,
"grad_norm": 5.5886383056640625,
"learning_rate": 7.263082199781949e-05,
"loss": 2.6633,
"step": 2250
},
{
"epoch": 0.4114327325687238,
"grad_norm": 7.369466304779053,
"learning_rate": 7.234984565519247e-05,
"loss": 2.6493,
"step": 2260
},
{
"epoch": 0.4132532313853996,
"grad_norm": 11.548351287841797,
"learning_rate": 7.206798450320422e-05,
"loss": 2.6406,
"step": 2270
},
{
"epoch": 0.4150737302020754,
"grad_norm": 9.547252655029297,
"learning_rate": 7.178524970047253e-05,
"loss": 2.7079,
"step": 2280
},
{
"epoch": 0.4168942290187511,
"grad_norm": 7.008022785186768,
"learning_rate": 7.150165244020224e-05,
"loss": 2.6277,
"step": 2290
},
{
"epoch": 0.4187147278354269,
"grad_norm": 7.342654705047607,
"learning_rate": 7.121720394974206e-05,
"loss": 2.6857,
"step": 2300
},
{
"epoch": 0.4205352266521027,
"grad_norm": 6.655099391937256,
"learning_rate": 7.093191549014007e-05,
"loss": 2.654,
"step": 2310
},
{
"epoch": 0.4223557254687784,
"grad_norm": 6.727902412414551,
"learning_rate": 7.0645798355698e-05,
"loss": 2.6778,
"step": 2320
},
{
"epoch": 0.4241762242854542,
"grad_norm": 6.407891273498535,
"learning_rate": 7.035886387352399e-05,
"loss": 2.6798,
"step": 2330
},
{
"epoch": 0.42599672310213,
"grad_norm": 6.782746315002441,
"learning_rate": 7.007112340308423e-05,
"loss": 2.6669,
"step": 2340
},
{
"epoch": 0.4278172219188058,
"grad_norm": 6.325183868408203,
"learning_rate": 6.97825883357532e-05,
"loss": 2.6556,
"step": 2350
},
{
"epoch": 0.4296377207354815,
"grad_norm": 5.9982008934021,
"learning_rate": 6.949327009436278e-05,
"loss": 2.6764,
"step": 2360
},
{
"epoch": 0.4314582195521573,
"grad_norm": 6.148717403411865,
"learning_rate": 6.920318013274988e-05,
"loss": 2.6792,
"step": 2370
},
{
"epoch": 0.4332787183688331,
"grad_norm": 6.28306245803833,
"learning_rate": 6.891232993530319e-05,
"loss": 2.6945,
"step": 2380
},
{
"epoch": 0.4350992171855088,
"grad_norm": 6.82548713684082,
"learning_rate": 6.862073101650837e-05,
"loss": 2.6592,
"step": 2390
},
{
"epoch": 0.4369197160021846,
"grad_norm": 6.251521587371826,
"learning_rate": 6.832839492049225e-05,
"loss": 2.6386,
"step": 2400
},
{
"epoch": 0.4387402148188604,
"grad_norm": 6.3002800941467285,
"learning_rate": 6.80353332205658e-05,
"loss": 2.6491,
"step": 2410
},
{
"epoch": 0.4405607136355361,
"grad_norm": 6.661396026611328,
"learning_rate": 6.774155751876603e-05,
"loss": 2.6538,
"step": 2420
},
{
"epoch": 0.4423812124522119,
"grad_norm": 5.950438022613525,
"learning_rate": 6.744707944539654e-05,
"loss": 2.603,
"step": 2430
},
{
"epoch": 0.4442017112688877,
"grad_norm": 9.665274620056152,
"learning_rate": 6.715191065856721e-05,
"loss": 2.6364,
"step": 2440
},
{
"epoch": 0.4460222100855634,
"grad_norm": 7.168936252593994,
"learning_rate": 6.685606284373258e-05,
"loss": 2.6911,
"step": 2450
},
{
"epoch": 0.4478427089022392,
"grad_norm": 6.2818145751953125,
"learning_rate": 6.655954771322929e-05,
"loss": 2.6304,
"step": 2460
},
{
"epoch": 0.449663207718915,
"grad_norm": 6.3358964920043945,
"learning_rate": 6.626237700581238e-05,
"loss": 2.6124,
"step": 2470
},
{
"epoch": 0.4514837065355908,
"grad_norm": 7.2275004386901855,
"learning_rate": 6.596456248619054e-05,
"loss": 2.644,
"step": 2480
},
{
"epoch": 0.4533042053522665,
"grad_norm": 5.857280254364014,
"learning_rate": 6.566611594456042e-05,
"loss": 2.6175,
"step": 2490
},
{
"epoch": 0.4551247041689423,
"grad_norm": 6.958190441131592,
"learning_rate": 6.536704919613982e-05,
"loss": 2.6372,
"step": 2500
},
{
"epoch": 0.4569452029856181,
"grad_norm": 6.084266662597656,
"learning_rate": 6.506737408069988e-05,
"loss": 2.5989,
"step": 2510
},
{
"epoch": 0.4587657018022938,
"grad_norm": 6.761366367340088,
"learning_rate": 6.476710246209649e-05,
"loss": 2.6231,
"step": 2520
},
{
"epoch": 0.4605862006189696,
"grad_norm": 6.110794544219971,
"learning_rate": 6.446624622780052e-05,
"loss": 2.6294,
"step": 2530
},
{
"epoch": 0.4624066994356454,
"grad_norm": 7.39132022857666,
"learning_rate": 6.416481728842722e-05,
"loss": 2.589,
"step": 2540
},
{
"epoch": 0.4642271982523211,
"grad_norm": 5.385328769683838,
"learning_rate": 6.386282757726467e-05,
"loss": 2.5938,
"step": 2550
},
{
"epoch": 0.4660476970689969,
"grad_norm": 6.137452602386475,
"learning_rate": 6.356028904980145e-05,
"loss": 2.611,
"step": 2560
},
{
"epoch": 0.4678681958856727,
"grad_norm": 6.881803512573242,
"learning_rate": 6.325721368325317e-05,
"loss": 2.5577,
"step": 2570
},
{
"epoch": 0.4696886947023484,
"grad_norm": 7.0561323165893555,
"learning_rate": 6.295361347608846e-05,
"loss": 2.5971,
"step": 2580
},
{
"epoch": 0.4715091935190242,
"grad_norm": 8.178688049316406,
"learning_rate": 6.264950044755387e-05,
"loss": 2.6111,
"step": 2590
},
{
"epoch": 0.4733296923357,
"grad_norm": 7.674656867980957,
"learning_rate": 6.234488663719807e-05,
"loss": 2.6211,
"step": 2600
},
{
"epoch": 0.4751501911523758,
"grad_norm": 6.952455043792725,
"learning_rate": 6.203978410439519e-05,
"loss": 2.5976,
"step": 2610
},
{
"epoch": 0.4769706899690515,
"grad_norm": 7.033661842346191,
"learning_rate": 6.173420492786747e-05,
"loss": 2.5667,
"step": 2620
},
{
"epoch": 0.4787911887857273,
"grad_norm": 7.665953636169434,
"learning_rate": 6.142816120520699e-05,
"loss": 2.5964,
"step": 2630
},
{
"epoch": 0.4806116876024031,
"grad_norm": 6.789517402648926,
"learning_rate": 6.112166505239679e-05,
"loss": 2.5046,
"step": 2640
},
{
"epoch": 0.4824321864190788,
"grad_norm": 6.597125053405762,
"learning_rate": 6.0814728603331176e-05,
"loss": 2.6044,
"step": 2650
},
{
"epoch": 0.4842526852357546,
"grad_norm": 5.781830310821533,
"learning_rate": 6.050736400933538e-05,
"loss": 2.6041,
"step": 2660
},
{
"epoch": 0.4860731840524304,
"grad_norm": 6.907136917114258,
"learning_rate": 6.0199583438684495e-05,
"loss": 2.587,
"step": 2670
},
{
"epoch": 0.4878936828691061,
"grad_norm": 5.483163833618164,
"learning_rate": 5.989139907612174e-05,
"loss": 2.6136,
"step": 2680
},
{
"epoch": 0.4897141816857819,
"grad_norm": 5.925361156463623,
"learning_rate": 5.958282312237605e-05,
"loss": 2.5801,
"step": 2690
},
{
"epoch": 0.4915346805024577,
"grad_norm": 6.07294225692749,
"learning_rate": 5.927386779367912e-05,
"loss": 2.6287,
"step": 2700
},
{
"epoch": 0.4933551793191334,
"grad_norm": 5.8490424156188965,
"learning_rate": 5.896454532128171e-05,
"loss": 2.5894,
"step": 2710
},
{
"epoch": 0.4951756781358092,
"grad_norm": 6.67198371887207,
"learning_rate": 5.865486795096948e-05,
"loss": 2.5684,
"step": 2720
},
{
"epoch": 0.496996176952485,
"grad_norm": 6.779095649719238,
"learning_rate": 5.8344847942578175e-05,
"loss": 2.5916,
"step": 2730
},
{
"epoch": 0.4988166757691608,
"grad_norm": 6.148252487182617,
"learning_rate": 5.8034497569508206e-05,
"loss": 2.579,
"step": 2740
},
{
"epoch": 0.5006371745858366,
"grad_norm": 7.566195011138916,
"learning_rate": 5.772382911823886e-05,
"loss": 2.5639,
"step": 2750
},
{
"epoch": 0.5024576734025122,
"grad_norm": 6.119766712188721,
"learning_rate": 5.741285488784183e-05,
"loss": 2.5728,
"step": 2760
},
{
"epoch": 0.504278172219188,
"grad_norm": 6.376175880432129,
"learning_rate": 5.710158718949431e-05,
"loss": 2.5889,
"step": 2770
},
{
"epoch": 0.5060986710358638,
"grad_norm": 7.798823833465576,
"learning_rate": 5.67900383459916e-05,
"loss": 2.5611,
"step": 2780
},
{
"epoch": 0.5079191698525396,
"grad_norm": 5.470147132873535,
"learning_rate": 5.64782206912593e-05,
"loss": 2.6003,
"step": 2790
},
{
"epoch": 0.5097396686692154,
"grad_norm": 6.034721374511719,
"learning_rate": 5.6166146569864986e-05,
"loss": 2.6126,
"step": 2800
},
{
"epoch": 0.5115601674858912,
"grad_norm": 5.667842388153076,
"learning_rate": 5.585382833652951e-05,
"loss": 2.5349,
"step": 2810
},
{
"epoch": 0.513380666302567,
"grad_norm": 5.957113742828369,
"learning_rate": 5.554127835563784e-05,
"loss": 2.5567,
"step": 2820
},
{
"epoch": 0.5152011651192426,
"grad_norm": 8.104249000549316,
"learning_rate": 5.5228509000749705e-05,
"loss": 2.5654,
"step": 2830
},
{
"epoch": 0.5170216639359184,
"grad_norm": 5.897515296936035,
"learning_rate": 5.491553265410956e-05,
"loss": 2.599,
"step": 2840
},
{
"epoch": 0.5188421627525942,
"grad_norm": 7.213466167449951,
"learning_rate": 5.46023617061565e-05,
"loss": 2.5782,
"step": 2850
},
{
"epoch": 0.52066266156927,
"grad_norm": 6.2212910652160645,
"learning_rate": 5.4289008555033704e-05,
"loss": 2.5869,
"step": 2860
},
{
"epoch": 0.5224831603859458,
"grad_norm": 7.721590042114258,
"learning_rate": 5.397548560609762e-05,
"loss": 2.5546,
"step": 2870
},
{
"epoch": 0.5243036592026216,
"grad_norm": 6.373271465301514,
"learning_rate": 5.366180527142678e-05,
"loss": 2.555,
"step": 2880
},
{
"epoch": 0.5261241580192972,
"grad_norm": 7.320889949798584,
"learning_rate": 5.334797996933052e-05,
"loss": 2.5487,
"step": 2890
},
{
"epoch": 0.527944656835973,
"grad_norm": 5.541944980621338,
"learning_rate": 5.3034022123857285e-05,
"loss": 2.6031,
"step": 2900
},
{
"epoch": 0.5297651556526488,
"grad_norm": 6.056175708770752,
"learning_rate": 5.271994416430278e-05,
"loss": 2.5569,
"step": 2910
},
{
"epoch": 0.5315856544693246,
"grad_norm": 6.227322578430176,
"learning_rate": 5.240575852471791e-05,
"loss": 2.5726,
"step": 2920
},
{
"epoch": 0.5334061532860004,
"grad_norm": 5.931169509887695,
"learning_rate": 5.2091477643416565e-05,
"loss": 2.563,
"step": 2930
},
{
"epoch": 0.5352266521026762,
"grad_norm": 5.819269180297852,
"learning_rate": 5.17771139624831e-05,
"loss": 2.5544,
"step": 2940
},
{
"epoch": 0.537047150919352,
"grad_norm": 5.978631973266602,
"learning_rate": 5.14626799272799e-05,
"loss": 2.5477,
"step": 2950
},
{
"epoch": 0.5388676497360276,
"grad_norm": 7.41937255859375,
"learning_rate": 5.114818798595457e-05,
"loss": 2.537,
"step": 2960
},
{
"epoch": 0.5406881485527034,
"grad_norm": 6.325901985168457,
"learning_rate": 5.083365058894717e-05,
"loss": 2.5219,
"step": 2970
},
{
"epoch": 0.5425086473693792,
"grad_norm": 5.593471050262451,
"learning_rate": 5.051908018849729e-05,
"loss": 2.5031,
"step": 2980
},
{
"epoch": 0.544329146186055,
"grad_norm": 6.014733791351318,
"learning_rate": 5.020448923815115e-05,
"loss": 2.5559,
"step": 2990
},
{
"epoch": 0.5461496450027308,
"grad_norm": 5.821467399597168,
"learning_rate": 4.988989019226846e-05,
"loss": 2.5543,
"step": 3000
},
{
"epoch": 0.5461496450027308,
"eval_loss": 2.542562246322632,
"eval_runtime": 1012.6169,
"eval_samples_per_second": 9.644,
"eval_steps_per_second": 1.206,
"step": 3000
},
{
"epoch": 0.5479701438194066,
"grad_norm": 5.459403991699219,
"learning_rate": 4.9575295505529476e-05,
"loss": 2.5296,
"step": 3010
},
{
"epoch": 0.5497906426360822,
"grad_norm": 5.517323970794678,
"learning_rate": 4.926071763244182e-05,
"loss": 2.5244,
"step": 3020
},
{
"epoch": 0.551611141452758,
"grad_norm": 6.594572067260742,
"learning_rate": 4.894616902684755e-05,
"loss": 2.5305,
"step": 3030
},
{
"epoch": 0.5534316402694338,
"grad_norm": 5.410883903503418,
"learning_rate": 4.8631662141429965e-05,
"loss": 2.52,
"step": 3040
},
{
"epoch": 0.5552521390861096,
"grad_norm": 6.53268575668335,
"learning_rate": 4.8317209427220776e-05,
"loss": 2.552,
"step": 3050
},
{
"epoch": 0.5570726379027854,
"grad_norm": 5.973785877227783,
"learning_rate": 4.8002823333107094e-05,
"loss": 2.5089,
"step": 3060
},
{
"epoch": 0.5588931367194612,
"grad_norm": 5.9094038009643555,
"learning_rate": 4.768851630533858e-05,
"loss": 2.5641,
"step": 3070
},
{
"epoch": 0.560713635536137,
"grad_norm": 6.263980865478516,
"learning_rate": 4.737430078703473e-05,
"loss": 2.5573,
"step": 3080
},
{
"epoch": 0.5625341343528126,
"grad_norm": 5.653293609619141,
"learning_rate": 4.706018921769229e-05,
"loss": 2.5162,
"step": 3090
},
{
"epoch": 0.5643546331694884,
"grad_norm": 5.543784141540527,
"learning_rate": 4.674619403269275e-05,
"loss": 2.5121,
"step": 3100
},
{
"epoch": 0.5661751319861642,
"grad_norm": 5.531651020050049,
"learning_rate": 4.643232766281003e-05,
"loss": 2.5214,
"step": 3110
},
{
"epoch": 0.56799563080284,
"grad_norm": 6.792632579803467,
"learning_rate": 4.6118602533718457e-05,
"loss": 2.5064,
"step": 3120
},
{
"epoch": 0.5698161296195158,
"grad_norm": 5.586999416351318,
"learning_rate": 4.580503106550069e-05,
"loss": 2.5475,
"step": 3130
},
{
"epoch": 0.5716366284361916,
"grad_norm": 5.922962188720703,
"learning_rate": 4.549162567215612e-05,
"loss": 2.4942,
"step": 3140
},
{
"epoch": 0.5734571272528672,
"grad_norm": 6.408661842346191,
"learning_rate": 4.517839876110942e-05,
"loss": 2.5143,
"step": 3150
},
{
"epoch": 0.575277626069543,
"grad_norm": 6.490014553070068,
"learning_rate": 4.4865362732719266e-05,
"loss": 2.524,
"step": 3160
},
{
"epoch": 0.5770981248862188,
"grad_norm": 7.0241217613220215,
"learning_rate": 4.45525299797875e-05,
"loss": 2.5019,
"step": 3170
},
{
"epoch": 0.5789186237028946,
"grad_norm": 5.701231002807617,
"learning_rate": 4.423991288706851e-05,
"loss": 2.5243,
"step": 3180
},
{
"epoch": 0.5807391225195704,
"grad_norm": 5.697451591491699,
"learning_rate": 4.392752383077883e-05,
"loss": 2.5121,
"step": 3190
},
{
"epoch": 0.5825596213362462,
"grad_norm": 6.233221530914307,
"learning_rate": 4.3615375178107306e-05,
"loss": 2.5233,
"step": 3200
},
{
"epoch": 0.584380120152922,
"grad_norm": 5.627314567565918,
"learning_rate": 4.330347928672538e-05,
"loss": 2.4944,
"step": 3210
},
{
"epoch": 0.5862006189695976,
"grad_norm": 5.6860737800598145,
"learning_rate": 4.299184850429795e-05,
"loss": 2.5099,
"step": 3220
},
{
"epoch": 0.5880211177862734,
"grad_norm": 6.200436592102051,
"learning_rate": 4.26804951679945e-05,
"loss": 2.5251,
"step": 3230
},
{
"epoch": 0.5898416166029492,
"grad_norm": 6.162016868591309,
"learning_rate": 4.2369431604000654e-05,
"loss": 2.5156,
"step": 3240
},
{
"epoch": 0.591662115419625,
"grad_norm": 5.810561656951904,
"learning_rate": 4.205867012703025e-05,
"loss": 2.5343,
"step": 3250
},
{
"epoch": 0.5934826142363008,
"grad_norm": 5.519481182098389,
"learning_rate": 4.174822303983779e-05,
"loss": 2.4742,
"step": 3260
},
{
"epoch": 0.5953031130529766,
"grad_norm": 5.590371131896973,
"learning_rate": 4.1438102632731346e-05,
"loss": 2.5091,
"step": 3270
},
{
"epoch": 0.5971236118696522,
"grad_norm": 5.302878379821777,
"learning_rate": 4.1128321183086065e-05,
"loss": 2.5024,
"step": 3280
},
{
"epoch": 0.598944110686328,
"grad_norm": 5.2954864501953125,
"learning_rate": 4.081889095485806e-05,
"loss": 2.5304,
"step": 3290
},
{
"epoch": 0.6007646095030038,
"grad_norm": 6.41418981552124,
"learning_rate": 4.050982419809895e-05,
"loss": 2.482,
"step": 3300
},
{
"epoch": 0.6025851083196796,
"grad_norm": 6.165164947509766,
"learning_rate": 4.020113314847082e-05,
"loss": 2.5042,
"step": 3310
},
{
"epoch": 0.6044056071363554,
"grad_norm": 5.556238651275635,
"learning_rate": 3.989283002676193e-05,
"loss": 2.4748,
"step": 3320
},
{
"epoch": 0.6062261059530312,
"grad_norm": 5.898430824279785,
"learning_rate": 3.958492703840276e-05,
"loss": 2.5103,
"step": 3330
},
{
"epoch": 0.6080466047697068,
"grad_norm": 6.131360054016113,
"learning_rate": 3.9277436372982945e-05,
"loss": 2.5023,
"step": 3340
},
{
"epoch": 0.6098671035863826,
"grad_norm": 6.646694660186768,
"learning_rate": 3.8970370203768595e-05,
"loss": 2.5278,
"step": 3350
},
{
"epoch": 0.6116876024030584,
"grad_norm": 5.293123245239258,
"learning_rate": 3.8663740687220466e-05,
"loss": 2.5339,
"step": 3360
},
{
"epoch": 0.6135081012197342,
"grad_norm": 5.64793586730957,
"learning_rate": 3.835755996251261e-05,
"loss": 2.4955,
"step": 3370
},
{
"epoch": 0.61532860003641,
"grad_norm": 5.227383613586426,
"learning_rate": 3.805184015105182e-05,
"loss": 2.4942,
"step": 3380
},
{
"epoch": 0.6171490988530858,
"grad_norm": 6.251855850219727,
"learning_rate": 3.774659335599783e-05,
"loss": 2.4941,
"step": 3390
},
{
"epoch": 0.6189695976697616,
"grad_norm": 6.169816970825195,
"learning_rate": 3.744183166178405e-05,
"loss": 2.5091,
"step": 3400
},
{
"epoch": 0.6207900964864372,
"grad_norm": 5.742128372192383,
"learning_rate": 3.71375671336392e-05,
"loss": 2.4775,
"step": 3410
},
{
"epoch": 0.622610595303113,
"grad_norm": 5.501997470855713,
"learning_rate": 3.683381181710969e-05,
"loss": 2.5165,
"step": 3420
},
{
"epoch": 0.6244310941197888,
"grad_norm": 5.634415149688721,
"learning_rate": 3.653057773758268e-05,
"loss": 2.5137,
"step": 3430
},
{
"epoch": 0.6262515929364646,
"grad_norm": 6.159447193145752,
"learning_rate": 3.622787689981009e-05,
"loss": 2.5118,
"step": 3440
},
{
"epoch": 0.6280720917531404,
"grad_norm": 5.9222307205200195,
"learning_rate": 3.5925721287433304e-05,
"loss": 2.5158,
"step": 3450
},
{
"epoch": 0.6298925905698162,
"grad_norm": 5.72164249420166,
"learning_rate": 3.5624122862508724e-05,
"loss": 2.4573,
"step": 3460
},
{
"epoch": 0.6317130893864918,
"grad_norm": 5.604609489440918,
"learning_rate": 3.5323093565034213e-05,
"loss": 2.4427,
"step": 3470
},
{
"epoch": 0.6335335882031676,
"grad_norm": 6.2562336921691895,
"learning_rate": 3.502264531247644e-05,
"loss": 2.5276,
"step": 3480
},
{
"epoch": 0.6353540870198434,
"grad_norm": 5.994789123535156,
"learning_rate": 3.4722789999299034e-05,
"loss": 2.5015,
"step": 3490
},
{
"epoch": 0.6371745858365192,
"grad_norm": 7.051544189453125,
"learning_rate": 3.442353949649173e-05,
"loss": 2.4955,
"step": 3500
},
{
"epoch": 0.638995084653195,
"grad_norm": 5.279881000518799,
"learning_rate": 3.412490565110034e-05,
"loss": 2.5066,
"step": 3510
},
{
"epoch": 0.6408155834698708,
"grad_norm": 5.663167476654053,
"learning_rate": 3.382690028575789e-05,
"loss": 2.4843,
"step": 3520
},
{
"epoch": 0.6426360822865466,
"grad_norm": 6.197967052459717,
"learning_rate": 3.352953519821637e-05,
"loss": 2.4439,
"step": 3530
},
{
"epoch": 0.6444565811032222,
"grad_norm": 6.703118324279785,
"learning_rate": 3.3232822160879825e-05,
"loss": 2.4902,
"step": 3540
},
{
"epoch": 0.646277079919898,
"grad_norm": 6.075878143310547,
"learning_rate": 3.2936772920338244e-05,
"loss": 2.4375,
"step": 3550
},
{
"epoch": 0.6480975787365738,
"grad_norm": 5.913654327392578,
"learning_rate": 3.2641399196902505e-05,
"loss": 2.4575,
"step": 3560
},
{
"epoch": 0.6499180775532496,
"grad_norm": 5.623748779296875,
"learning_rate": 3.234671268414041e-05,
"loss": 2.5078,
"step": 3570
},
{
"epoch": 0.6517385763699254,
"grad_norm": 4.9896745681762695,
"learning_rate": 3.2052725048413734e-05,
"loss": 2.461,
"step": 3580
},
{
"epoch": 0.6535590751866012,
"grad_norm": 5.503573417663574,
"learning_rate": 3.175944792841639e-05,
"loss": 2.4964,
"step": 3590
},
{
"epoch": 0.6553795740032768,
"grad_norm": 5.610907077789307,
"learning_rate": 3.146689293471362e-05,
"loss": 2.496,
"step": 3600
},
{
"epoch": 0.6572000728199526,
"grad_norm": 5.222755432128906,
"learning_rate": 3.117507164928235e-05,
"loss": 2.4587,
"step": 3610
},
{
"epoch": 0.6590205716366284,
"grad_norm": 5.29688024520874,
"learning_rate": 3.0883995625052735e-05,
"loss": 2.4976,
"step": 3620
},
{
"epoch": 0.6608410704533042,
"grad_norm": 5.991846084594727,
"learning_rate": 3.059367638545069e-05,
"loss": 2.4668,
"step": 3630
},
{
"epoch": 0.66266156926998,
"grad_norm": 5.622383117675781,
"learning_rate": 3.030412542394176e-05,
"loss": 2.4624,
"step": 3640
},
{
"epoch": 0.6644820680866558,
"grad_norm": 5.628267288208008,
"learning_rate": 3.001535420357607e-05,
"loss": 2.4497,
"step": 3650
},
{
"epoch": 0.6663025669033316,
"grad_norm": 5.707132339477539,
"learning_rate": 2.972737415653456e-05,
"loss": 2.4921,
"step": 3660
},
{
"epoch": 0.6681230657200072,
"grad_norm": 6.638173580169678,
"learning_rate": 2.9440196683676337e-05,
"loss": 2.4978,
"step": 3670
},
{
"epoch": 0.669943564536683,
"grad_norm": 6.14837121963501,
"learning_rate": 2.915383315408736e-05,
"loss": 2.4597,
"step": 3680
},
{
"epoch": 0.6717640633533588,
"grad_norm": 5.452149391174316,
"learning_rate": 2.8868294904630333e-05,
"loss": 2.4789,
"step": 3690
},
{
"epoch": 0.6735845621700346,
"grad_norm": 5.278966903686523,
"learning_rate": 2.8583593239495875e-05,
"loss": 2.4435,
"step": 3700
},
{
"epoch": 0.6754050609867104,
"grad_norm": 5.96103048324585,
"learning_rate": 2.8299739429755057e-05,
"loss": 2.4103,
"step": 3710
},
{
"epoch": 0.6772255598033862,
"grad_norm": 5.705805778503418,
"learning_rate": 2.8016744712913164e-05,
"loss": 2.494,
"step": 3720
},
{
"epoch": 0.6790460586200618,
"grad_norm": 6.212954044342041,
"learning_rate": 2.773462029246475e-05,
"loss": 2.4419,
"step": 3730
},
{
"epoch": 0.6808665574367376,
"grad_norm": 5.658579349517822,
"learning_rate": 2.7453377337450182e-05,
"loss": 2.4648,
"step": 3740
},
{
"epoch": 0.6826870562534134,
"grad_norm": 6.050464630126953,
"learning_rate": 2.7173026982013417e-05,
"loss": 2.4964,
"step": 3750
},
{
"epoch": 0.6845075550700892,
"grad_norm": 5.388469219207764,
"learning_rate": 2.689358032496129e-05,
"loss": 2.4463,
"step": 3760
},
{
"epoch": 0.686328053886765,
"grad_norm": 5.987942695617676,
"learning_rate": 2.661504842932402e-05,
"loss": 2.4777,
"step": 3770
},
{
"epoch": 0.6881485527034408,
"grad_norm": 20.632400512695312,
"learning_rate": 2.633744232191726e-05,
"loss": 2.4645,
"step": 3780
},
{
"epoch": 0.6899690515201166,
"grad_norm": 5.323004245758057,
"learning_rate": 2.6060772992905647e-05,
"loss": 2.4832,
"step": 3790
},
{
"epoch": 0.6917895503367922,
"grad_norm": 5.565816402435303,
"learning_rate": 2.578505139536762e-05,
"loss": 2.4677,
"step": 3800
},
{
"epoch": 0.693610049153468,
"grad_norm": 6.668233394622803,
"learning_rate": 2.5510288444861784e-05,
"loss": 2.4372,
"step": 3810
},
{
"epoch": 0.6954305479701438,
"grad_norm": 6.057459354400635,
"learning_rate": 2.5236495018994844e-05,
"loss": 2.4625,
"step": 3820
},
{
"epoch": 0.6972510467868196,
"grad_norm": 5.716314315795898,
"learning_rate": 2.4963681956990896e-05,
"loss": 2.4582,
"step": 3830
},
{
"epoch": 0.6990715456034954,
"grad_norm": 5.098658561706543,
"learning_rate": 2.4691860059262427e-05,
"loss": 2.449,
"step": 3840
},
{
"epoch": 0.7008920444201712,
"grad_norm": 6.102919578552246,
"learning_rate": 2.4421040086982587e-05,
"loss": 2.4678,
"step": 3850
},
{
"epoch": 0.7027125432368468,
"grad_norm": 6.12929630279541,
"learning_rate": 2.4151232761659305e-05,
"loss": 2.4162,
"step": 3860
},
{
"epoch": 0.7045330420535226,
"grad_norm": 5.601887226104736,
"learning_rate": 2.3882448764710698e-05,
"loss": 2.4455,
"step": 3870
},
{
"epoch": 0.7063535408701984,
"grad_norm": 5.384401798248291,
"learning_rate": 2.3614698737042355e-05,
"loss": 2.4618,
"step": 3880
},
{
"epoch": 0.7081740396868742,
"grad_norm": 5.636379718780518,
"learning_rate": 2.3347993278625933e-05,
"loss": 2.43,
"step": 3890
},
{
"epoch": 0.70999453850355,
"grad_norm": 6.716041088104248,
"learning_rate": 2.3082342948079606e-05,
"loss": 2.4177,
"step": 3900
},
{
"epoch": 0.7118150373202258,
"grad_norm": 4.779394149780273,
"learning_rate": 2.2817758262249988e-05,
"loss": 2.4502,
"step": 3910
},
{
"epoch": 0.7136355361369016,
"grad_norm": 5.225165843963623,
"learning_rate": 2.2554249695795878e-05,
"loss": 2.4954,
"step": 3920
},
{
"epoch": 0.7154560349535772,
"grad_norm": 5.962668418884277,
"learning_rate": 2.2291827680773508e-05,
"loss": 2.4367,
"step": 3930
},
{
"epoch": 0.717276533770253,
"grad_norm": 5.683213233947754,
"learning_rate": 2.2030502606223534e-05,
"loss": 2.4969,
"step": 3940
},
{
"epoch": 0.7190970325869288,
"grad_norm": 5.069901466369629,
"learning_rate": 2.1770284817759767e-05,
"loss": 2.4462,
"step": 3950
},
{
"epoch": 0.7209175314036046,
"grad_norm": 5.872584819793701,
"learning_rate": 2.1511184617159652e-05,
"loss": 2.4353,
"step": 3960
},
{
"epoch": 0.7227380302202804,
"grad_norm": 5.787328720092773,
"learning_rate": 2.125321226195634e-05,
"loss": 2.4174,
"step": 3970
},
{
"epoch": 0.7245585290369562,
"grad_norm": 8.6191987991333,
"learning_rate": 2.0996377965032638e-05,
"loss": 2.4482,
"step": 3980
},
{
"epoch": 0.7263790278536318,
"grad_norm": 5.412010192871094,
"learning_rate": 2.074069189421673e-05,
"loss": 2.4207,
"step": 3990
},
{
"epoch": 0.7281995266703076,
"grad_norm": 5.450544834136963,
"learning_rate": 2.0486164171879613e-05,
"loss": 2.4624,
"step": 4000
},
{
"epoch": 0.7281995266703076,
"eval_loss": 2.449294328689575,
"eval_runtime": 1012.1688,
"eval_samples_per_second": 9.649,
"eval_steps_per_second": 1.206,
"step": 4000
},
{
"epoch": 0.7300200254869834,
"grad_norm": 5.740320205688477,
"learning_rate": 2.0232804874534313e-05,
"loss": 2.4582,
"step": 4010
},
{
"epoch": 0.7318405243036592,
"grad_norm": 5.987521171569824,
"learning_rate": 1.998062403243704e-05,
"loss": 2.4499,
"step": 4020
},
{
"epoch": 0.733661023120335,
"grad_norm": 5.337474346160889,
"learning_rate": 1.9729631629190042e-05,
"loss": 2.4598,
"step": 4030
},
{
"epoch": 0.7354815219370108,
"grad_norm": 5.655992031097412,
"learning_rate": 1.9479837601346457e-05,
"loss": 2.4601,
"step": 4040
},
{
"epoch": 0.7373020207536866,
"grad_norm": 5.7331414222717285,
"learning_rate": 1.923125183801678e-05,
"loss": 2.4445,
"step": 4050
},
{
"epoch": 0.7391225195703622,
"grad_norm": 5.471503257751465,
"learning_rate": 1.898388418047753e-05,
"loss": 2.4683,
"step": 4060
},
{
"epoch": 0.740943018387038,
"grad_norm": 5.409184455871582,
"learning_rate": 1.87377444217815e-05,
"loss": 2.4358,
"step": 4070
},
{
"epoch": 0.7427635172037138,
"grad_norm": 5.579779148101807,
"learning_rate": 1.8492842306370182e-05,
"loss": 2.4989,
"step": 4080
},
{
"epoch": 0.7445840160203896,
"grad_norm": 5.366626262664795,
"learning_rate": 1.8249187529687895e-05,
"loss": 2.4102,
"step": 4090
},
{
"epoch": 0.7464045148370654,
"grad_norm": 5.263418674468994,
"learning_rate": 1.8006789737797984e-05,
"loss": 2.4573,
"step": 4100
},
{
"epoch": 0.7482250136537412,
"grad_norm": 5.129177570343018,
"learning_rate": 1.7765658527000966e-05,
"loss": 2.4792,
"step": 4110
},
{
"epoch": 0.7500455124704168,
"grad_norm": 6.237401962280273,
"learning_rate": 1.7525803443454615e-05,
"loss": 2.479,
"step": 4120
},
{
"epoch": 0.7518660112870926,
"grad_norm": 6.163425445556641,
"learning_rate": 1.728723398279603e-05,
"loss": 2.4222,
"step": 4130
},
{
"epoch": 0.7536865101037684,
"grad_norm": 5.254932403564453,
"learning_rate": 1.7049959589765686e-05,
"loss": 2.4307,
"step": 4140
},
{
"epoch": 0.7555070089204442,
"grad_norm": 6.144068717956543,
"learning_rate": 1.6813989657833534e-05,
"loss": 2.4923,
"step": 4150
},
{
"epoch": 0.75732750773712,
"grad_norm": 5.038397789001465,
"learning_rate": 1.6579333528827205e-05,
"loss": 2.4732,
"step": 4160
},
{
"epoch": 0.7591480065537958,
"grad_norm": 5.2848076820373535,
"learning_rate": 1.634600049256204e-05,
"loss": 2.4651,
"step": 4170
},
{
"epoch": 0.7609685053704716,
"grad_norm": 5.274468898773193,
"learning_rate": 1.611399978647342e-05,
"loss": 2.4407,
"step": 4180
},
{
"epoch": 0.7627890041871472,
"grad_norm": 5.039272308349609,
"learning_rate": 1.588334059525099e-05,
"loss": 2.4892,
"step": 4190
},
{
"epoch": 0.764609503003823,
"grad_norm": 5.7419867515563965,
"learning_rate": 1.5654032050475138e-05,
"loss": 2.4456,
"step": 4200
},
{
"epoch": 0.7664300018204988,
"grad_norm": 5.30146598815918,
"learning_rate": 1.5426083230255405e-05,
"loss": 2.4515,
"step": 4210
},
{
"epoch": 0.7682505006371746,
"grad_norm": 4.977199077606201,
"learning_rate": 1.5199503158871115e-05,
"loss": 2.4317,
"step": 4220
},
{
"epoch": 0.7700709994538504,
"grad_norm": 5.318095684051514,
"learning_rate": 1.4974300806414082e-05,
"loss": 2.403,
"step": 4230
},
{
"epoch": 0.7718914982705262,
"grad_norm": 5.638497352600098,
"learning_rate": 1.4750485088433592e-05,
"loss": 2.4327,
"step": 4240
},
{
"epoch": 0.7737119970872018,
"grad_norm": 5.739340305328369,
"learning_rate": 1.4528064865583301e-05,
"loss": 2.4266,
"step": 4250
},
{
"epoch": 0.7755324959038776,
"grad_norm": 4.749205112457275,
"learning_rate": 1.4307048943270606e-05,
"loss": 2.4136,
"step": 4260
},
{
"epoch": 0.7773529947205534,
"grad_norm": 5.616302490234375,
"learning_rate": 1.4087446071307903e-05,
"loss": 2.4197,
"step": 4270
},
{
"epoch": 0.7791734935372292,
"grad_norm": 5.402510643005371,
"learning_rate": 1.3869264943566263e-05,
"loss": 2.4194,
"step": 4280
},
{
"epoch": 0.780993992353905,
"grad_norm": 5.278769493103027,
"learning_rate": 1.3652514197631277e-05,
"loss": 2.4351,
"step": 4290
},
{
"epoch": 0.7828144911705808,
"grad_norm": 6.828596115112305,
"learning_rate": 1.343720241446103e-05,
"loss": 2.3813,
"step": 4300
},
{
"epoch": 0.7846349899872566,
"grad_norm": 5.306332588195801,
"learning_rate": 1.322333811804643e-05,
"loss": 2.4133,
"step": 4310
},
{
"epoch": 0.7864554888039322,
"grad_norm": 5.437227249145508,
"learning_rate": 1.3010929775073765e-05,
"loss": 2.4166,
"step": 4320
},
{
"epoch": 0.788275987620608,
"grad_norm": 5.493254661560059,
"learning_rate": 1.2799985794589497e-05,
"loss": 2.3842,
"step": 4330
},
{
"epoch": 0.7900964864372838,
"grad_norm": 5.259057521820068,
"learning_rate": 1.2590514527667336e-05,
"loss": 2.3783,
"step": 4340
},
{
"epoch": 0.7919169852539596,
"grad_norm": 5.750987529754639,
"learning_rate": 1.2382524267077645e-05,
"loss": 2.4202,
"step": 4350
},
{
"epoch": 0.7937374840706354,
"grad_norm": 4.952456951141357,
"learning_rate": 1.2176023246959133e-05,
"loss": 2.4393,
"step": 4360
},
{
"epoch": 0.7955579828873112,
"grad_norm": 5.3008713722229,
"learning_rate": 1.1971019642492942e-05,
"loss": 2.375,
"step": 4370
},
{
"epoch": 0.7973784817039868,
"grad_norm": 4.872366428375244,
"learning_rate": 1.176752156957886e-05,
"loss": 2.4257,
"step": 4380
},
{
"epoch": 0.7991989805206626,
"grad_norm": 5.488797664642334,
"learning_rate": 1.1565537084514123e-05,
"loss": 2.4424,
"step": 4390
},
{
"epoch": 0.8010194793373384,
"grad_norm": 5.145867824554443,
"learning_rate": 1.1365074183674468e-05,
"loss": 2.4806,
"step": 4400
},
{
"epoch": 0.8028399781540142,
"grad_norm": 5.343238353729248,
"learning_rate": 1.116614080319754e-05,
"loss": 2.4321,
"step": 4410
},
{
"epoch": 0.80466047697069,
"grad_norm": 5.240965366363525,
"learning_rate": 1.0968744818668691e-05,
"loss": 2.4358,
"step": 4420
},
{
"epoch": 0.8064809757873658,
"grad_norm": 5.5220513343811035,
"learning_rate": 1.0772894044809229e-05,
"loss": 2.442,
"step": 4430
},
{
"epoch": 0.8083014746040416,
"grad_norm": 4.8629045486450195,
"learning_rate": 1.0578596235166998e-05,
"loss": 2.4567,
"step": 4440
},
{
"epoch": 0.8101219734207172,
"grad_norm": 5.297680854797363,
"learning_rate": 1.0385859081809508e-05,
"loss": 2.4544,
"step": 4450
},
{
"epoch": 0.811942472237393,
"grad_norm": 5.134615898132324,
"learning_rate": 1.0194690215019292e-05,
"loss": 2.4656,
"step": 4460
},
{
"epoch": 0.8137629710540688,
"grad_norm": 5.012113571166992,
"learning_rate": 1.0005097202991948e-05,
"loss": 2.382,
"step": 4470
},
{
"epoch": 0.8155834698707446,
"grad_norm": 5.369142532348633,
"learning_rate": 9.817087551536414e-06,
"loss": 2.4584,
"step": 4480
},
{
"epoch": 0.8174039686874204,
"grad_norm": 5.545107841491699,
"learning_rate": 9.630668703777922e-06,
"loss": 2.4013,
"step": 4490
},
{
"epoch": 0.8192244675040962,
"grad_norm": 4.933434963226318,
"learning_rate": 9.445848039863252e-06,
"loss": 2.4516,
"step": 4500
},
{
"epoch": 0.8210449663207718,
"grad_norm": 4.916785717010498,
"learning_rate": 9.262632876668591e-06,
"loss": 2.4555,
"step": 4510
},
{
"epoch": 0.8228654651374476,
"grad_norm": 5.11759090423584,
"learning_rate": 9.08103046750986e-06,
"loss": 2.447,
"step": 4520
},
{
"epoch": 0.8246859639541234,
"grad_norm": 5.081522464752197,
"learning_rate": 8.901048001855583e-06,
"loss": 2.4004,
"step": 4530
},
{
"epoch": 0.8265064627707992,
"grad_norm": 5.203310489654541,
"learning_rate": 8.722692605042248e-06,
"loss": 2.4237,
"step": 4540
},
{
"epoch": 0.828326961587475,
"grad_norm": 5.090500831604004,
"learning_rate": 8.545971337992197e-06,
"loss": 2.4342,
"step": 4550
},
{
"epoch": 0.8301474604041508,
"grad_norm": 5.330081462860107,
"learning_rate": 8.37089119693411e-06,
"loss": 2.3922,
"step": 4560
},
{
"epoch": 0.8319679592208266,
"grad_norm": 5.339773178100586,
"learning_rate": 8.197459113126067e-06,
"loss": 2.4342,
"step": 4570
},
{
"epoch": 0.8337884580375022,
"grad_norm": 5.109127044677734,
"learning_rate": 8.02568195258107e-06,
"loss": 2.4207,
"step": 4580
},
{
"epoch": 0.835608956854178,
"grad_norm": 5.223607540130615,
"learning_rate": 7.855566515795282e-06,
"loss": 2.383,
"step": 4590
},
{
"epoch": 0.8374294556708538,
"grad_norm": 5.024397373199463,
"learning_rate": 7.687119537478799e-06,
"loss": 2.4197,
"step": 4600
},
{
"epoch": 0.8392499544875296,
"grad_norm": 5.112728595733643,
"learning_rate": 7.52034768628902e-06,
"loss": 2.4399,
"step": 4610
},
{
"epoch": 0.8410704533042054,
"grad_norm": 5.149270057678223,
"learning_rate": 7.3552575645666036e-06,
"loss": 2.417,
"step": 4620
},
{
"epoch": 0.8428909521208812,
"grad_norm": 5.0890350341796875,
"learning_rate": 7.191855708074152e-06,
"loss": 2.4217,
"step": 4630
},
{
"epoch": 0.8447114509375568,
"grad_norm": 5.196211338043213,
"learning_rate": 7.030148585737406e-06,
"loss": 2.4351,
"step": 4640
},
{
"epoch": 0.8465319497542326,
"grad_norm": 5.102319717407227,
"learning_rate": 6.870142599389217e-06,
"loss": 2.4397,
"step": 4650
},
{
"epoch": 0.8483524485709084,
"grad_norm": 5.2318220138549805,
"learning_rate": 6.711844083516022e-06,
"loss": 2.3708,
"step": 4660
},
{
"epoch": 0.8501729473875842,
"grad_norm": 5.27686071395874,
"learning_rate": 6.555259305007139e-06,
"loss": 2.4018,
"step": 4670
},
{
"epoch": 0.85199344620426,
"grad_norm": 5.138775825500488,
"learning_rate": 6.400394462906612e-06,
"loss": 2.4244,
"step": 4680
},
{
"epoch": 0.8538139450209358,
"grad_norm": 4.929832935333252,
"learning_rate": 6.247255688167852e-06,
"loss": 2.3637,
"step": 4690
},
{
"epoch": 0.8556344438376116,
"grad_norm": 5.328685283660889,
"learning_rate": 6.09584904341085e-06,
"loss": 2.4037,
"step": 4700
},
{
"epoch": 0.8574549426542872,
"grad_norm": 4.969110012054443,
"learning_rate": 5.946180522682227e-06,
"loss": 2.4054,
"step": 4710
},
{
"epoch": 0.859275441470963,
"grad_norm": 4.729069232940674,
"learning_rate": 5.798256051217882e-06,
"loss": 2.419,
"step": 4720
},
{
"epoch": 0.8610959402876388,
"grad_norm": 4.847239971160889,
"learning_rate": 5.652081485208482e-06,
"loss": 2.4067,
"step": 4730
},
{
"epoch": 0.8629164391043146,
"grad_norm": 4.862872123718262,
"learning_rate": 5.507662611567565e-06,
"loss": 2.4237,
"step": 4740
},
{
"epoch": 0.8647369379209904,
"grad_norm": 4.765954971313477,
"learning_rate": 5.365005147702462e-06,
"loss": 2.3841,
"step": 4750
},
{
"epoch": 0.8665574367376662,
"grad_norm": 5.191616535186768,
"learning_rate": 5.224114741287922e-06,
"loss": 2.4473,
"step": 4760
},
{
"epoch": 0.8683779355543418,
"grad_norm": 5.6387619972229,
"learning_rate": 5.084996970042599e-06,
"loss": 2.3946,
"step": 4770
},
{
"epoch": 0.8701984343710176,
"grad_norm": 4.992214202880859,
"learning_rate": 4.947657341508166e-06,
"loss": 2.4029,
"step": 4780
},
{
"epoch": 0.8720189331876934,
"grad_norm": 4.994503021240234,
"learning_rate": 4.812101292831283e-06,
"loss": 2.4212,
"step": 4790
},
{
"epoch": 0.8738394320043692,
"grad_norm": 5.3045220375061035,
"learning_rate": 4.678334190548378e-06,
"loss": 2.4025,
"step": 4800
},
{
"epoch": 0.875659930821045,
"grad_norm": 5.20510196685791,
"learning_rate": 4.546361330373178e-06,
"loss": 2.4087,
"step": 4810
},
{
"epoch": 0.8774804296377208,
"grad_norm": 5.151695728302002,
"learning_rate": 4.41618793698706e-06,
"loss": 2.3748,
"step": 4820
},
{
"epoch": 0.8793009284543966,
"grad_norm": 5.292723178863525,
"learning_rate": 4.287819163832179e-06,
"loss": 2.4068,
"step": 4830
},
{
"epoch": 0.8811214272710722,
"grad_norm": 5.057366847991943,
"learning_rate": 4.161260092907476e-06,
"loss": 2.4191,
"step": 4840
},
{
"epoch": 0.882941926087748,
"grad_norm": 5.375776290893555,
"learning_rate": 4.0365157345675255e-06,
"loss": 2.4085,
"step": 4850
},
{
"epoch": 0.8847624249044238,
"grad_norm": 5.110659599304199,
"learning_rate": 3.91359102732407e-06,
"loss": 2.3547,
"step": 4860
},
{
"epoch": 0.8865829237210996,
"grad_norm": 5.154110908508301,
"learning_rate": 3.792490837650642e-06,
"loss": 2.3731,
"step": 4870
},
{
"epoch": 0.8884034225377754,
"grad_norm": 4.55220365524292,
"learning_rate": 3.673219959789803e-06,
"loss": 2.3956,
"step": 4880
},
{
"epoch": 0.8902239213544512,
"grad_norm": 5.095583915710449,
"learning_rate": 3.5557831155633715e-06,
"loss": 2.4077,
"step": 4890
},
{
"epoch": 0.8920444201711268,
"grad_norm": 5.244335174560547,
"learning_rate": 3.4401849541855493e-06,
"loss": 2.4164,
"step": 4900
},
{
"epoch": 0.8938649189878026,
"grad_norm": 5.595185279846191,
"learning_rate": 3.3264300520787607e-06,
"loss": 2.4055,
"step": 4910
},
{
"epoch": 0.8956854178044784,
"grad_norm": 5.167870044708252,
"learning_rate": 3.214522912692547e-06,
"loss": 2.4064,
"step": 4920
},
{
"epoch": 0.8975059166211542,
"grad_norm": 4.97356653213501,
"learning_rate": 3.1044679663252807e-06,
"loss": 2.4249,
"step": 4930
},
{
"epoch": 0.89932641543783,
"grad_norm": 5.116016387939453,
"learning_rate": 2.996269569948745e-06,
"loss": 2.3617,
"step": 4940
},
{
"epoch": 0.9011469142545058,
"grad_norm": 5.192502498626709,
"learning_rate": 2.889932007035645e-06,
"loss": 2.3889,
"step": 4950
},
{
"epoch": 0.9029674130711816,
"grad_norm": 4.694880485534668,
"learning_rate": 2.7854594873900463e-06,
"loss": 2.4309,
"step": 4960
},
{
"epoch": 0.9047879118878572,
"grad_norm": 4.840787887573242,
"learning_rate": 2.6828561469807e-06,
"loss": 2.4412,
"step": 4970
},
{
"epoch": 0.906608410704533,
"grad_norm": 5.104063987731934,
"learning_rate": 2.582126047777328e-06,
"loss": 2.4295,
"step": 4980
},
{
"epoch": 0.9084289095212088,
"grad_norm": 4.761752605438232,
"learning_rate": 2.4832731775897844e-06,
"loss": 2.4125,
"step": 4990
},
{
"epoch": 0.9102494083378846,
"grad_norm": 4.580504894256592,
"learning_rate": 2.3863014499101775e-06,
"loss": 2.4762,
"step": 5000
},
{
"epoch": 0.9102494083378846,
"eval_loss": 2.4089949131011963,
"eval_runtime": 1012.4809,
"eval_samples_per_second": 9.646,
"eval_steps_per_second": 1.206,
"step": 5000
},
{
"epoch": 0.9120699071545604,
"grad_norm": 4.849244117736816,
"learning_rate": 2.291214703757982e-06,
"loss": 2.3958,
"step": 5010
},
{
"epoch": 0.9138904059712362,
"grad_norm": 4.8128204345703125,
"learning_rate": 2.1980167035280163e-06,
"loss": 2.4288,
"step": 5020
},
{
"epoch": 0.9157109047879118,
"grad_norm": 5.573403835296631,
"learning_rate": 2.1067111388414163e-06,
"loss": 2.4134,
"step": 5030
},
{
"epoch": 0.9175314036045876,
"grad_norm": 4.9487504959106445,
"learning_rate": 2.0173016243995866e-06,
"loss": 2.4095,
"step": 5040
},
{
"epoch": 0.9193519024212634,
"grad_norm": 4.933927536010742,
"learning_rate": 1.929791699841066e-06,
"loss": 2.4014,
"step": 5050
},
{
"epoch": 0.9211724012379392,
"grad_norm": 5.116062641143799,
"learning_rate": 1.844184829601453e-06,
"loss": 2.4196,
"step": 5060
},
{
"epoch": 0.922992900054615,
"grad_norm": 4.888516902923584,
"learning_rate": 1.7604844027761802e-06,
"loss": 2.4418,
"step": 5070
},
{
"epoch": 0.9248133988712908,
"grad_norm": 4.990447998046875,
"learning_rate": 1.6786937329864027e-06,
"loss": 2.4049,
"step": 5080
},
{
"epoch": 0.9266338976879666,
"grad_norm": 4.672518253326416,
"learning_rate": 1.5988160582477818e-06,
"loss": 2.3873,
"step": 5090
},
{
"epoch": 0.9284543965046422,
"grad_norm": 5.029353618621826,
"learning_rate": 1.5208545408423092e-06,
"loss": 2.4754,
"step": 5100
},
{
"epoch": 0.930274895321318,
"grad_norm": 4.660059928894043,
"learning_rate": 1.444812267193102e-06,
"loss": 2.4081,
"step": 5110
},
{
"epoch": 0.9320953941379938,
"grad_norm": 5.001034259796143,
"learning_rate": 1.3706922477422336e-06,
"loss": 2.4014,
"step": 5120
},
{
"epoch": 0.9339158929546696,
"grad_norm": 5.1275858879089355,
"learning_rate": 1.2984974168315234e-06,
"loss": 2.4251,
"step": 5130
},
{
"epoch": 0.9357363917713454,
"grad_norm": 4.893324375152588,
"learning_rate": 1.2282306325864135e-06,
"loss": 2.4196,
"step": 5140
},
{
"epoch": 0.9375568905880212,
"grad_norm": 4.734968662261963,
"learning_rate": 1.1598946768027863e-06,
"loss": 2.401,
"step": 5150
},
{
"epoch": 0.9393773894046968,
"grad_norm": 4.66255521774292,
"learning_rate": 1.0934922548368254e-06,
"loss": 2.3846,
"step": 5160
},
{
"epoch": 0.9411978882213726,
"grad_norm": 4.771427631378174,
"learning_rate": 1.0290259954979397e-06,
"loss": 2.3953,
"step": 5170
},
{
"epoch": 0.9430183870380484,
"grad_norm": 4.673166275024414,
"learning_rate": 9.664984509446917e-07,
"loss": 2.3694,
"step": 5180
},
{
"epoch": 0.9448388858547242,
"grad_norm": 4.778134346008301,
"learning_rate": 9.059120965837331e-07,
"loss": 2.3948,
"step": 5190
},
{
"epoch": 0.9466593846714,
"grad_norm": 4.706231594085693,
"learning_rate": 8.472693309718283e-07,
"loss": 2.4153,
"step": 5200
},
{
"epoch": 0.9484798834880758,
"grad_norm": 4.645259380340576,
"learning_rate": 7.905724757208965e-07,
"loss": 2.3806,
"step": 5210
},
{
"epoch": 0.9503003823047516,
"grad_norm": 5.04796838760376,
"learning_rate": 7.358237754060915e-07,
"loss": 2.454,
"step": 5220
},
{
"epoch": 0.9521208811214272,
"grad_norm": 4.7881646156311035,
"learning_rate": 6.830253974769496e-07,
"loss": 2.4161,
"step": 5230
},
{
"epoch": 0.953941379938103,
"grad_norm": 4.7254743576049805,
"learning_rate": 6.321794321715757e-07,
"loss": 2.4715,
"step": 5240
},
{
"epoch": 0.9557618787547788,
"grad_norm": 5.13754415512085,
"learning_rate": 5.832878924338869e-07,
"loss": 2.4191,
"step": 5250
},
{
"epoch": 0.9575823775714546,
"grad_norm": 4.781599998474121,
"learning_rate": 5.363527138339597e-07,
"loss": 2.4127,
"step": 5260
},
{
"epoch": 0.9594028763881304,
"grad_norm": 4.541421413421631,
"learning_rate": 4.913757544913355e-07,
"loss": 2.3908,
"step": 5270
},
{
"epoch": 0.9612233752048062,
"grad_norm": 5.078845500946045,
"learning_rate": 4.4835879500153556e-07,
"loss": 2.4303,
"step": 5280
},
{
"epoch": 0.9630438740214818,
"grad_norm": 4.745322227478027,
"learning_rate": 4.0730353836549993e-07,
"loss": 2.4046,
"step": 5290
},
{
"epoch": 0.9648643728381576,
"grad_norm": 4.688536643981934,
"learning_rate": 3.6821160992221993e-07,
"loss": 2.4456,
"step": 5300
},
{
"epoch": 0.9666848716548334,
"grad_norm": 4.9088592529296875,
"learning_rate": 3.310845572843557e-07,
"loss": 2.3846,
"step": 5310
},
{
"epoch": 0.9685053704715092,
"grad_norm": 5.126766681671143,
"learning_rate": 2.959238502769912e-07,
"loss": 2.4093,
"step": 5320
},
{
"epoch": 0.970325869288185,
"grad_norm": 4.49152946472168,
"learning_rate": 2.6273088087943597e-07,
"loss": 2.3837,
"step": 5330
},
{
"epoch": 0.9721463681048608,
"grad_norm": 4.944559097290039,
"learning_rate": 2.315069631701139e-07,
"loss": 2.3791,
"step": 5340
},
{
"epoch": 0.9739668669215366,
"grad_norm": 4.91040563583374,
"learning_rate": 2.022533332745602e-07,
"loss": 2.4035,
"step": 5350
},
{
"epoch": 0.9757873657382122,
"grad_norm": 4.91538143157959,
"learning_rate": 1.7497114931644965e-07,
"loss": 2.4057,
"step": 5360
},
{
"epoch": 0.977607864554888,
"grad_norm": 5.63076114654541,
"learning_rate": 1.496614913717831e-07,
"loss": 2.3627,
"step": 5370
},
{
"epoch": 0.9794283633715638,
"grad_norm": 4.944591045379639,
"learning_rate": 1.2632536142609397e-07,
"loss": 2.3662,
"step": 5380
},
{
"epoch": 0.9812488621882396,
"grad_norm": 4.864638328552246,
"learning_rate": 1.0496368333482442e-07,
"loss": 2.3704,
"step": 5390
},
{
"epoch": 0.9830693610049154,
"grad_norm": 4.991931438446045,
"learning_rate": 8.557730278669906e-08,
"loss": 2.3767,
"step": 5400
},
{
"epoch": 0.9848898598215912,
"grad_norm": 4.382468223571777,
"learning_rate": 6.816698727029614e-08,
"loss": 2.4112,
"step": 5410
},
{
"epoch": 0.9867103586382668,
"grad_norm": 44.841453552246094,
"learning_rate": 5.273342604361631e-08,
"loss": 2.4092,
"step": 5420
},
{
"epoch": 0.9885308574549426,
"grad_norm": 4.815988063812256,
"learning_rate": 3.9277230106832264e-08,
"loss": 2.4256,
"step": 5430
},
{
"epoch": 0.9903513562716184,
"grad_norm": 4.87392520904541,
"learning_rate": 2.7798932178080274e-08,
"loss": 2.3936,
"step": 5440
},
{
"epoch": 0.9921718550882942,
"grad_norm": 5.1465559005737305,
"learning_rate": 1.829898667237151e-08,
"loss": 2.3805,
"step": 5450
},
{
"epoch": 0.99399235390497,
"grad_norm": 4.486802101135254,
"learning_rate": 1.0777769683617544e-08,
"loss": 2.3492,
"step": 5460
},
{
"epoch": 0.9958128527216458,
"grad_norm": 5.0049614906311035,
"learning_rate": 5.2355789697144945e-09,
"loss": 2.4414,
"step": 5470
},
{
"epoch": 0.9976333515383216,
"grad_norm": 4.7070441246032715,
"learning_rate": 1.6726339407857616e-09,
"loss": 2.4294,
"step": 5480
},
{
"epoch": 0.9994538503549972,
"grad_norm": 4.9832539558410645,
"learning_rate": 8.907565046678557e-11,
"loss": 2.3724,
"step": 5490
}
],
"logging_steps": 10,
"max_steps": 5493,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.2843428615741768e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}