{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 1000, "global_step": 5493, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0018204988166757691, "grad_norm": 900.346923828125, "learning_rate": 2.0000000000000003e-06, "loss": 20.2441, "step": 10 }, { "epoch": 0.0036409976333515383, "grad_norm": 939.3712158203125, "learning_rate": 4.000000000000001e-06, "loss": 19.6193, "step": 20 }, { "epoch": 0.005461496450027308, "grad_norm": 562.291015625, "learning_rate": 6e-06, "loss": 18.7898, "step": 30 }, { "epoch": 0.007281995266703077, "grad_norm": 447.43060302734375, "learning_rate": 8.000000000000001e-06, "loss": 17.2234, "step": 40 }, { "epoch": 0.009102494083378846, "grad_norm": 361.34100341796875, "learning_rate": 1e-05, "loss": 15.4955, "step": 50 }, { "epoch": 0.010922992900054615, "grad_norm": 320.7839660644531, "learning_rate": 1.2e-05, "loss": 13.6977, "step": 60 }, { "epoch": 0.012743491716730384, "grad_norm": 131.0957794189453, "learning_rate": 1.4000000000000001e-05, "loss": 11.4716, "step": 70 }, { "epoch": 0.014563990533406153, "grad_norm": 75.75025177001953, "learning_rate": 1.6000000000000003e-05, "loss": 9.5351, "step": 80 }, { "epoch": 0.016384489350081924, "grad_norm": 42.2626953125, "learning_rate": 1.8e-05, "loss": 8.1668, "step": 90 }, { "epoch": 0.018204988166757693, "grad_norm": 25.88639259338379, "learning_rate": 2e-05, "loss": 7.3242, "step": 100 }, { "epoch": 0.02002548698343346, "grad_norm": 16.043380737304688, "learning_rate": 2.2000000000000003e-05, "loss": 6.6513, "step": 110 }, { "epoch": 0.02184598580010923, "grad_norm": 15.031912803649902, "learning_rate": 2.4e-05, "loss": 6.1476, "step": 120 }, { "epoch": 0.023666484616785, "grad_norm": 8.564423561096191, "learning_rate": 2.6000000000000002e-05, "loss": 5.7499, "step": 130 }, { "epoch": 0.02548698343346077, "grad_norm": 9.205732345581055, "learning_rate": 2.8000000000000003e-05, "loss": 5.4031, "step": 140 }, { "epoch": 0.027307482250136537, "grad_norm": 7.424269199371338, "learning_rate": 3e-05, "loss": 5.1054, "step": 150 }, { "epoch": 0.029127981066812306, "grad_norm": 7.239726543426514, "learning_rate": 3.2000000000000005e-05, "loss": 4.9637, "step": 160 }, { "epoch": 0.030948479883488075, "grad_norm": 9.377843856811523, "learning_rate": 3.4000000000000007e-05, "loss": 4.7583, "step": 170 }, { "epoch": 0.03276897870016385, "grad_norm": 5.893341064453125, "learning_rate": 3.6e-05, "loss": 4.7093, "step": 180 }, { "epoch": 0.03458947751683961, "grad_norm": 5.311996936798096, "learning_rate": 3.8e-05, "loss": 4.5147, "step": 190 }, { "epoch": 0.036409976333515386, "grad_norm": 5.311359882354736, "learning_rate": 4e-05, "loss": 4.5004, "step": 200 }, { "epoch": 0.03823047515019115, "grad_norm": 8.67163372039795, "learning_rate": 4.2e-05, "loss": 4.3923, "step": 210 }, { "epoch": 0.04005097396686692, "grad_norm": 5.317925930023193, "learning_rate": 4.4000000000000006e-05, "loss": 4.3169, "step": 220 }, { "epoch": 0.04187147278354269, "grad_norm": 10.292462348937988, "learning_rate": 4.600000000000001e-05, "loss": 4.2887, "step": 230 }, { "epoch": 0.04369197160021846, "grad_norm": 6.532808303833008, "learning_rate": 4.8e-05, "loss": 4.2458, "step": 240 }, { "epoch": 0.04551247041689423, "grad_norm": 5.974935531616211, "learning_rate": 5e-05, "loss": 4.1716, "step": 250 }, { "epoch": 0.04733296923357, "grad_norm": 10.58936595916748, "learning_rate": 5.2000000000000004e-05, "loss": 4.0606, "step": 260 }, { "epoch": 0.049153468050245765, "grad_norm": 11.461891174316406, "learning_rate": 5.4000000000000005e-05, "loss": 4.1349, "step": 270 }, { "epoch": 0.05097396686692154, "grad_norm": 8.389955520629883, "learning_rate": 5.6000000000000006e-05, "loss": 4.044, "step": 280 }, { "epoch": 0.0527944656835973, "grad_norm": 5.730175018310547, "learning_rate": 5.8e-05, "loss": 4.0169, "step": 290 }, { "epoch": 0.054614964500273075, "grad_norm": 8.236620903015137, "learning_rate": 6e-05, "loss": 3.8992, "step": 300 }, { "epoch": 0.05643546331694885, "grad_norm": 9.377148628234863, "learning_rate": 6.2e-05, "loss": 3.9041, "step": 310 }, { "epoch": 0.05825596213362461, "grad_norm": 8.225547790527344, "learning_rate": 6.400000000000001e-05, "loss": 3.8243, "step": 320 }, { "epoch": 0.060076460950300385, "grad_norm": 6.899202823638916, "learning_rate": 6.6e-05, "loss": 3.8613, "step": 330 }, { "epoch": 0.06189695976697615, "grad_norm": 7.693598747253418, "learning_rate": 6.800000000000001e-05, "loss": 3.7624, "step": 340 }, { "epoch": 0.06371745858365192, "grad_norm": 6.594521522521973, "learning_rate": 7e-05, "loss": 3.7709, "step": 350 }, { "epoch": 0.0655379574003277, "grad_norm": 7.71303129196167, "learning_rate": 7.2e-05, "loss": 3.699, "step": 360 }, { "epoch": 0.06735845621700345, "grad_norm": 11.58485221862793, "learning_rate": 7.4e-05, "loss": 3.6711, "step": 370 }, { "epoch": 0.06917895503367923, "grad_norm": 12.817239761352539, "learning_rate": 7.6e-05, "loss": 3.6647, "step": 380 }, { "epoch": 0.070999453850355, "grad_norm": 11.794710159301758, "learning_rate": 7.800000000000001e-05, "loss": 3.6485, "step": 390 }, { "epoch": 0.07281995266703077, "grad_norm": 7.5003509521484375, "learning_rate": 8e-05, "loss": 3.63, "step": 400 }, { "epoch": 0.07464045148370653, "grad_norm": 7.507719993591309, "learning_rate": 8.2e-05, "loss": 3.5663, "step": 410 }, { "epoch": 0.0764609503003823, "grad_norm": 10.392959594726562, "learning_rate": 8.4e-05, "loss": 3.5462, "step": 420 }, { "epoch": 0.07828144911705807, "grad_norm": 10.792546272277832, "learning_rate": 8.6e-05, "loss": 3.489, "step": 430 }, { "epoch": 0.08010194793373385, "grad_norm": 8.886263847351074, "learning_rate": 8.800000000000001e-05, "loss": 3.507, "step": 440 }, { "epoch": 0.0819224467504096, "grad_norm": 10.633005142211914, "learning_rate": 9e-05, "loss": 3.4786, "step": 450 }, { "epoch": 0.08374294556708538, "grad_norm": 8.927319526672363, "learning_rate": 9.200000000000001e-05, "loss": 3.4799, "step": 460 }, { "epoch": 0.08556344438376115, "grad_norm": 7.764442443847656, "learning_rate": 9.4e-05, "loss": 3.4424, "step": 470 }, { "epoch": 0.08738394320043692, "grad_norm": 7.294579982757568, "learning_rate": 9.6e-05, "loss": 3.3888, "step": 480 }, { "epoch": 0.0892044420171127, "grad_norm": 12.751729965209961, "learning_rate": 9.8e-05, "loss": 3.4342, "step": 490 }, { "epoch": 0.09102494083378845, "grad_norm": 12.688404083251953, "learning_rate": 0.0001, "loss": 3.412, "step": 500 }, { "epoch": 0.09284543965046423, "grad_norm": 8.39454174041748, "learning_rate": 9.99990102735217e-05, "loss": 3.3787, "step": 510 }, { "epoch": 0.09466593846714, "grad_norm": 8.661295890808105, "learning_rate": 9.999604113326911e-05, "loss": 3.3499, "step": 520 }, { "epoch": 0.09648643728381577, "grad_norm": 11.745123863220215, "learning_rate": 9.999109269678773e-05, "loss": 3.3025, "step": 530 }, { "epoch": 0.09830693610049153, "grad_norm": 11.857743263244629, "learning_rate": 9.998416515998146e-05, "loss": 3.3443, "step": 540 }, { "epoch": 0.1001274349171673, "grad_norm": 10.025672912597656, "learning_rate": 9.997525879710501e-05, "loss": 3.2624, "step": 550 }, { "epoch": 0.10194793373384307, "grad_norm": 11.870292663574219, "learning_rate": 9.996437396075289e-05, "loss": 3.3323, "step": 560 }, { "epoch": 0.10376843255051885, "grad_norm": 11.637451171875, "learning_rate": 9.995151108184551e-05, "loss": 3.2349, "step": 570 }, { "epoch": 0.1055889313671946, "grad_norm": 21.10885238647461, "learning_rate": 9.993667066961219e-05, "loss": 3.3025, "step": 580 }, { "epoch": 0.10740943018387038, "grad_norm": 19.601778030395508, "learning_rate": 9.991985331157083e-05, "loss": 3.2509, "step": 590 }, { "epoch": 0.10922992900054615, "grad_norm": 15.462264060974121, "learning_rate": 9.990105967350486e-05, "loss": 3.2197, "step": 600 }, { "epoch": 0.11105042781722192, "grad_norm": 10.48941421508789, "learning_rate": 9.98802904994367e-05, "loss": 3.2523, "step": 610 }, { "epoch": 0.1128709266338977, "grad_norm": 11.569725036621094, "learning_rate": 9.985754661159844e-05, "loss": 3.192, "step": 620 }, { "epoch": 0.11469142545057345, "grad_norm": 7.362033843994141, "learning_rate": 9.983282891039914e-05, "loss": 3.2174, "step": 630 }, { "epoch": 0.11651192426724923, "grad_norm": 7.256021022796631, "learning_rate": 9.98061383743894e-05, "loss": 3.1671, "step": 640 }, { "epoch": 0.118332423083925, "grad_norm": 8.411303520202637, "learning_rate": 9.97774760602224e-05, "loss": 3.1793, "step": 650 }, { "epoch": 0.12015292190060077, "grad_norm": 9.6874361038208, "learning_rate": 9.97468431026122e-05, "loss": 3.167, "step": 660 }, { "epoch": 0.12197342071727653, "grad_norm": 9.121397972106934, "learning_rate": 9.971424071428877e-05, "loss": 3.1107, "step": 670 }, { "epoch": 0.1237939195339523, "grad_norm": 13.291868209838867, "learning_rate": 9.967967018594997e-05, "loss": 3.1678, "step": 680 }, { "epoch": 0.12561441835062806, "grad_norm": 10.143365859985352, "learning_rate": 9.96431328862105e-05, "loss": 3.1591, "step": 690 }, { "epoch": 0.12743491716730385, "grad_norm": 9.821294784545898, "learning_rate": 9.96046302615477e-05, "loss": 3.1315, "step": 700 }, { "epoch": 0.1292554159839796, "grad_norm": 12.868693351745605, "learning_rate": 9.956416383624422e-05, "loss": 3.0713, "step": 710 }, { "epoch": 0.1310759148006554, "grad_norm": 10.408199310302734, "learning_rate": 9.95217352123278e-05, "loss": 3.1292, "step": 720 }, { "epoch": 0.13289641361733115, "grad_norm": 8.829959869384766, "learning_rate": 9.947734606950771e-05, "loss": 3.111, "step": 730 }, { "epoch": 0.1347169124340069, "grad_norm": 9.13364315032959, "learning_rate": 9.943099816510836e-05, "loss": 3.1011, "step": 740 }, { "epoch": 0.1365374112506827, "grad_norm": 8.674768447875977, "learning_rate": 9.93826933339997e-05, "loss": 3.0991, "step": 750 }, { "epoch": 0.13835791006735845, "grad_norm": 8.487624168395996, "learning_rate": 9.933243348852451e-05, "loss": 3.0915, "step": 760 }, { "epoch": 0.14017840888403424, "grad_norm": 7.808052062988281, "learning_rate": 9.928022061842282e-05, "loss": 3.0694, "step": 770 }, { "epoch": 0.14199890770071, "grad_norm": 12.615427017211914, "learning_rate": 9.922605679075298e-05, "loss": 3.0524, "step": 780 }, { "epoch": 0.14381940651738576, "grad_norm": 8.977923393249512, "learning_rate": 9.916994414981002e-05, "loss": 2.9989, "step": 790 }, { "epoch": 0.14563990533406154, "grad_norm": 8.723967552185059, "learning_rate": 9.911188491704058e-05, "loss": 3.0123, "step": 800 }, { "epoch": 0.1474604041507373, "grad_norm": 8.96397590637207, "learning_rate": 9.905188139095506e-05, "loss": 3.0031, "step": 810 }, { "epoch": 0.14928090296741306, "grad_norm": 21.173276901245117, "learning_rate": 9.89899359470366e-05, "loss": 3.0045, "step": 820 }, { "epoch": 0.15110140178408885, "grad_norm": 9.059152603149414, "learning_rate": 9.892605103764704e-05, "loss": 2.972, "step": 830 }, { "epoch": 0.1529219006007646, "grad_norm": 7.885227203369141, "learning_rate": 9.886022919192985e-05, "loss": 2.9822, "step": 840 }, { "epoch": 0.1547423994174404, "grad_norm": 7.191554069519043, "learning_rate": 9.879247301570995e-05, "loss": 3.0297, "step": 850 }, { "epoch": 0.15656289823411615, "grad_norm": 10.712031364440918, "learning_rate": 9.872278519139062e-05, "loss": 3.0149, "step": 860 }, { "epoch": 0.1583833970507919, "grad_norm": 10.305954933166504, "learning_rate": 9.865116847784726e-05, "loss": 3.0046, "step": 870 }, { "epoch": 0.1602038958674677, "grad_norm": 11.114262580871582, "learning_rate": 9.857762571031818e-05, "loss": 2.9784, "step": 880 }, { "epoch": 0.16202439468414345, "grad_norm": 10.611502647399902, "learning_rate": 9.850215980029234e-05, "loss": 2.9992, "step": 890 }, { "epoch": 0.1638448935008192, "grad_norm": 8.901230812072754, "learning_rate": 9.842477373539412e-05, "loss": 2.9712, "step": 900 }, { "epoch": 0.165665392317495, "grad_norm": 7.750337600708008, "learning_rate": 9.834547057926502e-05, "loss": 2.9586, "step": 910 }, { "epoch": 0.16748589113417076, "grad_norm": 8.038575172424316, "learning_rate": 9.826425347144237e-05, "loss": 2.9487, "step": 920 }, { "epoch": 0.16930638995084654, "grad_norm": 8.837507247924805, "learning_rate": 9.818112562723507e-05, "loss": 2.9682, "step": 930 }, { "epoch": 0.1711268887675223, "grad_norm": 8.412049293518066, "learning_rate": 9.809609033759625e-05, "loss": 2.9253, "step": 940 }, { "epoch": 0.17294738758419806, "grad_norm": 8.611617088317871, "learning_rate": 9.8009150968993e-05, "loss": 2.9494, "step": 950 }, { "epoch": 0.17476788640087385, "grad_norm": 8.870803833007812, "learning_rate": 9.792031096327318e-05, "loss": 2.9347, "step": 960 }, { "epoch": 0.1765883852175496, "grad_norm": 6.085882186889648, "learning_rate": 9.782957383752898e-05, "loss": 2.8954, "step": 970 }, { "epoch": 0.1784088840342254, "grad_norm": 53.74106979370117, "learning_rate": 9.773694318395786e-05, "loss": 2.9159, "step": 980 }, { "epoch": 0.18022938285090115, "grad_norm": 8.432936668395996, "learning_rate": 9.764242266972021e-05, "loss": 2.9752, "step": 990 }, { "epoch": 0.1820498816675769, "grad_norm": 6.479644298553467, "learning_rate": 9.75460160367943e-05, "loss": 2.938, "step": 1000 }, { "epoch": 0.1820498816675769, "eval_loss": 2.901522159576416, "eval_runtime": 1012.3411, "eval_samples_per_second": 9.647, "eval_steps_per_second": 1.206, "step": 1000 }, { "epoch": 0.1838703804842527, "grad_norm": 8.284567832946777, "learning_rate": 9.744772710182801e-05, "loss": 2.899, "step": 1010 }, { "epoch": 0.18569087930092845, "grad_norm": 6.623629570007324, "learning_rate": 9.734755975598777e-05, "loss": 2.9371, "step": 1020 }, { "epoch": 0.1875113781176042, "grad_norm": 8.541956901550293, "learning_rate": 9.724551796480459e-05, "loss": 2.8807, "step": 1030 }, { "epoch": 0.18933187693428, "grad_norm": 8.620600700378418, "learning_rate": 9.714160576801696e-05, "loss": 2.8888, "step": 1040 }, { "epoch": 0.19115237575095576, "grad_norm": 8.644622802734375, "learning_rate": 9.7035827279411e-05, "loss": 2.8747, "step": 1050 }, { "epoch": 0.19297287456763154, "grad_norm": 9.656100273132324, "learning_rate": 9.692818668665752e-05, "loss": 2.9203, "step": 1060 }, { "epoch": 0.1947933733843073, "grad_norm": 10.529635429382324, "learning_rate": 9.681868825114634e-05, "loss": 2.9257, "step": 1070 }, { "epoch": 0.19661387220098306, "grad_norm": 8.376754760742188, "learning_rate": 9.670733630781747e-05, "loss": 2.8864, "step": 1080 }, { "epoch": 0.19843437101765884, "grad_norm": 8.018802642822266, "learning_rate": 9.659413526498962e-05, "loss": 2.8672, "step": 1090 }, { "epoch": 0.2002548698343346, "grad_norm": 7.348598480224609, "learning_rate": 9.647908960418553e-05, "loss": 2.8528, "step": 1100 }, { "epoch": 0.2020753686510104, "grad_norm": 7.87021017074585, "learning_rate": 9.636220387995469e-05, "loss": 2.8713, "step": 1110 }, { "epoch": 0.20389586746768615, "grad_norm": 8.476405143737793, "learning_rate": 9.624348271969295e-05, "loss": 2.8667, "step": 1120 }, { "epoch": 0.2057163662843619, "grad_norm": 8.64283561706543, "learning_rate": 9.612293082345931e-05, "loss": 2.8523, "step": 1130 }, { "epoch": 0.2075368651010377, "grad_norm": 10.11330795288086, "learning_rate": 9.600055296378995e-05, "loss": 2.8375, "step": 1140 }, { "epoch": 0.20935736391771345, "grad_norm": 8.217743873596191, "learning_rate": 9.58763539855092e-05, "loss": 2.8685, "step": 1150 }, { "epoch": 0.2111778627343892, "grad_norm": 7.501378536224365, "learning_rate": 9.575033880553774e-05, "loss": 2.8349, "step": 1160 }, { "epoch": 0.212998361551065, "grad_norm": 8.812211036682129, "learning_rate": 9.562251241269798e-05, "loss": 2.8384, "step": 1170 }, { "epoch": 0.21481886036774075, "grad_norm": 7.964756011962891, "learning_rate": 9.549287986751655e-05, "loss": 2.8653, "step": 1180 }, { "epoch": 0.21663935918441654, "grad_norm": 7.216350555419922, "learning_rate": 9.536144630202395e-05, "loss": 2.8276, "step": 1190 }, { "epoch": 0.2184598580010923, "grad_norm": 7.890927314758301, "learning_rate": 9.522821691955135e-05, "loss": 2.7802, "step": 1200 }, { "epoch": 0.22028035681776806, "grad_norm": 8.259157180786133, "learning_rate": 9.509319699452469e-05, "loss": 2.8407, "step": 1210 }, { "epoch": 0.22210085563444384, "grad_norm": 7.810998916625977, "learning_rate": 9.495639187225575e-05, "loss": 2.8374, "step": 1220 }, { "epoch": 0.2239213544511196, "grad_norm": 6.905944347381592, "learning_rate": 9.481780696873059e-05, "loss": 2.8342, "step": 1230 }, { "epoch": 0.2257418532677954, "grad_norm": 8.832979202270508, "learning_rate": 9.467744777039517e-05, "loss": 2.7816, "step": 1240 }, { "epoch": 0.22756235208447115, "grad_norm": 6.949944972991943, "learning_rate": 9.453531983393809e-05, "loss": 2.8104, "step": 1250 }, { "epoch": 0.2293828509011469, "grad_norm": 11.183205604553223, "learning_rate": 9.439142878607061e-05, "loss": 2.8605, "step": 1260 }, { "epoch": 0.2312033497178227, "grad_norm": 8.672426223754883, "learning_rate": 9.424578032330398e-05, "loss": 2.7866, "step": 1270 }, { "epoch": 0.23302384853449845, "grad_norm": 8.570023536682129, "learning_rate": 9.409838021172375e-05, "loss": 2.7814, "step": 1280 }, { "epoch": 0.2348443473511742, "grad_norm": 17.605865478515625, "learning_rate": 9.394923428676168e-05, "loss": 2.8896, "step": 1290 }, { "epoch": 0.23666484616785, "grad_norm": 8.613877296447754, "learning_rate": 9.379834845296463e-05, "loss": 2.8474, "step": 1300 }, { "epoch": 0.23848534498452575, "grad_norm": 9.39710807800293, "learning_rate": 9.364572868376075e-05, "loss": 2.7771, "step": 1310 }, { "epoch": 0.24030584380120154, "grad_norm": 12.333969116210938, "learning_rate": 9.349138102122316e-05, "loss": 2.8079, "step": 1320 }, { "epoch": 0.2421263426178773, "grad_norm": 10.491060256958008, "learning_rate": 9.333531157583055e-05, "loss": 2.7536, "step": 1330 }, { "epoch": 0.24394684143455306, "grad_norm": 9.862618446350098, "learning_rate": 9.317752652622547e-05, "loss": 2.8011, "step": 1340 }, { "epoch": 0.24576734025122884, "grad_norm": 11.95722484588623, "learning_rate": 9.301803211896955e-05, "loss": 2.8058, "step": 1350 }, { "epoch": 0.2475878390679046, "grad_norm": 8.709095001220703, "learning_rate": 9.28568346682963e-05, "loss": 2.7922, "step": 1360 }, { "epoch": 0.2494083378845804, "grad_norm": 6.32808256149292, "learning_rate": 9.269394055586116e-05, "loss": 2.7246, "step": 1370 }, { "epoch": 0.2512288367012561, "grad_norm": 10.615900039672852, "learning_rate": 9.252935623048875e-05, "loss": 2.7993, "step": 1380 }, { "epoch": 0.2530493355179319, "grad_norm": 10.374322891235352, "learning_rate": 9.236308820791768e-05, "loss": 2.7583, "step": 1390 }, { "epoch": 0.2548698343346077, "grad_norm": 11.486263275146484, "learning_rate": 9.219514307054251e-05, "loss": 2.8258, "step": 1400 }, { "epoch": 0.2566903331512835, "grad_norm": 9.840982437133789, "learning_rate": 9.202552746715322e-05, "loss": 2.8464, "step": 1410 }, { "epoch": 0.2585108319679592, "grad_norm": 15.894274711608887, "learning_rate": 9.185424811267199e-05, "loss": 2.8465, "step": 1420 }, { "epoch": 0.260331330784635, "grad_norm": 8.428662300109863, "learning_rate": 9.168131178788726e-05, "loss": 2.8095, "step": 1430 }, { "epoch": 0.2621518296013108, "grad_norm": 17.082258224487305, "learning_rate": 9.150672533918544e-05, "loss": 2.7782, "step": 1440 }, { "epoch": 0.2639723284179865, "grad_norm": 7.154361724853516, "learning_rate": 9.133049567827982e-05, "loss": 2.7773, "step": 1450 }, { "epoch": 0.2657928272346623, "grad_norm": 6.119648456573486, "learning_rate": 9.115262978193679e-05, "loss": 2.7788, "step": 1460 }, { "epoch": 0.2676133260513381, "grad_norm": 8.635058403015137, "learning_rate": 9.097313469169988e-05, "loss": 2.7703, "step": 1470 }, { "epoch": 0.2694338248680138, "grad_norm": 12.325600624084473, "learning_rate": 9.079201751361082e-05, "loss": 2.7313, "step": 1480 }, { "epoch": 0.2712543236846896, "grad_norm": 8.181892395019531, "learning_rate": 9.06092854179283e-05, "loss": 2.7795, "step": 1490 }, { "epoch": 0.2730748225013654, "grad_norm": 14.719033241271973, "learning_rate": 9.042494563884404e-05, "loss": 2.8108, "step": 1500 }, { "epoch": 0.2748953213180411, "grad_norm": 7.8658061027526855, "learning_rate": 9.023900547419646e-05, "loss": 2.7663, "step": 1510 }, { "epoch": 0.2767158201347169, "grad_norm": 15.445107460021973, "learning_rate": 9.005147228518174e-05, "loss": 2.7878, "step": 1520 }, { "epoch": 0.2785363189513927, "grad_norm": 12.650901794433594, "learning_rate": 8.986235349606238e-05, "loss": 2.8219, "step": 1530 }, { "epoch": 0.2803568177680685, "grad_norm": 10.226774215698242, "learning_rate": 8.967165659387331e-05, "loss": 2.742, "step": 1540 }, { "epoch": 0.2821773165847442, "grad_norm": 138.37210083007812, "learning_rate": 8.947938912812548e-05, "loss": 2.9524, "step": 1550 }, { "epoch": 0.28399781540142, "grad_norm": 16.11450958251953, "learning_rate": 8.928555871050693e-05, "loss": 2.7966, "step": 1560 }, { "epoch": 0.2858183142180958, "grad_norm": 10.201882362365723, "learning_rate": 8.909017301458156e-05, "loss": 2.8389, "step": 1570 }, { "epoch": 0.2876388130347715, "grad_norm": 10.96867847442627, "learning_rate": 8.889323977548521e-05, "loss": 2.7495, "step": 1580 }, { "epoch": 0.2894593118514473, "grad_norm": 10.814942359924316, "learning_rate": 8.869476678961954e-05, "loss": 2.7676, "step": 1590 }, { "epoch": 0.2912798106681231, "grad_norm": 6.535337448120117, "learning_rate": 8.849476191434334e-05, "loss": 2.7589, "step": 1600 }, { "epoch": 0.2931003094847988, "grad_norm": 7.036696434020996, "learning_rate": 8.829323306766142e-05, "loss": 2.7921, "step": 1610 }, { "epoch": 0.2949208083014746, "grad_norm": 5.92086124420166, "learning_rate": 8.809018822791121e-05, "loss": 2.7267, "step": 1620 }, { "epoch": 0.2967413071181504, "grad_norm": 6.186739921569824, "learning_rate": 8.788563543344688e-05, "loss": 2.795, "step": 1630 }, { "epoch": 0.2985618059348261, "grad_norm": 8.154546737670898, "learning_rate": 8.767958278232112e-05, "loss": 2.7627, "step": 1640 }, { "epoch": 0.3003823047515019, "grad_norm": 7.674529075622559, "learning_rate": 8.74720384319645e-05, "loss": 2.7996, "step": 1650 }, { "epoch": 0.3022028035681777, "grad_norm": 6.348474025726318, "learning_rate": 8.726301059886259e-05, "loss": 2.7704, "step": 1660 }, { "epoch": 0.3040233023848534, "grad_norm": 10.496267318725586, "learning_rate": 8.705250755823064e-05, "loss": 2.7591, "step": 1670 }, { "epoch": 0.3058438012015292, "grad_norm": 102.05543518066406, "learning_rate": 8.684053764368598e-05, "loss": 2.8027, "step": 1680 }, { "epoch": 0.307664300018205, "grad_norm": 8.403404235839844, "learning_rate": 8.662710924691805e-05, "loss": 2.8801, "step": 1690 }, { "epoch": 0.3094847988348808, "grad_norm": 7.355569839477539, "learning_rate": 8.64122308173563e-05, "loss": 2.8346, "step": 1700 }, { "epoch": 0.3113052976515565, "grad_norm": 12.551121711730957, "learning_rate": 8.61959108618356e-05, "loss": 2.8381, "step": 1710 }, { "epoch": 0.3131257964682323, "grad_norm": 116.6989517211914, "learning_rate": 8.597815794425943e-05, "loss": 2.814, "step": 1720 }, { "epoch": 0.3149462952849081, "grad_norm": 21.63788604736328, "learning_rate": 8.575898068526093e-05, "loss": 2.8389, "step": 1730 }, { "epoch": 0.3167667941015838, "grad_norm": 143.42408752441406, "learning_rate": 8.553838776186158e-05, "loss": 2.8534, "step": 1740 }, { "epoch": 0.3185872929182596, "grad_norm": 9.04028034210205, "learning_rate": 8.531638790712765e-05, "loss": 2.8186, "step": 1750 }, { "epoch": 0.3204077917349354, "grad_norm": 11.659414291381836, "learning_rate": 8.509298990982453e-05, "loss": 2.8078, "step": 1760 }, { "epoch": 0.3222282905516111, "grad_norm": 7.934113502502441, "learning_rate": 8.486820261406873e-05, "loss": 2.792, "step": 1770 }, { "epoch": 0.3240487893682869, "grad_norm": 12.919567108154297, "learning_rate": 8.464203491897779e-05, "loss": 2.8111, "step": 1780 }, { "epoch": 0.3258692881849627, "grad_norm": 13.67540454864502, "learning_rate": 8.441449577831801e-05, "loss": 2.8085, "step": 1790 }, { "epoch": 0.3276897870016384, "grad_norm": 7.7655110359191895, "learning_rate": 8.418559420014984e-05, "loss": 2.7689, "step": 1800 }, { "epoch": 0.3295102858183142, "grad_norm": 8.168259620666504, "learning_rate": 8.395533924647141e-05, "loss": 2.7534, "step": 1810 }, { "epoch": 0.33133078463499, "grad_norm": 14.387748718261719, "learning_rate": 8.372374003285968e-05, "loss": 2.8353, "step": 1820 }, { "epoch": 0.3331512834516658, "grad_norm": 9.209723472595215, "learning_rate": 8.349080572810965e-05, "loss": 2.7837, "step": 1830 }, { "epoch": 0.3349717822683415, "grad_norm": 9.160303115844727, "learning_rate": 8.325654555387123e-05, "loss": 2.8186, "step": 1840 }, { "epoch": 0.3367922810850173, "grad_norm": 20.171415328979492, "learning_rate": 8.302096878428438e-05, "loss": 2.8011, "step": 1850 }, { "epoch": 0.3386127799016931, "grad_norm": 29.545217514038086, "learning_rate": 8.278408474561169e-05, "loss": 2.7971, "step": 1860 }, { "epoch": 0.3404332787183688, "grad_norm": 19.314136505126953, "learning_rate": 8.254590281586942e-05, "loss": 2.7983, "step": 1870 }, { "epoch": 0.3422537775350446, "grad_norm": 8.010175704956055, "learning_rate": 8.230643242445605e-05, "loss": 2.7921, "step": 1880 }, { "epoch": 0.3440742763517204, "grad_norm": 24.649381637573242, "learning_rate": 8.206568305177907e-05, "loss": 2.7962, "step": 1890 }, { "epoch": 0.3458947751683961, "grad_norm": 8.272650718688965, "learning_rate": 8.182366422887964e-05, "loss": 2.7439, "step": 1900 }, { "epoch": 0.3477152739850719, "grad_norm": 7.553550720214844, "learning_rate": 8.158038553705524e-05, "loss": 2.7845, "step": 1910 }, { "epoch": 0.3495357728017477, "grad_norm": 8.573986053466797, "learning_rate": 8.13358566074804e-05, "loss": 2.7003, "step": 1920 }, { "epoch": 0.3513562716184234, "grad_norm": 10.316489219665527, "learning_rate": 8.109008712082538e-05, "loss": 2.7627, "step": 1930 }, { "epoch": 0.3531767704350992, "grad_norm": 8.462483406066895, "learning_rate": 8.084308680687287e-05, "loss": 2.7281, "step": 1940 }, { "epoch": 0.354997269251775, "grad_norm": 20.140274047851562, "learning_rate": 8.059486544413298e-05, "loss": 2.6906, "step": 1950 }, { "epoch": 0.3568177680684508, "grad_norm": 7.473912239074707, "learning_rate": 8.034543285945584e-05, "loss": 2.8117, "step": 1960 }, { "epoch": 0.3586382668851265, "grad_norm": 38.26898193359375, "learning_rate": 8.009479892764284e-05, "loss": 2.7456, "step": 1970 }, { "epoch": 0.3604587657018023, "grad_norm": 48.63120651245117, "learning_rate": 7.984297357105552e-05, "loss": 2.7224, "step": 1980 }, { "epoch": 0.3622792645184781, "grad_norm": 10.31283187866211, "learning_rate": 7.95899667592228e-05, "loss": 2.7108, "step": 1990 }, { "epoch": 0.3640997633351538, "grad_norm": 6.465616703033447, "learning_rate": 7.933578850844636e-05, "loss": 2.6901, "step": 2000 }, { "epoch": 0.3640997633351538, "eval_loss": 2.735260248184204, "eval_runtime": 1011.986, "eval_samples_per_second": 9.65, "eval_steps_per_second": 1.207, "step": 2000 }, { "epoch": 0.3659202621518296, "grad_norm": 7.348064422607422, "learning_rate": 7.908044888140394e-05, "loss": 2.7194, "step": 2010 }, { "epoch": 0.3677407609685054, "grad_norm": 10.493142127990723, "learning_rate": 7.882395798675115e-05, "loss": 2.7374, "step": 2020 }, { "epoch": 0.3695612597851811, "grad_norm": 9.447548866271973, "learning_rate": 7.856632597872122e-05, "loss": 2.7186, "step": 2030 }, { "epoch": 0.3713817586018569, "grad_norm": 6.926711559295654, "learning_rate": 7.83075630567229e-05, "loss": 2.7369, "step": 2040 }, { "epoch": 0.3732022574185327, "grad_norm": 19.662818908691406, "learning_rate": 7.804767946493685e-05, "loss": 2.71, "step": 2050 }, { "epoch": 0.3750227562352084, "grad_norm": 37.960330963134766, "learning_rate": 7.778668549190994e-05, "loss": 2.7497, "step": 2060 }, { "epoch": 0.3768432550518842, "grad_norm": 10.371471405029297, "learning_rate": 7.7524591470148e-05, "loss": 2.7329, "step": 2070 }, { "epoch": 0.37866375386856, "grad_norm": 7.78175163269043, "learning_rate": 7.726140777570675e-05, "loss": 2.6866, "step": 2080 }, { "epoch": 0.3804842526852358, "grad_norm": 63.65814971923828, "learning_rate": 7.699714482778104e-05, "loss": 2.6993, "step": 2090 }, { "epoch": 0.3823047515019115, "grad_norm": 20.166156768798828, "learning_rate": 7.673181308829233e-05, "loss": 2.7455, "step": 2100 }, { "epoch": 0.3841252503185873, "grad_norm": 10.943995475769043, "learning_rate": 7.646542306147455e-05, "loss": 2.7369, "step": 2110 }, { "epoch": 0.3859457491352631, "grad_norm": 8.025435447692871, "learning_rate": 7.619798529345825e-05, "loss": 2.6558, "step": 2120 }, { "epoch": 0.3877662479519388, "grad_norm": 7.745648384094238, "learning_rate": 7.592951037185301e-05, "loss": 2.7071, "step": 2130 }, { "epoch": 0.3895867467686146, "grad_norm": 11.425312995910645, "learning_rate": 7.566000892532838e-05, "loss": 2.7322, "step": 2140 }, { "epoch": 0.3914072455852904, "grad_norm": 21.18678855895996, "learning_rate": 7.538949162319306e-05, "loss": 2.6649, "step": 2150 }, { "epoch": 0.3932277444019661, "grad_norm": 7.39724588394165, "learning_rate": 7.511796917497255e-05, "loss": 2.6632, "step": 2160 }, { "epoch": 0.3950482432186419, "grad_norm": 26.170703887939453, "learning_rate": 7.484545232998508e-05, "loss": 2.7239, "step": 2170 }, { "epoch": 0.3968687420353177, "grad_norm": 9.031233787536621, "learning_rate": 7.457195187691614e-05, "loss": 2.6995, "step": 2180 }, { "epoch": 0.3986892408519934, "grad_norm": 9.091829299926758, "learning_rate": 7.429747864339136e-05, "loss": 2.6826, "step": 2190 }, { "epoch": 0.4005097396686692, "grad_norm": 8.380509376525879, "learning_rate": 7.40220434955478e-05, "loss": 2.7026, "step": 2200 }, { "epoch": 0.402330238485345, "grad_norm": 5.83329439163208, "learning_rate": 7.374565733760379e-05, "loss": 2.6523, "step": 2210 }, { "epoch": 0.4041507373020208, "grad_norm": 6.2706475257873535, "learning_rate": 7.346833111142735e-05, "loss": 2.6854, "step": 2220 }, { "epoch": 0.4059712361186965, "grad_norm": 6.717202663421631, "learning_rate": 7.319007579610277e-05, "loss": 2.6949, "step": 2230 }, { "epoch": 0.4077917349353723, "grad_norm": 9.39269733428955, "learning_rate": 7.291090240749621e-05, "loss": 2.6903, "step": 2240 }, { "epoch": 0.4096122337520481, "grad_norm": 5.5886383056640625, "learning_rate": 7.263082199781949e-05, "loss": 2.6633, "step": 2250 }, { "epoch": 0.4114327325687238, "grad_norm": 7.369466304779053, "learning_rate": 7.234984565519247e-05, "loss": 2.6493, "step": 2260 }, { "epoch": 0.4132532313853996, "grad_norm": 11.548351287841797, "learning_rate": 7.206798450320422e-05, "loss": 2.6406, "step": 2270 }, { "epoch": 0.4150737302020754, "grad_norm": 9.547252655029297, "learning_rate": 7.178524970047253e-05, "loss": 2.7079, "step": 2280 }, { "epoch": 0.4168942290187511, "grad_norm": 7.008022785186768, "learning_rate": 7.150165244020224e-05, "loss": 2.6277, "step": 2290 }, { "epoch": 0.4187147278354269, "grad_norm": 7.342654705047607, "learning_rate": 7.121720394974206e-05, "loss": 2.6857, "step": 2300 }, { "epoch": 0.4205352266521027, "grad_norm": 6.655099391937256, "learning_rate": 7.093191549014007e-05, "loss": 2.654, "step": 2310 }, { "epoch": 0.4223557254687784, "grad_norm": 6.727902412414551, "learning_rate": 7.0645798355698e-05, "loss": 2.6778, "step": 2320 }, { "epoch": 0.4241762242854542, "grad_norm": 6.407891273498535, "learning_rate": 7.035886387352399e-05, "loss": 2.6798, "step": 2330 }, { "epoch": 0.42599672310213, "grad_norm": 6.782746315002441, "learning_rate": 7.007112340308423e-05, "loss": 2.6669, "step": 2340 }, { "epoch": 0.4278172219188058, "grad_norm": 6.325183868408203, "learning_rate": 6.97825883357532e-05, "loss": 2.6556, "step": 2350 }, { "epoch": 0.4296377207354815, "grad_norm": 5.9982008934021, "learning_rate": 6.949327009436278e-05, "loss": 2.6764, "step": 2360 }, { "epoch": 0.4314582195521573, "grad_norm": 6.148717403411865, "learning_rate": 6.920318013274988e-05, "loss": 2.6792, "step": 2370 }, { "epoch": 0.4332787183688331, "grad_norm": 6.28306245803833, "learning_rate": 6.891232993530319e-05, "loss": 2.6945, "step": 2380 }, { "epoch": 0.4350992171855088, "grad_norm": 6.82548713684082, "learning_rate": 6.862073101650837e-05, "loss": 2.6592, "step": 2390 }, { "epoch": 0.4369197160021846, "grad_norm": 6.251521587371826, "learning_rate": 6.832839492049225e-05, "loss": 2.6386, "step": 2400 }, { "epoch": 0.4387402148188604, "grad_norm": 6.3002800941467285, "learning_rate": 6.80353332205658e-05, "loss": 2.6491, "step": 2410 }, { "epoch": 0.4405607136355361, "grad_norm": 6.661396026611328, "learning_rate": 6.774155751876603e-05, "loss": 2.6538, "step": 2420 }, { "epoch": 0.4423812124522119, "grad_norm": 5.950438022613525, "learning_rate": 6.744707944539654e-05, "loss": 2.603, "step": 2430 }, { "epoch": 0.4442017112688877, "grad_norm": 9.665274620056152, "learning_rate": 6.715191065856721e-05, "loss": 2.6364, "step": 2440 }, { "epoch": 0.4460222100855634, "grad_norm": 7.168936252593994, "learning_rate": 6.685606284373258e-05, "loss": 2.6911, "step": 2450 }, { "epoch": 0.4478427089022392, "grad_norm": 6.2818145751953125, "learning_rate": 6.655954771322929e-05, "loss": 2.6304, "step": 2460 }, { "epoch": 0.449663207718915, "grad_norm": 6.3358964920043945, "learning_rate": 6.626237700581238e-05, "loss": 2.6124, "step": 2470 }, { "epoch": 0.4514837065355908, "grad_norm": 7.2275004386901855, "learning_rate": 6.596456248619054e-05, "loss": 2.644, "step": 2480 }, { "epoch": 0.4533042053522665, "grad_norm": 5.857280254364014, "learning_rate": 6.566611594456042e-05, "loss": 2.6175, "step": 2490 }, { "epoch": 0.4551247041689423, "grad_norm": 6.958190441131592, "learning_rate": 6.536704919613982e-05, "loss": 2.6372, "step": 2500 }, { "epoch": 0.4569452029856181, "grad_norm": 6.084266662597656, "learning_rate": 6.506737408069988e-05, "loss": 2.5989, "step": 2510 }, { "epoch": 0.4587657018022938, "grad_norm": 6.761366367340088, "learning_rate": 6.476710246209649e-05, "loss": 2.6231, "step": 2520 }, { "epoch": 0.4605862006189696, "grad_norm": 6.110794544219971, "learning_rate": 6.446624622780052e-05, "loss": 2.6294, "step": 2530 }, { "epoch": 0.4624066994356454, "grad_norm": 7.39132022857666, "learning_rate": 6.416481728842722e-05, "loss": 2.589, "step": 2540 }, { "epoch": 0.4642271982523211, "grad_norm": 5.385328769683838, "learning_rate": 6.386282757726467e-05, "loss": 2.5938, "step": 2550 }, { "epoch": 0.4660476970689969, "grad_norm": 6.137452602386475, "learning_rate": 6.356028904980145e-05, "loss": 2.611, "step": 2560 }, { "epoch": 0.4678681958856727, "grad_norm": 6.881803512573242, "learning_rate": 6.325721368325317e-05, "loss": 2.5577, "step": 2570 }, { "epoch": 0.4696886947023484, "grad_norm": 7.0561323165893555, "learning_rate": 6.295361347608846e-05, "loss": 2.5971, "step": 2580 }, { "epoch": 0.4715091935190242, "grad_norm": 8.178688049316406, "learning_rate": 6.264950044755387e-05, "loss": 2.6111, "step": 2590 }, { "epoch": 0.4733296923357, "grad_norm": 7.674656867980957, "learning_rate": 6.234488663719807e-05, "loss": 2.6211, "step": 2600 }, { "epoch": 0.4751501911523758, "grad_norm": 6.952455043792725, "learning_rate": 6.203978410439519e-05, "loss": 2.5976, "step": 2610 }, { "epoch": 0.4769706899690515, "grad_norm": 7.033661842346191, "learning_rate": 6.173420492786747e-05, "loss": 2.5667, "step": 2620 }, { "epoch": 0.4787911887857273, "grad_norm": 7.665953636169434, "learning_rate": 6.142816120520699e-05, "loss": 2.5964, "step": 2630 }, { "epoch": 0.4806116876024031, "grad_norm": 6.789517402648926, "learning_rate": 6.112166505239679e-05, "loss": 2.5046, "step": 2640 }, { "epoch": 0.4824321864190788, "grad_norm": 6.597125053405762, "learning_rate": 6.0814728603331176e-05, "loss": 2.6044, "step": 2650 }, { "epoch": 0.4842526852357546, "grad_norm": 5.781830310821533, "learning_rate": 6.050736400933538e-05, "loss": 2.6041, "step": 2660 }, { "epoch": 0.4860731840524304, "grad_norm": 6.907136917114258, "learning_rate": 6.0199583438684495e-05, "loss": 2.587, "step": 2670 }, { "epoch": 0.4878936828691061, "grad_norm": 5.483163833618164, "learning_rate": 5.989139907612174e-05, "loss": 2.6136, "step": 2680 }, { "epoch": 0.4897141816857819, "grad_norm": 5.925361156463623, "learning_rate": 5.958282312237605e-05, "loss": 2.5801, "step": 2690 }, { "epoch": 0.4915346805024577, "grad_norm": 6.07294225692749, "learning_rate": 5.927386779367912e-05, "loss": 2.6287, "step": 2700 }, { "epoch": 0.4933551793191334, "grad_norm": 5.8490424156188965, "learning_rate": 5.896454532128171e-05, "loss": 2.5894, "step": 2710 }, { "epoch": 0.4951756781358092, "grad_norm": 6.67198371887207, "learning_rate": 5.865486795096948e-05, "loss": 2.5684, "step": 2720 }, { "epoch": 0.496996176952485, "grad_norm": 6.779095649719238, "learning_rate": 5.8344847942578175e-05, "loss": 2.5916, "step": 2730 }, { "epoch": 0.4988166757691608, "grad_norm": 6.148252487182617, "learning_rate": 5.8034497569508206e-05, "loss": 2.579, "step": 2740 }, { "epoch": 0.5006371745858366, "grad_norm": 7.566195011138916, "learning_rate": 5.772382911823886e-05, "loss": 2.5639, "step": 2750 }, { "epoch": 0.5024576734025122, "grad_norm": 6.119766712188721, "learning_rate": 5.741285488784183e-05, "loss": 2.5728, "step": 2760 }, { "epoch": 0.504278172219188, "grad_norm": 6.376175880432129, "learning_rate": 5.710158718949431e-05, "loss": 2.5889, "step": 2770 }, { "epoch": 0.5060986710358638, "grad_norm": 7.798823833465576, "learning_rate": 5.67900383459916e-05, "loss": 2.5611, "step": 2780 }, { "epoch": 0.5079191698525396, "grad_norm": 5.470147132873535, "learning_rate": 5.64782206912593e-05, "loss": 2.6003, "step": 2790 }, { "epoch": 0.5097396686692154, "grad_norm": 6.034721374511719, "learning_rate": 5.6166146569864986e-05, "loss": 2.6126, "step": 2800 }, { "epoch": 0.5115601674858912, "grad_norm": 5.667842388153076, "learning_rate": 5.585382833652951e-05, "loss": 2.5349, "step": 2810 }, { "epoch": 0.513380666302567, "grad_norm": 5.957113742828369, "learning_rate": 5.554127835563784e-05, "loss": 2.5567, "step": 2820 }, { "epoch": 0.5152011651192426, "grad_norm": 8.104249000549316, "learning_rate": 5.5228509000749705e-05, "loss": 2.5654, "step": 2830 }, { "epoch": 0.5170216639359184, "grad_norm": 5.897515296936035, "learning_rate": 5.491553265410956e-05, "loss": 2.599, "step": 2840 }, { "epoch": 0.5188421627525942, "grad_norm": 7.213466167449951, "learning_rate": 5.46023617061565e-05, "loss": 2.5782, "step": 2850 }, { "epoch": 0.52066266156927, "grad_norm": 6.2212910652160645, "learning_rate": 5.4289008555033704e-05, "loss": 2.5869, "step": 2860 }, { "epoch": 0.5224831603859458, "grad_norm": 7.721590042114258, "learning_rate": 5.397548560609762e-05, "loss": 2.5546, "step": 2870 }, { "epoch": 0.5243036592026216, "grad_norm": 6.373271465301514, "learning_rate": 5.366180527142678e-05, "loss": 2.555, "step": 2880 }, { "epoch": 0.5261241580192972, "grad_norm": 7.320889949798584, "learning_rate": 5.334797996933052e-05, "loss": 2.5487, "step": 2890 }, { "epoch": 0.527944656835973, "grad_norm": 5.541944980621338, "learning_rate": 5.3034022123857285e-05, "loss": 2.6031, "step": 2900 }, { "epoch": 0.5297651556526488, "grad_norm": 6.056175708770752, "learning_rate": 5.271994416430278e-05, "loss": 2.5569, "step": 2910 }, { "epoch": 0.5315856544693246, "grad_norm": 6.227322578430176, "learning_rate": 5.240575852471791e-05, "loss": 2.5726, "step": 2920 }, { "epoch": 0.5334061532860004, "grad_norm": 5.931169509887695, "learning_rate": 5.2091477643416565e-05, "loss": 2.563, "step": 2930 }, { "epoch": 0.5352266521026762, "grad_norm": 5.819269180297852, "learning_rate": 5.17771139624831e-05, "loss": 2.5544, "step": 2940 }, { "epoch": 0.537047150919352, "grad_norm": 5.978631973266602, "learning_rate": 5.14626799272799e-05, "loss": 2.5477, "step": 2950 }, { "epoch": 0.5388676497360276, "grad_norm": 7.41937255859375, "learning_rate": 5.114818798595457e-05, "loss": 2.537, "step": 2960 }, { "epoch": 0.5406881485527034, "grad_norm": 6.325901985168457, "learning_rate": 5.083365058894717e-05, "loss": 2.5219, "step": 2970 }, { "epoch": 0.5425086473693792, "grad_norm": 5.593471050262451, "learning_rate": 5.051908018849729e-05, "loss": 2.5031, "step": 2980 }, { "epoch": 0.544329146186055, "grad_norm": 6.014733791351318, "learning_rate": 5.020448923815115e-05, "loss": 2.5559, "step": 2990 }, { "epoch": 0.5461496450027308, "grad_norm": 5.821467399597168, "learning_rate": 4.988989019226846e-05, "loss": 2.5543, "step": 3000 }, { "epoch": 0.5461496450027308, "eval_loss": 2.542562246322632, "eval_runtime": 1012.6169, "eval_samples_per_second": 9.644, "eval_steps_per_second": 1.206, "step": 3000 }, { "epoch": 0.5479701438194066, "grad_norm": 5.459403991699219, "learning_rate": 4.9575295505529476e-05, "loss": 2.5296, "step": 3010 }, { "epoch": 0.5497906426360822, "grad_norm": 5.517323970794678, "learning_rate": 4.926071763244182e-05, "loss": 2.5244, "step": 3020 }, { "epoch": 0.551611141452758, "grad_norm": 6.594572067260742, "learning_rate": 4.894616902684755e-05, "loss": 2.5305, "step": 3030 }, { "epoch": 0.5534316402694338, "grad_norm": 5.410883903503418, "learning_rate": 4.8631662141429965e-05, "loss": 2.52, "step": 3040 }, { "epoch": 0.5552521390861096, "grad_norm": 6.53268575668335, "learning_rate": 4.8317209427220776e-05, "loss": 2.552, "step": 3050 }, { "epoch": 0.5570726379027854, "grad_norm": 5.973785877227783, "learning_rate": 4.8002823333107094e-05, "loss": 2.5089, "step": 3060 }, { "epoch": 0.5588931367194612, "grad_norm": 5.9094038009643555, "learning_rate": 4.768851630533858e-05, "loss": 2.5641, "step": 3070 }, { "epoch": 0.560713635536137, "grad_norm": 6.263980865478516, "learning_rate": 4.737430078703473e-05, "loss": 2.5573, "step": 3080 }, { "epoch": 0.5625341343528126, "grad_norm": 5.653293609619141, "learning_rate": 4.706018921769229e-05, "loss": 2.5162, "step": 3090 }, { "epoch": 0.5643546331694884, "grad_norm": 5.543784141540527, "learning_rate": 4.674619403269275e-05, "loss": 2.5121, "step": 3100 }, { "epoch": 0.5661751319861642, "grad_norm": 5.531651020050049, "learning_rate": 4.643232766281003e-05, "loss": 2.5214, "step": 3110 }, { "epoch": 0.56799563080284, "grad_norm": 6.792632579803467, "learning_rate": 4.6118602533718457e-05, "loss": 2.5064, "step": 3120 }, { "epoch": 0.5698161296195158, "grad_norm": 5.586999416351318, "learning_rate": 4.580503106550069e-05, "loss": 2.5475, "step": 3130 }, { "epoch": 0.5716366284361916, "grad_norm": 5.922962188720703, "learning_rate": 4.549162567215612e-05, "loss": 2.4942, "step": 3140 }, { "epoch": 0.5734571272528672, "grad_norm": 6.408661842346191, "learning_rate": 4.517839876110942e-05, "loss": 2.5143, "step": 3150 }, { "epoch": 0.575277626069543, "grad_norm": 6.490014553070068, "learning_rate": 4.4865362732719266e-05, "loss": 2.524, "step": 3160 }, { "epoch": 0.5770981248862188, "grad_norm": 7.0241217613220215, "learning_rate": 4.45525299797875e-05, "loss": 2.5019, "step": 3170 }, { "epoch": 0.5789186237028946, "grad_norm": 5.701231002807617, "learning_rate": 4.423991288706851e-05, "loss": 2.5243, "step": 3180 }, { "epoch": 0.5807391225195704, "grad_norm": 5.697451591491699, "learning_rate": 4.392752383077883e-05, "loss": 2.5121, "step": 3190 }, { "epoch": 0.5825596213362462, "grad_norm": 6.233221530914307, "learning_rate": 4.3615375178107306e-05, "loss": 2.5233, "step": 3200 }, { "epoch": 0.584380120152922, "grad_norm": 5.627314567565918, "learning_rate": 4.330347928672538e-05, "loss": 2.4944, "step": 3210 }, { "epoch": 0.5862006189695976, "grad_norm": 5.6860737800598145, "learning_rate": 4.299184850429795e-05, "loss": 2.5099, "step": 3220 }, { "epoch": 0.5880211177862734, "grad_norm": 6.200436592102051, "learning_rate": 4.26804951679945e-05, "loss": 2.5251, "step": 3230 }, { "epoch": 0.5898416166029492, "grad_norm": 6.162016868591309, "learning_rate": 4.2369431604000654e-05, "loss": 2.5156, "step": 3240 }, { "epoch": 0.591662115419625, "grad_norm": 5.810561656951904, "learning_rate": 4.205867012703025e-05, "loss": 2.5343, "step": 3250 }, { "epoch": 0.5934826142363008, "grad_norm": 5.519481182098389, "learning_rate": 4.174822303983779e-05, "loss": 2.4742, "step": 3260 }, { "epoch": 0.5953031130529766, "grad_norm": 5.590371131896973, "learning_rate": 4.1438102632731346e-05, "loss": 2.5091, "step": 3270 }, { "epoch": 0.5971236118696522, "grad_norm": 5.302878379821777, "learning_rate": 4.1128321183086065e-05, "loss": 2.5024, "step": 3280 }, { "epoch": 0.598944110686328, "grad_norm": 5.2954864501953125, "learning_rate": 4.081889095485806e-05, "loss": 2.5304, "step": 3290 }, { "epoch": 0.6007646095030038, "grad_norm": 6.41418981552124, "learning_rate": 4.050982419809895e-05, "loss": 2.482, "step": 3300 }, { "epoch": 0.6025851083196796, "grad_norm": 6.165164947509766, "learning_rate": 4.020113314847082e-05, "loss": 2.5042, "step": 3310 }, { "epoch": 0.6044056071363554, "grad_norm": 5.556238651275635, "learning_rate": 3.989283002676193e-05, "loss": 2.4748, "step": 3320 }, { "epoch": 0.6062261059530312, "grad_norm": 5.898430824279785, "learning_rate": 3.958492703840276e-05, "loss": 2.5103, "step": 3330 }, { "epoch": 0.6080466047697068, "grad_norm": 6.131360054016113, "learning_rate": 3.9277436372982945e-05, "loss": 2.5023, "step": 3340 }, { "epoch": 0.6098671035863826, "grad_norm": 6.646694660186768, "learning_rate": 3.8970370203768595e-05, "loss": 2.5278, "step": 3350 }, { "epoch": 0.6116876024030584, "grad_norm": 5.293123245239258, "learning_rate": 3.8663740687220466e-05, "loss": 2.5339, "step": 3360 }, { "epoch": 0.6135081012197342, "grad_norm": 5.64793586730957, "learning_rate": 3.835755996251261e-05, "loss": 2.4955, "step": 3370 }, { "epoch": 0.61532860003641, "grad_norm": 5.227383613586426, "learning_rate": 3.805184015105182e-05, "loss": 2.4942, "step": 3380 }, { "epoch": 0.6171490988530858, "grad_norm": 6.251855850219727, "learning_rate": 3.774659335599783e-05, "loss": 2.4941, "step": 3390 }, { "epoch": 0.6189695976697616, "grad_norm": 6.169816970825195, "learning_rate": 3.744183166178405e-05, "loss": 2.5091, "step": 3400 }, { "epoch": 0.6207900964864372, "grad_norm": 5.742128372192383, "learning_rate": 3.71375671336392e-05, "loss": 2.4775, "step": 3410 }, { "epoch": 0.622610595303113, "grad_norm": 5.501997470855713, "learning_rate": 3.683381181710969e-05, "loss": 2.5165, "step": 3420 }, { "epoch": 0.6244310941197888, "grad_norm": 5.634415149688721, "learning_rate": 3.653057773758268e-05, "loss": 2.5137, "step": 3430 }, { "epoch": 0.6262515929364646, "grad_norm": 6.159447193145752, "learning_rate": 3.622787689981009e-05, "loss": 2.5118, "step": 3440 }, { "epoch": 0.6280720917531404, "grad_norm": 5.9222307205200195, "learning_rate": 3.5925721287433304e-05, "loss": 2.5158, "step": 3450 }, { "epoch": 0.6298925905698162, "grad_norm": 5.72164249420166, "learning_rate": 3.5624122862508724e-05, "loss": 2.4573, "step": 3460 }, { "epoch": 0.6317130893864918, "grad_norm": 5.604609489440918, "learning_rate": 3.5323093565034213e-05, "loss": 2.4427, "step": 3470 }, { "epoch": 0.6335335882031676, "grad_norm": 6.2562336921691895, "learning_rate": 3.502264531247644e-05, "loss": 2.5276, "step": 3480 }, { "epoch": 0.6353540870198434, "grad_norm": 5.994789123535156, "learning_rate": 3.4722789999299034e-05, "loss": 2.5015, "step": 3490 }, { "epoch": 0.6371745858365192, "grad_norm": 7.051544189453125, "learning_rate": 3.442353949649173e-05, "loss": 2.4955, "step": 3500 }, { "epoch": 0.638995084653195, "grad_norm": 5.279881000518799, "learning_rate": 3.412490565110034e-05, "loss": 2.5066, "step": 3510 }, { "epoch": 0.6408155834698708, "grad_norm": 5.663167476654053, "learning_rate": 3.382690028575789e-05, "loss": 2.4843, "step": 3520 }, { "epoch": 0.6426360822865466, "grad_norm": 6.197967052459717, "learning_rate": 3.352953519821637e-05, "loss": 2.4439, "step": 3530 }, { "epoch": 0.6444565811032222, "grad_norm": 6.703118324279785, "learning_rate": 3.3232822160879825e-05, "loss": 2.4902, "step": 3540 }, { "epoch": 0.646277079919898, "grad_norm": 6.075878143310547, "learning_rate": 3.2936772920338244e-05, "loss": 2.4375, "step": 3550 }, { "epoch": 0.6480975787365738, "grad_norm": 5.913654327392578, "learning_rate": 3.2641399196902505e-05, "loss": 2.4575, "step": 3560 }, { "epoch": 0.6499180775532496, "grad_norm": 5.623748779296875, "learning_rate": 3.234671268414041e-05, "loss": 2.5078, "step": 3570 }, { "epoch": 0.6517385763699254, "grad_norm": 4.9896745681762695, "learning_rate": 3.2052725048413734e-05, "loss": 2.461, "step": 3580 }, { "epoch": 0.6535590751866012, "grad_norm": 5.503573417663574, "learning_rate": 3.175944792841639e-05, "loss": 2.4964, "step": 3590 }, { "epoch": 0.6553795740032768, "grad_norm": 5.610907077789307, "learning_rate": 3.146689293471362e-05, "loss": 2.496, "step": 3600 }, { "epoch": 0.6572000728199526, "grad_norm": 5.222755432128906, "learning_rate": 3.117507164928235e-05, "loss": 2.4587, "step": 3610 }, { "epoch": 0.6590205716366284, "grad_norm": 5.29688024520874, "learning_rate": 3.0883995625052735e-05, "loss": 2.4976, "step": 3620 }, { "epoch": 0.6608410704533042, "grad_norm": 5.991846084594727, "learning_rate": 3.059367638545069e-05, "loss": 2.4668, "step": 3630 }, { "epoch": 0.66266156926998, "grad_norm": 5.622383117675781, "learning_rate": 3.030412542394176e-05, "loss": 2.4624, "step": 3640 }, { "epoch": 0.6644820680866558, "grad_norm": 5.628267288208008, "learning_rate": 3.001535420357607e-05, "loss": 2.4497, "step": 3650 }, { "epoch": 0.6663025669033316, "grad_norm": 5.707132339477539, "learning_rate": 2.972737415653456e-05, "loss": 2.4921, "step": 3660 }, { "epoch": 0.6681230657200072, "grad_norm": 6.638173580169678, "learning_rate": 2.9440196683676337e-05, "loss": 2.4978, "step": 3670 }, { "epoch": 0.669943564536683, "grad_norm": 6.14837121963501, "learning_rate": 2.915383315408736e-05, "loss": 2.4597, "step": 3680 }, { "epoch": 0.6717640633533588, "grad_norm": 5.452149391174316, "learning_rate": 2.8868294904630333e-05, "loss": 2.4789, "step": 3690 }, { "epoch": 0.6735845621700346, "grad_norm": 5.278966903686523, "learning_rate": 2.8583593239495875e-05, "loss": 2.4435, "step": 3700 }, { "epoch": 0.6754050609867104, "grad_norm": 5.96103048324585, "learning_rate": 2.8299739429755057e-05, "loss": 2.4103, "step": 3710 }, { "epoch": 0.6772255598033862, "grad_norm": 5.705805778503418, "learning_rate": 2.8016744712913164e-05, "loss": 2.494, "step": 3720 }, { "epoch": 0.6790460586200618, "grad_norm": 6.212954044342041, "learning_rate": 2.773462029246475e-05, "loss": 2.4419, "step": 3730 }, { "epoch": 0.6808665574367376, "grad_norm": 5.658579349517822, "learning_rate": 2.7453377337450182e-05, "loss": 2.4648, "step": 3740 }, { "epoch": 0.6826870562534134, "grad_norm": 6.050464630126953, "learning_rate": 2.7173026982013417e-05, "loss": 2.4964, "step": 3750 }, { "epoch": 0.6845075550700892, "grad_norm": 5.388469219207764, "learning_rate": 2.689358032496129e-05, "loss": 2.4463, "step": 3760 }, { "epoch": 0.686328053886765, "grad_norm": 5.987942695617676, "learning_rate": 2.661504842932402e-05, "loss": 2.4777, "step": 3770 }, { "epoch": 0.6881485527034408, "grad_norm": 20.632400512695312, "learning_rate": 2.633744232191726e-05, "loss": 2.4645, "step": 3780 }, { "epoch": 0.6899690515201166, "grad_norm": 5.323004245758057, "learning_rate": 2.6060772992905647e-05, "loss": 2.4832, "step": 3790 }, { "epoch": 0.6917895503367922, "grad_norm": 5.565816402435303, "learning_rate": 2.578505139536762e-05, "loss": 2.4677, "step": 3800 }, { "epoch": 0.693610049153468, "grad_norm": 6.668233394622803, "learning_rate": 2.5510288444861784e-05, "loss": 2.4372, "step": 3810 }, { "epoch": 0.6954305479701438, "grad_norm": 6.057459354400635, "learning_rate": 2.5236495018994844e-05, "loss": 2.4625, "step": 3820 }, { "epoch": 0.6972510467868196, "grad_norm": 5.716314315795898, "learning_rate": 2.4963681956990896e-05, "loss": 2.4582, "step": 3830 }, { "epoch": 0.6990715456034954, "grad_norm": 5.098658561706543, "learning_rate": 2.4691860059262427e-05, "loss": 2.449, "step": 3840 }, { "epoch": 0.7008920444201712, "grad_norm": 6.102919578552246, "learning_rate": 2.4421040086982587e-05, "loss": 2.4678, "step": 3850 }, { "epoch": 0.7027125432368468, "grad_norm": 6.12929630279541, "learning_rate": 2.4151232761659305e-05, "loss": 2.4162, "step": 3860 }, { "epoch": 0.7045330420535226, "grad_norm": 5.601887226104736, "learning_rate": 2.3882448764710698e-05, "loss": 2.4455, "step": 3870 }, { "epoch": 0.7063535408701984, "grad_norm": 5.384401798248291, "learning_rate": 2.3614698737042355e-05, "loss": 2.4618, "step": 3880 }, { "epoch": 0.7081740396868742, "grad_norm": 5.636379718780518, "learning_rate": 2.3347993278625933e-05, "loss": 2.43, "step": 3890 }, { "epoch": 0.70999453850355, "grad_norm": 6.716041088104248, "learning_rate": 2.3082342948079606e-05, "loss": 2.4177, "step": 3900 }, { "epoch": 0.7118150373202258, "grad_norm": 4.779394149780273, "learning_rate": 2.2817758262249988e-05, "loss": 2.4502, "step": 3910 }, { "epoch": 0.7136355361369016, "grad_norm": 5.225165843963623, "learning_rate": 2.2554249695795878e-05, "loss": 2.4954, "step": 3920 }, { "epoch": 0.7154560349535772, "grad_norm": 5.962668418884277, "learning_rate": 2.2291827680773508e-05, "loss": 2.4367, "step": 3930 }, { "epoch": 0.717276533770253, "grad_norm": 5.683213233947754, "learning_rate": 2.2030502606223534e-05, "loss": 2.4969, "step": 3940 }, { "epoch": 0.7190970325869288, "grad_norm": 5.069901466369629, "learning_rate": 2.1770284817759767e-05, "loss": 2.4462, "step": 3950 }, { "epoch": 0.7209175314036046, "grad_norm": 5.872584819793701, "learning_rate": 2.1511184617159652e-05, "loss": 2.4353, "step": 3960 }, { "epoch": 0.7227380302202804, "grad_norm": 5.787328720092773, "learning_rate": 2.125321226195634e-05, "loss": 2.4174, "step": 3970 }, { "epoch": 0.7245585290369562, "grad_norm": 8.6191987991333, "learning_rate": 2.0996377965032638e-05, "loss": 2.4482, "step": 3980 }, { "epoch": 0.7263790278536318, "grad_norm": 5.412010192871094, "learning_rate": 2.074069189421673e-05, "loss": 2.4207, "step": 3990 }, { "epoch": 0.7281995266703076, "grad_norm": 5.450544834136963, "learning_rate": 2.0486164171879613e-05, "loss": 2.4624, "step": 4000 }, { "epoch": 0.7281995266703076, "eval_loss": 2.449294328689575, "eval_runtime": 1012.1688, "eval_samples_per_second": 9.649, "eval_steps_per_second": 1.206, "step": 4000 }, { "epoch": 0.7300200254869834, "grad_norm": 5.740320205688477, "learning_rate": 2.0232804874534313e-05, "loss": 2.4582, "step": 4010 }, { "epoch": 0.7318405243036592, "grad_norm": 5.987521171569824, "learning_rate": 1.998062403243704e-05, "loss": 2.4499, "step": 4020 }, { "epoch": 0.733661023120335, "grad_norm": 5.337474346160889, "learning_rate": 1.9729631629190042e-05, "loss": 2.4598, "step": 4030 }, { "epoch": 0.7354815219370108, "grad_norm": 5.655992031097412, "learning_rate": 1.9479837601346457e-05, "loss": 2.4601, "step": 4040 }, { "epoch": 0.7373020207536866, "grad_norm": 5.7331414222717285, "learning_rate": 1.923125183801678e-05, "loss": 2.4445, "step": 4050 }, { "epoch": 0.7391225195703622, "grad_norm": 5.471503257751465, "learning_rate": 1.898388418047753e-05, "loss": 2.4683, "step": 4060 }, { "epoch": 0.740943018387038, "grad_norm": 5.409184455871582, "learning_rate": 1.87377444217815e-05, "loss": 2.4358, "step": 4070 }, { "epoch": 0.7427635172037138, "grad_norm": 5.579779148101807, "learning_rate": 1.8492842306370182e-05, "loss": 2.4989, "step": 4080 }, { "epoch": 0.7445840160203896, "grad_norm": 5.366626262664795, "learning_rate": 1.8249187529687895e-05, "loss": 2.4102, "step": 4090 }, { "epoch": 0.7464045148370654, "grad_norm": 5.263418674468994, "learning_rate": 1.8006789737797984e-05, "loss": 2.4573, "step": 4100 }, { "epoch": 0.7482250136537412, "grad_norm": 5.129177570343018, "learning_rate": 1.7765658527000966e-05, "loss": 2.4792, "step": 4110 }, { "epoch": 0.7500455124704168, "grad_norm": 6.237401962280273, "learning_rate": 1.7525803443454615e-05, "loss": 2.479, "step": 4120 }, { "epoch": 0.7518660112870926, "grad_norm": 6.163425445556641, "learning_rate": 1.728723398279603e-05, "loss": 2.4222, "step": 4130 }, { "epoch": 0.7536865101037684, "grad_norm": 5.254932403564453, "learning_rate": 1.7049959589765686e-05, "loss": 2.4307, "step": 4140 }, { "epoch": 0.7555070089204442, "grad_norm": 6.144068717956543, "learning_rate": 1.6813989657833534e-05, "loss": 2.4923, "step": 4150 }, { "epoch": 0.75732750773712, "grad_norm": 5.038397789001465, "learning_rate": 1.6579333528827205e-05, "loss": 2.4732, "step": 4160 }, { "epoch": 0.7591480065537958, "grad_norm": 5.2848076820373535, "learning_rate": 1.634600049256204e-05, "loss": 2.4651, "step": 4170 }, { "epoch": 0.7609685053704716, "grad_norm": 5.274468898773193, "learning_rate": 1.611399978647342e-05, "loss": 2.4407, "step": 4180 }, { "epoch": 0.7627890041871472, "grad_norm": 5.039272308349609, "learning_rate": 1.588334059525099e-05, "loss": 2.4892, "step": 4190 }, { "epoch": 0.764609503003823, "grad_norm": 5.7419867515563965, "learning_rate": 1.5654032050475138e-05, "loss": 2.4456, "step": 4200 }, { "epoch": 0.7664300018204988, "grad_norm": 5.30146598815918, "learning_rate": 1.5426083230255405e-05, "loss": 2.4515, "step": 4210 }, { "epoch": 0.7682505006371746, "grad_norm": 4.977199077606201, "learning_rate": 1.5199503158871115e-05, "loss": 2.4317, "step": 4220 }, { "epoch": 0.7700709994538504, "grad_norm": 5.318095684051514, "learning_rate": 1.4974300806414082e-05, "loss": 2.403, "step": 4230 }, { "epoch": 0.7718914982705262, "grad_norm": 5.638497352600098, "learning_rate": 1.4750485088433592e-05, "loss": 2.4327, "step": 4240 }, { "epoch": 0.7737119970872018, "grad_norm": 5.739340305328369, "learning_rate": 1.4528064865583301e-05, "loss": 2.4266, "step": 4250 }, { "epoch": 0.7755324959038776, "grad_norm": 4.749205112457275, "learning_rate": 1.4307048943270606e-05, "loss": 2.4136, "step": 4260 }, { "epoch": 0.7773529947205534, "grad_norm": 5.616302490234375, "learning_rate": 1.4087446071307903e-05, "loss": 2.4197, "step": 4270 }, { "epoch": 0.7791734935372292, "grad_norm": 5.402510643005371, "learning_rate": 1.3869264943566263e-05, "loss": 2.4194, "step": 4280 }, { "epoch": 0.780993992353905, "grad_norm": 5.278769493103027, "learning_rate": 1.3652514197631277e-05, "loss": 2.4351, "step": 4290 }, { "epoch": 0.7828144911705808, "grad_norm": 6.828596115112305, "learning_rate": 1.343720241446103e-05, "loss": 2.3813, "step": 4300 }, { "epoch": 0.7846349899872566, "grad_norm": 5.306332588195801, "learning_rate": 1.322333811804643e-05, "loss": 2.4133, "step": 4310 }, { "epoch": 0.7864554888039322, "grad_norm": 5.437227249145508, "learning_rate": 1.3010929775073765e-05, "loss": 2.4166, "step": 4320 }, { "epoch": 0.788275987620608, "grad_norm": 5.493254661560059, "learning_rate": 1.2799985794589497e-05, "loss": 2.3842, "step": 4330 }, { "epoch": 0.7900964864372838, "grad_norm": 5.259057521820068, "learning_rate": 1.2590514527667336e-05, "loss": 2.3783, "step": 4340 }, { "epoch": 0.7919169852539596, "grad_norm": 5.750987529754639, "learning_rate": 1.2382524267077645e-05, "loss": 2.4202, "step": 4350 }, { "epoch": 0.7937374840706354, "grad_norm": 4.952456951141357, "learning_rate": 1.2176023246959133e-05, "loss": 2.4393, "step": 4360 }, { "epoch": 0.7955579828873112, "grad_norm": 5.3008713722229, "learning_rate": 1.1971019642492942e-05, "loss": 2.375, "step": 4370 }, { "epoch": 0.7973784817039868, "grad_norm": 4.872366428375244, "learning_rate": 1.176752156957886e-05, "loss": 2.4257, "step": 4380 }, { "epoch": 0.7991989805206626, "grad_norm": 5.488797664642334, "learning_rate": 1.1565537084514123e-05, "loss": 2.4424, "step": 4390 }, { "epoch": 0.8010194793373384, "grad_norm": 5.145867824554443, "learning_rate": 1.1365074183674468e-05, "loss": 2.4806, "step": 4400 }, { "epoch": 0.8028399781540142, "grad_norm": 5.343238353729248, "learning_rate": 1.116614080319754e-05, "loss": 2.4321, "step": 4410 }, { "epoch": 0.80466047697069, "grad_norm": 5.240965366363525, "learning_rate": 1.0968744818668691e-05, "loss": 2.4358, "step": 4420 }, { "epoch": 0.8064809757873658, "grad_norm": 5.5220513343811035, "learning_rate": 1.0772894044809229e-05, "loss": 2.442, "step": 4430 }, { "epoch": 0.8083014746040416, "grad_norm": 4.8629045486450195, "learning_rate": 1.0578596235166998e-05, "loss": 2.4567, "step": 4440 }, { "epoch": 0.8101219734207172, "grad_norm": 5.297680854797363, "learning_rate": 1.0385859081809508e-05, "loss": 2.4544, "step": 4450 }, { "epoch": 0.811942472237393, "grad_norm": 5.134615898132324, "learning_rate": 1.0194690215019292e-05, "loss": 2.4656, "step": 4460 }, { "epoch": 0.8137629710540688, "grad_norm": 5.012113571166992, "learning_rate": 1.0005097202991948e-05, "loss": 2.382, "step": 4470 }, { "epoch": 0.8155834698707446, "grad_norm": 5.369142532348633, "learning_rate": 9.817087551536414e-06, "loss": 2.4584, "step": 4480 }, { "epoch": 0.8174039686874204, "grad_norm": 5.545107841491699, "learning_rate": 9.630668703777922e-06, "loss": 2.4013, "step": 4490 }, { "epoch": 0.8192244675040962, "grad_norm": 4.933434963226318, "learning_rate": 9.445848039863252e-06, "loss": 2.4516, "step": 4500 }, { "epoch": 0.8210449663207718, "grad_norm": 4.916785717010498, "learning_rate": 9.262632876668591e-06, "loss": 2.4555, "step": 4510 }, { "epoch": 0.8228654651374476, "grad_norm": 5.11759090423584, "learning_rate": 9.08103046750986e-06, "loss": 2.447, "step": 4520 }, { "epoch": 0.8246859639541234, "grad_norm": 5.081522464752197, "learning_rate": 8.901048001855583e-06, "loss": 2.4004, "step": 4530 }, { "epoch": 0.8265064627707992, "grad_norm": 5.203310489654541, "learning_rate": 8.722692605042248e-06, "loss": 2.4237, "step": 4540 }, { "epoch": 0.828326961587475, "grad_norm": 5.090500831604004, "learning_rate": 8.545971337992197e-06, "loss": 2.4342, "step": 4550 }, { "epoch": 0.8301474604041508, "grad_norm": 5.330081462860107, "learning_rate": 8.37089119693411e-06, "loss": 2.3922, "step": 4560 }, { "epoch": 0.8319679592208266, "grad_norm": 5.339773178100586, "learning_rate": 8.197459113126067e-06, "loss": 2.4342, "step": 4570 }, { "epoch": 0.8337884580375022, "grad_norm": 5.109127044677734, "learning_rate": 8.02568195258107e-06, "loss": 2.4207, "step": 4580 }, { "epoch": 0.835608956854178, "grad_norm": 5.223607540130615, "learning_rate": 7.855566515795282e-06, "loss": 2.383, "step": 4590 }, { "epoch": 0.8374294556708538, "grad_norm": 5.024397373199463, "learning_rate": 7.687119537478799e-06, "loss": 2.4197, "step": 4600 }, { "epoch": 0.8392499544875296, "grad_norm": 5.112728595733643, "learning_rate": 7.52034768628902e-06, "loss": 2.4399, "step": 4610 }, { "epoch": 0.8410704533042054, "grad_norm": 5.149270057678223, "learning_rate": 7.3552575645666036e-06, "loss": 2.417, "step": 4620 }, { "epoch": 0.8428909521208812, "grad_norm": 5.0890350341796875, "learning_rate": 7.191855708074152e-06, "loss": 2.4217, "step": 4630 }, { "epoch": 0.8447114509375568, "grad_norm": 5.196211338043213, "learning_rate": 7.030148585737406e-06, "loss": 2.4351, "step": 4640 }, { "epoch": 0.8465319497542326, "grad_norm": 5.102319717407227, "learning_rate": 6.870142599389217e-06, "loss": 2.4397, "step": 4650 }, { "epoch": 0.8483524485709084, "grad_norm": 5.2318220138549805, "learning_rate": 6.711844083516022e-06, "loss": 2.3708, "step": 4660 }, { "epoch": 0.8501729473875842, "grad_norm": 5.27686071395874, "learning_rate": 6.555259305007139e-06, "loss": 2.4018, "step": 4670 }, { "epoch": 0.85199344620426, "grad_norm": 5.138775825500488, "learning_rate": 6.400394462906612e-06, "loss": 2.4244, "step": 4680 }, { "epoch": 0.8538139450209358, "grad_norm": 4.929832935333252, "learning_rate": 6.247255688167852e-06, "loss": 2.3637, "step": 4690 }, { "epoch": 0.8556344438376116, "grad_norm": 5.328685283660889, "learning_rate": 6.09584904341085e-06, "loss": 2.4037, "step": 4700 }, { "epoch": 0.8574549426542872, "grad_norm": 4.969110012054443, "learning_rate": 5.946180522682227e-06, "loss": 2.4054, "step": 4710 }, { "epoch": 0.859275441470963, "grad_norm": 4.729069232940674, "learning_rate": 5.798256051217882e-06, "loss": 2.419, "step": 4720 }, { "epoch": 0.8610959402876388, "grad_norm": 4.847239971160889, "learning_rate": 5.652081485208482e-06, "loss": 2.4067, "step": 4730 }, { "epoch": 0.8629164391043146, "grad_norm": 4.862872123718262, "learning_rate": 5.507662611567565e-06, "loss": 2.4237, "step": 4740 }, { "epoch": 0.8647369379209904, "grad_norm": 4.765954971313477, "learning_rate": 5.365005147702462e-06, "loss": 2.3841, "step": 4750 }, { "epoch": 0.8665574367376662, "grad_norm": 5.191616535186768, "learning_rate": 5.224114741287922e-06, "loss": 2.4473, "step": 4760 }, { "epoch": 0.8683779355543418, "grad_norm": 5.6387619972229, "learning_rate": 5.084996970042599e-06, "loss": 2.3946, "step": 4770 }, { "epoch": 0.8701984343710176, "grad_norm": 4.992214202880859, "learning_rate": 4.947657341508166e-06, "loss": 2.4029, "step": 4780 }, { "epoch": 0.8720189331876934, "grad_norm": 4.994503021240234, "learning_rate": 4.812101292831283e-06, "loss": 2.4212, "step": 4790 }, { "epoch": 0.8738394320043692, "grad_norm": 5.3045220375061035, "learning_rate": 4.678334190548378e-06, "loss": 2.4025, "step": 4800 }, { "epoch": 0.875659930821045, "grad_norm": 5.20510196685791, "learning_rate": 4.546361330373178e-06, "loss": 2.4087, "step": 4810 }, { "epoch": 0.8774804296377208, "grad_norm": 5.151695728302002, "learning_rate": 4.41618793698706e-06, "loss": 2.3748, "step": 4820 }, { "epoch": 0.8793009284543966, "grad_norm": 5.292723178863525, "learning_rate": 4.287819163832179e-06, "loss": 2.4068, "step": 4830 }, { "epoch": 0.8811214272710722, "grad_norm": 5.057366847991943, "learning_rate": 4.161260092907476e-06, "loss": 2.4191, "step": 4840 }, { "epoch": 0.882941926087748, "grad_norm": 5.375776290893555, "learning_rate": 4.0365157345675255e-06, "loss": 2.4085, "step": 4850 }, { "epoch": 0.8847624249044238, "grad_norm": 5.110659599304199, "learning_rate": 3.91359102732407e-06, "loss": 2.3547, "step": 4860 }, { "epoch": 0.8865829237210996, "grad_norm": 5.154110908508301, "learning_rate": 3.792490837650642e-06, "loss": 2.3731, "step": 4870 }, { "epoch": 0.8884034225377754, "grad_norm": 4.55220365524292, "learning_rate": 3.673219959789803e-06, "loss": 2.3956, "step": 4880 }, { "epoch": 0.8902239213544512, "grad_norm": 5.095583915710449, "learning_rate": 3.5557831155633715e-06, "loss": 2.4077, "step": 4890 }, { "epoch": 0.8920444201711268, "grad_norm": 5.244335174560547, "learning_rate": 3.4401849541855493e-06, "loss": 2.4164, "step": 4900 }, { "epoch": 0.8938649189878026, "grad_norm": 5.595185279846191, "learning_rate": 3.3264300520787607e-06, "loss": 2.4055, "step": 4910 }, { "epoch": 0.8956854178044784, "grad_norm": 5.167870044708252, "learning_rate": 3.214522912692547e-06, "loss": 2.4064, "step": 4920 }, { "epoch": 0.8975059166211542, "grad_norm": 4.97356653213501, "learning_rate": 3.1044679663252807e-06, "loss": 2.4249, "step": 4930 }, { "epoch": 0.89932641543783, "grad_norm": 5.116016387939453, "learning_rate": 2.996269569948745e-06, "loss": 2.3617, "step": 4940 }, { "epoch": 0.9011469142545058, "grad_norm": 5.192502498626709, "learning_rate": 2.889932007035645e-06, "loss": 2.3889, "step": 4950 }, { "epoch": 0.9029674130711816, "grad_norm": 4.694880485534668, "learning_rate": 2.7854594873900463e-06, "loss": 2.4309, "step": 4960 }, { "epoch": 0.9047879118878572, "grad_norm": 4.840787887573242, "learning_rate": 2.6828561469807e-06, "loss": 2.4412, "step": 4970 }, { "epoch": 0.906608410704533, "grad_norm": 5.104063987731934, "learning_rate": 2.582126047777328e-06, "loss": 2.4295, "step": 4980 }, { "epoch": 0.9084289095212088, "grad_norm": 4.761752605438232, "learning_rate": 2.4832731775897844e-06, "loss": 2.4125, "step": 4990 }, { "epoch": 0.9102494083378846, "grad_norm": 4.580504894256592, "learning_rate": 2.3863014499101775e-06, "loss": 2.4762, "step": 5000 }, { "epoch": 0.9102494083378846, "eval_loss": 2.4089949131011963, "eval_runtime": 1012.4809, "eval_samples_per_second": 9.646, "eval_steps_per_second": 1.206, "step": 5000 }, { "epoch": 0.9120699071545604, "grad_norm": 4.849244117736816, "learning_rate": 2.291214703757982e-06, "loss": 2.3958, "step": 5010 }, { "epoch": 0.9138904059712362, "grad_norm": 4.8128204345703125, "learning_rate": 2.1980167035280163e-06, "loss": 2.4288, "step": 5020 }, { "epoch": 0.9157109047879118, "grad_norm": 5.573403835296631, "learning_rate": 2.1067111388414163e-06, "loss": 2.4134, "step": 5030 }, { "epoch": 0.9175314036045876, "grad_norm": 4.9487504959106445, "learning_rate": 2.0173016243995866e-06, "loss": 2.4095, "step": 5040 }, { "epoch": 0.9193519024212634, "grad_norm": 4.933927536010742, "learning_rate": 1.929791699841066e-06, "loss": 2.4014, "step": 5050 }, { "epoch": 0.9211724012379392, "grad_norm": 5.116062641143799, "learning_rate": 1.844184829601453e-06, "loss": 2.4196, "step": 5060 }, { "epoch": 0.922992900054615, "grad_norm": 4.888516902923584, "learning_rate": 1.7604844027761802e-06, "loss": 2.4418, "step": 5070 }, { "epoch": 0.9248133988712908, "grad_norm": 4.990447998046875, "learning_rate": 1.6786937329864027e-06, "loss": 2.4049, "step": 5080 }, { "epoch": 0.9266338976879666, "grad_norm": 4.672518253326416, "learning_rate": 1.5988160582477818e-06, "loss": 2.3873, "step": 5090 }, { "epoch": 0.9284543965046422, "grad_norm": 5.029353618621826, "learning_rate": 1.5208545408423092e-06, "loss": 2.4754, "step": 5100 }, { "epoch": 0.930274895321318, "grad_norm": 4.660059928894043, "learning_rate": 1.444812267193102e-06, "loss": 2.4081, "step": 5110 }, { "epoch": 0.9320953941379938, "grad_norm": 5.001034259796143, "learning_rate": 1.3706922477422336e-06, "loss": 2.4014, "step": 5120 }, { "epoch": 0.9339158929546696, "grad_norm": 5.1275858879089355, "learning_rate": 1.2984974168315234e-06, "loss": 2.4251, "step": 5130 }, { "epoch": 0.9357363917713454, "grad_norm": 4.893324375152588, "learning_rate": 1.2282306325864135e-06, "loss": 2.4196, "step": 5140 }, { "epoch": 0.9375568905880212, "grad_norm": 4.734968662261963, "learning_rate": 1.1598946768027863e-06, "loss": 2.401, "step": 5150 }, { "epoch": 0.9393773894046968, "grad_norm": 4.66255521774292, "learning_rate": 1.0934922548368254e-06, "loss": 2.3846, "step": 5160 }, { "epoch": 0.9411978882213726, "grad_norm": 4.771427631378174, "learning_rate": 1.0290259954979397e-06, "loss": 2.3953, "step": 5170 }, { "epoch": 0.9430183870380484, "grad_norm": 4.673166275024414, "learning_rate": 9.664984509446917e-07, "loss": 2.3694, "step": 5180 }, { "epoch": 0.9448388858547242, "grad_norm": 4.778134346008301, "learning_rate": 9.059120965837331e-07, "loss": 2.3948, "step": 5190 }, { "epoch": 0.9466593846714, "grad_norm": 4.706231594085693, "learning_rate": 8.472693309718283e-07, "loss": 2.4153, "step": 5200 }, { "epoch": 0.9484798834880758, "grad_norm": 4.645259380340576, "learning_rate": 7.905724757208965e-07, "loss": 2.3806, "step": 5210 }, { "epoch": 0.9503003823047516, "grad_norm": 5.04796838760376, "learning_rate": 7.358237754060915e-07, "loss": 2.454, "step": 5220 }, { "epoch": 0.9521208811214272, "grad_norm": 4.7881646156311035, "learning_rate": 6.830253974769496e-07, "loss": 2.4161, "step": 5230 }, { "epoch": 0.953941379938103, "grad_norm": 4.7254743576049805, "learning_rate": 6.321794321715757e-07, "loss": 2.4715, "step": 5240 }, { "epoch": 0.9557618787547788, "grad_norm": 5.13754415512085, "learning_rate": 5.832878924338869e-07, "loss": 2.4191, "step": 5250 }, { "epoch": 0.9575823775714546, "grad_norm": 4.781599998474121, "learning_rate": 5.363527138339597e-07, "loss": 2.4127, "step": 5260 }, { "epoch": 0.9594028763881304, "grad_norm": 4.541421413421631, "learning_rate": 4.913757544913355e-07, "loss": 2.3908, "step": 5270 }, { "epoch": 0.9612233752048062, "grad_norm": 5.078845500946045, "learning_rate": 4.4835879500153556e-07, "loss": 2.4303, "step": 5280 }, { "epoch": 0.9630438740214818, "grad_norm": 4.745322227478027, "learning_rate": 4.0730353836549993e-07, "loss": 2.4046, "step": 5290 }, { "epoch": 0.9648643728381576, "grad_norm": 4.688536643981934, "learning_rate": 3.6821160992221993e-07, "loss": 2.4456, "step": 5300 }, { "epoch": 0.9666848716548334, "grad_norm": 4.9088592529296875, "learning_rate": 3.310845572843557e-07, "loss": 2.3846, "step": 5310 }, { "epoch": 0.9685053704715092, "grad_norm": 5.126766681671143, "learning_rate": 2.959238502769912e-07, "loss": 2.4093, "step": 5320 }, { "epoch": 0.970325869288185, "grad_norm": 4.49152946472168, "learning_rate": 2.6273088087943597e-07, "loss": 2.3837, "step": 5330 }, { "epoch": 0.9721463681048608, "grad_norm": 4.944559097290039, "learning_rate": 2.315069631701139e-07, "loss": 2.3791, "step": 5340 }, { "epoch": 0.9739668669215366, "grad_norm": 4.91040563583374, "learning_rate": 2.022533332745602e-07, "loss": 2.4035, "step": 5350 }, { "epoch": 0.9757873657382122, "grad_norm": 4.91538143157959, "learning_rate": 1.7497114931644965e-07, "loss": 2.4057, "step": 5360 }, { "epoch": 0.977607864554888, "grad_norm": 5.63076114654541, "learning_rate": 1.496614913717831e-07, "loss": 2.3627, "step": 5370 }, { "epoch": 0.9794283633715638, "grad_norm": 4.944591045379639, "learning_rate": 1.2632536142609397e-07, "loss": 2.3662, "step": 5380 }, { "epoch": 0.9812488621882396, "grad_norm": 4.864638328552246, "learning_rate": 1.0496368333482442e-07, "loss": 2.3704, "step": 5390 }, { "epoch": 0.9830693610049154, "grad_norm": 4.991931438446045, "learning_rate": 8.557730278669906e-08, "loss": 2.3767, "step": 5400 }, { "epoch": 0.9848898598215912, "grad_norm": 4.382468223571777, "learning_rate": 6.816698727029614e-08, "loss": 2.4112, "step": 5410 }, { "epoch": 0.9867103586382668, "grad_norm": 44.841453552246094, "learning_rate": 5.273342604361631e-08, "loss": 2.4092, "step": 5420 }, { "epoch": 0.9885308574549426, "grad_norm": 4.815988063812256, "learning_rate": 3.9277230106832264e-08, "loss": 2.4256, "step": 5430 }, { "epoch": 0.9903513562716184, "grad_norm": 4.87392520904541, "learning_rate": 2.7798932178080274e-08, "loss": 2.3936, "step": 5440 }, { "epoch": 0.9921718550882942, "grad_norm": 5.1465559005737305, "learning_rate": 1.829898667237151e-08, "loss": 2.3805, "step": 5450 }, { "epoch": 0.99399235390497, "grad_norm": 4.486802101135254, "learning_rate": 1.0777769683617544e-08, "loss": 2.3492, "step": 5460 }, { "epoch": 0.9958128527216458, "grad_norm": 5.0049614906311035, "learning_rate": 5.2355789697144945e-09, "loss": 2.4414, "step": 5470 }, { "epoch": 0.9976333515383216, "grad_norm": 4.7070441246032715, "learning_rate": 1.6726339407857616e-09, "loss": 2.4294, "step": 5480 }, { "epoch": 0.9994538503549972, "grad_norm": 4.9832539558410645, "learning_rate": 8.907565046678557e-11, "loss": 2.3724, "step": 5490 } ], "logging_steps": 10, "max_steps": 5493, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2843428615741768e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }