diff --git "a/RAG-Tuning-S/trainer_state.json" "b/RAG-Tuning-S/trainer_state.json" new file mode 100644--- /dev/null +++ "b/RAG-Tuning-S/trainer_state.json" @@ -0,0 +1,34636 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 24710, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00020234722784297855, + "grad_norm": 2.9933409690856934, + "learning_rate": 1.0101010101010103e-06, + "loss": 1.5596, + "step": 5 + }, + { + "epoch": 0.0004046944556859571, + "grad_norm": 2.9861791133880615, + "learning_rate": 2.0202020202020206e-06, + "loss": 1.5265, + "step": 10 + }, + { + "epoch": 0.0006070416835289356, + "grad_norm": 2.7128653526306152, + "learning_rate": 3.0303030303030305e-06, + "loss": 1.5348, + "step": 15 + }, + { + "epoch": 0.0008093889113719142, + "grad_norm": 2.5973939895629883, + "learning_rate": 4.040404040404041e-06, + "loss": 1.5126, + "step": 20 + }, + { + "epoch": 0.0010117361392148927, + "grad_norm": 2.5141825675964355, + "learning_rate": 5.050505050505051e-06, + "loss": 1.4586, + "step": 25 + }, + { + "epoch": 0.0012140833670578712, + "grad_norm": 2.514392137527466, + "learning_rate": 6.060606060606061e-06, + "loss": 1.4285, + "step": 30 + }, + { + "epoch": 0.00141643059490085, + "grad_norm": 2.238926410675049, + "learning_rate": 7.0707070707070704e-06, + "loss": 1.4558, + "step": 35 + }, + { + "epoch": 0.0016187778227438284, + "grad_norm": 2.2784197330474854, + "learning_rate": 8.080808080808082e-06, + "loss": 1.3484, + "step": 40 + }, + { + "epoch": 0.001821125050586807, + "grad_norm": 1.9064857959747314, + "learning_rate": 9.090909090909091e-06, + "loss": 1.3921, + "step": 45 + }, + { + "epoch": 0.0020234722784297854, + "grad_norm": 1.7825236320495605, + "learning_rate": 1.0101010101010101e-05, + "loss": 1.3447, + "step": 50 + }, + { + "epoch": 0.002225819506272764, + "grad_norm": 1.9339872598648071, + "learning_rate": 1.1111111111111112e-05, + "loss": 1.3706, + "step": 55 + }, + { + "epoch": 0.0024281667341157424, + "grad_norm": 1.8017120361328125, + "learning_rate": 1.2121212121212122e-05, + "loss": 1.2476, + "step": 60 + }, + { + "epoch": 0.002630513961958721, + "grad_norm": 2.308884620666504, + "learning_rate": 1.3131313131313134e-05, + "loss": 1.2883, + "step": 65 + }, + { + "epoch": 0.0028328611898017, + "grad_norm": 2.2627971172332764, + "learning_rate": 1.4141414141414141e-05, + "loss": 1.2553, + "step": 70 + }, + { + "epoch": 0.003035208417644678, + "grad_norm": 1.89839768409729, + "learning_rate": 1.5151515151515153e-05, + "loss": 1.1874, + "step": 75 + }, + { + "epoch": 0.003237555645487657, + "grad_norm": 2.02109956741333, + "learning_rate": 1.6161616161616165e-05, + "loss": 1.1467, + "step": 80 + }, + { + "epoch": 0.0034399028733306356, + "grad_norm": 2.418301820755005, + "learning_rate": 1.7171717171717173e-05, + "loss": 1.1796, + "step": 85 + }, + { + "epoch": 0.003642250101173614, + "grad_norm": 1.8888823986053467, + "learning_rate": 1.8181818181818182e-05, + "loss": 1.1421, + "step": 90 + }, + { + "epoch": 0.0038445973290165926, + "grad_norm": 1.6717690229415894, + "learning_rate": 1.919191919191919e-05, + "loss": 1.1976, + "step": 95 + }, + { + "epoch": 0.004046944556859571, + "grad_norm": 1.9028016328811646, + "learning_rate": 2.0202020202020203e-05, + "loss": 1.1088, + "step": 100 + }, + { + "epoch": 0.00424929178470255, + "grad_norm": 1.8961031436920166, + "learning_rate": 2.1212121212121215e-05, + "loss": 1.1656, + "step": 105 + }, + { + "epoch": 0.004451639012545528, + "grad_norm": 2.270857095718384, + "learning_rate": 2.2222222222222223e-05, + "loss": 1.1573, + "step": 110 + }, + { + "epoch": 0.0046539862403885066, + "grad_norm": 1.9814001321792603, + "learning_rate": 2.3232323232323232e-05, + "loss": 1.1281, + "step": 115 + }, + { + "epoch": 0.004856333468231485, + "grad_norm": 2.3769867420196533, + "learning_rate": 2.4242424242424244e-05, + "loss": 1.1401, + "step": 120 + }, + { + "epoch": 0.005058680696074464, + "grad_norm": 3.6437642574310303, + "learning_rate": 2.5252525252525256e-05, + "loss": 1.1035, + "step": 125 + }, + { + "epoch": 0.005261027923917442, + "grad_norm": 2.351130962371826, + "learning_rate": 2.6262626262626268e-05, + "loss": 1.1744, + "step": 130 + }, + { + "epoch": 0.0054633751517604206, + "grad_norm": 2.160093069076538, + "learning_rate": 2.7272727272727273e-05, + "loss": 1.1117, + "step": 135 + }, + { + "epoch": 0.0056657223796034, + "grad_norm": 2.536659002304077, + "learning_rate": 2.8282828282828282e-05, + "loss": 1.1664, + "step": 140 + }, + { + "epoch": 0.005868069607446378, + "grad_norm": 2.9314124584198, + "learning_rate": 2.9292929292929294e-05, + "loss": 1.0955, + "step": 145 + }, + { + "epoch": 0.006070416835289356, + "grad_norm": 2.7155613899230957, + "learning_rate": 3.0303030303030306e-05, + "loss": 1.1312, + "step": 150 + }, + { + "epoch": 0.006272764063132335, + "grad_norm": 2.258028268814087, + "learning_rate": 3.131313131313132e-05, + "loss": 1.088, + "step": 155 + }, + { + "epoch": 0.006475111290975314, + "grad_norm": 2.6700029373168945, + "learning_rate": 3.232323232323233e-05, + "loss": 1.183, + "step": 160 + }, + { + "epoch": 0.006677458518818292, + "grad_norm": 2.71748948097229, + "learning_rate": 3.3333333333333335e-05, + "loss": 1.1442, + "step": 165 + }, + { + "epoch": 0.006879805746661271, + "grad_norm": 2.9919252395629883, + "learning_rate": 3.434343434343435e-05, + "loss": 1.0959, + "step": 170 + }, + { + "epoch": 0.007082152974504249, + "grad_norm": 2.545902967453003, + "learning_rate": 3.535353535353535e-05, + "loss": 1.1813, + "step": 175 + }, + { + "epoch": 0.007284500202347228, + "grad_norm": 2.7845704555511475, + "learning_rate": 3.6363636363636364e-05, + "loss": 1.0744, + "step": 180 + }, + { + "epoch": 0.007486847430190206, + "grad_norm": 3.071317672729492, + "learning_rate": 3.7373737373737376e-05, + "loss": 1.1003, + "step": 185 + }, + { + "epoch": 0.007689194658033185, + "grad_norm": 2.471811294555664, + "learning_rate": 3.838383838383838e-05, + "loss": 1.1205, + "step": 190 + }, + { + "epoch": 0.007891541885876164, + "grad_norm": 2.479722738265991, + "learning_rate": 3.939393939393939e-05, + "loss": 1.1572, + "step": 195 + }, + { + "epoch": 0.008093889113719142, + "grad_norm": 2.5352907180786133, + "learning_rate": 4.0404040404040405e-05, + "loss": 1.0776, + "step": 200 + }, + { + "epoch": 0.00829623634156212, + "grad_norm": 2.292621612548828, + "learning_rate": 4.141414141414142e-05, + "loss": 1.1347, + "step": 205 + }, + { + "epoch": 0.0084985835694051, + "grad_norm": 2.502856731414795, + "learning_rate": 4.242424242424243e-05, + "loss": 1.1138, + "step": 210 + }, + { + "epoch": 0.008700930797248077, + "grad_norm": 2.574815034866333, + "learning_rate": 4.343434343434344e-05, + "loss": 1.095, + "step": 215 + }, + { + "epoch": 0.008903278025091057, + "grad_norm": 2.8264522552490234, + "learning_rate": 4.4444444444444447e-05, + "loss": 1.1218, + "step": 220 + }, + { + "epoch": 0.009105625252934034, + "grad_norm": 2.918670892715454, + "learning_rate": 4.545454545454546e-05, + "loss": 1.1477, + "step": 225 + }, + { + "epoch": 0.009307972480777013, + "grad_norm": 2.5850000381469727, + "learning_rate": 4.6464646464646464e-05, + "loss": 1.06, + "step": 230 + }, + { + "epoch": 0.009510319708619992, + "grad_norm": 2.5582289695739746, + "learning_rate": 4.7474747474747476e-05, + "loss": 1.1202, + "step": 235 + }, + { + "epoch": 0.00971266693646297, + "grad_norm": 2.556922197341919, + "learning_rate": 4.848484848484849e-05, + "loss": 1.0806, + "step": 240 + }, + { + "epoch": 0.009915014164305949, + "grad_norm": 2.5984082221984863, + "learning_rate": 4.94949494949495e-05, + "loss": 1.1722, + "step": 245 + }, + { + "epoch": 0.010117361392148928, + "grad_norm": 2.8163859844207764, + "learning_rate": 5.050505050505051e-05, + "loss": 1.121, + "step": 250 + }, + { + "epoch": 0.010319708619991905, + "grad_norm": 2.8502392768859863, + "learning_rate": 5.151515151515152e-05, + "loss": 1.0742, + "step": 255 + }, + { + "epoch": 0.010522055847834885, + "grad_norm": 2.6049704551696777, + "learning_rate": 5.2525252525252536e-05, + "loss": 1.1487, + "step": 260 + }, + { + "epoch": 0.010724403075677864, + "grad_norm": 2.836660861968994, + "learning_rate": 5.353535353535354e-05, + "loss": 1.0866, + "step": 265 + }, + { + "epoch": 0.010926750303520841, + "grad_norm": 2.7358970642089844, + "learning_rate": 5.4545454545454546e-05, + "loss": 1.1158, + "step": 270 + }, + { + "epoch": 0.01112909753136382, + "grad_norm": 2.190136194229126, + "learning_rate": 5.555555555555556e-05, + "loss": 1.0547, + "step": 275 + }, + { + "epoch": 0.0113314447592068, + "grad_norm": 3.045732021331787, + "learning_rate": 5.6565656565656563e-05, + "loss": 1.0845, + "step": 280 + }, + { + "epoch": 0.011533791987049777, + "grad_norm": 2.5552291870117188, + "learning_rate": 5.757575757575758e-05, + "loss": 1.1007, + "step": 285 + }, + { + "epoch": 0.011736139214892756, + "grad_norm": 2.478412628173828, + "learning_rate": 5.858585858585859e-05, + "loss": 1.2019, + "step": 290 + }, + { + "epoch": 0.011938486442735735, + "grad_norm": 2.5224175453186035, + "learning_rate": 5.959595959595959e-05, + "loss": 1.031, + "step": 295 + }, + { + "epoch": 0.012140833670578713, + "grad_norm": 2.516164779663086, + "learning_rate": 6.060606060606061e-05, + "loss": 1.052, + "step": 300 + }, + { + "epoch": 0.012343180898421692, + "grad_norm": 2.3297924995422363, + "learning_rate": 6.161616161616162e-05, + "loss": 1.1779, + "step": 305 + }, + { + "epoch": 0.01254552812626467, + "grad_norm": 2.5777242183685303, + "learning_rate": 6.262626262626264e-05, + "loss": 1.0896, + "step": 310 + }, + { + "epoch": 0.012747875354107648, + "grad_norm": 2.1272308826446533, + "learning_rate": 6.363636363636364e-05, + "loss": 1.1065, + "step": 315 + }, + { + "epoch": 0.012950222581950627, + "grad_norm": 2.5600132942199707, + "learning_rate": 6.464646464646466e-05, + "loss": 1.1035, + "step": 320 + }, + { + "epoch": 0.013152569809793607, + "grad_norm": 2.5389156341552734, + "learning_rate": 6.565656565656566e-05, + "loss": 1.0641, + "step": 325 + }, + { + "epoch": 0.013354917037636584, + "grad_norm": 1.9946951866149902, + "learning_rate": 6.666666666666667e-05, + "loss": 1.0919, + "step": 330 + }, + { + "epoch": 0.013557264265479563, + "grad_norm": 2.1555545330047607, + "learning_rate": 6.767676767676769e-05, + "loss": 1.0443, + "step": 335 + }, + { + "epoch": 0.013759611493322542, + "grad_norm": 2.399268865585327, + "learning_rate": 6.86868686868687e-05, + "loss": 1.0891, + "step": 340 + }, + { + "epoch": 0.01396195872116552, + "grad_norm": 2.261282205581665, + "learning_rate": 6.96969696969697e-05, + "loss": 1.0574, + "step": 345 + }, + { + "epoch": 0.014164305949008499, + "grad_norm": 2.1969683170318604, + "learning_rate": 7.07070707070707e-05, + "loss": 1.1118, + "step": 350 + }, + { + "epoch": 0.014366653176851478, + "grad_norm": 2.359020471572876, + "learning_rate": 7.171717171717171e-05, + "loss": 1.1636, + "step": 355 + }, + { + "epoch": 0.014569000404694455, + "grad_norm": 2.1169421672821045, + "learning_rate": 7.272727272727273e-05, + "loss": 1.0405, + "step": 360 + }, + { + "epoch": 0.014771347632537435, + "grad_norm": 2.5211663246154785, + "learning_rate": 7.373737373737373e-05, + "loss": 1.1449, + "step": 365 + }, + { + "epoch": 0.014973694860380412, + "grad_norm": 2.165107250213623, + "learning_rate": 7.474747474747475e-05, + "loss": 1.102, + "step": 370 + }, + { + "epoch": 0.015176042088223391, + "grad_norm": 2.2055845260620117, + "learning_rate": 7.575757575757576e-05, + "loss": 1.1009, + "step": 375 + }, + { + "epoch": 0.01537838931606637, + "grad_norm": 2.101855516433716, + "learning_rate": 7.676767676767676e-05, + "loss": 1.0385, + "step": 380 + }, + { + "epoch": 0.015580736543909348, + "grad_norm": 2.1183292865753174, + "learning_rate": 7.777777777777778e-05, + "loss": 1.0664, + "step": 385 + }, + { + "epoch": 0.01578308377175233, + "grad_norm": 2.391688823699951, + "learning_rate": 7.878787878787879e-05, + "loss": 1.095, + "step": 390 + }, + { + "epoch": 0.015985430999595304, + "grad_norm": 2.121948719024658, + "learning_rate": 7.97979797979798e-05, + "loss": 1.0647, + "step": 395 + }, + { + "epoch": 0.016187778227438283, + "grad_norm": 2.142220973968506, + "learning_rate": 8.080808080808081e-05, + "loss": 1.1395, + "step": 400 + }, + { + "epoch": 0.016390125455281263, + "grad_norm": 2.2406890392303467, + "learning_rate": 8.181818181818183e-05, + "loss": 1.0615, + "step": 405 + }, + { + "epoch": 0.01659247268312424, + "grad_norm": 2.036979913711548, + "learning_rate": 8.282828282828283e-05, + "loss": 1.0771, + "step": 410 + }, + { + "epoch": 0.01679481991096722, + "grad_norm": 1.961674690246582, + "learning_rate": 8.383838383838384e-05, + "loss": 1.0595, + "step": 415 + }, + { + "epoch": 0.0169971671388102, + "grad_norm": 1.9730347394943237, + "learning_rate": 8.484848484848486e-05, + "loss": 0.9874, + "step": 420 + }, + { + "epoch": 0.017199514366653176, + "grad_norm": 1.7028435468673706, + "learning_rate": 8.585858585858586e-05, + "loss": 1.071, + "step": 425 + }, + { + "epoch": 0.017401861594496155, + "grad_norm": 1.8906868696212769, + "learning_rate": 8.686868686868688e-05, + "loss": 1.0726, + "step": 430 + }, + { + "epoch": 0.017604208822339134, + "grad_norm": 1.80305016040802, + "learning_rate": 8.787878787878789e-05, + "loss": 1.0412, + "step": 435 + }, + { + "epoch": 0.017806556050182113, + "grad_norm": 2.0835936069488525, + "learning_rate": 8.888888888888889e-05, + "loss": 1.1184, + "step": 440 + }, + { + "epoch": 0.018008903278025092, + "grad_norm": 1.9061412811279297, + "learning_rate": 8.98989898989899e-05, + "loss": 1.0537, + "step": 445 + }, + { + "epoch": 0.018211250505868068, + "grad_norm": 2.6074249744415283, + "learning_rate": 9.090909090909092e-05, + "loss": 1.088, + "step": 450 + }, + { + "epoch": 0.018413597733711047, + "grad_norm": 1.6072639226913452, + "learning_rate": 9.191919191919192e-05, + "loss": 1.1465, + "step": 455 + }, + { + "epoch": 0.018615944961554026, + "grad_norm": 1.7711330652236938, + "learning_rate": 9.292929292929293e-05, + "loss": 1.0707, + "step": 460 + }, + { + "epoch": 0.018818292189397005, + "grad_norm": 2.105698823928833, + "learning_rate": 9.393939393939395e-05, + "loss": 1.0771, + "step": 465 + }, + { + "epoch": 0.019020639417239985, + "grad_norm": 2.5414555072784424, + "learning_rate": 9.494949494949495e-05, + "loss": 1.1593, + "step": 470 + }, + { + "epoch": 0.019222986645082964, + "grad_norm": 1.7689402103424072, + "learning_rate": 9.595959595959596e-05, + "loss": 1.0731, + "step": 475 + }, + { + "epoch": 0.01942533387292594, + "grad_norm": 1.7413325309753418, + "learning_rate": 9.696969696969698e-05, + "loss": 1.048, + "step": 480 + }, + { + "epoch": 0.01962768110076892, + "grad_norm": 1.7363505363464355, + "learning_rate": 9.797979797979798e-05, + "loss": 1.0845, + "step": 485 + }, + { + "epoch": 0.019830028328611898, + "grad_norm": 1.6728994846343994, + "learning_rate": 9.8989898989899e-05, + "loss": 1.1154, + "step": 490 + }, + { + "epoch": 0.020032375556454877, + "grad_norm": 2.205254077911377, + "learning_rate": 0.0001, + "loss": 1.0316, + "step": 495 + }, + { + "epoch": 0.020234722784297856, + "grad_norm": 1.5651170015335083, + "learning_rate": 9.997935164154451e-05, + "loss": 1.1225, + "step": 500 + }, + { + "epoch": 0.020437070012140835, + "grad_norm": 1.6764373779296875, + "learning_rate": 9.9958703283089e-05, + "loss": 1.0609, + "step": 505 + }, + { + "epoch": 0.02063941723998381, + "grad_norm": 1.4497535228729248, + "learning_rate": 9.993805492463349e-05, + "loss": 1.0826, + "step": 510 + }, + { + "epoch": 0.02084176446782679, + "grad_norm": 1.9358776807785034, + "learning_rate": 9.9917406566178e-05, + "loss": 1.076, + "step": 515 + }, + { + "epoch": 0.02104411169566977, + "grad_norm": 1.9405758380889893, + "learning_rate": 9.98967582077225e-05, + "loss": 1.0944, + "step": 520 + }, + { + "epoch": 0.021246458923512748, + "grad_norm": 1.8175926208496094, + "learning_rate": 9.987610984926699e-05, + "loss": 1.0325, + "step": 525 + }, + { + "epoch": 0.021448806151355727, + "grad_norm": 1.5051298141479492, + "learning_rate": 9.985546149081149e-05, + "loss": 1.087, + "step": 530 + }, + { + "epoch": 0.021651153379198707, + "grad_norm": 1.5366894006729126, + "learning_rate": 9.983481313235598e-05, + "loss": 1.0605, + "step": 535 + }, + { + "epoch": 0.021853500607041682, + "grad_norm": 1.655651330947876, + "learning_rate": 9.981416477390047e-05, + "loss": 1.0321, + "step": 540 + }, + { + "epoch": 0.02205584783488466, + "grad_norm": 1.7619483470916748, + "learning_rate": 9.979351641544498e-05, + "loss": 1.0845, + "step": 545 + }, + { + "epoch": 0.02225819506272764, + "grad_norm": 1.6305811405181885, + "learning_rate": 9.977286805698948e-05, + "loss": 1.1212, + "step": 550 + }, + { + "epoch": 0.02246054229057062, + "grad_norm": 1.5049748420715332, + "learning_rate": 9.975221969853397e-05, + "loss": 1.0087, + "step": 555 + }, + { + "epoch": 0.0226628895184136, + "grad_norm": 1.5912582874298096, + "learning_rate": 9.973157134007846e-05, + "loss": 1.046, + "step": 560 + }, + { + "epoch": 0.022865236746256578, + "grad_norm": 1.4049822092056274, + "learning_rate": 9.971092298162297e-05, + "loss": 1.0532, + "step": 565 + }, + { + "epoch": 0.023067583974099554, + "grad_norm": 1.6603472232818604, + "learning_rate": 9.969027462316747e-05, + "loss": 1.081, + "step": 570 + }, + { + "epoch": 0.023269931201942533, + "grad_norm": 1.5358834266662598, + "learning_rate": 9.966962626471196e-05, + "loss": 1.0783, + "step": 575 + }, + { + "epoch": 0.023472278429785512, + "grad_norm": 1.5615278482437134, + "learning_rate": 9.964897790625645e-05, + "loss": 1.0672, + "step": 580 + }, + { + "epoch": 0.02367462565762849, + "grad_norm": 1.6925407648086548, + "learning_rate": 9.962832954780096e-05, + "loss": 1.0514, + "step": 585 + }, + { + "epoch": 0.02387697288547147, + "grad_norm": 1.834684133529663, + "learning_rate": 9.960768118934545e-05, + "loss": 1.0672, + "step": 590 + }, + { + "epoch": 0.024079320113314446, + "grad_norm": 1.6813185214996338, + "learning_rate": 9.958703283088995e-05, + "loss": 1.0448, + "step": 595 + }, + { + "epoch": 0.024281667341157425, + "grad_norm": 1.5269089937210083, + "learning_rate": 9.956638447243445e-05, + "loss": 1.1439, + "step": 600 + }, + { + "epoch": 0.024484014569000404, + "grad_norm": 1.4650517702102661, + "learning_rate": 9.954573611397893e-05, + "loss": 1.0276, + "step": 605 + }, + { + "epoch": 0.024686361796843383, + "grad_norm": 1.6329649686813354, + "learning_rate": 9.952508775552344e-05, + "loss": 1.0589, + "step": 610 + }, + { + "epoch": 0.024888709024686363, + "grad_norm": 1.4608739614486694, + "learning_rate": 9.950443939706794e-05, + "loss": 1.1105, + "step": 615 + }, + { + "epoch": 0.02509105625252934, + "grad_norm": 1.4417763948440552, + "learning_rate": 9.948379103861244e-05, + "loss": 1.0282, + "step": 620 + }, + { + "epoch": 0.025293403480372317, + "grad_norm": 1.7151459455490112, + "learning_rate": 9.946314268015693e-05, + "loss": 1.0601, + "step": 625 + }, + { + "epoch": 0.025495750708215296, + "grad_norm": 1.565805435180664, + "learning_rate": 9.944249432170143e-05, + "loss": 1.0962, + "step": 630 + }, + { + "epoch": 0.025698097936058276, + "grad_norm": 1.6773918867111206, + "learning_rate": 9.942184596324593e-05, + "loss": 1.1163, + "step": 635 + }, + { + "epoch": 0.025900445163901255, + "grad_norm": 1.8091540336608887, + "learning_rate": 9.940119760479042e-05, + "loss": 1.0751, + "step": 640 + }, + { + "epoch": 0.026102792391744234, + "grad_norm": 1.5862232446670532, + "learning_rate": 9.938054924633492e-05, + "loss": 1.0864, + "step": 645 + }, + { + "epoch": 0.026305139619587213, + "grad_norm": 1.5611926317214966, + "learning_rate": 9.935990088787941e-05, + "loss": 1.0644, + "step": 650 + }, + { + "epoch": 0.02650748684743019, + "grad_norm": 1.6312520503997803, + "learning_rate": 9.933925252942392e-05, + "loss": 1.096, + "step": 655 + }, + { + "epoch": 0.026709834075273168, + "grad_norm": 1.7054439783096313, + "learning_rate": 9.931860417096841e-05, + "loss": 1.0949, + "step": 660 + }, + { + "epoch": 0.026912181303116147, + "grad_norm": 1.5077580213546753, + "learning_rate": 9.929795581251291e-05, + "loss": 1.0756, + "step": 665 + }, + { + "epoch": 0.027114528530959126, + "grad_norm": 1.8034014701843262, + "learning_rate": 9.927730745405742e-05, + "loss": 1.0091, + "step": 670 + }, + { + "epoch": 0.027316875758802105, + "grad_norm": 1.5382342338562012, + "learning_rate": 9.92566590956019e-05, + "loss": 1.1142, + "step": 675 + }, + { + "epoch": 0.027519222986645085, + "grad_norm": 1.607927680015564, + "learning_rate": 9.92360107371464e-05, + "loss": 1.0273, + "step": 680 + }, + { + "epoch": 0.02772157021448806, + "grad_norm": 1.561005711555481, + "learning_rate": 9.92153623786909e-05, + "loss": 1.0857, + "step": 685 + }, + { + "epoch": 0.02792391744233104, + "grad_norm": 1.7696928977966309, + "learning_rate": 9.919471402023539e-05, + "loss": 1.0289, + "step": 690 + }, + { + "epoch": 0.02812626467017402, + "grad_norm": 1.628246545791626, + "learning_rate": 9.91740656617799e-05, + "loss": 1.0224, + "step": 695 + }, + { + "epoch": 0.028328611898016998, + "grad_norm": 1.578949213027954, + "learning_rate": 9.915341730332439e-05, + "loss": 1.0747, + "step": 700 + }, + { + "epoch": 0.028530959125859977, + "grad_norm": 1.3843584060668945, + "learning_rate": 9.913276894486889e-05, + "loss": 1.0602, + "step": 705 + }, + { + "epoch": 0.028733306353702956, + "grad_norm": 1.7278279066085815, + "learning_rate": 9.911212058641338e-05, + "loss": 1.0388, + "step": 710 + }, + { + "epoch": 0.02893565358154593, + "grad_norm": 1.4063900709152222, + "learning_rate": 9.909147222795789e-05, + "loss": 1.0551, + "step": 715 + }, + { + "epoch": 0.02913800080938891, + "grad_norm": 1.477317214012146, + "learning_rate": 9.907082386950238e-05, + "loss": 1.0041, + "step": 720 + }, + { + "epoch": 0.02934034803723189, + "grad_norm": 1.8149793148040771, + "learning_rate": 9.905017551104687e-05, + "loss": 1.0325, + "step": 725 + }, + { + "epoch": 0.02954269526507487, + "grad_norm": 1.5079026222229004, + "learning_rate": 9.902952715259137e-05, + "loss": 1.0383, + "step": 730 + }, + { + "epoch": 0.029745042492917848, + "grad_norm": 1.6261570453643799, + "learning_rate": 9.900887879413588e-05, + "loss": 1.0451, + "step": 735 + }, + { + "epoch": 0.029947389720760824, + "grad_norm": 1.351682186126709, + "learning_rate": 9.898823043568038e-05, + "loss": 1.1057, + "step": 740 + }, + { + "epoch": 0.030149736948603803, + "grad_norm": 1.3424296379089355, + "learning_rate": 9.896758207722487e-05, + "loss": 1.0734, + "step": 745 + }, + { + "epoch": 0.030352084176446782, + "grad_norm": 1.5590558052062988, + "learning_rate": 9.894693371876936e-05, + "loss": 1.0689, + "step": 750 + }, + { + "epoch": 0.03055443140428976, + "grad_norm": 1.5399047136306763, + "learning_rate": 9.892628536031386e-05, + "loss": 1.0878, + "step": 755 + }, + { + "epoch": 0.03075677863213274, + "grad_norm": 1.5072790384292603, + "learning_rate": 9.890563700185835e-05, + "loss": 1.0718, + "step": 760 + }, + { + "epoch": 0.03095912585997572, + "grad_norm": 1.5990835428237915, + "learning_rate": 9.888498864340286e-05, + "loss": 1.0803, + "step": 765 + }, + { + "epoch": 0.031161473087818695, + "grad_norm": 1.5335228443145752, + "learning_rate": 9.886434028494735e-05, + "loss": 1.0789, + "step": 770 + }, + { + "epoch": 0.03136382031566168, + "grad_norm": 1.611234188079834, + "learning_rate": 9.884369192649184e-05, + "loss": 1.0253, + "step": 775 + }, + { + "epoch": 0.03156616754350466, + "grad_norm": 1.4838621616363525, + "learning_rate": 9.882304356803634e-05, + "loss": 1.0687, + "step": 780 + }, + { + "epoch": 0.03176851477134763, + "grad_norm": 1.3513612747192383, + "learning_rate": 9.880239520958085e-05, + "loss": 1.061, + "step": 785 + }, + { + "epoch": 0.03197086199919061, + "grad_norm": 1.3502790927886963, + "learning_rate": 9.878174685112534e-05, + "loss": 1.0791, + "step": 790 + }, + { + "epoch": 0.03217320922703359, + "grad_norm": 1.496467113494873, + "learning_rate": 9.876109849266983e-05, + "loss": 1.0942, + "step": 795 + }, + { + "epoch": 0.03237555645487657, + "grad_norm": 1.894826889038086, + "learning_rate": 9.874045013421433e-05, + "loss": 1.0523, + "step": 800 + }, + { + "epoch": 0.032577903682719546, + "grad_norm": 1.5444755554199219, + "learning_rate": 9.871980177575884e-05, + "loss": 1.044, + "step": 805 + }, + { + "epoch": 0.032780250910562525, + "grad_norm": 1.4778664112091064, + "learning_rate": 9.869915341730333e-05, + "loss": 1.0029, + "step": 810 + }, + { + "epoch": 0.032982598138405504, + "grad_norm": 1.549651861190796, + "learning_rate": 9.867850505884783e-05, + "loss": 1.0215, + "step": 815 + }, + { + "epoch": 0.03318494536624848, + "grad_norm": 1.4651092290878296, + "learning_rate": 9.865785670039232e-05, + "loss": 0.9801, + "step": 820 + }, + { + "epoch": 0.03338729259409146, + "grad_norm": 1.2966331243515015, + "learning_rate": 9.863720834193681e-05, + "loss": 1.0499, + "step": 825 + }, + { + "epoch": 0.03358963982193444, + "grad_norm": 1.3640475273132324, + "learning_rate": 9.861655998348132e-05, + "loss": 1.0223, + "step": 830 + }, + { + "epoch": 0.03379198704977742, + "grad_norm": 1.25409734249115, + "learning_rate": 9.859591162502582e-05, + "loss": 1.1069, + "step": 835 + }, + { + "epoch": 0.0339943342776204, + "grad_norm": 1.547042965888977, + "learning_rate": 9.857526326657031e-05, + "loss": 1.0552, + "step": 840 + }, + { + "epoch": 0.03419668150546337, + "grad_norm": 1.4446684122085571, + "learning_rate": 9.85546149081148e-05, + "loss": 1.0126, + "step": 845 + }, + { + "epoch": 0.03439902873330635, + "grad_norm": 1.2919784784317017, + "learning_rate": 9.85339665496593e-05, + "loss": 1.0333, + "step": 850 + }, + { + "epoch": 0.03460137596114933, + "grad_norm": 1.2963666915893555, + "learning_rate": 9.851331819120381e-05, + "loss": 1.04, + "step": 855 + }, + { + "epoch": 0.03480372318899231, + "grad_norm": 1.4333245754241943, + "learning_rate": 9.84926698327483e-05, + "loss": 1.0674, + "step": 860 + }, + { + "epoch": 0.03500607041683529, + "grad_norm": 1.4501776695251465, + "learning_rate": 9.847202147429279e-05, + "loss": 1.1066, + "step": 865 + }, + { + "epoch": 0.03520841764467827, + "grad_norm": 1.4631706476211548, + "learning_rate": 9.84513731158373e-05, + "loss": 1.0648, + "step": 870 + }, + { + "epoch": 0.03541076487252125, + "grad_norm": 1.4564629793167114, + "learning_rate": 9.84307247573818e-05, + "loss": 1.0179, + "step": 875 + }, + { + "epoch": 0.035613112100364226, + "grad_norm": 1.3551617860794067, + "learning_rate": 9.841007639892629e-05, + "loss": 1.0684, + "step": 880 + }, + { + "epoch": 0.035815459328207205, + "grad_norm": 1.4886606931686401, + "learning_rate": 9.83894280404708e-05, + "loss": 1.0865, + "step": 885 + }, + { + "epoch": 0.036017806556050184, + "grad_norm": 1.4882872104644775, + "learning_rate": 9.836877968201528e-05, + "loss": 1.0389, + "step": 890 + }, + { + "epoch": 0.036220153783893164, + "grad_norm": 1.275161862373352, + "learning_rate": 9.834813132355977e-05, + "loss": 1.1254, + "step": 895 + }, + { + "epoch": 0.036422501011736136, + "grad_norm": 1.3433767557144165, + "learning_rate": 9.832748296510428e-05, + "loss": 1.0277, + "step": 900 + }, + { + "epoch": 0.036624848239579115, + "grad_norm": 1.473062515258789, + "learning_rate": 9.830683460664878e-05, + "loss": 1.0902, + "step": 905 + }, + { + "epoch": 0.036827195467422094, + "grad_norm": 1.5629346370697021, + "learning_rate": 9.828618624819327e-05, + "loss": 1.1125, + "step": 910 + }, + { + "epoch": 0.03702954269526507, + "grad_norm": 1.3655176162719727, + "learning_rate": 9.826553788973776e-05, + "loss": 1.0535, + "step": 915 + }, + { + "epoch": 0.03723188992310805, + "grad_norm": 1.3610424995422363, + "learning_rate": 9.824488953128227e-05, + "loss": 1.0586, + "step": 920 + }, + { + "epoch": 0.03743423715095103, + "grad_norm": 1.3422636985778809, + "learning_rate": 9.822424117282677e-05, + "loss": 1.0694, + "step": 925 + }, + { + "epoch": 0.03763658437879401, + "grad_norm": 1.4051406383514404, + "learning_rate": 9.820359281437126e-05, + "loss": 1.0393, + "step": 930 + }, + { + "epoch": 0.03783893160663699, + "grad_norm": 1.2342345714569092, + "learning_rate": 9.818294445591575e-05, + "loss": 1.0426, + "step": 935 + }, + { + "epoch": 0.03804127883447997, + "grad_norm": 1.2904876470565796, + "learning_rate": 9.816229609746026e-05, + "loss": 1.048, + "step": 940 + }, + { + "epoch": 0.03824362606232295, + "grad_norm": 1.5387827157974243, + "learning_rate": 9.814164773900475e-05, + "loss": 1.0752, + "step": 945 + }, + { + "epoch": 0.03844597329016593, + "grad_norm": 1.2083275318145752, + "learning_rate": 9.812099938054925e-05, + "loss": 1.0497, + "step": 950 + }, + { + "epoch": 0.038648320518008906, + "grad_norm": 1.3084746599197388, + "learning_rate": 9.810035102209376e-05, + "loss": 1.0597, + "step": 955 + }, + { + "epoch": 0.03885066774585188, + "grad_norm": 1.4155186414718628, + "learning_rate": 9.807970266363825e-05, + "loss": 1.0142, + "step": 960 + }, + { + "epoch": 0.03905301497369486, + "grad_norm": 1.272055983543396, + "learning_rate": 9.805905430518274e-05, + "loss": 1.0295, + "step": 965 + }, + { + "epoch": 0.03925536220153784, + "grad_norm": 1.25702702999115, + "learning_rate": 9.803840594672724e-05, + "loss": 1.0694, + "step": 970 + }, + { + "epoch": 0.039457709429380816, + "grad_norm": 1.3168611526489258, + "learning_rate": 9.801775758827175e-05, + "loss": 1.004, + "step": 975 + }, + { + "epoch": 0.039660056657223795, + "grad_norm": 1.3408727645874023, + "learning_rate": 9.799710922981624e-05, + "loss": 1.051, + "step": 980 + }, + { + "epoch": 0.039862403885066774, + "grad_norm": 1.5531495809555054, + "learning_rate": 9.797646087136073e-05, + "loss": 1.0327, + "step": 985 + }, + { + "epoch": 0.040064751112909754, + "grad_norm": 1.474417805671692, + "learning_rate": 9.795581251290523e-05, + "loss": 1.0806, + "step": 990 + }, + { + "epoch": 0.04026709834075273, + "grad_norm": 1.354904055595398, + "learning_rate": 9.793516415444972e-05, + "loss": 1.0267, + "step": 995 + }, + { + "epoch": 0.04046944556859571, + "grad_norm": 1.3857539892196655, + "learning_rate": 9.791451579599422e-05, + "loss": 1.0049, + "step": 1000 + }, + { + "epoch": 0.04067179279643869, + "grad_norm": 1.3299477100372314, + "learning_rate": 9.789386743753872e-05, + "loss": 1.0257, + "step": 1005 + }, + { + "epoch": 0.04087414002428167, + "grad_norm": 1.3018697500228882, + "learning_rate": 9.787321907908322e-05, + "loss": 1.0152, + "step": 1010 + }, + { + "epoch": 0.04107648725212465, + "grad_norm": 1.437354564666748, + "learning_rate": 9.785257072062771e-05, + "loss": 1.0539, + "step": 1015 + }, + { + "epoch": 0.04127883447996762, + "grad_norm": 1.4543030261993408, + "learning_rate": 9.783192236217221e-05, + "loss": 1.0573, + "step": 1020 + }, + { + "epoch": 0.0414811817078106, + "grad_norm": 1.41005539894104, + "learning_rate": 9.781127400371672e-05, + "loss": 1.0377, + "step": 1025 + }, + { + "epoch": 0.04168352893565358, + "grad_norm": 1.3793244361877441, + "learning_rate": 9.779062564526121e-05, + "loss": 1.0468, + "step": 1030 + }, + { + "epoch": 0.04188587616349656, + "grad_norm": 1.3902945518493652, + "learning_rate": 9.77699772868057e-05, + "loss": 1.061, + "step": 1035 + }, + { + "epoch": 0.04208822339133954, + "grad_norm": 1.3566033840179443, + "learning_rate": 9.77493289283502e-05, + "loss": 1.0839, + "step": 1040 + }, + { + "epoch": 0.04229057061918252, + "grad_norm": 1.3005142211914062, + "learning_rate": 9.77286805698947e-05, + "loss": 1.0235, + "step": 1045 + }, + { + "epoch": 0.042492917847025496, + "grad_norm": 1.4049321413040161, + "learning_rate": 9.77080322114392e-05, + "loss": 1.063, + "step": 1050 + }, + { + "epoch": 0.042695265074868476, + "grad_norm": 1.2821658849716187, + "learning_rate": 9.768738385298369e-05, + "loss": 1.0399, + "step": 1055 + }, + { + "epoch": 0.042897612302711455, + "grad_norm": 1.2726246118545532, + "learning_rate": 9.766673549452819e-05, + "loss": 1.0331, + "step": 1060 + }, + { + "epoch": 0.043099959530554434, + "grad_norm": 1.390950322151184, + "learning_rate": 9.764608713607268e-05, + "loss": 1.0974, + "step": 1065 + }, + { + "epoch": 0.04330230675839741, + "grad_norm": 1.430403709411621, + "learning_rate": 9.762543877761719e-05, + "loss": 0.979, + "step": 1070 + }, + { + "epoch": 0.043504653986240385, + "grad_norm": 1.2843495607376099, + "learning_rate": 9.760479041916169e-05, + "loss": 1.0334, + "step": 1075 + }, + { + "epoch": 0.043707001214083364, + "grad_norm": 1.4190033674240112, + "learning_rate": 9.758414206070617e-05, + "loss": 1.0625, + "step": 1080 + }, + { + "epoch": 0.043909348441926344, + "grad_norm": 1.4139447212219238, + "learning_rate": 9.756349370225067e-05, + "loss": 1.0092, + "step": 1085 + }, + { + "epoch": 0.04411169566976932, + "grad_norm": 1.532297968864441, + "learning_rate": 9.754284534379518e-05, + "loss": 1.0471, + "step": 1090 + }, + { + "epoch": 0.0443140428976123, + "grad_norm": 1.3429151773452759, + "learning_rate": 9.752219698533968e-05, + "loss": 1.0501, + "step": 1095 + }, + { + "epoch": 0.04451639012545528, + "grad_norm": 1.6312729120254517, + "learning_rate": 9.750154862688417e-05, + "loss": 0.9615, + "step": 1100 + }, + { + "epoch": 0.04471873735329826, + "grad_norm": 1.2032707929611206, + "learning_rate": 9.748090026842866e-05, + "loss": 1.0629, + "step": 1105 + }, + { + "epoch": 0.04492108458114124, + "grad_norm": 1.2905160188674927, + "learning_rate": 9.746025190997317e-05, + "loss": 1.0609, + "step": 1110 + }, + { + "epoch": 0.04512343180898422, + "grad_norm": 1.4337005615234375, + "learning_rate": 9.743960355151766e-05, + "loss": 1.0235, + "step": 1115 + }, + { + "epoch": 0.0453257790368272, + "grad_norm": 1.4135839939117432, + "learning_rate": 9.741895519306216e-05, + "loss": 1.0707, + "step": 1120 + }, + { + "epoch": 0.04552812626467018, + "grad_norm": 1.3754996061325073, + "learning_rate": 9.739830683460665e-05, + "loss": 1.0873, + "step": 1125 + }, + { + "epoch": 0.045730473492513156, + "grad_norm": 1.2317836284637451, + "learning_rate": 9.737765847615114e-05, + "loss": 1.0743, + "step": 1130 + }, + { + "epoch": 0.04593282072035613, + "grad_norm": 1.5067507028579712, + "learning_rate": 9.735701011769565e-05, + "loss": 1.0218, + "step": 1135 + }, + { + "epoch": 0.04613516794819911, + "grad_norm": 1.308156132698059, + "learning_rate": 9.733636175924015e-05, + "loss": 1.0798, + "step": 1140 + }, + { + "epoch": 0.046337515176042086, + "grad_norm": 1.4515304565429688, + "learning_rate": 9.731571340078465e-05, + "loss": 1.028, + "step": 1145 + }, + { + "epoch": 0.046539862403885066, + "grad_norm": 1.418377161026001, + "learning_rate": 9.729506504232913e-05, + "loss": 1.0614, + "step": 1150 + }, + { + "epoch": 0.046742209631728045, + "grad_norm": 1.3889667987823486, + "learning_rate": 9.727441668387363e-05, + "loss": 1.0, + "step": 1155 + }, + { + "epoch": 0.046944556859571024, + "grad_norm": 1.2457728385925293, + "learning_rate": 9.725376832541814e-05, + "loss": 1.0673, + "step": 1160 + }, + { + "epoch": 0.047146904087414, + "grad_norm": 1.36286199092865, + "learning_rate": 9.723311996696263e-05, + "loss": 0.9971, + "step": 1165 + }, + { + "epoch": 0.04734925131525698, + "grad_norm": 1.5431827306747437, + "learning_rate": 9.721247160850713e-05, + "loss": 1.05, + "step": 1170 + }, + { + "epoch": 0.04755159854309996, + "grad_norm": 1.2391327619552612, + "learning_rate": 9.719182325005162e-05, + "loss": 1.0493, + "step": 1175 + }, + { + "epoch": 0.04775394577094294, + "grad_norm": 1.3675509691238403, + "learning_rate": 9.717117489159611e-05, + "loss": 1.067, + "step": 1180 + }, + { + "epoch": 0.04795629299878592, + "grad_norm": 1.4171303510665894, + "learning_rate": 9.715052653314062e-05, + "loss": 1.0593, + "step": 1185 + }, + { + "epoch": 0.04815864022662889, + "grad_norm": 1.3410508632659912, + "learning_rate": 9.712987817468512e-05, + "loss": 0.9839, + "step": 1190 + }, + { + "epoch": 0.04836098745447187, + "grad_norm": 1.3725942373275757, + "learning_rate": 9.710922981622961e-05, + "loss": 1.0113, + "step": 1195 + }, + { + "epoch": 0.04856333468231485, + "grad_norm": 1.2524265050888062, + "learning_rate": 9.70885814577741e-05, + "loss": 1.0413, + "step": 1200 + }, + { + "epoch": 0.04876568191015783, + "grad_norm": 1.3189339637756348, + "learning_rate": 9.706793309931861e-05, + "loss": 1.0651, + "step": 1205 + }, + { + "epoch": 0.04896802913800081, + "grad_norm": 1.4519147872924805, + "learning_rate": 9.704728474086311e-05, + "loss": 1.0089, + "step": 1210 + }, + { + "epoch": 0.04917037636584379, + "grad_norm": 1.250978708267212, + "learning_rate": 9.70266363824076e-05, + "loss": 1.0253, + "step": 1215 + }, + { + "epoch": 0.04937272359368677, + "grad_norm": 1.262344479560852, + "learning_rate": 9.700598802395209e-05, + "loss": 1.0247, + "step": 1220 + }, + { + "epoch": 0.049575070821529746, + "grad_norm": 1.3184866905212402, + "learning_rate": 9.69853396654966e-05, + "loss": 1.0358, + "step": 1225 + }, + { + "epoch": 0.049777418049372725, + "grad_norm": 1.4247705936431885, + "learning_rate": 9.69646913070411e-05, + "loss": 1.0147, + "step": 1230 + }, + { + "epoch": 0.049979765277215704, + "grad_norm": 1.520044207572937, + "learning_rate": 9.694404294858559e-05, + "loss": 1.0491, + "step": 1235 + }, + { + "epoch": 0.05018211250505868, + "grad_norm": 1.2870755195617676, + "learning_rate": 9.69233945901301e-05, + "loss": 1.0032, + "step": 1240 + }, + { + "epoch": 0.05038445973290166, + "grad_norm": 1.2937649488449097, + "learning_rate": 9.690274623167459e-05, + "loss": 0.9758, + "step": 1245 + }, + { + "epoch": 0.050586806960744635, + "grad_norm": 1.4870978593826294, + "learning_rate": 9.688209787321908e-05, + "loss": 1.0998, + "step": 1250 + }, + { + "epoch": 0.050789154188587614, + "grad_norm": 1.2912105321884155, + "learning_rate": 9.686144951476358e-05, + "loss": 0.9734, + "step": 1255 + }, + { + "epoch": 0.05099150141643059, + "grad_norm": 1.3370693922042847, + "learning_rate": 9.684080115630808e-05, + "loss": 1.0799, + "step": 1260 + }, + { + "epoch": 0.05119384864427357, + "grad_norm": 1.1921980381011963, + "learning_rate": 9.682015279785257e-05, + "loss": 0.9679, + "step": 1265 + }, + { + "epoch": 0.05139619587211655, + "grad_norm": 1.3673900365829468, + "learning_rate": 9.679950443939707e-05, + "loss": 1.0979, + "step": 1270 + }, + { + "epoch": 0.05159854309995953, + "grad_norm": 1.3121528625488281, + "learning_rate": 9.677885608094157e-05, + "loss": 1.0508, + "step": 1275 + }, + { + "epoch": 0.05180089032780251, + "grad_norm": 1.416839361190796, + "learning_rate": 9.675820772248607e-05, + "loss": 1.0685, + "step": 1280 + }, + { + "epoch": 0.05200323755564549, + "grad_norm": 1.3744992017745972, + "learning_rate": 9.673755936403056e-05, + "loss": 1.0447, + "step": 1285 + }, + { + "epoch": 0.05220558478348847, + "grad_norm": 1.2653158903121948, + "learning_rate": 9.671691100557507e-05, + "loss": 1.0382, + "step": 1290 + }, + { + "epoch": 0.05240793201133145, + "grad_norm": 1.4127179384231567, + "learning_rate": 9.669626264711956e-05, + "loss": 1.0803, + "step": 1295 + }, + { + "epoch": 0.052610279239174426, + "grad_norm": 1.256456971168518, + "learning_rate": 9.667561428866405e-05, + "loss": 1.0679, + "step": 1300 + }, + { + "epoch": 0.052812626467017405, + "grad_norm": 1.3415658473968506, + "learning_rate": 9.665496593020855e-05, + "loss": 1.0205, + "step": 1305 + }, + { + "epoch": 0.05301497369486038, + "grad_norm": 1.4126713275909424, + "learning_rate": 9.663431757175306e-05, + "loss": 1.0116, + "step": 1310 + }, + { + "epoch": 0.05321732092270336, + "grad_norm": 1.3043524026870728, + "learning_rate": 9.661366921329755e-05, + "loss": 0.997, + "step": 1315 + }, + { + "epoch": 0.053419668150546336, + "grad_norm": 1.3901809453964233, + "learning_rate": 9.659302085484204e-05, + "loss": 1.0033, + "step": 1320 + }, + { + "epoch": 0.053622015378389315, + "grad_norm": 1.386953592300415, + "learning_rate": 9.657237249638654e-05, + "loss": 1.0546, + "step": 1325 + }, + { + "epoch": 0.053824362606232294, + "grad_norm": 1.2973097562789917, + "learning_rate": 9.655172413793105e-05, + "loss": 1.0523, + "step": 1330 + }, + { + "epoch": 0.05402670983407527, + "grad_norm": 1.3961467742919922, + "learning_rate": 9.653107577947554e-05, + "loss": 1.0213, + "step": 1335 + }, + { + "epoch": 0.05422905706191825, + "grad_norm": 1.2452396154403687, + "learning_rate": 9.651042742102003e-05, + "loss": 1.0588, + "step": 1340 + }, + { + "epoch": 0.05443140428976123, + "grad_norm": 1.25496244430542, + "learning_rate": 9.648977906256453e-05, + "loss": 1.0773, + "step": 1345 + }, + { + "epoch": 0.05463375151760421, + "grad_norm": 1.4122798442840576, + "learning_rate": 9.646913070410902e-05, + "loss": 1.0801, + "step": 1350 + }, + { + "epoch": 0.05483609874544719, + "grad_norm": 1.3670319318771362, + "learning_rate": 9.644848234565353e-05, + "loss": 1.0736, + "step": 1355 + }, + { + "epoch": 0.05503844597329017, + "grad_norm": 1.346893548965454, + "learning_rate": 9.642783398719803e-05, + "loss": 0.974, + "step": 1360 + }, + { + "epoch": 0.05524079320113314, + "grad_norm": 1.3234906196594238, + "learning_rate": 9.640718562874252e-05, + "loss": 1.0739, + "step": 1365 + }, + { + "epoch": 0.05544314042897612, + "grad_norm": 1.2294628620147705, + "learning_rate": 9.638653727028701e-05, + "loss": 1.0281, + "step": 1370 + }, + { + "epoch": 0.0556454876568191, + "grad_norm": 1.2931239604949951, + "learning_rate": 9.636588891183152e-05, + "loss": 1.0329, + "step": 1375 + }, + { + "epoch": 0.05584783488466208, + "grad_norm": 1.3048388957977295, + "learning_rate": 9.634524055337602e-05, + "loss": 1.0759, + "step": 1380 + }, + { + "epoch": 0.05605018211250506, + "grad_norm": 1.412604570388794, + "learning_rate": 9.632459219492051e-05, + "loss": 1.0506, + "step": 1385 + }, + { + "epoch": 0.05625252934034804, + "grad_norm": 1.3981273174285889, + "learning_rate": 9.6303943836465e-05, + "loss": 1.0566, + "step": 1390 + }, + { + "epoch": 0.056454876568191016, + "grad_norm": 1.2319412231445312, + "learning_rate": 9.62832954780095e-05, + "loss": 1.0059, + "step": 1395 + }, + { + "epoch": 0.056657223796033995, + "grad_norm": 1.2856113910675049, + "learning_rate": 9.6262647119554e-05, + "loss": 1.0594, + "step": 1400 + }, + { + "epoch": 0.056859571023876974, + "grad_norm": 1.5007933378219604, + "learning_rate": 9.62419987610985e-05, + "loss": 1.0191, + "step": 1405 + }, + { + "epoch": 0.057061918251719954, + "grad_norm": 1.3999196290969849, + "learning_rate": 9.622135040264299e-05, + "loss": 1.0383, + "step": 1410 + }, + { + "epoch": 0.05726426547956293, + "grad_norm": 1.3111178874969482, + "learning_rate": 9.62007020441875e-05, + "loss": 1.0648, + "step": 1415 + }, + { + "epoch": 0.05746661270740591, + "grad_norm": 1.2582710981369019, + "learning_rate": 9.618005368573198e-05, + "loss": 1.1045, + "step": 1420 + }, + { + "epoch": 0.057668959935248884, + "grad_norm": 1.445177674293518, + "learning_rate": 9.615940532727649e-05, + "loss": 1.0277, + "step": 1425 + }, + { + "epoch": 0.05787130716309186, + "grad_norm": 1.3125499486923218, + "learning_rate": 9.613875696882099e-05, + "loss": 1.0525, + "step": 1430 + }, + { + "epoch": 0.05807365439093484, + "grad_norm": 1.3522056341171265, + "learning_rate": 9.611810861036547e-05, + "loss": 1.0191, + "step": 1435 + }, + { + "epoch": 0.05827600161877782, + "grad_norm": 1.19707453250885, + "learning_rate": 9.609746025190997e-05, + "loss": 1.0182, + "step": 1440 + }, + { + "epoch": 0.0584783488466208, + "grad_norm": 1.3498811721801758, + "learning_rate": 9.607681189345448e-05, + "loss": 1.0606, + "step": 1445 + }, + { + "epoch": 0.05868069607446378, + "grad_norm": 1.1649166345596313, + "learning_rate": 9.605616353499898e-05, + "loss": 1.0361, + "step": 1450 + }, + { + "epoch": 0.05888304330230676, + "grad_norm": 1.2549792528152466, + "learning_rate": 9.603551517654347e-05, + "loss": 1.0384, + "step": 1455 + }, + { + "epoch": 0.05908539053014974, + "grad_norm": 1.2901394367218018, + "learning_rate": 9.601486681808796e-05, + "loss": 0.9723, + "step": 1460 + }, + { + "epoch": 0.05928773775799272, + "grad_norm": 1.189284324645996, + "learning_rate": 9.599421845963247e-05, + "loss": 1.0405, + "step": 1465 + }, + { + "epoch": 0.059490084985835696, + "grad_norm": 1.2396763563156128, + "learning_rate": 9.597357010117696e-05, + "loss": 1.0173, + "step": 1470 + }, + { + "epoch": 0.059692432213678676, + "grad_norm": 1.291982650756836, + "learning_rate": 9.595292174272146e-05, + "loss": 1.07, + "step": 1475 + }, + { + "epoch": 0.05989477944152165, + "grad_norm": 1.4714529514312744, + "learning_rate": 9.593227338426595e-05, + "loss": 1.0584, + "step": 1480 + }, + { + "epoch": 0.06009712666936463, + "grad_norm": 1.2746899127960205, + "learning_rate": 9.591162502581044e-05, + "loss": 1.0436, + "step": 1485 + }, + { + "epoch": 0.060299473897207606, + "grad_norm": 1.3938498497009277, + "learning_rate": 9.589097666735495e-05, + "loss": 1.0457, + "step": 1490 + }, + { + "epoch": 0.060501821125050585, + "grad_norm": 1.505146861076355, + "learning_rate": 9.587032830889945e-05, + "loss": 1.0649, + "step": 1495 + }, + { + "epoch": 0.060704168352893564, + "grad_norm": 1.349221110343933, + "learning_rate": 9.584967995044395e-05, + "loss": 1.0201, + "step": 1500 + }, + { + "epoch": 0.060906515580736544, + "grad_norm": 1.2473959922790527, + "learning_rate": 9.582903159198845e-05, + "loss": 1.0345, + "step": 1505 + }, + { + "epoch": 0.06110886280857952, + "grad_norm": 1.2991083860397339, + "learning_rate": 9.580838323353294e-05, + "loss": 1.0163, + "step": 1510 + }, + { + "epoch": 0.0613112100364225, + "grad_norm": 1.4911009073257446, + "learning_rate": 9.578773487507744e-05, + "loss": 1.0206, + "step": 1515 + }, + { + "epoch": 0.06151355726426548, + "grad_norm": 1.3233214616775513, + "learning_rate": 9.576708651662193e-05, + "loss": 1.0825, + "step": 1520 + }, + { + "epoch": 0.06171590449210846, + "grad_norm": 1.4564628601074219, + "learning_rate": 9.574643815816643e-05, + "loss": 1.0681, + "step": 1525 + }, + { + "epoch": 0.06191825171995144, + "grad_norm": 1.3113048076629639, + "learning_rate": 9.572578979971092e-05, + "loss": 1.0998, + "step": 1530 + }, + { + "epoch": 0.06212059894779442, + "grad_norm": 1.437929630279541, + "learning_rate": 9.570514144125542e-05, + "loss": 0.9953, + "step": 1535 + }, + { + "epoch": 0.06232294617563739, + "grad_norm": 1.33054518699646, + "learning_rate": 9.568449308279992e-05, + "loss": 1.0731, + "step": 1540 + }, + { + "epoch": 0.06252529340348037, + "grad_norm": 1.2281547784805298, + "learning_rate": 9.566384472434442e-05, + "loss": 1.046, + "step": 1545 + }, + { + "epoch": 0.06272764063132336, + "grad_norm": 1.413252830505371, + "learning_rate": 9.564319636588891e-05, + "loss": 1.0632, + "step": 1550 + }, + { + "epoch": 0.06292998785916633, + "grad_norm": 1.4634878635406494, + "learning_rate": 9.56225480074334e-05, + "loss": 0.9653, + "step": 1555 + }, + { + "epoch": 0.06313233508700931, + "grad_norm": 1.1909383535385132, + "learning_rate": 9.560189964897791e-05, + "loss": 0.9952, + "step": 1560 + }, + { + "epoch": 0.06333468231485229, + "grad_norm": 1.3073731660842896, + "learning_rate": 9.558125129052241e-05, + "loss": 1.0672, + "step": 1565 + }, + { + "epoch": 0.06353702954269526, + "grad_norm": 1.221425175666809, + "learning_rate": 9.55606029320669e-05, + "loss": 1.0212, + "step": 1570 + }, + { + "epoch": 0.06373937677053824, + "grad_norm": 1.3182533979415894, + "learning_rate": 9.553995457361141e-05, + "loss": 1.0355, + "step": 1575 + }, + { + "epoch": 0.06394172399838122, + "grad_norm": 1.2224425077438354, + "learning_rate": 9.55193062151559e-05, + "loss": 0.9842, + "step": 1580 + }, + { + "epoch": 0.0641440712262242, + "grad_norm": 1.3230282068252563, + "learning_rate": 9.54986578567004e-05, + "loss": 0.9847, + "step": 1585 + }, + { + "epoch": 0.06434641845406718, + "grad_norm": 1.6240323781967163, + "learning_rate": 9.547800949824489e-05, + "loss": 1.0517, + "step": 1590 + }, + { + "epoch": 0.06454876568191016, + "grad_norm": 1.2214101552963257, + "learning_rate": 9.54573611397894e-05, + "loss": 1.0031, + "step": 1595 + }, + { + "epoch": 0.06475111290975313, + "grad_norm": 1.3238086700439453, + "learning_rate": 9.543671278133389e-05, + "loss": 0.9992, + "step": 1600 + }, + { + "epoch": 0.06495346013759612, + "grad_norm": 1.3082062005996704, + "learning_rate": 9.541606442287838e-05, + "loss": 1.0782, + "step": 1605 + }, + { + "epoch": 0.06515580736543909, + "grad_norm": 1.2175467014312744, + "learning_rate": 9.539541606442288e-05, + "loss": 1.0405, + "step": 1610 + }, + { + "epoch": 0.06535815459328208, + "grad_norm": 1.2372517585754395, + "learning_rate": 9.537476770596739e-05, + "loss": 1.0847, + "step": 1615 + }, + { + "epoch": 0.06556050182112505, + "grad_norm": 1.3659371137619019, + "learning_rate": 9.535411934751188e-05, + "loss": 1.0472, + "step": 1620 + }, + { + "epoch": 0.06576284904896802, + "grad_norm": 1.4121381044387817, + "learning_rate": 9.533347098905637e-05, + "loss": 1.0403, + "step": 1625 + }, + { + "epoch": 0.06596519627681101, + "grad_norm": 1.247285008430481, + "learning_rate": 9.531282263060087e-05, + "loss": 1.0667, + "step": 1630 + }, + { + "epoch": 0.06616754350465398, + "grad_norm": 1.2073017358779907, + "learning_rate": 9.529217427214537e-05, + "loss": 1.0118, + "step": 1635 + }, + { + "epoch": 0.06636989073249697, + "grad_norm": 1.5580484867095947, + "learning_rate": 9.527152591368987e-05, + "loss": 1.1244, + "step": 1640 + }, + { + "epoch": 0.06657223796033994, + "grad_norm": 1.1624681949615479, + "learning_rate": 9.525087755523437e-05, + "loss": 1.0397, + "step": 1645 + }, + { + "epoch": 0.06677458518818292, + "grad_norm": 1.4433684349060059, + "learning_rate": 9.523022919677886e-05, + "loss": 1.0386, + "step": 1650 + }, + { + "epoch": 0.0669769324160259, + "grad_norm": 1.217288613319397, + "learning_rate": 9.520958083832335e-05, + "loss": 1.0078, + "step": 1655 + }, + { + "epoch": 0.06717927964386888, + "grad_norm": 1.3176137208938599, + "learning_rate": 9.518893247986785e-05, + "loss": 1.0367, + "step": 1660 + }, + { + "epoch": 0.06738162687171186, + "grad_norm": 1.3557639122009277, + "learning_rate": 9.516828412141236e-05, + "loss": 1.0161, + "step": 1665 + }, + { + "epoch": 0.06758397409955484, + "grad_norm": 1.1859376430511475, + "learning_rate": 9.514763576295685e-05, + "loss": 1.0092, + "step": 1670 + }, + { + "epoch": 0.06778632132739781, + "grad_norm": 1.2193354368209839, + "learning_rate": 9.512698740450134e-05, + "loss": 1.0393, + "step": 1675 + }, + { + "epoch": 0.0679886685552408, + "grad_norm": 1.1776807308197021, + "learning_rate": 9.510633904604584e-05, + "loss": 1.0088, + "step": 1680 + }, + { + "epoch": 0.06819101578308377, + "grad_norm": 1.2251741886138916, + "learning_rate": 9.508569068759035e-05, + "loss": 1.0542, + "step": 1685 + }, + { + "epoch": 0.06839336301092674, + "grad_norm": 1.4066046476364136, + "learning_rate": 9.506504232913484e-05, + "loss": 1.0778, + "step": 1690 + }, + { + "epoch": 0.06859571023876973, + "grad_norm": 1.1775314807891846, + "learning_rate": 9.504439397067933e-05, + "loss": 0.9923, + "step": 1695 + }, + { + "epoch": 0.0687980574666127, + "grad_norm": 1.2167171239852905, + "learning_rate": 9.502374561222383e-05, + "loss": 0.9782, + "step": 1700 + }, + { + "epoch": 0.06900040469445569, + "grad_norm": 1.356432557106018, + "learning_rate": 9.500309725376832e-05, + "loss": 1.0112, + "step": 1705 + }, + { + "epoch": 0.06920275192229866, + "grad_norm": 1.240261435508728, + "learning_rate": 9.498244889531283e-05, + "loss": 1.0609, + "step": 1710 + }, + { + "epoch": 0.06940509915014165, + "grad_norm": 1.2335928678512573, + "learning_rate": 9.496180053685733e-05, + "loss": 1.0303, + "step": 1715 + }, + { + "epoch": 0.06960744637798462, + "grad_norm": 1.190518856048584, + "learning_rate": 9.494115217840182e-05, + "loss": 1.0079, + "step": 1720 + }, + { + "epoch": 0.0698097936058276, + "grad_norm": 1.315698504447937, + "learning_rate": 9.492050381994631e-05, + "loss": 0.9892, + "step": 1725 + }, + { + "epoch": 0.07001214083367058, + "grad_norm": 1.4281184673309326, + "learning_rate": 9.489985546149082e-05, + "loss": 1.0088, + "step": 1730 + }, + { + "epoch": 0.07021448806151356, + "grad_norm": 1.3519691228866577, + "learning_rate": 9.487920710303532e-05, + "loss": 1.0213, + "step": 1735 + }, + { + "epoch": 0.07041683528935654, + "grad_norm": 1.1940042972564697, + "learning_rate": 9.485855874457981e-05, + "loss": 1.0422, + "step": 1740 + }, + { + "epoch": 0.07061918251719951, + "grad_norm": 1.2125798463821411, + "learning_rate": 9.48379103861243e-05, + "loss": 1.0255, + "step": 1745 + }, + { + "epoch": 0.0708215297450425, + "grad_norm": 1.251612901687622, + "learning_rate": 9.48172620276688e-05, + "loss": 1.0212, + "step": 1750 + }, + { + "epoch": 0.07102387697288547, + "grad_norm": 1.4712817668914795, + "learning_rate": 9.47966136692133e-05, + "loss": 1.0403, + "step": 1755 + }, + { + "epoch": 0.07122622420072845, + "grad_norm": 1.2654145956039429, + "learning_rate": 9.47759653107578e-05, + "loss": 1.0262, + "step": 1760 + }, + { + "epoch": 0.07142857142857142, + "grad_norm": 1.3461631536483765, + "learning_rate": 9.475531695230229e-05, + "loss": 1.0176, + "step": 1765 + }, + { + "epoch": 0.07163091865641441, + "grad_norm": 1.3915088176727295, + "learning_rate": 9.47346685938468e-05, + "loss": 1.0428, + "step": 1770 + }, + { + "epoch": 0.07183326588425738, + "grad_norm": 1.377109169960022, + "learning_rate": 9.471402023539129e-05, + "loss": 1.0332, + "step": 1775 + }, + { + "epoch": 0.07203561311210037, + "grad_norm": 1.136925220489502, + "learning_rate": 9.469337187693579e-05, + "loss": 1.0588, + "step": 1780 + }, + { + "epoch": 0.07223796033994334, + "grad_norm": 1.3466987609863281, + "learning_rate": 9.46727235184803e-05, + "loss": 1.0595, + "step": 1785 + }, + { + "epoch": 0.07244030756778633, + "grad_norm": 1.435737133026123, + "learning_rate": 9.465207516002478e-05, + "loss": 1.0148, + "step": 1790 + }, + { + "epoch": 0.0726426547956293, + "grad_norm": 1.4260509014129639, + "learning_rate": 9.463142680156927e-05, + "loss": 1.1411, + "step": 1795 + }, + { + "epoch": 0.07284500202347227, + "grad_norm": 1.1842601299285889, + "learning_rate": 9.461077844311378e-05, + "loss": 1.043, + "step": 1800 + }, + { + "epoch": 0.07304734925131526, + "grad_norm": 1.2776232957839966, + "learning_rate": 9.459013008465828e-05, + "loss": 1.0548, + "step": 1805 + }, + { + "epoch": 0.07324969647915823, + "grad_norm": 1.310470700263977, + "learning_rate": 9.456948172620277e-05, + "loss": 1.0171, + "step": 1810 + }, + { + "epoch": 0.07345204370700122, + "grad_norm": 1.2996270656585693, + "learning_rate": 9.454883336774726e-05, + "loss": 1.0352, + "step": 1815 + }, + { + "epoch": 0.07365439093484419, + "grad_norm": 1.381764531135559, + "learning_rate": 9.452818500929177e-05, + "loss": 1.0938, + "step": 1820 + }, + { + "epoch": 0.07385673816268717, + "grad_norm": 1.3565207719802856, + "learning_rate": 9.450753665083626e-05, + "loss": 1.0069, + "step": 1825 + }, + { + "epoch": 0.07405908539053015, + "grad_norm": 1.2954041957855225, + "learning_rate": 9.448688829238076e-05, + "loss": 0.9616, + "step": 1830 + }, + { + "epoch": 0.07426143261837313, + "grad_norm": 1.3097357749938965, + "learning_rate": 9.446623993392527e-05, + "loss": 1.0161, + "step": 1835 + }, + { + "epoch": 0.0744637798462161, + "grad_norm": 1.1322047710418701, + "learning_rate": 9.444559157546974e-05, + "loss": 0.9654, + "step": 1840 + }, + { + "epoch": 0.07466612707405909, + "grad_norm": 1.252774715423584, + "learning_rate": 9.442494321701425e-05, + "loss": 1.0625, + "step": 1845 + }, + { + "epoch": 0.07486847430190206, + "grad_norm": 1.0868663787841797, + "learning_rate": 9.440429485855875e-05, + "loss": 1.0023, + "step": 1850 + }, + { + "epoch": 0.07507082152974505, + "grad_norm": 1.290256381034851, + "learning_rate": 9.438364650010326e-05, + "loss": 1.018, + "step": 1855 + }, + { + "epoch": 0.07527316875758802, + "grad_norm": 1.159004807472229, + "learning_rate": 9.436299814164775e-05, + "loss": 1.0381, + "step": 1860 + }, + { + "epoch": 0.075475515985431, + "grad_norm": 1.345625400543213, + "learning_rate": 9.434234978319224e-05, + "loss": 1.0238, + "step": 1865 + }, + { + "epoch": 0.07567786321327398, + "grad_norm": 1.199846625328064, + "learning_rate": 9.432170142473674e-05, + "loss": 1.0145, + "step": 1870 + }, + { + "epoch": 0.07588021044111695, + "grad_norm": 1.122860074043274, + "learning_rate": 9.430105306628123e-05, + "loss": 0.9912, + "step": 1875 + }, + { + "epoch": 0.07608255766895994, + "grad_norm": 1.387333631515503, + "learning_rate": 9.428040470782574e-05, + "loss": 1.0813, + "step": 1880 + }, + { + "epoch": 0.07628490489680291, + "grad_norm": 1.2576044797897339, + "learning_rate": 9.425975634937023e-05, + "loss": 1.1047, + "step": 1885 + }, + { + "epoch": 0.0764872521246459, + "grad_norm": 1.2864686250686646, + "learning_rate": 9.423910799091473e-05, + "loss": 1.0696, + "step": 1890 + }, + { + "epoch": 0.07668959935248887, + "grad_norm": 1.2306206226348877, + "learning_rate": 9.421845963245922e-05, + "loss": 0.9927, + "step": 1895 + }, + { + "epoch": 0.07689194658033185, + "grad_norm": 1.1592239141464233, + "learning_rate": 9.419781127400372e-05, + "loss": 1.0237, + "step": 1900 + }, + { + "epoch": 0.07709429380817483, + "grad_norm": 1.3854925632476807, + "learning_rate": 9.417716291554823e-05, + "loss": 1.0869, + "step": 1905 + }, + { + "epoch": 0.07729664103601781, + "grad_norm": 1.3255068063735962, + "learning_rate": 9.41565145570927e-05, + "loss": 1.0857, + "step": 1910 + }, + { + "epoch": 0.07749898826386079, + "grad_norm": 1.4847171306610107, + "learning_rate": 9.413586619863721e-05, + "loss": 1.0406, + "step": 1915 + }, + { + "epoch": 0.07770133549170376, + "grad_norm": 1.382543921470642, + "learning_rate": 9.411521784018171e-05, + "loss": 1.0376, + "step": 1920 + }, + { + "epoch": 0.07790368271954674, + "grad_norm": 1.2871674299240112, + "learning_rate": 9.40945694817262e-05, + "loss": 1.0433, + "step": 1925 + }, + { + "epoch": 0.07810602994738972, + "grad_norm": 1.2749968767166138, + "learning_rate": 9.407392112327071e-05, + "loss": 1.0692, + "step": 1930 + }, + { + "epoch": 0.0783083771752327, + "grad_norm": 1.2841780185699463, + "learning_rate": 9.40532727648152e-05, + "loss": 1.0352, + "step": 1935 + }, + { + "epoch": 0.07851072440307567, + "grad_norm": 1.3862274885177612, + "learning_rate": 9.40326244063597e-05, + "loss": 1.0197, + "step": 1940 + }, + { + "epoch": 0.07871307163091866, + "grad_norm": 1.2190536260604858, + "learning_rate": 9.40119760479042e-05, + "loss": 1.0357, + "step": 1945 + }, + { + "epoch": 0.07891541885876163, + "grad_norm": 1.2616676092147827, + "learning_rate": 9.39913276894487e-05, + "loss": 1.03, + "step": 1950 + }, + { + "epoch": 0.07911776608660462, + "grad_norm": 1.3833273649215698, + "learning_rate": 9.397067933099319e-05, + "loss": 1.0219, + "step": 1955 + }, + { + "epoch": 0.07932011331444759, + "grad_norm": 1.2849769592285156, + "learning_rate": 9.395003097253768e-05, + "loss": 1.002, + "step": 1960 + }, + { + "epoch": 0.07952246054229058, + "grad_norm": 1.333940863609314, + "learning_rate": 9.392938261408218e-05, + "loss": 0.9834, + "step": 1965 + }, + { + "epoch": 0.07972480777013355, + "grad_norm": 1.343896508216858, + "learning_rate": 9.390873425562669e-05, + "loss": 1.0425, + "step": 1970 + }, + { + "epoch": 0.07992715499797652, + "grad_norm": 1.1878740787506104, + "learning_rate": 9.388808589717118e-05, + "loss": 1.0398, + "step": 1975 + }, + { + "epoch": 0.08012950222581951, + "grad_norm": 1.4231536388397217, + "learning_rate": 9.386743753871567e-05, + "loss": 0.982, + "step": 1980 + }, + { + "epoch": 0.08033184945366248, + "grad_norm": 1.3932554721832275, + "learning_rate": 9.384678918026017e-05, + "loss": 1.0108, + "step": 1985 + }, + { + "epoch": 0.08053419668150547, + "grad_norm": 1.2101713418960571, + "learning_rate": 9.382614082180468e-05, + "loss": 1.002, + "step": 1990 + }, + { + "epoch": 0.08073654390934844, + "grad_norm": 1.2729885578155518, + "learning_rate": 9.380549246334917e-05, + "loss": 1.0472, + "step": 1995 + }, + { + "epoch": 0.08093889113719142, + "grad_norm": 1.488101601600647, + "learning_rate": 9.378484410489367e-05, + "loss": 1.0282, + "step": 2000 + }, + { + "epoch": 0.0811412383650344, + "grad_norm": 1.326236605644226, + "learning_rate": 9.376419574643816e-05, + "loss": 1.029, + "step": 2005 + }, + { + "epoch": 0.08134358559287738, + "grad_norm": 1.3277944326400757, + "learning_rate": 9.374354738798265e-05, + "loss": 1.041, + "step": 2010 + }, + { + "epoch": 0.08154593282072035, + "grad_norm": 1.202103614807129, + "learning_rate": 9.372289902952716e-05, + "loss": 1.0915, + "step": 2015 + }, + { + "epoch": 0.08174828004856334, + "grad_norm": 1.2832789421081543, + "learning_rate": 9.370225067107166e-05, + "loss": 0.9902, + "step": 2020 + }, + { + "epoch": 0.08195062727640631, + "grad_norm": 1.2172725200653076, + "learning_rate": 9.368160231261615e-05, + "loss": 1.0475, + "step": 2025 + }, + { + "epoch": 0.0821529745042493, + "grad_norm": 1.1746991872787476, + "learning_rate": 9.366095395416064e-05, + "loss": 1.0354, + "step": 2030 + }, + { + "epoch": 0.08235532173209227, + "grad_norm": 1.2759616374969482, + "learning_rate": 9.364030559570515e-05, + "loss": 0.9939, + "step": 2035 + }, + { + "epoch": 0.08255766895993524, + "grad_norm": 1.2615772485733032, + "learning_rate": 9.361965723724965e-05, + "loss": 1.0286, + "step": 2040 + }, + { + "epoch": 0.08276001618777823, + "grad_norm": 1.3714519739151, + "learning_rate": 9.359900887879414e-05, + "loss": 1.0196, + "step": 2045 + }, + { + "epoch": 0.0829623634156212, + "grad_norm": 1.1940999031066895, + "learning_rate": 9.357836052033864e-05, + "loss": 1.0553, + "step": 2050 + }, + { + "epoch": 0.08316471064346419, + "grad_norm": 1.2421550750732422, + "learning_rate": 9.355771216188313e-05, + "loss": 1.0067, + "step": 2055 + }, + { + "epoch": 0.08336705787130716, + "grad_norm": 1.223291277885437, + "learning_rate": 9.353706380342762e-05, + "loss": 1.0034, + "step": 2060 + }, + { + "epoch": 0.08356940509915015, + "grad_norm": 1.191751480102539, + "learning_rate": 9.351641544497213e-05, + "loss": 1.0465, + "step": 2065 + }, + { + "epoch": 0.08377175232699312, + "grad_norm": 1.1390283107757568, + "learning_rate": 9.349576708651663e-05, + "loss": 1.0437, + "step": 2070 + }, + { + "epoch": 0.0839740995548361, + "grad_norm": 1.1788053512573242, + "learning_rate": 9.347511872806112e-05, + "loss": 0.9663, + "step": 2075 + }, + { + "epoch": 0.08417644678267908, + "grad_norm": 1.1897859573364258, + "learning_rate": 9.345447036960561e-05, + "loss": 1.0421, + "step": 2080 + }, + { + "epoch": 0.08437879401052206, + "grad_norm": 1.1853989362716675, + "learning_rate": 9.343382201115012e-05, + "loss": 1.0293, + "step": 2085 + }, + { + "epoch": 0.08458114123836503, + "grad_norm": 1.0884404182434082, + "learning_rate": 9.341317365269462e-05, + "loss": 0.9338, + "step": 2090 + }, + { + "epoch": 0.084783488466208, + "grad_norm": 1.1858789920806885, + "learning_rate": 9.339252529423911e-05, + "loss": 1.0303, + "step": 2095 + }, + { + "epoch": 0.08498583569405099, + "grad_norm": 1.2867400646209717, + "learning_rate": 9.33718769357836e-05, + "loss": 1.0058, + "step": 2100 + }, + { + "epoch": 0.08518818292189397, + "grad_norm": 1.1759954690933228, + "learning_rate": 9.335122857732811e-05, + "loss": 1.0497, + "step": 2105 + }, + { + "epoch": 0.08539053014973695, + "grad_norm": 1.2815693616867065, + "learning_rate": 9.33305802188726e-05, + "loss": 1.0262, + "step": 2110 + }, + { + "epoch": 0.08559287737757992, + "grad_norm": 1.2004886865615845, + "learning_rate": 9.33099318604171e-05, + "loss": 1.0051, + "step": 2115 + }, + { + "epoch": 0.08579522460542291, + "grad_norm": 1.178978681564331, + "learning_rate": 9.32892835019616e-05, + "loss": 1.0415, + "step": 2120 + }, + { + "epoch": 0.08599757183326588, + "grad_norm": 1.2252072095870972, + "learning_rate": 9.32686351435061e-05, + "loss": 0.999, + "step": 2125 + }, + { + "epoch": 0.08619991906110887, + "grad_norm": 1.2782108783721924, + "learning_rate": 9.324798678505059e-05, + "loss": 1.0602, + "step": 2130 + }, + { + "epoch": 0.08640226628895184, + "grad_norm": 1.1181281805038452, + "learning_rate": 9.322733842659509e-05, + "loss": 0.9785, + "step": 2135 + }, + { + "epoch": 0.08660461351679483, + "grad_norm": 1.2799198627471924, + "learning_rate": 9.32066900681396e-05, + "loss": 0.9867, + "step": 2140 + }, + { + "epoch": 0.0868069607446378, + "grad_norm": 1.1740738153457642, + "learning_rate": 9.318604170968409e-05, + "loss": 0.9925, + "step": 2145 + }, + { + "epoch": 0.08700930797248077, + "grad_norm": 1.1738531589508057, + "learning_rate": 9.316539335122858e-05, + "loss": 1.0328, + "step": 2150 + }, + { + "epoch": 0.08721165520032376, + "grad_norm": 1.2151775360107422, + "learning_rate": 9.314474499277308e-05, + "loss": 1.0374, + "step": 2155 + }, + { + "epoch": 0.08741400242816673, + "grad_norm": 1.2134398221969604, + "learning_rate": 9.312409663431758e-05, + "loss": 1.0062, + "step": 2160 + }, + { + "epoch": 0.08761634965600971, + "grad_norm": 1.2791131734848022, + "learning_rate": 9.310344827586207e-05, + "loss": 1.1111, + "step": 2165 + }, + { + "epoch": 0.08781869688385269, + "grad_norm": 1.198521614074707, + "learning_rate": 9.308279991740657e-05, + "loss": 1.0132, + "step": 2170 + }, + { + "epoch": 0.08802104411169567, + "grad_norm": 1.2800092697143555, + "learning_rate": 9.306215155895107e-05, + "loss": 1.051, + "step": 2175 + }, + { + "epoch": 0.08822339133953865, + "grad_norm": 1.3057935237884521, + "learning_rate": 9.304150320049556e-05, + "loss": 1.1417, + "step": 2180 + }, + { + "epoch": 0.08842573856738163, + "grad_norm": 1.2939696311950684, + "learning_rate": 9.302085484204006e-05, + "loss": 0.9713, + "step": 2185 + }, + { + "epoch": 0.0886280857952246, + "grad_norm": 1.4257221221923828, + "learning_rate": 9.300020648358457e-05, + "loss": 1.0949, + "step": 2190 + }, + { + "epoch": 0.08883043302306759, + "grad_norm": 1.3082605600357056, + "learning_rate": 9.297955812512905e-05, + "loss": 1.0281, + "step": 2195 + }, + { + "epoch": 0.08903278025091056, + "grad_norm": 1.3272719383239746, + "learning_rate": 9.295890976667355e-05, + "loss": 1.0791, + "step": 2200 + }, + { + "epoch": 0.08923512747875353, + "grad_norm": 1.2881433963775635, + "learning_rate": 9.293826140821805e-05, + "loss": 1.0072, + "step": 2205 + }, + { + "epoch": 0.08943747470659652, + "grad_norm": 1.3805770874023438, + "learning_rate": 9.291761304976256e-05, + "loss": 1.0069, + "step": 2210 + }, + { + "epoch": 0.08963982193443949, + "grad_norm": 1.2969526052474976, + "learning_rate": 9.289696469130705e-05, + "loss": 1.0512, + "step": 2215 + }, + { + "epoch": 0.08984216916228248, + "grad_norm": 1.3401156663894653, + "learning_rate": 9.287631633285154e-05, + "loss": 1.0234, + "step": 2220 + }, + { + "epoch": 0.09004451639012545, + "grad_norm": 1.2723054885864258, + "learning_rate": 9.285566797439604e-05, + "loss": 1.0485, + "step": 2225 + }, + { + "epoch": 0.09024686361796844, + "grad_norm": 1.2819281816482544, + "learning_rate": 9.283501961594053e-05, + "loss": 1.0066, + "step": 2230 + }, + { + "epoch": 0.09044921084581141, + "grad_norm": 1.3629616498947144, + "learning_rate": 9.281437125748504e-05, + "loss": 1.0007, + "step": 2235 + }, + { + "epoch": 0.0906515580736544, + "grad_norm": 1.213684320449829, + "learning_rate": 9.279372289902953e-05, + "loss": 0.9829, + "step": 2240 + }, + { + "epoch": 0.09085390530149737, + "grad_norm": 1.2831264734268188, + "learning_rate": 9.277307454057403e-05, + "loss": 1.0386, + "step": 2245 + }, + { + "epoch": 0.09105625252934035, + "grad_norm": 1.2570645809173584, + "learning_rate": 9.275242618211852e-05, + "loss": 1.0436, + "step": 2250 + }, + { + "epoch": 0.09125859975718333, + "grad_norm": 1.1203771829605103, + "learning_rate": 9.273177782366303e-05, + "loss": 0.9944, + "step": 2255 + }, + { + "epoch": 0.09146094698502631, + "grad_norm": 1.2696106433868408, + "learning_rate": 9.271112946520753e-05, + "loss": 1.0134, + "step": 2260 + }, + { + "epoch": 0.09166329421286928, + "grad_norm": 1.2304413318634033, + "learning_rate": 9.269048110675202e-05, + "loss": 1.0011, + "step": 2265 + }, + { + "epoch": 0.09186564144071226, + "grad_norm": 1.2513751983642578, + "learning_rate": 9.266983274829651e-05, + "loss": 1.0538, + "step": 2270 + }, + { + "epoch": 0.09206798866855524, + "grad_norm": 1.2047661542892456, + "learning_rate": 9.264918438984102e-05, + "loss": 0.9995, + "step": 2275 + }, + { + "epoch": 0.09227033589639821, + "grad_norm": 1.3607407808303833, + "learning_rate": 9.26285360313855e-05, + "loss": 1.0474, + "step": 2280 + }, + { + "epoch": 0.0924726831242412, + "grad_norm": 1.2867265939712524, + "learning_rate": 9.260788767293001e-05, + "loss": 1.0599, + "step": 2285 + }, + { + "epoch": 0.09267503035208417, + "grad_norm": 1.1794477701187134, + "learning_rate": 9.25872393144745e-05, + "loss": 1.0008, + "step": 2290 + }, + { + "epoch": 0.09287737757992716, + "grad_norm": 1.3165411949157715, + "learning_rate": 9.2566590956019e-05, + "loss": 1.0157, + "step": 2295 + }, + { + "epoch": 0.09307972480777013, + "grad_norm": 1.1853994131088257, + "learning_rate": 9.25459425975635e-05, + "loss": 1.079, + "step": 2300 + }, + { + "epoch": 0.09328207203561312, + "grad_norm": 1.2266639471054077, + "learning_rate": 9.2525294239108e-05, + "loss": 0.9848, + "step": 2305 + }, + { + "epoch": 0.09348441926345609, + "grad_norm": 1.2124624252319336, + "learning_rate": 9.250464588065249e-05, + "loss": 1.046, + "step": 2310 + }, + { + "epoch": 0.09368676649129908, + "grad_norm": 1.186888337135315, + "learning_rate": 9.248399752219698e-05, + "loss": 1.0597, + "step": 2315 + }, + { + "epoch": 0.09388911371914205, + "grad_norm": 1.2497700452804565, + "learning_rate": 9.246334916374148e-05, + "loss": 1.0414, + "step": 2320 + }, + { + "epoch": 0.09409146094698502, + "grad_norm": 1.2072032690048218, + "learning_rate": 9.244270080528599e-05, + "loss": 1.0096, + "step": 2325 + }, + { + "epoch": 0.094293808174828, + "grad_norm": 1.2232177257537842, + "learning_rate": 9.242205244683048e-05, + "loss": 1.0635, + "step": 2330 + }, + { + "epoch": 0.09449615540267098, + "grad_norm": 1.243944764137268, + "learning_rate": 9.240140408837498e-05, + "loss": 1.0287, + "step": 2335 + }, + { + "epoch": 0.09469850263051396, + "grad_norm": 1.3213430643081665, + "learning_rate": 9.238075572991947e-05, + "loss": 1.0428, + "step": 2340 + }, + { + "epoch": 0.09490084985835694, + "grad_norm": 1.518526315689087, + "learning_rate": 9.236010737146398e-05, + "loss": 0.9719, + "step": 2345 + }, + { + "epoch": 0.09510319708619992, + "grad_norm": 1.4371111392974854, + "learning_rate": 9.233945901300847e-05, + "loss": 1.0664, + "step": 2350 + }, + { + "epoch": 0.0953055443140429, + "grad_norm": 1.2702082395553589, + "learning_rate": 9.231881065455297e-05, + "loss": 1.0641, + "step": 2355 + }, + { + "epoch": 0.09550789154188588, + "grad_norm": 1.4100208282470703, + "learning_rate": 9.229816229609746e-05, + "loss": 0.9899, + "step": 2360 + }, + { + "epoch": 0.09571023876972885, + "grad_norm": 1.2306245565414429, + "learning_rate": 9.227751393764195e-05, + "loss": 0.9753, + "step": 2365 + }, + { + "epoch": 0.09591258599757184, + "grad_norm": 1.3112229108810425, + "learning_rate": 9.225686557918646e-05, + "loss": 1.0417, + "step": 2370 + }, + { + "epoch": 0.09611493322541481, + "grad_norm": 1.2760628461837769, + "learning_rate": 9.223621722073096e-05, + "loss": 1.01, + "step": 2375 + }, + { + "epoch": 0.09631728045325778, + "grad_norm": 1.2782249450683594, + "learning_rate": 9.221556886227547e-05, + "loss": 1.0407, + "step": 2380 + }, + { + "epoch": 0.09651962768110077, + "grad_norm": 1.4450081586837769, + "learning_rate": 9.219492050381994e-05, + "loss": 1.0293, + "step": 2385 + }, + { + "epoch": 0.09672197490894374, + "grad_norm": 1.0772294998168945, + "learning_rate": 9.217427214536445e-05, + "loss": 1.025, + "step": 2390 + }, + { + "epoch": 0.09692432213678673, + "grad_norm": 1.2389899492263794, + "learning_rate": 9.215362378690895e-05, + "loss": 1.0022, + "step": 2395 + }, + { + "epoch": 0.0971266693646297, + "grad_norm": 1.2315919399261475, + "learning_rate": 9.213297542845344e-05, + "loss": 1.0203, + "step": 2400 + }, + { + "epoch": 0.09732901659247269, + "grad_norm": 1.1966350078582764, + "learning_rate": 9.211232706999795e-05, + "loss": 0.9981, + "step": 2405 + }, + { + "epoch": 0.09753136382031566, + "grad_norm": 1.2289438247680664, + "learning_rate": 9.209167871154244e-05, + "loss": 1.0131, + "step": 2410 + }, + { + "epoch": 0.09773371104815864, + "grad_norm": 1.2669563293457031, + "learning_rate": 9.207103035308693e-05, + "loss": 1.0016, + "step": 2415 + }, + { + "epoch": 0.09793605827600162, + "grad_norm": 1.2027100324630737, + "learning_rate": 9.205038199463143e-05, + "loss": 1.0285, + "step": 2420 + }, + { + "epoch": 0.0981384055038446, + "grad_norm": 1.133360743522644, + "learning_rate": 9.202973363617593e-05, + "loss": 0.9646, + "step": 2425 + }, + { + "epoch": 0.09834075273168758, + "grad_norm": 1.3844754695892334, + "learning_rate": 9.200908527772042e-05, + "loss": 1.0183, + "step": 2430 + }, + { + "epoch": 0.09854309995953056, + "grad_norm": 1.2737170457839966, + "learning_rate": 9.198843691926492e-05, + "loss": 0.9493, + "step": 2435 + }, + { + "epoch": 0.09874544718737353, + "grad_norm": 1.3229756355285645, + "learning_rate": 9.196778856080942e-05, + "loss": 1.0441, + "step": 2440 + }, + { + "epoch": 0.0989477944152165, + "grad_norm": 1.3351441621780396, + "learning_rate": 9.194714020235392e-05, + "loss": 0.9643, + "step": 2445 + }, + { + "epoch": 0.09915014164305949, + "grad_norm": 1.4130533933639526, + "learning_rate": 9.192649184389841e-05, + "loss": 1.0617, + "step": 2450 + }, + { + "epoch": 0.09935248887090246, + "grad_norm": 1.3605046272277832, + "learning_rate": 9.19058434854429e-05, + "loss": 1.0063, + "step": 2455 + }, + { + "epoch": 0.09955483609874545, + "grad_norm": 1.1895331144332886, + "learning_rate": 9.188519512698741e-05, + "loss": 1.0004, + "step": 2460 + }, + { + "epoch": 0.09975718332658842, + "grad_norm": 1.2517513036727905, + "learning_rate": 9.186454676853191e-05, + "loss": 1.037, + "step": 2465 + }, + { + "epoch": 0.09995953055443141, + "grad_norm": 1.2222354412078857, + "learning_rate": 9.18438984100764e-05, + "loss": 1.0397, + "step": 2470 + }, + { + "epoch": 0.10016187778227438, + "grad_norm": 1.2808754444122314, + "learning_rate": 9.182325005162091e-05, + "loss": 1.016, + "step": 2475 + }, + { + "epoch": 0.10036422501011737, + "grad_norm": 1.2805005311965942, + "learning_rate": 9.18026016931654e-05, + "loss": 0.9915, + "step": 2480 + }, + { + "epoch": 0.10056657223796034, + "grad_norm": 1.2117308378219604, + "learning_rate": 9.178195333470989e-05, + "loss": 1.0035, + "step": 2485 + }, + { + "epoch": 0.10076891946580332, + "grad_norm": 1.232007622718811, + "learning_rate": 9.176130497625439e-05, + "loss": 1.0447, + "step": 2490 + }, + { + "epoch": 0.1009712666936463, + "grad_norm": 1.306788444519043, + "learning_rate": 9.17406566177989e-05, + "loss": 1.0473, + "step": 2495 + }, + { + "epoch": 0.10117361392148927, + "grad_norm": 1.1974440813064575, + "learning_rate": 9.172000825934339e-05, + "loss": 1.0293, + "step": 2500 + }, + { + "epoch": 0.10137596114933226, + "grad_norm": 1.2958585023880005, + "learning_rate": 9.169935990088788e-05, + "loss": 1.0503, + "step": 2505 + }, + { + "epoch": 0.10157830837717523, + "grad_norm": 1.1704072952270508, + "learning_rate": 9.167871154243238e-05, + "loss": 1.0912, + "step": 2510 + }, + { + "epoch": 0.10178065560501821, + "grad_norm": 1.2891044616699219, + "learning_rate": 9.165806318397689e-05, + "loss": 1.0266, + "step": 2515 + }, + { + "epoch": 0.10198300283286119, + "grad_norm": 1.2312427759170532, + "learning_rate": 9.163741482552138e-05, + "loss": 0.9993, + "step": 2520 + }, + { + "epoch": 0.10218535006070417, + "grad_norm": 1.2950193881988525, + "learning_rate": 9.161676646706587e-05, + "loss": 1.0399, + "step": 2525 + }, + { + "epoch": 0.10238769728854714, + "grad_norm": 1.2646780014038086, + "learning_rate": 9.159611810861037e-05, + "loss": 1.0725, + "step": 2530 + }, + { + "epoch": 0.10259004451639013, + "grad_norm": 1.1012483835220337, + "learning_rate": 9.157546975015486e-05, + "loss": 1.0173, + "step": 2535 + }, + { + "epoch": 0.1027923917442331, + "grad_norm": 1.278733491897583, + "learning_rate": 9.155482139169937e-05, + "loss": 1.0597, + "step": 2540 + }, + { + "epoch": 0.10299473897207609, + "grad_norm": 1.0903396606445312, + "learning_rate": 9.153417303324387e-05, + "loss": 0.9878, + "step": 2545 + }, + { + "epoch": 0.10319708619991906, + "grad_norm": 1.290555477142334, + "learning_rate": 9.151352467478836e-05, + "loss": 1.0107, + "step": 2550 + }, + { + "epoch": 0.10339943342776203, + "grad_norm": 1.3766034841537476, + "learning_rate": 9.149287631633285e-05, + "loss": 1.0373, + "step": 2555 + }, + { + "epoch": 0.10360178065560502, + "grad_norm": 1.2376962900161743, + "learning_rate": 9.147222795787735e-05, + "loss": 1.1126, + "step": 2560 + }, + { + "epoch": 0.10380412788344799, + "grad_norm": 1.1498472690582275, + "learning_rate": 9.145157959942186e-05, + "loss": 1.0453, + "step": 2565 + }, + { + "epoch": 0.10400647511129098, + "grad_norm": 1.2858022451400757, + "learning_rate": 9.143093124096635e-05, + "loss": 1.0428, + "step": 2570 + }, + { + "epoch": 0.10420882233913395, + "grad_norm": 1.219605803489685, + "learning_rate": 9.141028288251084e-05, + "loss": 1.0017, + "step": 2575 + }, + { + "epoch": 0.10441116956697694, + "grad_norm": 1.31840181350708, + "learning_rate": 9.138963452405534e-05, + "loss": 1.0432, + "step": 2580 + }, + { + "epoch": 0.10461351679481991, + "grad_norm": 1.096554160118103, + "learning_rate": 9.136898616559983e-05, + "loss": 0.9801, + "step": 2585 + }, + { + "epoch": 0.1048158640226629, + "grad_norm": 1.1494611501693726, + "learning_rate": 9.134833780714434e-05, + "loss": 1.0558, + "step": 2590 + }, + { + "epoch": 0.10501821125050587, + "grad_norm": 1.178409218788147, + "learning_rate": 9.132768944868884e-05, + "loss": 1.0043, + "step": 2595 + }, + { + "epoch": 0.10522055847834885, + "grad_norm": 1.216878056526184, + "learning_rate": 9.130704109023333e-05, + "loss": 1.0066, + "step": 2600 + }, + { + "epoch": 0.10542290570619182, + "grad_norm": 1.221721887588501, + "learning_rate": 9.128639273177782e-05, + "loss": 1.0711, + "step": 2605 + }, + { + "epoch": 0.10562525293403481, + "grad_norm": 1.1642242670059204, + "learning_rate": 9.126574437332233e-05, + "loss": 1.0198, + "step": 2610 + }, + { + "epoch": 0.10582760016187778, + "grad_norm": 1.2565926313400269, + "learning_rate": 9.124509601486683e-05, + "loss": 0.9287, + "step": 2615 + }, + { + "epoch": 0.10602994738972076, + "grad_norm": 1.2995041608810425, + "learning_rate": 9.122444765641132e-05, + "loss": 1.051, + "step": 2620 + }, + { + "epoch": 0.10623229461756374, + "grad_norm": 1.163234829902649, + "learning_rate": 9.120379929795581e-05, + "loss": 1.0064, + "step": 2625 + }, + { + "epoch": 0.10643464184540671, + "grad_norm": 1.3579676151275635, + "learning_rate": 9.118315093950032e-05, + "loss": 1.0226, + "step": 2630 + }, + { + "epoch": 0.1066369890732497, + "grad_norm": 1.2112040519714355, + "learning_rate": 9.116250258104481e-05, + "loss": 1.0841, + "step": 2635 + }, + { + "epoch": 0.10683933630109267, + "grad_norm": 1.1701689958572388, + "learning_rate": 9.114185422258931e-05, + "loss": 1.0144, + "step": 2640 + }, + { + "epoch": 0.10704168352893566, + "grad_norm": 1.2789571285247803, + "learning_rate": 9.11212058641338e-05, + "loss": 1.0044, + "step": 2645 + }, + { + "epoch": 0.10724403075677863, + "grad_norm": 1.1566429138183594, + "learning_rate": 9.11005575056783e-05, + "loss": 1.0511, + "step": 2650 + }, + { + "epoch": 0.10744637798462162, + "grad_norm": 1.3453134298324585, + "learning_rate": 9.10799091472228e-05, + "loss": 1.0144, + "step": 2655 + }, + { + "epoch": 0.10764872521246459, + "grad_norm": 1.2248982191085815, + "learning_rate": 9.10592607887673e-05, + "loss": 1.0253, + "step": 2660 + }, + { + "epoch": 0.10785107244030757, + "grad_norm": 1.1890755891799927, + "learning_rate": 9.10386124303118e-05, + "loss": 1.0566, + "step": 2665 + }, + { + "epoch": 0.10805341966815055, + "grad_norm": 1.3527295589447021, + "learning_rate": 9.101796407185628e-05, + "loss": 1.0318, + "step": 2670 + }, + { + "epoch": 0.10825576689599352, + "grad_norm": 1.111272931098938, + "learning_rate": 9.099731571340079e-05, + "loss": 1.0269, + "step": 2675 + }, + { + "epoch": 0.1084581141238365, + "grad_norm": 1.19521963596344, + "learning_rate": 9.097666735494529e-05, + "loss": 0.9893, + "step": 2680 + }, + { + "epoch": 0.10866046135167948, + "grad_norm": 1.269953966140747, + "learning_rate": 9.095601899648978e-05, + "loss": 0.9787, + "step": 2685 + }, + { + "epoch": 0.10886280857952246, + "grad_norm": 1.2876157760620117, + "learning_rate": 9.093537063803428e-05, + "loss": 0.981, + "step": 2690 + }, + { + "epoch": 0.10906515580736544, + "grad_norm": 1.2349538803100586, + "learning_rate": 9.091472227957877e-05, + "loss": 0.9918, + "step": 2695 + }, + { + "epoch": 0.10926750303520842, + "grad_norm": 1.266349196434021, + "learning_rate": 9.089407392112328e-05, + "loss": 0.9416, + "step": 2700 + }, + { + "epoch": 0.1094698502630514, + "grad_norm": 1.300087571144104, + "learning_rate": 9.087342556266777e-05, + "loss": 1.0499, + "step": 2705 + }, + { + "epoch": 0.10967219749089438, + "grad_norm": 1.2513761520385742, + "learning_rate": 9.085277720421227e-05, + "loss": 1.0398, + "step": 2710 + }, + { + "epoch": 0.10987454471873735, + "grad_norm": 1.3558810949325562, + "learning_rate": 9.083212884575676e-05, + "loss": 1.0786, + "step": 2715 + }, + { + "epoch": 0.11007689194658034, + "grad_norm": 1.3106088638305664, + "learning_rate": 9.081148048730125e-05, + "loss": 0.982, + "step": 2720 + }, + { + "epoch": 0.11027923917442331, + "grad_norm": 1.3098465204238892, + "learning_rate": 9.079083212884576e-05, + "loss": 0.9478, + "step": 2725 + }, + { + "epoch": 0.11048158640226628, + "grad_norm": 1.139407992362976, + "learning_rate": 9.077018377039026e-05, + "loss": 1.0434, + "step": 2730 + }, + { + "epoch": 0.11068393363010927, + "grad_norm": 1.3353989124298096, + "learning_rate": 9.074953541193477e-05, + "loss": 1.0305, + "step": 2735 + }, + { + "epoch": 0.11088628085795224, + "grad_norm": 1.0765186548233032, + "learning_rate": 9.072888705347924e-05, + "loss": 1.0019, + "step": 2740 + }, + { + "epoch": 0.11108862808579523, + "grad_norm": 1.3412421941757202, + "learning_rate": 9.070823869502375e-05, + "loss": 1.0322, + "step": 2745 + }, + { + "epoch": 0.1112909753136382, + "grad_norm": 1.3108158111572266, + "learning_rate": 9.068759033656825e-05, + "loss": 1.0105, + "step": 2750 + }, + { + "epoch": 0.11149332254148119, + "grad_norm": 1.2250312566757202, + "learning_rate": 9.066694197811274e-05, + "loss": 1.03, + "step": 2755 + }, + { + "epoch": 0.11169566976932416, + "grad_norm": 1.0880059003829956, + "learning_rate": 9.064629361965725e-05, + "loss": 1.0655, + "step": 2760 + }, + { + "epoch": 0.11189801699716714, + "grad_norm": 1.1405460834503174, + "learning_rate": 9.062564526120174e-05, + "loss": 1.0332, + "step": 2765 + }, + { + "epoch": 0.11210036422501012, + "grad_norm": 1.14668607711792, + "learning_rate": 9.060499690274623e-05, + "loss": 1.0427, + "step": 2770 + }, + { + "epoch": 0.1123027114528531, + "grad_norm": 1.274117112159729, + "learning_rate": 9.058434854429073e-05, + "loss": 1.0524, + "step": 2775 + }, + { + "epoch": 0.11250505868069607, + "grad_norm": 1.285851240158081, + "learning_rate": 9.056370018583524e-05, + "loss": 1.0323, + "step": 2780 + }, + { + "epoch": 0.11270740590853905, + "grad_norm": 1.1409989595413208, + "learning_rate": 9.054305182737973e-05, + "loss": 0.9721, + "step": 2785 + }, + { + "epoch": 0.11290975313638203, + "grad_norm": 1.226494550704956, + "learning_rate": 9.052240346892422e-05, + "loss": 0.9696, + "step": 2790 + }, + { + "epoch": 0.113112100364225, + "grad_norm": 1.3489365577697754, + "learning_rate": 9.050175511046872e-05, + "loss": 1.0354, + "step": 2795 + }, + { + "epoch": 0.11331444759206799, + "grad_norm": 1.182013988494873, + "learning_rate": 9.048110675201322e-05, + "loss": 1.0206, + "step": 2800 + }, + { + "epoch": 0.11351679481991096, + "grad_norm": 1.232530951499939, + "learning_rate": 9.046045839355772e-05, + "loss": 1.0469, + "step": 2805 + }, + { + "epoch": 0.11371914204775395, + "grad_norm": 1.1662116050720215, + "learning_rate": 9.043981003510222e-05, + "loss": 1.1351, + "step": 2810 + }, + { + "epoch": 0.11392148927559692, + "grad_norm": 1.294116735458374, + "learning_rate": 9.041916167664671e-05, + "loss": 0.9955, + "step": 2815 + }, + { + "epoch": 0.11412383650343991, + "grad_norm": 1.209054708480835, + "learning_rate": 9.039851331819121e-05, + "loss": 1.1027, + "step": 2820 + }, + { + "epoch": 0.11432618373128288, + "grad_norm": 1.2408312559127808, + "learning_rate": 9.03778649597357e-05, + "loss": 0.9704, + "step": 2825 + }, + { + "epoch": 0.11452853095912587, + "grad_norm": 1.1527987718582153, + "learning_rate": 9.035721660128021e-05, + "loss": 1.04, + "step": 2830 + }, + { + "epoch": 0.11473087818696884, + "grad_norm": 1.2887940406799316, + "learning_rate": 9.03365682428247e-05, + "loss": 0.9587, + "step": 2835 + }, + { + "epoch": 0.11493322541481182, + "grad_norm": 1.1732901334762573, + "learning_rate": 9.031591988436919e-05, + "loss": 1.0022, + "step": 2840 + }, + { + "epoch": 0.1151355726426548, + "grad_norm": 1.3233847618103027, + "learning_rate": 9.02952715259137e-05, + "loss": 1.0523, + "step": 2845 + }, + { + "epoch": 0.11533791987049777, + "grad_norm": 1.1359490156173706, + "learning_rate": 9.02746231674582e-05, + "loss": 1.022, + "step": 2850 + }, + { + "epoch": 0.11554026709834075, + "grad_norm": 1.1741466522216797, + "learning_rate": 9.025397480900269e-05, + "loss": 1.0289, + "step": 2855 + }, + { + "epoch": 0.11574261432618373, + "grad_norm": 1.1990710496902466, + "learning_rate": 9.023332645054718e-05, + "loss": 1.0145, + "step": 2860 + }, + { + "epoch": 0.11594496155402671, + "grad_norm": 1.251400351524353, + "learning_rate": 9.021267809209168e-05, + "loss": 1.019, + "step": 2865 + }, + { + "epoch": 0.11614730878186968, + "grad_norm": 1.3180302381515503, + "learning_rate": 9.019202973363619e-05, + "loss": 1.0802, + "step": 2870 + }, + { + "epoch": 0.11634965600971267, + "grad_norm": 1.184276819229126, + "learning_rate": 9.017138137518068e-05, + "loss": 0.9496, + "step": 2875 + }, + { + "epoch": 0.11655200323755564, + "grad_norm": 1.1231951713562012, + "learning_rate": 9.015073301672518e-05, + "loss": 1.0757, + "step": 2880 + }, + { + "epoch": 0.11675435046539863, + "grad_norm": 1.166365385055542, + "learning_rate": 9.013008465826967e-05, + "loss": 1.0515, + "step": 2885 + }, + { + "epoch": 0.1169566976932416, + "grad_norm": 1.0476434230804443, + "learning_rate": 9.010943629981416e-05, + "loss": 0.9998, + "step": 2890 + }, + { + "epoch": 0.11715904492108459, + "grad_norm": 1.2853447198867798, + "learning_rate": 9.008878794135867e-05, + "loss": 1.0104, + "step": 2895 + }, + { + "epoch": 0.11736139214892756, + "grad_norm": 1.2302528619766235, + "learning_rate": 9.006813958290317e-05, + "loss": 0.9935, + "step": 2900 + }, + { + "epoch": 0.11756373937677053, + "grad_norm": 1.1947234869003296, + "learning_rate": 9.004749122444766e-05, + "loss": 1.0557, + "step": 2905 + }, + { + "epoch": 0.11776608660461352, + "grad_norm": 1.4698901176452637, + "learning_rate": 9.002684286599215e-05, + "loss": 1.0283, + "step": 2910 + }, + { + "epoch": 0.11796843383245649, + "grad_norm": 1.3562864065170288, + "learning_rate": 9.000619450753666e-05, + "loss": 1.022, + "step": 2915 + }, + { + "epoch": 0.11817078106029948, + "grad_norm": 1.2436528205871582, + "learning_rate": 8.998554614908116e-05, + "loss": 1.0331, + "step": 2920 + }, + { + "epoch": 0.11837312828814245, + "grad_norm": 1.2349079847335815, + "learning_rate": 8.996489779062565e-05, + "loss": 1.0529, + "step": 2925 + }, + { + "epoch": 0.11857547551598543, + "grad_norm": 1.2597922086715698, + "learning_rate": 8.994424943217014e-05, + "loss": 1.0482, + "step": 2930 + }, + { + "epoch": 0.1187778227438284, + "grad_norm": 1.2937184572219849, + "learning_rate": 8.992360107371464e-05, + "loss": 1.0562, + "step": 2935 + }, + { + "epoch": 0.11898016997167139, + "grad_norm": 1.1591882705688477, + "learning_rate": 8.990295271525914e-05, + "loss": 1.0604, + "step": 2940 + }, + { + "epoch": 0.11918251719951437, + "grad_norm": 1.2187837362289429, + "learning_rate": 8.988230435680364e-05, + "loss": 1.0112, + "step": 2945 + }, + { + "epoch": 0.11938486442735735, + "grad_norm": 1.216113567352295, + "learning_rate": 8.986165599834814e-05, + "loss": 1.0057, + "step": 2950 + }, + { + "epoch": 0.11958721165520032, + "grad_norm": 1.253690242767334, + "learning_rate": 8.984100763989263e-05, + "loss": 0.9962, + "step": 2955 + }, + { + "epoch": 0.1197895588830433, + "grad_norm": 1.1752190589904785, + "learning_rate": 8.982035928143712e-05, + "loss": 0.9657, + "step": 2960 + }, + { + "epoch": 0.11999190611088628, + "grad_norm": 1.2840155363082886, + "learning_rate": 8.979971092298163e-05, + "loss": 1.0075, + "step": 2965 + }, + { + "epoch": 0.12019425333872925, + "grad_norm": 1.1470037698745728, + "learning_rate": 8.977906256452613e-05, + "loss": 1.0055, + "step": 2970 + }, + { + "epoch": 0.12039660056657224, + "grad_norm": 1.2354692220687866, + "learning_rate": 8.975841420607062e-05, + "loss": 1.0202, + "step": 2975 + }, + { + "epoch": 0.12059894779441521, + "grad_norm": 1.3677310943603516, + "learning_rate": 8.973776584761511e-05, + "loss": 1.0447, + "step": 2980 + }, + { + "epoch": 0.1208012950222582, + "grad_norm": 1.186793327331543, + "learning_rate": 8.971711748915962e-05, + "loss": 1.0494, + "step": 2985 + }, + { + "epoch": 0.12100364225010117, + "grad_norm": 1.5150768756866455, + "learning_rate": 8.969646913070411e-05, + "loss": 1.0144, + "step": 2990 + }, + { + "epoch": 0.12120598947794416, + "grad_norm": 1.2600922584533691, + "learning_rate": 8.967582077224861e-05, + "loss": 1.0463, + "step": 2995 + }, + { + "epoch": 0.12140833670578713, + "grad_norm": 1.2720588445663452, + "learning_rate": 8.96551724137931e-05, + "loss": 0.9943, + "step": 3000 + }, + { + "epoch": 0.12161068393363011, + "grad_norm": 1.140571117401123, + "learning_rate": 8.963452405533761e-05, + "loss": 0.9777, + "step": 3005 + }, + { + "epoch": 0.12181303116147309, + "grad_norm": 1.1251741647720337, + "learning_rate": 8.96138756968821e-05, + "loss": 1.0613, + "step": 3010 + }, + { + "epoch": 0.12201537838931607, + "grad_norm": 1.306720495223999, + "learning_rate": 8.95932273384266e-05, + "loss": 0.9563, + "step": 3015 + }, + { + "epoch": 0.12221772561715905, + "grad_norm": 1.2379626035690308, + "learning_rate": 8.95725789799711e-05, + "loss": 1.0389, + "step": 3020 + }, + { + "epoch": 0.12242007284500202, + "grad_norm": 1.3128517866134644, + "learning_rate": 8.955193062151558e-05, + "loss": 0.985, + "step": 3025 + }, + { + "epoch": 0.122622420072845, + "grad_norm": 1.198457956314087, + "learning_rate": 8.953128226306009e-05, + "loss": 1.0527, + "step": 3030 + }, + { + "epoch": 0.12282476730068798, + "grad_norm": 1.2627030611038208, + "learning_rate": 8.951063390460459e-05, + "loss": 1.0217, + "step": 3035 + }, + { + "epoch": 0.12302711452853096, + "grad_norm": 1.3503879308700562, + "learning_rate": 8.948998554614908e-05, + "loss": 0.9815, + "step": 3040 + }, + { + "epoch": 0.12322946175637393, + "grad_norm": 1.3443350791931152, + "learning_rate": 8.946933718769359e-05, + "loss": 1.049, + "step": 3045 + }, + { + "epoch": 0.12343180898421692, + "grad_norm": 1.224956750869751, + "learning_rate": 8.944868882923808e-05, + "loss": 1.0303, + "step": 3050 + }, + { + "epoch": 0.12363415621205989, + "grad_norm": 1.2062424421310425, + "learning_rate": 8.942804047078258e-05, + "loss": 1.0169, + "step": 3055 + }, + { + "epoch": 0.12383650343990288, + "grad_norm": 1.3424972295761108, + "learning_rate": 8.940739211232707e-05, + "loss": 1.0114, + "step": 3060 + }, + { + "epoch": 0.12403885066774585, + "grad_norm": 1.1913210153579712, + "learning_rate": 8.938674375387157e-05, + "loss": 0.9841, + "step": 3065 + }, + { + "epoch": 0.12424119789558884, + "grad_norm": 1.1994571685791016, + "learning_rate": 8.936609539541607e-05, + "loss": 1.016, + "step": 3070 + }, + { + "epoch": 0.12444354512343181, + "grad_norm": 1.1911218166351318, + "learning_rate": 8.934544703696056e-05, + "loss": 1.0134, + "step": 3075 + }, + { + "epoch": 0.12464589235127478, + "grad_norm": 1.2363308668136597, + "learning_rate": 8.932479867850506e-05, + "loss": 1.0685, + "step": 3080 + }, + { + "epoch": 0.12484823957911777, + "grad_norm": 1.1953235864639282, + "learning_rate": 8.930415032004956e-05, + "loss": 1.0589, + "step": 3085 + }, + { + "epoch": 0.12505058680696074, + "grad_norm": 1.1757105588912964, + "learning_rate": 8.928350196159407e-05, + "loss": 1.0334, + "step": 3090 + }, + { + "epoch": 0.12525293403480373, + "grad_norm": 1.1546622514724731, + "learning_rate": 8.926285360313856e-05, + "loss": 1.0462, + "step": 3095 + }, + { + "epoch": 0.1254552812626467, + "grad_norm": 1.316259741783142, + "learning_rate": 8.924220524468305e-05, + "loss": 1.0553, + "step": 3100 + }, + { + "epoch": 0.12565762849048967, + "grad_norm": 1.2377395629882812, + "learning_rate": 8.922155688622755e-05, + "loss": 0.9546, + "step": 3105 + }, + { + "epoch": 0.12585997571833266, + "grad_norm": 1.2293140888214111, + "learning_rate": 8.920090852777204e-05, + "loss": 1.0169, + "step": 3110 + }, + { + "epoch": 0.12606232294617564, + "grad_norm": 1.112518548965454, + "learning_rate": 8.918026016931655e-05, + "loss": 1.0356, + "step": 3115 + }, + { + "epoch": 0.12626467017401863, + "grad_norm": 1.2322146892547607, + "learning_rate": 8.915961181086104e-05, + "loss": 1.0166, + "step": 3120 + }, + { + "epoch": 0.1264670174018616, + "grad_norm": 1.1821508407592773, + "learning_rate": 8.913896345240553e-05, + "loss": 1.0381, + "step": 3125 + }, + { + "epoch": 0.12666936462970457, + "grad_norm": 1.210894227027893, + "learning_rate": 8.911831509395003e-05, + "loss": 0.9886, + "step": 3130 + }, + { + "epoch": 0.12687171185754756, + "grad_norm": 1.4033355712890625, + "learning_rate": 8.909766673549454e-05, + "loss": 1.0438, + "step": 3135 + }, + { + "epoch": 0.12707405908539052, + "grad_norm": 1.3342198133468628, + "learning_rate": 8.907701837703904e-05, + "loss": 1.0696, + "step": 3140 + }, + { + "epoch": 0.1272764063132335, + "grad_norm": 1.3505562543869019, + "learning_rate": 8.905637001858352e-05, + "loss": 1.0322, + "step": 3145 + }, + { + "epoch": 0.1274787535410765, + "grad_norm": 1.1969664096832275, + "learning_rate": 8.903572166012802e-05, + "loss": 0.9917, + "step": 3150 + }, + { + "epoch": 0.12768110076891948, + "grad_norm": 1.1248817443847656, + "learning_rate": 8.901507330167253e-05, + "loss": 1.022, + "step": 3155 + }, + { + "epoch": 0.12788344799676243, + "grad_norm": 1.4438211917877197, + "learning_rate": 8.899442494321702e-05, + "loss": 1.021, + "step": 3160 + }, + { + "epoch": 0.12808579522460542, + "grad_norm": 1.2094289064407349, + "learning_rate": 8.897377658476152e-05, + "loss": 1.0498, + "step": 3165 + }, + { + "epoch": 0.1282881424524484, + "grad_norm": 1.34991455078125, + "learning_rate": 8.895312822630601e-05, + "loss": 1.0577, + "step": 3170 + }, + { + "epoch": 0.1284904896802914, + "grad_norm": 1.1748855113983154, + "learning_rate": 8.893247986785052e-05, + "loss": 1.0063, + "step": 3175 + }, + { + "epoch": 0.12869283690813435, + "grad_norm": 1.327193021774292, + "learning_rate": 8.8911831509395e-05, + "loss": 1.036, + "step": 3180 + }, + { + "epoch": 0.12889518413597734, + "grad_norm": 1.2270658016204834, + "learning_rate": 8.889118315093951e-05, + "loss": 1.0837, + "step": 3185 + }, + { + "epoch": 0.12909753136382032, + "grad_norm": 1.312050223350525, + "learning_rate": 8.8870534792484e-05, + "loss": 0.9861, + "step": 3190 + }, + { + "epoch": 0.12929987859166328, + "grad_norm": 1.2333570718765259, + "learning_rate": 8.884988643402849e-05, + "loss": 1.0555, + "step": 3195 + }, + { + "epoch": 0.12950222581950627, + "grad_norm": 1.2160171270370483, + "learning_rate": 8.8829238075573e-05, + "loss": 1.0088, + "step": 3200 + }, + { + "epoch": 0.12970457304734925, + "grad_norm": 1.2750040292739868, + "learning_rate": 8.88085897171175e-05, + "loss": 1.0411, + "step": 3205 + }, + { + "epoch": 0.12990692027519224, + "grad_norm": 1.3623186349868774, + "learning_rate": 8.878794135866199e-05, + "loss": 1.0382, + "step": 3210 + }, + { + "epoch": 0.1301092675030352, + "grad_norm": 1.3808802366256714, + "learning_rate": 8.876729300020648e-05, + "loss": 1.0019, + "step": 3215 + }, + { + "epoch": 0.13031161473087818, + "grad_norm": 1.3014731407165527, + "learning_rate": 8.874664464175098e-05, + "loss": 1.0205, + "step": 3220 + }, + { + "epoch": 0.13051396195872117, + "grad_norm": 1.1322846412658691, + "learning_rate": 8.872599628329549e-05, + "loss": 1.0346, + "step": 3225 + }, + { + "epoch": 0.13071630918656416, + "grad_norm": 1.2544022798538208, + "learning_rate": 8.870534792483998e-05, + "loss": 1.0122, + "step": 3230 + }, + { + "epoch": 0.13091865641440711, + "grad_norm": 1.1611053943634033, + "learning_rate": 8.868469956638448e-05, + "loss": 1.0121, + "step": 3235 + }, + { + "epoch": 0.1311210036422501, + "grad_norm": 1.2711516618728638, + "learning_rate": 8.866405120792897e-05, + "loss": 1.0305, + "step": 3240 + }, + { + "epoch": 0.1313233508700931, + "grad_norm": 1.1643027067184448, + "learning_rate": 8.864340284947346e-05, + "loss": 1.0167, + "step": 3245 + }, + { + "epoch": 0.13152569809793604, + "grad_norm": 1.1797773838043213, + "learning_rate": 8.862275449101797e-05, + "loss": 1.0055, + "step": 3250 + }, + { + "epoch": 0.13172804532577903, + "grad_norm": 1.0747370719909668, + "learning_rate": 8.860210613256247e-05, + "loss": 0.9986, + "step": 3255 + }, + { + "epoch": 0.13193039255362202, + "grad_norm": 1.1961429119110107, + "learning_rate": 8.858145777410696e-05, + "loss": 1.059, + "step": 3260 + }, + { + "epoch": 0.132132739781465, + "grad_norm": 1.2358653545379639, + "learning_rate": 8.856080941565145e-05, + "loss": 0.9815, + "step": 3265 + }, + { + "epoch": 0.13233508700930796, + "grad_norm": 1.2921229600906372, + "learning_rate": 8.854016105719596e-05, + "loss": 1.0244, + "step": 3270 + }, + { + "epoch": 0.13253743423715095, + "grad_norm": 1.1417112350463867, + "learning_rate": 8.851951269874046e-05, + "loss": 0.9919, + "step": 3275 + }, + { + "epoch": 0.13273978146499393, + "grad_norm": 1.225769281387329, + "learning_rate": 8.849886434028495e-05, + "loss": 0.9792, + "step": 3280 + }, + { + "epoch": 0.13294212869283692, + "grad_norm": 1.1507500410079956, + "learning_rate": 8.847821598182944e-05, + "loss": 0.9951, + "step": 3285 + }, + { + "epoch": 0.13314447592067988, + "grad_norm": 1.2668052911758423, + "learning_rate": 8.845756762337395e-05, + "loss": 1.0371, + "step": 3290 + }, + { + "epoch": 0.13334682314852286, + "grad_norm": 1.1190897226333618, + "learning_rate": 8.843691926491844e-05, + "loss": 1.0822, + "step": 3295 + }, + { + "epoch": 0.13354917037636585, + "grad_norm": 1.2662688493728638, + "learning_rate": 8.841627090646294e-05, + "loss": 0.9802, + "step": 3300 + }, + { + "epoch": 0.13375151760420884, + "grad_norm": 1.2569993734359741, + "learning_rate": 8.839562254800744e-05, + "loss": 1.0317, + "step": 3305 + }, + { + "epoch": 0.1339538648320518, + "grad_norm": 1.0859438180923462, + "learning_rate": 8.837497418955194e-05, + "loss": 1.0241, + "step": 3310 + }, + { + "epoch": 0.13415621205989478, + "grad_norm": 1.2171080112457275, + "learning_rate": 8.835432583109643e-05, + "loss": 0.9646, + "step": 3315 + }, + { + "epoch": 0.13435855928773777, + "grad_norm": 1.2457976341247559, + "learning_rate": 8.833367747264093e-05, + "loss": 1.0491, + "step": 3320 + }, + { + "epoch": 0.13456090651558072, + "grad_norm": 1.2020723819732666, + "learning_rate": 8.831302911418543e-05, + "loss": 1.0559, + "step": 3325 + }, + { + "epoch": 0.1347632537434237, + "grad_norm": 1.2359280586242676, + "learning_rate": 8.829238075572992e-05, + "loss": 1.0673, + "step": 3330 + }, + { + "epoch": 0.1349656009712667, + "grad_norm": 1.119128942489624, + "learning_rate": 8.827173239727442e-05, + "loss": 1.0245, + "step": 3335 + }, + { + "epoch": 0.13516794819910968, + "grad_norm": 1.1393615007400513, + "learning_rate": 8.825108403881892e-05, + "loss": 1.0338, + "step": 3340 + }, + { + "epoch": 0.13537029542695264, + "grad_norm": 1.202042579650879, + "learning_rate": 8.823043568036341e-05, + "loss": 1.0262, + "step": 3345 + }, + { + "epoch": 0.13557264265479563, + "grad_norm": 1.2606916427612305, + "learning_rate": 8.820978732190791e-05, + "loss": 0.9757, + "step": 3350 + }, + { + "epoch": 0.1357749898826386, + "grad_norm": 1.1343713998794556, + "learning_rate": 8.818913896345242e-05, + "loss": 1.0005, + "step": 3355 + }, + { + "epoch": 0.1359773371104816, + "grad_norm": 1.2434736490249634, + "learning_rate": 8.816849060499691e-05, + "loss": 1.0394, + "step": 3360 + }, + { + "epoch": 0.13617968433832456, + "grad_norm": 1.1181955337524414, + "learning_rate": 8.81478422465414e-05, + "loss": 1.0572, + "step": 3365 + }, + { + "epoch": 0.13638203156616754, + "grad_norm": 1.2072449922561646, + "learning_rate": 8.81271938880859e-05, + "loss": 1.0625, + "step": 3370 + }, + { + "epoch": 0.13658437879401053, + "grad_norm": 1.2195651531219482, + "learning_rate": 8.810654552963041e-05, + "loss": 1.0399, + "step": 3375 + }, + { + "epoch": 0.1367867260218535, + "grad_norm": 1.175723671913147, + "learning_rate": 8.80858971711749e-05, + "loss": 0.9996, + "step": 3380 + }, + { + "epoch": 0.13698907324969647, + "grad_norm": 1.1045277118682861, + "learning_rate": 8.806524881271939e-05, + "loss": 1.0098, + "step": 3385 + }, + { + "epoch": 0.13719142047753946, + "grad_norm": 1.2739930152893066, + "learning_rate": 8.804460045426389e-05, + "loss": 0.9593, + "step": 3390 + }, + { + "epoch": 0.13739376770538245, + "grad_norm": 1.0590320825576782, + "learning_rate": 8.80239520958084e-05, + "loss": 1.0003, + "step": 3395 + }, + { + "epoch": 0.1375961149332254, + "grad_norm": 1.220510482788086, + "learning_rate": 8.800330373735289e-05, + "loss": 1.0289, + "step": 3400 + }, + { + "epoch": 0.1377984621610684, + "grad_norm": 1.1820683479309082, + "learning_rate": 8.798265537889738e-05, + "loss": 0.9708, + "step": 3405 + }, + { + "epoch": 0.13800080938891138, + "grad_norm": 1.138117790222168, + "learning_rate": 8.796200702044188e-05, + "loss": 1.0258, + "step": 3410 + }, + { + "epoch": 0.13820315661675436, + "grad_norm": 1.1486241817474365, + "learning_rate": 8.794135866198637e-05, + "loss": 1.0041, + "step": 3415 + }, + { + "epoch": 0.13840550384459732, + "grad_norm": 1.3090317249298096, + "learning_rate": 8.792071030353088e-05, + "loss": 0.9687, + "step": 3420 + }, + { + "epoch": 0.1386078510724403, + "grad_norm": 1.2961742877960205, + "learning_rate": 8.790006194507538e-05, + "loss": 1.0142, + "step": 3425 + }, + { + "epoch": 0.1388101983002833, + "grad_norm": 1.1060842275619507, + "learning_rate": 8.787941358661986e-05, + "loss": 0.988, + "step": 3430 + }, + { + "epoch": 0.13901254552812625, + "grad_norm": 1.1456315517425537, + "learning_rate": 8.785876522816436e-05, + "loss": 0.9785, + "step": 3435 + }, + { + "epoch": 0.13921489275596924, + "grad_norm": 1.2191643714904785, + "learning_rate": 8.783811686970887e-05, + "loss": 1.0835, + "step": 3440 + }, + { + "epoch": 0.13941723998381222, + "grad_norm": 1.2775850296020508, + "learning_rate": 8.781746851125337e-05, + "loss": 0.98, + "step": 3445 + }, + { + "epoch": 0.1396195872116552, + "grad_norm": 1.239769458770752, + "learning_rate": 8.779682015279786e-05, + "loss": 1.0547, + "step": 3450 + }, + { + "epoch": 0.13982193443949817, + "grad_norm": 1.193727970123291, + "learning_rate": 8.777617179434235e-05, + "loss": 0.976, + "step": 3455 + }, + { + "epoch": 0.14002428166734115, + "grad_norm": 1.2514077425003052, + "learning_rate": 8.775552343588685e-05, + "loss": 1.0088, + "step": 3460 + }, + { + "epoch": 0.14022662889518414, + "grad_norm": 1.3171583414077759, + "learning_rate": 8.773487507743134e-05, + "loss": 1.0476, + "step": 3465 + }, + { + "epoch": 0.14042897612302713, + "grad_norm": 1.1793632507324219, + "learning_rate": 8.771422671897585e-05, + "loss": 1.052, + "step": 3470 + }, + { + "epoch": 0.14063132335087009, + "grad_norm": 1.096036672592163, + "learning_rate": 8.769357836052034e-05, + "loss": 1.0444, + "step": 3475 + }, + { + "epoch": 0.14083367057871307, + "grad_norm": 1.3066802024841309, + "learning_rate": 8.767293000206483e-05, + "loss": 1.0321, + "step": 3480 + }, + { + "epoch": 0.14103601780655606, + "grad_norm": 1.175502896308899, + "learning_rate": 8.765228164360933e-05, + "loss": 1.046, + "step": 3485 + }, + { + "epoch": 0.14123836503439902, + "grad_norm": 1.2827718257904053, + "learning_rate": 8.763163328515384e-05, + "loss": 1.0152, + "step": 3490 + }, + { + "epoch": 0.141440712262242, + "grad_norm": 1.280197262763977, + "learning_rate": 8.761098492669834e-05, + "loss": 1.0324, + "step": 3495 + }, + { + "epoch": 0.141643059490085, + "grad_norm": 1.1645952463150024, + "learning_rate": 8.759033656824282e-05, + "loss": 1.0167, + "step": 3500 + }, + { + "epoch": 0.14184540671792797, + "grad_norm": 1.2604427337646484, + "learning_rate": 8.756968820978732e-05, + "loss": 1.028, + "step": 3505 + }, + { + "epoch": 0.14204775394577093, + "grad_norm": 1.3135136365890503, + "learning_rate": 8.754903985133183e-05, + "loss": 1.0121, + "step": 3510 + }, + { + "epoch": 0.14225010117361392, + "grad_norm": 1.1825295686721802, + "learning_rate": 8.752839149287632e-05, + "loss": 1.023, + "step": 3515 + }, + { + "epoch": 0.1424524484014569, + "grad_norm": 1.134616732597351, + "learning_rate": 8.750774313442082e-05, + "loss": 0.9759, + "step": 3520 + }, + { + "epoch": 0.1426547956292999, + "grad_norm": 1.1520540714263916, + "learning_rate": 8.748709477596531e-05, + "loss": 1.0127, + "step": 3525 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 1.1450047492980957, + "learning_rate": 8.746644641750982e-05, + "loss": 1.0345, + "step": 3530 + }, + { + "epoch": 0.14305949008498584, + "grad_norm": 1.087993860244751, + "learning_rate": 8.744579805905431e-05, + "loss": 0.968, + "step": 3535 + }, + { + "epoch": 0.14326183731282882, + "grad_norm": 1.3990801572799683, + "learning_rate": 8.742514970059881e-05, + "loss": 0.989, + "step": 3540 + }, + { + "epoch": 0.14346418454067178, + "grad_norm": 1.4547699689865112, + "learning_rate": 8.74045013421433e-05, + "loss": 1.0015, + "step": 3545 + }, + { + "epoch": 0.14366653176851477, + "grad_norm": 1.1456403732299805, + "learning_rate": 8.738385298368779e-05, + "loss": 0.9822, + "step": 3550 + }, + { + "epoch": 0.14386887899635775, + "grad_norm": 1.2532793283462524, + "learning_rate": 8.73632046252323e-05, + "loss": 1.0256, + "step": 3555 + }, + { + "epoch": 0.14407122622420074, + "grad_norm": 1.1669167280197144, + "learning_rate": 8.73425562667768e-05, + "loss": 0.9475, + "step": 3560 + }, + { + "epoch": 0.1442735734520437, + "grad_norm": 1.130604863166809, + "learning_rate": 8.732190790832129e-05, + "loss": 1.0035, + "step": 3565 + }, + { + "epoch": 0.14447592067988668, + "grad_norm": 1.2054868936538696, + "learning_rate": 8.73012595498658e-05, + "loss": 1.0052, + "step": 3570 + }, + { + "epoch": 0.14467826790772967, + "grad_norm": 1.2163805961608887, + "learning_rate": 8.728061119141029e-05, + "loss": 0.9924, + "step": 3575 + }, + { + "epoch": 0.14488061513557265, + "grad_norm": 1.3135406970977783, + "learning_rate": 8.725996283295479e-05, + "loss": 0.9828, + "step": 3580 + }, + { + "epoch": 0.1450829623634156, + "grad_norm": 1.1675292253494263, + "learning_rate": 8.723931447449928e-05, + "loss": 1.0074, + "step": 3585 + }, + { + "epoch": 0.1452853095912586, + "grad_norm": 1.3223679065704346, + "learning_rate": 8.721866611604378e-05, + "loss": 1.0411, + "step": 3590 + }, + { + "epoch": 0.14548765681910159, + "grad_norm": 1.2408627271652222, + "learning_rate": 8.719801775758827e-05, + "loss": 0.9985, + "step": 3595 + }, + { + "epoch": 0.14569000404694454, + "grad_norm": 1.055103063583374, + "learning_rate": 8.717736939913277e-05, + "loss": 1.0143, + "step": 3600 + }, + { + "epoch": 0.14589235127478753, + "grad_norm": 1.174357295036316, + "learning_rate": 8.715672104067727e-05, + "loss": 1.0361, + "step": 3605 + }, + { + "epoch": 0.14609469850263052, + "grad_norm": 1.195693850517273, + "learning_rate": 8.713607268222177e-05, + "loss": 1.0385, + "step": 3610 + }, + { + "epoch": 0.1462970457304735, + "grad_norm": 1.2959665060043335, + "learning_rate": 8.711542432376626e-05, + "loss": 1.0121, + "step": 3615 + }, + { + "epoch": 0.14649939295831646, + "grad_norm": 1.25341796875, + "learning_rate": 8.709477596531075e-05, + "loss": 1.0562, + "step": 3620 + }, + { + "epoch": 0.14670174018615945, + "grad_norm": 1.2359027862548828, + "learning_rate": 8.707412760685526e-05, + "loss": 1.0372, + "step": 3625 + }, + { + "epoch": 0.14690408741400243, + "grad_norm": 1.233233094215393, + "learning_rate": 8.705347924839976e-05, + "loss": 1.02, + "step": 3630 + }, + { + "epoch": 0.14710643464184542, + "grad_norm": 1.246022343635559, + "learning_rate": 8.703283088994425e-05, + "loss": 1.0463, + "step": 3635 + }, + { + "epoch": 0.14730878186968838, + "grad_norm": 1.2579573392868042, + "learning_rate": 8.701218253148876e-05, + "loss": 0.9662, + "step": 3640 + }, + { + "epoch": 0.14751112909753136, + "grad_norm": 1.2932707071304321, + "learning_rate": 8.699153417303325e-05, + "loss": 1.024, + "step": 3645 + }, + { + "epoch": 0.14771347632537435, + "grad_norm": 1.1545662879943848, + "learning_rate": 8.697088581457774e-05, + "loss": 1.0321, + "step": 3650 + }, + { + "epoch": 0.1479158235532173, + "grad_norm": 1.1988214254379272, + "learning_rate": 8.695023745612224e-05, + "loss": 1.0274, + "step": 3655 + }, + { + "epoch": 0.1481181707810603, + "grad_norm": 1.1396006345748901, + "learning_rate": 8.692958909766675e-05, + "loss": 1.094, + "step": 3660 + }, + { + "epoch": 0.14832051800890328, + "grad_norm": 1.1652321815490723, + "learning_rate": 8.690894073921124e-05, + "loss": 1.0602, + "step": 3665 + }, + { + "epoch": 0.14852286523674627, + "grad_norm": 1.1104350090026855, + "learning_rate": 8.688829238075573e-05, + "loss": 1.0074, + "step": 3670 + }, + { + "epoch": 0.14872521246458922, + "grad_norm": 1.164095163345337, + "learning_rate": 8.686764402230023e-05, + "loss": 0.9879, + "step": 3675 + }, + { + "epoch": 0.1489275596924322, + "grad_norm": 1.276949167251587, + "learning_rate": 8.684699566384474e-05, + "loss": 1.0931, + "step": 3680 + }, + { + "epoch": 0.1491299069202752, + "grad_norm": 1.211052417755127, + "learning_rate": 8.682634730538923e-05, + "loss": 1.0339, + "step": 3685 + }, + { + "epoch": 0.14933225414811818, + "grad_norm": 1.135284662246704, + "learning_rate": 8.680569894693372e-05, + "loss": 1.0324, + "step": 3690 + }, + { + "epoch": 0.14953460137596114, + "grad_norm": 1.1767339706420898, + "learning_rate": 8.678505058847822e-05, + "loss": 1.0568, + "step": 3695 + }, + { + "epoch": 0.14973694860380413, + "grad_norm": 1.199989676475525, + "learning_rate": 8.676440223002271e-05, + "loss": 1.0421, + "step": 3700 + }, + { + "epoch": 0.1499392958316471, + "grad_norm": 1.4178532361984253, + "learning_rate": 8.674375387156722e-05, + "loss": 1.0056, + "step": 3705 + }, + { + "epoch": 0.1501416430594901, + "grad_norm": 1.312332034111023, + "learning_rate": 8.672310551311172e-05, + "loss": 0.9942, + "step": 3710 + }, + { + "epoch": 0.15034399028733306, + "grad_norm": 1.1598105430603027, + "learning_rate": 8.670245715465621e-05, + "loss": 1.0479, + "step": 3715 + }, + { + "epoch": 0.15054633751517604, + "grad_norm": 1.192805290222168, + "learning_rate": 8.66818087962007e-05, + "loss": 1.0352, + "step": 3720 + }, + { + "epoch": 0.15074868474301903, + "grad_norm": 1.024114727973938, + "learning_rate": 8.66611604377452e-05, + "loss": 1.0282, + "step": 3725 + }, + { + "epoch": 0.150951031970862, + "grad_norm": 1.0919108390808105, + "learning_rate": 8.664051207928971e-05, + "loss": 1.0674, + "step": 3730 + }, + { + "epoch": 0.15115337919870497, + "grad_norm": 1.2635074853897095, + "learning_rate": 8.66198637208342e-05, + "loss": 1.0537, + "step": 3735 + }, + { + "epoch": 0.15135572642654796, + "grad_norm": 1.2155044078826904, + "learning_rate": 8.659921536237869e-05, + "loss": 1.0337, + "step": 3740 + }, + { + "epoch": 0.15155807365439095, + "grad_norm": 1.3911570310592651, + "learning_rate": 8.65785670039232e-05, + "loss": 0.9941, + "step": 3745 + }, + { + "epoch": 0.1517604208822339, + "grad_norm": 1.2916566133499146, + "learning_rate": 8.65579186454677e-05, + "loss": 1.0786, + "step": 3750 + }, + { + "epoch": 0.1519627681100769, + "grad_norm": 1.1407253742218018, + "learning_rate": 8.653727028701219e-05, + "loss": 0.9985, + "step": 3755 + }, + { + "epoch": 0.15216511533791988, + "grad_norm": 1.1862139701843262, + "learning_rate": 8.651662192855668e-05, + "loss": 0.9468, + "step": 3760 + }, + { + "epoch": 0.15236746256576286, + "grad_norm": 1.4419888257980347, + "learning_rate": 8.649597357010118e-05, + "loss": 1.0126, + "step": 3765 + }, + { + "epoch": 0.15256980979360582, + "grad_norm": 1.222913384437561, + "learning_rate": 8.647532521164567e-05, + "loss": 0.9966, + "step": 3770 + }, + { + "epoch": 0.1527721570214488, + "grad_norm": 1.0772240161895752, + "learning_rate": 8.645467685319018e-05, + "loss": 1.035, + "step": 3775 + }, + { + "epoch": 0.1529745042492918, + "grad_norm": 1.1999720335006714, + "learning_rate": 8.643402849473468e-05, + "loss": 0.9401, + "step": 3780 + }, + { + "epoch": 0.15317685147713475, + "grad_norm": 1.3183565139770508, + "learning_rate": 8.641338013627916e-05, + "loss": 1.0557, + "step": 3785 + }, + { + "epoch": 0.15337919870497774, + "grad_norm": 1.2679200172424316, + "learning_rate": 8.639273177782366e-05, + "loss": 0.969, + "step": 3790 + }, + { + "epoch": 0.15358154593282072, + "grad_norm": 1.2706146240234375, + "learning_rate": 8.637208341936817e-05, + "loss": 0.938, + "step": 3795 + }, + { + "epoch": 0.1537838931606637, + "grad_norm": 1.203769564628601, + "learning_rate": 8.635143506091267e-05, + "loss": 1.0065, + "step": 3800 + }, + { + "epoch": 0.15398624038850667, + "grad_norm": 1.2441059350967407, + "learning_rate": 8.633078670245716e-05, + "loss": 1.0512, + "step": 3805 + }, + { + "epoch": 0.15418858761634965, + "grad_norm": 1.2276763916015625, + "learning_rate": 8.631013834400165e-05, + "loss": 0.9929, + "step": 3810 + }, + { + "epoch": 0.15439093484419264, + "grad_norm": 1.284480094909668, + "learning_rate": 8.628948998554616e-05, + "loss": 1.0114, + "step": 3815 + }, + { + "epoch": 0.15459328207203563, + "grad_norm": 1.270969033241272, + "learning_rate": 8.626884162709065e-05, + "loss": 1.0202, + "step": 3820 + }, + { + "epoch": 0.15479562929987858, + "grad_norm": 1.3259066343307495, + "learning_rate": 8.624819326863515e-05, + "loss": 1.0032, + "step": 3825 + }, + { + "epoch": 0.15499797652772157, + "grad_norm": 1.2419929504394531, + "learning_rate": 8.622754491017964e-05, + "loss": 0.9993, + "step": 3830 + }, + { + "epoch": 0.15520032375556456, + "grad_norm": 1.2978901863098145, + "learning_rate": 8.620689655172413e-05, + "loss": 0.9806, + "step": 3835 + }, + { + "epoch": 0.15540267098340751, + "grad_norm": 1.2446649074554443, + "learning_rate": 8.618624819326864e-05, + "loss": 0.9643, + "step": 3840 + }, + { + "epoch": 0.1556050182112505, + "grad_norm": 1.1495150327682495, + "learning_rate": 8.616559983481314e-05, + "loss": 1.0339, + "step": 3845 + }, + { + "epoch": 0.1558073654390935, + "grad_norm": 1.1303049325942993, + "learning_rate": 8.614495147635764e-05, + "loss": 1.015, + "step": 3850 + }, + { + "epoch": 0.15600971266693647, + "grad_norm": 1.3211051225662231, + "learning_rate": 8.612430311790213e-05, + "loss": 1.0436, + "step": 3855 + }, + { + "epoch": 0.15621205989477943, + "grad_norm": 1.1358425617218018, + "learning_rate": 8.610365475944662e-05, + "loss": 0.9691, + "step": 3860 + }, + { + "epoch": 0.15641440712262242, + "grad_norm": 1.2107272148132324, + "learning_rate": 8.608300640099113e-05, + "loss": 0.9924, + "step": 3865 + }, + { + "epoch": 0.1566167543504654, + "grad_norm": 1.2978575229644775, + "learning_rate": 8.606235804253562e-05, + "loss": 1.0476, + "step": 3870 + }, + { + "epoch": 0.1568191015783084, + "grad_norm": 1.103603482246399, + "learning_rate": 8.604170968408012e-05, + "loss": 1.0207, + "step": 3875 + }, + { + "epoch": 0.15702144880615135, + "grad_norm": 1.2778555154800415, + "learning_rate": 8.602106132562461e-05, + "loss": 0.9794, + "step": 3880 + }, + { + "epoch": 0.15722379603399433, + "grad_norm": 1.2931251525878906, + "learning_rate": 8.600041296716912e-05, + "loss": 1.0217, + "step": 3885 + }, + { + "epoch": 0.15742614326183732, + "grad_norm": 1.2048687934875488, + "learning_rate": 8.597976460871361e-05, + "loss": 0.9122, + "step": 3890 + }, + { + "epoch": 0.15762849048968028, + "grad_norm": 1.1445924043655396, + "learning_rate": 8.595911625025811e-05, + "loss": 0.9711, + "step": 3895 + }, + { + "epoch": 0.15783083771752326, + "grad_norm": 1.1007800102233887, + "learning_rate": 8.593846789180262e-05, + "loss": 1.0013, + "step": 3900 + }, + { + "epoch": 0.15803318494536625, + "grad_norm": 1.1982959508895874, + "learning_rate": 8.59178195333471e-05, + "loss": 1.0429, + "step": 3905 + }, + { + "epoch": 0.15823553217320924, + "grad_norm": 1.206376552581787, + "learning_rate": 8.58971711748916e-05, + "loss": 1.0588, + "step": 3910 + }, + { + "epoch": 0.1584378794010522, + "grad_norm": 1.2315871715545654, + "learning_rate": 8.58765228164361e-05, + "loss": 1.052, + "step": 3915 + }, + { + "epoch": 0.15864022662889518, + "grad_norm": 1.1181589365005493, + "learning_rate": 8.585587445798059e-05, + "loss": 1.0116, + "step": 3920 + }, + { + "epoch": 0.15884257385673817, + "grad_norm": 1.2370235919952393, + "learning_rate": 8.58352260995251e-05, + "loss": 1.0724, + "step": 3925 + }, + { + "epoch": 0.15904492108458115, + "grad_norm": 1.3594273328781128, + "learning_rate": 8.581457774106959e-05, + "loss": 1.0249, + "step": 3930 + }, + { + "epoch": 0.1592472683124241, + "grad_norm": 1.2228608131408691, + "learning_rate": 8.579392938261409e-05, + "loss": 0.9731, + "step": 3935 + }, + { + "epoch": 0.1594496155402671, + "grad_norm": 1.1914616823196411, + "learning_rate": 8.577328102415858e-05, + "loss": 1.0062, + "step": 3940 + }, + { + "epoch": 0.15965196276811008, + "grad_norm": 1.0850211381912231, + "learning_rate": 8.575263266570309e-05, + "loss": 1.007, + "step": 3945 + }, + { + "epoch": 0.15985430999595304, + "grad_norm": 1.281343698501587, + "learning_rate": 8.573198430724758e-05, + "loss": 1.0535, + "step": 3950 + }, + { + "epoch": 0.16005665722379603, + "grad_norm": 1.243461012840271, + "learning_rate": 8.571133594879207e-05, + "loss": 1.0938, + "step": 3955 + }, + { + "epoch": 0.16025900445163901, + "grad_norm": 1.1410428285598755, + "learning_rate": 8.569068759033657e-05, + "loss": 1.0383, + "step": 3960 + }, + { + "epoch": 0.160461351679482, + "grad_norm": 1.1548258066177368, + "learning_rate": 8.567003923188107e-05, + "loss": 1.065, + "step": 3965 + }, + { + "epoch": 0.16066369890732496, + "grad_norm": 1.187805414199829, + "learning_rate": 8.564939087342557e-05, + "loss": 0.9705, + "step": 3970 + }, + { + "epoch": 0.16086604613516794, + "grad_norm": 1.3056522607803345, + "learning_rate": 8.562874251497006e-05, + "loss": 1.0489, + "step": 3975 + }, + { + "epoch": 0.16106839336301093, + "grad_norm": 1.1718940734863281, + "learning_rate": 8.560809415651456e-05, + "loss": 0.9866, + "step": 3980 + }, + { + "epoch": 0.16127074059085392, + "grad_norm": 1.1825013160705566, + "learning_rate": 8.558744579805906e-05, + "loss": 1.0026, + "step": 3985 + }, + { + "epoch": 0.16147308781869688, + "grad_norm": 1.1425038576126099, + "learning_rate": 8.556679743960355e-05, + "loss": 0.9875, + "step": 3990 + }, + { + "epoch": 0.16167543504653986, + "grad_norm": 1.316321849822998, + "learning_rate": 8.554614908114806e-05, + "loss": 1.0179, + "step": 3995 + }, + { + "epoch": 0.16187778227438285, + "grad_norm": 1.2819961309432983, + "learning_rate": 8.552550072269255e-05, + "loss": 1.0207, + "step": 4000 + }, + { + "epoch": 0.1620801295022258, + "grad_norm": 1.1831226348876953, + "learning_rate": 8.550485236423704e-05, + "loss": 1.0433, + "step": 4005 + }, + { + "epoch": 0.1622824767300688, + "grad_norm": 1.1329317092895508, + "learning_rate": 8.548420400578154e-05, + "loss": 1.0272, + "step": 4010 + }, + { + "epoch": 0.16248482395791178, + "grad_norm": 1.1299233436584473, + "learning_rate": 8.546355564732605e-05, + "loss": 1.005, + "step": 4015 + }, + { + "epoch": 0.16268717118575476, + "grad_norm": 1.5367989540100098, + "learning_rate": 8.544290728887054e-05, + "loss": 1.026, + "step": 4020 + }, + { + "epoch": 0.16288951841359772, + "grad_norm": 1.4270910024642944, + "learning_rate": 8.542225893041503e-05, + "loss": 1.0135, + "step": 4025 + }, + { + "epoch": 0.1630918656414407, + "grad_norm": 1.3334009647369385, + "learning_rate": 8.540161057195953e-05, + "loss": 1.0924, + "step": 4030 + }, + { + "epoch": 0.1632942128692837, + "grad_norm": 1.2332525253295898, + "learning_rate": 8.538096221350404e-05, + "loss": 1.0167, + "step": 4035 + }, + { + "epoch": 0.16349656009712668, + "grad_norm": 1.2154927253723145, + "learning_rate": 8.536031385504853e-05, + "loss": 0.9935, + "step": 4040 + }, + { + "epoch": 0.16369890732496964, + "grad_norm": 1.1705427169799805, + "learning_rate": 8.533966549659302e-05, + "loss": 1.0358, + "step": 4045 + }, + { + "epoch": 0.16390125455281263, + "grad_norm": 1.1485010385513306, + "learning_rate": 8.531901713813752e-05, + "loss": 0.9742, + "step": 4050 + }, + { + "epoch": 0.1641036017806556, + "grad_norm": 1.3040324449539185, + "learning_rate": 8.529836877968201e-05, + "loss": 1.0109, + "step": 4055 + }, + { + "epoch": 0.1643059490084986, + "grad_norm": 1.1587469577789307, + "learning_rate": 8.527772042122652e-05, + "loss": 1.005, + "step": 4060 + }, + { + "epoch": 0.16450829623634156, + "grad_norm": 1.1227656602859497, + "learning_rate": 8.525707206277102e-05, + "loss": 0.9587, + "step": 4065 + }, + { + "epoch": 0.16471064346418454, + "grad_norm": 1.2229607105255127, + "learning_rate": 8.523642370431551e-05, + "loss": 0.9741, + "step": 4070 + }, + { + "epoch": 0.16491299069202753, + "grad_norm": 1.2067303657531738, + "learning_rate": 8.521577534586e-05, + "loss": 0.9957, + "step": 4075 + }, + { + "epoch": 0.1651153379198705, + "grad_norm": 1.1254018545150757, + "learning_rate": 8.51951269874045e-05, + "loss": 1.0125, + "step": 4080 + }, + { + "epoch": 0.16531768514771347, + "grad_norm": 1.163508653640747, + "learning_rate": 8.517447862894901e-05, + "loss": 1.0425, + "step": 4085 + }, + { + "epoch": 0.16552003237555646, + "grad_norm": 1.2304508686065674, + "learning_rate": 8.51538302704935e-05, + "loss": 0.9819, + "step": 4090 + }, + { + "epoch": 0.16572237960339944, + "grad_norm": 1.19584059715271, + "learning_rate": 8.513318191203799e-05, + "loss": 1.008, + "step": 4095 + }, + { + "epoch": 0.1659247268312424, + "grad_norm": 1.2220029830932617, + "learning_rate": 8.51125335535825e-05, + "loss": 0.982, + "step": 4100 + }, + { + "epoch": 0.1661270740590854, + "grad_norm": 1.150023102760315, + "learning_rate": 8.5091885195127e-05, + "loss": 1.0726, + "step": 4105 + }, + { + "epoch": 0.16632942128692838, + "grad_norm": 1.1686456203460693, + "learning_rate": 8.507123683667149e-05, + "loss": 0.9949, + "step": 4110 + }, + { + "epoch": 0.16653176851477136, + "grad_norm": 1.0913928747177124, + "learning_rate": 8.5050588478216e-05, + "loss": 1.0185, + "step": 4115 + }, + { + "epoch": 0.16673411574261432, + "grad_norm": 1.1012707948684692, + "learning_rate": 8.502994011976048e-05, + "loss": 1.0289, + "step": 4120 + }, + { + "epoch": 0.1669364629704573, + "grad_norm": 1.2062115669250488, + "learning_rate": 8.500929176130497e-05, + "loss": 1.0099, + "step": 4125 + }, + { + "epoch": 0.1671388101983003, + "grad_norm": 1.0764849185943604, + "learning_rate": 8.498864340284948e-05, + "loss": 0.9775, + "step": 4130 + }, + { + "epoch": 0.16734115742614325, + "grad_norm": 1.1648088693618774, + "learning_rate": 8.496799504439398e-05, + "loss": 1.0453, + "step": 4135 + }, + { + "epoch": 0.16754350465398624, + "grad_norm": 1.1501338481903076, + "learning_rate": 8.494734668593847e-05, + "loss": 0.9998, + "step": 4140 + }, + { + "epoch": 0.16774585188182922, + "grad_norm": 1.1651593446731567, + "learning_rate": 8.492669832748296e-05, + "loss": 1.068, + "step": 4145 + }, + { + "epoch": 0.1679481991096722, + "grad_norm": 1.299416422843933, + "learning_rate": 8.490604996902747e-05, + "loss": 1.0172, + "step": 4150 + }, + { + "epoch": 0.16815054633751517, + "grad_norm": 1.2932381629943848, + "learning_rate": 8.488540161057197e-05, + "loss": 0.9496, + "step": 4155 + }, + { + "epoch": 0.16835289356535815, + "grad_norm": 1.2833499908447266, + "learning_rate": 8.486475325211646e-05, + "loss": 1.0473, + "step": 4160 + }, + { + "epoch": 0.16855524079320114, + "grad_norm": 1.069675087928772, + "learning_rate": 8.484410489366095e-05, + "loss": 1.0372, + "step": 4165 + }, + { + "epoch": 0.16875758802104412, + "grad_norm": 1.2872824668884277, + "learning_rate": 8.482345653520546e-05, + "loss": 1.028, + "step": 4170 + }, + { + "epoch": 0.16895993524888708, + "grad_norm": 1.1362758874893188, + "learning_rate": 8.480280817674995e-05, + "loss": 1.0358, + "step": 4175 + }, + { + "epoch": 0.16916228247673007, + "grad_norm": 1.3370100259780884, + "learning_rate": 8.478215981829445e-05, + "loss": 1.002, + "step": 4180 + }, + { + "epoch": 0.16936462970457306, + "grad_norm": 1.0713083744049072, + "learning_rate": 8.476151145983896e-05, + "loss": 1.017, + "step": 4185 + }, + { + "epoch": 0.169566976932416, + "grad_norm": 1.1837053298950195, + "learning_rate": 8.474086310138343e-05, + "loss": 1.0054, + "step": 4190 + }, + { + "epoch": 0.169769324160259, + "grad_norm": 1.2177187204360962, + "learning_rate": 8.472021474292794e-05, + "loss": 0.9947, + "step": 4195 + }, + { + "epoch": 0.16997167138810199, + "grad_norm": 1.2140326499938965, + "learning_rate": 8.469956638447244e-05, + "loss": 1.1113, + "step": 4200 + }, + { + "epoch": 0.17017401861594497, + "grad_norm": 1.2045141458511353, + "learning_rate": 8.467891802601694e-05, + "loss": 1.081, + "step": 4205 + }, + { + "epoch": 0.17037636584378793, + "grad_norm": 1.1738569736480713, + "learning_rate": 8.465826966756144e-05, + "loss": 1.0536, + "step": 4210 + }, + { + "epoch": 0.17057871307163092, + "grad_norm": 1.1751322746276855, + "learning_rate": 8.463762130910593e-05, + "loss": 0.9973, + "step": 4215 + }, + { + "epoch": 0.1707810602994739, + "grad_norm": 1.2755372524261475, + "learning_rate": 8.461697295065043e-05, + "loss": 1.0359, + "step": 4220 + }, + { + "epoch": 0.1709834075273169, + "grad_norm": 1.3054158687591553, + "learning_rate": 8.459632459219492e-05, + "loss": 1.0842, + "step": 4225 + }, + { + "epoch": 0.17118575475515985, + "grad_norm": 1.2696008682250977, + "learning_rate": 8.457567623373942e-05, + "loss": 1.0586, + "step": 4230 + }, + { + "epoch": 0.17138810198300283, + "grad_norm": 1.0441927909851074, + "learning_rate": 8.455502787528392e-05, + "loss": 0.973, + "step": 4235 + }, + { + "epoch": 0.17159044921084582, + "grad_norm": 1.0617234706878662, + "learning_rate": 8.453437951682842e-05, + "loss": 1.0067, + "step": 4240 + }, + { + "epoch": 0.17179279643868878, + "grad_norm": 1.1689379215240479, + "learning_rate": 8.451373115837291e-05, + "loss": 1.0333, + "step": 4245 + }, + { + "epoch": 0.17199514366653176, + "grad_norm": 1.168121337890625, + "learning_rate": 8.449308279991741e-05, + "loss": 0.9874, + "step": 4250 + }, + { + "epoch": 0.17219749089437475, + "grad_norm": 1.1850627660751343, + "learning_rate": 8.447243444146192e-05, + "loss": 1.056, + "step": 4255 + }, + { + "epoch": 0.17239983812221774, + "grad_norm": 1.170566439628601, + "learning_rate": 8.44517860830064e-05, + "loss": 1.0082, + "step": 4260 + }, + { + "epoch": 0.1726021853500607, + "grad_norm": 1.25258207321167, + "learning_rate": 8.44311377245509e-05, + "loss": 1.0581, + "step": 4265 + }, + { + "epoch": 0.17280453257790368, + "grad_norm": 1.2613723278045654, + "learning_rate": 8.44104893660954e-05, + "loss": 0.9972, + "step": 4270 + }, + { + "epoch": 0.17300687980574667, + "grad_norm": 1.3771018981933594, + "learning_rate": 8.43898410076399e-05, + "loss": 0.9891, + "step": 4275 + }, + { + "epoch": 0.17320922703358965, + "grad_norm": 1.164170265197754, + "learning_rate": 8.43691926491844e-05, + "loss": 1.0615, + "step": 4280 + }, + { + "epoch": 0.1734115742614326, + "grad_norm": 1.2523274421691895, + "learning_rate": 8.434854429072889e-05, + "loss": 1.0216, + "step": 4285 + }, + { + "epoch": 0.1736139214892756, + "grad_norm": 1.168658971786499, + "learning_rate": 8.432789593227339e-05, + "loss": 1.0541, + "step": 4290 + }, + { + "epoch": 0.17381626871711858, + "grad_norm": 1.125502586364746, + "learning_rate": 8.430724757381788e-05, + "loss": 1.0072, + "step": 4295 + }, + { + "epoch": 0.17401861594496154, + "grad_norm": 1.0859943628311157, + "learning_rate": 8.428659921536239e-05, + "loss": 1.0341, + "step": 4300 + }, + { + "epoch": 0.17422096317280453, + "grad_norm": 1.2627450227737427, + "learning_rate": 8.426595085690688e-05, + "loss": 1.0175, + "step": 4305 + }, + { + "epoch": 0.1744233104006475, + "grad_norm": 1.1298612356185913, + "learning_rate": 8.424530249845137e-05, + "loss": 1.0278, + "step": 4310 + }, + { + "epoch": 0.1746256576284905, + "grad_norm": 1.203171968460083, + "learning_rate": 8.422465413999587e-05, + "loss": 1.0459, + "step": 4315 + }, + { + "epoch": 0.17482800485633346, + "grad_norm": 1.181216835975647, + "learning_rate": 8.420400578154038e-05, + "loss": 1.0512, + "step": 4320 + }, + { + "epoch": 0.17503035208417644, + "grad_norm": 1.2767380475997925, + "learning_rate": 8.418335742308488e-05, + "loss": 0.9839, + "step": 4325 + }, + { + "epoch": 0.17523269931201943, + "grad_norm": 1.1616220474243164, + "learning_rate": 8.416270906462936e-05, + "loss": 0.9819, + "step": 4330 + }, + { + "epoch": 0.17543504653986242, + "grad_norm": 1.036577582359314, + "learning_rate": 8.414206070617386e-05, + "loss": 1.0161, + "step": 4335 + }, + { + "epoch": 0.17563739376770537, + "grad_norm": 1.205965280532837, + "learning_rate": 8.412141234771837e-05, + "loss": 1.0092, + "step": 4340 + }, + { + "epoch": 0.17583974099554836, + "grad_norm": 1.1267623901367188, + "learning_rate": 8.410076398926286e-05, + "loss": 1.0086, + "step": 4345 + }, + { + "epoch": 0.17604208822339135, + "grad_norm": 1.0909661054611206, + "learning_rate": 8.408011563080736e-05, + "loss": 1.0171, + "step": 4350 + }, + { + "epoch": 0.1762444354512343, + "grad_norm": 1.1458765268325806, + "learning_rate": 8.405946727235185e-05, + "loss": 1.0693, + "step": 4355 + }, + { + "epoch": 0.1764467826790773, + "grad_norm": 1.093388557434082, + "learning_rate": 8.403881891389634e-05, + "loss": 0.9917, + "step": 4360 + }, + { + "epoch": 0.17664912990692028, + "grad_norm": 1.2402604818344116, + "learning_rate": 8.401817055544084e-05, + "loss": 1.0196, + "step": 4365 + }, + { + "epoch": 0.17685147713476326, + "grad_norm": 1.253222942352295, + "learning_rate": 8.399752219698535e-05, + "loss": 1.0269, + "step": 4370 + }, + { + "epoch": 0.17705382436260622, + "grad_norm": 1.1422299146652222, + "learning_rate": 8.397687383852984e-05, + "loss": 0.9778, + "step": 4375 + }, + { + "epoch": 0.1772561715904492, + "grad_norm": 1.1944977045059204, + "learning_rate": 8.395622548007433e-05, + "loss": 1.0795, + "step": 4380 + }, + { + "epoch": 0.1774585188182922, + "grad_norm": 1.1311194896697998, + "learning_rate": 8.393557712161883e-05, + "loss": 1.051, + "step": 4385 + }, + { + "epoch": 0.17766086604613518, + "grad_norm": 1.1790812015533447, + "learning_rate": 8.391492876316334e-05, + "loss": 1.0372, + "step": 4390 + }, + { + "epoch": 0.17786321327397814, + "grad_norm": 1.2022043466567993, + "learning_rate": 8.389428040470783e-05, + "loss": 0.9778, + "step": 4395 + }, + { + "epoch": 0.17806556050182112, + "grad_norm": 1.250864863395691, + "learning_rate": 8.387363204625233e-05, + "loss": 1.0344, + "step": 4400 + }, + { + "epoch": 0.1782679077296641, + "grad_norm": 1.234934687614441, + "learning_rate": 8.385298368779682e-05, + "loss": 1.0017, + "step": 4405 + }, + { + "epoch": 0.17847025495750707, + "grad_norm": 1.1351219415664673, + "learning_rate": 8.383233532934131e-05, + "loss": 1.0024, + "step": 4410 + }, + { + "epoch": 0.17867260218535005, + "grad_norm": 1.1732226610183716, + "learning_rate": 8.381168697088582e-05, + "loss": 1.0497, + "step": 4415 + }, + { + "epoch": 0.17887494941319304, + "grad_norm": 1.1598520278930664, + "learning_rate": 8.379103861243032e-05, + "loss": 1.0466, + "step": 4420 + }, + { + "epoch": 0.17907729664103603, + "grad_norm": 1.2276618480682373, + "learning_rate": 8.377039025397481e-05, + "loss": 0.9633, + "step": 4425 + }, + { + "epoch": 0.17927964386887899, + "grad_norm": 1.175768494606018, + "learning_rate": 8.37497418955193e-05, + "loss": 1.0127, + "step": 4430 + }, + { + "epoch": 0.17948199109672197, + "grad_norm": 1.171652913093567, + "learning_rate": 8.372909353706381e-05, + "loss": 0.9925, + "step": 4435 + }, + { + "epoch": 0.17968433832456496, + "grad_norm": 1.2681893110275269, + "learning_rate": 8.370844517860831e-05, + "loss": 1.0505, + "step": 4440 + }, + { + "epoch": 0.17988668555240794, + "grad_norm": 1.1942038536071777, + "learning_rate": 8.36877968201528e-05, + "loss": 0.9726, + "step": 4445 + }, + { + "epoch": 0.1800890327802509, + "grad_norm": 1.1117100715637207, + "learning_rate": 8.366714846169729e-05, + "loss": 1.0066, + "step": 4450 + }, + { + "epoch": 0.1802913800080939, + "grad_norm": 1.1762025356292725, + "learning_rate": 8.36465001032418e-05, + "loss": 1.0156, + "step": 4455 + }, + { + "epoch": 0.18049372723593687, + "grad_norm": 1.2027854919433594, + "learning_rate": 8.36258517447863e-05, + "loss": 1.011, + "step": 4460 + }, + { + "epoch": 0.18069607446377986, + "grad_norm": 1.1443296670913696, + "learning_rate": 8.360520338633079e-05, + "loss": 1.0126, + "step": 4465 + }, + { + "epoch": 0.18089842169162282, + "grad_norm": 1.1379833221435547, + "learning_rate": 8.35845550278753e-05, + "loss": 1.0172, + "step": 4470 + }, + { + "epoch": 0.1811007689194658, + "grad_norm": 1.2371028661727905, + "learning_rate": 8.356390666941979e-05, + "loss": 1.0788, + "step": 4475 + }, + { + "epoch": 0.1813031161473088, + "grad_norm": 1.101577639579773, + "learning_rate": 8.354325831096428e-05, + "loss": 1.042, + "step": 4480 + }, + { + "epoch": 0.18150546337515175, + "grad_norm": 1.0971204042434692, + "learning_rate": 8.352260995250878e-05, + "loss": 0.9927, + "step": 4485 + }, + { + "epoch": 0.18170781060299473, + "grad_norm": 1.16663658618927, + "learning_rate": 8.350196159405328e-05, + "loss": 1.0371, + "step": 4490 + }, + { + "epoch": 0.18191015783083772, + "grad_norm": 1.2800010442733765, + "learning_rate": 8.348131323559777e-05, + "loss": 1.0155, + "step": 4495 + }, + { + "epoch": 0.1821125050586807, + "grad_norm": 1.0925393104553223, + "learning_rate": 8.346066487714227e-05, + "loss": 1.013, + "step": 4500 + }, + { + "epoch": 0.18231485228652367, + "grad_norm": 1.227115273475647, + "learning_rate": 8.344001651868677e-05, + "loss": 0.9777, + "step": 4505 + }, + { + "epoch": 0.18251719951436665, + "grad_norm": 1.0858711004257202, + "learning_rate": 8.341936816023127e-05, + "loss": 1.0006, + "step": 4510 + }, + { + "epoch": 0.18271954674220964, + "grad_norm": 1.1313248872756958, + "learning_rate": 8.339871980177576e-05, + "loss": 1.0027, + "step": 4515 + }, + { + "epoch": 0.18292189397005262, + "grad_norm": 1.0812978744506836, + "learning_rate": 8.337807144332025e-05, + "loss": 0.9642, + "step": 4520 + }, + { + "epoch": 0.18312424119789558, + "grad_norm": 1.1415776014328003, + "learning_rate": 8.335742308486476e-05, + "loss": 0.9529, + "step": 4525 + }, + { + "epoch": 0.18332658842573857, + "grad_norm": 1.1783350706100464, + "learning_rate": 8.333677472640925e-05, + "loss": 1.0325, + "step": 4530 + }, + { + "epoch": 0.18352893565358155, + "grad_norm": 1.1650296449661255, + "learning_rate": 8.331612636795375e-05, + "loss": 0.9809, + "step": 4535 + }, + { + "epoch": 0.1837312828814245, + "grad_norm": 1.091321587562561, + "learning_rate": 8.329547800949826e-05, + "loss": 1.0616, + "step": 4540 + }, + { + "epoch": 0.1839336301092675, + "grad_norm": 1.1316403150558472, + "learning_rate": 8.327482965104273e-05, + "loss": 1.0276, + "step": 4545 + }, + { + "epoch": 0.18413597733711048, + "grad_norm": 1.3371331691741943, + "learning_rate": 8.325418129258724e-05, + "loss": 0.9285, + "step": 4550 + }, + { + "epoch": 0.18433832456495347, + "grad_norm": 1.232073187828064, + "learning_rate": 8.323353293413174e-05, + "loss": 1.0179, + "step": 4555 + }, + { + "epoch": 0.18454067179279643, + "grad_norm": 1.1317367553710938, + "learning_rate": 8.321288457567625e-05, + "loss": 1.0738, + "step": 4560 + }, + { + "epoch": 0.18474301902063942, + "grad_norm": 1.2579426765441895, + "learning_rate": 8.319223621722074e-05, + "loss": 0.9997, + "step": 4565 + }, + { + "epoch": 0.1849453662484824, + "grad_norm": 1.0826157331466675, + "learning_rate": 8.317158785876523e-05, + "loss": 1.034, + "step": 4570 + }, + { + "epoch": 0.1851477134763254, + "grad_norm": 1.2121678590774536, + "learning_rate": 8.315093950030973e-05, + "loss": 0.9689, + "step": 4575 + }, + { + "epoch": 0.18535006070416835, + "grad_norm": 1.1447327136993408, + "learning_rate": 8.313029114185422e-05, + "loss": 0.9275, + "step": 4580 + }, + { + "epoch": 0.18555240793201133, + "grad_norm": 1.2216501235961914, + "learning_rate": 8.310964278339873e-05, + "loss": 1.0189, + "step": 4585 + }, + { + "epoch": 0.18575475515985432, + "grad_norm": 1.1991691589355469, + "learning_rate": 8.308899442494322e-05, + "loss": 0.9912, + "step": 4590 + }, + { + "epoch": 0.18595710238769728, + "grad_norm": 1.0804492235183716, + "learning_rate": 8.306834606648772e-05, + "loss": 0.9893, + "step": 4595 + }, + { + "epoch": 0.18615944961554026, + "grad_norm": 1.140123963356018, + "learning_rate": 8.304769770803221e-05, + "loss": 1.015, + "step": 4600 + }, + { + "epoch": 0.18636179684338325, + "grad_norm": 1.0930538177490234, + "learning_rate": 8.302704934957672e-05, + "loss": 1.0898, + "step": 4605 + }, + { + "epoch": 0.18656414407122623, + "grad_norm": 1.1015198230743408, + "learning_rate": 8.300640099112122e-05, + "loss": 1.012, + "step": 4610 + }, + { + "epoch": 0.1867664912990692, + "grad_norm": 1.2269937992095947, + "learning_rate": 8.298575263266571e-05, + "loss": 1.007, + "step": 4615 + }, + { + "epoch": 0.18696883852691218, + "grad_norm": 1.1857010126113892, + "learning_rate": 8.29651042742102e-05, + "loss": 0.9999, + "step": 4620 + }, + { + "epoch": 0.18717118575475516, + "grad_norm": 1.1667230129241943, + "learning_rate": 8.29444559157547e-05, + "loss": 1.0035, + "step": 4625 + }, + { + "epoch": 0.18737353298259815, + "grad_norm": 1.1285626888275146, + "learning_rate": 8.29238075572992e-05, + "loss": 1.0846, + "step": 4630 + }, + { + "epoch": 0.1875758802104411, + "grad_norm": 1.2603074312210083, + "learning_rate": 8.29031591988437e-05, + "loss": 1.0128, + "step": 4635 + }, + { + "epoch": 0.1877782274382841, + "grad_norm": 1.1492124795913696, + "learning_rate": 8.288251084038819e-05, + "loss": 1.0123, + "step": 4640 + }, + { + "epoch": 0.18798057466612708, + "grad_norm": 1.0780872106552124, + "learning_rate": 8.28618624819327e-05, + "loss": 1.0135, + "step": 4645 + }, + { + "epoch": 0.18818292189397004, + "grad_norm": 1.1884671449661255, + "learning_rate": 8.284121412347718e-05, + "loss": 1.0077, + "step": 4650 + }, + { + "epoch": 0.18838526912181303, + "grad_norm": 1.22124445438385, + "learning_rate": 8.282056576502169e-05, + "loss": 1.0195, + "step": 4655 + }, + { + "epoch": 0.188587616349656, + "grad_norm": 1.2182954549789429, + "learning_rate": 8.279991740656619e-05, + "loss": 0.9951, + "step": 4660 + }, + { + "epoch": 0.188789963577499, + "grad_norm": 1.06657075881958, + "learning_rate": 8.277926904811067e-05, + "loss": 1.013, + "step": 4665 + }, + { + "epoch": 0.18899231080534196, + "grad_norm": 1.447752833366394, + "learning_rate": 8.275862068965517e-05, + "loss": 1.0266, + "step": 4670 + }, + { + "epoch": 0.18919465803318494, + "grad_norm": 1.1149766445159912, + "learning_rate": 8.273797233119968e-05, + "loss": 1.0416, + "step": 4675 + }, + { + "epoch": 0.18939700526102793, + "grad_norm": 1.1205918788909912, + "learning_rate": 8.271732397274418e-05, + "loss": 1.0024, + "step": 4680 + }, + { + "epoch": 0.18959935248887091, + "grad_norm": 1.1822267770767212, + "learning_rate": 8.269667561428867e-05, + "loss": 1.035, + "step": 4685 + }, + { + "epoch": 0.18980169971671387, + "grad_norm": 1.1343849897384644, + "learning_rate": 8.267602725583316e-05, + "loss": 1.0002, + "step": 4690 + }, + { + "epoch": 0.19000404694455686, + "grad_norm": 1.2238893508911133, + "learning_rate": 8.265537889737767e-05, + "loss": 1.0472, + "step": 4695 + }, + { + "epoch": 0.19020639417239985, + "grad_norm": 1.2157957553863525, + "learning_rate": 8.263473053892216e-05, + "loss": 1.0133, + "step": 4700 + }, + { + "epoch": 0.1904087414002428, + "grad_norm": 1.1864662170410156, + "learning_rate": 8.261408218046666e-05, + "loss": 1.0209, + "step": 4705 + }, + { + "epoch": 0.1906110886280858, + "grad_norm": 1.296589970588684, + "learning_rate": 8.259343382201115e-05, + "loss": 1.0054, + "step": 4710 + }, + { + "epoch": 0.19081343585592878, + "grad_norm": 1.1743710041046143, + "learning_rate": 8.257278546355564e-05, + "loss": 0.9936, + "step": 4715 + }, + { + "epoch": 0.19101578308377176, + "grad_norm": 1.2557445764541626, + "learning_rate": 8.255213710510015e-05, + "loss": 0.9909, + "step": 4720 + }, + { + "epoch": 0.19121813031161472, + "grad_norm": 1.2726306915283203, + "learning_rate": 8.253148874664465e-05, + "loss": 1.0016, + "step": 4725 + }, + { + "epoch": 0.1914204775394577, + "grad_norm": 1.1929681301116943, + "learning_rate": 8.251084038818915e-05, + "loss": 0.9949, + "step": 4730 + }, + { + "epoch": 0.1916228247673007, + "grad_norm": 1.2225488424301147, + "learning_rate": 8.249019202973363e-05, + "loss": 1.0163, + "step": 4735 + }, + { + "epoch": 0.19182517199514368, + "grad_norm": 1.4360350370407104, + "learning_rate": 8.246954367127814e-05, + "loss": 1.0541, + "step": 4740 + }, + { + "epoch": 0.19202751922298664, + "grad_norm": 1.2667735815048218, + "learning_rate": 8.244889531282264e-05, + "loss": 1.0523, + "step": 4745 + }, + { + "epoch": 0.19222986645082962, + "grad_norm": 1.2565869092941284, + "learning_rate": 8.242824695436713e-05, + "loss": 1.0108, + "step": 4750 + }, + { + "epoch": 0.1924322136786726, + "grad_norm": 1.1470930576324463, + "learning_rate": 8.240759859591163e-05, + "loss": 0.98, + "step": 4755 + }, + { + "epoch": 0.19263456090651557, + "grad_norm": 1.1337534189224243, + "learning_rate": 8.238695023745612e-05, + "loss": 0.9693, + "step": 4760 + }, + { + "epoch": 0.19283690813435855, + "grad_norm": 1.1028400659561157, + "learning_rate": 8.236630187900061e-05, + "loss": 1.0606, + "step": 4765 + }, + { + "epoch": 0.19303925536220154, + "grad_norm": 1.245178461074829, + "learning_rate": 8.234565352054512e-05, + "loss": 1.0099, + "step": 4770 + }, + { + "epoch": 0.19324160259004453, + "grad_norm": 1.2336320877075195, + "learning_rate": 8.232500516208962e-05, + "loss": 1.0134, + "step": 4775 + }, + { + "epoch": 0.19344394981788748, + "grad_norm": 1.170240879058838, + "learning_rate": 8.230435680363411e-05, + "loss": 1.0229, + "step": 4780 + }, + { + "epoch": 0.19364629704573047, + "grad_norm": 1.2225584983825684, + "learning_rate": 8.22837084451786e-05, + "loss": 1.0356, + "step": 4785 + }, + { + "epoch": 0.19384864427357346, + "grad_norm": 1.2629694938659668, + "learning_rate": 8.226306008672311e-05, + "loss": 1.0283, + "step": 4790 + }, + { + "epoch": 0.19405099150141644, + "grad_norm": 1.2679898738861084, + "learning_rate": 8.224241172826761e-05, + "loss": 1.0095, + "step": 4795 + }, + { + "epoch": 0.1942533387292594, + "grad_norm": 1.2377355098724365, + "learning_rate": 8.22217633698121e-05, + "loss": 1.0122, + "step": 4800 + }, + { + "epoch": 0.1944556859571024, + "grad_norm": 1.1721128225326538, + "learning_rate": 8.22011150113566e-05, + "loss": 1.0383, + "step": 4805 + }, + { + "epoch": 0.19465803318494537, + "grad_norm": 1.0998475551605225, + "learning_rate": 8.21804666529011e-05, + "loss": 0.9939, + "step": 4810 + }, + { + "epoch": 0.19486038041278833, + "grad_norm": 1.3133405447006226, + "learning_rate": 8.21598182944456e-05, + "loss": 0.9786, + "step": 4815 + }, + { + "epoch": 0.19506272764063132, + "grad_norm": 1.1301445960998535, + "learning_rate": 8.213916993599009e-05, + "loss": 1.0173, + "step": 4820 + }, + { + "epoch": 0.1952650748684743, + "grad_norm": 1.3158178329467773, + "learning_rate": 8.21185215775346e-05, + "loss": 1.016, + "step": 4825 + }, + { + "epoch": 0.1954674220963173, + "grad_norm": 1.1632481813430786, + "learning_rate": 8.209787321907909e-05, + "loss": 1.0331, + "step": 4830 + }, + { + "epoch": 0.19566976932416025, + "grad_norm": 1.1861474514007568, + "learning_rate": 8.207722486062358e-05, + "loss": 1.0242, + "step": 4835 + }, + { + "epoch": 0.19587211655200323, + "grad_norm": 1.1996656656265259, + "learning_rate": 8.205657650216808e-05, + "loss": 1.0216, + "step": 4840 + }, + { + "epoch": 0.19607446377984622, + "grad_norm": 1.0401570796966553, + "learning_rate": 8.203592814371259e-05, + "loss": 1.0539, + "step": 4845 + }, + { + "epoch": 0.1962768110076892, + "grad_norm": 1.1545593738555908, + "learning_rate": 8.201527978525708e-05, + "loss": 1.0752, + "step": 4850 + }, + { + "epoch": 0.19647915823553216, + "grad_norm": 1.1579617261886597, + "learning_rate": 8.199463142680157e-05, + "loss": 0.9972, + "step": 4855 + }, + { + "epoch": 0.19668150546337515, + "grad_norm": 1.154383897781372, + "learning_rate": 8.197398306834607e-05, + "loss": 1.0125, + "step": 4860 + }, + { + "epoch": 0.19688385269121814, + "grad_norm": 1.2943758964538574, + "learning_rate": 8.195333470989057e-05, + "loss": 1.0772, + "step": 4865 + }, + { + "epoch": 0.19708619991906112, + "grad_norm": 1.2109222412109375, + "learning_rate": 8.193268635143506e-05, + "loss": 0.9919, + "step": 4870 + }, + { + "epoch": 0.19728854714690408, + "grad_norm": 1.1765272617340088, + "learning_rate": 8.191203799297956e-05, + "loss": 1.0669, + "step": 4875 + }, + { + "epoch": 0.19749089437474707, + "grad_norm": 1.085466980934143, + "learning_rate": 8.189138963452406e-05, + "loss": 1.0174, + "step": 4880 + }, + { + "epoch": 0.19769324160259005, + "grad_norm": 1.313828468322754, + "learning_rate": 8.187074127606855e-05, + "loss": 0.9742, + "step": 4885 + }, + { + "epoch": 0.197895588830433, + "grad_norm": 1.1793757677078247, + "learning_rate": 8.185009291761305e-05, + "loss": 1.0921, + "step": 4890 + }, + { + "epoch": 0.198097936058276, + "grad_norm": 1.1285810470581055, + "learning_rate": 8.182944455915756e-05, + "loss": 0.9753, + "step": 4895 + }, + { + "epoch": 0.19830028328611898, + "grad_norm": 1.2235735654830933, + "learning_rate": 8.180879620070205e-05, + "loss": 1.0708, + "step": 4900 + }, + { + "epoch": 0.19850263051396197, + "grad_norm": 1.3059691190719604, + "learning_rate": 8.178814784224654e-05, + "loss": 0.9947, + "step": 4905 + }, + { + "epoch": 0.19870497774180493, + "grad_norm": 1.2152923345565796, + "learning_rate": 8.176749948379104e-05, + "loss": 0.9984, + "step": 4910 + }, + { + "epoch": 0.19890732496964791, + "grad_norm": 1.2941012382507324, + "learning_rate": 8.174685112533555e-05, + "loss": 1.0011, + "step": 4915 + }, + { + "epoch": 0.1991096721974909, + "grad_norm": 1.0703668594360352, + "learning_rate": 8.172620276688004e-05, + "loss": 1.0087, + "step": 4920 + }, + { + "epoch": 0.1993120194253339, + "grad_norm": 1.228332757949829, + "learning_rate": 8.170555440842453e-05, + "loss": 0.9731, + "step": 4925 + }, + { + "epoch": 0.19951436665317684, + "grad_norm": 1.259106993675232, + "learning_rate": 8.168490604996903e-05, + "loss": 0.9851, + "step": 4930 + }, + { + "epoch": 0.19971671388101983, + "grad_norm": 1.2263469696044922, + "learning_rate": 8.166425769151352e-05, + "loss": 0.9997, + "step": 4935 + }, + { + "epoch": 0.19991906110886282, + "grad_norm": 1.289626121520996, + "learning_rate": 8.164360933305803e-05, + "loss": 1.0157, + "step": 4940 + }, + { + "epoch": 0.20012140833670577, + "grad_norm": 1.1667588949203491, + "learning_rate": 8.162296097460253e-05, + "loss": 1.0841, + "step": 4945 + }, + { + "epoch": 0.20032375556454876, + "grad_norm": 1.2754533290863037, + "learning_rate": 8.160231261614702e-05, + "loss": 1.042, + "step": 4950 + }, + { + "epoch": 0.20052610279239175, + "grad_norm": 1.2538858652114868, + "learning_rate": 8.158166425769151e-05, + "loss": 0.9863, + "step": 4955 + }, + { + "epoch": 0.20072845002023473, + "grad_norm": 1.2489347457885742, + "learning_rate": 8.156101589923602e-05, + "loss": 0.99, + "step": 4960 + }, + { + "epoch": 0.2009307972480777, + "grad_norm": 1.3711369037628174, + "learning_rate": 8.154036754078052e-05, + "loss": 0.9796, + "step": 4965 + }, + { + "epoch": 0.20113314447592068, + "grad_norm": 1.1020487546920776, + "learning_rate": 8.151971918232501e-05, + "loss": 1.0118, + "step": 4970 + }, + { + "epoch": 0.20133549170376366, + "grad_norm": 1.0652621984481812, + "learning_rate": 8.14990708238695e-05, + "loss": 0.9861, + "step": 4975 + }, + { + "epoch": 0.20153783893160665, + "grad_norm": 1.2487317323684692, + "learning_rate": 8.1478422465414e-05, + "loss": 1.0211, + "step": 4980 + }, + { + "epoch": 0.2017401861594496, + "grad_norm": 1.0528713464736938, + "learning_rate": 8.14577741069585e-05, + "loss": 1.0425, + "step": 4985 + }, + { + "epoch": 0.2019425333872926, + "grad_norm": 1.261579990386963, + "learning_rate": 8.1437125748503e-05, + "loss": 1.0487, + "step": 4990 + }, + { + "epoch": 0.20214488061513558, + "grad_norm": 1.2511799335479736, + "learning_rate": 8.141647739004749e-05, + "loss": 1.0015, + "step": 4995 + }, + { + "epoch": 0.20234722784297854, + "grad_norm": 1.2175990343093872, + "learning_rate": 8.1395829031592e-05, + "loss": 1.0144, + "step": 5000 + }, + { + "epoch": 0.20254957507082152, + "grad_norm": 1.232932448387146, + "learning_rate": 8.137518067313649e-05, + "loss": 1.0212, + "step": 5005 + }, + { + "epoch": 0.2027519222986645, + "grad_norm": 1.2393242120742798, + "learning_rate": 8.135453231468099e-05, + "loss": 1.0109, + "step": 5010 + }, + { + "epoch": 0.2029542695265075, + "grad_norm": 1.2997404336929321, + "learning_rate": 8.13338839562255e-05, + "loss": 1.0225, + "step": 5015 + }, + { + "epoch": 0.20315661675435046, + "grad_norm": 1.0594301223754883, + "learning_rate": 8.131323559776997e-05, + "loss": 1.0358, + "step": 5020 + }, + { + "epoch": 0.20335896398219344, + "grad_norm": 1.193315863609314, + "learning_rate": 8.129258723931447e-05, + "loss": 0.9909, + "step": 5025 + }, + { + "epoch": 0.20356131121003643, + "grad_norm": 1.2019113302230835, + "learning_rate": 8.127193888085898e-05, + "loss": 1.0085, + "step": 5030 + }, + { + "epoch": 0.2037636584378794, + "grad_norm": 1.2004979848861694, + "learning_rate": 8.125129052240348e-05, + "loss": 1.0471, + "step": 5035 + }, + { + "epoch": 0.20396600566572237, + "grad_norm": 1.210090160369873, + "learning_rate": 8.123064216394797e-05, + "loss": 0.9805, + "step": 5040 + }, + { + "epoch": 0.20416835289356536, + "grad_norm": 1.1681770086288452, + "learning_rate": 8.120999380549246e-05, + "loss": 1.0305, + "step": 5045 + }, + { + "epoch": 0.20437070012140834, + "grad_norm": 1.2289959192276, + "learning_rate": 8.118934544703697e-05, + "loss": 1.037, + "step": 5050 + }, + { + "epoch": 0.2045730473492513, + "grad_norm": 1.1978446245193481, + "learning_rate": 8.116869708858146e-05, + "loss": 0.994, + "step": 5055 + }, + { + "epoch": 0.2047753945770943, + "grad_norm": 1.1448619365692139, + "learning_rate": 8.114804873012596e-05, + "loss": 1.0227, + "step": 5060 + }, + { + "epoch": 0.20497774180493727, + "grad_norm": 1.124419927597046, + "learning_rate": 8.112740037167045e-05, + "loss": 0.9965, + "step": 5065 + }, + { + "epoch": 0.20518008903278026, + "grad_norm": 1.0857465267181396, + "learning_rate": 8.110675201321494e-05, + "loss": 1.0547, + "step": 5070 + }, + { + "epoch": 0.20538243626062322, + "grad_norm": 1.1845394372940063, + "learning_rate": 8.108610365475945e-05, + "loss": 1.1202, + "step": 5075 + }, + { + "epoch": 0.2055847834884662, + "grad_norm": 1.098524570465088, + "learning_rate": 8.106545529630395e-05, + "loss": 1.0128, + "step": 5080 + }, + { + "epoch": 0.2057871307163092, + "grad_norm": 1.2847548723220825, + "learning_rate": 8.104480693784846e-05, + "loss": 1.0487, + "step": 5085 + }, + { + "epoch": 0.20598947794415218, + "grad_norm": 1.1287879943847656, + "learning_rate": 8.102415857939293e-05, + "loss": 0.9686, + "step": 5090 + }, + { + "epoch": 0.20619182517199514, + "grad_norm": 1.1327563524246216, + "learning_rate": 8.100351022093744e-05, + "loss": 1.0362, + "step": 5095 + }, + { + "epoch": 0.20639417239983812, + "grad_norm": 1.1193857192993164, + "learning_rate": 8.098286186248194e-05, + "loss": 1.0032, + "step": 5100 + }, + { + "epoch": 0.2065965196276811, + "grad_norm": 1.1401958465576172, + "learning_rate": 8.096221350402643e-05, + "loss": 1.0223, + "step": 5105 + }, + { + "epoch": 0.20679886685552407, + "grad_norm": 1.1872715950012207, + "learning_rate": 8.094156514557094e-05, + "loss": 0.9417, + "step": 5110 + }, + { + "epoch": 0.20700121408336705, + "grad_norm": 1.2122455835342407, + "learning_rate": 8.092091678711543e-05, + "loss": 1.0725, + "step": 5115 + }, + { + "epoch": 0.20720356131121004, + "grad_norm": 1.1419376134872437, + "learning_rate": 8.090026842865992e-05, + "loss": 1.0154, + "step": 5120 + }, + { + "epoch": 0.20740590853905302, + "grad_norm": 1.2034239768981934, + "learning_rate": 8.087962007020442e-05, + "loss": 1.0571, + "step": 5125 + }, + { + "epoch": 0.20760825576689598, + "grad_norm": 1.117256760597229, + "learning_rate": 8.085897171174892e-05, + "loss": 0.9774, + "step": 5130 + }, + { + "epoch": 0.20781060299473897, + "grad_norm": 1.1485257148742676, + "learning_rate": 8.083832335329341e-05, + "loss": 1.0187, + "step": 5135 + }, + { + "epoch": 0.20801295022258195, + "grad_norm": 1.197513222694397, + "learning_rate": 8.08176749948379e-05, + "loss": 1.049, + "step": 5140 + }, + { + "epoch": 0.20821529745042494, + "grad_norm": 1.0740694999694824, + "learning_rate": 8.079702663638241e-05, + "loss": 1.0412, + "step": 5145 + }, + { + "epoch": 0.2084176446782679, + "grad_norm": 1.2320221662521362, + "learning_rate": 8.077637827792691e-05, + "loss": 1.0407, + "step": 5150 + }, + { + "epoch": 0.20861999190611089, + "grad_norm": 1.1767700910568237, + "learning_rate": 8.07557299194714e-05, + "loss": 0.9273, + "step": 5155 + }, + { + "epoch": 0.20882233913395387, + "grad_norm": 1.3007943630218506, + "learning_rate": 8.073508156101591e-05, + "loss": 1.0011, + "step": 5160 + }, + { + "epoch": 0.20902468636179683, + "grad_norm": 1.441924810409546, + "learning_rate": 8.07144332025604e-05, + "loss": 0.9932, + "step": 5165 + }, + { + "epoch": 0.20922703358963982, + "grad_norm": 1.1403335332870483, + "learning_rate": 8.06937848441049e-05, + "loss": 1.0037, + "step": 5170 + }, + { + "epoch": 0.2094293808174828, + "grad_norm": 1.2318395376205444, + "learning_rate": 8.06731364856494e-05, + "loss": 0.9821, + "step": 5175 + }, + { + "epoch": 0.2096317280453258, + "grad_norm": 1.3537083864212036, + "learning_rate": 8.06524881271939e-05, + "loss": 0.9923, + "step": 5180 + }, + { + "epoch": 0.20983407527316875, + "grad_norm": 1.2805323600769043, + "learning_rate": 8.063183976873839e-05, + "loss": 1.0084, + "step": 5185 + }, + { + "epoch": 0.21003642250101173, + "grad_norm": 1.1598711013793945, + "learning_rate": 8.061119141028288e-05, + "loss": 0.9564, + "step": 5190 + }, + { + "epoch": 0.21023876972885472, + "grad_norm": 1.1318501234054565, + "learning_rate": 8.059054305182738e-05, + "loss": 0.9757, + "step": 5195 + }, + { + "epoch": 0.2104411169566977, + "grad_norm": 1.1981204748153687, + "learning_rate": 8.056989469337189e-05, + "loss": 1.0163, + "step": 5200 + }, + { + "epoch": 0.21064346418454066, + "grad_norm": 1.2053571939468384, + "learning_rate": 8.054924633491638e-05, + "loss": 0.9828, + "step": 5205 + }, + { + "epoch": 0.21084581141238365, + "grad_norm": 1.0915008783340454, + "learning_rate": 8.052859797646087e-05, + "loss": 1.0114, + "step": 5210 + }, + { + "epoch": 0.21104815864022664, + "grad_norm": 1.2757831811904907, + "learning_rate": 8.050794961800537e-05, + "loss": 1.0101, + "step": 5215 + }, + { + "epoch": 0.21125050586806962, + "grad_norm": 1.116852879524231, + "learning_rate": 8.048730125954988e-05, + "loss": 1.0005, + "step": 5220 + }, + { + "epoch": 0.21145285309591258, + "grad_norm": 1.1757320165634155, + "learning_rate": 8.046665290109437e-05, + "loss": 1.0358, + "step": 5225 + }, + { + "epoch": 0.21165520032375557, + "grad_norm": 1.096636414527893, + "learning_rate": 8.044600454263887e-05, + "loss": 1.026, + "step": 5230 + }, + { + "epoch": 0.21185754755159855, + "grad_norm": 1.0272443294525146, + "learning_rate": 8.042535618418336e-05, + "loss": 1.0651, + "step": 5235 + }, + { + "epoch": 0.2120598947794415, + "grad_norm": 1.132805585861206, + "learning_rate": 8.040470782572785e-05, + "loss": 1.0081, + "step": 5240 + }, + { + "epoch": 0.2122622420072845, + "grad_norm": 1.165757656097412, + "learning_rate": 8.038405946727236e-05, + "loss": 1.0167, + "step": 5245 + }, + { + "epoch": 0.21246458923512748, + "grad_norm": 1.3743444681167603, + "learning_rate": 8.036341110881686e-05, + "loss": 1.0315, + "step": 5250 + }, + { + "epoch": 0.21266693646297047, + "grad_norm": 1.0902739763259888, + "learning_rate": 8.034276275036135e-05, + "loss": 1.0458, + "step": 5255 + }, + { + "epoch": 0.21286928369081343, + "grad_norm": 1.3256388902664185, + "learning_rate": 8.032211439190584e-05, + "loss": 1.0317, + "step": 5260 + }, + { + "epoch": 0.2130716309186564, + "grad_norm": 1.2284111976623535, + "learning_rate": 8.030146603345034e-05, + "loss": 1.0429, + "step": 5265 + }, + { + "epoch": 0.2132739781464994, + "grad_norm": 1.137561559677124, + "learning_rate": 8.028081767499485e-05, + "loss": 0.9972, + "step": 5270 + }, + { + "epoch": 0.21347632537434238, + "grad_norm": 1.2012466192245483, + "learning_rate": 8.026016931653934e-05, + "loss": 1.0394, + "step": 5275 + }, + { + "epoch": 0.21367867260218534, + "grad_norm": 1.210174798965454, + "learning_rate": 8.023952095808383e-05, + "loss": 1.0, + "step": 5280 + }, + { + "epoch": 0.21388101983002833, + "grad_norm": 1.2262771129608154, + "learning_rate": 8.021887259962833e-05, + "loss": 1.0235, + "step": 5285 + }, + { + "epoch": 0.21408336705787132, + "grad_norm": 1.1709131002426147, + "learning_rate": 8.019822424117282e-05, + "loss": 0.984, + "step": 5290 + }, + { + "epoch": 0.21428571428571427, + "grad_norm": 1.3027403354644775, + "learning_rate": 8.017757588271733e-05, + "loss": 1.0056, + "step": 5295 + }, + { + "epoch": 0.21448806151355726, + "grad_norm": 1.119411826133728, + "learning_rate": 8.015692752426183e-05, + "loss": 1.0415, + "step": 5300 + }, + { + "epoch": 0.21469040874140025, + "grad_norm": 1.345045804977417, + "learning_rate": 8.013627916580632e-05, + "loss": 1.0551, + "step": 5305 + }, + { + "epoch": 0.21489275596924323, + "grad_norm": 1.1461879014968872, + "learning_rate": 8.011563080735081e-05, + "loss": 1.0299, + "step": 5310 + }, + { + "epoch": 0.2150951031970862, + "grad_norm": 1.107438564300537, + "learning_rate": 8.009498244889532e-05, + "loss": 1.0377, + "step": 5315 + }, + { + "epoch": 0.21529745042492918, + "grad_norm": 1.1863116025924683, + "learning_rate": 8.007433409043982e-05, + "loss": 0.9396, + "step": 5320 + }, + { + "epoch": 0.21549979765277216, + "grad_norm": 1.131195306777954, + "learning_rate": 8.005368573198431e-05, + "loss": 1.0107, + "step": 5325 + }, + { + "epoch": 0.21570214488061515, + "grad_norm": 1.1013654470443726, + "learning_rate": 8.00330373735288e-05, + "loss": 1.0834, + "step": 5330 + }, + { + "epoch": 0.2159044921084581, + "grad_norm": 1.3237591981887817, + "learning_rate": 8.001238901507331e-05, + "loss": 0.9525, + "step": 5335 + }, + { + "epoch": 0.2161068393363011, + "grad_norm": 1.237547755241394, + "learning_rate": 7.99917406566178e-05, + "loss": 1.0534, + "step": 5340 + }, + { + "epoch": 0.21630918656414408, + "grad_norm": 1.21968674659729, + "learning_rate": 7.99710922981623e-05, + "loss": 1.0109, + "step": 5345 + }, + { + "epoch": 0.21651153379198704, + "grad_norm": 1.2342300415039062, + "learning_rate": 7.995044393970679e-05, + "loss": 1.009, + "step": 5350 + }, + { + "epoch": 0.21671388101983002, + "grad_norm": 1.2325963973999023, + "learning_rate": 7.99297955812513e-05, + "loss": 1.0736, + "step": 5355 + }, + { + "epoch": 0.216916228247673, + "grad_norm": 1.1565101146697998, + "learning_rate": 7.990914722279579e-05, + "loss": 0.9942, + "step": 5360 + }, + { + "epoch": 0.217118575475516, + "grad_norm": 1.2055383920669556, + "learning_rate": 7.988849886434029e-05, + "loss": 1.0574, + "step": 5365 + }, + { + "epoch": 0.21732092270335895, + "grad_norm": 1.2551612854003906, + "learning_rate": 7.98678505058848e-05, + "loss": 1.0091, + "step": 5370 + }, + { + "epoch": 0.21752326993120194, + "grad_norm": 1.1769312620162964, + "learning_rate": 7.984720214742929e-05, + "loss": 1.0012, + "step": 5375 + }, + { + "epoch": 0.21772561715904493, + "grad_norm": 1.2143961191177368, + "learning_rate": 7.982655378897378e-05, + "loss": 1.057, + "step": 5380 + }, + { + "epoch": 0.2179279643868879, + "grad_norm": 1.179459571838379, + "learning_rate": 7.980590543051828e-05, + "loss": 0.9842, + "step": 5385 + }, + { + "epoch": 0.21813031161473087, + "grad_norm": 1.2663486003875732, + "learning_rate": 7.978525707206278e-05, + "loss": 0.9715, + "step": 5390 + }, + { + "epoch": 0.21833265884257386, + "grad_norm": 1.3901774883270264, + "learning_rate": 7.976460871360727e-05, + "loss": 1.0165, + "step": 5395 + }, + { + "epoch": 0.21853500607041684, + "grad_norm": 1.1746774911880493, + "learning_rate": 7.974396035515176e-05, + "loss": 1.0111, + "step": 5400 + }, + { + "epoch": 0.2187373532982598, + "grad_norm": 1.140589952468872, + "learning_rate": 7.972331199669627e-05, + "loss": 1.0013, + "step": 5405 + }, + { + "epoch": 0.2189397005261028, + "grad_norm": 1.3139210939407349, + "learning_rate": 7.970266363824076e-05, + "loss": 1.0556, + "step": 5410 + }, + { + "epoch": 0.21914204775394577, + "grad_norm": 0.9810004830360413, + "learning_rate": 7.968201527978526e-05, + "loss": 1.0059, + "step": 5415 + }, + { + "epoch": 0.21934439498178876, + "grad_norm": 1.1734237670898438, + "learning_rate": 7.966136692132977e-05, + "loss": 1.048, + "step": 5420 + }, + { + "epoch": 0.21954674220963172, + "grad_norm": 1.1313258409500122, + "learning_rate": 7.964071856287424e-05, + "loss": 0.93, + "step": 5425 + }, + { + "epoch": 0.2197490894374747, + "grad_norm": 1.2841383218765259, + "learning_rate": 7.962007020441875e-05, + "loss": 1.0334, + "step": 5430 + }, + { + "epoch": 0.2199514366653177, + "grad_norm": 1.191046118736267, + "learning_rate": 7.959942184596325e-05, + "loss": 1.0424, + "step": 5435 + }, + { + "epoch": 0.22015378389316068, + "grad_norm": 1.2136199474334717, + "learning_rate": 7.957877348750776e-05, + "loss": 1.0314, + "step": 5440 + }, + { + "epoch": 0.22035613112100363, + "grad_norm": 1.2416642904281616, + "learning_rate": 7.955812512905225e-05, + "loss": 1.0247, + "step": 5445 + }, + { + "epoch": 0.22055847834884662, + "grad_norm": 1.175480604171753, + "learning_rate": 7.953747677059674e-05, + "loss": 1.0402, + "step": 5450 + }, + { + "epoch": 0.2207608255766896, + "grad_norm": 1.2230721712112427, + "learning_rate": 7.951682841214124e-05, + "loss": 1.0268, + "step": 5455 + }, + { + "epoch": 0.22096317280453256, + "grad_norm": 1.1856865882873535, + "learning_rate": 7.949618005368573e-05, + "loss": 0.9919, + "step": 5460 + }, + { + "epoch": 0.22116552003237555, + "grad_norm": 1.1516177654266357, + "learning_rate": 7.947553169523024e-05, + "loss": 1.0664, + "step": 5465 + }, + { + "epoch": 0.22136786726021854, + "grad_norm": 1.0597288608551025, + "learning_rate": 7.945488333677473e-05, + "loss": 0.9861, + "step": 5470 + }, + { + "epoch": 0.22157021448806152, + "grad_norm": 1.228318691253662, + "learning_rate": 7.943423497831922e-05, + "loss": 0.9624, + "step": 5475 + }, + { + "epoch": 0.22177256171590448, + "grad_norm": 1.11944580078125, + "learning_rate": 7.941358661986372e-05, + "loss": 1.0034, + "step": 5480 + }, + { + "epoch": 0.22197490894374747, + "grad_norm": 1.184719443321228, + "learning_rate": 7.939293826140823e-05, + "loss": 0.9928, + "step": 5485 + }, + { + "epoch": 0.22217725617159045, + "grad_norm": 1.2186683416366577, + "learning_rate": 7.937228990295273e-05, + "loss": 1.0219, + "step": 5490 + }, + { + "epoch": 0.22237960339943344, + "grad_norm": 1.0990355014801025, + "learning_rate": 7.93516415444972e-05, + "loss": 1.0182, + "step": 5495 + }, + { + "epoch": 0.2225819506272764, + "grad_norm": 1.1261568069458008, + "learning_rate": 7.933099318604171e-05, + "loss": 0.9818, + "step": 5500 + }, + { + "epoch": 0.22278429785511938, + "grad_norm": 1.1164358854293823, + "learning_rate": 7.931034482758621e-05, + "loss": 1.0395, + "step": 5505 + }, + { + "epoch": 0.22298664508296237, + "grad_norm": 1.0908489227294922, + "learning_rate": 7.92896964691307e-05, + "loss": 0.9929, + "step": 5510 + }, + { + "epoch": 0.22318899231080533, + "grad_norm": 1.1659622192382812, + "learning_rate": 7.926904811067521e-05, + "loss": 0.9867, + "step": 5515 + }, + { + "epoch": 0.22339133953864831, + "grad_norm": 1.1731120347976685, + "learning_rate": 7.92483997522197e-05, + "loss": 1.0232, + "step": 5520 + }, + { + "epoch": 0.2235936867664913, + "grad_norm": 1.2476763725280762, + "learning_rate": 7.92277513937642e-05, + "loss": 1.0217, + "step": 5525 + }, + { + "epoch": 0.2237960339943343, + "grad_norm": 1.1859256029129028, + "learning_rate": 7.92071030353087e-05, + "loss": 1.0695, + "step": 5530 + }, + { + "epoch": 0.22399838122217725, + "grad_norm": 1.3493760824203491, + "learning_rate": 7.91864546768532e-05, + "loss": 0.9726, + "step": 5535 + }, + { + "epoch": 0.22420072845002023, + "grad_norm": 1.1764709949493408, + "learning_rate": 7.916580631839769e-05, + "loss": 1.0557, + "step": 5540 + }, + { + "epoch": 0.22440307567786322, + "grad_norm": 1.2197325229644775, + "learning_rate": 7.914515795994218e-05, + "loss": 0.937, + "step": 5545 + }, + { + "epoch": 0.2246054229057062, + "grad_norm": 1.1879414319992065, + "learning_rate": 7.912450960148668e-05, + "loss": 1.0306, + "step": 5550 + }, + { + "epoch": 0.22480777013354916, + "grad_norm": 1.2065080404281616, + "learning_rate": 7.910386124303119e-05, + "loss": 1.0551, + "step": 5555 + }, + { + "epoch": 0.22501011736139215, + "grad_norm": 1.0465185642242432, + "learning_rate": 7.908321288457568e-05, + "loss": 1.0172, + "step": 5560 + }, + { + "epoch": 0.22521246458923513, + "grad_norm": 1.1151576042175293, + "learning_rate": 7.906256452612017e-05, + "loss": 1.0684, + "step": 5565 + }, + { + "epoch": 0.2254148118170781, + "grad_norm": 1.2189502716064453, + "learning_rate": 7.904191616766467e-05, + "loss": 1.0053, + "step": 5570 + }, + { + "epoch": 0.22561715904492108, + "grad_norm": 1.2066047191619873, + "learning_rate": 7.902126780920918e-05, + "loss": 0.9839, + "step": 5575 + }, + { + "epoch": 0.22581950627276406, + "grad_norm": 1.1031054258346558, + "learning_rate": 7.900061945075367e-05, + "loss": 0.9268, + "step": 5580 + }, + { + "epoch": 0.22602185350060705, + "grad_norm": 1.3175009489059448, + "learning_rate": 7.897997109229817e-05, + "loss": 1.0683, + "step": 5585 + }, + { + "epoch": 0.22622420072845, + "grad_norm": 1.1946015357971191, + "learning_rate": 7.895932273384266e-05, + "loss": 1.0687, + "step": 5590 + }, + { + "epoch": 0.226426547956293, + "grad_norm": 1.0944335460662842, + "learning_rate": 7.893867437538715e-05, + "loss": 0.9761, + "step": 5595 + }, + { + "epoch": 0.22662889518413598, + "grad_norm": 1.2366076707839966, + "learning_rate": 7.891802601693166e-05, + "loss": 1.0127, + "step": 5600 + }, + { + "epoch": 0.22683124241197897, + "grad_norm": 1.172518014907837, + "learning_rate": 7.889737765847616e-05, + "loss": 1.0268, + "step": 5605 + }, + { + "epoch": 0.22703358963982193, + "grad_norm": 1.0506185293197632, + "learning_rate": 7.887672930002065e-05, + "loss": 1.0219, + "step": 5610 + }, + { + "epoch": 0.2272359368676649, + "grad_norm": 1.2196459770202637, + "learning_rate": 7.885608094156514e-05, + "loss": 1.0136, + "step": 5615 + }, + { + "epoch": 0.2274382840955079, + "grad_norm": 1.1773611307144165, + "learning_rate": 7.883543258310965e-05, + "loss": 0.9439, + "step": 5620 + }, + { + "epoch": 0.22764063132335088, + "grad_norm": 1.2770379781723022, + "learning_rate": 7.881478422465415e-05, + "loss": 1.0332, + "step": 5625 + }, + { + "epoch": 0.22784297855119384, + "grad_norm": 1.1902120113372803, + "learning_rate": 7.879413586619864e-05, + "loss": 1.0306, + "step": 5630 + }, + { + "epoch": 0.22804532577903683, + "grad_norm": 1.0333483219146729, + "learning_rate": 7.877348750774313e-05, + "loss": 1.0273, + "step": 5635 + }, + { + "epoch": 0.22824767300687981, + "grad_norm": 1.2635235786437988, + "learning_rate": 7.875283914928764e-05, + "loss": 0.991, + "step": 5640 + }, + { + "epoch": 0.22845002023472277, + "grad_norm": 1.229201078414917, + "learning_rate": 7.873219079083213e-05, + "loss": 1.0122, + "step": 5645 + }, + { + "epoch": 0.22865236746256576, + "grad_norm": 1.1765732765197754, + "learning_rate": 7.871154243237663e-05, + "loss": 1.0348, + "step": 5650 + }, + { + "epoch": 0.22885471469040874, + "grad_norm": 1.1195917129516602, + "learning_rate": 7.869089407392113e-05, + "loss": 0.962, + "step": 5655 + }, + { + "epoch": 0.22905706191825173, + "grad_norm": 1.4746836423873901, + "learning_rate": 7.867024571546562e-05, + "loss": 1.0242, + "step": 5660 + }, + { + "epoch": 0.2292594091460947, + "grad_norm": 1.0648586750030518, + "learning_rate": 7.864959735701011e-05, + "loss": 1.0264, + "step": 5665 + }, + { + "epoch": 0.22946175637393768, + "grad_norm": 1.2590371370315552, + "learning_rate": 7.862894899855462e-05, + "loss": 0.9696, + "step": 5670 + }, + { + "epoch": 0.22966410360178066, + "grad_norm": 1.1653438806533813, + "learning_rate": 7.860830064009912e-05, + "loss": 0.9849, + "step": 5675 + }, + { + "epoch": 0.22986645082962365, + "grad_norm": 1.2064003944396973, + "learning_rate": 7.858765228164361e-05, + "loss": 1.0079, + "step": 5680 + }, + { + "epoch": 0.2300687980574666, + "grad_norm": 1.1991125345230103, + "learning_rate": 7.85670039231881e-05, + "loss": 1.0827, + "step": 5685 + }, + { + "epoch": 0.2302711452853096, + "grad_norm": 1.1990333795547485, + "learning_rate": 7.854635556473261e-05, + "loss": 1.039, + "step": 5690 + }, + { + "epoch": 0.23047349251315258, + "grad_norm": 1.106918215751648, + "learning_rate": 7.85257072062771e-05, + "loss": 0.9845, + "step": 5695 + }, + { + "epoch": 0.23067583974099554, + "grad_norm": 1.1864373683929443, + "learning_rate": 7.85050588478216e-05, + "loss": 0.967, + "step": 5700 + }, + { + "epoch": 0.23087818696883852, + "grad_norm": 1.2981541156768799, + "learning_rate": 7.848441048936611e-05, + "loss": 1.0868, + "step": 5705 + }, + { + "epoch": 0.2310805341966815, + "grad_norm": 1.1637886762619019, + "learning_rate": 7.84637621309106e-05, + "loss": 1.006, + "step": 5710 + }, + { + "epoch": 0.2312828814245245, + "grad_norm": 1.1366996765136719, + "learning_rate": 7.844311377245509e-05, + "loss": 1.0588, + "step": 5715 + }, + { + "epoch": 0.23148522865236745, + "grad_norm": 1.1532763242721558, + "learning_rate": 7.842246541399959e-05, + "loss": 1.0586, + "step": 5720 + }, + { + "epoch": 0.23168757588021044, + "grad_norm": 1.2979742288589478, + "learning_rate": 7.84018170555441e-05, + "loss": 1.0527, + "step": 5725 + }, + { + "epoch": 0.23188992310805343, + "grad_norm": 1.1109341382980347, + "learning_rate": 7.838116869708859e-05, + "loss": 1.0451, + "step": 5730 + }, + { + "epoch": 0.2320922703358964, + "grad_norm": 1.1665804386138916, + "learning_rate": 7.836052033863308e-05, + "loss": 1.0379, + "step": 5735 + }, + { + "epoch": 0.23229461756373937, + "grad_norm": 1.217777967453003, + "learning_rate": 7.833987198017758e-05, + "loss": 0.9944, + "step": 5740 + }, + { + "epoch": 0.23249696479158236, + "grad_norm": 1.1025943756103516, + "learning_rate": 7.831922362172209e-05, + "loss": 0.9897, + "step": 5745 + }, + { + "epoch": 0.23269931201942534, + "grad_norm": 1.0555897951126099, + "learning_rate": 7.829857526326658e-05, + "loss": 1.0565, + "step": 5750 + }, + { + "epoch": 0.2329016592472683, + "grad_norm": 1.1598858833312988, + "learning_rate": 7.827792690481107e-05, + "loss": 0.977, + "step": 5755 + }, + { + "epoch": 0.2331040064751113, + "grad_norm": 1.0308842658996582, + "learning_rate": 7.825727854635557e-05, + "loss": 1.048, + "step": 5760 + }, + { + "epoch": 0.23330635370295427, + "grad_norm": 1.1150622367858887, + "learning_rate": 7.823663018790006e-05, + "loss": 0.9863, + "step": 5765 + }, + { + "epoch": 0.23350870093079726, + "grad_norm": 1.189045786857605, + "learning_rate": 7.821598182944456e-05, + "loss": 1.0165, + "step": 5770 + }, + { + "epoch": 0.23371104815864022, + "grad_norm": 1.2735435962677002, + "learning_rate": 7.819533347098907e-05, + "loss": 1.038, + "step": 5775 + }, + { + "epoch": 0.2339133953864832, + "grad_norm": 1.2038140296936035, + "learning_rate": 7.817468511253355e-05, + "loss": 1.0072, + "step": 5780 + }, + { + "epoch": 0.2341157426143262, + "grad_norm": 1.0952842235565186, + "learning_rate": 7.815403675407805e-05, + "loss": 0.9773, + "step": 5785 + }, + { + "epoch": 0.23431808984216917, + "grad_norm": 1.3182307481765747, + "learning_rate": 7.813338839562255e-05, + "loss": 0.9853, + "step": 5790 + }, + { + "epoch": 0.23452043707001213, + "grad_norm": 1.1275579929351807, + "learning_rate": 7.811274003716706e-05, + "loss": 0.9792, + "step": 5795 + }, + { + "epoch": 0.23472278429785512, + "grad_norm": 1.2322643995285034, + "learning_rate": 7.809209167871155e-05, + "loss": 0.9901, + "step": 5800 + }, + { + "epoch": 0.2349251315256981, + "grad_norm": 1.1326335668563843, + "learning_rate": 7.807144332025604e-05, + "loss": 0.977, + "step": 5805 + }, + { + "epoch": 0.23512747875354106, + "grad_norm": 1.1416250467300415, + "learning_rate": 7.805079496180054e-05, + "loss": 0.9662, + "step": 5810 + }, + { + "epoch": 0.23532982598138405, + "grad_norm": 1.2562077045440674, + "learning_rate": 7.803014660334503e-05, + "loss": 0.9903, + "step": 5815 + }, + { + "epoch": 0.23553217320922704, + "grad_norm": 1.15487539768219, + "learning_rate": 7.800949824488954e-05, + "loss": 1.0253, + "step": 5820 + }, + { + "epoch": 0.23573452043707002, + "grad_norm": 1.0592039823532104, + "learning_rate": 7.798884988643403e-05, + "loss": 0.944, + "step": 5825 + }, + { + "epoch": 0.23593686766491298, + "grad_norm": 1.2863718271255493, + "learning_rate": 7.796820152797853e-05, + "loss": 1.0292, + "step": 5830 + }, + { + "epoch": 0.23613921489275597, + "grad_norm": 1.145825743675232, + "learning_rate": 7.794755316952302e-05, + "loss": 1.0172, + "step": 5835 + }, + { + "epoch": 0.23634156212059895, + "grad_norm": 1.0824973583221436, + "learning_rate": 7.792690481106753e-05, + "loss": 0.9922, + "step": 5840 + }, + { + "epoch": 0.23654390934844194, + "grad_norm": 1.1497865915298462, + "learning_rate": 7.790625645261203e-05, + "loss": 1.0203, + "step": 5845 + }, + { + "epoch": 0.2367462565762849, + "grad_norm": 1.1841051578521729, + "learning_rate": 7.788560809415651e-05, + "loss": 1.0286, + "step": 5850 + }, + { + "epoch": 0.23694860380412788, + "grad_norm": 1.1089333295822144, + "learning_rate": 7.786495973570101e-05, + "loss": 1.0517, + "step": 5855 + }, + { + "epoch": 0.23715095103197087, + "grad_norm": 1.3060795068740845, + "learning_rate": 7.784431137724552e-05, + "loss": 1.0358, + "step": 5860 + }, + { + "epoch": 0.23735329825981383, + "grad_norm": 1.2065348625183105, + "learning_rate": 7.782366301879e-05, + "loss": 1.0508, + "step": 5865 + }, + { + "epoch": 0.2375556454876568, + "grad_norm": 1.0978989601135254, + "learning_rate": 7.780301466033451e-05, + "loss": 0.9901, + "step": 5870 + }, + { + "epoch": 0.2377579927154998, + "grad_norm": 1.1329617500305176, + "learning_rate": 7.7782366301879e-05, + "loss": 1.0113, + "step": 5875 + }, + { + "epoch": 0.23796033994334279, + "grad_norm": 1.1056050062179565, + "learning_rate": 7.77617179434235e-05, + "loss": 1.0738, + "step": 5880 + }, + { + "epoch": 0.23816268717118574, + "grad_norm": 1.3044637441635132, + "learning_rate": 7.7741069584968e-05, + "loss": 1.0375, + "step": 5885 + }, + { + "epoch": 0.23836503439902873, + "grad_norm": 1.2118210792541504, + "learning_rate": 7.77204212265125e-05, + "loss": 1.0014, + "step": 5890 + }, + { + "epoch": 0.23856738162687172, + "grad_norm": 1.3539366722106934, + "learning_rate": 7.769977286805699e-05, + "loss": 0.978, + "step": 5895 + }, + { + "epoch": 0.2387697288547147, + "grad_norm": 1.1183351278305054, + "learning_rate": 7.767912450960148e-05, + "loss": 0.9477, + "step": 5900 + }, + { + "epoch": 0.23897207608255766, + "grad_norm": 1.2246525287628174, + "learning_rate": 7.765847615114599e-05, + "loss": 1.0384, + "step": 5905 + }, + { + "epoch": 0.23917442331040065, + "grad_norm": 1.2153912782669067, + "learning_rate": 7.763782779269049e-05, + "loss": 0.9998, + "step": 5910 + }, + { + "epoch": 0.23937677053824363, + "grad_norm": 1.2209187746047974, + "learning_rate": 7.761717943423498e-05, + "loss": 1.0025, + "step": 5915 + }, + { + "epoch": 0.2395791177660866, + "grad_norm": 1.1944117546081543, + "learning_rate": 7.759653107577948e-05, + "loss": 1.0176, + "step": 5920 + }, + { + "epoch": 0.23978146499392958, + "grad_norm": 1.1967753171920776, + "learning_rate": 7.757588271732397e-05, + "loss": 1.0214, + "step": 5925 + }, + { + "epoch": 0.23998381222177256, + "grad_norm": 1.0911521911621094, + "learning_rate": 7.755523435886848e-05, + "loss": 0.9598, + "step": 5930 + }, + { + "epoch": 0.24018615944961555, + "grad_norm": 1.2010823488235474, + "learning_rate": 7.753458600041297e-05, + "loss": 0.9866, + "step": 5935 + }, + { + "epoch": 0.2403885066774585, + "grad_norm": 1.1828703880310059, + "learning_rate": 7.751393764195747e-05, + "loss": 0.9887, + "step": 5940 + }, + { + "epoch": 0.2405908539053015, + "grad_norm": 1.1219539642333984, + "learning_rate": 7.749328928350196e-05, + "loss": 1.0612, + "step": 5945 + }, + { + "epoch": 0.24079320113314448, + "grad_norm": 1.1368293762207031, + "learning_rate": 7.747264092504645e-05, + "loss": 0.9991, + "step": 5950 + }, + { + "epoch": 0.24099554836098747, + "grad_norm": 1.1059019565582275, + "learning_rate": 7.745199256659096e-05, + "loss": 1.0682, + "step": 5955 + }, + { + "epoch": 0.24119789558883042, + "grad_norm": 1.0998921394348145, + "learning_rate": 7.743134420813546e-05, + "loss": 1.0012, + "step": 5960 + }, + { + "epoch": 0.2414002428166734, + "grad_norm": 1.2579197883605957, + "learning_rate": 7.741069584967997e-05, + "loss": 1.0193, + "step": 5965 + }, + { + "epoch": 0.2416025900445164, + "grad_norm": 1.1530227661132812, + "learning_rate": 7.739004749122444e-05, + "loss": 1.024, + "step": 5970 + }, + { + "epoch": 0.24180493727235935, + "grad_norm": 1.229365587234497, + "learning_rate": 7.736939913276895e-05, + "loss": 1.0177, + "step": 5975 + }, + { + "epoch": 0.24200728450020234, + "grad_norm": 1.150947093963623, + "learning_rate": 7.734875077431345e-05, + "loss": 1.0463, + "step": 5980 + }, + { + "epoch": 0.24220963172804533, + "grad_norm": 1.0764719247817993, + "learning_rate": 7.732810241585794e-05, + "loss": 1.0137, + "step": 5985 + }, + { + "epoch": 0.2424119789558883, + "grad_norm": 1.0965443849563599, + "learning_rate": 7.730745405740245e-05, + "loss": 1.0019, + "step": 5990 + }, + { + "epoch": 0.24261432618373127, + "grad_norm": 1.1556476354599, + "learning_rate": 7.728680569894694e-05, + "loss": 0.9645, + "step": 5995 + }, + { + "epoch": 0.24281667341157426, + "grad_norm": 1.1066598892211914, + "learning_rate": 7.726615734049143e-05, + "loss": 1.0096, + "step": 6000 + }, + { + "epoch": 0.24301902063941724, + "grad_norm": 1.0250757932662964, + "learning_rate": 7.724550898203593e-05, + "loss": 1.0202, + "step": 6005 + }, + { + "epoch": 0.24322136786726023, + "grad_norm": 1.1670464277267456, + "learning_rate": 7.722486062358044e-05, + "loss": 1.004, + "step": 6010 + }, + { + "epoch": 0.2434237150951032, + "grad_norm": 1.0662355422973633, + "learning_rate": 7.720421226512493e-05, + "loss": 1.0175, + "step": 6015 + }, + { + "epoch": 0.24362606232294617, + "grad_norm": 1.116700291633606, + "learning_rate": 7.718356390666942e-05, + "loss": 1.0203, + "step": 6020 + }, + { + "epoch": 0.24382840955078916, + "grad_norm": 1.2482693195343018, + "learning_rate": 7.716291554821392e-05, + "loss": 1.0136, + "step": 6025 + }, + { + "epoch": 0.24403075677863215, + "grad_norm": 1.1996208429336548, + "learning_rate": 7.714226718975842e-05, + "loss": 0.9716, + "step": 6030 + }, + { + "epoch": 0.2442331040064751, + "grad_norm": 1.2952680587768555, + "learning_rate": 7.712161883130291e-05, + "loss": 1.0123, + "step": 6035 + }, + { + "epoch": 0.2444354512343181, + "grad_norm": 1.1774368286132812, + "learning_rate": 7.71009704728474e-05, + "loss": 1.0258, + "step": 6040 + }, + { + "epoch": 0.24463779846216108, + "grad_norm": 1.0907679796218872, + "learning_rate": 7.708032211439191e-05, + "loss": 1.0059, + "step": 6045 + }, + { + "epoch": 0.24484014569000404, + "grad_norm": 1.2474931478500366, + "learning_rate": 7.70596737559364e-05, + "loss": 1.0642, + "step": 6050 + }, + { + "epoch": 0.24504249291784702, + "grad_norm": 1.1548250913619995, + "learning_rate": 7.70390253974809e-05, + "loss": 1.0278, + "step": 6055 + }, + { + "epoch": 0.24524484014569, + "grad_norm": 1.126945972442627, + "learning_rate": 7.701837703902541e-05, + "loss": 0.9554, + "step": 6060 + }, + { + "epoch": 0.245447187373533, + "grad_norm": 1.265137791633606, + "learning_rate": 7.69977286805699e-05, + "loss": 1.0365, + "step": 6065 + }, + { + "epoch": 0.24564953460137595, + "grad_norm": 1.3145203590393066, + "learning_rate": 7.697708032211439e-05, + "loss": 1.0036, + "step": 6070 + }, + { + "epoch": 0.24585188182921894, + "grad_norm": 1.2152060270309448, + "learning_rate": 7.695643196365889e-05, + "loss": 0.9918, + "step": 6075 + }, + { + "epoch": 0.24605422905706192, + "grad_norm": 1.1506688594818115, + "learning_rate": 7.69357836052034e-05, + "loss": 1.0007, + "step": 6080 + }, + { + "epoch": 0.2462565762849049, + "grad_norm": 1.2358943223953247, + "learning_rate": 7.691513524674789e-05, + "loss": 1.001, + "step": 6085 + }, + { + "epoch": 0.24645892351274787, + "grad_norm": 1.3536697626113892, + "learning_rate": 7.689448688829238e-05, + "loss": 1.0041, + "step": 6090 + }, + { + "epoch": 0.24666127074059085, + "grad_norm": 1.1014422178268433, + "learning_rate": 7.687383852983688e-05, + "loss": 1.0577, + "step": 6095 + }, + { + "epoch": 0.24686361796843384, + "grad_norm": 1.2396435737609863, + "learning_rate": 7.685319017138139e-05, + "loss": 1.0578, + "step": 6100 + }, + { + "epoch": 0.2470659651962768, + "grad_norm": 1.0494437217712402, + "learning_rate": 7.683254181292588e-05, + "loss": 1.0146, + "step": 6105 + }, + { + "epoch": 0.24726831242411978, + "grad_norm": 1.1579269170761108, + "learning_rate": 7.681189345447037e-05, + "loss": 1.0344, + "step": 6110 + }, + { + "epoch": 0.24747065965196277, + "grad_norm": 1.0838518142700195, + "learning_rate": 7.679124509601487e-05, + "loss": 1.0192, + "step": 6115 + }, + { + "epoch": 0.24767300687980576, + "grad_norm": 1.1838576793670654, + "learning_rate": 7.677059673755936e-05, + "loss": 0.9935, + "step": 6120 + }, + { + "epoch": 0.24787535410764872, + "grad_norm": 1.1222121715545654, + "learning_rate": 7.674994837910387e-05, + "loss": 0.9936, + "step": 6125 + }, + { + "epoch": 0.2480777013354917, + "grad_norm": 1.2552130222320557, + "learning_rate": 7.672930002064837e-05, + "loss": 1.0226, + "step": 6130 + }, + { + "epoch": 0.2482800485633347, + "grad_norm": 1.1589610576629639, + "learning_rate": 7.670865166219286e-05, + "loss": 1.0136, + "step": 6135 + }, + { + "epoch": 0.24848239579117767, + "grad_norm": 1.0672346353530884, + "learning_rate": 7.668800330373735e-05, + "loss": 0.9752, + "step": 6140 + }, + { + "epoch": 0.24868474301902063, + "grad_norm": 1.1981768608093262, + "learning_rate": 7.666735494528186e-05, + "loss": 1.0218, + "step": 6145 + }, + { + "epoch": 0.24888709024686362, + "grad_norm": 1.2581173181533813, + "learning_rate": 7.664670658682636e-05, + "loss": 0.9783, + "step": 6150 + }, + { + "epoch": 0.2490894374747066, + "grad_norm": 1.217789649963379, + "learning_rate": 7.662605822837085e-05, + "loss": 1.0005, + "step": 6155 + }, + { + "epoch": 0.24929178470254956, + "grad_norm": 1.1852567195892334, + "learning_rate": 7.660540986991534e-05, + "loss": 1.0515, + "step": 6160 + }, + { + "epoch": 0.24949413193039255, + "grad_norm": 1.1580768823623657, + "learning_rate": 7.658476151145984e-05, + "loss": 1.0018, + "step": 6165 + }, + { + "epoch": 0.24969647915823553, + "grad_norm": 1.112929344177246, + "learning_rate": 7.656411315300434e-05, + "loss": 0.9975, + "step": 6170 + }, + { + "epoch": 0.24989882638607852, + "grad_norm": 1.1734755039215088, + "learning_rate": 7.654346479454884e-05, + "loss": 0.9948, + "step": 6175 + }, + { + "epoch": 0.2501011736139215, + "grad_norm": 1.334947943687439, + "learning_rate": 7.652281643609333e-05, + "loss": 0.9802, + "step": 6180 + }, + { + "epoch": 0.2503035208417645, + "grad_norm": 1.2167521715164185, + "learning_rate": 7.650216807763783e-05, + "loss": 0.9401, + "step": 6185 + }, + { + "epoch": 0.25050586806960745, + "grad_norm": 1.1352683305740356, + "learning_rate": 7.648151971918232e-05, + "loss": 0.952, + "step": 6190 + }, + { + "epoch": 0.2507082152974504, + "grad_norm": 1.062712550163269, + "learning_rate": 7.646087136072683e-05, + "loss": 1.0991, + "step": 6195 + }, + { + "epoch": 0.2509105625252934, + "grad_norm": 1.1996139287948608, + "learning_rate": 7.644022300227133e-05, + "loss": 1.0518, + "step": 6200 + }, + { + "epoch": 0.2511129097531364, + "grad_norm": 1.1053889989852905, + "learning_rate": 7.641957464381582e-05, + "loss": 1.0319, + "step": 6205 + }, + { + "epoch": 0.25131525698097934, + "grad_norm": 1.2459156513214111, + "learning_rate": 7.639892628536031e-05, + "loss": 1.0318, + "step": 6210 + }, + { + "epoch": 0.25151760420882235, + "grad_norm": 1.159220576286316, + "learning_rate": 7.637827792690482e-05, + "loss": 1.0361, + "step": 6215 + }, + { + "epoch": 0.2517199514366653, + "grad_norm": 1.0500812530517578, + "learning_rate": 7.635762956844931e-05, + "loss": 1.0122, + "step": 6220 + }, + { + "epoch": 0.25192229866450827, + "grad_norm": 1.1624324321746826, + "learning_rate": 7.633698120999381e-05, + "loss": 1.0061, + "step": 6225 + }, + { + "epoch": 0.2521246458923513, + "grad_norm": 1.254845380783081, + "learning_rate": 7.63163328515383e-05, + "loss": 1.0205, + "step": 6230 + }, + { + "epoch": 0.25232699312019424, + "grad_norm": 1.1832902431488037, + "learning_rate": 7.62956844930828e-05, + "loss": 1.0651, + "step": 6235 + }, + { + "epoch": 0.25252934034803726, + "grad_norm": 1.1697665452957153, + "learning_rate": 7.62750361346273e-05, + "loss": 1.0184, + "step": 6240 + }, + { + "epoch": 0.2527316875758802, + "grad_norm": 1.0969288349151611, + "learning_rate": 7.62543877761718e-05, + "loss": 0.9464, + "step": 6245 + }, + { + "epoch": 0.2529340348037232, + "grad_norm": 1.3374019861221313, + "learning_rate": 7.62337394177163e-05, + "loss": 1.0608, + "step": 6250 + }, + { + "epoch": 0.2531363820315662, + "grad_norm": 1.2375236749649048, + "learning_rate": 7.621309105926078e-05, + "loss": 0.9866, + "step": 6255 + }, + { + "epoch": 0.25333872925940915, + "grad_norm": 1.2215721607208252, + "learning_rate": 7.619244270080529e-05, + "loss": 1.0422, + "step": 6260 + }, + { + "epoch": 0.2535410764872521, + "grad_norm": 1.124558687210083, + "learning_rate": 7.617179434234979e-05, + "loss": 0.9831, + "step": 6265 + }, + { + "epoch": 0.2537434237150951, + "grad_norm": 1.1476385593414307, + "learning_rate": 7.615114598389428e-05, + "loss": 1.0262, + "step": 6270 + }, + { + "epoch": 0.2539457709429381, + "grad_norm": 1.2365763187408447, + "learning_rate": 7.613049762543879e-05, + "loss": 1.0476, + "step": 6275 + }, + { + "epoch": 0.25414811817078103, + "grad_norm": 1.1760541200637817, + "learning_rate": 7.610984926698328e-05, + "loss": 1.0112, + "step": 6280 + }, + { + "epoch": 0.25435046539862405, + "grad_norm": 1.1309133768081665, + "learning_rate": 7.608920090852778e-05, + "loss": 1.0201, + "step": 6285 + }, + { + "epoch": 0.254552812626467, + "grad_norm": 1.133035659790039, + "learning_rate": 7.606855255007227e-05, + "loss": 1.0136, + "step": 6290 + }, + { + "epoch": 0.25475515985431, + "grad_norm": 1.1882902383804321, + "learning_rate": 7.604790419161677e-05, + "loss": 0.9954, + "step": 6295 + }, + { + "epoch": 0.254957507082153, + "grad_norm": 1.1148749589920044, + "learning_rate": 7.602725583316126e-05, + "loss": 1.0052, + "step": 6300 + }, + { + "epoch": 0.25515985430999594, + "grad_norm": 1.1846961975097656, + "learning_rate": 7.600660747470576e-05, + "loss": 0.9672, + "step": 6305 + }, + { + "epoch": 0.25536220153783895, + "grad_norm": 1.0866841077804565, + "learning_rate": 7.598595911625026e-05, + "loss": 1.0197, + "step": 6310 + }, + { + "epoch": 0.2555645487656819, + "grad_norm": 1.1378873586654663, + "learning_rate": 7.596531075779476e-05, + "loss": 1.0278, + "step": 6315 + }, + { + "epoch": 0.25576689599352487, + "grad_norm": 1.1977087259292603, + "learning_rate": 7.594466239933927e-05, + "loss": 0.988, + "step": 6320 + }, + { + "epoch": 0.2559692432213679, + "grad_norm": 1.1284993886947632, + "learning_rate": 7.592401404088374e-05, + "loss": 1.0356, + "step": 6325 + }, + { + "epoch": 0.25617159044921084, + "grad_norm": 1.0692222118377686, + "learning_rate": 7.590336568242825e-05, + "loss": 0.9913, + "step": 6330 + }, + { + "epoch": 0.2563739376770538, + "grad_norm": 1.2074707746505737, + "learning_rate": 7.588271732397275e-05, + "loss": 1.0101, + "step": 6335 + }, + { + "epoch": 0.2565762849048968, + "grad_norm": 1.0940556526184082, + "learning_rate": 7.586206896551724e-05, + "loss": 1.0192, + "step": 6340 + }, + { + "epoch": 0.25677863213273977, + "grad_norm": 1.2523458003997803, + "learning_rate": 7.584142060706175e-05, + "loss": 1.0089, + "step": 6345 + }, + { + "epoch": 0.2569809793605828, + "grad_norm": 1.1117687225341797, + "learning_rate": 7.582077224860624e-05, + "loss": 1.0206, + "step": 6350 + }, + { + "epoch": 0.25718332658842574, + "grad_norm": 1.1576602458953857, + "learning_rate": 7.580012389015073e-05, + "loss": 0.9893, + "step": 6355 + }, + { + "epoch": 0.2573856738162687, + "grad_norm": 1.1742234230041504, + "learning_rate": 7.577947553169523e-05, + "loss": 1.0281, + "step": 6360 + }, + { + "epoch": 0.2575880210441117, + "grad_norm": 1.2967197895050049, + "learning_rate": 7.575882717323974e-05, + "loss": 1.0162, + "step": 6365 + }, + { + "epoch": 0.2577903682719547, + "grad_norm": 1.1597435474395752, + "learning_rate": 7.573817881478423e-05, + "loss": 0.9695, + "step": 6370 + }, + { + "epoch": 0.25799271549979763, + "grad_norm": 1.1363296508789062, + "learning_rate": 7.571753045632872e-05, + "loss": 1.0629, + "step": 6375 + }, + { + "epoch": 0.25819506272764065, + "grad_norm": 1.2326802015304565, + "learning_rate": 7.569688209787322e-05, + "loss": 0.972, + "step": 6380 + }, + { + "epoch": 0.2583974099554836, + "grad_norm": 1.0850800275802612, + "learning_rate": 7.567623373941773e-05, + "loss": 0.9562, + "step": 6385 + }, + { + "epoch": 0.25859975718332656, + "grad_norm": 1.3475916385650635, + "learning_rate": 7.565558538096222e-05, + "loss": 0.96, + "step": 6390 + }, + { + "epoch": 0.2588021044111696, + "grad_norm": 1.1954843997955322, + "learning_rate": 7.56349370225067e-05, + "loss": 1.0534, + "step": 6395 + }, + { + "epoch": 0.25900445163901253, + "grad_norm": 1.2429571151733398, + "learning_rate": 7.561428866405121e-05, + "loss": 1.0081, + "step": 6400 + }, + { + "epoch": 0.25920679886685555, + "grad_norm": 1.2253036499023438, + "learning_rate": 7.559364030559571e-05, + "loss": 0.9653, + "step": 6405 + }, + { + "epoch": 0.2594091460946985, + "grad_norm": 1.1782009601593018, + "learning_rate": 7.55729919471402e-05, + "loss": 1.0345, + "step": 6410 + }, + { + "epoch": 0.25961149332254146, + "grad_norm": 1.2082030773162842, + "learning_rate": 7.555234358868471e-05, + "loss": 1.0469, + "step": 6415 + }, + { + "epoch": 0.2598138405503845, + "grad_norm": 1.2173150777816772, + "learning_rate": 7.55316952302292e-05, + "loss": 0.991, + "step": 6420 + }, + { + "epoch": 0.26001618777822744, + "grad_norm": 1.286616325378418, + "learning_rate": 7.551104687177369e-05, + "loss": 0.9516, + "step": 6425 + }, + { + "epoch": 0.2602185350060704, + "grad_norm": 1.1527777910232544, + "learning_rate": 7.54903985133182e-05, + "loss": 1.0253, + "step": 6430 + }, + { + "epoch": 0.2604208822339134, + "grad_norm": 1.1592366695404053, + "learning_rate": 7.54697501548627e-05, + "loss": 1.0062, + "step": 6435 + }, + { + "epoch": 0.26062322946175637, + "grad_norm": 1.2501929998397827, + "learning_rate": 7.544910179640719e-05, + "loss": 1.0231, + "step": 6440 + }, + { + "epoch": 0.2608255766895993, + "grad_norm": 1.3574258089065552, + "learning_rate": 7.542845343795168e-05, + "loss": 0.9926, + "step": 6445 + }, + { + "epoch": 0.26102792391744234, + "grad_norm": 1.1337406635284424, + "learning_rate": 7.540780507949618e-05, + "loss": 0.9602, + "step": 6450 + }, + { + "epoch": 0.2612302711452853, + "grad_norm": 1.170088291168213, + "learning_rate": 7.538715672104069e-05, + "loss": 1.0621, + "step": 6455 + }, + { + "epoch": 0.2614326183731283, + "grad_norm": 1.0618562698364258, + "learning_rate": 7.536650836258518e-05, + "loss": 1.016, + "step": 6460 + }, + { + "epoch": 0.26163496560097127, + "grad_norm": 1.1312986612319946, + "learning_rate": 7.534586000412968e-05, + "loss": 1.029, + "step": 6465 + }, + { + "epoch": 0.26183731282881423, + "grad_norm": 1.052512764930725, + "learning_rate": 7.532521164567417e-05, + "loss": 1.0404, + "step": 6470 + }, + { + "epoch": 0.26203966005665724, + "grad_norm": 1.1444694995880127, + "learning_rate": 7.530456328721866e-05, + "loss": 0.9828, + "step": 6475 + }, + { + "epoch": 0.2622420072845002, + "grad_norm": 1.016725778579712, + "learning_rate": 7.528391492876317e-05, + "loss": 1.0533, + "step": 6480 + }, + { + "epoch": 0.26244435451234316, + "grad_norm": 1.1582973003387451, + "learning_rate": 7.526326657030767e-05, + "loss": 0.9797, + "step": 6485 + }, + { + "epoch": 0.2626467017401862, + "grad_norm": 1.1264119148254395, + "learning_rate": 7.524261821185216e-05, + "loss": 0.9768, + "step": 6490 + }, + { + "epoch": 0.26284904896802913, + "grad_norm": 1.236449956893921, + "learning_rate": 7.522196985339665e-05, + "loss": 0.9688, + "step": 6495 + }, + { + "epoch": 0.2630513961958721, + "grad_norm": 1.2319833040237427, + "learning_rate": 7.520132149494116e-05, + "loss": 1.0407, + "step": 6500 + }, + { + "epoch": 0.2632537434237151, + "grad_norm": 1.259884238243103, + "learning_rate": 7.518067313648566e-05, + "loss": 1.0292, + "step": 6505 + }, + { + "epoch": 0.26345609065155806, + "grad_norm": 1.0968300104141235, + "learning_rate": 7.516002477803015e-05, + "loss": 1.0098, + "step": 6510 + }, + { + "epoch": 0.2636584378794011, + "grad_norm": 1.209838628768921, + "learning_rate": 7.513937641957464e-05, + "loss": 1.0292, + "step": 6515 + }, + { + "epoch": 0.26386078510724403, + "grad_norm": 1.2530168294906616, + "learning_rate": 7.511872806111915e-05, + "loss": 1.0444, + "step": 6520 + }, + { + "epoch": 0.264063132335087, + "grad_norm": 1.2566606998443604, + "learning_rate": 7.509807970266364e-05, + "loss": 1.0087, + "step": 6525 + }, + { + "epoch": 0.26426547956293, + "grad_norm": 1.1261377334594727, + "learning_rate": 7.507743134420814e-05, + "loss": 1.0128, + "step": 6530 + }, + { + "epoch": 0.26446782679077296, + "grad_norm": 1.2107958793640137, + "learning_rate": 7.505678298575264e-05, + "loss": 1.0155, + "step": 6535 + }, + { + "epoch": 0.2646701740186159, + "grad_norm": 1.0442723035812378, + "learning_rate": 7.503613462729714e-05, + "loss": 0.9896, + "step": 6540 + }, + { + "epoch": 0.26487252124645894, + "grad_norm": 1.1261143684387207, + "learning_rate": 7.501548626884163e-05, + "loss": 0.9892, + "step": 6545 + }, + { + "epoch": 0.2650748684743019, + "grad_norm": 1.144325613975525, + "learning_rate": 7.499483791038613e-05, + "loss": 1.0115, + "step": 6550 + }, + { + "epoch": 0.26527721570214485, + "grad_norm": 1.214985966682434, + "learning_rate": 7.497418955193063e-05, + "loss": 1.0264, + "step": 6555 + }, + { + "epoch": 0.26547956292998787, + "grad_norm": 1.2811050415039062, + "learning_rate": 7.495354119347512e-05, + "loss": 0.9441, + "step": 6560 + }, + { + "epoch": 0.2656819101578308, + "grad_norm": 1.319085955619812, + "learning_rate": 7.493289283501961e-05, + "loss": 1.0454, + "step": 6565 + }, + { + "epoch": 0.26588425738567384, + "grad_norm": 1.091052770614624, + "learning_rate": 7.491224447656412e-05, + "loss": 1.1001, + "step": 6570 + }, + { + "epoch": 0.2660866046135168, + "grad_norm": 1.2728432416915894, + "learning_rate": 7.489159611810861e-05, + "loss": 1.0169, + "step": 6575 + }, + { + "epoch": 0.26628895184135976, + "grad_norm": 1.3494826555252075, + "learning_rate": 7.487094775965311e-05, + "loss": 1.0424, + "step": 6580 + }, + { + "epoch": 0.26649129906920277, + "grad_norm": 1.1133043766021729, + "learning_rate": 7.48502994011976e-05, + "loss": 1.0255, + "step": 6585 + }, + { + "epoch": 0.26669364629704573, + "grad_norm": 1.2886162996292114, + "learning_rate": 7.482965104274211e-05, + "loss": 1.0015, + "step": 6590 + }, + { + "epoch": 0.2668959935248887, + "grad_norm": 1.2574056386947632, + "learning_rate": 7.48090026842866e-05, + "loss": 1.0546, + "step": 6595 + }, + { + "epoch": 0.2670983407527317, + "grad_norm": 1.2051347494125366, + "learning_rate": 7.47883543258311e-05, + "loss": 0.9476, + "step": 6600 + }, + { + "epoch": 0.26730068798057466, + "grad_norm": 1.0692707300186157, + "learning_rate": 7.47677059673756e-05, + "loss": 0.9864, + "step": 6605 + }, + { + "epoch": 0.26750303520841767, + "grad_norm": 1.1030018329620361, + "learning_rate": 7.474705760892008e-05, + "loss": 1.026, + "step": 6610 + }, + { + "epoch": 0.26770538243626063, + "grad_norm": 1.1978520154953003, + "learning_rate": 7.472640925046459e-05, + "loss": 0.9951, + "step": 6615 + }, + { + "epoch": 0.2679077296641036, + "grad_norm": 1.2217199802398682, + "learning_rate": 7.470576089200909e-05, + "loss": 1.0317, + "step": 6620 + }, + { + "epoch": 0.2681100768919466, + "grad_norm": 1.103721261024475, + "learning_rate": 7.468511253355358e-05, + "loss": 1.0316, + "step": 6625 + }, + { + "epoch": 0.26831242411978956, + "grad_norm": 1.1881301403045654, + "learning_rate": 7.466446417509809e-05, + "loss": 0.9826, + "step": 6630 + }, + { + "epoch": 0.2685147713476325, + "grad_norm": 1.2023499011993408, + "learning_rate": 7.464381581664258e-05, + "loss": 1.0445, + "step": 6635 + }, + { + "epoch": 0.26871711857547553, + "grad_norm": 1.1933207511901855, + "learning_rate": 7.462316745818708e-05, + "loss": 0.9749, + "step": 6640 + }, + { + "epoch": 0.2689194658033185, + "grad_norm": 1.2708888053894043, + "learning_rate": 7.460251909973157e-05, + "loss": 1.0065, + "step": 6645 + }, + { + "epoch": 0.26912181303116145, + "grad_norm": 1.2242518663406372, + "learning_rate": 7.458187074127608e-05, + "loss": 1.0361, + "step": 6650 + }, + { + "epoch": 0.26932416025900446, + "grad_norm": 1.0936435461044312, + "learning_rate": 7.456122238282057e-05, + "loss": 1.0353, + "step": 6655 + }, + { + "epoch": 0.2695265074868474, + "grad_norm": 1.0418567657470703, + "learning_rate": 7.454057402436506e-05, + "loss": 1.0422, + "step": 6660 + }, + { + "epoch": 0.26972885471469044, + "grad_norm": 1.101579189300537, + "learning_rate": 7.451992566590956e-05, + "loss": 1.0011, + "step": 6665 + }, + { + "epoch": 0.2699312019425334, + "grad_norm": 1.1149858236312866, + "learning_rate": 7.449927730745406e-05, + "loss": 1.0366, + "step": 6670 + }, + { + "epoch": 0.27013354917037635, + "grad_norm": 1.158974528312683, + "learning_rate": 7.447862894899857e-05, + "loss": 1.0378, + "step": 6675 + }, + { + "epoch": 0.27033589639821937, + "grad_norm": 1.1632798910140991, + "learning_rate": 7.445798059054306e-05, + "loss": 1.01, + "step": 6680 + }, + { + "epoch": 0.2705382436260623, + "grad_norm": 1.1123712062835693, + "learning_rate": 7.443733223208755e-05, + "loss": 0.9903, + "step": 6685 + }, + { + "epoch": 0.2707405908539053, + "grad_norm": 1.2239755392074585, + "learning_rate": 7.441668387363205e-05, + "loss": 0.9864, + "step": 6690 + }, + { + "epoch": 0.2709429380817483, + "grad_norm": 1.1682302951812744, + "learning_rate": 7.439603551517654e-05, + "loss": 1.0237, + "step": 6695 + }, + { + "epoch": 0.27114528530959126, + "grad_norm": 1.0680562257766724, + "learning_rate": 7.437538715672105e-05, + "loss": 1.022, + "step": 6700 + }, + { + "epoch": 0.2713476325374342, + "grad_norm": 1.2862532138824463, + "learning_rate": 7.435473879826554e-05, + "loss": 1.0518, + "step": 6705 + }, + { + "epoch": 0.2715499797652772, + "grad_norm": 1.133471965789795, + "learning_rate": 7.433409043981003e-05, + "loss": 0.9525, + "step": 6710 + }, + { + "epoch": 0.2717523269931202, + "grad_norm": 1.1541469097137451, + "learning_rate": 7.431344208135453e-05, + "loss": 1.0238, + "step": 6715 + }, + { + "epoch": 0.2719546742209632, + "grad_norm": 1.2197669744491577, + "learning_rate": 7.429279372289904e-05, + "loss": 1.0381, + "step": 6720 + }, + { + "epoch": 0.27215702144880616, + "grad_norm": 1.1882134675979614, + "learning_rate": 7.427214536444353e-05, + "loss": 1.0635, + "step": 6725 + }, + { + "epoch": 0.2723593686766491, + "grad_norm": 1.17057204246521, + "learning_rate": 7.425149700598802e-05, + "loss": 1.0651, + "step": 6730 + }, + { + "epoch": 0.27256171590449213, + "grad_norm": 1.131810188293457, + "learning_rate": 7.423084864753252e-05, + "loss": 0.9965, + "step": 6735 + }, + { + "epoch": 0.2727640631323351, + "grad_norm": 1.1254119873046875, + "learning_rate": 7.421020028907703e-05, + "loss": 1.0012, + "step": 6740 + }, + { + "epoch": 0.27296641036017805, + "grad_norm": 1.146243929862976, + "learning_rate": 7.418955193062152e-05, + "loss": 1.0263, + "step": 6745 + }, + { + "epoch": 0.27316875758802106, + "grad_norm": 1.1290664672851562, + "learning_rate": 7.416890357216602e-05, + "loss": 1.0291, + "step": 6750 + }, + { + "epoch": 0.273371104815864, + "grad_norm": 1.1978673934936523, + "learning_rate": 7.414825521371051e-05, + "loss": 1.0562, + "step": 6755 + }, + { + "epoch": 0.273573452043707, + "grad_norm": 1.1925079822540283, + "learning_rate": 7.412760685525502e-05, + "loss": 1.0621, + "step": 6760 + }, + { + "epoch": 0.27377579927155, + "grad_norm": 1.1550565958023071, + "learning_rate": 7.41069584967995e-05, + "loss": 0.9679, + "step": 6765 + }, + { + "epoch": 0.27397814649939295, + "grad_norm": 1.2220369577407837, + "learning_rate": 7.408631013834401e-05, + "loss": 1.0226, + "step": 6770 + }, + { + "epoch": 0.27418049372723596, + "grad_norm": 1.0354042053222656, + "learning_rate": 7.40656617798885e-05, + "loss": 1.0115, + "step": 6775 + }, + { + "epoch": 0.2743828409550789, + "grad_norm": 1.2225804328918457, + "learning_rate": 7.404501342143299e-05, + "loss": 1.0814, + "step": 6780 + }, + { + "epoch": 0.2745851881829219, + "grad_norm": 1.1748751401901245, + "learning_rate": 7.40243650629775e-05, + "loss": 1.0224, + "step": 6785 + }, + { + "epoch": 0.2747875354107649, + "grad_norm": 1.0723620653152466, + "learning_rate": 7.4003716704522e-05, + "loss": 1.0631, + "step": 6790 + }, + { + "epoch": 0.27498988263860785, + "grad_norm": 1.219942569732666, + "learning_rate": 7.398306834606649e-05, + "loss": 1.0023, + "step": 6795 + }, + { + "epoch": 0.2751922298664508, + "grad_norm": 1.3363230228424072, + "learning_rate": 7.396241998761098e-05, + "loss": 0.9559, + "step": 6800 + }, + { + "epoch": 0.2753945770942938, + "grad_norm": 1.2138142585754395, + "learning_rate": 7.394177162915548e-05, + "loss": 1.0219, + "step": 6805 + }, + { + "epoch": 0.2755969243221368, + "grad_norm": 1.2388733625411987, + "learning_rate": 7.392112327069999e-05, + "loss": 0.9824, + "step": 6810 + }, + { + "epoch": 0.27579927154997974, + "grad_norm": 1.1982988119125366, + "learning_rate": 7.390047491224448e-05, + "loss": 0.9888, + "step": 6815 + }, + { + "epoch": 0.27600161877782275, + "grad_norm": 1.2767200469970703, + "learning_rate": 7.387982655378898e-05, + "loss": 0.9389, + "step": 6820 + }, + { + "epoch": 0.2762039660056657, + "grad_norm": 1.1729559898376465, + "learning_rate": 7.385917819533347e-05, + "loss": 1.0463, + "step": 6825 + }, + { + "epoch": 0.2764063132335087, + "grad_norm": 1.0749800205230713, + "learning_rate": 7.383852983687796e-05, + "loss": 0.9682, + "step": 6830 + }, + { + "epoch": 0.2766086604613517, + "grad_norm": 1.3590329885482788, + "learning_rate": 7.381788147842247e-05, + "loss": 1.0713, + "step": 6835 + }, + { + "epoch": 0.27681100768919464, + "grad_norm": 1.1369819641113281, + "learning_rate": 7.379723311996697e-05, + "loss": 1.0568, + "step": 6840 + }, + { + "epoch": 0.27701335491703766, + "grad_norm": 1.242820143699646, + "learning_rate": 7.377658476151146e-05, + "loss": 0.9609, + "step": 6845 + }, + { + "epoch": 0.2772157021448806, + "grad_norm": 1.1668909788131714, + "learning_rate": 7.375593640305595e-05, + "loss": 1.0397, + "step": 6850 + }, + { + "epoch": 0.2774180493727236, + "grad_norm": 1.2021713256835938, + "learning_rate": 7.373528804460046e-05, + "loss": 0.9497, + "step": 6855 + }, + { + "epoch": 0.2776203966005666, + "grad_norm": 1.191312313079834, + "learning_rate": 7.371463968614496e-05, + "loss": 0.9175, + "step": 6860 + }, + { + "epoch": 0.27782274382840955, + "grad_norm": 1.2159568071365356, + "learning_rate": 7.369399132768945e-05, + "loss": 1.038, + "step": 6865 + }, + { + "epoch": 0.2780250910562525, + "grad_norm": 1.1642060279846191, + "learning_rate": 7.367334296923394e-05, + "loss": 0.9647, + "step": 6870 + }, + { + "epoch": 0.2782274382840955, + "grad_norm": 1.1729024648666382, + "learning_rate": 7.365269461077845e-05, + "loss": 0.9741, + "step": 6875 + }, + { + "epoch": 0.2784297855119385, + "grad_norm": 1.2645635604858398, + "learning_rate": 7.363204625232294e-05, + "loss": 1.0328, + "step": 6880 + }, + { + "epoch": 0.2786321327397815, + "grad_norm": 1.1870768070220947, + "learning_rate": 7.361139789386744e-05, + "loss": 1.0114, + "step": 6885 + }, + { + "epoch": 0.27883447996762445, + "grad_norm": 1.1066211462020874, + "learning_rate": 7.359074953541195e-05, + "loss": 0.9975, + "step": 6890 + }, + { + "epoch": 0.2790368271954674, + "grad_norm": 1.148180603981018, + "learning_rate": 7.357010117695644e-05, + "loss": 0.9911, + "step": 6895 + }, + { + "epoch": 0.2792391744233104, + "grad_norm": 1.1885652542114258, + "learning_rate": 7.354945281850093e-05, + "loss": 1.0457, + "step": 6900 + }, + { + "epoch": 0.2794415216511534, + "grad_norm": 1.3599821329116821, + "learning_rate": 7.352880446004543e-05, + "loss": 1.0676, + "step": 6905 + }, + { + "epoch": 0.27964386887899634, + "grad_norm": 1.051513433456421, + "learning_rate": 7.350815610158993e-05, + "loss": 0.9989, + "step": 6910 + }, + { + "epoch": 0.27984621610683935, + "grad_norm": 1.2398626804351807, + "learning_rate": 7.348750774313443e-05, + "loss": 1.0297, + "step": 6915 + }, + { + "epoch": 0.2800485633346823, + "grad_norm": 1.2511979341506958, + "learning_rate": 7.346685938467892e-05, + "loss": 0.9924, + "step": 6920 + }, + { + "epoch": 0.28025091056252527, + "grad_norm": 1.1494407653808594, + "learning_rate": 7.344621102622342e-05, + "loss": 0.9201, + "step": 6925 + }, + { + "epoch": 0.2804532577903683, + "grad_norm": 1.199025273323059, + "learning_rate": 7.342556266776791e-05, + "loss": 0.9594, + "step": 6930 + }, + { + "epoch": 0.28065560501821124, + "grad_norm": 1.1938326358795166, + "learning_rate": 7.340491430931241e-05, + "loss": 0.9798, + "step": 6935 + }, + { + "epoch": 0.28085795224605425, + "grad_norm": 1.061036229133606, + "learning_rate": 7.33842659508569e-05, + "loss": 0.981, + "step": 6940 + }, + { + "epoch": 0.2810602994738972, + "grad_norm": 1.1499427556991577, + "learning_rate": 7.336361759240141e-05, + "loss": 0.9694, + "step": 6945 + }, + { + "epoch": 0.28126264670174017, + "grad_norm": 1.351099967956543, + "learning_rate": 7.33429692339459e-05, + "loss": 1.0097, + "step": 6950 + }, + { + "epoch": 0.2814649939295832, + "grad_norm": 1.0928959846496582, + "learning_rate": 7.33223208754904e-05, + "loss": 1.0144, + "step": 6955 + }, + { + "epoch": 0.28166734115742614, + "grad_norm": 1.2172058820724487, + "learning_rate": 7.330167251703491e-05, + "loss": 1.0061, + "step": 6960 + }, + { + "epoch": 0.2818696883852691, + "grad_norm": 1.1754196882247925, + "learning_rate": 7.32810241585794e-05, + "loss": 1.0083, + "step": 6965 + }, + { + "epoch": 0.2820720356131121, + "grad_norm": 1.2395962476730347, + "learning_rate": 7.326037580012389e-05, + "loss": 1.0243, + "step": 6970 + }, + { + "epoch": 0.2822743828409551, + "grad_norm": 1.160964012145996, + "learning_rate": 7.323972744166839e-05, + "loss": 1.003, + "step": 6975 + }, + { + "epoch": 0.28247673006879803, + "grad_norm": 1.1365638971328735, + "learning_rate": 7.321907908321288e-05, + "loss": 1.0273, + "step": 6980 + }, + { + "epoch": 0.28267907729664105, + "grad_norm": 1.196136713027954, + "learning_rate": 7.319843072475739e-05, + "loss": 1.0482, + "step": 6985 + }, + { + "epoch": 0.282881424524484, + "grad_norm": 1.2548751831054688, + "learning_rate": 7.317778236630188e-05, + "loss": 1.0182, + "step": 6990 + }, + { + "epoch": 0.283083771752327, + "grad_norm": 1.1191009283065796, + "learning_rate": 7.315713400784638e-05, + "loss": 1.0106, + "step": 6995 + }, + { + "epoch": 0.28328611898017, + "grad_norm": 1.2741000652313232, + "learning_rate": 7.313648564939087e-05, + "loss": 1.037, + "step": 7000 + }, + { + "epoch": 0.28348846620801293, + "grad_norm": 1.0600517988204956, + "learning_rate": 7.311583729093538e-05, + "loss": 0.9773, + "step": 7005 + }, + { + "epoch": 0.28369081343585595, + "grad_norm": 1.126713514328003, + "learning_rate": 7.309518893247988e-05, + "loss": 1.066, + "step": 7010 + }, + { + "epoch": 0.2838931606636989, + "grad_norm": 1.0954577922821045, + "learning_rate": 7.307454057402436e-05, + "loss": 1.019, + "step": 7015 + }, + { + "epoch": 0.28409550789154187, + "grad_norm": 1.3487249612808228, + "learning_rate": 7.305389221556886e-05, + "loss": 1.0658, + "step": 7020 + }, + { + "epoch": 0.2842978551193849, + "grad_norm": 1.0767533779144287, + "learning_rate": 7.303324385711337e-05, + "loss": 0.9997, + "step": 7025 + }, + { + "epoch": 0.28450020234722784, + "grad_norm": 1.2380719184875488, + "learning_rate": 7.301259549865787e-05, + "loss": 1.0391, + "step": 7030 + }, + { + "epoch": 0.2847025495750708, + "grad_norm": 1.1037429571151733, + "learning_rate": 7.299194714020236e-05, + "loss": 0.9583, + "step": 7035 + }, + { + "epoch": 0.2849048968029138, + "grad_norm": 1.2484043836593628, + "learning_rate": 7.297129878174685e-05, + "loss": 1.0238, + "step": 7040 + }, + { + "epoch": 0.28510724403075677, + "grad_norm": 1.1408405303955078, + "learning_rate": 7.295065042329136e-05, + "loss": 1.06, + "step": 7045 + }, + { + "epoch": 0.2853095912585998, + "grad_norm": 1.1027109622955322, + "learning_rate": 7.293000206483585e-05, + "loss": 0.977, + "step": 7050 + }, + { + "epoch": 0.28551193848644274, + "grad_norm": 1.1841386556625366, + "learning_rate": 7.290935370638035e-05, + "loss": 1.0146, + "step": 7055 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 1.19268000125885, + "learning_rate": 7.288870534792484e-05, + "loss": 1.0176, + "step": 7060 + }, + { + "epoch": 0.2859166329421287, + "grad_norm": 1.1941889524459839, + "learning_rate": 7.286805698946933e-05, + "loss": 1.0325, + "step": 7065 + }, + { + "epoch": 0.28611898016997167, + "grad_norm": 1.0459282398223877, + "learning_rate": 7.284740863101383e-05, + "loss": 0.9613, + "step": 7070 + }, + { + "epoch": 0.28632132739781463, + "grad_norm": 1.2329634428024292, + "learning_rate": 7.282676027255834e-05, + "loss": 1.0587, + "step": 7075 + }, + { + "epoch": 0.28652367462565764, + "grad_norm": 1.180199384689331, + "learning_rate": 7.280611191410284e-05, + "loss": 0.9983, + "step": 7080 + }, + { + "epoch": 0.2867260218535006, + "grad_norm": 1.201682448387146, + "learning_rate": 7.278546355564732e-05, + "loss": 0.9952, + "step": 7085 + }, + { + "epoch": 0.28692836908134356, + "grad_norm": 1.1668422222137451, + "learning_rate": 7.276481519719182e-05, + "loss": 0.9602, + "step": 7090 + }, + { + "epoch": 0.2871307163091866, + "grad_norm": 1.1423628330230713, + "learning_rate": 7.274416683873633e-05, + "loss": 1.0044, + "step": 7095 + }, + { + "epoch": 0.28733306353702953, + "grad_norm": 1.217802882194519, + "learning_rate": 7.272351848028082e-05, + "loss": 1.0701, + "step": 7100 + }, + { + "epoch": 0.28753541076487255, + "grad_norm": 0.9813905954360962, + "learning_rate": 7.270287012182532e-05, + "loss": 1.03, + "step": 7105 + }, + { + "epoch": 0.2877377579927155, + "grad_norm": 1.2254445552825928, + "learning_rate": 7.268222176336981e-05, + "loss": 1.0226, + "step": 7110 + }, + { + "epoch": 0.28794010522055846, + "grad_norm": 1.2287665605545044, + "learning_rate": 7.266157340491432e-05, + "loss": 1.037, + "step": 7115 + }, + { + "epoch": 0.2881424524484015, + "grad_norm": 1.2598094940185547, + "learning_rate": 7.264092504645881e-05, + "loss": 0.968, + "step": 7120 + }, + { + "epoch": 0.28834479967624443, + "grad_norm": 1.1876108646392822, + "learning_rate": 7.262027668800331e-05, + "loss": 1.0464, + "step": 7125 + }, + { + "epoch": 0.2885471469040874, + "grad_norm": 1.1915405988693237, + "learning_rate": 7.25996283295478e-05, + "loss": 0.98, + "step": 7130 + }, + { + "epoch": 0.2887494941319304, + "grad_norm": 1.2311309576034546, + "learning_rate": 7.257897997109229e-05, + "loss": 1.0208, + "step": 7135 + }, + { + "epoch": 0.28895184135977336, + "grad_norm": 1.3213067054748535, + "learning_rate": 7.25583316126368e-05, + "loss": 0.9752, + "step": 7140 + }, + { + "epoch": 0.2891541885876163, + "grad_norm": 1.1361618041992188, + "learning_rate": 7.25376832541813e-05, + "loss": 0.9961, + "step": 7145 + }, + { + "epoch": 0.28935653581545934, + "grad_norm": 1.1876906156539917, + "learning_rate": 7.251703489572579e-05, + "loss": 0.9563, + "step": 7150 + }, + { + "epoch": 0.2895588830433023, + "grad_norm": 1.2165839672088623, + "learning_rate": 7.249638653727028e-05, + "loss": 1.0085, + "step": 7155 + }, + { + "epoch": 0.2897612302711453, + "grad_norm": 1.0741130113601685, + "learning_rate": 7.247573817881479e-05, + "loss": 1.0152, + "step": 7160 + }, + { + "epoch": 0.28996357749898827, + "grad_norm": 1.1732629537582397, + "learning_rate": 7.245508982035929e-05, + "loss": 1.0184, + "step": 7165 + }, + { + "epoch": 0.2901659247268312, + "grad_norm": 1.1553676128387451, + "learning_rate": 7.243444146190378e-05, + "loss": 0.9966, + "step": 7170 + }, + { + "epoch": 0.29036827195467424, + "grad_norm": 1.230701208114624, + "learning_rate": 7.241379310344828e-05, + "loss": 1.0381, + "step": 7175 + }, + { + "epoch": 0.2905706191825172, + "grad_norm": 1.1343389749526978, + "learning_rate": 7.239314474499278e-05, + "loss": 1.0462, + "step": 7180 + }, + { + "epoch": 0.29077296641036016, + "grad_norm": 1.1028715372085571, + "learning_rate": 7.237249638653727e-05, + "loss": 1.0403, + "step": 7185 + }, + { + "epoch": 0.29097531363820317, + "grad_norm": 1.2318840026855469, + "learning_rate": 7.235184802808177e-05, + "loss": 0.9667, + "step": 7190 + }, + { + "epoch": 0.29117766086604613, + "grad_norm": 1.1649657487869263, + "learning_rate": 7.233119966962627e-05, + "loss": 1.0312, + "step": 7195 + }, + { + "epoch": 0.2913800080938891, + "grad_norm": 1.1640312671661377, + "learning_rate": 7.231055131117076e-05, + "loss": 1.0257, + "step": 7200 + }, + { + "epoch": 0.2915823553217321, + "grad_norm": 1.3711130619049072, + "learning_rate": 7.228990295271526e-05, + "loss": 1.0607, + "step": 7205 + }, + { + "epoch": 0.29178470254957506, + "grad_norm": 1.1209375858306885, + "learning_rate": 7.226925459425976e-05, + "loss": 0.9962, + "step": 7210 + }, + { + "epoch": 0.2919870497774181, + "grad_norm": 1.1353542804718018, + "learning_rate": 7.224860623580426e-05, + "loss": 0.9753, + "step": 7215 + }, + { + "epoch": 0.29218939700526103, + "grad_norm": 1.2244285345077515, + "learning_rate": 7.222795787734875e-05, + "loss": 1.0051, + "step": 7220 + }, + { + "epoch": 0.292391744233104, + "grad_norm": 1.119053602218628, + "learning_rate": 7.220730951889326e-05, + "loss": 1.0178, + "step": 7225 + }, + { + "epoch": 0.292594091460947, + "grad_norm": 1.1514949798583984, + "learning_rate": 7.218666116043775e-05, + "loss": 1.0123, + "step": 7230 + }, + { + "epoch": 0.29279643868878996, + "grad_norm": 1.1903537511825562, + "learning_rate": 7.216601280198224e-05, + "loss": 0.9997, + "step": 7235 + }, + { + "epoch": 0.2929987859166329, + "grad_norm": 1.1025440692901611, + "learning_rate": 7.214536444352674e-05, + "loss": 1.0114, + "step": 7240 + }, + { + "epoch": 0.29320113314447593, + "grad_norm": 1.1389793157577515, + "learning_rate": 7.212471608507125e-05, + "loss": 0.9734, + "step": 7245 + }, + { + "epoch": 0.2934034803723189, + "grad_norm": 1.1365653276443481, + "learning_rate": 7.210406772661574e-05, + "loss": 0.9598, + "step": 7250 + }, + { + "epoch": 0.29360582760016185, + "grad_norm": 1.265367031097412, + "learning_rate": 7.208341936816023e-05, + "loss": 1.0125, + "step": 7255 + }, + { + "epoch": 0.29380817482800486, + "grad_norm": 1.2603001594543457, + "learning_rate": 7.206277100970473e-05, + "loss": 1.028, + "step": 7260 + }, + { + "epoch": 0.2940105220558478, + "grad_norm": 1.2002099752426147, + "learning_rate": 7.204212265124924e-05, + "loss": 0.9804, + "step": 7265 + }, + { + "epoch": 0.29421286928369084, + "grad_norm": 1.0949093103408813, + "learning_rate": 7.202147429279373e-05, + "loss": 0.9647, + "step": 7270 + }, + { + "epoch": 0.2944152165115338, + "grad_norm": 1.195500373840332, + "learning_rate": 7.200082593433822e-05, + "loss": 1.012, + "step": 7275 + }, + { + "epoch": 0.29461756373937675, + "grad_norm": 1.0756292343139648, + "learning_rate": 7.198017757588272e-05, + "loss": 1.0446, + "step": 7280 + }, + { + "epoch": 0.29481991096721977, + "grad_norm": 1.1095898151397705, + "learning_rate": 7.195952921742721e-05, + "loss": 1.055, + "step": 7285 + }, + { + "epoch": 0.2950222581950627, + "grad_norm": 1.0555248260498047, + "learning_rate": 7.193888085897172e-05, + "loss": 0.9849, + "step": 7290 + }, + { + "epoch": 0.2952246054229057, + "grad_norm": 1.2022385597229004, + "learning_rate": 7.191823250051622e-05, + "loss": 1.0044, + "step": 7295 + }, + { + "epoch": 0.2954269526507487, + "grad_norm": 1.196317195892334, + "learning_rate": 7.189758414206071e-05, + "loss": 1.0223, + "step": 7300 + }, + { + "epoch": 0.29562929987859166, + "grad_norm": 1.2295843362808228, + "learning_rate": 7.18769357836052e-05, + "loss": 1.0401, + "step": 7305 + }, + { + "epoch": 0.2958316471064346, + "grad_norm": 1.1430981159210205, + "learning_rate": 7.18562874251497e-05, + "loss": 1.0177, + "step": 7310 + }, + { + "epoch": 0.29603399433427763, + "grad_norm": 1.2077337503433228, + "learning_rate": 7.183563906669421e-05, + "loss": 1.0704, + "step": 7315 + }, + { + "epoch": 0.2962363415621206, + "grad_norm": 1.1060575246810913, + "learning_rate": 7.18149907082387e-05, + "loss": 1.0082, + "step": 7320 + }, + { + "epoch": 0.2964386887899636, + "grad_norm": 1.1972790956497192, + "learning_rate": 7.179434234978319e-05, + "loss": 0.9948, + "step": 7325 + }, + { + "epoch": 0.29664103601780656, + "grad_norm": 1.2054885625839233, + "learning_rate": 7.17736939913277e-05, + "loss": 0.9546, + "step": 7330 + }, + { + "epoch": 0.2968433832456495, + "grad_norm": 1.1200127601623535, + "learning_rate": 7.17530456328722e-05, + "loss": 1.041, + "step": 7335 + }, + { + "epoch": 0.29704573047349253, + "grad_norm": 1.0624228715896606, + "learning_rate": 7.173239727441669e-05, + "loss": 1.039, + "step": 7340 + }, + { + "epoch": 0.2972480777013355, + "grad_norm": 1.0303196907043457, + "learning_rate": 7.171174891596118e-05, + "loss": 1.1089, + "step": 7345 + }, + { + "epoch": 0.29745042492917845, + "grad_norm": 1.1626769304275513, + "learning_rate": 7.169110055750568e-05, + "loss": 1.0513, + "step": 7350 + }, + { + "epoch": 0.29765277215702146, + "grad_norm": 1.2582601308822632, + "learning_rate": 7.167045219905017e-05, + "loss": 0.9749, + "step": 7355 + }, + { + "epoch": 0.2978551193848644, + "grad_norm": 1.1853293180465698, + "learning_rate": 7.164980384059468e-05, + "loss": 0.9623, + "step": 7360 + }, + { + "epoch": 0.29805746661270743, + "grad_norm": 1.2648826837539673, + "learning_rate": 7.162915548213918e-05, + "loss": 1.088, + "step": 7365 + }, + { + "epoch": 0.2982598138405504, + "grad_norm": 1.1166695356369019, + "learning_rate": 7.160850712368366e-05, + "loss": 1.0096, + "step": 7370 + }, + { + "epoch": 0.29846216106839335, + "grad_norm": 1.2206895351409912, + "learning_rate": 7.158785876522816e-05, + "loss": 1.0238, + "step": 7375 + }, + { + "epoch": 0.29866450829623636, + "grad_norm": 1.195966362953186, + "learning_rate": 7.156721040677267e-05, + "loss": 1.0151, + "step": 7380 + }, + { + "epoch": 0.2988668555240793, + "grad_norm": 1.0893754959106445, + "learning_rate": 7.154656204831717e-05, + "loss": 0.9902, + "step": 7385 + }, + { + "epoch": 0.2990692027519223, + "grad_norm": 1.1973437070846558, + "learning_rate": 7.152591368986166e-05, + "loss": 0.9808, + "step": 7390 + }, + { + "epoch": 0.2992715499797653, + "grad_norm": 1.2002286911010742, + "learning_rate": 7.150526533140615e-05, + "loss": 0.9959, + "step": 7395 + }, + { + "epoch": 0.29947389720760825, + "grad_norm": 1.122012972831726, + "learning_rate": 7.148461697295066e-05, + "loss": 0.9995, + "step": 7400 + }, + { + "epoch": 0.2996762444354512, + "grad_norm": 1.1145695447921753, + "learning_rate": 7.146396861449515e-05, + "loss": 1.0395, + "step": 7405 + }, + { + "epoch": 0.2998785916632942, + "grad_norm": 1.151845097541809, + "learning_rate": 7.144332025603965e-05, + "loss": 1.0706, + "step": 7410 + }, + { + "epoch": 0.3000809388911372, + "grad_norm": 1.1342051029205322, + "learning_rate": 7.142267189758414e-05, + "loss": 1.0394, + "step": 7415 + }, + { + "epoch": 0.3002832861189802, + "grad_norm": 1.1038486957550049, + "learning_rate": 7.140202353912863e-05, + "loss": 0.9879, + "step": 7420 + }, + { + "epoch": 0.30048563334682316, + "grad_norm": 1.143272876739502, + "learning_rate": 7.138137518067314e-05, + "loss": 1.0117, + "step": 7425 + }, + { + "epoch": 0.3006879805746661, + "grad_norm": 1.2253665924072266, + "learning_rate": 7.136072682221764e-05, + "loss": 1.0272, + "step": 7430 + }, + { + "epoch": 0.3008903278025091, + "grad_norm": 1.1236751079559326, + "learning_rate": 7.134007846376214e-05, + "loss": 1.0089, + "step": 7435 + }, + { + "epoch": 0.3010926750303521, + "grad_norm": 1.4188357591629028, + "learning_rate": 7.131943010530663e-05, + "loss": 1.0131, + "step": 7440 + }, + { + "epoch": 0.30129502225819504, + "grad_norm": 1.224983811378479, + "learning_rate": 7.129878174685113e-05, + "loss": 1.0096, + "step": 7445 + }, + { + "epoch": 0.30149736948603806, + "grad_norm": 1.0498284101486206, + "learning_rate": 7.127813338839563e-05, + "loss": 1.0623, + "step": 7450 + }, + { + "epoch": 0.301699716713881, + "grad_norm": 1.3433605432510376, + "learning_rate": 7.125748502994012e-05, + "loss": 1.0363, + "step": 7455 + }, + { + "epoch": 0.301902063941724, + "grad_norm": 1.3511546850204468, + "learning_rate": 7.123683667148462e-05, + "loss": 1.023, + "step": 7460 + }, + { + "epoch": 0.302104411169567, + "grad_norm": 1.1709057092666626, + "learning_rate": 7.121618831302911e-05, + "loss": 1.0136, + "step": 7465 + }, + { + "epoch": 0.30230675839740995, + "grad_norm": 1.1816767454147339, + "learning_rate": 7.119553995457362e-05, + "loss": 0.9868, + "step": 7470 + }, + { + "epoch": 0.30250910562525296, + "grad_norm": 1.1204698085784912, + "learning_rate": 7.117489159611811e-05, + "loss": 1.0717, + "step": 7475 + }, + { + "epoch": 0.3027114528530959, + "grad_norm": 1.2430353164672852, + "learning_rate": 7.115424323766261e-05, + "loss": 1.0317, + "step": 7480 + }, + { + "epoch": 0.3029138000809389, + "grad_norm": 1.0575361251831055, + "learning_rate": 7.11335948792071e-05, + "loss": 0.9902, + "step": 7485 + }, + { + "epoch": 0.3031161473087819, + "grad_norm": 1.0389426946640015, + "learning_rate": 7.11129465207516e-05, + "loss": 1.0214, + "step": 7490 + }, + { + "epoch": 0.30331849453662485, + "grad_norm": 1.0878791809082031, + "learning_rate": 7.10922981622961e-05, + "loss": 0.9587, + "step": 7495 + }, + { + "epoch": 0.3035208417644678, + "grad_norm": 1.1660854816436768, + "learning_rate": 7.10716498038406e-05, + "loss": 0.9606, + "step": 7500 + }, + { + "epoch": 0.3037231889923108, + "grad_norm": 1.1651402711868286, + "learning_rate": 7.105100144538509e-05, + "loss": 1.0076, + "step": 7505 + }, + { + "epoch": 0.3039255362201538, + "grad_norm": 1.248314380645752, + "learning_rate": 7.10303530869296e-05, + "loss": 1.0847, + "step": 7510 + }, + { + "epoch": 0.30412788344799674, + "grad_norm": 1.1461608409881592, + "learning_rate": 7.100970472847409e-05, + "loss": 1.0003, + "step": 7515 + }, + { + "epoch": 0.30433023067583975, + "grad_norm": 1.2227619886398315, + "learning_rate": 7.098905637001859e-05, + "loss": 1.0442, + "step": 7520 + }, + { + "epoch": 0.3045325779036827, + "grad_norm": 1.2554718255996704, + "learning_rate": 7.096840801156308e-05, + "loss": 1.0341, + "step": 7525 + }, + { + "epoch": 0.3047349251315257, + "grad_norm": 1.1214122772216797, + "learning_rate": 7.094775965310759e-05, + "loss": 0.9841, + "step": 7530 + }, + { + "epoch": 0.3049372723593687, + "grad_norm": 1.2532953023910522, + "learning_rate": 7.092711129465208e-05, + "loss": 1.0041, + "step": 7535 + }, + { + "epoch": 0.30513961958721164, + "grad_norm": 1.1885756254196167, + "learning_rate": 7.090646293619657e-05, + "loss": 1.0439, + "step": 7540 + }, + { + "epoch": 0.30534196681505466, + "grad_norm": 1.074864149093628, + "learning_rate": 7.088581457774107e-05, + "loss": 1.0343, + "step": 7545 + }, + { + "epoch": 0.3055443140428976, + "grad_norm": 1.1654303073883057, + "learning_rate": 7.086516621928558e-05, + "loss": 0.9469, + "step": 7550 + }, + { + "epoch": 0.30574666127074057, + "grad_norm": 1.1716253757476807, + "learning_rate": 7.084451786083007e-05, + "loss": 1.0065, + "step": 7555 + }, + { + "epoch": 0.3059490084985836, + "grad_norm": 1.120162844657898, + "learning_rate": 7.082386950237456e-05, + "loss": 1.0365, + "step": 7560 + }, + { + "epoch": 0.30615135572642654, + "grad_norm": 1.263826847076416, + "learning_rate": 7.080322114391906e-05, + "loss": 1.0265, + "step": 7565 + }, + { + "epoch": 0.3063537029542695, + "grad_norm": 1.2363839149475098, + "learning_rate": 7.078257278546356e-05, + "loss": 0.9972, + "step": 7570 + }, + { + "epoch": 0.3065560501821125, + "grad_norm": 1.1157922744750977, + "learning_rate": 7.076192442700806e-05, + "loss": 0.9479, + "step": 7575 + }, + { + "epoch": 0.3067583974099555, + "grad_norm": 1.2315844297409058, + "learning_rate": 7.074127606855256e-05, + "loss": 0.995, + "step": 7580 + }, + { + "epoch": 0.3069607446377985, + "grad_norm": 1.3027738332748413, + "learning_rate": 7.072062771009705e-05, + "loss": 0.9842, + "step": 7585 + }, + { + "epoch": 0.30716309186564145, + "grad_norm": 1.1640541553497314, + "learning_rate": 7.069997935164154e-05, + "loss": 1.0227, + "step": 7590 + }, + { + "epoch": 0.3073654390934844, + "grad_norm": 1.2683963775634766, + "learning_rate": 7.067933099318604e-05, + "loss": 1.0546, + "step": 7595 + }, + { + "epoch": 0.3075677863213274, + "grad_norm": 1.0934149026870728, + "learning_rate": 7.065868263473055e-05, + "loss": 0.959, + "step": 7600 + }, + { + "epoch": 0.3077701335491704, + "grad_norm": 1.579567551612854, + "learning_rate": 7.063803427627504e-05, + "loss": 0.981, + "step": 7605 + }, + { + "epoch": 0.30797248077701334, + "grad_norm": 1.078355073928833, + "learning_rate": 7.061738591781953e-05, + "loss": 1.0538, + "step": 7610 + }, + { + "epoch": 0.30817482800485635, + "grad_norm": 1.025793433189392, + "learning_rate": 7.059673755936403e-05, + "loss": 1.0032, + "step": 7615 + }, + { + "epoch": 0.3083771752326993, + "grad_norm": 1.0965713262557983, + "learning_rate": 7.057608920090854e-05, + "loss": 1.0416, + "step": 7620 + }, + { + "epoch": 0.30857952246054227, + "grad_norm": 1.094805121421814, + "learning_rate": 7.055544084245303e-05, + "loss": 1.0167, + "step": 7625 + }, + { + "epoch": 0.3087818696883853, + "grad_norm": 1.0859425067901611, + "learning_rate": 7.053479248399752e-05, + "loss": 1.031, + "step": 7630 + }, + { + "epoch": 0.30898421691622824, + "grad_norm": 1.1122045516967773, + "learning_rate": 7.051414412554202e-05, + "loss": 0.9729, + "step": 7635 + }, + { + "epoch": 0.30918656414407125, + "grad_norm": 1.1862298250198364, + "learning_rate": 7.049349576708651e-05, + "loss": 0.9737, + "step": 7640 + }, + { + "epoch": 0.3093889113719142, + "grad_norm": 1.2225682735443115, + "learning_rate": 7.047284740863102e-05, + "loss": 0.983, + "step": 7645 + }, + { + "epoch": 0.30959125859975717, + "grad_norm": 1.1471396684646606, + "learning_rate": 7.045219905017552e-05, + "loss": 1.0125, + "step": 7650 + }, + { + "epoch": 0.3097936058276002, + "grad_norm": 1.17530357837677, + "learning_rate": 7.043155069172001e-05, + "loss": 0.9278, + "step": 7655 + }, + { + "epoch": 0.30999595305544314, + "grad_norm": 1.224523901939392, + "learning_rate": 7.04109023332645e-05, + "loss": 1.0306, + "step": 7660 + }, + { + "epoch": 0.3101983002832861, + "grad_norm": 1.2725141048431396, + "learning_rate": 7.0390253974809e-05, + "loss": 0.9934, + "step": 7665 + }, + { + "epoch": 0.3104006475111291, + "grad_norm": 1.2264436483383179, + "learning_rate": 7.036960561635351e-05, + "loss": 1.0184, + "step": 7670 + }, + { + "epoch": 0.31060299473897207, + "grad_norm": 1.1794722080230713, + "learning_rate": 7.0348957257898e-05, + "loss": 0.9811, + "step": 7675 + }, + { + "epoch": 0.31080534196681503, + "grad_norm": 1.1239395141601562, + "learning_rate": 7.032830889944249e-05, + "loss": 0.9433, + "step": 7680 + }, + { + "epoch": 0.31100768919465804, + "grad_norm": 1.1605916023254395, + "learning_rate": 7.0307660540987e-05, + "loss": 0.9934, + "step": 7685 + }, + { + "epoch": 0.311210036422501, + "grad_norm": 1.1802419424057007, + "learning_rate": 7.02870121825315e-05, + "loss": 1.03, + "step": 7690 + }, + { + "epoch": 0.311412383650344, + "grad_norm": 1.2433397769927979, + "learning_rate": 7.026636382407599e-05, + "loss": 0.9642, + "step": 7695 + }, + { + "epoch": 0.311614730878187, + "grad_norm": 1.1377280950546265, + "learning_rate": 7.024571546562048e-05, + "loss": 1.0044, + "step": 7700 + }, + { + "epoch": 0.31181707810602993, + "grad_norm": 1.1464645862579346, + "learning_rate": 7.022506710716498e-05, + "loss": 0.9674, + "step": 7705 + }, + { + "epoch": 0.31201942533387295, + "grad_norm": 1.0886270999908447, + "learning_rate": 7.020441874870948e-05, + "loss": 0.9622, + "step": 7710 + }, + { + "epoch": 0.3122217725617159, + "grad_norm": 1.2346837520599365, + "learning_rate": 7.018377039025398e-05, + "loss": 0.9845, + "step": 7715 + }, + { + "epoch": 0.31242411978955886, + "grad_norm": 1.0920614004135132, + "learning_rate": 7.016312203179848e-05, + "loss": 1.0288, + "step": 7720 + }, + { + "epoch": 0.3126264670174019, + "grad_norm": 1.0794581174850464, + "learning_rate": 7.014247367334297e-05, + "loss": 1.0598, + "step": 7725 + }, + { + "epoch": 0.31282881424524484, + "grad_norm": 1.2226083278656006, + "learning_rate": 7.012182531488746e-05, + "loss": 0.9955, + "step": 7730 + }, + { + "epoch": 0.3130311614730878, + "grad_norm": 1.1473082304000854, + "learning_rate": 7.010117695643197e-05, + "loss": 1.0104, + "step": 7735 + }, + { + "epoch": 0.3132335087009308, + "grad_norm": 1.152406930923462, + "learning_rate": 7.008052859797647e-05, + "loss": 0.99, + "step": 7740 + }, + { + "epoch": 0.31343585592877377, + "grad_norm": 1.3039500713348389, + "learning_rate": 7.005988023952096e-05, + "loss": 1.0358, + "step": 7745 + }, + { + "epoch": 0.3136382031566168, + "grad_norm": 1.223159670829773, + "learning_rate": 7.003923188106545e-05, + "loss": 1.0109, + "step": 7750 + }, + { + "epoch": 0.31384055038445974, + "grad_norm": 1.1193894147872925, + "learning_rate": 7.001858352260996e-05, + "loss": 1.0297, + "step": 7755 + }, + { + "epoch": 0.3140428976123027, + "grad_norm": 1.1684590578079224, + "learning_rate": 6.999793516415445e-05, + "loss": 1.0384, + "step": 7760 + }, + { + "epoch": 0.3142452448401457, + "grad_norm": 1.1572351455688477, + "learning_rate": 6.997728680569895e-05, + "loss": 0.9851, + "step": 7765 + }, + { + "epoch": 0.31444759206798867, + "grad_norm": 1.0718976259231567, + "learning_rate": 6.995663844724346e-05, + "loss": 0.9977, + "step": 7770 + }, + { + "epoch": 0.3146499392958316, + "grad_norm": 1.1924231052398682, + "learning_rate": 6.993599008878793e-05, + "loss": 1.0081, + "step": 7775 + }, + { + "epoch": 0.31485228652367464, + "grad_norm": 1.25642991065979, + "learning_rate": 6.991534173033244e-05, + "loss": 0.9735, + "step": 7780 + }, + { + "epoch": 0.3150546337515176, + "grad_norm": 1.0597145557403564, + "learning_rate": 6.989469337187694e-05, + "loss": 0.9961, + "step": 7785 + }, + { + "epoch": 0.31525698097936056, + "grad_norm": 1.1548058986663818, + "learning_rate": 6.987404501342145e-05, + "loss": 1.0639, + "step": 7790 + }, + { + "epoch": 0.31545932820720357, + "grad_norm": 1.20090913772583, + "learning_rate": 6.985339665496594e-05, + "loss": 1.0209, + "step": 7795 + }, + { + "epoch": 0.31566167543504653, + "grad_norm": 1.1690526008605957, + "learning_rate": 6.983274829651043e-05, + "loss": 1.0425, + "step": 7800 + }, + { + "epoch": 0.31586402266288954, + "grad_norm": 1.0979046821594238, + "learning_rate": 6.981209993805493e-05, + "loss": 1.0157, + "step": 7805 + }, + { + "epoch": 0.3160663698907325, + "grad_norm": 1.1317203044891357, + "learning_rate": 6.979145157959942e-05, + "loss": 1.0308, + "step": 7810 + }, + { + "epoch": 0.31626871711857546, + "grad_norm": 1.0178236961364746, + "learning_rate": 6.977080322114393e-05, + "loss": 0.9917, + "step": 7815 + }, + { + "epoch": 0.3164710643464185, + "grad_norm": 1.1497842073440552, + "learning_rate": 6.975015486268842e-05, + "loss": 0.9876, + "step": 7820 + }, + { + "epoch": 0.31667341157426143, + "grad_norm": 1.106931209564209, + "learning_rate": 6.972950650423292e-05, + "loss": 1.0162, + "step": 7825 + }, + { + "epoch": 0.3168757588021044, + "grad_norm": 1.0901240110397339, + "learning_rate": 6.970885814577741e-05, + "loss": 1.0236, + "step": 7830 + }, + { + "epoch": 0.3170781060299474, + "grad_norm": 1.0298885107040405, + "learning_rate": 6.968820978732191e-05, + "loss": 1.0623, + "step": 7835 + }, + { + "epoch": 0.31728045325779036, + "grad_norm": 1.3310502767562866, + "learning_rate": 6.966756142886642e-05, + "loss": 1.0257, + "step": 7840 + }, + { + "epoch": 0.3174828004856333, + "grad_norm": 1.2882601022720337, + "learning_rate": 6.96469130704109e-05, + "loss": 1.0523, + "step": 7845 + }, + { + "epoch": 0.31768514771347633, + "grad_norm": 1.3289625644683838, + "learning_rate": 6.96262647119554e-05, + "loss": 1.0001, + "step": 7850 + }, + { + "epoch": 0.3178874949413193, + "grad_norm": 1.1948840618133545, + "learning_rate": 6.96056163534999e-05, + "loss": 1.0072, + "step": 7855 + }, + { + "epoch": 0.3180898421691623, + "grad_norm": 1.1636812686920166, + "learning_rate": 6.95849679950444e-05, + "loss": 0.9555, + "step": 7860 + }, + { + "epoch": 0.31829218939700527, + "grad_norm": 1.1317676305770874, + "learning_rate": 6.95643196365889e-05, + "loss": 0.9662, + "step": 7865 + }, + { + "epoch": 0.3184945366248482, + "grad_norm": 1.180823564529419, + "learning_rate": 6.954367127813339e-05, + "loss": 0.984, + "step": 7870 + }, + { + "epoch": 0.31869688385269124, + "grad_norm": 1.0699626207351685, + "learning_rate": 6.952302291967789e-05, + "loss": 0.9885, + "step": 7875 + }, + { + "epoch": 0.3188992310805342, + "grad_norm": 1.0747015476226807, + "learning_rate": 6.950237456122238e-05, + "loss": 1.0191, + "step": 7880 + }, + { + "epoch": 0.31910157830837715, + "grad_norm": 1.1225368976593018, + "learning_rate": 6.948172620276689e-05, + "loss": 1.0072, + "step": 7885 + }, + { + "epoch": 0.31930392553622017, + "grad_norm": 1.081352710723877, + "learning_rate": 6.946107784431138e-05, + "loss": 1.0131, + "step": 7890 + }, + { + "epoch": 0.3195062727640631, + "grad_norm": 1.1860682964324951, + "learning_rate": 6.944042948585587e-05, + "loss": 1.0169, + "step": 7895 + }, + { + "epoch": 0.3197086199919061, + "grad_norm": 1.1549993753433228, + "learning_rate": 6.941978112740037e-05, + "loss": 1.0003, + "step": 7900 + }, + { + "epoch": 0.3199109672197491, + "grad_norm": 1.1799170970916748, + "learning_rate": 6.939913276894488e-05, + "loss": 0.9382, + "step": 7905 + }, + { + "epoch": 0.32011331444759206, + "grad_norm": 1.1656957864761353, + "learning_rate": 6.937848441048937e-05, + "loss": 1.0021, + "step": 7910 + }, + { + "epoch": 0.32031566167543507, + "grad_norm": 1.3107892274856567, + "learning_rate": 6.935783605203386e-05, + "loss": 0.9928, + "step": 7915 + }, + { + "epoch": 0.32051800890327803, + "grad_norm": 1.1690044403076172, + "learning_rate": 6.933718769357836e-05, + "loss": 0.9826, + "step": 7920 + }, + { + "epoch": 0.320720356131121, + "grad_norm": 1.1157152652740479, + "learning_rate": 6.931653933512287e-05, + "loss": 1.0577, + "step": 7925 + }, + { + "epoch": 0.320922703358964, + "grad_norm": 1.170356273651123, + "learning_rate": 6.929589097666736e-05, + "loss": 0.9903, + "step": 7930 + }, + { + "epoch": 0.32112505058680696, + "grad_norm": 1.1648330688476562, + "learning_rate": 6.927524261821186e-05, + "loss": 1.0133, + "step": 7935 + }, + { + "epoch": 0.3213273978146499, + "grad_norm": 1.267077922821045, + "learning_rate": 6.925459425975635e-05, + "loss": 1.0096, + "step": 7940 + }, + { + "epoch": 0.32152974504249293, + "grad_norm": 1.2341781854629517, + "learning_rate": 6.923394590130084e-05, + "loss": 0.9657, + "step": 7945 + }, + { + "epoch": 0.3217320922703359, + "grad_norm": 1.1833523511886597, + "learning_rate": 6.921329754284535e-05, + "loss": 1.011, + "step": 7950 + }, + { + "epoch": 0.32193443949817885, + "grad_norm": 1.206613540649414, + "learning_rate": 6.919264918438985e-05, + "loss": 1.0141, + "step": 7955 + }, + { + "epoch": 0.32213678672602186, + "grad_norm": 1.2183783054351807, + "learning_rate": 6.917200082593434e-05, + "loss": 0.9733, + "step": 7960 + }, + { + "epoch": 0.3223391339538648, + "grad_norm": 1.0917104482650757, + "learning_rate": 6.915135246747883e-05, + "loss": 1.0339, + "step": 7965 + }, + { + "epoch": 0.32254148118170783, + "grad_norm": 1.1361770629882812, + "learning_rate": 6.913070410902333e-05, + "loss": 0.9885, + "step": 7970 + }, + { + "epoch": 0.3227438284095508, + "grad_norm": 1.0975098609924316, + "learning_rate": 6.911005575056784e-05, + "loss": 1.0272, + "step": 7975 + }, + { + "epoch": 0.32294617563739375, + "grad_norm": 1.1798059940338135, + "learning_rate": 6.908940739211233e-05, + "loss": 1.0756, + "step": 7980 + }, + { + "epoch": 0.32314852286523676, + "grad_norm": 1.1872261762619019, + "learning_rate": 6.906875903365683e-05, + "loss": 0.962, + "step": 7985 + }, + { + "epoch": 0.3233508700930797, + "grad_norm": 1.101155400276184, + "learning_rate": 6.904811067520132e-05, + "loss": 1.0203, + "step": 7990 + }, + { + "epoch": 0.3235532173209227, + "grad_norm": 1.2731853723526, + "learning_rate": 6.902746231674581e-05, + "loss": 0.9985, + "step": 7995 + }, + { + "epoch": 0.3237555645487657, + "grad_norm": 1.1103601455688477, + "learning_rate": 6.900681395829032e-05, + "loss": 1.0107, + "step": 8000 + }, + { + "epoch": 0.32395791177660865, + "grad_norm": 1.0946158170700073, + "learning_rate": 6.898616559983482e-05, + "loss": 1.0405, + "step": 8005 + }, + { + "epoch": 0.3241602590044516, + "grad_norm": 1.145131230354309, + "learning_rate": 6.896551724137931e-05, + "loss": 1.0078, + "step": 8010 + }, + { + "epoch": 0.3243626062322946, + "grad_norm": 1.2215243577957153, + "learning_rate": 6.89448688829238e-05, + "loss": 1.0298, + "step": 8015 + }, + { + "epoch": 0.3245649534601376, + "grad_norm": 1.146912693977356, + "learning_rate": 6.892422052446831e-05, + "loss": 1.0104, + "step": 8020 + }, + { + "epoch": 0.3247673006879806, + "grad_norm": 1.0603580474853516, + "learning_rate": 6.890357216601281e-05, + "loss": 1.0011, + "step": 8025 + }, + { + "epoch": 0.32496964791582356, + "grad_norm": 1.2084134817123413, + "learning_rate": 6.88829238075573e-05, + "loss": 1.031, + "step": 8030 + }, + { + "epoch": 0.3251719951436665, + "grad_norm": 1.2100253105163574, + "learning_rate": 6.886227544910179e-05, + "loss": 1.017, + "step": 8035 + }, + { + "epoch": 0.32537434237150953, + "grad_norm": 1.1398605108261108, + "learning_rate": 6.88416270906463e-05, + "loss": 1.0072, + "step": 8040 + }, + { + "epoch": 0.3255766895993525, + "grad_norm": 1.1360002756118774, + "learning_rate": 6.88209787321908e-05, + "loss": 1.0873, + "step": 8045 + }, + { + "epoch": 0.32577903682719545, + "grad_norm": 1.2177627086639404, + "learning_rate": 6.880033037373529e-05, + "loss": 1.0156, + "step": 8050 + }, + { + "epoch": 0.32598138405503846, + "grad_norm": 1.3088794946670532, + "learning_rate": 6.87796820152798e-05, + "loss": 0.9743, + "step": 8055 + }, + { + "epoch": 0.3261837312828814, + "grad_norm": 1.2426222562789917, + "learning_rate": 6.875903365682429e-05, + "loss": 0.9841, + "step": 8060 + }, + { + "epoch": 0.3263860785107244, + "grad_norm": 1.1549899578094482, + "learning_rate": 6.873838529836878e-05, + "loss": 1.0433, + "step": 8065 + }, + { + "epoch": 0.3265884257385674, + "grad_norm": 1.0796194076538086, + "learning_rate": 6.871773693991328e-05, + "loss": 1.0458, + "step": 8070 + }, + { + "epoch": 0.32679077296641035, + "grad_norm": 1.2137360572814941, + "learning_rate": 6.869708858145778e-05, + "loss": 0.9939, + "step": 8075 + }, + { + "epoch": 0.32699312019425336, + "grad_norm": 1.224730134010315, + "learning_rate": 6.867644022300228e-05, + "loss": 1.0541, + "step": 8080 + }, + { + "epoch": 0.3271954674220963, + "grad_norm": 1.3345255851745605, + "learning_rate": 6.865579186454677e-05, + "loss": 1.0071, + "step": 8085 + }, + { + "epoch": 0.3273978146499393, + "grad_norm": 1.2496358156204224, + "learning_rate": 6.863514350609127e-05, + "loss": 0.9642, + "step": 8090 + }, + { + "epoch": 0.3276001618777823, + "grad_norm": 1.1511274576187134, + "learning_rate": 6.861449514763577e-05, + "loss": 1.0755, + "step": 8095 + }, + { + "epoch": 0.32780250910562525, + "grad_norm": 1.2001534700393677, + "learning_rate": 6.859384678918026e-05, + "loss": 1.0396, + "step": 8100 + }, + { + "epoch": 0.3280048563334682, + "grad_norm": 1.0557612180709839, + "learning_rate": 6.857319843072476e-05, + "loss": 0.9412, + "step": 8105 + }, + { + "epoch": 0.3282072035613112, + "grad_norm": 1.2284162044525146, + "learning_rate": 6.855255007226926e-05, + "loss": 1.0042, + "step": 8110 + }, + { + "epoch": 0.3284095507891542, + "grad_norm": 1.176843523979187, + "learning_rate": 6.853190171381375e-05, + "loss": 1.0567, + "step": 8115 + }, + { + "epoch": 0.3286118980169972, + "grad_norm": 1.2524343729019165, + "learning_rate": 6.851125335535825e-05, + "loss": 1.0043, + "step": 8120 + }, + { + "epoch": 0.32881424524484015, + "grad_norm": 1.2177811861038208, + "learning_rate": 6.849060499690276e-05, + "loss": 0.9998, + "step": 8125 + }, + { + "epoch": 0.3290165924726831, + "grad_norm": 1.309518814086914, + "learning_rate": 6.846995663844723e-05, + "loss": 1.0058, + "step": 8130 + }, + { + "epoch": 0.3292189397005261, + "grad_norm": 1.1757453680038452, + "learning_rate": 6.844930827999174e-05, + "loss": 1.039, + "step": 8135 + }, + { + "epoch": 0.3294212869283691, + "grad_norm": 1.3004614114761353, + "learning_rate": 6.842865992153624e-05, + "loss": 0.9395, + "step": 8140 + }, + { + "epoch": 0.32962363415621204, + "grad_norm": 1.1624009609222412, + "learning_rate": 6.840801156308075e-05, + "loss": 1.0253, + "step": 8145 + }, + { + "epoch": 0.32982598138405506, + "grad_norm": 1.1674039363861084, + "learning_rate": 6.838736320462524e-05, + "loss": 0.9377, + "step": 8150 + }, + { + "epoch": 0.330028328611898, + "grad_norm": 1.3319944143295288, + "learning_rate": 6.836671484616973e-05, + "loss": 1.0289, + "step": 8155 + }, + { + "epoch": 0.330230675839741, + "grad_norm": 1.2189615964889526, + "learning_rate": 6.834606648771423e-05, + "loss": 1.0323, + "step": 8160 + }, + { + "epoch": 0.330433023067584, + "grad_norm": 1.179700255393982, + "learning_rate": 6.832541812925872e-05, + "loss": 0.98, + "step": 8165 + }, + { + "epoch": 0.33063537029542694, + "grad_norm": 1.1240464448928833, + "learning_rate": 6.830476977080323e-05, + "loss": 0.9815, + "step": 8170 + }, + { + "epoch": 0.33083771752326996, + "grad_norm": 1.1898940801620483, + "learning_rate": 6.828412141234772e-05, + "loss": 1.005, + "step": 8175 + }, + { + "epoch": 0.3310400647511129, + "grad_norm": 1.1352357864379883, + "learning_rate": 6.826347305389222e-05, + "loss": 1.0635, + "step": 8180 + }, + { + "epoch": 0.3312424119789559, + "grad_norm": 1.1874902248382568, + "learning_rate": 6.824282469543671e-05, + "loss": 1.0506, + "step": 8185 + }, + { + "epoch": 0.3314447592067989, + "grad_norm": 1.250718116760254, + "learning_rate": 6.822217633698122e-05, + "loss": 1.0776, + "step": 8190 + }, + { + "epoch": 0.33164710643464185, + "grad_norm": 1.1884347200393677, + "learning_rate": 6.820152797852572e-05, + "loss": 1.0312, + "step": 8195 + }, + { + "epoch": 0.3318494536624848, + "grad_norm": 1.047613501548767, + "learning_rate": 6.818087962007021e-05, + "loss": 0.98, + "step": 8200 + }, + { + "epoch": 0.3320518008903278, + "grad_norm": 1.2003908157348633, + "learning_rate": 6.81602312616147e-05, + "loss": 1.0186, + "step": 8205 + }, + { + "epoch": 0.3322541481181708, + "grad_norm": 1.148883581161499, + "learning_rate": 6.81395829031592e-05, + "loss": 0.9383, + "step": 8210 + }, + { + "epoch": 0.33245649534601374, + "grad_norm": 1.173259973526001, + "learning_rate": 6.81189345447037e-05, + "loss": 1.0041, + "step": 8215 + }, + { + "epoch": 0.33265884257385675, + "grad_norm": 1.140166997909546, + "learning_rate": 6.80982861862482e-05, + "loss": 1.0139, + "step": 8220 + }, + { + "epoch": 0.3328611898016997, + "grad_norm": 1.3193848133087158, + "learning_rate": 6.807763782779269e-05, + "loss": 1.0187, + "step": 8225 + }, + { + "epoch": 0.3330635370295427, + "grad_norm": 1.1724793910980225, + "learning_rate": 6.80569894693372e-05, + "loss": 1.0155, + "step": 8230 + }, + { + "epoch": 0.3332658842573857, + "grad_norm": 1.2499336004257202, + "learning_rate": 6.803634111088168e-05, + "loss": 1.0641, + "step": 8235 + }, + { + "epoch": 0.33346823148522864, + "grad_norm": 1.2024049758911133, + "learning_rate": 6.801569275242619e-05, + "loss": 1.0274, + "step": 8240 + }, + { + "epoch": 0.33367057871307165, + "grad_norm": 1.1916636228561401, + "learning_rate": 6.799504439397068e-05, + "loss": 0.9823, + "step": 8245 + }, + { + "epoch": 0.3338729259409146, + "grad_norm": 1.1534245014190674, + "learning_rate": 6.797439603551517e-05, + "loss": 0.9894, + "step": 8250 + }, + { + "epoch": 0.33407527316875757, + "grad_norm": 1.3492305278778076, + "learning_rate": 6.795374767705967e-05, + "loss": 1.0102, + "step": 8255 + }, + { + "epoch": 0.3342776203966006, + "grad_norm": 1.2801117897033691, + "learning_rate": 6.793309931860418e-05, + "loss": 0.9809, + "step": 8260 + }, + { + "epoch": 0.33447996762444354, + "grad_norm": 1.2604655027389526, + "learning_rate": 6.791245096014868e-05, + "loss": 1.0098, + "step": 8265 + }, + { + "epoch": 0.3346823148522865, + "grad_norm": 1.1137096881866455, + "learning_rate": 6.789180260169317e-05, + "loss": 0.9381, + "step": 8270 + }, + { + "epoch": 0.3348846620801295, + "grad_norm": 1.128345012664795, + "learning_rate": 6.787115424323766e-05, + "loss": 0.9622, + "step": 8275 + }, + { + "epoch": 0.33508700930797247, + "grad_norm": 1.153509259223938, + "learning_rate": 6.785050588478217e-05, + "loss": 0.9851, + "step": 8280 + }, + { + "epoch": 0.3352893565358155, + "grad_norm": 1.18508780002594, + "learning_rate": 6.782985752632666e-05, + "loss": 0.9857, + "step": 8285 + }, + { + "epoch": 0.33549170376365844, + "grad_norm": 1.2997406721115112, + "learning_rate": 6.780920916787116e-05, + "loss": 1.0219, + "step": 8290 + }, + { + "epoch": 0.3356940509915014, + "grad_norm": 1.0437331199645996, + "learning_rate": 6.778856080941565e-05, + "loss": 0.9976, + "step": 8295 + }, + { + "epoch": 0.3358963982193444, + "grad_norm": 1.1383854150772095, + "learning_rate": 6.776791245096014e-05, + "loss": 0.9957, + "step": 8300 + }, + { + "epoch": 0.3360987454471874, + "grad_norm": 1.0369354486465454, + "learning_rate": 6.774726409250465e-05, + "loss": 0.9836, + "step": 8305 + }, + { + "epoch": 0.33630109267503033, + "grad_norm": 0.9924631118774414, + "learning_rate": 6.772661573404915e-05, + "loss": 1.0246, + "step": 8310 + }, + { + "epoch": 0.33650343990287335, + "grad_norm": 1.0379579067230225, + "learning_rate": 6.770596737559366e-05, + "loss": 0.9578, + "step": 8315 + }, + { + "epoch": 0.3367057871307163, + "grad_norm": 1.2099262475967407, + "learning_rate": 6.768531901713813e-05, + "loss": 1.0898, + "step": 8320 + }, + { + "epoch": 0.33690813435855926, + "grad_norm": 1.0912680625915527, + "learning_rate": 6.766467065868264e-05, + "loss": 0.9977, + "step": 8325 + }, + { + "epoch": 0.3371104815864023, + "grad_norm": 1.2483148574829102, + "learning_rate": 6.764402230022714e-05, + "loss": 1.0372, + "step": 8330 + }, + { + "epoch": 0.33731282881424524, + "grad_norm": 1.259277105331421, + "learning_rate": 6.762337394177163e-05, + "loss": 0.998, + "step": 8335 + }, + { + "epoch": 0.33751517604208825, + "grad_norm": 1.105028510093689, + "learning_rate": 6.760272558331613e-05, + "loss": 1.0244, + "step": 8340 + }, + { + "epoch": 0.3377175232699312, + "grad_norm": 1.0760465860366821, + "learning_rate": 6.758207722486063e-05, + "loss": 1.0465, + "step": 8345 + }, + { + "epoch": 0.33791987049777417, + "grad_norm": 1.388336420059204, + "learning_rate": 6.756142886640512e-05, + "loss": 0.9558, + "step": 8350 + }, + { + "epoch": 0.3381222177256172, + "grad_norm": 1.2203596830368042, + "learning_rate": 6.754078050794962e-05, + "loss": 0.9896, + "step": 8355 + }, + { + "epoch": 0.33832456495346014, + "grad_norm": 1.1341876983642578, + "learning_rate": 6.752013214949412e-05, + "loss": 0.9645, + "step": 8360 + }, + { + "epoch": 0.3385269121813031, + "grad_norm": 1.2270278930664062, + "learning_rate": 6.749948379103861e-05, + "loss": 1.034, + "step": 8365 + }, + { + "epoch": 0.3387292594091461, + "grad_norm": 1.097016453742981, + "learning_rate": 6.74788354325831e-05, + "loss": 0.9782, + "step": 8370 + }, + { + "epoch": 0.33893160663698907, + "grad_norm": 1.2199102640151978, + "learning_rate": 6.745818707412761e-05, + "loss": 0.942, + "step": 8375 + }, + { + "epoch": 0.339133953864832, + "grad_norm": 1.2162120342254639, + "learning_rate": 6.743753871567211e-05, + "loss": 0.9886, + "step": 8380 + }, + { + "epoch": 0.33933630109267504, + "grad_norm": 1.1713508367538452, + "learning_rate": 6.74168903572166e-05, + "loss": 0.9767, + "step": 8385 + }, + { + "epoch": 0.339538648320518, + "grad_norm": 1.2054520845413208, + "learning_rate": 6.73962419987611e-05, + "loss": 0.965, + "step": 8390 + }, + { + "epoch": 0.339740995548361, + "grad_norm": 1.1206656694412231, + "learning_rate": 6.73755936403056e-05, + "loss": 0.98, + "step": 8395 + }, + { + "epoch": 0.33994334277620397, + "grad_norm": 1.171546459197998, + "learning_rate": 6.73549452818501e-05, + "loss": 1.0471, + "step": 8400 + }, + { + "epoch": 0.34014569000404693, + "grad_norm": 1.2924500703811646, + "learning_rate": 6.733429692339459e-05, + "loss": 0.9631, + "step": 8405 + }, + { + "epoch": 0.34034803723188994, + "grad_norm": 1.4253216981887817, + "learning_rate": 6.73136485649391e-05, + "loss": 0.9822, + "step": 8410 + }, + { + "epoch": 0.3405503844597329, + "grad_norm": 1.2813270092010498, + "learning_rate": 6.729300020648359e-05, + "loss": 1.0079, + "step": 8415 + }, + { + "epoch": 0.34075273168757586, + "grad_norm": 1.2764500379562378, + "learning_rate": 6.727235184802808e-05, + "loss": 1.0137, + "step": 8420 + }, + { + "epoch": 0.3409550789154189, + "grad_norm": 1.2863959074020386, + "learning_rate": 6.725170348957258e-05, + "loss": 1.0109, + "step": 8425 + }, + { + "epoch": 0.34115742614326183, + "grad_norm": 1.150322675704956, + "learning_rate": 6.723105513111709e-05, + "loss": 0.9935, + "step": 8430 + }, + { + "epoch": 0.3413597733711048, + "grad_norm": 1.238601803779602, + "learning_rate": 6.721040677266158e-05, + "loss": 0.9995, + "step": 8435 + }, + { + "epoch": 0.3415621205989478, + "grad_norm": 1.0317527055740356, + "learning_rate": 6.718975841420607e-05, + "loss": 1.0332, + "step": 8440 + }, + { + "epoch": 0.34176446782679076, + "grad_norm": 1.1871312856674194, + "learning_rate": 6.716911005575057e-05, + "loss": 0.9883, + "step": 8445 + }, + { + "epoch": 0.3419668150546338, + "grad_norm": 1.317663550376892, + "learning_rate": 6.714846169729508e-05, + "loss": 1.0207, + "step": 8450 + }, + { + "epoch": 0.34216916228247674, + "grad_norm": 1.2404143810272217, + "learning_rate": 6.712781333883957e-05, + "loss": 0.977, + "step": 8455 + }, + { + "epoch": 0.3423715095103197, + "grad_norm": 1.241338849067688, + "learning_rate": 6.710716498038406e-05, + "loss": 0.9577, + "step": 8460 + }, + { + "epoch": 0.3425738567381627, + "grad_norm": 1.1391888856887817, + "learning_rate": 6.708651662192856e-05, + "loss": 0.9494, + "step": 8465 + }, + { + "epoch": 0.34277620396600567, + "grad_norm": 1.0304710865020752, + "learning_rate": 6.706586826347305e-05, + "loss": 1.0009, + "step": 8470 + }, + { + "epoch": 0.3429785511938486, + "grad_norm": 1.1427103281021118, + "learning_rate": 6.704521990501756e-05, + "loss": 0.9629, + "step": 8475 + }, + { + "epoch": 0.34318089842169164, + "grad_norm": 1.000894546508789, + "learning_rate": 6.702457154656206e-05, + "loss": 0.9834, + "step": 8480 + }, + { + "epoch": 0.3433832456495346, + "grad_norm": 1.1159125566482544, + "learning_rate": 6.700392318810655e-05, + "loss": 1.0035, + "step": 8485 + }, + { + "epoch": 0.34358559287737755, + "grad_norm": 1.2044910192489624, + "learning_rate": 6.698327482965104e-05, + "loss": 1.0604, + "step": 8490 + }, + { + "epoch": 0.34378794010522057, + "grad_norm": 1.2182488441467285, + "learning_rate": 6.696262647119554e-05, + "loss": 1.0079, + "step": 8495 + }, + { + "epoch": 0.3439902873330635, + "grad_norm": 1.2006793022155762, + "learning_rate": 6.694197811274005e-05, + "loss": 1.0278, + "step": 8500 + }, + { + "epoch": 0.34419263456090654, + "grad_norm": 1.1410280466079712, + "learning_rate": 6.692132975428454e-05, + "loss": 1.0009, + "step": 8505 + }, + { + "epoch": 0.3443949817887495, + "grad_norm": 1.0319397449493408, + "learning_rate": 6.690068139582903e-05, + "loss": 1.0348, + "step": 8510 + }, + { + "epoch": 0.34459732901659246, + "grad_norm": 1.0982345342636108, + "learning_rate": 6.688003303737353e-05, + "loss": 1.0038, + "step": 8515 + }, + { + "epoch": 0.34479967624443547, + "grad_norm": 1.2336574792861938, + "learning_rate": 6.685938467891802e-05, + "loss": 1.0311, + "step": 8520 + }, + { + "epoch": 0.34500202347227843, + "grad_norm": 1.215314269065857, + "learning_rate": 6.683873632046253e-05, + "loss": 1.0142, + "step": 8525 + }, + { + "epoch": 0.3452043707001214, + "grad_norm": 1.2042006254196167, + "learning_rate": 6.681808796200703e-05, + "loss": 1.0162, + "step": 8530 + }, + { + "epoch": 0.3454067179279644, + "grad_norm": 1.1784260272979736, + "learning_rate": 6.679743960355152e-05, + "loss": 1.0074, + "step": 8535 + }, + { + "epoch": 0.34560906515580736, + "grad_norm": 1.1501988172531128, + "learning_rate": 6.677679124509601e-05, + "loss": 0.9841, + "step": 8540 + }, + { + "epoch": 0.3458114123836503, + "grad_norm": 1.0892448425292969, + "learning_rate": 6.675614288664052e-05, + "loss": 1.022, + "step": 8545 + }, + { + "epoch": 0.34601375961149333, + "grad_norm": 1.1556189060211182, + "learning_rate": 6.673549452818502e-05, + "loss": 0.9731, + "step": 8550 + }, + { + "epoch": 0.3462161068393363, + "grad_norm": 1.184709072113037, + "learning_rate": 6.671484616972951e-05, + "loss": 0.9789, + "step": 8555 + }, + { + "epoch": 0.3464184540671793, + "grad_norm": 1.1903433799743652, + "learning_rate": 6.6694197811274e-05, + "loss": 0.9733, + "step": 8560 + }, + { + "epoch": 0.34662080129502226, + "grad_norm": 1.0671143531799316, + "learning_rate": 6.66735494528185e-05, + "loss": 0.9794, + "step": 8565 + }, + { + "epoch": 0.3468231485228652, + "grad_norm": 1.2823303937911987, + "learning_rate": 6.6652901094363e-05, + "loss": 1.0007, + "step": 8570 + }, + { + "epoch": 0.34702549575070823, + "grad_norm": 1.497361660003662, + "learning_rate": 6.66322527359075e-05, + "loss": 1.0172, + "step": 8575 + }, + { + "epoch": 0.3472278429785512, + "grad_norm": 1.0234427452087402, + "learning_rate": 6.661160437745199e-05, + "loss": 0.9712, + "step": 8580 + }, + { + "epoch": 0.34743019020639415, + "grad_norm": 1.2735185623168945, + "learning_rate": 6.65909560189965e-05, + "loss": 1.0148, + "step": 8585 + }, + { + "epoch": 0.34763253743423717, + "grad_norm": 1.2002781629562378, + "learning_rate": 6.657030766054099e-05, + "loss": 1.0283, + "step": 8590 + }, + { + "epoch": 0.3478348846620801, + "grad_norm": 1.1865485906600952, + "learning_rate": 6.654965930208549e-05, + "loss": 1.0345, + "step": 8595 + }, + { + "epoch": 0.3480372318899231, + "grad_norm": 1.1625314950942993, + "learning_rate": 6.652901094363e-05, + "loss": 0.9528, + "step": 8600 + }, + { + "epoch": 0.3482395791177661, + "grad_norm": 1.134570598602295, + "learning_rate": 6.650836258517447e-05, + "loss": 0.9629, + "step": 8605 + }, + { + "epoch": 0.34844192634560905, + "grad_norm": 1.257711410522461, + "learning_rate": 6.648771422671898e-05, + "loss": 1.0164, + "step": 8610 + }, + { + "epoch": 0.34864427357345207, + "grad_norm": 1.0914216041564941, + "learning_rate": 6.646706586826348e-05, + "loss": 1.0292, + "step": 8615 + }, + { + "epoch": 0.348846620801295, + "grad_norm": 1.0632694959640503, + "learning_rate": 6.644641750980798e-05, + "loss": 0.9774, + "step": 8620 + }, + { + "epoch": 0.349048968029138, + "grad_norm": 1.3002113103866577, + "learning_rate": 6.642576915135247e-05, + "loss": 1.1101, + "step": 8625 + }, + { + "epoch": 0.349251315256981, + "grad_norm": 1.139308214187622, + "learning_rate": 6.640512079289696e-05, + "loss": 1.041, + "step": 8630 + }, + { + "epoch": 0.34945366248482396, + "grad_norm": 1.0839093923568726, + "learning_rate": 6.638447243444147e-05, + "loss": 0.9781, + "step": 8635 + }, + { + "epoch": 0.3496560097126669, + "grad_norm": 1.075213074684143, + "learning_rate": 6.636382407598596e-05, + "loss": 0.9966, + "step": 8640 + }, + { + "epoch": 0.34985835694050993, + "grad_norm": 1.1610937118530273, + "learning_rate": 6.634317571753046e-05, + "loss": 1.0111, + "step": 8645 + }, + { + "epoch": 0.3500607041683529, + "grad_norm": 1.100316047668457, + "learning_rate": 6.632252735907495e-05, + "loss": 0.9945, + "step": 8650 + }, + { + "epoch": 0.35026305139619585, + "grad_norm": 1.2150291204452515, + "learning_rate": 6.630187900061944e-05, + "loss": 0.9342, + "step": 8655 + }, + { + "epoch": 0.35046539862403886, + "grad_norm": 1.0899677276611328, + "learning_rate": 6.628123064216395e-05, + "loss": 1.0108, + "step": 8660 + }, + { + "epoch": 0.3506677458518818, + "grad_norm": 1.3460205793380737, + "learning_rate": 6.626058228370845e-05, + "loss": 1.0266, + "step": 8665 + }, + { + "epoch": 0.35087009307972483, + "grad_norm": 1.128051519393921, + "learning_rate": 6.623993392525296e-05, + "loss": 0.9024, + "step": 8670 + }, + { + "epoch": 0.3510724403075678, + "grad_norm": 1.1437114477157593, + "learning_rate": 6.621928556679743e-05, + "loss": 1.0248, + "step": 8675 + }, + { + "epoch": 0.35127478753541075, + "grad_norm": 1.159614086151123, + "learning_rate": 6.619863720834194e-05, + "loss": 1.0539, + "step": 8680 + }, + { + "epoch": 0.35147713476325376, + "grad_norm": 1.203351378440857, + "learning_rate": 6.617798884988644e-05, + "loss": 0.9775, + "step": 8685 + }, + { + "epoch": 0.3516794819910967, + "grad_norm": 1.1921762228012085, + "learning_rate": 6.615734049143093e-05, + "loss": 1.017, + "step": 8690 + }, + { + "epoch": 0.3518818292189397, + "grad_norm": 1.1406958103179932, + "learning_rate": 6.613669213297544e-05, + "loss": 0.9743, + "step": 8695 + }, + { + "epoch": 0.3520841764467827, + "grad_norm": 1.2871545553207397, + "learning_rate": 6.611604377451993e-05, + "loss": 1.0171, + "step": 8700 + }, + { + "epoch": 0.35228652367462565, + "grad_norm": 1.2367531061172485, + "learning_rate": 6.609539541606442e-05, + "loss": 1.0321, + "step": 8705 + }, + { + "epoch": 0.3524888709024686, + "grad_norm": 1.133935570716858, + "learning_rate": 6.607474705760892e-05, + "loss": 0.9645, + "step": 8710 + }, + { + "epoch": 0.3526912181303116, + "grad_norm": 1.1796834468841553, + "learning_rate": 6.605409869915343e-05, + "loss": 0.9797, + "step": 8715 + }, + { + "epoch": 0.3528935653581546, + "grad_norm": 1.1752643585205078, + "learning_rate": 6.603345034069792e-05, + "loss": 1.019, + "step": 8720 + }, + { + "epoch": 0.3530959125859976, + "grad_norm": 1.2460026741027832, + "learning_rate": 6.60128019822424e-05, + "loss": 1.0578, + "step": 8725 + }, + { + "epoch": 0.35329825981384055, + "grad_norm": 1.1869683265686035, + "learning_rate": 6.599215362378691e-05, + "loss": 1.0183, + "step": 8730 + }, + { + "epoch": 0.3535006070416835, + "grad_norm": 1.1391346454620361, + "learning_rate": 6.597150526533141e-05, + "loss": 1.0393, + "step": 8735 + }, + { + "epoch": 0.3537029542695265, + "grad_norm": 1.1753348112106323, + "learning_rate": 6.59508569068759e-05, + "loss": 1.0113, + "step": 8740 + }, + { + "epoch": 0.3539053014973695, + "grad_norm": 1.1217572689056396, + "learning_rate": 6.593020854842041e-05, + "loss": 0.9776, + "step": 8745 + }, + { + "epoch": 0.35410764872521244, + "grad_norm": 1.3061097860336304, + "learning_rate": 6.59095601899649e-05, + "loss": 1.0277, + "step": 8750 + }, + { + "epoch": 0.35430999595305546, + "grad_norm": 1.143172264099121, + "learning_rate": 6.58889118315094e-05, + "loss": 0.9993, + "step": 8755 + }, + { + "epoch": 0.3545123431808984, + "grad_norm": 1.1326736211776733, + "learning_rate": 6.58682634730539e-05, + "loss": 1.015, + "step": 8760 + }, + { + "epoch": 0.3547146904087414, + "grad_norm": 1.2135322093963623, + "learning_rate": 6.58476151145984e-05, + "loss": 1.0018, + "step": 8765 + }, + { + "epoch": 0.3549170376365844, + "grad_norm": 1.2909351587295532, + "learning_rate": 6.582696675614289e-05, + "loss": 1.0505, + "step": 8770 + }, + { + "epoch": 0.35511938486442735, + "grad_norm": 1.08174729347229, + "learning_rate": 6.580631839768738e-05, + "loss": 1.0007, + "step": 8775 + }, + { + "epoch": 0.35532173209227036, + "grad_norm": 1.168229579925537, + "learning_rate": 6.578567003923188e-05, + "loss": 0.9724, + "step": 8780 + }, + { + "epoch": 0.3555240793201133, + "grad_norm": 1.1908806562423706, + "learning_rate": 6.576502168077639e-05, + "loss": 1.0154, + "step": 8785 + }, + { + "epoch": 0.3557264265479563, + "grad_norm": 1.086943507194519, + "learning_rate": 6.574437332232088e-05, + "loss": 1.0814, + "step": 8790 + }, + { + "epoch": 0.3559287737757993, + "grad_norm": 1.2127574682235718, + "learning_rate": 6.572372496386537e-05, + "loss": 1.0067, + "step": 8795 + }, + { + "epoch": 0.35613112100364225, + "grad_norm": 1.2844805717468262, + "learning_rate": 6.570307660540987e-05, + "loss": 1.0409, + "step": 8800 + }, + { + "epoch": 0.3563334682314852, + "grad_norm": 1.143509030342102, + "learning_rate": 6.568242824695438e-05, + "loss": 1.0219, + "step": 8805 + }, + { + "epoch": 0.3565358154593282, + "grad_norm": 1.1596078872680664, + "learning_rate": 6.566177988849887e-05, + "loss": 1.0286, + "step": 8810 + }, + { + "epoch": 0.3567381626871712, + "grad_norm": 1.260378122329712, + "learning_rate": 6.564113153004337e-05, + "loss": 0.9993, + "step": 8815 + }, + { + "epoch": 0.35694050991501414, + "grad_norm": 1.2232650518417358, + "learning_rate": 6.562048317158786e-05, + "loss": 0.917, + "step": 8820 + }, + { + "epoch": 0.35714285714285715, + "grad_norm": 1.0145820379257202, + "learning_rate": 6.559983481313235e-05, + "loss": 0.9615, + "step": 8825 + }, + { + "epoch": 0.3573452043707001, + "grad_norm": 1.060854434967041, + "learning_rate": 6.557918645467686e-05, + "loss": 0.9444, + "step": 8830 + }, + { + "epoch": 0.3575475515985431, + "grad_norm": 1.2130520343780518, + "learning_rate": 6.555853809622136e-05, + "loss": 1.0045, + "step": 8835 + }, + { + "epoch": 0.3577498988263861, + "grad_norm": 1.0780822038650513, + "learning_rate": 6.553788973776585e-05, + "loss": 1.0154, + "step": 8840 + }, + { + "epoch": 0.35795224605422904, + "grad_norm": 1.2764308452606201, + "learning_rate": 6.551724137931034e-05, + "loss": 0.9512, + "step": 8845 + }, + { + "epoch": 0.35815459328207205, + "grad_norm": 1.157224416732788, + "learning_rate": 6.549659302085485e-05, + "loss": 0.9735, + "step": 8850 + }, + { + "epoch": 0.358356940509915, + "grad_norm": 1.2173582315444946, + "learning_rate": 6.547594466239935e-05, + "loss": 0.9817, + "step": 8855 + }, + { + "epoch": 0.35855928773775797, + "grad_norm": 1.1990514993667603, + "learning_rate": 6.545529630394384e-05, + "loss": 1.0039, + "step": 8860 + }, + { + "epoch": 0.358761634965601, + "grad_norm": 1.1545562744140625, + "learning_rate": 6.543464794548833e-05, + "loss": 0.9734, + "step": 8865 + }, + { + "epoch": 0.35896398219344394, + "grad_norm": 1.3034543991088867, + "learning_rate": 6.541399958703283e-05, + "loss": 1.047, + "step": 8870 + }, + { + "epoch": 0.3591663294212869, + "grad_norm": 1.164923071861267, + "learning_rate": 6.539335122857733e-05, + "loss": 1.0514, + "step": 8875 + }, + { + "epoch": 0.3593686766491299, + "grad_norm": 1.2409639358520508, + "learning_rate": 6.537270287012183e-05, + "loss": 1.0195, + "step": 8880 + }, + { + "epoch": 0.3595710238769729, + "grad_norm": 1.2295268774032593, + "learning_rate": 6.535205451166633e-05, + "loss": 1.0683, + "step": 8885 + }, + { + "epoch": 0.3597733711048159, + "grad_norm": 1.1249150037765503, + "learning_rate": 6.533140615321082e-05, + "loss": 1.0313, + "step": 8890 + }, + { + "epoch": 0.35997571833265885, + "grad_norm": 1.0729941129684448, + "learning_rate": 6.531075779475531e-05, + "loss": 1.0461, + "step": 8895 + }, + { + "epoch": 0.3601780655605018, + "grad_norm": 1.1724107265472412, + "learning_rate": 6.529010943629982e-05, + "loss": 1.002, + "step": 8900 + }, + { + "epoch": 0.3603804127883448, + "grad_norm": 1.2224723100662231, + "learning_rate": 6.526946107784432e-05, + "loss": 1.0158, + "step": 8905 + }, + { + "epoch": 0.3605827600161878, + "grad_norm": 1.2359263896942139, + "learning_rate": 6.524881271938881e-05, + "loss": 0.9996, + "step": 8910 + }, + { + "epoch": 0.36078510724403073, + "grad_norm": 1.0514304637908936, + "learning_rate": 6.52281643609333e-05, + "loss": 1.0003, + "step": 8915 + }, + { + "epoch": 0.36098745447187375, + "grad_norm": 1.1479510068893433, + "learning_rate": 6.520751600247781e-05, + "loss": 1.0162, + "step": 8920 + }, + { + "epoch": 0.3611898016997167, + "grad_norm": 1.1739133596420288, + "learning_rate": 6.51868676440223e-05, + "loss": 0.9948, + "step": 8925 + }, + { + "epoch": 0.3613921489275597, + "grad_norm": 1.0608174800872803, + "learning_rate": 6.51662192855668e-05, + "loss": 0.9774, + "step": 8930 + }, + { + "epoch": 0.3615944961554027, + "grad_norm": 1.2887754440307617, + "learning_rate": 6.514557092711129e-05, + "loss": 0.9972, + "step": 8935 + }, + { + "epoch": 0.36179684338324564, + "grad_norm": 1.2014949321746826, + "learning_rate": 6.51249225686558e-05, + "loss": 0.9897, + "step": 8940 + }, + { + "epoch": 0.36199919061108865, + "grad_norm": 1.1276029348373413, + "learning_rate": 6.510427421020029e-05, + "loss": 1.022, + "step": 8945 + }, + { + "epoch": 0.3622015378389316, + "grad_norm": 1.2513147592544556, + "learning_rate": 6.508362585174479e-05, + "loss": 1.103, + "step": 8950 + }, + { + "epoch": 0.36240388506677457, + "grad_norm": 1.149227261543274, + "learning_rate": 6.50629774932893e-05, + "loss": 0.9761, + "step": 8955 + }, + { + "epoch": 0.3626062322946176, + "grad_norm": 1.2752100229263306, + "learning_rate": 6.504232913483379e-05, + "loss": 0.9947, + "step": 8960 + }, + { + "epoch": 0.36280857952246054, + "grad_norm": 0.9958732724189758, + "learning_rate": 6.502168077637828e-05, + "loss": 0.9444, + "step": 8965 + }, + { + "epoch": 0.3630109267503035, + "grad_norm": 1.1153346300125122, + "learning_rate": 6.500103241792278e-05, + "loss": 0.9872, + "step": 8970 + }, + { + "epoch": 0.3632132739781465, + "grad_norm": 1.1249080896377563, + "learning_rate": 6.498038405946728e-05, + "loss": 1.0455, + "step": 8975 + }, + { + "epoch": 0.36341562120598947, + "grad_norm": 1.1198047399520874, + "learning_rate": 6.495973570101178e-05, + "loss": 1.0316, + "step": 8980 + }, + { + "epoch": 0.3636179684338325, + "grad_norm": 1.1231813430786133, + "learning_rate": 6.493908734255627e-05, + "loss": 1.0009, + "step": 8985 + }, + { + "epoch": 0.36382031566167544, + "grad_norm": 1.1274144649505615, + "learning_rate": 6.491843898410077e-05, + "loss": 0.9737, + "step": 8990 + }, + { + "epoch": 0.3640226628895184, + "grad_norm": 1.113556981086731, + "learning_rate": 6.489779062564526e-05, + "loss": 0.9659, + "step": 8995 + }, + { + "epoch": 0.3642250101173614, + "grad_norm": 1.1571226119995117, + "learning_rate": 6.487714226718976e-05, + "loss": 1.0494, + "step": 9000 + }, + { + "epoch": 0.3644273573452044, + "grad_norm": 1.1607192754745483, + "learning_rate": 6.485649390873425e-05, + "loss": 0.9423, + "step": 9005 + }, + { + "epoch": 0.36462970457304733, + "grad_norm": 1.0619757175445557, + "learning_rate": 6.483584555027875e-05, + "loss": 1.0382, + "step": 9010 + }, + { + "epoch": 0.36483205180089034, + "grad_norm": 1.1314587593078613, + "learning_rate": 6.481519719182325e-05, + "loss": 0.9555, + "step": 9015 + }, + { + "epoch": 0.3650343990287333, + "grad_norm": 1.0620434284210205, + "learning_rate": 6.479454883336775e-05, + "loss": 0.9703, + "step": 9020 + }, + { + "epoch": 0.36523674625657626, + "grad_norm": 1.18526291847229, + "learning_rate": 6.477390047491226e-05, + "loss": 0.9698, + "step": 9025 + }, + { + "epoch": 0.3654390934844193, + "grad_norm": 1.2484488487243652, + "learning_rate": 6.475325211645675e-05, + "loss": 1.0249, + "step": 9030 + }, + { + "epoch": 0.36564144071226223, + "grad_norm": 1.2012985944747925, + "learning_rate": 6.473260375800124e-05, + "loss": 0.9949, + "step": 9035 + }, + { + "epoch": 0.36584378794010525, + "grad_norm": 1.0843980312347412, + "learning_rate": 6.471195539954574e-05, + "loss": 1.0073, + "step": 9040 + }, + { + "epoch": 0.3660461351679482, + "grad_norm": 1.2630796432495117, + "learning_rate": 6.469130704109023e-05, + "loss": 1.0256, + "step": 9045 + }, + { + "epoch": 0.36624848239579116, + "grad_norm": 1.2447381019592285, + "learning_rate": 6.467065868263474e-05, + "loss": 1.0428, + "step": 9050 + }, + { + "epoch": 0.3664508296236342, + "grad_norm": 1.0718586444854736, + "learning_rate": 6.465001032417923e-05, + "loss": 1.0333, + "step": 9055 + }, + { + "epoch": 0.36665317685147714, + "grad_norm": 1.1159979104995728, + "learning_rate": 6.462936196572372e-05, + "loss": 1.0247, + "step": 9060 + }, + { + "epoch": 0.3668555240793201, + "grad_norm": 1.1500697135925293, + "learning_rate": 6.460871360726822e-05, + "loss": 1.0224, + "step": 9065 + }, + { + "epoch": 0.3670578713071631, + "grad_norm": 1.1497061252593994, + "learning_rate": 6.458806524881273e-05, + "loss": 1.0292, + "step": 9070 + }, + { + "epoch": 0.36726021853500607, + "grad_norm": 1.248008131980896, + "learning_rate": 6.456741689035723e-05, + "loss": 1.0007, + "step": 9075 + }, + { + "epoch": 0.367462565762849, + "grad_norm": 1.2401740550994873, + "learning_rate": 6.454676853190171e-05, + "loss": 0.9876, + "step": 9080 + }, + { + "epoch": 0.36766491299069204, + "grad_norm": 1.2434844970703125, + "learning_rate": 6.452612017344621e-05, + "loss": 1.0152, + "step": 9085 + }, + { + "epoch": 0.367867260218535, + "grad_norm": 1.1539874076843262, + "learning_rate": 6.450547181499072e-05, + "loss": 1.0284, + "step": 9090 + }, + { + "epoch": 0.368069607446378, + "grad_norm": 1.2069408893585205, + "learning_rate": 6.44848234565352e-05, + "loss": 0.9982, + "step": 9095 + }, + { + "epoch": 0.36827195467422097, + "grad_norm": 1.275341510772705, + "learning_rate": 6.446417509807971e-05, + "loss": 0.9699, + "step": 9100 + }, + { + "epoch": 0.3684743019020639, + "grad_norm": 1.1785728931427002, + "learning_rate": 6.44435267396242e-05, + "loss": 0.9883, + "step": 9105 + }, + { + "epoch": 0.36867664912990694, + "grad_norm": 1.1641168594360352, + "learning_rate": 6.44228783811687e-05, + "loss": 1.059, + "step": 9110 + }, + { + "epoch": 0.3688789963577499, + "grad_norm": 1.095104694366455, + "learning_rate": 6.44022300227132e-05, + "loss": 1.0262, + "step": 9115 + }, + { + "epoch": 0.36908134358559286, + "grad_norm": 1.2540876865386963, + "learning_rate": 6.43815816642577e-05, + "loss": 0.9935, + "step": 9120 + }, + { + "epoch": 0.36928369081343587, + "grad_norm": 1.1737287044525146, + "learning_rate": 6.436093330580219e-05, + "loss": 1.0222, + "step": 9125 + }, + { + "epoch": 0.36948603804127883, + "grad_norm": 1.1528546810150146, + "learning_rate": 6.434028494734668e-05, + "loss": 0.9998, + "step": 9130 + }, + { + "epoch": 0.3696883852691218, + "grad_norm": 1.2154673337936401, + "learning_rate": 6.431963658889118e-05, + "loss": 1.0409, + "step": 9135 + }, + { + "epoch": 0.3698907324969648, + "grad_norm": 1.1713812351226807, + "learning_rate": 6.429898823043569e-05, + "loss": 1.0076, + "step": 9140 + }, + { + "epoch": 0.37009307972480776, + "grad_norm": 1.2802129983901978, + "learning_rate": 6.427833987198018e-05, + "loss": 0.9767, + "step": 9145 + }, + { + "epoch": 0.3702954269526508, + "grad_norm": 1.17072594165802, + "learning_rate": 6.425769151352467e-05, + "loss": 0.9871, + "step": 9150 + }, + { + "epoch": 0.37049777418049373, + "grad_norm": 1.6219251155853271, + "learning_rate": 6.423704315506917e-05, + "loss": 1.0339, + "step": 9155 + }, + { + "epoch": 0.3707001214083367, + "grad_norm": 1.1033552885055542, + "learning_rate": 6.421639479661368e-05, + "loss": 1.0638, + "step": 9160 + }, + { + "epoch": 0.3709024686361797, + "grad_norm": 1.3425971269607544, + "learning_rate": 6.419574643815817e-05, + "loss": 1.0028, + "step": 9165 + }, + { + "epoch": 0.37110481586402266, + "grad_norm": 1.1628460884094238, + "learning_rate": 6.417509807970267e-05, + "loss": 0.9876, + "step": 9170 + }, + { + "epoch": 0.3713071630918656, + "grad_norm": 1.2166439294815063, + "learning_rate": 6.415444972124716e-05, + "loss": 1.0155, + "step": 9175 + }, + { + "epoch": 0.37150951031970864, + "grad_norm": 1.1808024644851685, + "learning_rate": 6.413380136279165e-05, + "loss": 0.9711, + "step": 9180 + }, + { + "epoch": 0.3717118575475516, + "grad_norm": 1.1127512454986572, + "learning_rate": 6.411315300433616e-05, + "loss": 0.9662, + "step": 9185 + }, + { + "epoch": 0.37191420477539455, + "grad_norm": 1.2843769788742065, + "learning_rate": 6.409250464588066e-05, + "loss": 1.021, + "step": 9190 + }, + { + "epoch": 0.37211655200323757, + "grad_norm": 1.1903021335601807, + "learning_rate": 6.407185628742515e-05, + "loss": 0.9993, + "step": 9195 + }, + { + "epoch": 0.3723188992310805, + "grad_norm": 1.1956493854522705, + "learning_rate": 6.405120792896964e-05, + "loss": 1.0009, + "step": 9200 + }, + { + "epoch": 0.37252124645892354, + "grad_norm": 1.1848713159561157, + "learning_rate": 6.403055957051415e-05, + "loss": 1.0089, + "step": 9205 + }, + { + "epoch": 0.3727235936867665, + "grad_norm": 1.0949673652648926, + "learning_rate": 6.400991121205865e-05, + "loss": 1.03, + "step": 9210 + }, + { + "epoch": 0.37292594091460946, + "grad_norm": 1.3276467323303223, + "learning_rate": 6.398926285360314e-05, + "loss": 1.0071, + "step": 9215 + }, + { + "epoch": 0.37312828814245247, + "grad_norm": 1.1459453105926514, + "learning_rate": 6.396861449514763e-05, + "loss": 1.0276, + "step": 9220 + }, + { + "epoch": 0.3733306353702954, + "grad_norm": 1.1494052410125732, + "learning_rate": 6.394796613669214e-05, + "loss": 1.0309, + "step": 9225 + }, + { + "epoch": 0.3735329825981384, + "grad_norm": 1.1526970863342285, + "learning_rate": 6.392731777823663e-05, + "loss": 1.0385, + "step": 9230 + }, + { + "epoch": 0.3737353298259814, + "grad_norm": 1.1462080478668213, + "learning_rate": 6.390666941978113e-05, + "loss": 1.0035, + "step": 9235 + }, + { + "epoch": 0.37393767705382436, + "grad_norm": 1.1997939348220825, + "learning_rate": 6.388602106132563e-05, + "loss": 0.9418, + "step": 9240 + }, + { + "epoch": 0.3741400242816673, + "grad_norm": 1.1789708137512207, + "learning_rate": 6.386537270287013e-05, + "loss": 1.0534, + "step": 9245 + }, + { + "epoch": 0.37434237150951033, + "grad_norm": 1.092958688735962, + "learning_rate": 6.384472434441462e-05, + "loss": 1.0371, + "step": 9250 + }, + { + "epoch": 0.3745447187373533, + "grad_norm": 1.2328760623931885, + "learning_rate": 6.382407598595912e-05, + "loss": 1.0039, + "step": 9255 + }, + { + "epoch": 0.3747470659651963, + "grad_norm": 1.1537787914276123, + "learning_rate": 6.380342762750362e-05, + "loss": 1.0679, + "step": 9260 + }, + { + "epoch": 0.37494941319303926, + "grad_norm": 1.1393468379974365, + "learning_rate": 6.378277926904811e-05, + "loss": 1.0122, + "step": 9265 + }, + { + "epoch": 0.3751517604208822, + "grad_norm": 1.287347435951233, + "learning_rate": 6.37621309105926e-05, + "loss": 0.9618, + "step": 9270 + }, + { + "epoch": 0.37535410764872523, + "grad_norm": 1.0002747774124146, + "learning_rate": 6.374148255213711e-05, + "loss": 0.9842, + "step": 9275 + }, + { + "epoch": 0.3755564548765682, + "grad_norm": 1.2078219652175903, + "learning_rate": 6.37208341936816e-05, + "loss": 1.0083, + "step": 9280 + }, + { + "epoch": 0.37575880210441115, + "grad_norm": 1.1521834135055542, + "learning_rate": 6.37001858352261e-05, + "loss": 0.9759, + "step": 9285 + }, + { + "epoch": 0.37596114933225416, + "grad_norm": 1.1871899366378784, + "learning_rate": 6.367953747677061e-05, + "loss": 1.0046, + "step": 9290 + }, + { + "epoch": 0.3761634965600971, + "grad_norm": 1.1746609210968018, + "learning_rate": 6.36588891183151e-05, + "loss": 1.0287, + "step": 9295 + }, + { + "epoch": 0.3763658437879401, + "grad_norm": 1.1792657375335693, + "learning_rate": 6.363824075985959e-05, + "loss": 1.0478, + "step": 9300 + }, + { + "epoch": 0.3765681910157831, + "grad_norm": 1.1547367572784424, + "learning_rate": 6.361759240140409e-05, + "loss": 1.0958, + "step": 9305 + }, + { + "epoch": 0.37677053824362605, + "grad_norm": 1.2225704193115234, + "learning_rate": 6.35969440429486e-05, + "loss": 1.0126, + "step": 9310 + }, + { + "epoch": 0.37697288547146907, + "grad_norm": 1.0655434131622314, + "learning_rate": 6.357629568449309e-05, + "loss": 1.0005, + "step": 9315 + }, + { + "epoch": 0.377175232699312, + "grad_norm": 1.0822560787200928, + "learning_rate": 6.355564732603758e-05, + "loss": 1.0338, + "step": 9320 + }, + { + "epoch": 0.377377579927155, + "grad_norm": 1.159787893295288, + "learning_rate": 6.353499896758208e-05, + "loss": 1.034, + "step": 9325 + }, + { + "epoch": 0.377579927154998, + "grad_norm": 1.0686500072479248, + "learning_rate": 6.351435060912659e-05, + "loss": 1.0069, + "step": 9330 + }, + { + "epoch": 0.37778227438284095, + "grad_norm": 1.304579734802246, + "learning_rate": 6.349370225067108e-05, + "loss": 0.9354, + "step": 9335 + }, + { + "epoch": 0.3779846216106839, + "grad_norm": 1.1627440452575684, + "learning_rate": 6.347305389221557e-05, + "loss": 0.9977, + "step": 9340 + }, + { + "epoch": 0.3781869688385269, + "grad_norm": 1.22258460521698, + "learning_rate": 6.345240553376007e-05, + "loss": 1.009, + "step": 9345 + }, + { + "epoch": 0.3783893160663699, + "grad_norm": 1.247749924659729, + "learning_rate": 6.343175717530456e-05, + "loss": 0.9786, + "step": 9350 + }, + { + "epoch": 0.37859166329421284, + "grad_norm": 1.127065896987915, + "learning_rate": 6.341110881684907e-05, + "loss": 0.9699, + "step": 9355 + }, + { + "epoch": 0.37879401052205586, + "grad_norm": 1.182802677154541, + "learning_rate": 6.339046045839357e-05, + "loss": 0.9907, + "step": 9360 + }, + { + "epoch": 0.3789963577498988, + "grad_norm": 1.0922465324401855, + "learning_rate": 6.336981209993805e-05, + "loss": 0.9663, + "step": 9365 + }, + { + "epoch": 0.37919870497774183, + "grad_norm": 1.1843430995941162, + "learning_rate": 6.334916374148255e-05, + "loss": 1.0436, + "step": 9370 + }, + { + "epoch": 0.3794010522055848, + "grad_norm": 1.1872503757476807, + "learning_rate": 6.332851538302705e-05, + "loss": 1.0583, + "step": 9375 + }, + { + "epoch": 0.37960339943342775, + "grad_norm": 1.1438274383544922, + "learning_rate": 6.330786702457156e-05, + "loss": 1.0213, + "step": 9380 + }, + { + "epoch": 0.37980574666127076, + "grad_norm": 1.1314841508865356, + "learning_rate": 6.328721866611605e-05, + "loss": 1.0038, + "step": 9385 + }, + { + "epoch": 0.3800080938891137, + "grad_norm": 1.3166422843933105, + "learning_rate": 6.326657030766054e-05, + "loss": 1.0751, + "step": 9390 + }, + { + "epoch": 0.3802104411169567, + "grad_norm": 1.115728735923767, + "learning_rate": 6.324592194920504e-05, + "loss": 1.033, + "step": 9395 + }, + { + "epoch": 0.3804127883447997, + "grad_norm": 1.0546056032180786, + "learning_rate": 6.322527359074953e-05, + "loss": 0.931, + "step": 9400 + }, + { + "epoch": 0.38061513557264265, + "grad_norm": 1.1586906909942627, + "learning_rate": 6.320462523229404e-05, + "loss": 1.0064, + "step": 9405 + }, + { + "epoch": 0.3808174828004856, + "grad_norm": 1.2858645915985107, + "learning_rate": 6.318397687383853e-05, + "loss": 1.0041, + "step": 9410 + }, + { + "epoch": 0.3810198300283286, + "grad_norm": 1.1011475324630737, + "learning_rate": 6.316332851538302e-05, + "loss": 1.0199, + "step": 9415 + }, + { + "epoch": 0.3812221772561716, + "grad_norm": 1.1812069416046143, + "learning_rate": 6.314268015692752e-05, + "loss": 0.998, + "step": 9420 + }, + { + "epoch": 0.3814245244840146, + "grad_norm": 1.2132670879364014, + "learning_rate": 6.312203179847203e-05, + "loss": 1.0099, + "step": 9425 + }, + { + "epoch": 0.38162687171185755, + "grad_norm": 1.3264063596725464, + "learning_rate": 6.310138344001653e-05, + "loss": 1.0115, + "step": 9430 + }, + { + "epoch": 0.3818292189397005, + "grad_norm": 1.084452509880066, + "learning_rate": 6.308073508156101e-05, + "loss": 0.9762, + "step": 9435 + }, + { + "epoch": 0.3820315661675435, + "grad_norm": 1.258212685585022, + "learning_rate": 6.306008672310551e-05, + "loss": 0.9482, + "step": 9440 + }, + { + "epoch": 0.3822339133953865, + "grad_norm": 1.2183173894882202, + "learning_rate": 6.303943836465002e-05, + "loss": 0.9731, + "step": 9445 + }, + { + "epoch": 0.38243626062322944, + "grad_norm": 1.2455830574035645, + "learning_rate": 6.301879000619451e-05, + "loss": 1.0306, + "step": 9450 + }, + { + "epoch": 0.38263860785107245, + "grad_norm": 1.2332731485366821, + "learning_rate": 6.299814164773901e-05, + "loss": 1.0703, + "step": 9455 + }, + { + "epoch": 0.3828409550789154, + "grad_norm": 1.212498664855957, + "learning_rate": 6.29774932892835e-05, + "loss": 0.9481, + "step": 9460 + }, + { + "epoch": 0.38304330230675837, + "grad_norm": 1.132043719291687, + "learning_rate": 6.2956844930828e-05, + "loss": 0.9952, + "step": 9465 + }, + { + "epoch": 0.3832456495346014, + "grad_norm": 1.193706750869751, + "learning_rate": 6.29361965723725e-05, + "loss": 0.9954, + "step": 9470 + }, + { + "epoch": 0.38344799676244434, + "grad_norm": 1.206764817237854, + "learning_rate": 6.2915548213917e-05, + "loss": 1.0108, + "step": 9475 + }, + { + "epoch": 0.38365034399028736, + "grad_norm": 1.2077068090438843, + "learning_rate": 6.289489985546149e-05, + "loss": 1.0416, + "step": 9480 + }, + { + "epoch": 0.3838526912181303, + "grad_norm": 1.2133302688598633, + "learning_rate": 6.287425149700598e-05, + "loss": 1.0439, + "step": 9485 + }, + { + "epoch": 0.3840550384459733, + "grad_norm": 1.1433510780334473, + "learning_rate": 6.285360313855049e-05, + "loss": 0.995, + "step": 9490 + }, + { + "epoch": 0.3842573856738163, + "grad_norm": 1.2141852378845215, + "learning_rate": 6.283295478009499e-05, + "loss": 0.9989, + "step": 9495 + }, + { + "epoch": 0.38445973290165925, + "grad_norm": 1.1626108884811401, + "learning_rate": 6.281230642163948e-05, + "loss": 0.9894, + "step": 9500 + }, + { + "epoch": 0.3846620801295022, + "grad_norm": 1.2495766878128052, + "learning_rate": 6.279165806318398e-05, + "loss": 0.9594, + "step": 9505 + }, + { + "epoch": 0.3848644273573452, + "grad_norm": 1.1957738399505615, + "learning_rate": 6.277100970472848e-05, + "loss": 1.0635, + "step": 9510 + }, + { + "epoch": 0.3850667745851882, + "grad_norm": 1.1707223653793335, + "learning_rate": 6.275036134627298e-05, + "loss": 1.0226, + "step": 9515 + }, + { + "epoch": 0.38526912181303113, + "grad_norm": 1.2480435371398926, + "learning_rate": 6.272971298781747e-05, + "loss": 1.0637, + "step": 9520 + }, + { + "epoch": 0.38547146904087415, + "grad_norm": 1.229467749595642, + "learning_rate": 6.270906462936197e-05, + "loss": 0.9747, + "step": 9525 + }, + { + "epoch": 0.3856738162687171, + "grad_norm": 1.2177376747131348, + "learning_rate": 6.268841627090646e-05, + "loss": 0.9665, + "step": 9530 + }, + { + "epoch": 0.3858761634965601, + "grad_norm": 1.1545181274414062, + "learning_rate": 6.266776791245095e-05, + "loss": 0.9642, + "step": 9535 + }, + { + "epoch": 0.3860785107244031, + "grad_norm": 1.0587862730026245, + "learning_rate": 6.264711955399546e-05, + "loss": 0.9979, + "step": 9540 + }, + { + "epoch": 0.38628085795224604, + "grad_norm": 1.0684212446212769, + "learning_rate": 6.262647119553996e-05, + "loss": 1.0164, + "step": 9545 + }, + { + "epoch": 0.38648320518008905, + "grad_norm": 1.3630390167236328, + "learning_rate": 6.260582283708445e-05, + "loss": 0.9687, + "step": 9550 + }, + { + "epoch": 0.386685552407932, + "grad_norm": 1.1170276403427124, + "learning_rate": 6.258517447862894e-05, + "loss": 1.0348, + "step": 9555 + }, + { + "epoch": 0.38688789963577497, + "grad_norm": 1.0596883296966553, + "learning_rate": 6.256452612017345e-05, + "loss": 1.0136, + "step": 9560 + }, + { + "epoch": 0.387090246863618, + "grad_norm": 1.0961753129959106, + "learning_rate": 6.254387776171795e-05, + "loss": 0.9488, + "step": 9565 + }, + { + "epoch": 0.38729259409146094, + "grad_norm": 1.198262333869934, + "learning_rate": 6.252322940326244e-05, + "loss": 1.0494, + "step": 9570 + }, + { + "epoch": 0.3874949413193039, + "grad_norm": 1.2458857297897339, + "learning_rate": 6.250258104480695e-05, + "loss": 1.0483, + "step": 9575 + }, + { + "epoch": 0.3876972885471469, + "grad_norm": 1.0762958526611328, + "learning_rate": 6.248193268635144e-05, + "loss": 1.0529, + "step": 9580 + }, + { + "epoch": 0.38789963577498987, + "grad_norm": 1.2601096630096436, + "learning_rate": 6.246128432789593e-05, + "loss": 0.9977, + "step": 9585 + }, + { + "epoch": 0.3881019830028329, + "grad_norm": 1.2507660388946533, + "learning_rate": 6.244063596944043e-05, + "loss": 1.0319, + "step": 9590 + }, + { + "epoch": 0.38830433023067584, + "grad_norm": 1.0800540447235107, + "learning_rate": 6.241998761098494e-05, + "loss": 1.0686, + "step": 9595 + }, + { + "epoch": 0.3885066774585188, + "grad_norm": 1.1687123775482178, + "learning_rate": 6.239933925252943e-05, + "loss": 1.0005, + "step": 9600 + }, + { + "epoch": 0.3887090246863618, + "grad_norm": 1.1139582395553589, + "learning_rate": 6.237869089407392e-05, + "loss": 1.0277, + "step": 9605 + }, + { + "epoch": 0.3889113719142048, + "grad_norm": 1.252946376800537, + "learning_rate": 6.235804253561842e-05, + "loss": 1.0128, + "step": 9610 + }, + { + "epoch": 0.38911371914204773, + "grad_norm": 1.088563084602356, + "learning_rate": 6.233739417716293e-05, + "loss": 0.9953, + "step": 9615 + }, + { + "epoch": 0.38931606636989075, + "grad_norm": 1.0879402160644531, + "learning_rate": 6.231674581870742e-05, + "loss": 0.9698, + "step": 9620 + }, + { + "epoch": 0.3895184135977337, + "grad_norm": 1.1478641033172607, + "learning_rate": 6.22960974602519e-05, + "loss": 1.0346, + "step": 9625 + }, + { + "epoch": 0.38972076082557666, + "grad_norm": 1.1672147512435913, + "learning_rate": 6.227544910179641e-05, + "loss": 0.9942, + "step": 9630 + }, + { + "epoch": 0.3899231080534197, + "grad_norm": 1.1004835367202759, + "learning_rate": 6.22548007433409e-05, + "loss": 1.0549, + "step": 9635 + }, + { + "epoch": 0.39012545528126263, + "grad_norm": 1.100008487701416, + "learning_rate": 6.22341523848854e-05, + "loss": 1.0119, + "step": 9640 + }, + { + "epoch": 0.39032780250910565, + "grad_norm": 1.2148017883300781, + "learning_rate": 6.221350402642991e-05, + "loss": 1.0729, + "step": 9645 + }, + { + "epoch": 0.3905301497369486, + "grad_norm": 1.2151126861572266, + "learning_rate": 6.21928556679744e-05, + "loss": 1.0249, + "step": 9650 + }, + { + "epoch": 0.39073249696479156, + "grad_norm": 1.074285864830017, + "learning_rate": 6.217220730951889e-05, + "loss": 1.0042, + "step": 9655 + }, + { + "epoch": 0.3909348441926346, + "grad_norm": 1.1544793844223022, + "learning_rate": 6.21515589510634e-05, + "loss": 1.0442, + "step": 9660 + }, + { + "epoch": 0.39113719142047754, + "grad_norm": 1.2228337526321411, + "learning_rate": 6.21309105926079e-05, + "loss": 1.0684, + "step": 9665 + }, + { + "epoch": 0.3913395386483205, + "grad_norm": 1.201342225074768, + "learning_rate": 6.211026223415239e-05, + "loss": 1.0026, + "step": 9670 + }, + { + "epoch": 0.3915418858761635, + "grad_norm": 1.127369999885559, + "learning_rate": 6.208961387569688e-05, + "loss": 0.999, + "step": 9675 + }, + { + "epoch": 0.39174423310400647, + "grad_norm": 1.2213554382324219, + "learning_rate": 6.206896551724138e-05, + "loss": 0.9542, + "step": 9680 + }, + { + "epoch": 0.3919465803318495, + "grad_norm": 1.2262746095657349, + "learning_rate": 6.204831715878589e-05, + "loss": 1.0872, + "step": 9685 + }, + { + "epoch": 0.39214892755969244, + "grad_norm": 1.2475377321243286, + "learning_rate": 6.202766880033038e-05, + "loss": 1.0248, + "step": 9690 + }, + { + "epoch": 0.3923512747875354, + "grad_norm": 1.161120057106018, + "learning_rate": 6.200702044187487e-05, + "loss": 1.0235, + "step": 9695 + }, + { + "epoch": 0.3925536220153784, + "grad_norm": 1.3133748769760132, + "learning_rate": 6.198637208341937e-05, + "loss": 1.0224, + "step": 9700 + }, + { + "epoch": 0.39275596924322137, + "grad_norm": 1.2146892547607422, + "learning_rate": 6.196572372496386e-05, + "loss": 1.0056, + "step": 9705 + }, + { + "epoch": 0.39295831647106433, + "grad_norm": 1.1127619743347168, + "learning_rate": 6.194507536650837e-05, + "loss": 1.0124, + "step": 9710 + }, + { + "epoch": 0.39316066369890734, + "grad_norm": 1.1972129344940186, + "learning_rate": 6.192442700805287e-05, + "loss": 1.0149, + "step": 9715 + }, + { + "epoch": 0.3933630109267503, + "grad_norm": 1.1928164958953857, + "learning_rate": 6.190377864959736e-05, + "loss": 1.018, + "step": 9720 + }, + { + "epoch": 0.39356535815459326, + "grad_norm": 1.1728615760803223, + "learning_rate": 6.188313029114185e-05, + "loss": 0.9853, + "step": 9725 + }, + { + "epoch": 0.3937677053824363, + "grad_norm": 1.1502978801727295, + "learning_rate": 6.186248193268636e-05, + "loss": 0.9828, + "step": 9730 + }, + { + "epoch": 0.39397005261027923, + "grad_norm": 1.169958233833313, + "learning_rate": 6.184183357423086e-05, + "loss": 0.9781, + "step": 9735 + }, + { + "epoch": 0.39417239983812224, + "grad_norm": 1.134246826171875, + "learning_rate": 6.182118521577535e-05, + "loss": 1.0098, + "step": 9740 + }, + { + "epoch": 0.3943747470659652, + "grad_norm": 1.4439961910247803, + "learning_rate": 6.180053685731984e-05, + "loss": 1.0077, + "step": 9745 + }, + { + "epoch": 0.39457709429380816, + "grad_norm": 1.2301324605941772, + "learning_rate": 6.177988849886435e-05, + "loss": 0.9961, + "step": 9750 + }, + { + "epoch": 0.3947794415216512, + "grad_norm": 1.174716591835022, + "learning_rate": 6.175924014040884e-05, + "loss": 1.0414, + "step": 9755 + }, + { + "epoch": 0.39498178874949413, + "grad_norm": 1.16427743434906, + "learning_rate": 6.173859178195334e-05, + "loss": 1.0298, + "step": 9760 + }, + { + "epoch": 0.3951841359773371, + "grad_norm": 1.3493810892105103, + "learning_rate": 6.171794342349783e-05, + "loss": 1.0464, + "step": 9765 + }, + { + "epoch": 0.3953864832051801, + "grad_norm": 1.3521122932434082, + "learning_rate": 6.169729506504233e-05, + "loss": 0.9717, + "step": 9770 + }, + { + "epoch": 0.39558883043302306, + "grad_norm": 1.2052327394485474, + "learning_rate": 6.167664670658683e-05, + "loss": 0.9965, + "step": 9775 + }, + { + "epoch": 0.395791177660866, + "grad_norm": 1.150696873664856, + "learning_rate": 6.165599834813133e-05, + "loss": 0.9972, + "step": 9780 + }, + { + "epoch": 0.39599352488870904, + "grad_norm": 1.1349776983261108, + "learning_rate": 6.163534998967583e-05, + "loss": 1.0575, + "step": 9785 + }, + { + "epoch": 0.396195872116552, + "grad_norm": 1.1687307357788086, + "learning_rate": 6.161470163122032e-05, + "loss": 1.0335, + "step": 9790 + }, + { + "epoch": 0.396398219344395, + "grad_norm": 1.2072405815124512, + "learning_rate": 6.159405327276481e-05, + "loss": 1.0044, + "step": 9795 + }, + { + "epoch": 0.39660056657223797, + "grad_norm": 1.060883641242981, + "learning_rate": 6.157340491430932e-05, + "loss": 1.0538, + "step": 9800 + }, + { + "epoch": 0.3968029138000809, + "grad_norm": 1.0977377891540527, + "learning_rate": 6.155275655585381e-05, + "loss": 0.9912, + "step": 9805 + }, + { + "epoch": 0.39700526102792394, + "grad_norm": 1.2743343114852905, + "learning_rate": 6.153210819739831e-05, + "loss": 1.0414, + "step": 9810 + }, + { + "epoch": 0.3972076082557669, + "grad_norm": 1.1622213125228882, + "learning_rate": 6.15114598389428e-05, + "loss": 0.9953, + "step": 9815 + }, + { + "epoch": 0.39740995548360986, + "grad_norm": 1.1404904127120972, + "learning_rate": 6.149081148048731e-05, + "loss": 1.035, + "step": 9820 + }, + { + "epoch": 0.39761230271145287, + "grad_norm": 1.1930956840515137, + "learning_rate": 6.14701631220318e-05, + "loss": 1.0397, + "step": 9825 + }, + { + "epoch": 0.39781464993929583, + "grad_norm": 1.158272385597229, + "learning_rate": 6.14495147635763e-05, + "loss": 1.0771, + "step": 9830 + }, + { + "epoch": 0.3980169971671388, + "grad_norm": 1.0452218055725098, + "learning_rate": 6.14288664051208e-05, + "loss": 1.029, + "step": 9835 + }, + { + "epoch": 0.3982193443949818, + "grad_norm": 1.1018149852752686, + "learning_rate": 6.140821804666528e-05, + "loss": 0.981, + "step": 9840 + }, + { + "epoch": 0.39842169162282476, + "grad_norm": 1.0617828369140625, + "learning_rate": 6.138756968820979e-05, + "loss": 1.0095, + "step": 9845 + }, + { + "epoch": 0.3986240388506678, + "grad_norm": 1.1889824867248535, + "learning_rate": 6.136692132975429e-05, + "loss": 0.9951, + "step": 9850 + }, + { + "epoch": 0.39882638607851073, + "grad_norm": 1.192192792892456, + "learning_rate": 6.134627297129878e-05, + "loss": 1.0154, + "step": 9855 + }, + { + "epoch": 0.3990287333063537, + "grad_norm": 1.2727915048599243, + "learning_rate": 6.132562461284329e-05, + "loss": 0.9683, + "step": 9860 + }, + { + "epoch": 0.3992310805341967, + "grad_norm": 1.1856448650360107, + "learning_rate": 6.130497625438778e-05, + "loss": 0.9876, + "step": 9865 + }, + { + "epoch": 0.39943342776203966, + "grad_norm": 1.104466438293457, + "learning_rate": 6.128432789593228e-05, + "loss": 0.9436, + "step": 9870 + }, + { + "epoch": 0.3996357749898826, + "grad_norm": 1.1249479055404663, + "learning_rate": 6.126367953747677e-05, + "loss": 1.0464, + "step": 9875 + }, + { + "epoch": 0.39983812221772563, + "grad_norm": 1.209954857826233, + "learning_rate": 6.124303117902128e-05, + "loss": 1.0734, + "step": 9880 + }, + { + "epoch": 0.4000404694455686, + "grad_norm": 1.080848217010498, + "learning_rate": 6.122238282056577e-05, + "loss": 0.9687, + "step": 9885 + }, + { + "epoch": 0.40024281667341155, + "grad_norm": 1.124177098274231, + "learning_rate": 6.120173446211026e-05, + "loss": 1.0057, + "step": 9890 + }, + { + "epoch": 0.40044516390125456, + "grad_norm": 1.0730397701263428, + "learning_rate": 6.118108610365476e-05, + "loss": 0.9908, + "step": 9895 + }, + { + "epoch": 0.4006475111290975, + "grad_norm": 1.192895531654358, + "learning_rate": 6.116043774519926e-05, + "loss": 1.0482, + "step": 9900 + }, + { + "epoch": 0.40084985835694054, + "grad_norm": 1.1992424726486206, + "learning_rate": 6.113978938674377e-05, + "loss": 0.9487, + "step": 9905 + }, + { + "epoch": 0.4010522055847835, + "grad_norm": 1.164494276046753, + "learning_rate": 6.111914102828825e-05, + "loss": 0.9974, + "step": 9910 + }, + { + "epoch": 0.40125455281262645, + "grad_norm": 0.9880785942077637, + "learning_rate": 6.109849266983275e-05, + "loss": 1.0034, + "step": 9915 + }, + { + "epoch": 0.40145690004046947, + "grad_norm": 1.2381869554519653, + "learning_rate": 6.107784431137725e-05, + "loss": 1.0313, + "step": 9920 + }, + { + "epoch": 0.4016592472683124, + "grad_norm": 1.1616636514663696, + "learning_rate": 6.105719595292174e-05, + "loss": 1.0086, + "step": 9925 + }, + { + "epoch": 0.4018615944961554, + "grad_norm": 1.2813246250152588, + "learning_rate": 6.103654759446625e-05, + "loss": 0.9827, + "step": 9930 + }, + { + "epoch": 0.4020639417239984, + "grad_norm": 1.1164380311965942, + "learning_rate": 6.1015899236010745e-05, + "loss": 1.0219, + "step": 9935 + }, + { + "epoch": 0.40226628895184136, + "grad_norm": 1.4000154733657837, + "learning_rate": 6.099525087755523e-05, + "loss": 0.9641, + "step": 9940 + }, + { + "epoch": 0.4024686361796843, + "grad_norm": 1.2360674142837524, + "learning_rate": 6.097460251909973e-05, + "loss": 0.9762, + "step": 9945 + }, + { + "epoch": 0.4026709834075273, + "grad_norm": 1.1454941034317017, + "learning_rate": 6.095395416064423e-05, + "loss": 1.0313, + "step": 9950 + }, + { + "epoch": 0.4028733306353703, + "grad_norm": 1.0879707336425781, + "learning_rate": 6.0933305802188735e-05, + "loss": 1.003, + "step": 9955 + }, + { + "epoch": 0.4030756778632133, + "grad_norm": 1.1175845861434937, + "learning_rate": 6.0912657443733225e-05, + "loss": 0.9766, + "step": 9960 + }, + { + "epoch": 0.40327802509105626, + "grad_norm": 1.144715428352356, + "learning_rate": 6.089200908527772e-05, + "loss": 0.9904, + "step": 9965 + }, + { + "epoch": 0.4034803723188992, + "grad_norm": 1.1816964149475098, + "learning_rate": 6.0871360726822226e-05, + "loss": 1.0159, + "step": 9970 + }, + { + "epoch": 0.40368271954674223, + "grad_norm": 1.2622935771942139, + "learning_rate": 6.085071236836671e-05, + "loss": 0.9864, + "step": 9975 + }, + { + "epoch": 0.4038850667745852, + "grad_norm": 1.110995888710022, + "learning_rate": 6.0830064009911214e-05, + "loss": 0.9845, + "step": 9980 + }, + { + "epoch": 0.40408741400242815, + "grad_norm": 1.1176053285598755, + "learning_rate": 6.080941565145571e-05, + "loss": 0.9725, + "step": 9985 + }, + { + "epoch": 0.40428976123027116, + "grad_norm": 1.1164429187774658, + "learning_rate": 6.07887672930002e-05, + "loss": 0.9887, + "step": 9990 + }, + { + "epoch": 0.4044921084581141, + "grad_norm": 1.1734952926635742, + "learning_rate": 6.0768118934544706e-05, + "loss": 0.9763, + "step": 9995 + }, + { + "epoch": 0.4046944556859571, + "grad_norm": 1.2598659992218018, + "learning_rate": 6.0747470576089203e-05, + "loss": 0.9681, + "step": 10000 + }, + { + "epoch": 0.4048968029138001, + "grad_norm": 1.0857003927230835, + "learning_rate": 6.072682221763371e-05, + "loss": 0.9386, + "step": 10005 + }, + { + "epoch": 0.40509915014164305, + "grad_norm": 1.3082906007766724, + "learning_rate": 6.07061738591782e-05, + "loss": 1.0256, + "step": 10010 + }, + { + "epoch": 0.40530149736948606, + "grad_norm": 1.2251588106155396, + "learning_rate": 6.0685525500722695e-05, + "loss": 0.9768, + "step": 10015 + }, + { + "epoch": 0.405503844597329, + "grad_norm": 1.1803746223449707, + "learning_rate": 6.066487714226719e-05, + "loss": 1.0686, + "step": 10020 + }, + { + "epoch": 0.405706191825172, + "grad_norm": 1.1959571838378906, + "learning_rate": 6.064422878381168e-05, + "loss": 1.0325, + "step": 10025 + }, + { + "epoch": 0.405908539053015, + "grad_norm": 1.0180895328521729, + "learning_rate": 6.062358042535619e-05, + "loss": 1.0369, + "step": 10030 + }, + { + "epoch": 0.40611088628085795, + "grad_norm": 1.203304409980774, + "learning_rate": 6.0602932066900685e-05, + "loss": 0.9924, + "step": 10035 + }, + { + "epoch": 0.4063132335087009, + "grad_norm": 1.200363278388977, + "learning_rate": 6.058228370844519e-05, + "loss": 1.0219, + "step": 10040 + }, + { + "epoch": 0.4065155807365439, + "grad_norm": 1.0810167789459229, + "learning_rate": 6.056163534998968e-05, + "loss": 0.9429, + "step": 10045 + }, + { + "epoch": 0.4067179279643869, + "grad_norm": 1.1412184238433838, + "learning_rate": 6.0540986991534176e-05, + "loss": 0.9684, + "step": 10050 + }, + { + "epoch": 0.40692027519222984, + "grad_norm": 1.1554936170578003, + "learning_rate": 6.0520338633078674e-05, + "loss": 1.0392, + "step": 10055 + }, + { + "epoch": 0.40712262242007285, + "grad_norm": 1.2978181838989258, + "learning_rate": 6.0499690274623164e-05, + "loss": 1.0067, + "step": 10060 + }, + { + "epoch": 0.4073249696479158, + "grad_norm": 1.1897743940353394, + "learning_rate": 6.047904191616767e-05, + "loss": 1.0091, + "step": 10065 + }, + { + "epoch": 0.4075273168757588, + "grad_norm": 1.1835557222366333, + "learning_rate": 6.0458393557712166e-05, + "loss": 1.0197, + "step": 10070 + }, + { + "epoch": 0.4077296641036018, + "grad_norm": 1.0526257753372192, + "learning_rate": 6.0437745199256656e-05, + "loss": 1.0808, + "step": 10075 + }, + { + "epoch": 0.40793201133144474, + "grad_norm": 1.0810807943344116, + "learning_rate": 6.041709684080116e-05, + "loss": 0.9582, + "step": 10080 + }, + { + "epoch": 0.40813435855928776, + "grad_norm": 1.1587885618209839, + "learning_rate": 6.039644848234566e-05, + "loss": 0.969, + "step": 10085 + }, + { + "epoch": 0.4083367057871307, + "grad_norm": 1.2569379806518555, + "learning_rate": 6.037580012389016e-05, + "loss": 1.0162, + "step": 10090 + }, + { + "epoch": 0.4085390530149737, + "grad_norm": 1.0843753814697266, + "learning_rate": 6.0355151765434645e-05, + "loss": 0.9554, + "step": 10095 + }, + { + "epoch": 0.4087414002428167, + "grad_norm": 1.225560188293457, + "learning_rate": 6.033450340697915e-05, + "loss": 1.0209, + "step": 10100 + }, + { + "epoch": 0.40894374747065965, + "grad_norm": 1.1393972635269165, + "learning_rate": 6.031385504852365e-05, + "loss": 1.0273, + "step": 10105 + }, + { + "epoch": 0.4091460946985026, + "grad_norm": 1.196390151977539, + "learning_rate": 6.029320669006814e-05, + "loss": 1.0688, + "step": 10110 + }, + { + "epoch": 0.4093484419263456, + "grad_norm": 1.1796727180480957, + "learning_rate": 6.027255833161264e-05, + "loss": 0.9621, + "step": 10115 + }, + { + "epoch": 0.4095507891541886, + "grad_norm": 1.1920093297958374, + "learning_rate": 6.025190997315714e-05, + "loss": 1.0199, + "step": 10120 + }, + { + "epoch": 0.4097531363820316, + "grad_norm": 1.0450389385223389, + "learning_rate": 6.023126161470164e-05, + "loss": 0.9026, + "step": 10125 + }, + { + "epoch": 0.40995548360987455, + "grad_norm": 1.3306684494018555, + "learning_rate": 6.0210613256246126e-05, + "loss": 1.0129, + "step": 10130 + }, + { + "epoch": 0.4101578308377175, + "grad_norm": 1.085116982460022, + "learning_rate": 6.018996489779063e-05, + "loss": 1.0175, + "step": 10135 + }, + { + "epoch": 0.4103601780655605, + "grad_norm": 1.1638381481170654, + "learning_rate": 6.016931653933513e-05, + "loss": 0.9514, + "step": 10140 + }, + { + "epoch": 0.4105625252934035, + "grad_norm": 1.2508279085159302, + "learning_rate": 6.014866818087962e-05, + "loss": 0.996, + "step": 10145 + }, + { + "epoch": 0.41076487252124644, + "grad_norm": 1.301274061203003, + "learning_rate": 6.012801982242412e-05, + "loss": 0.9411, + "step": 10150 + }, + { + "epoch": 0.41096721974908945, + "grad_norm": 1.1069871187210083, + "learning_rate": 6.010737146396862e-05, + "loss": 1.0319, + "step": 10155 + }, + { + "epoch": 0.4111695669769324, + "grad_norm": 1.1158190965652466, + "learning_rate": 6.008672310551311e-05, + "loss": 0.9787, + "step": 10160 + }, + { + "epoch": 0.41137191420477537, + "grad_norm": 1.184366226196289, + "learning_rate": 6.006607474705761e-05, + "loss": 1.0631, + "step": 10165 + }, + { + "epoch": 0.4115742614326184, + "grad_norm": 1.2500015497207642, + "learning_rate": 6.004542638860211e-05, + "loss": 0.9607, + "step": 10170 + }, + { + "epoch": 0.41177660866046134, + "grad_norm": 1.1481871604919434, + "learning_rate": 6.002477803014661e-05, + "loss": 0.9862, + "step": 10175 + }, + { + "epoch": 0.41197895588830435, + "grad_norm": 1.2750754356384277, + "learning_rate": 6.00041296716911e-05, + "loss": 0.9573, + "step": 10180 + }, + { + "epoch": 0.4121813031161473, + "grad_norm": 1.166333794593811, + "learning_rate": 5.9983481313235603e-05, + "loss": 1.013, + "step": 10185 + }, + { + "epoch": 0.41238365034399027, + "grad_norm": 1.0812666416168213, + "learning_rate": 5.99628329547801e-05, + "loss": 1.013, + "step": 10190 + }, + { + "epoch": 0.4125859975718333, + "grad_norm": 1.2474825382232666, + "learning_rate": 5.994218459632459e-05, + "loss": 0.9837, + "step": 10195 + }, + { + "epoch": 0.41278834479967624, + "grad_norm": 1.1481378078460693, + "learning_rate": 5.992153623786909e-05, + "loss": 1.0531, + "step": 10200 + }, + { + "epoch": 0.4129906920275192, + "grad_norm": 1.0672534704208374, + "learning_rate": 5.990088787941359e-05, + "loss": 1.0336, + "step": 10205 + }, + { + "epoch": 0.4131930392553622, + "grad_norm": 1.1827770471572876, + "learning_rate": 5.988023952095808e-05, + "loss": 1.0338, + "step": 10210 + }, + { + "epoch": 0.4133953864832052, + "grad_norm": 1.1003824472427368, + "learning_rate": 5.985959116250258e-05, + "loss": 1.0321, + "step": 10215 + }, + { + "epoch": 0.41359773371104813, + "grad_norm": 1.1213117837905884, + "learning_rate": 5.9838942804047085e-05, + "loss": 0.9712, + "step": 10220 + }, + { + "epoch": 0.41380008093889115, + "grad_norm": 1.1966711282730103, + "learning_rate": 5.981829444559158e-05, + "loss": 0.9826, + "step": 10225 + }, + { + "epoch": 0.4140024281667341, + "grad_norm": 1.1761008501052856, + "learning_rate": 5.979764608713607e-05, + "loss": 1.0222, + "step": 10230 + }, + { + "epoch": 0.4142047753945771, + "grad_norm": 1.2960001230239868, + "learning_rate": 5.977699772868057e-05, + "loss": 0.9271, + "step": 10235 + }, + { + "epoch": 0.4144071226224201, + "grad_norm": 1.204140067100525, + "learning_rate": 5.9756349370225074e-05, + "loss": 1.0036, + "step": 10240 + }, + { + "epoch": 0.41460946985026303, + "grad_norm": 1.1577142477035522, + "learning_rate": 5.9735701011769564e-05, + "loss": 0.986, + "step": 10245 + }, + { + "epoch": 0.41481181707810605, + "grad_norm": 1.135983943939209, + "learning_rate": 5.971505265331406e-05, + "loss": 1.0024, + "step": 10250 + }, + { + "epoch": 0.415014164305949, + "grad_norm": 1.3453749418258667, + "learning_rate": 5.9694404294858566e-05, + "loss": 0.9585, + "step": 10255 + }, + { + "epoch": 0.41521651153379197, + "grad_norm": 1.1529349088668823, + "learning_rate": 5.967375593640306e-05, + "loss": 1.0154, + "step": 10260 + }, + { + "epoch": 0.415418858761635, + "grad_norm": 1.1858090162277222, + "learning_rate": 5.9653107577947553e-05, + "loss": 0.9613, + "step": 10265 + }, + { + "epoch": 0.41562120598947794, + "grad_norm": 1.1481332778930664, + "learning_rate": 5.963245921949205e-05, + "loss": 0.9912, + "step": 10270 + }, + { + "epoch": 0.4158235532173209, + "grad_norm": 1.128801703453064, + "learning_rate": 5.9611810861036555e-05, + "loss": 1.0349, + "step": 10275 + }, + { + "epoch": 0.4160259004451639, + "grad_norm": 1.2165671586990356, + "learning_rate": 5.9591162502581045e-05, + "loss": 1.0492, + "step": 10280 + }, + { + "epoch": 0.41622824767300687, + "grad_norm": 1.1541047096252441, + "learning_rate": 5.957051414412554e-05, + "loss": 1.0514, + "step": 10285 + }, + { + "epoch": 0.4164305949008499, + "grad_norm": 1.4071108102798462, + "learning_rate": 5.954986578567005e-05, + "loss": 1.0151, + "step": 10290 + }, + { + "epoch": 0.41663294212869284, + "grad_norm": 1.2491838932037354, + "learning_rate": 5.952921742721454e-05, + "loss": 0.9952, + "step": 10295 + }, + { + "epoch": 0.4168352893565358, + "grad_norm": 1.1746826171875, + "learning_rate": 5.9508569068759034e-05, + "loss": 1.0023, + "step": 10300 + }, + { + "epoch": 0.4170376365843788, + "grad_norm": 1.1057684421539307, + "learning_rate": 5.948792071030354e-05, + "loss": 0.9433, + "step": 10305 + }, + { + "epoch": 0.41723998381222177, + "grad_norm": 1.190808892250061, + "learning_rate": 5.9467272351848036e-05, + "loss": 0.9958, + "step": 10310 + }, + { + "epoch": 0.41744233104006473, + "grad_norm": 1.114880084991455, + "learning_rate": 5.9446623993392526e-05, + "loss": 1.0478, + "step": 10315 + }, + { + "epoch": 0.41764467826790774, + "grad_norm": 1.089569091796875, + "learning_rate": 5.9425975634937024e-05, + "loss": 0.9614, + "step": 10320 + }, + { + "epoch": 0.4178470254957507, + "grad_norm": 1.2067276239395142, + "learning_rate": 5.940532727648153e-05, + "loss": 0.9489, + "step": 10325 + }, + { + "epoch": 0.41804937272359366, + "grad_norm": 1.0179661512374878, + "learning_rate": 5.938467891802602e-05, + "loss": 1.0104, + "step": 10330 + }, + { + "epoch": 0.4182517199514367, + "grad_norm": 1.0859757661819458, + "learning_rate": 5.9364030559570516e-05, + "loss": 1.0502, + "step": 10335 + }, + { + "epoch": 0.41845406717927963, + "grad_norm": 1.1184546947479248, + "learning_rate": 5.934338220111502e-05, + "loss": 1.0075, + "step": 10340 + }, + { + "epoch": 0.41865641440712265, + "grad_norm": 1.2337294816970825, + "learning_rate": 5.932273384265952e-05, + "loss": 1.0723, + "step": 10345 + }, + { + "epoch": 0.4188587616349656, + "grad_norm": 1.2576165199279785, + "learning_rate": 5.930208548420401e-05, + "loss": 0.8998, + "step": 10350 + }, + { + "epoch": 0.41906110886280856, + "grad_norm": 1.1309709548950195, + "learning_rate": 5.9281437125748505e-05, + "loss": 0.9974, + "step": 10355 + }, + { + "epoch": 0.4192634560906516, + "grad_norm": 1.310569167137146, + "learning_rate": 5.926078876729301e-05, + "loss": 0.9868, + "step": 10360 + }, + { + "epoch": 0.41946580331849453, + "grad_norm": 1.265221357345581, + "learning_rate": 5.92401404088375e-05, + "loss": 1.0637, + "step": 10365 + }, + { + "epoch": 0.4196681505463375, + "grad_norm": 1.1957981586456299, + "learning_rate": 5.9219492050382e-05, + "loss": 1.0066, + "step": 10370 + }, + { + "epoch": 0.4198704977741805, + "grad_norm": 1.2208542823791504, + "learning_rate": 5.91988436919265e-05, + "loss": 0.9734, + "step": 10375 + }, + { + "epoch": 0.42007284500202346, + "grad_norm": 1.2308872938156128, + "learning_rate": 5.9178195333470984e-05, + "loss": 1.0411, + "step": 10380 + }, + { + "epoch": 0.4202751922298664, + "grad_norm": 1.1346920728683472, + "learning_rate": 5.915754697501549e-05, + "loss": 1.0573, + "step": 10385 + }, + { + "epoch": 0.42047753945770944, + "grad_norm": 1.2495075464248657, + "learning_rate": 5.9136898616559986e-05, + "loss": 1.0205, + "step": 10390 + }, + { + "epoch": 0.4206798866855524, + "grad_norm": 1.196473479270935, + "learning_rate": 5.911625025810449e-05, + "loss": 1.0096, + "step": 10395 + }, + { + "epoch": 0.4208822339133954, + "grad_norm": 1.1124287843704224, + "learning_rate": 5.909560189964898e-05, + "loss": 0.9755, + "step": 10400 + }, + { + "epoch": 0.42108458114123837, + "grad_norm": 1.1813316345214844, + "learning_rate": 5.907495354119348e-05, + "loss": 1.0343, + "step": 10405 + }, + { + "epoch": 0.4212869283690813, + "grad_norm": 1.1222776174545288, + "learning_rate": 5.905430518273798e-05, + "loss": 1.0145, + "step": 10410 + }, + { + "epoch": 0.42148927559692434, + "grad_norm": 1.0855779647827148, + "learning_rate": 5.9033656824282466e-05, + "loss": 0.9858, + "step": 10415 + }, + { + "epoch": 0.4216916228247673, + "grad_norm": 1.155497670173645, + "learning_rate": 5.901300846582697e-05, + "loss": 1.0356, + "step": 10420 + }, + { + "epoch": 0.42189397005261026, + "grad_norm": 1.403380274772644, + "learning_rate": 5.899236010737147e-05, + "loss": 0.9808, + "step": 10425 + }, + { + "epoch": 0.42209631728045327, + "grad_norm": 1.2330858707427979, + "learning_rate": 5.897171174891596e-05, + "loss": 0.9357, + "step": 10430 + }, + { + "epoch": 0.42229866450829623, + "grad_norm": 1.1334590911865234, + "learning_rate": 5.895106339046046e-05, + "loss": 1.0152, + "step": 10435 + }, + { + "epoch": 0.42250101173613924, + "grad_norm": 1.1154812574386597, + "learning_rate": 5.893041503200496e-05, + "loss": 0.9547, + "step": 10440 + }, + { + "epoch": 0.4227033589639822, + "grad_norm": 1.1414520740509033, + "learning_rate": 5.890976667354946e-05, + "loss": 1.056, + "step": 10445 + }, + { + "epoch": 0.42290570619182516, + "grad_norm": 1.1283037662506104, + "learning_rate": 5.8889118315093947e-05, + "loss": 1.0158, + "step": 10450 + }, + { + "epoch": 0.4231080534196682, + "grad_norm": 1.2176074981689453, + "learning_rate": 5.886846995663845e-05, + "loss": 0.9993, + "step": 10455 + }, + { + "epoch": 0.42331040064751113, + "grad_norm": 1.1889210939407349, + "learning_rate": 5.884782159818295e-05, + "loss": 0.9266, + "step": 10460 + }, + { + "epoch": 0.4235127478753541, + "grad_norm": 1.1583353281021118, + "learning_rate": 5.882717323972744e-05, + "loss": 1.0195, + "step": 10465 + }, + { + "epoch": 0.4237150951031971, + "grad_norm": 1.1713775396347046, + "learning_rate": 5.880652488127194e-05, + "loss": 0.9766, + "step": 10470 + }, + { + "epoch": 0.42391744233104006, + "grad_norm": 1.212542176246643, + "learning_rate": 5.878587652281644e-05, + "loss": 0.9821, + "step": 10475 + }, + { + "epoch": 0.424119789558883, + "grad_norm": 1.2066254615783691, + "learning_rate": 5.8765228164360944e-05, + "loss": 1.0263, + "step": 10480 + }, + { + "epoch": 0.42432213678672603, + "grad_norm": 1.1281640529632568, + "learning_rate": 5.874457980590543e-05, + "loss": 0.9598, + "step": 10485 + }, + { + "epoch": 0.424524484014569, + "grad_norm": 1.1599007844924927, + "learning_rate": 5.872393144744993e-05, + "loss": 0.9789, + "step": 10490 + }, + { + "epoch": 0.424726831242412, + "grad_norm": 1.2747465372085571, + "learning_rate": 5.870328308899443e-05, + "loss": 0.9692, + "step": 10495 + }, + { + "epoch": 0.42492917847025496, + "grad_norm": 1.1403895616531372, + "learning_rate": 5.868263473053892e-05, + "loss": 0.9715, + "step": 10500 + }, + { + "epoch": 0.4251315256980979, + "grad_norm": 1.0249996185302734, + "learning_rate": 5.8661986372083424e-05, + "loss": 0.9788, + "step": 10505 + }, + { + "epoch": 0.42533387292594094, + "grad_norm": 1.1349996328353882, + "learning_rate": 5.864133801362792e-05, + "loss": 0.9761, + "step": 10510 + }, + { + "epoch": 0.4255362201537839, + "grad_norm": 1.0648891925811768, + "learning_rate": 5.862068965517241e-05, + "loss": 0.9968, + "step": 10515 + }, + { + "epoch": 0.42573856738162685, + "grad_norm": 1.2156158685684204, + "learning_rate": 5.860004129671691e-05, + "loss": 0.9648, + "step": 10520 + }, + { + "epoch": 0.42594091460946987, + "grad_norm": 1.1436049938201904, + "learning_rate": 5.857939293826141e-05, + "loss": 1.0048, + "step": 10525 + }, + { + "epoch": 0.4261432618373128, + "grad_norm": 1.1715660095214844, + "learning_rate": 5.855874457980591e-05, + "loss": 1.0497, + "step": 10530 + }, + { + "epoch": 0.4263456090651558, + "grad_norm": 1.1499030590057373, + "learning_rate": 5.85380962213504e-05, + "loss": 1.0035, + "step": 10535 + }, + { + "epoch": 0.4265479562929988, + "grad_norm": 1.1474660634994507, + "learning_rate": 5.8517447862894905e-05, + "loss": 1.0365, + "step": 10540 + }, + { + "epoch": 0.42675030352084176, + "grad_norm": 1.1669492721557617, + "learning_rate": 5.84967995044394e-05, + "loss": 0.9716, + "step": 10545 + }, + { + "epoch": 0.42695265074868477, + "grad_norm": 1.2073500156402588, + "learning_rate": 5.847615114598389e-05, + "loss": 0.9615, + "step": 10550 + }, + { + "epoch": 0.42715499797652773, + "grad_norm": 1.3077940940856934, + "learning_rate": 5.84555027875284e-05, + "loss": 1.0056, + "step": 10555 + }, + { + "epoch": 0.4273573452043707, + "grad_norm": 1.0633203983306885, + "learning_rate": 5.8434854429072894e-05, + "loss": 0.9749, + "step": 10560 + }, + { + "epoch": 0.4275596924322137, + "grad_norm": 1.124089241027832, + "learning_rate": 5.8414206070617384e-05, + "loss": 0.9614, + "step": 10565 + }, + { + "epoch": 0.42776203966005666, + "grad_norm": 1.088313102722168, + "learning_rate": 5.839355771216188e-05, + "loss": 0.9373, + "step": 10570 + }, + { + "epoch": 0.4279643868878996, + "grad_norm": 1.002710223197937, + "learning_rate": 5.8372909353706386e-05, + "loss": 1.0047, + "step": 10575 + }, + { + "epoch": 0.42816673411574263, + "grad_norm": 1.0879201889038086, + "learning_rate": 5.835226099525088e-05, + "loss": 0.9382, + "step": 10580 + }, + { + "epoch": 0.4283690813435856, + "grad_norm": 1.248063087463379, + "learning_rate": 5.8331612636795374e-05, + "loss": 1.0214, + "step": 10585 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 1.1118930578231812, + "learning_rate": 5.831096427833988e-05, + "loss": 1.0565, + "step": 10590 + }, + { + "epoch": 0.42877377579927156, + "grad_norm": 1.1189217567443848, + "learning_rate": 5.8290315919884375e-05, + "loss": 1.0402, + "step": 10595 + }, + { + "epoch": 0.4289761230271145, + "grad_norm": 1.099325180053711, + "learning_rate": 5.8269667561428866e-05, + "loss": 0.9871, + "step": 10600 + }, + { + "epoch": 0.42917847025495753, + "grad_norm": 1.2033805847167969, + "learning_rate": 5.824901920297336e-05, + "loss": 0.9529, + "step": 10605 + }, + { + "epoch": 0.4293808174828005, + "grad_norm": 0.9986255764961243, + "learning_rate": 5.822837084451787e-05, + "loss": 0.9639, + "step": 10610 + }, + { + "epoch": 0.42958316471064345, + "grad_norm": 1.23876953125, + "learning_rate": 5.8207722486062364e-05, + "loss": 1.0431, + "step": 10615 + }, + { + "epoch": 0.42978551193848646, + "grad_norm": 1.1878960132598877, + "learning_rate": 5.8187074127606855e-05, + "loss": 0.9407, + "step": 10620 + }, + { + "epoch": 0.4299878591663294, + "grad_norm": 1.0324996709823608, + "learning_rate": 5.816642576915136e-05, + "loss": 1.0141, + "step": 10625 + }, + { + "epoch": 0.4301902063941724, + "grad_norm": 1.0734326839447021, + "learning_rate": 5.8145777410695856e-05, + "loss": 1.0244, + "step": 10630 + }, + { + "epoch": 0.4303925536220154, + "grad_norm": 1.0938656330108643, + "learning_rate": 5.8125129052240347e-05, + "loss": 0.9731, + "step": 10635 + }, + { + "epoch": 0.43059490084985835, + "grad_norm": 1.1891802549362183, + "learning_rate": 5.8104480693784844e-05, + "loss": 0.9565, + "step": 10640 + }, + { + "epoch": 0.4307972480777013, + "grad_norm": 1.6427687406539917, + "learning_rate": 5.808383233532935e-05, + "loss": 1.015, + "step": 10645 + }, + { + "epoch": 0.4309995953055443, + "grad_norm": 1.1486482620239258, + "learning_rate": 5.806318397687384e-05, + "loss": 1.0105, + "step": 10650 + }, + { + "epoch": 0.4312019425333873, + "grad_norm": 1.1644834280014038, + "learning_rate": 5.8042535618418336e-05, + "loss": 1.0327, + "step": 10655 + }, + { + "epoch": 0.4314042897612303, + "grad_norm": 1.1761866807937622, + "learning_rate": 5.802188725996284e-05, + "loss": 0.9772, + "step": 10660 + }, + { + "epoch": 0.43160663698907326, + "grad_norm": 1.1902142763137817, + "learning_rate": 5.800123890150734e-05, + "loss": 0.9875, + "step": 10665 + }, + { + "epoch": 0.4318089842169162, + "grad_norm": 1.199822187423706, + "learning_rate": 5.798059054305183e-05, + "loss": 1.0068, + "step": 10670 + }, + { + "epoch": 0.43201133144475923, + "grad_norm": 1.155624508857727, + "learning_rate": 5.7959942184596325e-05, + "loss": 0.9842, + "step": 10675 + }, + { + "epoch": 0.4322136786726022, + "grad_norm": 1.1096001863479614, + "learning_rate": 5.793929382614083e-05, + "loss": 0.9737, + "step": 10680 + }, + { + "epoch": 0.43241602590044514, + "grad_norm": 1.1587976217269897, + "learning_rate": 5.791864546768532e-05, + "loss": 1.0342, + "step": 10685 + }, + { + "epoch": 0.43261837312828816, + "grad_norm": 1.1066583395004272, + "learning_rate": 5.789799710922982e-05, + "loss": 0.9886, + "step": 10690 + }, + { + "epoch": 0.4328207203561311, + "grad_norm": 1.148389458656311, + "learning_rate": 5.787734875077432e-05, + "loss": 0.986, + "step": 10695 + }, + { + "epoch": 0.4330230675839741, + "grad_norm": 1.2748706340789795, + "learning_rate": 5.785670039231882e-05, + "loss": 0.9912, + "step": 10700 + }, + { + "epoch": 0.4332254148118171, + "grad_norm": 1.3484047651290894, + "learning_rate": 5.783605203386331e-05, + "loss": 0.9831, + "step": 10705 + }, + { + "epoch": 0.43342776203966005, + "grad_norm": 1.1345332860946655, + "learning_rate": 5.7815403675407806e-05, + "loss": 0.9494, + "step": 10710 + }, + { + "epoch": 0.43363010926750306, + "grad_norm": 1.3133196830749512, + "learning_rate": 5.779475531695231e-05, + "loss": 1.0192, + "step": 10715 + }, + { + "epoch": 0.433832456495346, + "grad_norm": 1.258872389793396, + "learning_rate": 5.77741069584968e-05, + "loss": 1.0557, + "step": 10720 + }, + { + "epoch": 0.434034803723189, + "grad_norm": 1.1701213121414185, + "learning_rate": 5.77534586000413e-05, + "loss": 1.0009, + "step": 10725 + }, + { + "epoch": 0.434237150951032, + "grad_norm": 1.240444302558899, + "learning_rate": 5.77328102415858e-05, + "loss": 0.9558, + "step": 10730 + }, + { + "epoch": 0.43443949817887495, + "grad_norm": 1.263913631439209, + "learning_rate": 5.7712161883130286e-05, + "loss": 1.0595, + "step": 10735 + }, + { + "epoch": 0.4346418454067179, + "grad_norm": 1.1159262657165527, + "learning_rate": 5.769151352467479e-05, + "loss": 0.9617, + "step": 10740 + }, + { + "epoch": 0.4348441926345609, + "grad_norm": 1.154449701309204, + "learning_rate": 5.767086516621929e-05, + "loss": 1.0043, + "step": 10745 + }, + { + "epoch": 0.4350465398624039, + "grad_norm": 1.0637264251708984, + "learning_rate": 5.765021680776379e-05, + "loss": 1.0084, + "step": 10750 + }, + { + "epoch": 0.43524888709024684, + "grad_norm": 1.221408724784851, + "learning_rate": 5.762956844930828e-05, + "loss": 0.9945, + "step": 10755 + }, + { + "epoch": 0.43545123431808985, + "grad_norm": 1.239949345588684, + "learning_rate": 5.760892009085278e-05, + "loss": 0.964, + "step": 10760 + }, + { + "epoch": 0.4356535815459328, + "grad_norm": 1.1827601194381714, + "learning_rate": 5.758827173239728e-05, + "loss": 0.967, + "step": 10765 + }, + { + "epoch": 0.4358559287737758, + "grad_norm": 1.1999480724334717, + "learning_rate": 5.7567623373941774e-05, + "loss": 0.9989, + "step": 10770 + }, + { + "epoch": 0.4360582760016188, + "grad_norm": 1.1385078430175781, + "learning_rate": 5.754697501548627e-05, + "loss": 0.9738, + "step": 10775 + }, + { + "epoch": 0.43626062322946174, + "grad_norm": 1.2215017080307007, + "learning_rate": 5.752632665703077e-05, + "loss": 1.0081, + "step": 10780 + }, + { + "epoch": 0.43646297045730476, + "grad_norm": 1.2245571613311768, + "learning_rate": 5.750567829857526e-05, + "loss": 0.9952, + "step": 10785 + }, + { + "epoch": 0.4366653176851477, + "grad_norm": 1.1160204410552979, + "learning_rate": 5.748502994011976e-05, + "loss": 0.9938, + "step": 10790 + }, + { + "epoch": 0.43686766491299067, + "grad_norm": 1.2266266345977783, + "learning_rate": 5.746438158166426e-05, + "loss": 0.9662, + "step": 10795 + }, + { + "epoch": 0.4370700121408337, + "grad_norm": 1.1597180366516113, + "learning_rate": 5.7443733223208764e-05, + "loss": 0.9727, + "step": 10800 + }, + { + "epoch": 0.43727235936867664, + "grad_norm": 1.157707691192627, + "learning_rate": 5.7423084864753255e-05, + "loss": 0.9458, + "step": 10805 + }, + { + "epoch": 0.4374747065965196, + "grad_norm": 1.1504061222076416, + "learning_rate": 5.740243650629775e-05, + "loss": 0.9792, + "step": 10810 + }, + { + "epoch": 0.4376770538243626, + "grad_norm": 1.170724630355835, + "learning_rate": 5.738178814784225e-05, + "loss": 0.9569, + "step": 10815 + }, + { + "epoch": 0.4378794010522056, + "grad_norm": 1.3073015213012695, + "learning_rate": 5.736113978938674e-05, + "loss": 1.042, + "step": 10820 + }, + { + "epoch": 0.4380817482800486, + "grad_norm": 1.232712745666504, + "learning_rate": 5.7340491430931244e-05, + "loss": 0.9973, + "step": 10825 + }, + { + "epoch": 0.43828409550789155, + "grad_norm": 1.0662283897399902, + "learning_rate": 5.731984307247574e-05, + "loss": 0.9902, + "step": 10830 + }, + { + "epoch": 0.4384864427357345, + "grad_norm": 1.2456985712051392, + "learning_rate": 5.7299194714020245e-05, + "loss": 1.0268, + "step": 10835 + }, + { + "epoch": 0.4386887899635775, + "grad_norm": 1.0141834020614624, + "learning_rate": 5.7278546355564736e-05, + "loss": 1.0123, + "step": 10840 + }, + { + "epoch": 0.4388911371914205, + "grad_norm": 1.1344342231750488, + "learning_rate": 5.725789799710923e-05, + "loss": 1.0521, + "step": 10845 + }, + { + "epoch": 0.43909348441926344, + "grad_norm": 1.1500192880630493, + "learning_rate": 5.723724963865374e-05, + "loss": 1.001, + "step": 10850 + }, + { + "epoch": 0.43929583164710645, + "grad_norm": 1.23470139503479, + "learning_rate": 5.721660128019822e-05, + "loss": 1.0242, + "step": 10855 + }, + { + "epoch": 0.4394981788749494, + "grad_norm": 1.1283819675445557, + "learning_rate": 5.7195952921742725e-05, + "loss": 1.0173, + "step": 10860 + }, + { + "epoch": 0.43970052610279237, + "grad_norm": 1.0971150398254395, + "learning_rate": 5.717530456328722e-05, + "loss": 0.9823, + "step": 10865 + }, + { + "epoch": 0.4399028733306354, + "grad_norm": 1.1335411071777344, + "learning_rate": 5.715465620483171e-05, + "loss": 1.0396, + "step": 10870 + }, + { + "epoch": 0.44010522055847834, + "grad_norm": 1.16185462474823, + "learning_rate": 5.713400784637622e-05, + "loss": 1.0164, + "step": 10875 + }, + { + "epoch": 0.44030756778632135, + "grad_norm": 1.1602494716644287, + "learning_rate": 5.7113359487920714e-05, + "loss": 0.9838, + "step": 10880 + }, + { + "epoch": 0.4405099150141643, + "grad_norm": 1.1723899841308594, + "learning_rate": 5.709271112946522e-05, + "loss": 0.9555, + "step": 10885 + }, + { + "epoch": 0.44071226224200727, + "grad_norm": 1.0596345663070679, + "learning_rate": 5.70720627710097e-05, + "loss": 0.9953, + "step": 10890 + }, + { + "epoch": 0.4409146094698503, + "grad_norm": 1.4138137102127075, + "learning_rate": 5.7051414412554206e-05, + "loss": 1.0426, + "step": 10895 + }, + { + "epoch": 0.44111695669769324, + "grad_norm": 1.1126396656036377, + "learning_rate": 5.70307660540987e-05, + "loss": 1.0198, + "step": 10900 + }, + { + "epoch": 0.4413193039255362, + "grad_norm": 1.155994176864624, + "learning_rate": 5.7010117695643194e-05, + "loss": 1.002, + "step": 10905 + }, + { + "epoch": 0.4415216511533792, + "grad_norm": 1.209667682647705, + "learning_rate": 5.69894693371877e-05, + "loss": 0.978, + "step": 10910 + }, + { + "epoch": 0.44172399838122217, + "grad_norm": 1.2612943649291992, + "learning_rate": 5.6968820978732195e-05, + "loss": 0.9871, + "step": 10915 + }, + { + "epoch": 0.44192634560906513, + "grad_norm": 1.131173849105835, + "learning_rate": 5.6948172620276686e-05, + "loss": 0.9714, + "step": 10920 + }, + { + "epoch": 0.44212869283690814, + "grad_norm": 1.2085996866226196, + "learning_rate": 5.692752426182118e-05, + "loss": 1.0168, + "step": 10925 + }, + { + "epoch": 0.4423310400647511, + "grad_norm": 1.3031142950057983, + "learning_rate": 5.690687590336569e-05, + "loss": 0.9735, + "step": 10930 + }, + { + "epoch": 0.4425333872925941, + "grad_norm": 1.1572239398956299, + "learning_rate": 5.6886227544910184e-05, + "loss": 1.02, + "step": 10935 + }, + { + "epoch": 0.4427357345204371, + "grad_norm": 1.1638578176498413, + "learning_rate": 5.6865579186454675e-05, + "loss": 0.9327, + "step": 10940 + }, + { + "epoch": 0.44293808174828003, + "grad_norm": 1.2543002367019653, + "learning_rate": 5.684493082799918e-05, + "loss": 1.006, + "step": 10945 + }, + { + "epoch": 0.44314042897612305, + "grad_norm": 1.1135634183883667, + "learning_rate": 5.6824282469543676e-05, + "loss": 0.9771, + "step": 10950 + }, + { + "epoch": 0.443342776203966, + "grad_norm": 1.211946725845337, + "learning_rate": 5.680363411108817e-05, + "loss": 0.9941, + "step": 10955 + }, + { + "epoch": 0.44354512343180896, + "grad_norm": 1.198978304862976, + "learning_rate": 5.6782985752632664e-05, + "loss": 1.0382, + "step": 10960 + }, + { + "epoch": 0.443747470659652, + "grad_norm": 1.2260339260101318, + "learning_rate": 5.676233739417717e-05, + "loss": 1.0234, + "step": 10965 + }, + { + "epoch": 0.44394981788749494, + "grad_norm": 1.1562124490737915, + "learning_rate": 5.6741689035721665e-05, + "loss": 1.0022, + "step": 10970 + }, + { + "epoch": 0.4441521651153379, + "grad_norm": 1.0694836378097534, + "learning_rate": 5.6721040677266156e-05, + "loss": 0.9675, + "step": 10975 + }, + { + "epoch": 0.4443545123431809, + "grad_norm": 1.1761274337768555, + "learning_rate": 5.670039231881066e-05, + "loss": 1.0342, + "step": 10980 + }, + { + "epoch": 0.44455685957102387, + "grad_norm": 1.164414882659912, + "learning_rate": 5.667974396035516e-05, + "loss": 1.004, + "step": 10985 + }, + { + "epoch": 0.4447592067988669, + "grad_norm": 1.249759554862976, + "learning_rate": 5.665909560189965e-05, + "loss": 1.0289, + "step": 10990 + }, + { + "epoch": 0.44496155402670984, + "grad_norm": 1.251045823097229, + "learning_rate": 5.6638447243444145e-05, + "loss": 0.9858, + "step": 10995 + }, + { + "epoch": 0.4451639012545528, + "grad_norm": 1.205181360244751, + "learning_rate": 5.661779888498865e-05, + "loss": 0.9696, + "step": 11000 + }, + { + "epoch": 0.4453662484823958, + "grad_norm": 1.1822527647018433, + "learning_rate": 5.659715052653314e-05, + "loss": 0.9929, + "step": 11005 + }, + { + "epoch": 0.44556859571023877, + "grad_norm": 1.169823169708252, + "learning_rate": 5.657650216807764e-05, + "loss": 0.9968, + "step": 11010 + }, + { + "epoch": 0.4457709429380817, + "grad_norm": 1.0847971439361572, + "learning_rate": 5.655585380962214e-05, + "loss": 1.0832, + "step": 11015 + }, + { + "epoch": 0.44597329016592474, + "grad_norm": 1.2362971305847168, + "learning_rate": 5.653520545116664e-05, + "loss": 1.039, + "step": 11020 + }, + { + "epoch": 0.4461756373937677, + "grad_norm": 1.1471666097640991, + "learning_rate": 5.651455709271113e-05, + "loss": 1.0255, + "step": 11025 + }, + { + "epoch": 0.44637798462161066, + "grad_norm": 1.1853643655776978, + "learning_rate": 5.6493908734255626e-05, + "loss": 0.9831, + "step": 11030 + }, + { + "epoch": 0.44658033184945367, + "grad_norm": 1.054250955581665, + "learning_rate": 5.647326037580013e-05, + "loss": 1.0651, + "step": 11035 + }, + { + "epoch": 0.44678267907729663, + "grad_norm": 1.2345876693725586, + "learning_rate": 5.645261201734462e-05, + "loss": 0.9421, + "step": 11040 + }, + { + "epoch": 0.44698502630513964, + "grad_norm": 1.2633880376815796, + "learning_rate": 5.643196365888912e-05, + "loss": 0.9787, + "step": 11045 + }, + { + "epoch": 0.4471873735329826, + "grad_norm": 1.1371312141418457, + "learning_rate": 5.641131530043362e-05, + "loss": 0.942, + "step": 11050 + }, + { + "epoch": 0.44738972076082556, + "grad_norm": 1.1530300378799438, + "learning_rate": 5.639066694197812e-05, + "loss": 0.9845, + "step": 11055 + }, + { + "epoch": 0.4475920679886686, + "grad_norm": 1.1365545988082886, + "learning_rate": 5.637001858352261e-05, + "loss": 0.9124, + "step": 11060 + }, + { + "epoch": 0.44779441521651153, + "grad_norm": 1.3358913660049438, + "learning_rate": 5.634937022506711e-05, + "loss": 1.014, + "step": 11065 + }, + { + "epoch": 0.4479967624443545, + "grad_norm": 1.2181421518325806, + "learning_rate": 5.632872186661161e-05, + "loss": 1.0082, + "step": 11070 + }, + { + "epoch": 0.4481991096721975, + "grad_norm": 1.3012564182281494, + "learning_rate": 5.63080735081561e-05, + "loss": 0.9234, + "step": 11075 + }, + { + "epoch": 0.44840145690004046, + "grad_norm": 1.2210414409637451, + "learning_rate": 5.62874251497006e-05, + "loss": 1.0027, + "step": 11080 + }, + { + "epoch": 0.4486038041278834, + "grad_norm": 1.1770230531692505, + "learning_rate": 5.62667767912451e-05, + "loss": 0.9833, + "step": 11085 + }, + { + "epoch": 0.44880615135572643, + "grad_norm": 1.2124720811843872, + "learning_rate": 5.6246128432789594e-05, + "loss": 1.0064, + "step": 11090 + }, + { + "epoch": 0.4490084985835694, + "grad_norm": 1.2008227109909058, + "learning_rate": 5.622548007433409e-05, + "loss": 0.9921, + "step": 11095 + }, + { + "epoch": 0.4492108458114124, + "grad_norm": 1.0997034311294556, + "learning_rate": 5.6204831715878595e-05, + "loss": 1.0189, + "step": 11100 + }, + { + "epoch": 0.44941319303925537, + "grad_norm": 1.2778221368789673, + "learning_rate": 5.618418335742309e-05, + "loss": 0.9971, + "step": 11105 + }, + { + "epoch": 0.4496155402670983, + "grad_norm": 1.1807756423950195, + "learning_rate": 5.616353499896758e-05, + "loss": 0.9356, + "step": 11110 + }, + { + "epoch": 0.44981788749494134, + "grad_norm": 1.2341501712799072, + "learning_rate": 5.614288664051208e-05, + "loss": 1.0366, + "step": 11115 + }, + { + "epoch": 0.4500202347227843, + "grad_norm": 1.3213276863098145, + "learning_rate": 5.6122238282056584e-05, + "loss": 1.0088, + "step": 11120 + }, + { + "epoch": 0.45022258195062725, + "grad_norm": 1.3128165006637573, + "learning_rate": 5.6101589923601075e-05, + "loss": 1.04, + "step": 11125 + }, + { + "epoch": 0.45042492917847027, + "grad_norm": 1.2558141946792603, + "learning_rate": 5.608094156514557e-05, + "loss": 0.9987, + "step": 11130 + }, + { + "epoch": 0.4506272764063132, + "grad_norm": 1.163042426109314, + "learning_rate": 5.6060293206690076e-05, + "loss": 0.9675, + "step": 11135 + }, + { + "epoch": 0.4508296236341562, + "grad_norm": 1.1548547744750977, + "learning_rate": 5.603964484823456e-05, + "loss": 0.9765, + "step": 11140 + }, + { + "epoch": 0.4510319708619992, + "grad_norm": 1.1608883142471313, + "learning_rate": 5.6018996489779064e-05, + "loss": 0.9643, + "step": 11145 + }, + { + "epoch": 0.45123431808984216, + "grad_norm": 1.1438285112380981, + "learning_rate": 5.599834813132356e-05, + "loss": 0.9694, + "step": 11150 + }, + { + "epoch": 0.45143666531768517, + "grad_norm": 1.365047812461853, + "learning_rate": 5.5977699772868065e-05, + "loss": 1.0216, + "step": 11155 + }, + { + "epoch": 0.45163901254552813, + "grad_norm": 1.269562005996704, + "learning_rate": 5.5957051414412556e-05, + "loss": 1.0335, + "step": 11160 + }, + { + "epoch": 0.4518413597733711, + "grad_norm": 1.1969783306121826, + "learning_rate": 5.593640305595705e-05, + "loss": 1.0172, + "step": 11165 + }, + { + "epoch": 0.4520437070012141, + "grad_norm": 1.3603763580322266, + "learning_rate": 5.591575469750156e-05, + "loss": 0.9595, + "step": 11170 + }, + { + "epoch": 0.45224605422905706, + "grad_norm": 1.3282946348190308, + "learning_rate": 5.589510633904604e-05, + "loss": 0.9697, + "step": 11175 + }, + { + "epoch": 0.4524484014569, + "grad_norm": 0.9992858171463013, + "learning_rate": 5.5874457980590545e-05, + "loss": 0.9657, + "step": 11180 + }, + { + "epoch": 0.45265074868474303, + "grad_norm": 1.1385672092437744, + "learning_rate": 5.585380962213504e-05, + "loss": 0.9614, + "step": 11185 + }, + { + "epoch": 0.452853095912586, + "grad_norm": 1.3753652572631836, + "learning_rate": 5.5833161263679547e-05, + "loss": 0.8998, + "step": 11190 + }, + { + "epoch": 0.45305544314042895, + "grad_norm": 1.194413423538208, + "learning_rate": 5.581251290522404e-05, + "loss": 0.986, + "step": 11195 + }, + { + "epoch": 0.45325779036827196, + "grad_norm": 1.1120665073394775, + "learning_rate": 5.5791864546768534e-05, + "loss": 0.999, + "step": 11200 + }, + { + "epoch": 0.4534601375961149, + "grad_norm": 1.3044672012329102, + "learning_rate": 5.577121618831304e-05, + "loss": 0.989, + "step": 11205 + }, + { + "epoch": 0.45366248482395793, + "grad_norm": 1.0678913593292236, + "learning_rate": 5.575056782985752e-05, + "loss": 0.9563, + "step": 11210 + }, + { + "epoch": 0.4538648320518009, + "grad_norm": 1.3210352659225464, + "learning_rate": 5.5729919471402026e-05, + "loss": 0.9694, + "step": 11215 + }, + { + "epoch": 0.45406717927964385, + "grad_norm": 1.2733067274093628, + "learning_rate": 5.5709271112946524e-05, + "loss": 1.0371, + "step": 11220 + }, + { + "epoch": 0.45426952650748686, + "grad_norm": 1.194913387298584, + "learning_rate": 5.5688622754491014e-05, + "loss": 1.0445, + "step": 11225 + }, + { + "epoch": 0.4544718737353298, + "grad_norm": 1.1859469413757324, + "learning_rate": 5.566797439603552e-05, + "loss": 1.0561, + "step": 11230 + }, + { + "epoch": 0.4546742209631728, + "grad_norm": 1.1227186918258667, + "learning_rate": 5.5647326037580015e-05, + "loss": 1.0166, + "step": 11235 + }, + { + "epoch": 0.4548765681910158, + "grad_norm": 1.1340081691741943, + "learning_rate": 5.562667767912452e-05, + "loss": 0.952, + "step": 11240 + }, + { + "epoch": 0.45507891541885875, + "grad_norm": 1.1835229396820068, + "learning_rate": 5.5606029320669e-05, + "loss": 1.005, + "step": 11245 + }, + { + "epoch": 0.45528126264670177, + "grad_norm": 1.2764209508895874, + "learning_rate": 5.558538096221351e-05, + "loss": 0.9819, + "step": 11250 + }, + { + "epoch": 0.4554836098745447, + "grad_norm": 1.1682292222976685, + "learning_rate": 5.5564732603758005e-05, + "loss": 0.8616, + "step": 11255 + }, + { + "epoch": 0.4556859571023877, + "grad_norm": 1.1520472764968872, + "learning_rate": 5.5544084245302495e-05, + "loss": 1.045, + "step": 11260 + }, + { + "epoch": 0.4558883043302307, + "grad_norm": 1.1778866052627563, + "learning_rate": 5.5523435886847e-05, + "loss": 0.9874, + "step": 11265 + }, + { + "epoch": 0.45609065155807366, + "grad_norm": 1.0828711986541748, + "learning_rate": 5.5502787528391497e-05, + "loss": 0.9994, + "step": 11270 + }, + { + "epoch": 0.4562929987859166, + "grad_norm": 1.2597899436950684, + "learning_rate": 5.5482139169936e-05, + "loss": 1.0214, + "step": 11275 + }, + { + "epoch": 0.45649534601375963, + "grad_norm": 1.1565966606140137, + "learning_rate": 5.5461490811480484e-05, + "loss": 1.0158, + "step": 11280 + }, + { + "epoch": 0.4566976932416026, + "grad_norm": 1.20098876953125, + "learning_rate": 5.544084245302499e-05, + "loss": 0.9437, + "step": 11285 + }, + { + "epoch": 0.45690004046944555, + "grad_norm": 1.16834557056427, + "learning_rate": 5.5420194094569486e-05, + "loss": 0.9824, + "step": 11290 + }, + { + "epoch": 0.45710238769728856, + "grad_norm": 1.1236181259155273, + "learning_rate": 5.5399545736113976e-05, + "loss": 0.9707, + "step": 11295 + }, + { + "epoch": 0.4573047349251315, + "grad_norm": 1.1530249118804932, + "learning_rate": 5.537889737765848e-05, + "loss": 1.0455, + "step": 11300 + }, + { + "epoch": 0.45750708215297453, + "grad_norm": 1.2669841051101685, + "learning_rate": 5.535824901920298e-05, + "loss": 1.018, + "step": 11305 + }, + { + "epoch": 0.4577094293808175, + "grad_norm": 1.2789355516433716, + "learning_rate": 5.533760066074747e-05, + "loss": 0.9761, + "step": 11310 + }, + { + "epoch": 0.45791177660866045, + "grad_norm": 1.2220227718353271, + "learning_rate": 5.531695230229197e-05, + "loss": 1.0403, + "step": 11315 + }, + { + "epoch": 0.45811412383650346, + "grad_norm": 1.1159273386001587, + "learning_rate": 5.529630394383647e-05, + "loss": 0.9661, + "step": 11320 + }, + { + "epoch": 0.4583164710643464, + "grad_norm": 1.0662544965744019, + "learning_rate": 5.527565558538097e-05, + "loss": 1.0306, + "step": 11325 + }, + { + "epoch": 0.4585188182921894, + "grad_norm": 1.3621026277542114, + "learning_rate": 5.525500722692546e-05, + "loss": 1.0545, + "step": 11330 + }, + { + "epoch": 0.4587211655200324, + "grad_norm": 1.242497205734253, + "learning_rate": 5.523435886846996e-05, + "loss": 0.998, + "step": 11335 + }, + { + "epoch": 0.45892351274787535, + "grad_norm": 1.1795145273208618, + "learning_rate": 5.521371051001446e-05, + "loss": 1.023, + "step": 11340 + }, + { + "epoch": 0.4591258599757183, + "grad_norm": 1.172516942024231, + "learning_rate": 5.519306215155895e-05, + "loss": 1.0403, + "step": 11345 + }, + { + "epoch": 0.4593282072035613, + "grad_norm": 1.035287857055664, + "learning_rate": 5.517241379310345e-05, + "loss": 1.0572, + "step": 11350 + }, + { + "epoch": 0.4595305544314043, + "grad_norm": 1.2234196662902832, + "learning_rate": 5.515176543464795e-05, + "loss": 0.9477, + "step": 11355 + }, + { + "epoch": 0.4597329016592473, + "grad_norm": 1.054702639579773, + "learning_rate": 5.513111707619244e-05, + "loss": 0.9754, + "step": 11360 + }, + { + "epoch": 0.45993524888709025, + "grad_norm": 1.1466641426086426, + "learning_rate": 5.511046871773694e-05, + "loss": 1.0138, + "step": 11365 + }, + { + "epoch": 0.4601375961149332, + "grad_norm": 1.1666395664215088, + "learning_rate": 5.508982035928144e-05, + "loss": 0.9376, + "step": 11370 + }, + { + "epoch": 0.4603399433427762, + "grad_norm": 1.1848865747451782, + "learning_rate": 5.506917200082594e-05, + "loss": 1.0463, + "step": 11375 + }, + { + "epoch": 0.4605422905706192, + "grad_norm": 1.2855501174926758, + "learning_rate": 5.504852364237043e-05, + "loss": 0.9538, + "step": 11380 + }, + { + "epoch": 0.46074463779846214, + "grad_norm": 1.3183473348617554, + "learning_rate": 5.5027875283914934e-05, + "loss": 0.9932, + "step": 11385 + }, + { + "epoch": 0.46094698502630516, + "grad_norm": 1.1759259700775146, + "learning_rate": 5.500722692545943e-05, + "loss": 1.0433, + "step": 11390 + }, + { + "epoch": 0.4611493322541481, + "grad_norm": 1.193129539489746, + "learning_rate": 5.498657856700392e-05, + "loss": 0.9335, + "step": 11395 + }, + { + "epoch": 0.4613516794819911, + "grad_norm": 1.177869439125061, + "learning_rate": 5.496593020854842e-05, + "loss": 1.0172, + "step": 11400 + }, + { + "epoch": 0.4615540267098341, + "grad_norm": 1.1561225652694702, + "learning_rate": 5.4945281850092924e-05, + "loss": 1.0128, + "step": 11405 + }, + { + "epoch": 0.46175637393767704, + "grad_norm": 1.1183866262435913, + "learning_rate": 5.492463349163742e-05, + "loss": 1.0491, + "step": 11410 + }, + { + "epoch": 0.46195872116552006, + "grad_norm": 1.114930510520935, + "learning_rate": 5.490398513318191e-05, + "loss": 0.9707, + "step": 11415 + }, + { + "epoch": 0.462161068393363, + "grad_norm": 1.1722952127456665, + "learning_rate": 5.4883336774726415e-05, + "loss": 0.9784, + "step": 11420 + }, + { + "epoch": 0.462363415621206, + "grad_norm": 1.2154377698898315, + "learning_rate": 5.486268841627091e-05, + "loss": 1.0743, + "step": 11425 + }, + { + "epoch": 0.462565762849049, + "grad_norm": 1.1334062814712524, + "learning_rate": 5.48420400578154e-05, + "loss": 0.9652, + "step": 11430 + }, + { + "epoch": 0.46276811007689195, + "grad_norm": 1.1201444864273071, + "learning_rate": 5.48213916993599e-05, + "loss": 0.9014, + "step": 11435 + }, + { + "epoch": 0.4629704573047349, + "grad_norm": 1.1657004356384277, + "learning_rate": 5.4800743340904405e-05, + "loss": 1.0671, + "step": 11440 + }, + { + "epoch": 0.4631728045325779, + "grad_norm": 1.132319450378418, + "learning_rate": 5.4780094982448895e-05, + "loss": 0.9902, + "step": 11445 + }, + { + "epoch": 0.4633751517604209, + "grad_norm": 1.185171127319336, + "learning_rate": 5.475944662399339e-05, + "loss": 1.0329, + "step": 11450 + }, + { + "epoch": 0.46357749898826384, + "grad_norm": 1.1581697463989258, + "learning_rate": 5.4738798265537896e-05, + "loss": 1.0316, + "step": 11455 + }, + { + "epoch": 0.46377984621610685, + "grad_norm": 1.1859978437423706, + "learning_rate": 5.4718149907082394e-05, + "loss": 1.0021, + "step": 11460 + }, + { + "epoch": 0.4639821934439498, + "grad_norm": 3.962794780731201, + "learning_rate": 5.4697501548626884e-05, + "loss": 0.9965, + "step": 11465 + }, + { + "epoch": 0.4641845406717928, + "grad_norm": 1.3404104709625244, + "learning_rate": 5.467685319017138e-05, + "loss": 0.9506, + "step": 11470 + }, + { + "epoch": 0.4643868878996358, + "grad_norm": 1.3747602701187134, + "learning_rate": 5.4656204831715886e-05, + "loss": 0.9788, + "step": 11475 + }, + { + "epoch": 0.46458923512747874, + "grad_norm": 1.3079259395599365, + "learning_rate": 5.4635556473260376e-05, + "loss": 1.0819, + "step": 11480 + }, + { + "epoch": 0.46479158235532175, + "grad_norm": 1.1761444807052612, + "learning_rate": 5.4614908114804873e-05, + "loss": 0.9534, + "step": 11485 + }, + { + "epoch": 0.4649939295831647, + "grad_norm": 1.1462078094482422, + "learning_rate": 5.459425975634938e-05, + "loss": 1.0265, + "step": 11490 + }, + { + "epoch": 0.46519627681100767, + "grad_norm": 1.1891603469848633, + "learning_rate": 5.457361139789386e-05, + "loss": 1.0158, + "step": 11495 + }, + { + "epoch": 0.4653986240388507, + "grad_norm": 1.1987360715866089, + "learning_rate": 5.4552963039438365e-05, + "loss": 0.9456, + "step": 11500 + }, + { + "epoch": 0.46560097126669364, + "grad_norm": 1.1482681035995483, + "learning_rate": 5.453231468098286e-05, + "loss": 0.9909, + "step": 11505 + }, + { + "epoch": 0.4658033184945366, + "grad_norm": 1.2518885135650635, + "learning_rate": 5.451166632252737e-05, + "loss": 1.0183, + "step": 11510 + }, + { + "epoch": 0.4660056657223796, + "grad_norm": 1.112511157989502, + "learning_rate": 5.449101796407186e-05, + "loss": 1.0381, + "step": 11515 + }, + { + "epoch": 0.4662080129502226, + "grad_norm": 1.1407140493392944, + "learning_rate": 5.4470369605616355e-05, + "loss": 0.9752, + "step": 11520 + }, + { + "epoch": 0.4664103601780656, + "grad_norm": 1.2545710802078247, + "learning_rate": 5.444972124716086e-05, + "loss": 1.0329, + "step": 11525 + }, + { + "epoch": 0.46661270740590854, + "grad_norm": 1.160461187362671, + "learning_rate": 5.442907288870535e-05, + "loss": 0.9949, + "step": 11530 + }, + { + "epoch": 0.4668150546337515, + "grad_norm": 1.1197930574417114, + "learning_rate": 5.4408424530249846e-05, + "loss": 1.017, + "step": 11535 + }, + { + "epoch": 0.4670174018615945, + "grad_norm": 1.1824170351028442, + "learning_rate": 5.4387776171794344e-05, + "loss": 0.9759, + "step": 11540 + }, + { + "epoch": 0.4672197490894375, + "grad_norm": 1.122239589691162, + "learning_rate": 5.436712781333885e-05, + "loss": 1.0082, + "step": 11545 + }, + { + "epoch": 0.46742209631728043, + "grad_norm": 1.1205852031707764, + "learning_rate": 5.434647945488334e-05, + "loss": 1.0055, + "step": 11550 + }, + { + "epoch": 0.46762444354512345, + "grad_norm": 1.1242936849594116, + "learning_rate": 5.4325831096427836e-05, + "loss": 0.9787, + "step": 11555 + }, + { + "epoch": 0.4678267907729664, + "grad_norm": 1.4059381484985352, + "learning_rate": 5.430518273797234e-05, + "loss": 0.9984, + "step": 11560 + }, + { + "epoch": 0.46802913800080936, + "grad_norm": 1.2341316938400269, + "learning_rate": 5.428453437951683e-05, + "loss": 1.0389, + "step": 11565 + }, + { + "epoch": 0.4682314852286524, + "grad_norm": 1.228027105331421, + "learning_rate": 5.426388602106133e-05, + "loss": 1.0147, + "step": 11570 + }, + { + "epoch": 0.46843383245649534, + "grad_norm": 1.076759934425354, + "learning_rate": 5.4243237662605825e-05, + "loss": 1.0026, + "step": 11575 + }, + { + "epoch": 0.46863617968433835, + "grad_norm": 1.3640527725219727, + "learning_rate": 5.4222589304150315e-05, + "loss": 1.0227, + "step": 11580 + }, + { + "epoch": 0.4688385269121813, + "grad_norm": 1.1773855686187744, + "learning_rate": 5.420194094569482e-05, + "loss": 0.991, + "step": 11585 + }, + { + "epoch": 0.46904087414002427, + "grad_norm": 1.178702473640442, + "learning_rate": 5.418129258723932e-05, + "loss": 0.9894, + "step": 11590 + }, + { + "epoch": 0.4692432213678673, + "grad_norm": 1.263631820678711, + "learning_rate": 5.416064422878382e-05, + "loss": 1.0234, + "step": 11595 + }, + { + "epoch": 0.46944556859571024, + "grad_norm": 1.1302398443222046, + "learning_rate": 5.413999587032831e-05, + "loss": 1.0389, + "step": 11600 + }, + { + "epoch": 0.4696479158235532, + "grad_norm": 1.1499837636947632, + "learning_rate": 5.411934751187281e-05, + "loss": 1.0284, + "step": 11605 + }, + { + "epoch": 0.4698502630513962, + "grad_norm": 1.2414751052856445, + "learning_rate": 5.409869915341731e-05, + "loss": 0.9275, + "step": 11610 + }, + { + "epoch": 0.47005261027923917, + "grad_norm": 1.13211190700531, + "learning_rate": 5.4078050794961796e-05, + "loss": 0.9419, + "step": 11615 + }, + { + "epoch": 0.4702549575070821, + "grad_norm": 1.1615794897079468, + "learning_rate": 5.40574024365063e-05, + "loss": 0.9729, + "step": 11620 + }, + { + "epoch": 0.47045730473492514, + "grad_norm": 1.194742202758789, + "learning_rate": 5.40367540780508e-05, + "loss": 1.0409, + "step": 11625 + }, + { + "epoch": 0.4706596519627681, + "grad_norm": 1.2788242101669312, + "learning_rate": 5.40161057195953e-05, + "loss": 1.0719, + "step": 11630 + }, + { + "epoch": 0.4708619991906111, + "grad_norm": 1.361441731452942, + "learning_rate": 5.399545736113979e-05, + "loss": 0.9966, + "step": 11635 + }, + { + "epoch": 0.47106434641845407, + "grad_norm": 1.1751059293746948, + "learning_rate": 5.397480900268429e-05, + "loss": 1.053, + "step": 11640 + }, + { + "epoch": 0.47126669364629703, + "grad_norm": 1.1899535655975342, + "learning_rate": 5.3954160644228794e-05, + "loss": 1.0003, + "step": 11645 + }, + { + "epoch": 0.47146904087414004, + "grad_norm": 1.1658504009246826, + "learning_rate": 5.393351228577328e-05, + "loss": 1.0052, + "step": 11650 + }, + { + "epoch": 0.471671388101983, + "grad_norm": 1.126866102218628, + "learning_rate": 5.391286392731778e-05, + "loss": 0.9929, + "step": 11655 + }, + { + "epoch": 0.47187373532982596, + "grad_norm": 1.2745146751403809, + "learning_rate": 5.389221556886228e-05, + "loss": 1.0301, + "step": 11660 + }, + { + "epoch": 0.472076082557669, + "grad_norm": 1.0762321949005127, + "learning_rate": 5.387156721040677e-05, + "loss": 1.0129, + "step": 11665 + }, + { + "epoch": 0.47227842978551193, + "grad_norm": 1.1167628765106201, + "learning_rate": 5.3850918851951273e-05, + "loss": 0.9681, + "step": 11670 + }, + { + "epoch": 0.4724807770133549, + "grad_norm": 1.1747218370437622, + "learning_rate": 5.383027049349577e-05, + "loss": 1.0471, + "step": 11675 + }, + { + "epoch": 0.4726831242411979, + "grad_norm": 1.3087221384048462, + "learning_rate": 5.3809622135040275e-05, + "loss": 1.0013, + "step": 11680 + }, + { + "epoch": 0.47288547146904086, + "grad_norm": 1.1940945386886597, + "learning_rate": 5.378897377658476e-05, + "loss": 0.9774, + "step": 11685 + }, + { + "epoch": 0.4730878186968839, + "grad_norm": 1.164388656616211, + "learning_rate": 5.376832541812926e-05, + "loss": 1.0504, + "step": 11690 + }, + { + "epoch": 0.47329016592472684, + "grad_norm": 1.1465179920196533, + "learning_rate": 5.374767705967376e-05, + "loss": 1.0383, + "step": 11695 + }, + { + "epoch": 0.4734925131525698, + "grad_norm": 1.2491196393966675, + "learning_rate": 5.372702870121825e-05, + "loss": 0.9987, + "step": 11700 + }, + { + "epoch": 0.4736948603804128, + "grad_norm": 1.29067862033844, + "learning_rate": 5.3706380342762755e-05, + "loss": 1.0046, + "step": 11705 + }, + { + "epoch": 0.47389720760825577, + "grad_norm": 1.1442447900772095, + "learning_rate": 5.368573198430725e-05, + "loss": 0.9822, + "step": 11710 + }, + { + "epoch": 0.4740995548360987, + "grad_norm": 1.1371489763259888, + "learning_rate": 5.366508362585174e-05, + "loss": 0.9596, + "step": 11715 + }, + { + "epoch": 0.47430190206394174, + "grad_norm": 1.167847752571106, + "learning_rate": 5.364443526739624e-05, + "loss": 1.0093, + "step": 11720 + }, + { + "epoch": 0.4745042492917847, + "grad_norm": 1.2384512424468994, + "learning_rate": 5.3623786908940744e-05, + "loss": 1.0428, + "step": 11725 + }, + { + "epoch": 0.47470659651962765, + "grad_norm": 1.1138396263122559, + "learning_rate": 5.360313855048524e-05, + "loss": 0.9754, + "step": 11730 + }, + { + "epoch": 0.47490894374747067, + "grad_norm": 1.1507554054260254, + "learning_rate": 5.358249019202973e-05, + "loss": 0.9751, + "step": 11735 + }, + { + "epoch": 0.4751112909753136, + "grad_norm": 1.2503911256790161, + "learning_rate": 5.3561841833574236e-05, + "loss": 0.9987, + "step": 11740 + }, + { + "epoch": 0.47531363820315664, + "grad_norm": 1.0976605415344238, + "learning_rate": 5.354119347511873e-05, + "loss": 1.0059, + "step": 11745 + }, + { + "epoch": 0.4755159854309996, + "grad_norm": 1.3331193923950195, + "learning_rate": 5.3520545116663223e-05, + "loss": 0.985, + "step": 11750 + }, + { + "epoch": 0.47571833265884256, + "grad_norm": 1.1028928756713867, + "learning_rate": 5.349989675820772e-05, + "loss": 1.0103, + "step": 11755 + }, + { + "epoch": 0.47592067988668557, + "grad_norm": 1.3108789920806885, + "learning_rate": 5.3479248399752225e-05, + "loss": 1.0209, + "step": 11760 + }, + { + "epoch": 0.47612302711452853, + "grad_norm": 1.1938402652740479, + "learning_rate": 5.345860004129672e-05, + "loss": 0.9593, + "step": 11765 + }, + { + "epoch": 0.4763253743423715, + "grad_norm": 1.2749603986740112, + "learning_rate": 5.343795168284121e-05, + "loss": 1.0241, + "step": 11770 + }, + { + "epoch": 0.4765277215702145, + "grad_norm": 1.2287843227386475, + "learning_rate": 5.341730332438572e-05, + "loss": 1.013, + "step": 11775 + }, + { + "epoch": 0.47673006879805746, + "grad_norm": 1.2256628274917603, + "learning_rate": 5.3396654965930214e-05, + "loss": 1.0203, + "step": 11780 + }, + { + "epoch": 0.4769324160259004, + "grad_norm": 1.266689658164978, + "learning_rate": 5.3376006607474705e-05, + "loss": 0.988, + "step": 11785 + }, + { + "epoch": 0.47713476325374343, + "grad_norm": 1.2061651945114136, + "learning_rate": 5.33553582490192e-05, + "loss": 1.0481, + "step": 11790 + }, + { + "epoch": 0.4773371104815864, + "grad_norm": 1.203644037246704, + "learning_rate": 5.3334709890563706e-05, + "loss": 1.0139, + "step": 11795 + }, + { + "epoch": 0.4775394577094294, + "grad_norm": 1.2693864107131958, + "learning_rate": 5.3314061532108196e-05, + "loss": 1.0385, + "step": 11800 + }, + { + "epoch": 0.47774180493727236, + "grad_norm": 1.257041096687317, + "learning_rate": 5.3293413173652694e-05, + "loss": 0.9903, + "step": 11805 + }, + { + "epoch": 0.4779441521651153, + "grad_norm": 1.1452276706695557, + "learning_rate": 5.32727648151972e-05, + "loss": 1.0217, + "step": 11810 + }, + { + "epoch": 0.47814649939295834, + "grad_norm": 1.167068362236023, + "learning_rate": 5.3252116456741695e-05, + "loss": 1.0112, + "step": 11815 + }, + { + "epoch": 0.4783488466208013, + "grad_norm": 1.1084812879562378, + "learning_rate": 5.3231468098286186e-05, + "loss": 0.9981, + "step": 11820 + }, + { + "epoch": 0.47855119384864425, + "grad_norm": 1.2631816864013672, + "learning_rate": 5.321081973983068e-05, + "loss": 0.9906, + "step": 11825 + }, + { + "epoch": 0.47875354107648727, + "grad_norm": 1.2514671087265015, + "learning_rate": 5.319017138137519e-05, + "loss": 1.0225, + "step": 11830 + }, + { + "epoch": 0.4789558883043302, + "grad_norm": 1.3078927993774414, + "learning_rate": 5.316952302291968e-05, + "loss": 0.9914, + "step": 11835 + }, + { + "epoch": 0.4791582355321732, + "grad_norm": 1.336737036705017, + "learning_rate": 5.3148874664464175e-05, + "loss": 1.0396, + "step": 11840 + }, + { + "epoch": 0.4793605827600162, + "grad_norm": 1.0106703042984009, + "learning_rate": 5.312822630600868e-05, + "loss": 0.9494, + "step": 11845 + }, + { + "epoch": 0.47956292998785915, + "grad_norm": 1.2406935691833496, + "learning_rate": 5.3107577947553176e-05, + "loss": 0.96, + "step": 11850 + }, + { + "epoch": 0.47976527721570217, + "grad_norm": 1.166337251663208, + "learning_rate": 5.308692958909767e-05, + "loss": 0.9897, + "step": 11855 + }, + { + "epoch": 0.4799676244435451, + "grad_norm": 1.230905532836914, + "learning_rate": 5.306628123064217e-05, + "loss": 1.0616, + "step": 11860 + }, + { + "epoch": 0.4801699716713881, + "grad_norm": 1.2330104112625122, + "learning_rate": 5.304563287218667e-05, + "loss": 1.0153, + "step": 11865 + }, + { + "epoch": 0.4803723188992311, + "grad_norm": 1.6057419776916504, + "learning_rate": 5.302498451373116e-05, + "loss": 0.9794, + "step": 11870 + }, + { + "epoch": 0.48057466612707406, + "grad_norm": 1.1932746171951294, + "learning_rate": 5.3004336155275656e-05, + "loss": 0.9847, + "step": 11875 + }, + { + "epoch": 0.480777013354917, + "grad_norm": 1.1735295057296753, + "learning_rate": 5.298368779682016e-05, + "loss": 1.0061, + "step": 11880 + }, + { + "epoch": 0.48097936058276003, + "grad_norm": 1.2277863025665283, + "learning_rate": 5.296303943836465e-05, + "loss": 1.0974, + "step": 11885 + }, + { + "epoch": 0.481181707810603, + "grad_norm": 1.1564351320266724, + "learning_rate": 5.294239107990915e-05, + "loss": 1.0016, + "step": 11890 + }, + { + "epoch": 0.48138405503844595, + "grad_norm": 1.0572887659072876, + "learning_rate": 5.292174272145365e-05, + "loss": 0.9457, + "step": 11895 + }, + { + "epoch": 0.48158640226628896, + "grad_norm": 1.0993452072143555, + "learning_rate": 5.290109436299815e-05, + "loss": 0.9996, + "step": 11900 + }, + { + "epoch": 0.4817887494941319, + "grad_norm": 1.2279669046401978, + "learning_rate": 5.288044600454264e-05, + "loss": 1.0324, + "step": 11905 + }, + { + "epoch": 0.48199109672197493, + "grad_norm": 1.054007649421692, + "learning_rate": 5.285979764608714e-05, + "loss": 1.0432, + "step": 11910 + }, + { + "epoch": 0.4821934439498179, + "grad_norm": 1.1128709316253662, + "learning_rate": 5.283914928763164e-05, + "loss": 1.0279, + "step": 11915 + }, + { + "epoch": 0.48239579117766085, + "grad_norm": 1.351121425628662, + "learning_rate": 5.281850092917613e-05, + "loss": 0.9776, + "step": 11920 + }, + { + "epoch": 0.48259813840550386, + "grad_norm": 1.1498552560806274, + "learning_rate": 5.279785257072063e-05, + "loss": 0.9965, + "step": 11925 + }, + { + "epoch": 0.4828004856333468, + "grad_norm": 1.1827574968338013, + "learning_rate": 5.277720421226513e-05, + "loss": 0.9636, + "step": 11930 + }, + { + "epoch": 0.4830028328611898, + "grad_norm": 1.0850706100463867, + "learning_rate": 5.275655585380962e-05, + "loss": 1.0092, + "step": 11935 + }, + { + "epoch": 0.4832051800890328, + "grad_norm": 1.1755199432373047, + "learning_rate": 5.273590749535412e-05, + "loss": 1.0064, + "step": 11940 + }, + { + "epoch": 0.48340752731687575, + "grad_norm": 1.2772828340530396, + "learning_rate": 5.271525913689862e-05, + "loss": 1.0346, + "step": 11945 + }, + { + "epoch": 0.4836098745447187, + "grad_norm": 1.2006957530975342, + "learning_rate": 5.269461077844312e-05, + "loss": 0.9633, + "step": 11950 + }, + { + "epoch": 0.4838122217725617, + "grad_norm": 1.2521257400512695, + "learning_rate": 5.267396241998761e-05, + "loss": 1.0277, + "step": 11955 + }, + { + "epoch": 0.4840145690004047, + "grad_norm": 1.2037335634231567, + "learning_rate": 5.265331406153211e-05, + "loss": 1.0424, + "step": 11960 + }, + { + "epoch": 0.4842169162282477, + "grad_norm": 1.098489761352539, + "learning_rate": 5.2632665703076614e-05, + "loss": 1.0258, + "step": 11965 + }, + { + "epoch": 0.48441926345609065, + "grad_norm": 1.0685847997665405, + "learning_rate": 5.26120173446211e-05, + "loss": 1.0309, + "step": 11970 + }, + { + "epoch": 0.4846216106839336, + "grad_norm": 1.1604846715927124, + "learning_rate": 5.25913689861656e-05, + "loss": 1.0864, + "step": 11975 + }, + { + "epoch": 0.4848239579117766, + "grad_norm": 1.1499050855636597, + "learning_rate": 5.25707206277101e-05, + "loss": 0.9741, + "step": 11980 + }, + { + "epoch": 0.4850263051396196, + "grad_norm": 1.188353180885315, + "learning_rate": 5.25500722692546e-05, + "loss": 0.9864, + "step": 11985 + }, + { + "epoch": 0.48522865236746254, + "grad_norm": 1.1696805953979492, + "learning_rate": 5.2529423910799094e-05, + "loss": 0.9617, + "step": 11990 + }, + { + "epoch": 0.48543099959530556, + "grad_norm": 1.3206312656402588, + "learning_rate": 5.250877555234359e-05, + "loss": 1.0467, + "step": 11995 + }, + { + "epoch": 0.4856333468231485, + "grad_norm": 1.13299560546875, + "learning_rate": 5.2488127193888095e-05, + "loss": 0.973, + "step": 12000 + }, + { + "epoch": 0.48583569405099153, + "grad_norm": 1.416884422302246, + "learning_rate": 5.246747883543258e-05, + "loss": 1.0054, + "step": 12005 + }, + { + "epoch": 0.4860380412788345, + "grad_norm": 1.2453945875167847, + "learning_rate": 5.244683047697708e-05, + "loss": 1.0447, + "step": 12010 + }, + { + "epoch": 0.48624038850667745, + "grad_norm": 1.1166871786117554, + "learning_rate": 5.242618211852158e-05, + "loss": 0.9752, + "step": 12015 + }, + { + "epoch": 0.48644273573452046, + "grad_norm": 1.2610543966293335, + "learning_rate": 5.240553376006607e-05, + "loss": 0.9689, + "step": 12020 + }, + { + "epoch": 0.4866450829623634, + "grad_norm": 1.2604610919952393, + "learning_rate": 5.2384885401610575e-05, + "loss": 0.9667, + "step": 12025 + }, + { + "epoch": 0.4868474301902064, + "grad_norm": 1.3531112670898438, + "learning_rate": 5.236423704315507e-05, + "loss": 1.0058, + "step": 12030 + }, + { + "epoch": 0.4870497774180494, + "grad_norm": 1.0576692819595337, + "learning_rate": 5.2343588684699576e-05, + "loss": 1.0281, + "step": 12035 + }, + { + "epoch": 0.48725212464589235, + "grad_norm": 1.122986912727356, + "learning_rate": 5.232294032624406e-05, + "loss": 1.0254, + "step": 12040 + }, + { + "epoch": 0.4874544718737353, + "grad_norm": 1.157700538635254, + "learning_rate": 5.2302291967788564e-05, + "loss": 0.9843, + "step": 12045 + }, + { + "epoch": 0.4876568191015783, + "grad_norm": 1.2433433532714844, + "learning_rate": 5.228164360933306e-05, + "loss": 0.9999, + "step": 12050 + }, + { + "epoch": 0.4878591663294213, + "grad_norm": 1.188632845878601, + "learning_rate": 5.226099525087755e-05, + "loss": 0.9848, + "step": 12055 + }, + { + "epoch": 0.4880615135572643, + "grad_norm": 1.1356099843978882, + "learning_rate": 5.2240346892422056e-05, + "loss": 0.936, + "step": 12060 + }, + { + "epoch": 0.48826386078510725, + "grad_norm": 1.1671898365020752, + "learning_rate": 5.221969853396655e-05, + "loss": 0.9922, + "step": 12065 + }, + { + "epoch": 0.4884662080129502, + "grad_norm": 1.1089935302734375, + "learning_rate": 5.2199050175511044e-05, + "loss": 1.0443, + "step": 12070 + }, + { + "epoch": 0.4886685552407932, + "grad_norm": 1.323712706565857, + "learning_rate": 5.217840181705555e-05, + "loss": 1.0233, + "step": 12075 + }, + { + "epoch": 0.4888709024686362, + "grad_norm": 1.0796269178390503, + "learning_rate": 5.2157753458600045e-05, + "loss": 1.0662, + "step": 12080 + }, + { + "epoch": 0.48907324969647914, + "grad_norm": 1.1861119270324707, + "learning_rate": 5.213710510014454e-05, + "loss": 0.9814, + "step": 12085 + }, + { + "epoch": 0.48927559692432215, + "grad_norm": 1.104269027709961, + "learning_rate": 5.211645674168903e-05, + "loss": 0.9476, + "step": 12090 + }, + { + "epoch": 0.4894779441521651, + "grad_norm": 1.081963300704956, + "learning_rate": 5.209580838323354e-05, + "loss": 1.0231, + "step": 12095 + }, + { + "epoch": 0.48968029138000807, + "grad_norm": 1.1605753898620605, + "learning_rate": 5.2075160024778034e-05, + "loss": 1.0055, + "step": 12100 + }, + { + "epoch": 0.4898826386078511, + "grad_norm": 1.227674961090088, + "learning_rate": 5.2054511666322525e-05, + "loss": 0.9811, + "step": 12105 + }, + { + "epoch": 0.49008498583569404, + "grad_norm": 1.1121258735656738, + "learning_rate": 5.203386330786703e-05, + "loss": 1.0026, + "step": 12110 + }, + { + "epoch": 0.49028733306353706, + "grad_norm": 1.1806455850601196, + "learning_rate": 5.2013214949411526e-05, + "loss": 0.9891, + "step": 12115 + }, + { + "epoch": 0.49048968029138, + "grad_norm": 1.2708841562271118, + "learning_rate": 5.1992566590956023e-05, + "loss": 0.9987, + "step": 12120 + }, + { + "epoch": 0.490692027519223, + "grad_norm": 1.3260775804519653, + "learning_rate": 5.1971918232500514e-05, + "loss": 0.9689, + "step": 12125 + }, + { + "epoch": 0.490894374747066, + "grad_norm": 1.106037974357605, + "learning_rate": 5.195126987404502e-05, + "loss": 1.0022, + "step": 12130 + }, + { + "epoch": 0.49109672197490895, + "grad_norm": 1.2738261222839355, + "learning_rate": 5.1930621515589515e-05, + "loss": 1.0052, + "step": 12135 + }, + { + "epoch": 0.4912990692027519, + "grad_norm": 1.1397292613983154, + "learning_rate": 5.1909973157134006e-05, + "loss": 0.9537, + "step": 12140 + }, + { + "epoch": 0.4915014164305949, + "grad_norm": 1.1503263711929321, + "learning_rate": 5.188932479867851e-05, + "loss": 1.0132, + "step": 12145 + }, + { + "epoch": 0.4917037636584379, + "grad_norm": 1.117822289466858, + "learning_rate": 5.186867644022301e-05, + "loss": 1.0289, + "step": 12150 + }, + { + "epoch": 0.49190611088628083, + "grad_norm": 1.0901275873184204, + "learning_rate": 5.18480280817675e-05, + "loss": 0.993, + "step": 12155 + }, + { + "epoch": 0.49210845811412385, + "grad_norm": 1.1580654382705688, + "learning_rate": 5.1827379723311995e-05, + "loss": 0.9897, + "step": 12160 + }, + { + "epoch": 0.4923108053419668, + "grad_norm": 1.170100212097168, + "learning_rate": 5.18067313648565e-05, + "loss": 1.0015, + "step": 12165 + }, + { + "epoch": 0.4925131525698098, + "grad_norm": 1.1789393424987793, + "learning_rate": 5.1786083006400996e-05, + "loss": 0.9359, + "step": 12170 + }, + { + "epoch": 0.4927154997976528, + "grad_norm": 1.11495840549469, + "learning_rate": 5.176543464794549e-05, + "loss": 1.0539, + "step": 12175 + }, + { + "epoch": 0.49291784702549574, + "grad_norm": 1.2007088661193848, + "learning_rate": 5.174478628948999e-05, + "loss": 1.0957, + "step": 12180 + }, + { + "epoch": 0.49312019425333875, + "grad_norm": 1.1438637971878052, + "learning_rate": 5.172413793103449e-05, + "loss": 1.0123, + "step": 12185 + }, + { + "epoch": 0.4933225414811817, + "grad_norm": 1.118512511253357, + "learning_rate": 5.170348957257898e-05, + "loss": 0.9884, + "step": 12190 + }, + { + "epoch": 0.49352488870902467, + "grad_norm": 1.3337222337722778, + "learning_rate": 5.1682841214123476e-05, + "loss": 0.9369, + "step": 12195 + }, + { + "epoch": 0.4937272359368677, + "grad_norm": 1.2440013885498047, + "learning_rate": 5.166219285566798e-05, + "loss": 1.0743, + "step": 12200 + }, + { + "epoch": 0.49392958316471064, + "grad_norm": 1.0439364910125732, + "learning_rate": 5.164154449721248e-05, + "loss": 1.0591, + "step": 12205 + }, + { + "epoch": 0.4941319303925536, + "grad_norm": 1.130218505859375, + "learning_rate": 5.162089613875697e-05, + "loss": 1.0178, + "step": 12210 + }, + { + "epoch": 0.4943342776203966, + "grad_norm": 1.1202657222747803, + "learning_rate": 5.160024778030147e-05, + "loss": 0.9759, + "step": 12215 + }, + { + "epoch": 0.49453662484823957, + "grad_norm": 1.234213948249817, + "learning_rate": 5.157959942184597e-05, + "loss": 0.9899, + "step": 12220 + }, + { + "epoch": 0.4947389720760826, + "grad_norm": 1.201436161994934, + "learning_rate": 5.155895106339046e-05, + "loss": 1.0586, + "step": 12225 + }, + { + "epoch": 0.49494131930392554, + "grad_norm": 1.166717767715454, + "learning_rate": 5.153830270493496e-05, + "loss": 0.9978, + "step": 12230 + }, + { + "epoch": 0.4951436665317685, + "grad_norm": 1.1749318838119507, + "learning_rate": 5.151765434647946e-05, + "loss": 0.967, + "step": 12235 + }, + { + "epoch": 0.4953460137596115, + "grad_norm": 1.2312084436416626, + "learning_rate": 5.149700598802395e-05, + "loss": 0.9739, + "step": 12240 + }, + { + "epoch": 0.4955483609874545, + "grad_norm": 1.212628960609436, + "learning_rate": 5.147635762956845e-05, + "loss": 0.9882, + "step": 12245 + }, + { + "epoch": 0.49575070821529743, + "grad_norm": 1.3387917280197144, + "learning_rate": 5.145570927111295e-05, + "loss": 1.0215, + "step": 12250 + }, + { + "epoch": 0.49595305544314044, + "grad_norm": 1.0717612504959106, + "learning_rate": 5.143506091265745e-05, + "loss": 1.054, + "step": 12255 + }, + { + "epoch": 0.4961554026709834, + "grad_norm": 1.0845282077789307, + "learning_rate": 5.141441255420194e-05, + "loss": 0.972, + "step": 12260 + }, + { + "epoch": 0.49635774989882636, + "grad_norm": 1.11933171749115, + "learning_rate": 5.139376419574644e-05, + "loss": 0.9062, + "step": 12265 + }, + { + "epoch": 0.4965600971266694, + "grad_norm": 1.1734405755996704, + "learning_rate": 5.137311583729094e-05, + "loss": 0.9905, + "step": 12270 + }, + { + "epoch": 0.49676244435451233, + "grad_norm": 1.300087332725525, + "learning_rate": 5.135246747883543e-05, + "loss": 0.9919, + "step": 12275 + }, + { + "epoch": 0.49696479158235535, + "grad_norm": 1.2729748487472534, + "learning_rate": 5.133181912037993e-05, + "loss": 0.9914, + "step": 12280 + }, + { + "epoch": 0.4971671388101983, + "grad_norm": 1.3504347801208496, + "learning_rate": 5.1311170761924434e-05, + "loss": 1.0429, + "step": 12285 + }, + { + "epoch": 0.49736948603804126, + "grad_norm": 1.2052627801895142, + "learning_rate": 5.1290522403468925e-05, + "loss": 1.0078, + "step": 12290 + }, + { + "epoch": 0.4975718332658843, + "grad_norm": 1.3131917715072632, + "learning_rate": 5.126987404501342e-05, + "loss": 0.9245, + "step": 12295 + }, + { + "epoch": 0.49777418049372724, + "grad_norm": 1.3341904878616333, + "learning_rate": 5.124922568655792e-05, + "loss": 0.9872, + "step": 12300 + }, + { + "epoch": 0.4979765277215702, + "grad_norm": 1.1806317567825317, + "learning_rate": 5.1228577328102423e-05, + "loss": 0.9939, + "step": 12305 + }, + { + "epoch": 0.4981788749494132, + "grad_norm": 1.2073943614959717, + "learning_rate": 5.1207928969646914e-05, + "loss": 1.0261, + "step": 12310 + }, + { + "epoch": 0.49838122217725617, + "grad_norm": 1.3189400434494019, + "learning_rate": 5.118728061119141e-05, + "loss": 1.0034, + "step": 12315 + }, + { + "epoch": 0.4985835694050991, + "grad_norm": 1.1953120231628418, + "learning_rate": 5.1166632252735915e-05, + "loss": 0.9618, + "step": 12320 + }, + { + "epoch": 0.49878591663294214, + "grad_norm": 1.1699455976486206, + "learning_rate": 5.1145983894280406e-05, + "loss": 1.0228, + "step": 12325 + }, + { + "epoch": 0.4989882638607851, + "grad_norm": 1.1556991338729858, + "learning_rate": 5.11253355358249e-05, + "loss": 0.9953, + "step": 12330 + }, + { + "epoch": 0.4991906110886281, + "grad_norm": 1.2345956563949585, + "learning_rate": 5.11046871773694e-05, + "loss": 0.9339, + "step": 12335 + }, + { + "epoch": 0.49939295831647107, + "grad_norm": 1.6040476560592651, + "learning_rate": 5.1084038818913904e-05, + "loss": 0.9312, + "step": 12340 + }, + { + "epoch": 0.49959530554431403, + "grad_norm": 1.1234400272369385, + "learning_rate": 5.1063390460458395e-05, + "loss": 1.0077, + "step": 12345 + }, + { + "epoch": 0.49979765277215704, + "grad_norm": 1.2733681201934814, + "learning_rate": 5.104274210200289e-05, + "loss": 0.9654, + "step": 12350 + }, + { + "epoch": 0.5, + "grad_norm": 1.193357229232788, + "learning_rate": 5.1022093743547396e-05, + "loss": 0.9975, + "step": 12355 + }, + { + "epoch": 0.500202347227843, + "grad_norm": 1.1366767883300781, + "learning_rate": 5.100144538509189e-05, + "loss": 0.9933, + "step": 12360 + }, + { + "epoch": 0.5004046944556859, + "grad_norm": 1.148776888847351, + "learning_rate": 5.0980797026636384e-05, + "loss": 1.0654, + "step": 12365 + }, + { + "epoch": 0.500607041683529, + "grad_norm": 1.2743234634399414, + "learning_rate": 5.096014866818088e-05, + "loss": 0.9324, + "step": 12370 + }, + { + "epoch": 0.5008093889113719, + "grad_norm": 1.2196996212005615, + "learning_rate": 5.093950030972537e-05, + "loss": 1.0129, + "step": 12375 + }, + { + "epoch": 0.5010117361392149, + "grad_norm": 1.080466389656067, + "learning_rate": 5.0918851951269876e-05, + "loss": 0.967, + "step": 12380 + }, + { + "epoch": 0.5012140833670579, + "grad_norm": 1.15315580368042, + "learning_rate": 5.089820359281437e-05, + "loss": 1.0039, + "step": 12385 + }, + { + "epoch": 0.5014164305949008, + "grad_norm": 1.1998361349105835, + "learning_rate": 5.087755523435888e-05, + "loss": 1.0225, + "step": 12390 + }, + { + "epoch": 0.5016187778227438, + "grad_norm": 1.108877420425415, + "learning_rate": 5.085690687590337e-05, + "loss": 0.9265, + "step": 12395 + }, + { + "epoch": 0.5018211250505868, + "grad_norm": 1.1324632167816162, + "learning_rate": 5.0836258517447865e-05, + "loss": 0.9246, + "step": 12400 + }, + { + "epoch": 0.5020234722784298, + "grad_norm": 1.2282084226608276, + "learning_rate": 5.081561015899237e-05, + "loss": 1.0447, + "step": 12405 + }, + { + "epoch": 0.5022258195062728, + "grad_norm": 1.1794309616088867, + "learning_rate": 5.079496180053685e-05, + "loss": 1.0202, + "step": 12410 + }, + { + "epoch": 0.5024281667341157, + "grad_norm": 1.2083922624588013, + "learning_rate": 5.077431344208136e-05, + "loss": 1.006, + "step": 12415 + }, + { + "epoch": 0.5026305139619587, + "grad_norm": 1.2532967329025269, + "learning_rate": 5.0753665083625854e-05, + "loss": 1.0159, + "step": 12420 + }, + { + "epoch": 0.5028328611898017, + "grad_norm": 1.284624457359314, + "learning_rate": 5.0733016725170345e-05, + "loss": 1.0005, + "step": 12425 + }, + { + "epoch": 0.5030352084176447, + "grad_norm": 1.196325659751892, + "learning_rate": 5.071236836671485e-05, + "loss": 1.0457, + "step": 12430 + }, + { + "epoch": 0.5032375556454877, + "grad_norm": 1.2377886772155762, + "learning_rate": 5.0691720008259346e-05, + "loss": 1.0229, + "step": 12435 + }, + { + "epoch": 0.5034399028733306, + "grad_norm": 1.2728221416473389, + "learning_rate": 5.067107164980385e-05, + "loss": 1.0108, + "step": 12440 + }, + { + "epoch": 0.5036422501011736, + "grad_norm": 1.28044855594635, + "learning_rate": 5.0650423291348334e-05, + "loss": 1.0369, + "step": 12445 + }, + { + "epoch": 0.5038445973290165, + "grad_norm": 1.1525059938430786, + "learning_rate": 5.062977493289284e-05, + "loss": 0.9861, + "step": 12450 + }, + { + "epoch": 0.5040469445568596, + "grad_norm": 1.2731013298034668, + "learning_rate": 5.0609126574437336e-05, + "loss": 1.0362, + "step": 12455 + }, + { + "epoch": 0.5042492917847026, + "grad_norm": 1.2046812772750854, + "learning_rate": 5.0588478215981826e-05, + "loss": 1.0207, + "step": 12460 + }, + { + "epoch": 0.5044516390125455, + "grad_norm": 1.1956570148468018, + "learning_rate": 5.056782985752633e-05, + "loss": 0.9847, + "step": 12465 + }, + { + "epoch": 0.5046539862403885, + "grad_norm": 1.0756455659866333, + "learning_rate": 5.054718149907083e-05, + "loss": 0.9889, + "step": 12470 + }, + { + "epoch": 0.5048563334682314, + "grad_norm": 1.2245359420776367, + "learning_rate": 5.052653314061533e-05, + "loss": 1.043, + "step": 12475 + }, + { + "epoch": 0.5050586806960745, + "grad_norm": 1.3225243091583252, + "learning_rate": 5.0505884782159815e-05, + "loss": 0.9622, + "step": 12480 + }, + { + "epoch": 0.5052610279239175, + "grad_norm": 1.2724545001983643, + "learning_rate": 5.048523642370432e-05, + "loss": 1.0257, + "step": 12485 + }, + { + "epoch": 0.5054633751517604, + "grad_norm": 1.2298985719680786, + "learning_rate": 5.0464588065248817e-05, + "loss": 0.9616, + "step": 12490 + }, + { + "epoch": 0.5056657223796034, + "grad_norm": 1.1233696937561035, + "learning_rate": 5.044393970679331e-05, + "loss": 0.9892, + "step": 12495 + }, + { + "epoch": 0.5058680696074463, + "grad_norm": 1.1848814487457275, + "learning_rate": 5.042329134833781e-05, + "loss": 1.0535, + "step": 12500 + }, + { + "epoch": 0.5060704168352893, + "grad_norm": 1.1247293949127197, + "learning_rate": 5.040264298988231e-05, + "loss": 0.9868, + "step": 12505 + }, + { + "epoch": 0.5062727640631324, + "grad_norm": 1.1931838989257812, + "learning_rate": 5.03819946314268e-05, + "loss": 0.9507, + "step": 12510 + }, + { + "epoch": 0.5064751112909753, + "grad_norm": 1.1550499200820923, + "learning_rate": 5.0361346272971296e-05, + "loss": 1.051, + "step": 12515 + }, + { + "epoch": 0.5066774585188183, + "grad_norm": 1.3351200819015503, + "learning_rate": 5.03406979145158e-05, + "loss": 0.9849, + "step": 12520 + }, + { + "epoch": 0.5068798057466612, + "grad_norm": 1.156421184539795, + "learning_rate": 5.03200495560603e-05, + "loss": 0.9706, + "step": 12525 + }, + { + "epoch": 0.5070821529745042, + "grad_norm": 1.1729159355163574, + "learning_rate": 5.029940119760479e-05, + "loss": 1.0254, + "step": 12530 + }, + { + "epoch": 0.5072845002023473, + "grad_norm": 1.1957979202270508, + "learning_rate": 5.027875283914929e-05, + "loss": 0.985, + "step": 12535 + }, + { + "epoch": 0.5074868474301902, + "grad_norm": 1.2675886154174805, + "learning_rate": 5.025810448069379e-05, + "loss": 1.0182, + "step": 12540 + }, + { + "epoch": 0.5076891946580332, + "grad_norm": 1.2329570055007935, + "learning_rate": 5.023745612223828e-05, + "loss": 0.9963, + "step": 12545 + }, + { + "epoch": 0.5078915418858762, + "grad_norm": 1.115394115447998, + "learning_rate": 5.021680776378278e-05, + "loss": 1.0099, + "step": 12550 + }, + { + "epoch": 0.5080938891137191, + "grad_norm": 1.1686569452285767, + "learning_rate": 5.019615940532728e-05, + "loss": 0.9875, + "step": 12555 + }, + { + "epoch": 0.5082962363415621, + "grad_norm": 1.1489678621292114, + "learning_rate": 5.017551104687178e-05, + "loss": 0.9574, + "step": 12560 + }, + { + "epoch": 0.5084985835694051, + "grad_norm": 1.2116782665252686, + "learning_rate": 5.015486268841627e-05, + "loss": 1.0453, + "step": 12565 + }, + { + "epoch": 0.5087009307972481, + "grad_norm": 1.6617372035980225, + "learning_rate": 5.013421432996077e-05, + "loss": 1.0092, + "step": 12570 + }, + { + "epoch": 0.508903278025091, + "grad_norm": 1.1859623193740845, + "learning_rate": 5.011356597150527e-05, + "loss": 0.9867, + "step": 12575 + }, + { + "epoch": 0.509105625252934, + "grad_norm": 1.0471400022506714, + "learning_rate": 5.009291761304976e-05, + "loss": 1.0128, + "step": 12580 + }, + { + "epoch": 0.509307972480777, + "grad_norm": 1.1846133470535278, + "learning_rate": 5.007226925459426e-05, + "loss": 1.0121, + "step": 12585 + }, + { + "epoch": 0.50951031970862, + "grad_norm": 1.2684739828109741, + "learning_rate": 5.005162089613876e-05, + "loss": 0.9923, + "step": 12590 + }, + { + "epoch": 0.509712666936463, + "grad_norm": 1.230245590209961, + "learning_rate": 5.003097253768325e-05, + "loss": 1.0056, + "step": 12595 + }, + { + "epoch": 0.509915014164306, + "grad_norm": 1.1914970874786377, + "learning_rate": 5.001032417922775e-05, + "loss": 0.9449, + "step": 12600 + }, + { + "epoch": 0.5101173613921489, + "grad_norm": 1.1601983308792114, + "learning_rate": 4.9989675820772254e-05, + "loss": 0.988, + "step": 12605 + }, + { + "epoch": 0.5103197086199919, + "grad_norm": 1.1290122270584106, + "learning_rate": 4.9969027462316745e-05, + "loss": 1.0195, + "step": 12610 + }, + { + "epoch": 0.5105220558478348, + "grad_norm": 1.2394769191741943, + "learning_rate": 4.994837910386125e-05, + "loss": 0.9847, + "step": 12615 + }, + { + "epoch": 0.5107244030756779, + "grad_norm": 1.1106399297714233, + "learning_rate": 4.9927730745405746e-05, + "loss": 0.9432, + "step": 12620 + }, + { + "epoch": 0.5109267503035209, + "grad_norm": 1.2086856365203857, + "learning_rate": 4.990708238695024e-05, + "loss": 1.0175, + "step": 12625 + }, + { + "epoch": 0.5111290975313638, + "grad_norm": 1.1757665872573853, + "learning_rate": 4.988643402849474e-05, + "loss": 1.0162, + "step": 12630 + }, + { + "epoch": 0.5113314447592068, + "grad_norm": 1.10592782497406, + "learning_rate": 4.986578567003923e-05, + "loss": 1.0415, + "step": 12635 + }, + { + "epoch": 0.5115337919870497, + "grad_norm": 1.193536400794983, + "learning_rate": 4.9845137311583736e-05, + "loss": 0.9638, + "step": 12640 + }, + { + "epoch": 0.5117361392148928, + "grad_norm": 1.2107716798782349, + "learning_rate": 4.9824488953128226e-05, + "loss": 1.0404, + "step": 12645 + }, + { + "epoch": 0.5119384864427358, + "grad_norm": 1.1552340984344482, + "learning_rate": 4.980384059467272e-05, + "loss": 1.0049, + "step": 12650 + }, + { + "epoch": 0.5121408336705787, + "grad_norm": 1.0847322940826416, + "learning_rate": 4.978319223621723e-05, + "loss": 0.9955, + "step": 12655 + }, + { + "epoch": 0.5123431808984217, + "grad_norm": 1.1630275249481201, + "learning_rate": 4.976254387776172e-05, + "loss": 0.9893, + "step": 12660 + }, + { + "epoch": 0.5125455281262646, + "grad_norm": 1.1975666284561157, + "learning_rate": 4.974189551930622e-05, + "loss": 1.0614, + "step": 12665 + }, + { + "epoch": 0.5127478753541076, + "grad_norm": 1.1546673774719238, + "learning_rate": 4.972124716085071e-05, + "loss": 1.0466, + "step": 12670 + }, + { + "epoch": 0.5129502225819507, + "grad_norm": 1.2232493162155151, + "learning_rate": 4.970059880239521e-05, + "loss": 0.9794, + "step": 12675 + }, + { + "epoch": 0.5131525698097936, + "grad_norm": 1.173681378364563, + "learning_rate": 4.967995044393971e-05, + "loss": 1.0032, + "step": 12680 + }, + { + "epoch": 0.5133549170376366, + "grad_norm": 1.1275198459625244, + "learning_rate": 4.9659302085484204e-05, + "loss": 1.0221, + "step": 12685 + }, + { + "epoch": 0.5135572642654795, + "grad_norm": 1.188533067703247, + "learning_rate": 4.963865372702871e-05, + "loss": 0.941, + "step": 12690 + }, + { + "epoch": 0.5137596114933225, + "grad_norm": 1.095301628112793, + "learning_rate": 4.96180053685732e-05, + "loss": 0.9973, + "step": 12695 + }, + { + "epoch": 0.5139619587211656, + "grad_norm": 1.2311757802963257, + "learning_rate": 4.9597357010117696e-05, + "loss": 1.0309, + "step": 12700 + }, + { + "epoch": 0.5141643059490085, + "grad_norm": 1.1870546340942383, + "learning_rate": 4.9576708651662194e-05, + "loss": 1.0679, + "step": 12705 + }, + { + "epoch": 0.5143666531768515, + "grad_norm": 1.185866355895996, + "learning_rate": 4.955606029320669e-05, + "loss": 1.0102, + "step": 12710 + }, + { + "epoch": 0.5145690004046944, + "grad_norm": 1.1721301078796387, + "learning_rate": 4.953541193475119e-05, + "loss": 1.0154, + "step": 12715 + }, + { + "epoch": 0.5147713476325374, + "grad_norm": 1.2105289697647095, + "learning_rate": 4.9514763576295685e-05, + "loss": 1.0298, + "step": 12720 + }, + { + "epoch": 0.5149736948603804, + "grad_norm": 1.054449200630188, + "learning_rate": 4.949411521784019e-05, + "loss": 0.9709, + "step": 12725 + }, + { + "epoch": 0.5151760420882234, + "grad_norm": 1.1701349020004272, + "learning_rate": 4.947346685938468e-05, + "loss": 0.974, + "step": 12730 + }, + { + "epoch": 0.5153783893160664, + "grad_norm": 1.2717715501785278, + "learning_rate": 4.945281850092918e-05, + "loss": 1.0077, + "step": 12735 + }, + { + "epoch": 0.5155807365439093, + "grad_norm": 1.1190024614334106, + "learning_rate": 4.9432170142473675e-05, + "loss": 0.9512, + "step": 12740 + }, + { + "epoch": 0.5157830837717523, + "grad_norm": 1.200972557067871, + "learning_rate": 4.941152178401817e-05, + "loss": 1.0014, + "step": 12745 + }, + { + "epoch": 0.5159854309995953, + "grad_norm": 1.2721880674362183, + "learning_rate": 4.939087342556267e-05, + "loss": 0.9819, + "step": 12750 + }, + { + "epoch": 0.5161877782274383, + "grad_norm": 1.1532000303268433, + "learning_rate": 4.9370225067107167e-05, + "loss": 0.9611, + "step": 12755 + }, + { + "epoch": 0.5163901254552813, + "grad_norm": 1.1171449422836304, + "learning_rate": 4.9349576708651664e-05, + "loss": 0.9495, + "step": 12760 + }, + { + "epoch": 0.5165924726831242, + "grad_norm": 1.7924264669418335, + "learning_rate": 4.932892835019616e-05, + "loss": 1.0215, + "step": 12765 + }, + { + "epoch": 0.5167948199109672, + "grad_norm": 1.16851806640625, + "learning_rate": 4.930827999174066e-05, + "loss": 1.0273, + "step": 12770 + }, + { + "epoch": 0.5169971671388102, + "grad_norm": 1.1129615306854248, + "learning_rate": 4.9287631633285156e-05, + "loss": 1.0412, + "step": 12775 + }, + { + "epoch": 0.5171995143666531, + "grad_norm": 1.0944582223892212, + "learning_rate": 4.926698327482965e-05, + "loss": 1.0335, + "step": 12780 + }, + { + "epoch": 0.5174018615944962, + "grad_norm": 1.0807654857635498, + "learning_rate": 4.924633491637415e-05, + "loss": 0.9239, + "step": 12785 + }, + { + "epoch": 0.5176042088223392, + "grad_norm": 1.2499336004257202, + "learning_rate": 4.922568655791865e-05, + "loss": 0.9914, + "step": 12790 + }, + { + "epoch": 0.5178065560501821, + "grad_norm": 1.159722089767456, + "learning_rate": 4.9205038199463145e-05, + "loss": 0.9644, + "step": 12795 + }, + { + "epoch": 0.5180089032780251, + "grad_norm": 1.2099483013153076, + "learning_rate": 4.918438984100764e-05, + "loss": 0.9875, + "step": 12800 + }, + { + "epoch": 0.518211250505868, + "grad_norm": 1.1559176445007324, + "learning_rate": 4.916374148255214e-05, + "loss": 1.0176, + "step": 12805 + }, + { + "epoch": 0.5184135977337111, + "grad_norm": 1.1882922649383545, + "learning_rate": 4.914309312409664e-05, + "loss": 0.9865, + "step": 12810 + }, + { + "epoch": 0.518615944961554, + "grad_norm": 1.1380903720855713, + "learning_rate": 4.9122444765641134e-05, + "loss": 0.9426, + "step": 12815 + }, + { + "epoch": 0.518818292189397, + "grad_norm": 1.1586993932724, + "learning_rate": 4.910179640718563e-05, + "loss": 1.057, + "step": 12820 + }, + { + "epoch": 0.51902063941724, + "grad_norm": 1.2551969289779663, + "learning_rate": 4.908114804873013e-05, + "loss": 1.0243, + "step": 12825 + }, + { + "epoch": 0.5192229866450829, + "grad_norm": 1.0013608932495117, + "learning_rate": 4.9060499690274626e-05, + "loss": 0.9527, + "step": 12830 + }, + { + "epoch": 0.5194253338729259, + "grad_norm": 1.2260642051696777, + "learning_rate": 4.903985133181912e-05, + "loss": 0.9735, + "step": 12835 + }, + { + "epoch": 0.519627681100769, + "grad_norm": 1.1197712421417236, + "learning_rate": 4.901920297336362e-05, + "loss": 0.99, + "step": 12840 + }, + { + "epoch": 0.5198300283286119, + "grad_norm": 1.198989748954773, + "learning_rate": 4.899855461490812e-05, + "loss": 0.9567, + "step": 12845 + }, + { + "epoch": 0.5200323755564549, + "grad_norm": 1.0800516605377197, + "learning_rate": 4.8977906256452615e-05, + "loss": 0.9871, + "step": 12850 + }, + { + "epoch": 0.5202347227842978, + "grad_norm": 1.0519342422485352, + "learning_rate": 4.895725789799711e-05, + "loss": 1.0305, + "step": 12855 + }, + { + "epoch": 0.5204370700121408, + "grad_norm": 1.2230823040008545, + "learning_rate": 4.893660953954161e-05, + "loss": 0.9869, + "step": 12860 + }, + { + "epoch": 0.5206394172399839, + "grad_norm": 1.1805706024169922, + "learning_rate": 4.891596118108611e-05, + "loss": 1.0341, + "step": 12865 + }, + { + "epoch": 0.5208417644678268, + "grad_norm": 1.1650760173797607, + "learning_rate": 4.8895312822630604e-05, + "loss": 0.9969, + "step": 12870 + }, + { + "epoch": 0.5210441116956698, + "grad_norm": 1.0749975442886353, + "learning_rate": 4.88746644641751e-05, + "loss": 1.0302, + "step": 12875 + }, + { + "epoch": 0.5212464589235127, + "grad_norm": 1.0182640552520752, + "learning_rate": 4.88540161057196e-05, + "loss": 0.9953, + "step": 12880 + }, + { + "epoch": 0.5214488061513557, + "grad_norm": 1.1881667375564575, + "learning_rate": 4.8833367747264096e-05, + "loss": 0.9945, + "step": 12885 + }, + { + "epoch": 0.5216511533791987, + "grad_norm": 1.1521013975143433, + "learning_rate": 4.8812719388808594e-05, + "loss": 1.0438, + "step": 12890 + }, + { + "epoch": 0.5218535006070417, + "grad_norm": 1.1074672937393188, + "learning_rate": 4.8792071030353084e-05, + "loss": 1.0017, + "step": 12895 + }, + { + "epoch": 0.5220558478348847, + "grad_norm": 1.1211540699005127, + "learning_rate": 4.877142267189759e-05, + "loss": 1.0537, + "step": 12900 + }, + { + "epoch": 0.5222581950627276, + "grad_norm": 1.1508073806762695, + "learning_rate": 4.8750774313442085e-05, + "loss": 0.978, + "step": 12905 + }, + { + "epoch": 0.5224605422905706, + "grad_norm": 1.1583364009857178, + "learning_rate": 4.873012595498658e-05, + "loss": 1.0321, + "step": 12910 + }, + { + "epoch": 0.5226628895184136, + "grad_norm": 1.2299565076828003, + "learning_rate": 4.870947759653108e-05, + "loss": 0.9972, + "step": 12915 + }, + { + "epoch": 0.5228652367462566, + "grad_norm": 1.1356695890426636, + "learning_rate": 4.868882923807557e-05, + "loss": 1.0134, + "step": 12920 + }, + { + "epoch": 0.5230675839740996, + "grad_norm": 1.2544360160827637, + "learning_rate": 4.8668180879620075e-05, + "loss": 1.0036, + "step": 12925 + }, + { + "epoch": 0.5232699312019425, + "grad_norm": 1.1607639789581299, + "learning_rate": 4.8647532521164565e-05, + "loss": 0.9904, + "step": 12930 + }, + { + "epoch": 0.5234722784297855, + "grad_norm": 1.2394320964813232, + "learning_rate": 4.862688416270907e-05, + "loss": 0.9547, + "step": 12935 + }, + { + "epoch": 0.5236746256576285, + "grad_norm": 1.1878379583358765, + "learning_rate": 4.8606235804253567e-05, + "loss": 0.9685, + "step": 12940 + }, + { + "epoch": 0.5238769728854714, + "grad_norm": 1.2129117250442505, + "learning_rate": 4.858558744579806e-05, + "loss": 1.0465, + "step": 12945 + }, + { + "epoch": 0.5240793201133145, + "grad_norm": 1.0647077560424805, + "learning_rate": 4.856493908734256e-05, + "loss": 0.9771, + "step": 12950 + }, + { + "epoch": 0.5242816673411574, + "grad_norm": 1.1161081790924072, + "learning_rate": 4.854429072888705e-05, + "loss": 1.0098, + "step": 12955 + }, + { + "epoch": 0.5244840145690004, + "grad_norm": 1.1606014966964722, + "learning_rate": 4.8523642370431556e-05, + "loss": 1.032, + "step": 12960 + }, + { + "epoch": 0.5246863617968434, + "grad_norm": 1.1828463077545166, + "learning_rate": 4.8502994011976046e-05, + "loss": 0.999, + "step": 12965 + }, + { + "epoch": 0.5248887090246863, + "grad_norm": 1.135551929473877, + "learning_rate": 4.848234565352055e-05, + "loss": 0.9996, + "step": 12970 + }, + { + "epoch": 0.5250910562525294, + "grad_norm": 1.2061209678649902, + "learning_rate": 4.846169729506505e-05, + "loss": 0.9967, + "step": 12975 + }, + { + "epoch": 0.5252934034803723, + "grad_norm": 1.1298943758010864, + "learning_rate": 4.844104893660954e-05, + "loss": 0.9926, + "step": 12980 + }, + { + "epoch": 0.5254957507082153, + "grad_norm": 1.1890193223953247, + "learning_rate": 4.842040057815404e-05, + "loss": 1.0114, + "step": 12985 + }, + { + "epoch": 0.5256980979360583, + "grad_norm": 1.0580012798309326, + "learning_rate": 4.839975221969853e-05, + "loss": 0.9898, + "step": 12990 + }, + { + "epoch": 0.5259004451639012, + "grad_norm": 1.1724084615707397, + "learning_rate": 4.837910386124304e-05, + "loss": 1.0239, + "step": 12995 + }, + { + "epoch": 0.5261027923917442, + "grad_norm": 1.0939544439315796, + "learning_rate": 4.8358455502787534e-05, + "loss": 1.073, + "step": 13000 + }, + { + "epoch": 0.5263051396195872, + "grad_norm": 1.1149251461029053, + "learning_rate": 4.8337807144332025e-05, + "loss": 0.9969, + "step": 13005 + }, + { + "epoch": 0.5265074868474302, + "grad_norm": 1.157451868057251, + "learning_rate": 4.831715878587653e-05, + "loss": 0.9862, + "step": 13010 + }, + { + "epoch": 0.5267098340752732, + "grad_norm": 1.1021413803100586, + "learning_rate": 4.829651042742102e-05, + "loss": 0.9423, + "step": 13015 + }, + { + "epoch": 0.5269121813031161, + "grad_norm": 1.1386486291885376, + "learning_rate": 4.827586206896552e-05, + "loss": 0.95, + "step": 13020 + }, + { + "epoch": 0.5271145285309591, + "grad_norm": 1.3019928932189941, + "learning_rate": 4.8255213710510014e-05, + "loss": 0.9973, + "step": 13025 + }, + { + "epoch": 0.5273168757588022, + "grad_norm": 1.1542195081710815, + "learning_rate": 4.823456535205451e-05, + "loss": 0.973, + "step": 13030 + }, + { + "epoch": 0.5275192229866451, + "grad_norm": 1.1593953371047974, + "learning_rate": 4.8213916993599015e-05, + "loss": 0.9795, + "step": 13035 + }, + { + "epoch": 0.5277215702144881, + "grad_norm": 1.2002090215682983, + "learning_rate": 4.8193268635143506e-05, + "loss": 1.0046, + "step": 13040 + }, + { + "epoch": 0.527923917442331, + "grad_norm": 1.2281185388565063, + "learning_rate": 4.817262027668801e-05, + "loss": 1.0708, + "step": 13045 + }, + { + "epoch": 0.528126264670174, + "grad_norm": 1.063997507095337, + "learning_rate": 4.81519719182325e-05, + "loss": 1.0381, + "step": 13050 + }, + { + "epoch": 0.5283286118980169, + "grad_norm": 1.196027159690857, + "learning_rate": 4.8131323559777e-05, + "loss": 0.9978, + "step": 13055 + }, + { + "epoch": 0.52853095912586, + "grad_norm": 1.3122018575668335, + "learning_rate": 4.8110675201321495e-05, + "loss": 0.9686, + "step": 13060 + }, + { + "epoch": 0.528733306353703, + "grad_norm": 1.0822436809539795, + "learning_rate": 4.809002684286599e-05, + "loss": 1.0321, + "step": 13065 + }, + { + "epoch": 0.5289356535815459, + "grad_norm": 1.1360067129135132, + "learning_rate": 4.8069378484410496e-05, + "loss": 0.9818, + "step": 13070 + }, + { + "epoch": 0.5291380008093889, + "grad_norm": 1.1602964401245117, + "learning_rate": 4.804873012595499e-05, + "loss": 0.9691, + "step": 13075 + }, + { + "epoch": 0.5293403480372318, + "grad_norm": 1.0667049884796143, + "learning_rate": 4.802808176749949e-05, + "loss": 0.9802, + "step": 13080 + }, + { + "epoch": 0.5295426952650749, + "grad_norm": 1.2503582239151, + "learning_rate": 4.800743340904398e-05, + "loss": 1.0567, + "step": 13085 + }, + { + "epoch": 0.5297450424929179, + "grad_norm": 1.201912522315979, + "learning_rate": 4.798678505058848e-05, + "loss": 1.0163, + "step": 13090 + }, + { + "epoch": 0.5299473897207608, + "grad_norm": 1.2826204299926758, + "learning_rate": 4.7966136692132976e-05, + "loss": 0.9855, + "step": 13095 + }, + { + "epoch": 0.5301497369486038, + "grad_norm": 1.1695324182510376, + "learning_rate": 4.794548833367747e-05, + "loss": 0.9906, + "step": 13100 + }, + { + "epoch": 0.5303520841764467, + "grad_norm": 1.195469617843628, + "learning_rate": 4.792483997522198e-05, + "loss": 1.0072, + "step": 13105 + }, + { + "epoch": 0.5305544314042897, + "grad_norm": 1.1526228189468384, + "learning_rate": 4.790419161676647e-05, + "loss": 0.9963, + "step": 13110 + }, + { + "epoch": 0.5307567786321328, + "grad_norm": 1.158942699432373, + "learning_rate": 4.7883543258310965e-05, + "loss": 1.0502, + "step": 13115 + }, + { + "epoch": 0.5309591258599757, + "grad_norm": 1.2206699848175049, + "learning_rate": 4.786289489985546e-05, + "loss": 1.0469, + "step": 13120 + }, + { + "epoch": 0.5311614730878187, + "grad_norm": 1.1328481435775757, + "learning_rate": 4.784224654139996e-05, + "loss": 0.976, + "step": 13125 + }, + { + "epoch": 0.5313638203156617, + "grad_norm": 1.1425429582595825, + "learning_rate": 4.782159818294446e-05, + "loss": 1.0228, + "step": 13130 + }, + { + "epoch": 0.5315661675435046, + "grad_norm": 1.1963837146759033, + "learning_rate": 4.7800949824488954e-05, + "loss": 0.9743, + "step": 13135 + }, + { + "epoch": 0.5317685147713477, + "grad_norm": 1.151746153831482, + "learning_rate": 4.778030146603345e-05, + "loss": 1.0042, + "step": 13140 + }, + { + "epoch": 0.5319708619991906, + "grad_norm": 1.2935644388198853, + "learning_rate": 4.775965310757795e-05, + "loss": 1.0134, + "step": 13145 + }, + { + "epoch": 0.5321732092270336, + "grad_norm": 1.2662339210510254, + "learning_rate": 4.7739004749122446e-05, + "loss": 1.0167, + "step": 13150 + }, + { + "epoch": 0.5323755564548766, + "grad_norm": 1.1560618877410889, + "learning_rate": 4.7718356390666944e-05, + "loss": 0.9559, + "step": 13155 + }, + { + "epoch": 0.5325779036827195, + "grad_norm": 1.0124341249465942, + "learning_rate": 4.769770803221144e-05, + "loss": 1.009, + "step": 13160 + }, + { + "epoch": 0.5327802509105626, + "grad_norm": 1.2502763271331787, + "learning_rate": 4.767705967375594e-05, + "loss": 0.9563, + "step": 13165 + }, + { + "epoch": 0.5329825981384055, + "grad_norm": 1.1667861938476562, + "learning_rate": 4.7656411315300435e-05, + "loss": 1.0503, + "step": 13170 + }, + { + "epoch": 0.5331849453662485, + "grad_norm": 1.2142013311386108, + "learning_rate": 4.763576295684493e-05, + "loss": 0.9992, + "step": 13175 + }, + { + "epoch": 0.5333872925940915, + "grad_norm": 1.1926441192626953, + "learning_rate": 4.761511459838943e-05, + "loss": 0.9737, + "step": 13180 + }, + { + "epoch": 0.5335896398219344, + "grad_norm": 1.1604886054992676, + "learning_rate": 4.759446623993393e-05, + "loss": 1.0423, + "step": 13185 + }, + { + "epoch": 0.5337919870497774, + "grad_norm": 1.208500623703003, + "learning_rate": 4.7573817881478425e-05, + "loss": 0.9808, + "step": 13190 + }, + { + "epoch": 0.5339943342776204, + "grad_norm": 1.0896100997924805, + "learning_rate": 4.755316952302292e-05, + "loss": 0.9782, + "step": 13195 + }, + { + "epoch": 0.5341966815054634, + "grad_norm": 1.2205355167388916, + "learning_rate": 4.753252116456742e-05, + "loss": 1.0097, + "step": 13200 + }, + { + "epoch": 0.5343990287333064, + "grad_norm": 1.2296879291534424, + "learning_rate": 4.7511872806111917e-05, + "loss": 1.0112, + "step": 13205 + }, + { + "epoch": 0.5346013759611493, + "grad_norm": 1.1248998641967773, + "learning_rate": 4.7491224447656414e-05, + "loss": 0.9598, + "step": 13210 + }, + { + "epoch": 0.5348037231889923, + "grad_norm": 1.1658109426498413, + "learning_rate": 4.747057608920091e-05, + "loss": 1.0401, + "step": 13215 + }, + { + "epoch": 0.5350060704168353, + "grad_norm": 1.14699387550354, + "learning_rate": 4.744992773074541e-05, + "loss": 1.0084, + "step": 13220 + }, + { + "epoch": 0.5352084176446783, + "grad_norm": 1.141981601715088, + "learning_rate": 4.7429279372289906e-05, + "loss": 0.964, + "step": 13225 + }, + { + "epoch": 0.5354107648725213, + "grad_norm": 1.2816085815429688, + "learning_rate": 4.74086310138344e-05, + "loss": 1.0027, + "step": 13230 + }, + { + "epoch": 0.5356131121003642, + "grad_norm": 1.151843547821045, + "learning_rate": 4.73879826553789e-05, + "loss": 0.988, + "step": 13235 + }, + { + "epoch": 0.5358154593282072, + "grad_norm": 1.249070405960083, + "learning_rate": 4.73673342969234e-05, + "loss": 0.971, + "step": 13240 + }, + { + "epoch": 0.5360178065560501, + "grad_norm": 1.0401089191436768, + "learning_rate": 4.7346685938467895e-05, + "loss": 0.9856, + "step": 13245 + }, + { + "epoch": 0.5362201537838932, + "grad_norm": 1.2624151706695557, + "learning_rate": 4.732603758001239e-05, + "loss": 0.9866, + "step": 13250 + }, + { + "epoch": 0.5364225010117362, + "grad_norm": 1.4663299322128296, + "learning_rate": 4.730538922155689e-05, + "loss": 1.0089, + "step": 13255 + }, + { + "epoch": 0.5366248482395791, + "grad_norm": 1.167694330215454, + "learning_rate": 4.728474086310139e-05, + "loss": 0.9083, + "step": 13260 + }, + { + "epoch": 0.5368271954674221, + "grad_norm": 1.2311296463012695, + "learning_rate": 4.7264092504645884e-05, + "loss": 0.9911, + "step": 13265 + }, + { + "epoch": 0.537029542695265, + "grad_norm": 1.1577832698822021, + "learning_rate": 4.724344414619038e-05, + "loss": 0.9533, + "step": 13270 + }, + { + "epoch": 0.5372318899231081, + "grad_norm": 1.2128522396087646, + "learning_rate": 4.722279578773487e-05, + "loss": 0.997, + "step": 13275 + }, + { + "epoch": 0.5374342371509511, + "grad_norm": 1.1526645421981812, + "learning_rate": 4.7202147429279376e-05, + "loss": 0.9862, + "step": 13280 + }, + { + "epoch": 0.537636584378794, + "grad_norm": 1.2666778564453125, + "learning_rate": 4.718149907082387e-05, + "loss": 0.9863, + "step": 13285 + }, + { + "epoch": 0.537838931606637, + "grad_norm": 1.4724574089050293, + "learning_rate": 4.716085071236837e-05, + "loss": 0.982, + "step": 13290 + }, + { + "epoch": 0.5380412788344799, + "grad_norm": 1.155213713645935, + "learning_rate": 4.714020235391287e-05, + "loss": 0.9791, + "step": 13295 + }, + { + "epoch": 0.5382436260623229, + "grad_norm": 1.1476895809173584, + "learning_rate": 4.7119553995457365e-05, + "loss": 0.9988, + "step": 13300 + }, + { + "epoch": 0.538445973290166, + "grad_norm": 1.1798290014266968, + "learning_rate": 4.709890563700186e-05, + "loss": 1.0468, + "step": 13305 + }, + { + "epoch": 0.5386483205180089, + "grad_norm": 1.0469969511032104, + "learning_rate": 4.707825727854635e-05, + "loss": 0.9303, + "step": 13310 + }, + { + "epoch": 0.5388506677458519, + "grad_norm": 1.2639477252960205, + "learning_rate": 4.705760892009086e-05, + "loss": 0.9489, + "step": 13315 + }, + { + "epoch": 0.5390530149736948, + "grad_norm": 1.20005464553833, + "learning_rate": 4.7036960561635354e-05, + "loss": 1.0058, + "step": 13320 + }, + { + "epoch": 0.5392553622015378, + "grad_norm": 1.1812065839767456, + "learning_rate": 4.701631220317985e-05, + "loss": 0.9417, + "step": 13325 + }, + { + "epoch": 0.5394577094293809, + "grad_norm": 1.2369160652160645, + "learning_rate": 4.699566384472435e-05, + "loss": 1.0068, + "step": 13330 + }, + { + "epoch": 0.5396600566572238, + "grad_norm": 1.1857260465621948, + "learning_rate": 4.697501548626884e-05, + "loss": 0.9527, + "step": 13335 + }, + { + "epoch": 0.5398624038850668, + "grad_norm": 1.1948858499526978, + "learning_rate": 4.6954367127813344e-05, + "loss": 1.0241, + "step": 13340 + }, + { + "epoch": 0.5400647511129097, + "grad_norm": 1.1948617696762085, + "learning_rate": 4.6933718769357834e-05, + "loss": 0.9988, + "step": 13345 + }, + { + "epoch": 0.5402670983407527, + "grad_norm": 1.1652532815933228, + "learning_rate": 4.691307041090234e-05, + "loss": 1.051, + "step": 13350 + }, + { + "epoch": 0.5404694455685957, + "grad_norm": 1.1360259056091309, + "learning_rate": 4.6892422052446835e-05, + "loss": 0.965, + "step": 13355 + }, + { + "epoch": 0.5406717927964387, + "grad_norm": 1.151434302330017, + "learning_rate": 4.6871773693991326e-05, + "loss": 1.0157, + "step": 13360 + }, + { + "epoch": 0.5408741400242817, + "grad_norm": 1.175532579421997, + "learning_rate": 4.685112533553583e-05, + "loss": 0.9648, + "step": 13365 + }, + { + "epoch": 0.5410764872521246, + "grad_norm": 1.2540494203567505, + "learning_rate": 4.683047697708032e-05, + "loss": 1.0434, + "step": 13370 + }, + { + "epoch": 0.5412788344799676, + "grad_norm": 1.1384494304656982, + "learning_rate": 4.6809828618624825e-05, + "loss": 1.0127, + "step": 13375 + }, + { + "epoch": 0.5414811817078106, + "grad_norm": 1.1299713850021362, + "learning_rate": 4.678918026016932e-05, + "loss": 0.982, + "step": 13380 + }, + { + "epoch": 0.5416835289356536, + "grad_norm": 1.1605921983718872, + "learning_rate": 4.676853190171381e-05, + "loss": 1.0333, + "step": 13385 + }, + { + "epoch": 0.5418858761634966, + "grad_norm": 1.2055753469467163, + "learning_rate": 4.6747883543258316e-05, + "loss": 1.0047, + "step": 13390 + }, + { + "epoch": 0.5420882233913396, + "grad_norm": 1.1855299472808838, + "learning_rate": 4.672723518480281e-05, + "loss": 0.9953, + "step": 13395 + }, + { + "epoch": 0.5422905706191825, + "grad_norm": 1.2290223836898804, + "learning_rate": 4.670658682634731e-05, + "loss": 0.9913, + "step": 13400 + }, + { + "epoch": 0.5424929178470255, + "grad_norm": 1.1520344018936157, + "learning_rate": 4.66859384678918e-05, + "loss": 1.0972, + "step": 13405 + }, + { + "epoch": 0.5426952650748684, + "grad_norm": 1.0018000602722168, + "learning_rate": 4.66652901094363e-05, + "loss": 0.9873, + "step": 13410 + }, + { + "epoch": 0.5428976123027115, + "grad_norm": 1.1549125909805298, + "learning_rate": 4.66446417509808e-05, + "loss": 1.0019, + "step": 13415 + }, + { + "epoch": 0.5430999595305545, + "grad_norm": 1.1142404079437256, + "learning_rate": 4.6623993392525293e-05, + "loss": 0.9615, + "step": 13420 + }, + { + "epoch": 0.5433023067583974, + "grad_norm": 1.4011167287826538, + "learning_rate": 4.66033450340698e-05, + "loss": 1.0121, + "step": 13425 + }, + { + "epoch": 0.5435046539862404, + "grad_norm": 1.1823188066482544, + "learning_rate": 4.658269667561429e-05, + "loss": 0.9744, + "step": 13430 + }, + { + "epoch": 0.5437070012140833, + "grad_norm": 1.189070463180542, + "learning_rate": 4.656204831715879e-05, + "loss": 1.0399, + "step": 13435 + }, + { + "epoch": 0.5439093484419264, + "grad_norm": 1.09059476852417, + "learning_rate": 4.654139995870328e-05, + "loss": 1.0146, + "step": 13440 + }, + { + "epoch": 0.5441116956697694, + "grad_norm": 1.1695646047592163, + "learning_rate": 4.652075160024778e-05, + "loss": 0.9673, + "step": 13445 + }, + { + "epoch": 0.5443140428976123, + "grad_norm": 1.3173099756240845, + "learning_rate": 4.6500103241792284e-05, + "loss": 1.0139, + "step": 13450 + }, + { + "epoch": 0.5445163901254553, + "grad_norm": 1.3592511415481567, + "learning_rate": 4.6479454883336775e-05, + "loss": 0.9623, + "step": 13455 + }, + { + "epoch": 0.5447187373532982, + "grad_norm": 1.2108453512191772, + "learning_rate": 4.645880652488128e-05, + "loss": 1.0226, + "step": 13460 + }, + { + "epoch": 0.5449210845811412, + "grad_norm": 1.280344843864441, + "learning_rate": 4.643815816642577e-05, + "loss": 0.9857, + "step": 13465 + }, + { + "epoch": 0.5451234318089843, + "grad_norm": 1.1430131196975708, + "learning_rate": 4.6417509807970266e-05, + "loss": 0.9474, + "step": 13470 + }, + { + "epoch": 0.5453257790368272, + "grad_norm": 1.2163547277450562, + "learning_rate": 4.6396861449514764e-05, + "loss": 0.9842, + "step": 13475 + }, + { + "epoch": 0.5455281262646702, + "grad_norm": 1.1540495157241821, + "learning_rate": 4.637621309105926e-05, + "loss": 0.9969, + "step": 13480 + }, + { + "epoch": 0.5457304734925131, + "grad_norm": 1.2115650177001953, + "learning_rate": 4.6355564732603765e-05, + "loss": 0.981, + "step": 13485 + }, + { + "epoch": 0.5459328207203561, + "grad_norm": 1.1425132751464844, + "learning_rate": 4.6334916374148256e-05, + "loss": 0.9518, + "step": 13490 + }, + { + "epoch": 0.5461351679481992, + "grad_norm": 1.1303656101226807, + "learning_rate": 4.631426801569275e-05, + "loss": 1.0307, + "step": 13495 + }, + { + "epoch": 0.5463375151760421, + "grad_norm": 1.317814826965332, + "learning_rate": 4.629361965723725e-05, + "loss": 1.0072, + "step": 13500 + }, + { + "epoch": 0.5465398624038851, + "grad_norm": 1.2411702871322632, + "learning_rate": 4.627297129878175e-05, + "loss": 0.9987, + "step": 13505 + }, + { + "epoch": 0.546742209631728, + "grad_norm": 1.2443106174468994, + "learning_rate": 4.6252322940326245e-05, + "loss": 0.9274, + "step": 13510 + }, + { + "epoch": 0.546944556859571, + "grad_norm": 1.24956214427948, + "learning_rate": 4.623167458187074e-05, + "loss": 1.0074, + "step": 13515 + }, + { + "epoch": 0.547146904087414, + "grad_norm": 1.0481141805648804, + "learning_rate": 4.621102622341524e-05, + "loss": 0.9905, + "step": 13520 + }, + { + "epoch": 0.547349251315257, + "grad_norm": 1.0809906721115112, + "learning_rate": 4.619037786495974e-05, + "loss": 1.0201, + "step": 13525 + }, + { + "epoch": 0.5475515985431, + "grad_norm": 1.2640081644058228, + "learning_rate": 4.6169729506504234e-05, + "loss": 0.9462, + "step": 13530 + }, + { + "epoch": 0.5477539457709429, + "grad_norm": 1.1775264739990234, + "learning_rate": 4.614908114804873e-05, + "loss": 0.9548, + "step": 13535 + }, + { + "epoch": 0.5479562929987859, + "grad_norm": 1.2151538133621216, + "learning_rate": 4.612843278959323e-05, + "loss": 0.9686, + "step": 13540 + }, + { + "epoch": 0.5481586402266289, + "grad_norm": 1.1137951612472534, + "learning_rate": 4.610778443113773e-05, + "loss": 0.9793, + "step": 13545 + }, + { + "epoch": 0.5483609874544719, + "grad_norm": 1.1605082750320435, + "learning_rate": 4.608713607268222e-05, + "loss": 0.9889, + "step": 13550 + }, + { + "epoch": 0.5485633346823149, + "grad_norm": 1.1585845947265625, + "learning_rate": 4.606648771422672e-05, + "loss": 1.0128, + "step": 13555 + }, + { + "epoch": 0.5487656819101578, + "grad_norm": 1.0791916847229004, + "learning_rate": 4.604583935577122e-05, + "loss": 0.9795, + "step": 13560 + }, + { + "epoch": 0.5489680291380008, + "grad_norm": 1.255650520324707, + "learning_rate": 4.6025190997315715e-05, + "loss": 0.9925, + "step": 13565 + }, + { + "epoch": 0.5491703763658438, + "grad_norm": 1.1521662473678589, + "learning_rate": 4.600454263886021e-05, + "loss": 1.0126, + "step": 13570 + }, + { + "epoch": 0.5493727235936867, + "grad_norm": 1.1036776304244995, + "learning_rate": 4.598389428040471e-05, + "loss": 1.0461, + "step": 13575 + }, + { + "epoch": 0.5495750708215298, + "grad_norm": 1.139861822128296, + "learning_rate": 4.596324592194921e-05, + "loss": 0.9958, + "step": 13580 + }, + { + "epoch": 0.5497774180493727, + "grad_norm": 1.1681941747665405, + "learning_rate": 4.5942597563493704e-05, + "loss": 0.9831, + "step": 13585 + }, + { + "epoch": 0.5499797652772157, + "grad_norm": 1.1199458837509155, + "learning_rate": 4.59219492050382e-05, + "loss": 1.0242, + "step": 13590 + }, + { + "epoch": 0.5501821125050587, + "grad_norm": 1.1740652322769165, + "learning_rate": 4.59013008465827e-05, + "loss": 0.9959, + "step": 13595 + }, + { + "epoch": 0.5503844597329016, + "grad_norm": 1.0760498046875, + "learning_rate": 4.5880652488127196e-05, + "loss": 0.9597, + "step": 13600 + }, + { + "epoch": 0.5505868069607447, + "grad_norm": 1.0998018980026245, + "learning_rate": 4.5860004129671693e-05, + "loss": 1.0328, + "step": 13605 + }, + { + "epoch": 0.5507891541885876, + "grad_norm": 1.2137548923492432, + "learning_rate": 4.583935577121619e-05, + "loss": 1.0383, + "step": 13610 + }, + { + "epoch": 0.5509915014164306, + "grad_norm": 1.1961240768432617, + "learning_rate": 4.581870741276069e-05, + "loss": 0.9823, + "step": 13615 + }, + { + "epoch": 0.5511938486442736, + "grad_norm": 1.2069047689437866, + "learning_rate": 4.5798059054305185e-05, + "loss": 1.0187, + "step": 13620 + }, + { + "epoch": 0.5513961958721165, + "grad_norm": 1.1343979835510254, + "learning_rate": 4.577741069584968e-05, + "loss": 1.0032, + "step": 13625 + }, + { + "epoch": 0.5515985430999595, + "grad_norm": 1.1684554815292358, + "learning_rate": 4.575676233739418e-05, + "loss": 1.0025, + "step": 13630 + }, + { + "epoch": 0.5518008903278026, + "grad_norm": 1.1609210968017578, + "learning_rate": 4.573611397893868e-05, + "loss": 1.0775, + "step": 13635 + }, + { + "epoch": 0.5520032375556455, + "grad_norm": 1.1721374988555908, + "learning_rate": 4.5715465620483175e-05, + "loss": 0.9471, + "step": 13640 + }, + { + "epoch": 0.5522055847834885, + "grad_norm": 1.2115263938903809, + "learning_rate": 4.569481726202767e-05, + "loss": 1.0247, + "step": 13645 + }, + { + "epoch": 0.5524079320113314, + "grad_norm": 1.1860140562057495, + "learning_rate": 4.567416890357217e-05, + "loss": 1.0428, + "step": 13650 + }, + { + "epoch": 0.5526102792391744, + "grad_norm": 1.2308109998703003, + "learning_rate": 4.5653520545116666e-05, + "loss": 1.0178, + "step": 13655 + }, + { + "epoch": 0.5528126264670175, + "grad_norm": 1.2241133451461792, + "learning_rate": 4.5632872186661164e-05, + "loss": 1.0624, + "step": 13660 + }, + { + "epoch": 0.5530149736948604, + "grad_norm": 1.2302842140197754, + "learning_rate": 4.561222382820566e-05, + "loss": 0.9859, + "step": 13665 + }, + { + "epoch": 0.5532173209227034, + "grad_norm": 1.160462737083435, + "learning_rate": 4.559157546975016e-05, + "loss": 0.9593, + "step": 13670 + }, + { + "epoch": 0.5534196681505463, + "grad_norm": 1.130302906036377, + "learning_rate": 4.5570927111294656e-05, + "loss": 1.0348, + "step": 13675 + }, + { + "epoch": 0.5536220153783893, + "grad_norm": 1.1454074382781982, + "learning_rate": 4.555027875283915e-05, + "loss": 1.0393, + "step": 13680 + }, + { + "epoch": 0.5538243626062322, + "grad_norm": 1.1717177629470825, + "learning_rate": 4.552963039438365e-05, + "loss": 1.0357, + "step": 13685 + }, + { + "epoch": 0.5540267098340753, + "grad_norm": 1.2080857753753662, + "learning_rate": 4.550898203592814e-05, + "loss": 1.0632, + "step": 13690 + }, + { + "epoch": 0.5542290570619183, + "grad_norm": 1.1342897415161133, + "learning_rate": 4.5488333677472645e-05, + "loss": 0.9912, + "step": 13695 + }, + { + "epoch": 0.5544314042897612, + "grad_norm": 1.120079517364502, + "learning_rate": 4.546768531901714e-05, + "loss": 1.0341, + "step": 13700 + }, + { + "epoch": 0.5546337515176042, + "grad_norm": 1.1466180086135864, + "learning_rate": 4.544703696056164e-05, + "loss": 1.0419, + "step": 13705 + }, + { + "epoch": 0.5548360987454471, + "grad_norm": 1.118067741394043, + "learning_rate": 4.542638860210614e-05, + "loss": 1.0625, + "step": 13710 + }, + { + "epoch": 0.5550384459732902, + "grad_norm": 1.2374582290649414, + "learning_rate": 4.540574024365063e-05, + "loss": 1.021, + "step": 13715 + }, + { + "epoch": 0.5552407932011332, + "grad_norm": 1.3281618356704712, + "learning_rate": 4.538509188519513e-05, + "loss": 0.9659, + "step": 13720 + }, + { + "epoch": 0.5554431404289761, + "grad_norm": 1.2413415908813477, + "learning_rate": 4.536444352673962e-05, + "loss": 1.0318, + "step": 13725 + }, + { + "epoch": 0.5556454876568191, + "grad_norm": 1.1204394102096558, + "learning_rate": 4.5343795168284126e-05, + "loss": 0.9587, + "step": 13730 + }, + { + "epoch": 0.555847834884662, + "grad_norm": 1.3070437908172607, + "learning_rate": 4.532314680982862e-05, + "loss": 1.0923, + "step": 13735 + }, + { + "epoch": 0.556050182112505, + "grad_norm": 1.3235679864883423, + "learning_rate": 4.5302498451373114e-05, + "loss": 1.0561, + "step": 13740 + }, + { + "epoch": 0.5562525293403481, + "grad_norm": 1.1827317476272583, + "learning_rate": 4.528185009291762e-05, + "loss": 1.0212, + "step": 13745 + }, + { + "epoch": 0.556454876568191, + "grad_norm": 1.1174043416976929, + "learning_rate": 4.526120173446211e-05, + "loss": 1.0995, + "step": 13750 + }, + { + "epoch": 0.556657223796034, + "grad_norm": 1.128941535949707, + "learning_rate": 4.524055337600661e-05, + "loss": 1.0136, + "step": 13755 + }, + { + "epoch": 0.556859571023877, + "grad_norm": 1.1892149448394775, + "learning_rate": 4.521990501755111e-05, + "loss": 1.0523, + "step": 13760 + }, + { + "epoch": 0.5570619182517199, + "grad_norm": 1.0689456462860107, + "learning_rate": 4.519925665909561e-05, + "loss": 1.0007, + "step": 13765 + }, + { + "epoch": 0.557264265479563, + "grad_norm": 1.2329188585281372, + "learning_rate": 4.5178608300640104e-05, + "loss": 1.0157, + "step": 13770 + }, + { + "epoch": 0.5574666127074059, + "grad_norm": 1.202246069908142, + "learning_rate": 4.5157959942184595e-05, + "loss": 1.0, + "step": 13775 + }, + { + "epoch": 0.5576689599352489, + "grad_norm": 1.1702176332473755, + "learning_rate": 4.51373115837291e-05, + "loss": 1.0001, + "step": 13780 + }, + { + "epoch": 0.5578713071630919, + "grad_norm": 1.073132038116455, + "learning_rate": 4.511666322527359e-05, + "loss": 1.012, + "step": 13785 + }, + { + "epoch": 0.5580736543909348, + "grad_norm": 1.2309406995773315, + "learning_rate": 4.5096014866818093e-05, + "loss": 1.0682, + "step": 13790 + }, + { + "epoch": 0.5582760016187778, + "grad_norm": 1.0988270044326782, + "learning_rate": 4.507536650836259e-05, + "loss": 1.0334, + "step": 13795 + }, + { + "epoch": 0.5584783488466208, + "grad_norm": 1.1950405836105347, + "learning_rate": 4.505471814990708e-05, + "loss": 0.9594, + "step": 13800 + }, + { + "epoch": 0.5586806960744638, + "grad_norm": 1.2276102304458618, + "learning_rate": 4.5034069791451585e-05, + "loss": 0.9512, + "step": 13805 + }, + { + "epoch": 0.5588830433023068, + "grad_norm": 1.2122454643249512, + "learning_rate": 4.5013421432996076e-05, + "loss": 0.9706, + "step": 13810 + }, + { + "epoch": 0.5590853905301497, + "grad_norm": 1.043339490890503, + "learning_rate": 4.499277307454058e-05, + "loss": 0.9137, + "step": 13815 + }, + { + "epoch": 0.5592877377579927, + "grad_norm": 1.2296773195266724, + "learning_rate": 4.497212471608507e-05, + "loss": 1.0002, + "step": 13820 + }, + { + "epoch": 0.5594900849858357, + "grad_norm": 1.1996498107910156, + "learning_rate": 4.495147635762957e-05, + "loss": 0.9988, + "step": 13825 + }, + { + "epoch": 0.5596924322136787, + "grad_norm": 1.2913368940353394, + "learning_rate": 4.493082799917407e-05, + "loss": 1.0247, + "step": 13830 + }, + { + "epoch": 0.5598947794415217, + "grad_norm": 1.0832664966583252, + "learning_rate": 4.491017964071856e-05, + "loss": 0.962, + "step": 13835 + }, + { + "epoch": 0.5600971266693646, + "grad_norm": 1.238834261894226, + "learning_rate": 4.4889531282263066e-05, + "loss": 0.9866, + "step": 13840 + }, + { + "epoch": 0.5602994738972076, + "grad_norm": 1.069636344909668, + "learning_rate": 4.486888292380756e-05, + "loss": 0.9986, + "step": 13845 + }, + { + "epoch": 0.5605018211250505, + "grad_norm": 1.164320707321167, + "learning_rate": 4.4848234565352054e-05, + "loss": 0.9916, + "step": 13850 + }, + { + "epoch": 0.5607041683528936, + "grad_norm": 1.1442533731460571, + "learning_rate": 4.482758620689655e-05, + "loss": 1.0067, + "step": 13855 + }, + { + "epoch": 0.5609065155807366, + "grad_norm": 1.1331062316894531, + "learning_rate": 4.480693784844105e-05, + "loss": 0.9219, + "step": 13860 + }, + { + "epoch": 0.5611088628085795, + "grad_norm": 1.0536301136016846, + "learning_rate": 4.478628948998555e-05, + "loss": 1.0004, + "step": 13865 + }, + { + "epoch": 0.5613112100364225, + "grad_norm": 1.2179696559906006, + "learning_rate": 4.4765641131530043e-05, + "loss": 1.0181, + "step": 13870 + }, + { + "epoch": 0.5615135572642654, + "grad_norm": 1.1594337224960327, + "learning_rate": 4.474499277307454e-05, + "loss": 0.9524, + "step": 13875 + }, + { + "epoch": 0.5617159044921085, + "grad_norm": 1.311442494392395, + "learning_rate": 4.472434441461904e-05, + "loss": 1.0565, + "step": 13880 + }, + { + "epoch": 0.5619182517199515, + "grad_norm": 1.1441835165023804, + "learning_rate": 4.4703696056163535e-05, + "loss": 0.9707, + "step": 13885 + }, + { + "epoch": 0.5621205989477944, + "grad_norm": 2.9216387271881104, + "learning_rate": 4.468304769770803e-05, + "loss": 1.0217, + "step": 13890 + }, + { + "epoch": 0.5623229461756374, + "grad_norm": 1.1903290748596191, + "learning_rate": 4.466239933925253e-05, + "loss": 1.0293, + "step": 13895 + }, + { + "epoch": 0.5625252934034803, + "grad_norm": 1.1540318727493286, + "learning_rate": 4.4641750980797034e-05, + "loss": 0.9939, + "step": 13900 + }, + { + "epoch": 0.5627276406313233, + "grad_norm": 1.2955759763717651, + "learning_rate": 4.4621102622341525e-05, + "loss": 0.979, + "step": 13905 + }, + { + "epoch": 0.5629299878591664, + "grad_norm": 1.2096494436264038, + "learning_rate": 4.460045426388602e-05, + "loss": 0.9966, + "step": 13910 + }, + { + "epoch": 0.5631323350870093, + "grad_norm": 1.1765676736831665, + "learning_rate": 4.457980590543052e-05, + "loss": 0.998, + "step": 13915 + }, + { + "epoch": 0.5633346823148523, + "grad_norm": 1.1927794218063354, + "learning_rate": 4.4559157546975016e-05, + "loss": 1.0851, + "step": 13920 + }, + { + "epoch": 0.5635370295426952, + "grad_norm": 1.1235953569412231, + "learning_rate": 4.453850918851952e-05, + "loss": 0.9892, + "step": 13925 + }, + { + "epoch": 0.5637393767705382, + "grad_norm": 1.101344347000122, + "learning_rate": 4.451786083006401e-05, + "loss": 0.9586, + "step": 13930 + }, + { + "epoch": 0.5639417239983813, + "grad_norm": 1.2288463115692139, + "learning_rate": 4.449721247160851e-05, + "loss": 0.9964, + "step": 13935 + }, + { + "epoch": 0.5641440712262242, + "grad_norm": 1.2437199354171753, + "learning_rate": 4.4476564113153006e-05, + "loss": 1.0403, + "step": 13940 + }, + { + "epoch": 0.5643464184540672, + "grad_norm": 1.0780836343765259, + "learning_rate": 4.44559157546975e-05, + "loss": 1.0403, + "step": 13945 + }, + { + "epoch": 0.5645487656819101, + "grad_norm": 1.2592697143554688, + "learning_rate": 4.4435267396242e-05, + "loss": 1.0537, + "step": 13950 + }, + { + "epoch": 0.5647511129097531, + "grad_norm": 1.2178301811218262, + "learning_rate": 4.44146190377865e-05, + "loss": 1.0226, + "step": 13955 + }, + { + "epoch": 0.5649534601375961, + "grad_norm": 1.1477197408676147, + "learning_rate": 4.4393970679330995e-05, + "loss": 0.9814, + "step": 13960 + }, + { + "epoch": 0.5651558073654391, + "grad_norm": 1.1132920980453491, + "learning_rate": 4.437332232087549e-05, + "loss": 0.9875, + "step": 13965 + }, + { + "epoch": 0.5653581545932821, + "grad_norm": 1.2448670864105225, + "learning_rate": 4.435267396241999e-05, + "loss": 1.0072, + "step": 13970 + }, + { + "epoch": 0.565560501821125, + "grad_norm": 1.215716004371643, + "learning_rate": 4.433202560396449e-05, + "loss": 0.9826, + "step": 13975 + }, + { + "epoch": 0.565762849048968, + "grad_norm": 1.107932686805725, + "learning_rate": 4.4311377245508984e-05, + "loss": 0.9905, + "step": 13980 + }, + { + "epoch": 0.565965196276811, + "grad_norm": 1.1962240934371948, + "learning_rate": 4.429072888705348e-05, + "loss": 0.9612, + "step": 13985 + }, + { + "epoch": 0.566167543504654, + "grad_norm": 1.2366198301315308, + "learning_rate": 4.427008052859798e-05, + "loss": 0.968, + "step": 13990 + }, + { + "epoch": 0.566369890732497, + "grad_norm": 1.2942988872528076, + "learning_rate": 4.4249432170142476e-05, + "loss": 0.9525, + "step": 13995 + }, + { + "epoch": 0.56657223796034, + "grad_norm": 1.1409313678741455, + "learning_rate": 4.422878381168697e-05, + "loss": 1.0099, + "step": 14000 + }, + { + "epoch": 0.5667745851881829, + "grad_norm": 1.1154898405075073, + "learning_rate": 4.420813545323147e-05, + "loss": 1.0386, + "step": 14005 + }, + { + "epoch": 0.5669769324160259, + "grad_norm": 1.2833094596862793, + "learning_rate": 4.418748709477597e-05, + "loss": 0.9946, + "step": 14010 + }, + { + "epoch": 0.5671792796438688, + "grad_norm": 1.1053303480148315, + "learning_rate": 4.4166838736320465e-05, + "loss": 1.0222, + "step": 14015 + }, + { + "epoch": 0.5673816268717119, + "grad_norm": 1.2782630920410156, + "learning_rate": 4.414619037786496e-05, + "loss": 0.963, + "step": 14020 + }, + { + "epoch": 0.5675839740995549, + "grad_norm": 1.139122486114502, + "learning_rate": 4.412554201940946e-05, + "loss": 0.949, + "step": 14025 + }, + { + "epoch": 0.5677863213273978, + "grad_norm": 1.188439965248108, + "learning_rate": 4.410489366095396e-05, + "loss": 1.0107, + "step": 14030 + }, + { + "epoch": 0.5679886685552408, + "grad_norm": 1.1669381856918335, + "learning_rate": 4.4084245302498454e-05, + "loss": 0.9895, + "step": 14035 + }, + { + "epoch": 0.5681910157830837, + "grad_norm": 1.2100615501403809, + "learning_rate": 4.406359694404295e-05, + "loss": 0.9938, + "step": 14040 + }, + { + "epoch": 0.5683933630109268, + "grad_norm": 1.2322107553482056, + "learning_rate": 4.404294858558745e-05, + "loss": 1.0095, + "step": 14045 + }, + { + "epoch": 0.5685957102387698, + "grad_norm": 1.2963682413101196, + "learning_rate": 4.4022300227131946e-05, + "loss": 1.0473, + "step": 14050 + }, + { + "epoch": 0.5687980574666127, + "grad_norm": 1.2882816791534424, + "learning_rate": 4.4001651868676443e-05, + "loss": 0.9555, + "step": 14055 + }, + { + "epoch": 0.5690004046944557, + "grad_norm": 1.2242742776870728, + "learning_rate": 4.398100351022094e-05, + "loss": 0.9798, + "step": 14060 + }, + { + "epoch": 0.5692027519222986, + "grad_norm": 1.204200029373169, + "learning_rate": 4.396035515176544e-05, + "loss": 0.9869, + "step": 14065 + }, + { + "epoch": 0.5694050991501416, + "grad_norm": 1.1855534315109253, + "learning_rate": 4.393970679330993e-05, + "loss": 1.0247, + "step": 14070 + }, + { + "epoch": 0.5696074463779847, + "grad_norm": 1.0950124263763428, + "learning_rate": 4.391905843485443e-05, + "loss": 0.9799, + "step": 14075 + }, + { + "epoch": 0.5698097936058276, + "grad_norm": 1.1773353815078735, + "learning_rate": 4.389841007639893e-05, + "loss": 0.9764, + "step": 14080 + }, + { + "epoch": 0.5700121408336706, + "grad_norm": 1.2206952571868896, + "learning_rate": 4.387776171794343e-05, + "loss": 0.9877, + "step": 14085 + }, + { + "epoch": 0.5702144880615135, + "grad_norm": 1.174280047416687, + "learning_rate": 4.3857113359487924e-05, + "loss": 0.9823, + "step": 14090 + }, + { + "epoch": 0.5704168352893565, + "grad_norm": 1.272633671760559, + "learning_rate": 4.3836465001032415e-05, + "loss": 1.021, + "step": 14095 + }, + { + "epoch": 0.5706191825171996, + "grad_norm": 1.3584730625152588, + "learning_rate": 4.381581664257692e-05, + "loss": 1.0188, + "step": 14100 + }, + { + "epoch": 0.5708215297450425, + "grad_norm": 1.1411398649215698, + "learning_rate": 4.379516828412141e-05, + "loss": 0.9952, + "step": 14105 + }, + { + "epoch": 0.5710238769728855, + "grad_norm": 1.174756646156311, + "learning_rate": 4.3774519925665914e-05, + "loss": 1.0058, + "step": 14110 + }, + { + "epoch": 0.5712262242007284, + "grad_norm": 1.2773780822753906, + "learning_rate": 4.375387156721041e-05, + "loss": 0.9758, + "step": 14115 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 1.2203837633132935, + "learning_rate": 4.373322320875491e-05, + "loss": 1.0931, + "step": 14120 + }, + { + "epoch": 0.5716309186564144, + "grad_norm": 1.1752070188522339, + "learning_rate": 4.3712574850299406e-05, + "loss": 1.0002, + "step": 14125 + }, + { + "epoch": 0.5718332658842574, + "grad_norm": 1.1498125791549683, + "learning_rate": 4.3691926491843896e-05, + "loss": 0.9446, + "step": 14130 + }, + { + "epoch": 0.5720356131121004, + "grad_norm": 1.1896262168884277, + "learning_rate": 4.36712781333884e-05, + "loss": 1.0061, + "step": 14135 + }, + { + "epoch": 0.5722379603399433, + "grad_norm": 1.2011770009994507, + "learning_rate": 4.36506297749329e-05, + "loss": 1.0251, + "step": 14140 + }, + { + "epoch": 0.5724403075677863, + "grad_norm": 1.150048851966858, + "learning_rate": 4.3629981416477395e-05, + "loss": 1.025, + "step": 14145 + }, + { + "epoch": 0.5726426547956293, + "grad_norm": 1.1370536088943481, + "learning_rate": 4.360933305802189e-05, + "loss": 1.037, + "step": 14150 + }, + { + "epoch": 0.5728450020234723, + "grad_norm": 1.1036863327026367, + "learning_rate": 4.358868469956638e-05, + "loss": 0.9489, + "step": 14155 + }, + { + "epoch": 0.5730473492513153, + "grad_norm": 1.1035983562469482, + "learning_rate": 4.356803634111089e-05, + "loss": 0.9553, + "step": 14160 + }, + { + "epoch": 0.5732496964791582, + "grad_norm": 1.1876813173294067, + "learning_rate": 4.354738798265538e-05, + "loss": 1.0022, + "step": 14165 + }, + { + "epoch": 0.5734520437070012, + "grad_norm": 1.1236966848373413, + "learning_rate": 4.352673962419988e-05, + "loss": 0.9287, + "step": 14170 + }, + { + "epoch": 0.5736543909348442, + "grad_norm": 1.1244624853134155, + "learning_rate": 4.350609126574438e-05, + "loss": 0.9893, + "step": 14175 + }, + { + "epoch": 0.5738567381626871, + "grad_norm": 1.2417951822280884, + "learning_rate": 4.348544290728887e-05, + "loss": 1.0084, + "step": 14180 + }, + { + "epoch": 0.5740590853905302, + "grad_norm": 1.1004974842071533, + "learning_rate": 4.346479454883337e-05, + "loss": 1.0059, + "step": 14185 + }, + { + "epoch": 0.5742614326183731, + "grad_norm": 1.3367033004760742, + "learning_rate": 4.3444146190377864e-05, + "loss": 1.0446, + "step": 14190 + }, + { + "epoch": 0.5744637798462161, + "grad_norm": 1.2608211040496826, + "learning_rate": 4.342349783192237e-05, + "loss": 1.0296, + "step": 14195 + }, + { + "epoch": 0.5746661270740591, + "grad_norm": 1.2179604768753052, + "learning_rate": 4.340284947346686e-05, + "loss": 1.0519, + "step": 14200 + }, + { + "epoch": 0.574868474301902, + "grad_norm": 1.0648919343948364, + "learning_rate": 4.3382201115011356e-05, + "loss": 0.9879, + "step": 14205 + }, + { + "epoch": 0.5750708215297451, + "grad_norm": 1.1244484186172485, + "learning_rate": 4.336155275655586e-05, + "loss": 1.0345, + "step": 14210 + }, + { + "epoch": 0.575273168757588, + "grad_norm": 1.156739592552185, + "learning_rate": 4.334090439810035e-05, + "loss": 0.9823, + "step": 14215 + }, + { + "epoch": 0.575475515985431, + "grad_norm": 1.3055468797683716, + "learning_rate": 4.3320256039644854e-05, + "loss": 0.9907, + "step": 14220 + }, + { + "epoch": 0.575677863213274, + "grad_norm": 1.2654321193695068, + "learning_rate": 4.3299607681189345e-05, + "loss": 0.9654, + "step": 14225 + }, + { + "epoch": 0.5758802104411169, + "grad_norm": 1.2418204545974731, + "learning_rate": 4.327895932273385e-05, + "loss": 0.9957, + "step": 14230 + }, + { + "epoch": 0.5760825576689599, + "grad_norm": 1.2722078561782837, + "learning_rate": 4.325831096427834e-05, + "loss": 1.0861, + "step": 14235 + }, + { + "epoch": 0.576284904896803, + "grad_norm": 1.4188235998153687, + "learning_rate": 4.323766260582284e-05, + "loss": 0.9552, + "step": 14240 + }, + { + "epoch": 0.5764872521246459, + "grad_norm": 1.1043928861618042, + "learning_rate": 4.321701424736734e-05, + "loss": 0.9975, + "step": 14245 + }, + { + "epoch": 0.5766895993524889, + "grad_norm": 1.1651086807250977, + "learning_rate": 4.319636588891183e-05, + "loss": 0.9782, + "step": 14250 + }, + { + "epoch": 0.5768919465803318, + "grad_norm": 1.0888679027557373, + "learning_rate": 4.3175717530456335e-05, + "loss": 1.007, + "step": 14255 + }, + { + "epoch": 0.5770942938081748, + "grad_norm": 1.3613406419754028, + "learning_rate": 4.3155069172000826e-05, + "loss": 1.0085, + "step": 14260 + }, + { + "epoch": 0.5772966410360179, + "grad_norm": 1.208199381828308, + "learning_rate": 4.313442081354532e-05, + "loss": 1.019, + "step": 14265 + }, + { + "epoch": 0.5774989882638608, + "grad_norm": 1.3016966581344604, + "learning_rate": 4.311377245508982e-05, + "loss": 1.0349, + "step": 14270 + }, + { + "epoch": 0.5777013354917038, + "grad_norm": 1.1568187475204468, + "learning_rate": 4.309312409663432e-05, + "loss": 1.047, + "step": 14275 + }, + { + "epoch": 0.5779036827195467, + "grad_norm": 1.1511410474777222, + "learning_rate": 4.307247573817882e-05, + "loss": 0.9789, + "step": 14280 + }, + { + "epoch": 0.5781060299473897, + "grad_norm": 1.2285101413726807, + "learning_rate": 4.305182737972331e-05, + "loss": 0.9664, + "step": 14285 + }, + { + "epoch": 0.5783083771752326, + "grad_norm": 1.274605393409729, + "learning_rate": 4.303117902126781e-05, + "loss": 1.0009, + "step": 14290 + }, + { + "epoch": 0.5785107244030757, + "grad_norm": 1.226155400276184, + "learning_rate": 4.301053066281231e-05, + "loss": 0.9941, + "step": 14295 + }, + { + "epoch": 0.5787130716309187, + "grad_norm": 1.3813090324401855, + "learning_rate": 4.2989882304356804e-05, + "loss": 1.0203, + "step": 14300 + }, + { + "epoch": 0.5789154188587616, + "grad_norm": 1.242066502571106, + "learning_rate": 4.296923394590131e-05, + "loss": 0.9806, + "step": 14305 + }, + { + "epoch": 0.5791177660866046, + "grad_norm": 1.194819450378418, + "learning_rate": 4.29485855874458e-05, + "loss": 0.9885, + "step": 14310 + }, + { + "epoch": 0.5793201133144475, + "grad_norm": 1.0743086338043213, + "learning_rate": 4.2927937228990296e-05, + "loss": 0.9234, + "step": 14315 + }, + { + "epoch": 0.5795224605422906, + "grad_norm": 1.1583209037780762, + "learning_rate": 4.290728887053479e-05, + "loss": 0.9309, + "step": 14320 + }, + { + "epoch": 0.5797248077701336, + "grad_norm": 1.2013063430786133, + "learning_rate": 4.288664051207929e-05, + "loss": 0.9968, + "step": 14325 + }, + { + "epoch": 0.5799271549979765, + "grad_norm": 1.1978249549865723, + "learning_rate": 4.286599215362379e-05, + "loss": 0.9526, + "step": 14330 + }, + { + "epoch": 0.5801295022258195, + "grad_norm": 1.2646806240081787, + "learning_rate": 4.2845343795168285e-05, + "loss": 0.9877, + "step": 14335 + }, + { + "epoch": 0.5803318494536625, + "grad_norm": 1.271074891090393, + "learning_rate": 4.282469543671278e-05, + "loss": 1.0116, + "step": 14340 + }, + { + "epoch": 0.5805341966815054, + "grad_norm": 1.1461739540100098, + "learning_rate": 4.280404707825728e-05, + "loss": 1.0277, + "step": 14345 + }, + { + "epoch": 0.5807365439093485, + "grad_norm": 1.1434636116027832, + "learning_rate": 4.278339871980178e-05, + "loss": 1.0785, + "step": 14350 + }, + { + "epoch": 0.5809388911371914, + "grad_norm": 1.194696307182312, + "learning_rate": 4.2762750361346274e-05, + "loss": 1.0005, + "step": 14355 + }, + { + "epoch": 0.5811412383650344, + "grad_norm": 1.2170504331588745, + "learning_rate": 4.274210200289077e-05, + "loss": 0.9586, + "step": 14360 + }, + { + "epoch": 0.5813435855928774, + "grad_norm": 1.2026551961898804, + "learning_rate": 4.272145364443527e-05, + "loss": 0.9914, + "step": 14365 + }, + { + "epoch": 0.5815459328207203, + "grad_norm": 1.0865665674209595, + "learning_rate": 4.2700805285979766e-05, + "loss": 0.9974, + "step": 14370 + }, + { + "epoch": 0.5817482800485634, + "grad_norm": 1.2037372589111328, + "learning_rate": 4.2680156927524264e-05, + "loss": 1.0531, + "step": 14375 + }, + { + "epoch": 0.5819506272764063, + "grad_norm": 1.1456353664398193, + "learning_rate": 4.265950856906876e-05, + "loss": 0.9728, + "step": 14380 + }, + { + "epoch": 0.5821529745042493, + "grad_norm": 1.1749718189239502, + "learning_rate": 4.263886021061326e-05, + "loss": 1.0342, + "step": 14385 + }, + { + "epoch": 0.5823553217320923, + "grad_norm": 1.1799213886260986, + "learning_rate": 4.2618211852157756e-05, + "loss": 0.9889, + "step": 14390 + }, + { + "epoch": 0.5825576689599352, + "grad_norm": 1.0045866966247559, + "learning_rate": 4.259756349370225e-05, + "loss": 0.9254, + "step": 14395 + }, + { + "epoch": 0.5827600161877782, + "grad_norm": 1.3624835014343262, + "learning_rate": 4.257691513524675e-05, + "loss": 1.0147, + "step": 14400 + }, + { + "epoch": 0.5829623634156212, + "grad_norm": 1.0491089820861816, + "learning_rate": 4.255626677679125e-05, + "loss": 0.9083, + "step": 14405 + }, + { + "epoch": 0.5831647106434642, + "grad_norm": 1.3208736181259155, + "learning_rate": 4.2535618418335745e-05, + "loss": 1.026, + "step": 14410 + }, + { + "epoch": 0.5833670578713072, + "grad_norm": 1.1491566896438599, + "learning_rate": 4.251497005988024e-05, + "loss": 0.953, + "step": 14415 + }, + { + "epoch": 0.5835694050991501, + "grad_norm": 1.2377055883407593, + "learning_rate": 4.249432170142474e-05, + "loss": 0.9683, + "step": 14420 + }, + { + "epoch": 0.5837717523269931, + "grad_norm": 1.1501821279525757, + "learning_rate": 4.2473673342969237e-05, + "loss": 1.0097, + "step": 14425 + }, + { + "epoch": 0.5839740995548361, + "grad_norm": 1.0455905199050903, + "learning_rate": 4.2453024984513734e-05, + "loss": 1.0379, + "step": 14430 + }, + { + "epoch": 0.5841764467826791, + "grad_norm": 1.1751887798309326, + "learning_rate": 4.243237662605823e-05, + "loss": 1.0322, + "step": 14435 + }, + { + "epoch": 0.5843787940105221, + "grad_norm": 1.1225756406784058, + "learning_rate": 4.241172826760273e-05, + "loss": 1.0131, + "step": 14440 + }, + { + "epoch": 0.584581141238365, + "grad_norm": 1.4358786344528198, + "learning_rate": 4.2391079909147226e-05, + "loss": 0.9935, + "step": 14445 + }, + { + "epoch": 0.584783488466208, + "grad_norm": 1.1218318939208984, + "learning_rate": 4.2370431550691716e-05, + "loss": 1.0259, + "step": 14450 + }, + { + "epoch": 0.5849858356940509, + "grad_norm": 1.2656817436218262, + "learning_rate": 4.234978319223622e-05, + "loss": 1.0967, + "step": 14455 + }, + { + "epoch": 0.585188182921894, + "grad_norm": 1.195020318031311, + "learning_rate": 4.232913483378072e-05, + "loss": 1.0293, + "step": 14460 + }, + { + "epoch": 0.585390530149737, + "grad_norm": 1.267820119857788, + "learning_rate": 4.2308486475325215e-05, + "loss": 1.0329, + "step": 14465 + }, + { + "epoch": 0.5855928773775799, + "grad_norm": 1.2683968544006348, + "learning_rate": 4.228783811686971e-05, + "loss": 0.9337, + "step": 14470 + }, + { + "epoch": 0.5857952246054229, + "grad_norm": 1.218563199043274, + "learning_rate": 4.226718975841421e-05, + "loss": 0.9589, + "step": 14475 + }, + { + "epoch": 0.5859975718332658, + "grad_norm": 1.3326445817947388, + "learning_rate": 4.224654139995871e-05, + "loss": 0.933, + "step": 14480 + }, + { + "epoch": 0.5861999190611089, + "grad_norm": 1.155537724494934, + "learning_rate": 4.22258930415032e-05, + "loss": 1.1038, + "step": 14485 + }, + { + "epoch": 0.5864022662889519, + "grad_norm": 1.1963613033294678, + "learning_rate": 4.22052446830477e-05, + "loss": 0.9746, + "step": 14490 + }, + { + "epoch": 0.5866046135167948, + "grad_norm": 1.1540827751159668, + "learning_rate": 4.21845963245922e-05, + "loss": 1.0504, + "step": 14495 + }, + { + "epoch": 0.5868069607446378, + "grad_norm": 1.1261135339736938, + "learning_rate": 4.2163947966136696e-05, + "loss": 0.9379, + "step": 14500 + }, + { + "epoch": 0.5870093079724807, + "grad_norm": 1.1809544563293457, + "learning_rate": 4.214329960768119e-05, + "loss": 0.9444, + "step": 14505 + }, + { + "epoch": 0.5872116552003237, + "grad_norm": 1.1642730236053467, + "learning_rate": 4.2122651249225684e-05, + "loss": 0.977, + "step": 14510 + }, + { + "epoch": 0.5874140024281668, + "grad_norm": 1.2623203992843628, + "learning_rate": 4.210200289077019e-05, + "loss": 1.0256, + "step": 14515 + }, + { + "epoch": 0.5876163496560097, + "grad_norm": 1.1620464324951172, + "learning_rate": 4.208135453231468e-05, + "loss": 1.0281, + "step": 14520 + }, + { + "epoch": 0.5878186968838527, + "grad_norm": 1.2273989915847778, + "learning_rate": 4.206070617385918e-05, + "loss": 0.986, + "step": 14525 + }, + { + "epoch": 0.5880210441116956, + "grad_norm": 1.172258973121643, + "learning_rate": 4.204005781540368e-05, + "loss": 1.013, + "step": 14530 + }, + { + "epoch": 0.5882233913395386, + "grad_norm": 1.1386834383010864, + "learning_rate": 4.201940945694817e-05, + "loss": 1.0119, + "step": 14535 + }, + { + "epoch": 0.5884257385673817, + "grad_norm": 1.1152602434158325, + "learning_rate": 4.1998761098492674e-05, + "loss": 0.9738, + "step": 14540 + }, + { + "epoch": 0.5886280857952246, + "grad_norm": 1.1315892934799194, + "learning_rate": 4.1978112740037165e-05, + "loss": 0.9592, + "step": 14545 + }, + { + "epoch": 0.5888304330230676, + "grad_norm": 1.2412919998168945, + "learning_rate": 4.195746438158167e-05, + "loss": 1.0437, + "step": 14550 + }, + { + "epoch": 0.5890327802509105, + "grad_norm": 1.120381474494934, + "learning_rate": 4.1936816023126166e-05, + "loss": 1.0226, + "step": 14555 + }, + { + "epoch": 0.5892351274787535, + "grad_norm": 1.1512612104415894, + "learning_rate": 4.191616766467066e-05, + "loss": 1.0483, + "step": 14560 + }, + { + "epoch": 0.5894374747065965, + "grad_norm": 1.3168336153030396, + "learning_rate": 4.189551930621516e-05, + "loss": 0.9527, + "step": 14565 + }, + { + "epoch": 0.5896398219344395, + "grad_norm": 1.2016961574554443, + "learning_rate": 4.187487094775965e-05, + "loss": 0.9641, + "step": 14570 + }, + { + "epoch": 0.5898421691622825, + "grad_norm": 1.213405966758728, + "learning_rate": 4.1854222589304156e-05, + "loss": 1.0915, + "step": 14575 + }, + { + "epoch": 0.5900445163901255, + "grad_norm": 1.1711812019348145, + "learning_rate": 4.1833574230848646e-05, + "loss": 0.9716, + "step": 14580 + }, + { + "epoch": 0.5902468636179684, + "grad_norm": 1.2479557991027832, + "learning_rate": 4.181292587239315e-05, + "loss": 0.9745, + "step": 14585 + }, + { + "epoch": 0.5904492108458114, + "grad_norm": 1.2024391889572144, + "learning_rate": 4.179227751393765e-05, + "loss": 0.9723, + "step": 14590 + }, + { + "epoch": 0.5906515580736544, + "grad_norm": 1.2010310888290405, + "learning_rate": 4.177162915548214e-05, + "loss": 0.9534, + "step": 14595 + }, + { + "epoch": 0.5908539053014974, + "grad_norm": 1.2918951511383057, + "learning_rate": 4.175098079702664e-05, + "loss": 0.9641, + "step": 14600 + }, + { + "epoch": 0.5910562525293404, + "grad_norm": 1.25871741771698, + "learning_rate": 4.173033243857113e-05, + "loss": 0.9994, + "step": 14605 + }, + { + "epoch": 0.5912585997571833, + "grad_norm": 1.1251121759414673, + "learning_rate": 4.1709684080115637e-05, + "loss": 0.9179, + "step": 14610 + }, + { + "epoch": 0.5914609469850263, + "grad_norm": 1.3321064710617065, + "learning_rate": 4.168903572166013e-05, + "loss": 1.0003, + "step": 14615 + }, + { + "epoch": 0.5916632942128692, + "grad_norm": 1.2580547332763672, + "learning_rate": 4.1668387363204624e-05, + "loss": 1.0116, + "step": 14620 + }, + { + "epoch": 0.5918656414407123, + "grad_norm": 1.182379126548767, + "learning_rate": 4.164773900474913e-05, + "loss": 0.9547, + "step": 14625 + }, + { + "epoch": 0.5920679886685553, + "grad_norm": 1.2749547958374023, + "learning_rate": 4.162709064629362e-05, + "loss": 1.008, + "step": 14630 + }, + { + "epoch": 0.5922703358963982, + "grad_norm": 1.2136083841323853, + "learning_rate": 4.160644228783812e-05, + "loss": 1.1016, + "step": 14635 + }, + { + "epoch": 0.5924726831242412, + "grad_norm": 1.152838110923767, + "learning_rate": 4.1585793929382614e-05, + "loss": 0.984, + "step": 14640 + }, + { + "epoch": 0.5926750303520841, + "grad_norm": 1.245336890220642, + "learning_rate": 4.156514557092711e-05, + "loss": 1.0255, + "step": 14645 + }, + { + "epoch": 0.5928773775799272, + "grad_norm": 1.218671441078186, + "learning_rate": 4.154449721247161e-05, + "loss": 1.0192, + "step": 14650 + }, + { + "epoch": 0.5930797248077702, + "grad_norm": 1.1206589937210083, + "learning_rate": 4.1523848854016105e-05, + "loss": 0.9731, + "step": 14655 + }, + { + "epoch": 0.5932820720356131, + "grad_norm": 1.1717655658721924, + "learning_rate": 4.150320049556061e-05, + "loss": 0.9849, + "step": 14660 + }, + { + "epoch": 0.5934844192634561, + "grad_norm": 1.2015700340270996, + "learning_rate": 4.14825521371051e-05, + "loss": 1.0351, + "step": 14665 + }, + { + "epoch": 0.593686766491299, + "grad_norm": 1.153236746788025, + "learning_rate": 4.14619037786496e-05, + "loss": 0.989, + "step": 14670 + }, + { + "epoch": 0.5938891137191421, + "grad_norm": 1.1946407556533813, + "learning_rate": 4.1441255420194095e-05, + "loss": 1.0056, + "step": 14675 + }, + { + "epoch": 0.5940914609469851, + "grad_norm": 1.1855626106262207, + "learning_rate": 4.142060706173859e-05, + "loss": 0.981, + "step": 14680 + }, + { + "epoch": 0.594293808174828, + "grad_norm": 0.9957739114761353, + "learning_rate": 4.1399958703283096e-05, + "loss": 1.0083, + "step": 14685 + }, + { + "epoch": 0.594496155402671, + "grad_norm": 1.2123115062713623, + "learning_rate": 4.1379310344827587e-05, + "loss": 0.926, + "step": 14690 + }, + { + "epoch": 0.5946985026305139, + "grad_norm": 1.103893756866455, + "learning_rate": 4.135866198637209e-05, + "loss": 1.0038, + "step": 14695 + }, + { + "epoch": 0.5949008498583569, + "grad_norm": 1.1878453493118286, + "learning_rate": 4.133801362791658e-05, + "loss": 1.0305, + "step": 14700 + }, + { + "epoch": 0.5951031970862, + "grad_norm": 1.1317086219787598, + "learning_rate": 4.131736526946108e-05, + "loss": 0.9945, + "step": 14705 + }, + { + "epoch": 0.5953055443140429, + "grad_norm": 1.0596208572387695, + "learning_rate": 4.1296716911005576e-05, + "loss": 1.047, + "step": 14710 + }, + { + "epoch": 0.5955078915418859, + "grad_norm": 1.2252612113952637, + "learning_rate": 4.127606855255007e-05, + "loss": 1.0509, + "step": 14715 + }, + { + "epoch": 0.5957102387697288, + "grad_norm": 1.1817820072174072, + "learning_rate": 4.125542019409458e-05, + "loss": 0.9825, + "step": 14720 + }, + { + "epoch": 0.5959125859975718, + "grad_norm": 1.2887251377105713, + "learning_rate": 4.123477183563907e-05, + "loss": 0.9911, + "step": 14725 + }, + { + "epoch": 0.5961149332254149, + "grad_norm": 1.3245701789855957, + "learning_rate": 4.1214123477183565e-05, + "loss": 0.9956, + "step": 14730 + }, + { + "epoch": 0.5963172804532578, + "grad_norm": 1.1964930295944214, + "learning_rate": 4.119347511872806e-05, + "loss": 0.9505, + "step": 14735 + }, + { + "epoch": 0.5965196276811008, + "grad_norm": 1.1000622510910034, + "learning_rate": 4.117282676027256e-05, + "loss": 1.0079, + "step": 14740 + }, + { + "epoch": 0.5967219749089437, + "grad_norm": 1.2138142585754395, + "learning_rate": 4.115217840181706e-05, + "loss": 0.9815, + "step": 14745 + }, + { + "epoch": 0.5969243221367867, + "grad_norm": 1.1650316715240479, + "learning_rate": 4.1131530043361554e-05, + "loss": 1.0002, + "step": 14750 + }, + { + "epoch": 0.5971266693646297, + "grad_norm": 1.162232518196106, + "learning_rate": 4.111088168490605e-05, + "loss": 0.9867, + "step": 14755 + }, + { + "epoch": 0.5973290165924727, + "grad_norm": 1.2270444631576538, + "learning_rate": 4.109023332645055e-05, + "loss": 1.0356, + "step": 14760 + }, + { + "epoch": 0.5975313638203157, + "grad_norm": 1.1669378280639648, + "learning_rate": 4.1069584967995046e-05, + "loss": 0.9689, + "step": 14765 + }, + { + "epoch": 0.5977337110481586, + "grad_norm": 1.0744789838790894, + "learning_rate": 4.104893660953954e-05, + "loss": 0.9858, + "step": 14770 + }, + { + "epoch": 0.5979360582760016, + "grad_norm": 1.1895402669906616, + "learning_rate": 4.102828825108404e-05, + "loss": 0.9566, + "step": 14775 + }, + { + "epoch": 0.5981384055038446, + "grad_norm": 1.338990569114685, + "learning_rate": 4.100763989262854e-05, + "loss": 1.0136, + "step": 14780 + }, + { + "epoch": 0.5983407527316876, + "grad_norm": 1.1293729543685913, + "learning_rate": 4.0986991534173035e-05, + "loss": 0.9983, + "step": 14785 + }, + { + "epoch": 0.5985430999595306, + "grad_norm": 1.154767632484436, + "learning_rate": 4.096634317571753e-05, + "loss": 0.9983, + "step": 14790 + }, + { + "epoch": 0.5987454471873735, + "grad_norm": 1.1595851182937622, + "learning_rate": 4.094569481726203e-05, + "loss": 0.9666, + "step": 14795 + }, + { + "epoch": 0.5989477944152165, + "grad_norm": 1.1495463848114014, + "learning_rate": 4.092504645880653e-05, + "loss": 1.0128, + "step": 14800 + }, + { + "epoch": 0.5991501416430595, + "grad_norm": 1.178675651550293, + "learning_rate": 4.0904398100351024e-05, + "loss": 0.9879, + "step": 14805 + }, + { + "epoch": 0.5993524888709024, + "grad_norm": 1.1787465810775757, + "learning_rate": 4.088374974189552e-05, + "loss": 0.9369, + "step": 14810 + }, + { + "epoch": 0.5995548360987455, + "grad_norm": 1.1427485942840576, + "learning_rate": 4.086310138344002e-05, + "loss": 0.9885, + "step": 14815 + }, + { + "epoch": 0.5997571833265885, + "grad_norm": 1.3166379928588867, + "learning_rate": 4.0842453024984516e-05, + "loss": 0.9942, + "step": 14820 + }, + { + "epoch": 0.5999595305544314, + "grad_norm": 1.1612681150436401, + "learning_rate": 4.0821804666529014e-05, + "loss": 0.9906, + "step": 14825 + }, + { + "epoch": 0.6001618777822744, + "grad_norm": 1.1479135751724243, + "learning_rate": 4.080115630807351e-05, + "loss": 0.9998, + "step": 14830 + }, + { + "epoch": 0.6003642250101173, + "grad_norm": 1.1357676982879639, + "learning_rate": 4.078050794961801e-05, + "loss": 0.905, + "step": 14835 + }, + { + "epoch": 0.6005665722379604, + "grad_norm": 1.254703402519226, + "learning_rate": 4.0759859591162505e-05, + "loss": 0.9856, + "step": 14840 + }, + { + "epoch": 0.6007689194658034, + "grad_norm": 1.1657435894012451, + "learning_rate": 4.0739211232707e-05, + "loss": 0.92, + "step": 14845 + }, + { + "epoch": 0.6009712666936463, + "grad_norm": 1.1573985815048218, + "learning_rate": 4.07185628742515e-05, + "loss": 0.9758, + "step": 14850 + }, + { + "epoch": 0.6011736139214893, + "grad_norm": 1.1526132822036743, + "learning_rate": 4.0697914515796e-05, + "loss": 1.0268, + "step": 14855 + }, + { + "epoch": 0.6013759611493322, + "grad_norm": 1.119799256324768, + "learning_rate": 4.0677266157340495e-05, + "loss": 0.9355, + "step": 14860 + }, + { + "epoch": 0.6015783083771752, + "grad_norm": 1.231797695159912, + "learning_rate": 4.0656617798884985e-05, + "loss": 0.9955, + "step": 14865 + }, + { + "epoch": 0.6017806556050183, + "grad_norm": 1.1451510190963745, + "learning_rate": 4.063596944042949e-05, + "loss": 0.9976, + "step": 14870 + }, + { + "epoch": 0.6019830028328612, + "grad_norm": 1.0147970914840698, + "learning_rate": 4.0615321081973987e-05, + "loss": 0.9881, + "step": 14875 + }, + { + "epoch": 0.6021853500607042, + "grad_norm": 1.0572047233581543, + "learning_rate": 4.0594672723518484e-05, + "loss": 1.0732, + "step": 14880 + }, + { + "epoch": 0.6023876972885471, + "grad_norm": 1.1836466789245605, + "learning_rate": 4.057402436506298e-05, + "loss": 0.9699, + "step": 14885 + }, + { + "epoch": 0.6025900445163901, + "grad_norm": 1.2634001970291138, + "learning_rate": 4.055337600660747e-05, + "loss": 0.981, + "step": 14890 + }, + { + "epoch": 0.6027923917442332, + "grad_norm": 1.1793850660324097, + "learning_rate": 4.0532727648151976e-05, + "loss": 0.9694, + "step": 14895 + }, + { + "epoch": 0.6029947389720761, + "grad_norm": 1.241394281387329, + "learning_rate": 4.0512079289696466e-05, + "loss": 1.027, + "step": 14900 + }, + { + "epoch": 0.6031970861999191, + "grad_norm": 1.2651069164276123, + "learning_rate": 4.049143093124097e-05, + "loss": 1.002, + "step": 14905 + }, + { + "epoch": 0.603399433427762, + "grad_norm": 1.171766757965088, + "learning_rate": 4.047078257278547e-05, + "loss": 1.0042, + "step": 14910 + }, + { + "epoch": 0.603601780655605, + "grad_norm": 1.2115875482559204, + "learning_rate": 4.045013421432996e-05, + "loss": 1.0144, + "step": 14915 + }, + { + "epoch": 0.603804127883448, + "grad_norm": 1.08839750289917, + "learning_rate": 4.042948585587446e-05, + "loss": 1.0083, + "step": 14920 + }, + { + "epoch": 0.604006475111291, + "grad_norm": 1.0900975465774536, + "learning_rate": 4.040883749741895e-05, + "loss": 1.0039, + "step": 14925 + }, + { + "epoch": 0.604208822339134, + "grad_norm": 1.8475017547607422, + "learning_rate": 4.038818913896346e-05, + "loss": 0.9778, + "step": 14930 + }, + { + "epoch": 0.6044111695669769, + "grad_norm": 1.1439038515090942, + "learning_rate": 4.0367540780507954e-05, + "loss": 0.9927, + "step": 14935 + }, + { + "epoch": 0.6046135167948199, + "grad_norm": 1.2487175464630127, + "learning_rate": 4.034689242205245e-05, + "loss": 1.0361, + "step": 14940 + }, + { + "epoch": 0.6048158640226629, + "grad_norm": 1.2005751132965088, + "learning_rate": 4.032624406359695e-05, + "loss": 0.99, + "step": 14945 + }, + { + "epoch": 0.6050182112505059, + "grad_norm": 1.0894343852996826, + "learning_rate": 4.030559570514144e-05, + "loss": 1.0457, + "step": 14950 + }, + { + "epoch": 0.6052205584783489, + "grad_norm": 1.2973085641860962, + "learning_rate": 4.028494734668594e-05, + "loss": 1.018, + "step": 14955 + }, + { + "epoch": 0.6054229057061918, + "grad_norm": 1.1314729452133179, + "learning_rate": 4.0264298988230434e-05, + "loss": 1.0048, + "step": 14960 + }, + { + "epoch": 0.6056252529340348, + "grad_norm": 1.1631298065185547, + "learning_rate": 4.024365062977494e-05, + "loss": 0.9561, + "step": 14965 + }, + { + "epoch": 0.6058276001618778, + "grad_norm": 1.217316746711731, + "learning_rate": 4.0223002271319435e-05, + "loss": 0.9906, + "step": 14970 + }, + { + "epoch": 0.6060299473897207, + "grad_norm": 1.2196394205093384, + "learning_rate": 4.0202353912863926e-05, + "loss": 0.9855, + "step": 14975 + }, + { + "epoch": 0.6062322946175638, + "grad_norm": 1.2208998203277588, + "learning_rate": 4.018170555440843e-05, + "loss": 0.9476, + "step": 14980 + }, + { + "epoch": 0.6064346418454067, + "grad_norm": 1.222133994102478, + "learning_rate": 4.016105719595292e-05, + "loss": 0.9434, + "step": 14985 + }, + { + "epoch": 0.6066369890732497, + "grad_norm": 1.2885711193084717, + "learning_rate": 4.0140408837497424e-05, + "loss": 0.9793, + "step": 14990 + }, + { + "epoch": 0.6068393363010927, + "grad_norm": 1.108764410018921, + "learning_rate": 4.0119760479041915e-05, + "loss": 0.9479, + "step": 14995 + }, + { + "epoch": 0.6070416835289356, + "grad_norm": 1.2786108255386353, + "learning_rate": 4.009911212058641e-05, + "loss": 0.9701, + "step": 15000 + }, + { + "epoch": 0.6072440307567787, + "grad_norm": 1.1678630113601685, + "learning_rate": 4.0078463762130916e-05, + "loss": 1.0102, + "step": 15005 + }, + { + "epoch": 0.6074463779846216, + "grad_norm": 1.2520824670791626, + "learning_rate": 4.005781540367541e-05, + "loss": 0.9828, + "step": 15010 + }, + { + "epoch": 0.6076487252124646, + "grad_norm": 1.2107146978378296, + "learning_rate": 4.003716704521991e-05, + "loss": 1.0024, + "step": 15015 + }, + { + "epoch": 0.6078510724403076, + "grad_norm": 1.2790188789367676, + "learning_rate": 4.00165186867644e-05, + "loss": 1.0017, + "step": 15020 + }, + { + "epoch": 0.6080534196681505, + "grad_norm": 1.1601837873458862, + "learning_rate": 3.99958703283089e-05, + "loss": 0.9687, + "step": 15025 + }, + { + "epoch": 0.6082557668959935, + "grad_norm": 1.2074768543243408, + "learning_rate": 3.9975221969853396e-05, + "loss": 1.0258, + "step": 15030 + }, + { + "epoch": 0.6084581141238365, + "grad_norm": 1.3172718286514282, + "learning_rate": 3.995457361139789e-05, + "loss": 0.9482, + "step": 15035 + }, + { + "epoch": 0.6086604613516795, + "grad_norm": 1.1703674793243408, + "learning_rate": 3.99339252529424e-05, + "loss": 1.0559, + "step": 15040 + }, + { + "epoch": 0.6088628085795225, + "grad_norm": 1.2748117446899414, + "learning_rate": 3.991327689448689e-05, + "loss": 0.9715, + "step": 15045 + }, + { + "epoch": 0.6090651558073654, + "grad_norm": 1.1625258922576904, + "learning_rate": 3.989262853603139e-05, + "loss": 0.9783, + "step": 15050 + }, + { + "epoch": 0.6092675030352084, + "grad_norm": 1.2481300830841064, + "learning_rate": 3.987198017757588e-05, + "loss": 1.0329, + "step": 15055 + }, + { + "epoch": 0.6094698502630514, + "grad_norm": 1.2934519052505493, + "learning_rate": 3.985133181912038e-05, + "loss": 0.978, + "step": 15060 + }, + { + "epoch": 0.6096721974908944, + "grad_norm": 1.2320512533187866, + "learning_rate": 3.9830683460664884e-05, + "loss": 0.9465, + "step": 15065 + }, + { + "epoch": 0.6098745447187374, + "grad_norm": 1.1613794565200806, + "learning_rate": 3.9810035102209374e-05, + "loss": 0.942, + "step": 15070 + }, + { + "epoch": 0.6100768919465803, + "grad_norm": 1.1811312437057495, + "learning_rate": 3.978938674375388e-05, + "loss": 0.975, + "step": 15075 + }, + { + "epoch": 0.6102792391744233, + "grad_norm": 1.1655839681625366, + "learning_rate": 3.976873838529837e-05, + "loss": 0.9918, + "step": 15080 + }, + { + "epoch": 0.6104815864022662, + "grad_norm": 1.145727515220642, + "learning_rate": 3.9748090026842866e-05, + "loss": 1.0017, + "step": 15085 + }, + { + "epoch": 0.6106839336301093, + "grad_norm": 1.175281047821045, + "learning_rate": 3.9727441668387364e-05, + "loss": 1.0345, + "step": 15090 + }, + { + "epoch": 0.6108862808579523, + "grad_norm": 1.318772554397583, + "learning_rate": 3.970679330993186e-05, + "loss": 1.0113, + "step": 15095 + }, + { + "epoch": 0.6110886280857952, + "grad_norm": 1.195495843887329, + "learning_rate": 3.9686144951476365e-05, + "loss": 0.996, + "step": 15100 + }, + { + "epoch": 0.6112909753136382, + "grad_norm": 1.170361876487732, + "learning_rate": 3.9665496593020855e-05, + "loss": 1.0214, + "step": 15105 + }, + { + "epoch": 0.6114933225414811, + "grad_norm": 1.2344578504562378, + "learning_rate": 3.964484823456535e-05, + "loss": 0.981, + "step": 15110 + }, + { + "epoch": 0.6116956697693242, + "grad_norm": 1.1404913663864136, + "learning_rate": 3.962419987610985e-05, + "loss": 1.0143, + "step": 15115 + }, + { + "epoch": 0.6118980169971672, + "grad_norm": 1.2335190773010254, + "learning_rate": 3.960355151765435e-05, + "loss": 0.9336, + "step": 15120 + }, + { + "epoch": 0.6121003642250101, + "grad_norm": 1.2277600765228271, + "learning_rate": 3.9582903159198845e-05, + "loss": 0.9821, + "step": 15125 + }, + { + "epoch": 0.6123027114528531, + "grad_norm": 1.2261604070663452, + "learning_rate": 3.956225480074334e-05, + "loss": 0.981, + "step": 15130 + }, + { + "epoch": 0.612505058680696, + "grad_norm": 1.1750844717025757, + "learning_rate": 3.954160644228784e-05, + "loss": 0.9391, + "step": 15135 + }, + { + "epoch": 0.612707405908539, + "grad_norm": 1.1132014989852905, + "learning_rate": 3.9520958083832336e-05, + "loss": 0.9887, + "step": 15140 + }, + { + "epoch": 0.6129097531363821, + "grad_norm": 1.1465575695037842, + "learning_rate": 3.9500309725376834e-05, + "loss": 0.9454, + "step": 15145 + }, + { + "epoch": 0.613112100364225, + "grad_norm": 1.1274523735046387, + "learning_rate": 3.947966136692133e-05, + "loss": 1.0278, + "step": 15150 + }, + { + "epoch": 0.613314447592068, + "grad_norm": 1.1466535329818726, + "learning_rate": 3.945901300846583e-05, + "loss": 0.8991, + "step": 15155 + }, + { + "epoch": 0.613516794819911, + "grad_norm": 1.2201199531555176, + "learning_rate": 3.9438364650010326e-05, + "loss": 1.0018, + "step": 15160 + }, + { + "epoch": 0.6137191420477539, + "grad_norm": 1.1986336708068848, + "learning_rate": 3.941771629155482e-05, + "loss": 1.0362, + "step": 15165 + }, + { + "epoch": 0.613921489275597, + "grad_norm": 1.1385431289672852, + "learning_rate": 3.939706793309932e-05, + "loss": 0.978, + "step": 15170 + }, + { + "epoch": 0.6141238365034399, + "grad_norm": 1.144865870475769, + "learning_rate": 3.937641957464382e-05, + "loss": 0.9379, + "step": 15175 + }, + { + "epoch": 0.6143261837312829, + "grad_norm": 1.3139859437942505, + "learning_rate": 3.9355771216188315e-05, + "loss": 1.0175, + "step": 15180 + }, + { + "epoch": 0.6145285309591259, + "grad_norm": 1.0443328619003296, + "learning_rate": 3.933512285773281e-05, + "loss": 1.0142, + "step": 15185 + }, + { + "epoch": 0.6147308781869688, + "grad_norm": 1.1730111837387085, + "learning_rate": 3.931447449927731e-05, + "loss": 0.9804, + "step": 15190 + }, + { + "epoch": 0.6149332254148118, + "grad_norm": 1.1835631132125854, + "learning_rate": 3.929382614082181e-05, + "loss": 0.9471, + "step": 15195 + }, + { + "epoch": 0.6151355726426548, + "grad_norm": 1.3823308944702148, + "learning_rate": 3.9273177782366304e-05, + "loss": 1.0063, + "step": 15200 + }, + { + "epoch": 0.6153379198704978, + "grad_norm": 1.0838249921798706, + "learning_rate": 3.92525294239108e-05, + "loss": 1.0202, + "step": 15205 + }, + { + "epoch": 0.6155402670983408, + "grad_norm": 1.152218222618103, + "learning_rate": 3.92318810654553e-05, + "loss": 1.0241, + "step": 15210 + }, + { + "epoch": 0.6157426143261837, + "grad_norm": 1.018296241760254, + "learning_rate": 3.9211232706999796e-05, + "loss": 0.9714, + "step": 15215 + }, + { + "epoch": 0.6159449615540267, + "grad_norm": 1.2379239797592163, + "learning_rate": 3.919058434854429e-05, + "loss": 0.9989, + "step": 15220 + }, + { + "epoch": 0.6161473087818697, + "grad_norm": 1.1574537754058838, + "learning_rate": 3.916993599008879e-05, + "loss": 1.0686, + "step": 15225 + }, + { + "epoch": 0.6163496560097127, + "grad_norm": 1.1106170415878296, + "learning_rate": 3.914928763163329e-05, + "loss": 0.9782, + "step": 15230 + }, + { + "epoch": 0.6165520032375557, + "grad_norm": 1.272621989250183, + "learning_rate": 3.9128639273177785e-05, + "loss": 1.0206, + "step": 15235 + }, + { + "epoch": 0.6167543504653986, + "grad_norm": 1.2506707906723022, + "learning_rate": 3.910799091472228e-05, + "loss": 0.9813, + "step": 15240 + }, + { + "epoch": 0.6169566976932416, + "grad_norm": 1.2709102630615234, + "learning_rate": 3.908734255626677e-05, + "loss": 1.0325, + "step": 15245 + }, + { + "epoch": 0.6171590449210845, + "grad_norm": 1.1361140012741089, + "learning_rate": 3.906669419781128e-05, + "loss": 1.0265, + "step": 15250 + }, + { + "epoch": 0.6173613921489276, + "grad_norm": 1.2748886346817017, + "learning_rate": 3.9046045839355774e-05, + "loss": 0.9605, + "step": 15255 + }, + { + "epoch": 0.6175637393767706, + "grad_norm": 1.3143259286880493, + "learning_rate": 3.902539748090027e-05, + "loss": 1.0045, + "step": 15260 + }, + { + "epoch": 0.6177660866046135, + "grad_norm": 1.2562155723571777, + "learning_rate": 3.900474912244477e-05, + "loss": 1.0042, + "step": 15265 + }, + { + "epoch": 0.6179684338324565, + "grad_norm": 1.3425461053848267, + "learning_rate": 3.8984100763989266e-05, + "loss": 1.0258, + "step": 15270 + }, + { + "epoch": 0.6181707810602994, + "grad_norm": 1.1353882551193237, + "learning_rate": 3.8963452405533764e-05, + "loss": 1.0268, + "step": 15275 + }, + { + "epoch": 0.6183731282881425, + "grad_norm": 1.1892770528793335, + "learning_rate": 3.8942804047078254e-05, + "loss": 0.9912, + "step": 15280 + }, + { + "epoch": 0.6185754755159855, + "grad_norm": 1.1367006301879883, + "learning_rate": 3.892215568862276e-05, + "loss": 0.9799, + "step": 15285 + }, + { + "epoch": 0.6187778227438284, + "grad_norm": 1.2165271043777466, + "learning_rate": 3.8901507330167255e-05, + "loss": 1.0506, + "step": 15290 + }, + { + "epoch": 0.6189801699716714, + "grad_norm": 1.2636433839797974, + "learning_rate": 3.888085897171175e-05, + "loss": 0.9692, + "step": 15295 + }, + { + "epoch": 0.6191825171995143, + "grad_norm": 1.301344871520996, + "learning_rate": 3.886021061325625e-05, + "loss": 1.0132, + "step": 15300 + }, + { + "epoch": 0.6193848644273573, + "grad_norm": 1.1484495401382446, + "learning_rate": 3.883956225480074e-05, + "loss": 0.9806, + "step": 15305 + }, + { + "epoch": 0.6195872116552004, + "grad_norm": 1.274308681488037, + "learning_rate": 3.8818913896345245e-05, + "loss": 0.9839, + "step": 15310 + }, + { + "epoch": 0.6197895588830433, + "grad_norm": 1.2011617422103882, + "learning_rate": 3.879826553788974e-05, + "loss": 1.0091, + "step": 15315 + }, + { + "epoch": 0.6199919061108863, + "grad_norm": 1.217674970626831, + "learning_rate": 3.877761717943424e-05, + "loss": 1.0073, + "step": 15320 + }, + { + "epoch": 0.6201942533387292, + "grad_norm": 1.2425445318222046, + "learning_rate": 3.8756968820978736e-05, + "loss": 1.0315, + "step": 15325 + }, + { + "epoch": 0.6203966005665722, + "grad_norm": 1.2316184043884277, + "learning_rate": 3.873632046252323e-05, + "loss": 0.9831, + "step": 15330 + }, + { + "epoch": 0.6205989477944153, + "grad_norm": 1.172351598739624, + "learning_rate": 3.871567210406773e-05, + "loss": 0.9583, + "step": 15335 + }, + { + "epoch": 0.6208012950222582, + "grad_norm": 1.210734248161316, + "learning_rate": 3.869502374561222e-05, + "loss": 0.9854, + "step": 15340 + }, + { + "epoch": 0.6210036422501012, + "grad_norm": 1.279604434967041, + "learning_rate": 3.8674375387156726e-05, + "loss": 1.0296, + "step": 15345 + }, + { + "epoch": 0.6212059894779441, + "grad_norm": 1.3448833227157593, + "learning_rate": 3.865372702870122e-05, + "loss": 1.0159, + "step": 15350 + }, + { + "epoch": 0.6214083367057871, + "grad_norm": 1.362277626991272, + "learning_rate": 3.8633078670245713e-05, + "loss": 0.9623, + "step": 15355 + }, + { + "epoch": 0.6216106839336301, + "grad_norm": 1.2423343658447266, + "learning_rate": 3.861243031179022e-05, + "loss": 1.0087, + "step": 15360 + }, + { + "epoch": 0.6218130311614731, + "grad_norm": 1.1315988302230835, + "learning_rate": 3.859178195333471e-05, + "loss": 1.0209, + "step": 15365 + }, + { + "epoch": 0.6220153783893161, + "grad_norm": 1.128307580947876, + "learning_rate": 3.857113359487921e-05, + "loss": 0.9881, + "step": 15370 + }, + { + "epoch": 0.622217725617159, + "grad_norm": 1.2249513864517212, + "learning_rate": 3.85504852364237e-05, + "loss": 1.0157, + "step": 15375 + }, + { + "epoch": 0.622420072845002, + "grad_norm": 1.1540697813034058, + "learning_rate": 3.85298368779682e-05, + "loss": 0.985, + "step": 15380 + }, + { + "epoch": 0.622622420072845, + "grad_norm": 1.1913505792617798, + "learning_rate": 3.8509188519512704e-05, + "loss": 0.9908, + "step": 15385 + }, + { + "epoch": 0.622824767300688, + "grad_norm": 1.106575608253479, + "learning_rate": 3.8488540161057195e-05, + "loss": 1.058, + "step": 15390 + }, + { + "epoch": 0.623027114528531, + "grad_norm": 1.201764464378357, + "learning_rate": 3.84678918026017e-05, + "loss": 0.998, + "step": 15395 + }, + { + "epoch": 0.623229461756374, + "grad_norm": 1.1951693296432495, + "learning_rate": 3.844724344414619e-05, + "loss": 0.9374, + "step": 15400 + }, + { + "epoch": 0.6234318089842169, + "grad_norm": 1.1139637231826782, + "learning_rate": 3.842659508569069e-05, + "loss": 1.056, + "step": 15405 + }, + { + "epoch": 0.6236341562120599, + "grad_norm": 1.175554633140564, + "learning_rate": 3.8405946727235184e-05, + "loss": 1.0014, + "step": 15410 + }, + { + "epoch": 0.6238365034399028, + "grad_norm": 1.187445044517517, + "learning_rate": 3.838529836877968e-05, + "loss": 1.0286, + "step": 15415 + }, + { + "epoch": 0.6240388506677459, + "grad_norm": 1.302847981452942, + "learning_rate": 3.8364650010324185e-05, + "loss": 1.0182, + "step": 15420 + }, + { + "epoch": 0.6242411978955889, + "grad_norm": 1.1043428182601929, + "learning_rate": 3.8344001651868676e-05, + "loss": 0.9983, + "step": 15425 + }, + { + "epoch": 0.6244435451234318, + "grad_norm": 1.2310084104537964, + "learning_rate": 3.832335329341318e-05, + "loss": 1.0609, + "step": 15430 + }, + { + "epoch": 0.6246458923512748, + "grad_norm": 1.2543423175811768, + "learning_rate": 3.830270493495767e-05, + "loss": 1.0207, + "step": 15435 + }, + { + "epoch": 0.6248482395791177, + "grad_norm": 1.2612595558166504, + "learning_rate": 3.828205657650217e-05, + "loss": 0.9168, + "step": 15440 + }, + { + "epoch": 0.6250505868069608, + "grad_norm": 1.121643304824829, + "learning_rate": 3.8261408218046665e-05, + "loss": 1.0358, + "step": 15445 + }, + { + "epoch": 0.6252529340348038, + "grad_norm": 1.254227876663208, + "learning_rate": 3.824075985959116e-05, + "loss": 0.9918, + "step": 15450 + }, + { + "epoch": 0.6254552812626467, + "grad_norm": 1.2365626096725464, + "learning_rate": 3.8220111501135666e-05, + "loss": 0.9593, + "step": 15455 + }, + { + "epoch": 0.6256576284904897, + "grad_norm": 1.1530671119689941, + "learning_rate": 3.819946314268016e-05, + "loss": 1.0402, + "step": 15460 + }, + { + "epoch": 0.6258599757183326, + "grad_norm": 1.200811505317688, + "learning_rate": 3.8178814784224654e-05, + "loss": 0.9741, + "step": 15465 + }, + { + "epoch": 0.6260623229461756, + "grad_norm": 1.215122103691101, + "learning_rate": 3.815816642576915e-05, + "loss": 1.0084, + "step": 15470 + }, + { + "epoch": 0.6262646701740187, + "grad_norm": 1.1331044435501099, + "learning_rate": 3.813751806731365e-05, + "loss": 0.9991, + "step": 15475 + }, + { + "epoch": 0.6264670174018616, + "grad_norm": 1.1087418794631958, + "learning_rate": 3.811686970885815e-05, + "loss": 0.9812, + "step": 15480 + }, + { + "epoch": 0.6266693646297046, + "grad_norm": 1.2703540325164795, + "learning_rate": 3.809622135040264e-05, + "loss": 0.9741, + "step": 15485 + }, + { + "epoch": 0.6268717118575475, + "grad_norm": 1.2571314573287964, + "learning_rate": 3.807557299194714e-05, + "loss": 0.9684, + "step": 15490 + }, + { + "epoch": 0.6270740590853905, + "grad_norm": 1.2576099634170532, + "learning_rate": 3.805492463349164e-05, + "loss": 1.0615, + "step": 15495 + }, + { + "epoch": 0.6272764063132336, + "grad_norm": 1.0620274543762207, + "learning_rate": 3.8034276275036135e-05, + "loss": 0.92, + "step": 15500 + }, + { + "epoch": 0.6274787535410765, + "grad_norm": 1.2089036703109741, + "learning_rate": 3.801362791658063e-05, + "loss": 1.0057, + "step": 15505 + }, + { + "epoch": 0.6276811007689195, + "grad_norm": 1.563827633857727, + "learning_rate": 3.799297955812513e-05, + "loss": 0.978, + "step": 15510 + }, + { + "epoch": 0.6278834479967624, + "grad_norm": 1.2251412868499756, + "learning_rate": 3.7972331199669634e-05, + "loss": 0.9571, + "step": 15515 + }, + { + "epoch": 0.6280857952246054, + "grad_norm": 1.2349053621292114, + "learning_rate": 3.7951682841214124e-05, + "loss": 1.0135, + "step": 15520 + }, + { + "epoch": 0.6282881424524484, + "grad_norm": 1.2551171779632568, + "learning_rate": 3.793103448275862e-05, + "loss": 0.979, + "step": 15525 + }, + { + "epoch": 0.6284904896802914, + "grad_norm": 1.2055236101150513, + "learning_rate": 3.791038612430312e-05, + "loss": 1.0239, + "step": 15530 + }, + { + "epoch": 0.6286928369081344, + "grad_norm": 1.3006428480148315, + "learning_rate": 3.7889737765847616e-05, + "loss": 1.006, + "step": 15535 + }, + { + "epoch": 0.6288951841359773, + "grad_norm": 1.236907958984375, + "learning_rate": 3.7869089407392113e-05, + "loss": 0.9948, + "step": 15540 + }, + { + "epoch": 0.6290975313638203, + "grad_norm": 1.089967131614685, + "learning_rate": 3.784844104893661e-05, + "loss": 0.9968, + "step": 15545 + }, + { + "epoch": 0.6292998785916633, + "grad_norm": 1.1794949769973755, + "learning_rate": 3.782779269048111e-05, + "loss": 0.9917, + "step": 15550 + }, + { + "epoch": 0.6295022258195063, + "grad_norm": 1.2905328273773193, + "learning_rate": 3.7807144332025605e-05, + "loss": 1.0404, + "step": 15555 + }, + { + "epoch": 0.6297045730473493, + "grad_norm": 1.2904390096664429, + "learning_rate": 3.77864959735701e-05, + "loss": 1.0027, + "step": 15560 + }, + { + "epoch": 0.6299069202751922, + "grad_norm": 1.1533604860305786, + "learning_rate": 3.77658476151146e-05, + "loss": 0.9814, + "step": 15565 + }, + { + "epoch": 0.6301092675030352, + "grad_norm": 1.242135763168335, + "learning_rate": 3.77451992566591e-05, + "loss": 0.9952, + "step": 15570 + }, + { + "epoch": 0.6303116147308782, + "grad_norm": 1.1621266603469849, + "learning_rate": 3.7724550898203595e-05, + "loss": 0.917, + "step": 15575 + }, + { + "epoch": 0.6305139619587211, + "grad_norm": 1.2329875230789185, + "learning_rate": 3.770390253974809e-05, + "loss": 0.9992, + "step": 15580 + }, + { + "epoch": 0.6307163091865642, + "grad_norm": 1.028205156326294, + "learning_rate": 3.768325418129259e-05, + "loss": 0.9792, + "step": 15585 + }, + { + "epoch": 0.6309186564144071, + "grad_norm": 1.2253775596618652, + "learning_rate": 3.7662605822837086e-05, + "loss": 0.9968, + "step": 15590 + }, + { + "epoch": 0.6311210036422501, + "grad_norm": 1.1776041984558105, + "learning_rate": 3.7641957464381584e-05, + "loss": 0.935, + "step": 15595 + }, + { + "epoch": 0.6313233508700931, + "grad_norm": 1.4083956480026245, + "learning_rate": 3.762130910592608e-05, + "loss": 0.9615, + "step": 15600 + }, + { + "epoch": 0.631525698097936, + "grad_norm": 1.0517611503601074, + "learning_rate": 3.760066074747058e-05, + "loss": 0.965, + "step": 15605 + }, + { + "epoch": 0.6317280453257791, + "grad_norm": 1.0910779237747192, + "learning_rate": 3.7580012389015076e-05, + "loss": 0.9823, + "step": 15610 + }, + { + "epoch": 0.631930392553622, + "grad_norm": 1.2836335897445679, + "learning_rate": 3.755936403055957e-05, + "loss": 0.9022, + "step": 15615 + }, + { + "epoch": 0.632132739781465, + "grad_norm": 1.1814792156219482, + "learning_rate": 3.753871567210407e-05, + "loss": 1.0359, + "step": 15620 + }, + { + "epoch": 0.632335087009308, + "grad_norm": 1.1960265636444092, + "learning_rate": 3.751806731364857e-05, + "loss": 1.0093, + "step": 15625 + }, + { + "epoch": 0.6325374342371509, + "grad_norm": 1.1587327718734741, + "learning_rate": 3.7497418955193065e-05, + "loss": 0.9805, + "step": 15630 + }, + { + "epoch": 0.6327397814649939, + "grad_norm": 1.1636066436767578, + "learning_rate": 3.747677059673756e-05, + "loss": 0.9903, + "step": 15635 + }, + { + "epoch": 0.632942128692837, + "grad_norm": 1.295119047164917, + "learning_rate": 3.745612223828206e-05, + "loss": 1.0165, + "step": 15640 + }, + { + "epoch": 0.6331444759206799, + "grad_norm": 1.1014918088912964, + "learning_rate": 3.743547387982656e-05, + "loss": 1.0494, + "step": 15645 + }, + { + "epoch": 0.6333468231485229, + "grad_norm": 1.0557477474212646, + "learning_rate": 3.7414825521371054e-05, + "loss": 1.0426, + "step": 15650 + }, + { + "epoch": 0.6335491703763658, + "grad_norm": 1.279184103012085, + "learning_rate": 3.739417716291555e-05, + "loss": 1.0112, + "step": 15655 + }, + { + "epoch": 0.6337515176042088, + "grad_norm": 1.2059326171875, + "learning_rate": 3.737352880446004e-05, + "loss": 0.9797, + "step": 15660 + }, + { + "epoch": 0.6339538648320518, + "grad_norm": 1.2009028196334839, + "learning_rate": 3.7352880446004546e-05, + "loss": 0.9616, + "step": 15665 + }, + { + "epoch": 0.6341562120598948, + "grad_norm": 1.128832459449768, + "learning_rate": 3.733223208754904e-05, + "loss": 0.9949, + "step": 15670 + }, + { + "epoch": 0.6343585592877378, + "grad_norm": 1.143537163734436, + "learning_rate": 3.731158372909354e-05, + "loss": 0.9549, + "step": 15675 + }, + { + "epoch": 0.6345609065155807, + "grad_norm": 1.2537928819656372, + "learning_rate": 3.729093537063804e-05, + "loss": 1.0404, + "step": 15680 + }, + { + "epoch": 0.6347632537434237, + "grad_norm": 1.2879520654678345, + "learning_rate": 3.727028701218253e-05, + "loss": 1.0428, + "step": 15685 + }, + { + "epoch": 0.6349656009712666, + "grad_norm": 1.1847171783447266, + "learning_rate": 3.724963865372703e-05, + "loss": 0.9985, + "step": 15690 + }, + { + "epoch": 0.6351679481991097, + "grad_norm": 1.1616815328598022, + "learning_rate": 3.722899029527153e-05, + "loss": 0.9958, + "step": 15695 + }, + { + "epoch": 0.6353702954269527, + "grad_norm": 1.2530462741851807, + "learning_rate": 3.720834193681603e-05, + "loss": 1.0043, + "step": 15700 + }, + { + "epoch": 0.6355726426547956, + "grad_norm": 1.148319125175476, + "learning_rate": 3.7187693578360524e-05, + "loss": 0.9363, + "step": 15705 + }, + { + "epoch": 0.6357749898826386, + "grad_norm": 1.1484460830688477, + "learning_rate": 3.7167045219905015e-05, + "loss": 1.0185, + "step": 15710 + }, + { + "epoch": 0.6359773371104815, + "grad_norm": 1.283219575881958, + "learning_rate": 3.714639686144952e-05, + "loss": 0.9784, + "step": 15715 + }, + { + "epoch": 0.6361796843383246, + "grad_norm": 1.1322284936904907, + "learning_rate": 3.712574850299401e-05, + "loss": 0.997, + "step": 15720 + }, + { + "epoch": 0.6363820315661676, + "grad_norm": 1.0504428148269653, + "learning_rate": 3.7105100144538513e-05, + "loss": 1.0123, + "step": 15725 + }, + { + "epoch": 0.6365843787940105, + "grad_norm": 1.1896229982376099, + "learning_rate": 3.708445178608301e-05, + "loss": 0.9817, + "step": 15730 + }, + { + "epoch": 0.6367867260218535, + "grad_norm": 1.301744818687439, + "learning_rate": 3.706380342762751e-05, + "loss": 1.021, + "step": 15735 + }, + { + "epoch": 0.6369890732496964, + "grad_norm": 1.1301766633987427, + "learning_rate": 3.7043155069172005e-05, + "loss": 1.0073, + "step": 15740 + }, + { + "epoch": 0.6371914204775394, + "grad_norm": 1.283570408821106, + "learning_rate": 3.7022506710716496e-05, + "loss": 0.9424, + "step": 15745 + }, + { + "epoch": 0.6373937677053825, + "grad_norm": 1.2190791368484497, + "learning_rate": 3.7001858352261e-05, + "loss": 0.9692, + "step": 15750 + }, + { + "epoch": 0.6375961149332254, + "grad_norm": 1.1517695188522339, + "learning_rate": 3.698120999380549e-05, + "loss": 0.9715, + "step": 15755 + }, + { + "epoch": 0.6377984621610684, + "grad_norm": 1.2132381200790405, + "learning_rate": 3.6960561635349995e-05, + "loss": 1.0206, + "step": 15760 + }, + { + "epoch": 0.6380008093889113, + "grad_norm": 1.2588800191879272, + "learning_rate": 3.693991327689449e-05, + "loss": 0.9958, + "step": 15765 + }, + { + "epoch": 0.6382031566167543, + "grad_norm": 1.2574586868286133, + "learning_rate": 3.691926491843898e-05, + "loss": 1.0069, + "step": 15770 + }, + { + "epoch": 0.6384055038445974, + "grad_norm": 1.2218068838119507, + "learning_rate": 3.6898616559983486e-05, + "loss": 1.0338, + "step": 15775 + }, + { + "epoch": 0.6386078510724403, + "grad_norm": 1.2366846799850464, + "learning_rate": 3.687796820152798e-05, + "loss": 0.9635, + "step": 15780 + }, + { + "epoch": 0.6388101983002833, + "grad_norm": 1.1799321174621582, + "learning_rate": 3.685731984307248e-05, + "loss": 0.9578, + "step": 15785 + }, + { + "epoch": 0.6390125455281263, + "grad_norm": 1.0359095335006714, + "learning_rate": 3.683667148461697e-05, + "loss": 0.9935, + "step": 15790 + }, + { + "epoch": 0.6392148927559692, + "grad_norm": 1.11602783203125, + "learning_rate": 3.681602312616147e-05, + "loss": 1.0288, + "step": 15795 + }, + { + "epoch": 0.6394172399838122, + "grad_norm": 1.1980412006378174, + "learning_rate": 3.679537476770597e-05, + "loss": 1.0089, + "step": 15800 + }, + { + "epoch": 0.6396195872116552, + "grad_norm": 1.22096848487854, + "learning_rate": 3.6774726409250463e-05, + "loss": 0.9314, + "step": 15805 + }, + { + "epoch": 0.6398219344394982, + "grad_norm": 1.2322947978973389, + "learning_rate": 3.675407805079497e-05, + "loss": 1.0157, + "step": 15810 + }, + { + "epoch": 0.6400242816673412, + "grad_norm": 1.2284080982208252, + "learning_rate": 3.673342969233946e-05, + "loss": 0.9889, + "step": 15815 + }, + { + "epoch": 0.6402266288951841, + "grad_norm": 1.2398412227630615, + "learning_rate": 3.6712781333883955e-05, + "loss": 0.9665, + "step": 15820 + }, + { + "epoch": 0.6404289761230271, + "grad_norm": 1.1424891948699951, + "learning_rate": 3.669213297542845e-05, + "loss": 1.009, + "step": 15825 + }, + { + "epoch": 0.6406313233508701, + "grad_norm": 1.210742473602295, + "learning_rate": 3.667148461697295e-05, + "loss": 1.0247, + "step": 15830 + }, + { + "epoch": 0.6408336705787131, + "grad_norm": 1.292371392250061, + "learning_rate": 3.6650836258517454e-05, + "loss": 0.9835, + "step": 15835 + }, + { + "epoch": 0.6410360178065561, + "grad_norm": 1.166009545326233, + "learning_rate": 3.6630187900061944e-05, + "loss": 0.9475, + "step": 15840 + }, + { + "epoch": 0.641238365034399, + "grad_norm": 1.2181354761123657, + "learning_rate": 3.660953954160644e-05, + "loss": 0.9647, + "step": 15845 + }, + { + "epoch": 0.641440712262242, + "grad_norm": 1.1937371492385864, + "learning_rate": 3.658889118315094e-05, + "loss": 1.0044, + "step": 15850 + }, + { + "epoch": 0.6416430594900849, + "grad_norm": 1.2044020891189575, + "learning_rate": 3.6568242824695436e-05, + "loss": 0.9652, + "step": 15855 + }, + { + "epoch": 0.641845406717928, + "grad_norm": 1.1667225360870361, + "learning_rate": 3.654759446623994e-05, + "loss": 1.0825, + "step": 15860 + }, + { + "epoch": 0.642047753945771, + "grad_norm": 1.1601094007492065, + "learning_rate": 3.652694610778443e-05, + "loss": 0.989, + "step": 15865 + }, + { + "epoch": 0.6422501011736139, + "grad_norm": 1.0793761014938354, + "learning_rate": 3.6506297749328935e-05, + "loss": 0.9882, + "step": 15870 + }, + { + "epoch": 0.6424524484014569, + "grad_norm": 1.2860372066497803, + "learning_rate": 3.6485649390873426e-05, + "loss": 0.9862, + "step": 15875 + }, + { + "epoch": 0.6426547956292998, + "grad_norm": 1.1094404458999634, + "learning_rate": 3.646500103241792e-05, + "loss": 0.9718, + "step": 15880 + }, + { + "epoch": 0.6428571428571429, + "grad_norm": 1.167775273323059, + "learning_rate": 3.644435267396242e-05, + "loss": 0.9491, + "step": 15885 + }, + { + "epoch": 0.6430594900849859, + "grad_norm": 1.2569758892059326, + "learning_rate": 3.642370431550692e-05, + "loss": 0.9739, + "step": 15890 + }, + { + "epoch": 0.6432618373128288, + "grad_norm": 1.122658371925354, + "learning_rate": 3.640305595705142e-05, + "loss": 1.0148, + "step": 15895 + }, + { + "epoch": 0.6434641845406718, + "grad_norm": 1.1850612163543701, + "learning_rate": 3.638240759859591e-05, + "loss": 1.0014, + "step": 15900 + }, + { + "epoch": 0.6436665317685147, + "grad_norm": 1.2730854749679565, + "learning_rate": 3.636175924014041e-05, + "loss": 0.9741, + "step": 15905 + }, + { + "epoch": 0.6438688789963577, + "grad_norm": 1.190268635749817, + "learning_rate": 3.634111088168491e-05, + "loss": 1.0011, + "step": 15910 + }, + { + "epoch": 0.6440712262242008, + "grad_norm": 1.2439221143722534, + "learning_rate": 3.6320462523229404e-05, + "loss": 0.9201, + "step": 15915 + }, + { + "epoch": 0.6442735734520437, + "grad_norm": 1.273469090461731, + "learning_rate": 3.62998141647739e-05, + "loss": 1.0155, + "step": 15920 + }, + { + "epoch": 0.6444759206798867, + "grad_norm": 1.2768328189849854, + "learning_rate": 3.62791658063184e-05, + "loss": 1.0094, + "step": 15925 + }, + { + "epoch": 0.6446782679077296, + "grad_norm": 1.1529165506362915, + "learning_rate": 3.6258517447862896e-05, + "loss": 1.0457, + "step": 15930 + }, + { + "epoch": 0.6448806151355726, + "grad_norm": 1.2379252910614014, + "learning_rate": 3.623786908940739e-05, + "loss": 0.9919, + "step": 15935 + }, + { + "epoch": 0.6450829623634157, + "grad_norm": 1.090659260749817, + "learning_rate": 3.621722073095189e-05, + "loss": 1.0371, + "step": 15940 + }, + { + "epoch": 0.6452853095912586, + "grad_norm": 1.0815500020980835, + "learning_rate": 3.619657237249639e-05, + "loss": 0.9732, + "step": 15945 + }, + { + "epoch": 0.6454876568191016, + "grad_norm": 1.257379412651062, + "learning_rate": 3.6175924014040885e-05, + "loss": 0.9469, + "step": 15950 + }, + { + "epoch": 0.6456900040469445, + "grad_norm": 1.1595783233642578, + "learning_rate": 3.615527565558538e-05, + "loss": 1.0029, + "step": 15955 + }, + { + "epoch": 0.6458923512747875, + "grad_norm": 1.131295919418335, + "learning_rate": 3.613462729712988e-05, + "loss": 1.0195, + "step": 15960 + }, + { + "epoch": 0.6460946985026305, + "grad_norm": 1.174517035484314, + "learning_rate": 3.611397893867438e-05, + "loss": 0.9715, + "step": 15965 + }, + { + "epoch": 0.6462970457304735, + "grad_norm": 1.2053883075714111, + "learning_rate": 3.6093330580218874e-05, + "loss": 0.971, + "step": 15970 + }, + { + "epoch": 0.6464993929583165, + "grad_norm": 1.260723352432251, + "learning_rate": 3.607268222176337e-05, + "loss": 0.9793, + "step": 15975 + }, + { + "epoch": 0.6467017401861594, + "grad_norm": 1.025206446647644, + "learning_rate": 3.605203386330787e-05, + "loss": 0.9617, + "step": 15980 + }, + { + "epoch": 0.6469040874140024, + "grad_norm": 1.1306520700454712, + "learning_rate": 3.6031385504852366e-05, + "loss": 0.9441, + "step": 15985 + }, + { + "epoch": 0.6471064346418454, + "grad_norm": 1.1041451692581177, + "learning_rate": 3.6010737146396863e-05, + "loss": 1.0318, + "step": 15990 + }, + { + "epoch": 0.6473087818696884, + "grad_norm": 1.3494863510131836, + "learning_rate": 3.599008878794136e-05, + "loss": 0.9976, + "step": 15995 + }, + { + "epoch": 0.6475111290975314, + "grad_norm": 1.1904017925262451, + "learning_rate": 3.596944042948586e-05, + "loss": 1.0042, + "step": 16000 + }, + { + "epoch": 0.6477134763253743, + "grad_norm": 1.2178298234939575, + "learning_rate": 3.5948792071030355e-05, + "loss": 1.0431, + "step": 16005 + }, + { + "epoch": 0.6479158235532173, + "grad_norm": 1.104743242263794, + "learning_rate": 3.592814371257485e-05, + "loss": 0.9755, + "step": 16010 + }, + { + "epoch": 0.6481181707810603, + "grad_norm": 1.3872283697128296, + "learning_rate": 3.590749535411935e-05, + "loss": 0.9946, + "step": 16015 + }, + { + "epoch": 0.6483205180089032, + "grad_norm": 1.269563913345337, + "learning_rate": 3.588684699566385e-05, + "loss": 1.007, + "step": 16020 + }, + { + "epoch": 0.6485228652367463, + "grad_norm": 1.172032356262207, + "learning_rate": 3.5866198637208344e-05, + "loss": 0.9503, + "step": 16025 + }, + { + "epoch": 0.6487252124645893, + "grad_norm": 1.1208676099777222, + "learning_rate": 3.584555027875284e-05, + "loss": 0.9644, + "step": 16030 + }, + { + "epoch": 0.6489275596924322, + "grad_norm": 1.1338741779327393, + "learning_rate": 3.582490192029734e-05, + "loss": 0.9777, + "step": 16035 + }, + { + "epoch": 0.6491299069202752, + "grad_norm": 1.1406617164611816, + "learning_rate": 3.580425356184183e-05, + "loss": 0.9968, + "step": 16040 + }, + { + "epoch": 0.6493322541481181, + "grad_norm": 1.2349566221237183, + "learning_rate": 3.5783605203386334e-05, + "loss": 0.9214, + "step": 16045 + }, + { + "epoch": 0.6495346013759612, + "grad_norm": 1.1992546319961548, + "learning_rate": 3.576295684493083e-05, + "loss": 0.9601, + "step": 16050 + }, + { + "epoch": 0.6497369486038042, + "grad_norm": 1.1756788492202759, + "learning_rate": 3.574230848647533e-05, + "loss": 0.9593, + "step": 16055 + }, + { + "epoch": 0.6499392958316471, + "grad_norm": 1.2423843145370483, + "learning_rate": 3.5721660128019826e-05, + "loss": 1.0072, + "step": 16060 + }, + { + "epoch": 0.6501416430594901, + "grad_norm": 1.2211461067199707, + "learning_rate": 3.5701011769564316e-05, + "loss": 0.9781, + "step": 16065 + }, + { + "epoch": 0.650343990287333, + "grad_norm": 1.2080109119415283, + "learning_rate": 3.568036341110882e-05, + "loss": 0.9469, + "step": 16070 + }, + { + "epoch": 0.650546337515176, + "grad_norm": 1.2381408214569092, + "learning_rate": 3.565971505265332e-05, + "loss": 0.9369, + "step": 16075 + }, + { + "epoch": 0.6507486847430191, + "grad_norm": 1.21284019947052, + "learning_rate": 3.5639066694197815e-05, + "loss": 1.0199, + "step": 16080 + }, + { + "epoch": 0.650951031970862, + "grad_norm": 1.2571697235107422, + "learning_rate": 3.561841833574231e-05, + "loss": 0.9258, + "step": 16085 + }, + { + "epoch": 0.651153379198705, + "grad_norm": 1.1796058416366577, + "learning_rate": 3.559776997728681e-05, + "loss": 0.9726, + "step": 16090 + }, + { + "epoch": 0.6513557264265479, + "grad_norm": 1.285386085510254, + "learning_rate": 3.557712161883131e-05, + "loss": 0.9783, + "step": 16095 + }, + { + "epoch": 0.6515580736543909, + "grad_norm": 1.1672552824020386, + "learning_rate": 3.55564732603758e-05, + "loss": 0.9669, + "step": 16100 + }, + { + "epoch": 0.651760420882234, + "grad_norm": 1.1819517612457275, + "learning_rate": 3.55358249019203e-05, + "loss": 0.936, + "step": 16105 + }, + { + "epoch": 0.6519627681100769, + "grad_norm": 1.2727749347686768, + "learning_rate": 3.55151765434648e-05, + "loss": 0.9648, + "step": 16110 + }, + { + "epoch": 0.6521651153379199, + "grad_norm": 1.2024821043014526, + "learning_rate": 3.5494528185009296e-05, + "loss": 0.9441, + "step": 16115 + }, + { + "epoch": 0.6523674625657628, + "grad_norm": 1.1411044597625732, + "learning_rate": 3.547387982655379e-05, + "loss": 1.0269, + "step": 16120 + }, + { + "epoch": 0.6525698097936058, + "grad_norm": 1.1624733209609985, + "learning_rate": 3.5453231468098284e-05, + "loss": 0.989, + "step": 16125 + }, + { + "epoch": 0.6527721570214488, + "grad_norm": 1.2555545568466187, + "learning_rate": 3.543258310964279e-05, + "loss": 1.0641, + "step": 16130 + }, + { + "epoch": 0.6529745042492918, + "grad_norm": 1.3429213762283325, + "learning_rate": 3.541193475118728e-05, + "loss": 0.9779, + "step": 16135 + }, + { + "epoch": 0.6531768514771348, + "grad_norm": 1.2503955364227295, + "learning_rate": 3.539128639273178e-05, + "loss": 0.9847, + "step": 16140 + }, + { + "epoch": 0.6533791987049777, + "grad_norm": 1.2091625928878784, + "learning_rate": 3.537063803427628e-05, + "loss": 1.0222, + "step": 16145 + }, + { + "epoch": 0.6535815459328207, + "grad_norm": 1.1267467737197876, + "learning_rate": 3.534998967582077e-05, + "loss": 0.979, + "step": 16150 + }, + { + "epoch": 0.6537838931606637, + "grad_norm": 1.1542922258377075, + "learning_rate": 3.5329341317365274e-05, + "loss": 0.9906, + "step": 16155 + }, + { + "epoch": 0.6539862403885067, + "grad_norm": 1.2290841341018677, + "learning_rate": 3.5308692958909765e-05, + "loss": 0.9737, + "step": 16160 + }, + { + "epoch": 0.6541885876163497, + "grad_norm": 1.2749171257019043, + "learning_rate": 3.528804460045427e-05, + "loss": 0.9836, + "step": 16165 + }, + { + "epoch": 0.6543909348441926, + "grad_norm": 1.2310715913772583, + "learning_rate": 3.526739624199876e-05, + "loss": 1.0134, + "step": 16170 + }, + { + "epoch": 0.6545932820720356, + "grad_norm": 1.1757038831710815, + "learning_rate": 3.5246747883543257e-05, + "loss": 0.9772, + "step": 16175 + }, + { + "epoch": 0.6547956292998786, + "grad_norm": 1.2128708362579346, + "learning_rate": 3.522609952508776e-05, + "loss": 1.0178, + "step": 16180 + }, + { + "epoch": 0.6549979765277215, + "grad_norm": 1.2154395580291748, + "learning_rate": 3.520545116663225e-05, + "loss": 0.9508, + "step": 16185 + }, + { + "epoch": 0.6552003237555646, + "grad_norm": 1.1376159191131592, + "learning_rate": 3.5184802808176755e-05, + "loss": 0.9483, + "step": 16190 + }, + { + "epoch": 0.6554026709834075, + "grad_norm": 1.3748664855957031, + "learning_rate": 3.5164154449721246e-05, + "loss": 1.0099, + "step": 16195 + }, + { + "epoch": 0.6556050182112505, + "grad_norm": 1.1054377555847168, + "learning_rate": 3.514350609126575e-05, + "loss": 0.9903, + "step": 16200 + }, + { + "epoch": 0.6558073654390935, + "grad_norm": 1.235331416130066, + "learning_rate": 3.512285773281024e-05, + "loss": 1.0068, + "step": 16205 + }, + { + "epoch": 0.6560097126669364, + "grad_norm": 1.1809173822402954, + "learning_rate": 3.510220937435474e-05, + "loss": 1.0089, + "step": 16210 + }, + { + "epoch": 0.6562120598947795, + "grad_norm": 1.1006264686584473, + "learning_rate": 3.508156101589924e-05, + "loss": 0.9753, + "step": 16215 + }, + { + "epoch": 0.6564144071226224, + "grad_norm": 1.0573210716247559, + "learning_rate": 3.506091265744373e-05, + "loss": 0.9545, + "step": 16220 + }, + { + "epoch": 0.6566167543504654, + "grad_norm": 1.215930700302124, + "learning_rate": 3.5040264298988236e-05, + "loss": 1.0469, + "step": 16225 + }, + { + "epoch": 0.6568191015783084, + "grad_norm": 1.1159571409225464, + "learning_rate": 3.501961594053273e-05, + "loss": 0.9961, + "step": 16230 + }, + { + "epoch": 0.6570214488061513, + "grad_norm": 1.2668739557266235, + "learning_rate": 3.4998967582077224e-05, + "loss": 0.9515, + "step": 16235 + }, + { + "epoch": 0.6572237960339944, + "grad_norm": 1.1844149827957153, + "learning_rate": 3.497831922362173e-05, + "loss": 1.0669, + "step": 16240 + }, + { + "epoch": 0.6574261432618373, + "grad_norm": 1.2933884859085083, + "learning_rate": 3.495767086516622e-05, + "loss": 1.0009, + "step": 16245 + }, + { + "epoch": 0.6576284904896803, + "grad_norm": 1.1424107551574707, + "learning_rate": 3.493702250671072e-05, + "loss": 0.9989, + "step": 16250 + }, + { + "epoch": 0.6578308377175233, + "grad_norm": 1.184024691581726, + "learning_rate": 3.491637414825521e-05, + "loss": 1.0101, + "step": 16255 + }, + { + "epoch": 0.6580331849453662, + "grad_norm": 1.1887484788894653, + "learning_rate": 3.489572578979971e-05, + "loss": 0.9734, + "step": 16260 + }, + { + "epoch": 0.6582355321732092, + "grad_norm": 1.2848747968673706, + "learning_rate": 3.487507743134421e-05, + "loss": 1.0054, + "step": 16265 + }, + { + "epoch": 0.6584378794010523, + "grad_norm": 1.1876070499420166, + "learning_rate": 3.4854429072888705e-05, + "loss": 0.9477, + "step": 16270 + }, + { + "epoch": 0.6586402266288952, + "grad_norm": 1.2064857482910156, + "learning_rate": 3.483378071443321e-05, + "loss": 0.9756, + "step": 16275 + }, + { + "epoch": 0.6588425738567382, + "grad_norm": 1.1177282333374023, + "learning_rate": 3.48131323559777e-05, + "loss": 0.9524, + "step": 16280 + }, + { + "epoch": 0.6590449210845811, + "grad_norm": 1.2096445560455322, + "learning_rate": 3.47924839975222e-05, + "loss": 1.0441, + "step": 16285 + }, + { + "epoch": 0.6592472683124241, + "grad_norm": 1.1764529943466187, + "learning_rate": 3.4771835639066694e-05, + "loss": 0.9431, + "step": 16290 + }, + { + "epoch": 0.6594496155402672, + "grad_norm": 1.1835061311721802, + "learning_rate": 3.475118728061119e-05, + "loss": 0.9521, + "step": 16295 + }, + { + "epoch": 0.6596519627681101, + "grad_norm": 1.1990066766738892, + "learning_rate": 3.473053892215569e-05, + "loss": 1.0344, + "step": 16300 + }, + { + "epoch": 0.6598543099959531, + "grad_norm": 1.2333585023880005, + "learning_rate": 3.4709890563700186e-05, + "loss": 1.0388, + "step": 16305 + }, + { + "epoch": 0.660056657223796, + "grad_norm": 1.1921073198318481, + "learning_rate": 3.4689242205244684e-05, + "loss": 0.958, + "step": 16310 + }, + { + "epoch": 0.660259004451639, + "grad_norm": 1.2137154340744019, + "learning_rate": 3.466859384678918e-05, + "loss": 1.024, + "step": 16315 + }, + { + "epoch": 0.660461351679482, + "grad_norm": 1.1804203987121582, + "learning_rate": 3.464794548833368e-05, + "loss": 1.0156, + "step": 16320 + }, + { + "epoch": 0.660663698907325, + "grad_norm": 1.2474133968353271, + "learning_rate": 3.4627297129878176e-05, + "loss": 0.9971, + "step": 16325 + }, + { + "epoch": 0.660866046135168, + "grad_norm": 1.1229432821273804, + "learning_rate": 3.460664877142267e-05, + "loss": 0.9787, + "step": 16330 + }, + { + "epoch": 0.6610683933630109, + "grad_norm": 1.2054779529571533, + "learning_rate": 3.458600041296717e-05, + "loss": 0.9773, + "step": 16335 + }, + { + "epoch": 0.6612707405908539, + "grad_norm": 1.221892237663269, + "learning_rate": 3.456535205451167e-05, + "loss": 0.9425, + "step": 16340 + }, + { + "epoch": 0.6614730878186968, + "grad_norm": 1.3123283386230469, + "learning_rate": 3.4544703696056165e-05, + "loss": 1.0315, + "step": 16345 + }, + { + "epoch": 0.6616754350465399, + "grad_norm": 1.3071208000183105, + "learning_rate": 3.452405533760066e-05, + "loss": 1.0287, + "step": 16350 + }, + { + "epoch": 0.6618777822743829, + "grad_norm": 1.1664862632751465, + "learning_rate": 3.450340697914516e-05, + "loss": 1.0081, + "step": 16355 + }, + { + "epoch": 0.6620801295022258, + "grad_norm": 1.2433518171310425, + "learning_rate": 3.4482758620689657e-05, + "loss": 1.0246, + "step": 16360 + }, + { + "epoch": 0.6622824767300688, + "grad_norm": 1.2965155839920044, + "learning_rate": 3.4462110262234154e-05, + "loss": 0.9523, + "step": 16365 + }, + { + "epoch": 0.6624848239579118, + "grad_norm": 1.2354497909545898, + "learning_rate": 3.444146190377865e-05, + "loss": 1.0091, + "step": 16370 + }, + { + "epoch": 0.6626871711857547, + "grad_norm": 1.1297180652618408, + "learning_rate": 3.442081354532315e-05, + "loss": 1.0102, + "step": 16375 + }, + { + "epoch": 0.6628895184135978, + "grad_norm": 1.1552640199661255, + "learning_rate": 3.4400165186867646e-05, + "loss": 0.976, + "step": 16380 + }, + { + "epoch": 0.6630918656414407, + "grad_norm": 1.1431080102920532, + "learning_rate": 3.437951682841214e-05, + "loss": 0.9939, + "step": 16385 + }, + { + "epoch": 0.6632942128692837, + "grad_norm": 1.3781251907348633, + "learning_rate": 3.435886846995664e-05, + "loss": 0.98, + "step": 16390 + }, + { + "epoch": 0.6634965600971267, + "grad_norm": 1.2489134073257446, + "learning_rate": 3.433822011150114e-05, + "loss": 0.9347, + "step": 16395 + }, + { + "epoch": 0.6636989073249696, + "grad_norm": 1.084082841873169, + "learning_rate": 3.4317571753045635e-05, + "loss": 0.9716, + "step": 16400 + }, + { + "epoch": 0.6639012545528127, + "grad_norm": 1.173354148864746, + "learning_rate": 3.429692339459013e-05, + "loss": 0.9983, + "step": 16405 + }, + { + "epoch": 0.6641036017806556, + "grad_norm": 1.2579299211502075, + "learning_rate": 3.427627503613463e-05, + "loss": 0.9981, + "step": 16410 + }, + { + "epoch": 0.6643059490084986, + "grad_norm": 1.2689281702041626, + "learning_rate": 3.425562667767913e-05, + "loss": 1.0367, + "step": 16415 + }, + { + "epoch": 0.6645082962363416, + "grad_norm": 1.1956695318222046, + "learning_rate": 3.423497831922362e-05, + "loss": 1.0017, + "step": 16420 + }, + { + "epoch": 0.6647106434641845, + "grad_norm": 1.1953344345092773, + "learning_rate": 3.421432996076812e-05, + "loss": 0.9572, + "step": 16425 + }, + { + "epoch": 0.6649129906920275, + "grad_norm": 1.1050218343734741, + "learning_rate": 3.419368160231262e-05, + "loss": 0.9739, + "step": 16430 + }, + { + "epoch": 0.6651153379198705, + "grad_norm": 1.2522861957550049, + "learning_rate": 3.4173033243857116e-05, + "loss": 0.9683, + "step": 16435 + }, + { + "epoch": 0.6653176851477135, + "grad_norm": 1.4148693084716797, + "learning_rate": 3.415238488540161e-05, + "loss": 0.9713, + "step": 16440 + }, + { + "epoch": 0.6655200323755565, + "grad_norm": 1.3018653392791748, + "learning_rate": 3.413173652694611e-05, + "loss": 0.9295, + "step": 16445 + }, + { + "epoch": 0.6657223796033994, + "grad_norm": 1.2819592952728271, + "learning_rate": 3.411108816849061e-05, + "loss": 0.973, + "step": 16450 + }, + { + "epoch": 0.6659247268312424, + "grad_norm": 1.1259995698928833, + "learning_rate": 3.4090439810035105e-05, + "loss": 0.9863, + "step": 16455 + }, + { + "epoch": 0.6661270740590854, + "grad_norm": 1.1015294790267944, + "learning_rate": 3.40697914515796e-05, + "loss": 1.0036, + "step": 16460 + }, + { + "epoch": 0.6663294212869284, + "grad_norm": 1.012439489364624, + "learning_rate": 3.40491430931241e-05, + "loss": 1.0279, + "step": 16465 + }, + { + "epoch": 0.6665317685147714, + "grad_norm": 1.157087802886963, + "learning_rate": 3.40284947346686e-05, + "loss": 1.0178, + "step": 16470 + }, + { + "epoch": 0.6667341157426143, + "grad_norm": 1.246281623840332, + "learning_rate": 3.4007846376213094e-05, + "loss": 1.0249, + "step": 16475 + }, + { + "epoch": 0.6669364629704573, + "grad_norm": 1.208592414855957, + "learning_rate": 3.3987198017757585e-05, + "loss": 0.9562, + "step": 16480 + }, + { + "epoch": 0.6671388101983002, + "grad_norm": 1.1817739009857178, + "learning_rate": 3.396654965930209e-05, + "loss": 1.0095, + "step": 16485 + }, + { + "epoch": 0.6673411574261433, + "grad_norm": 1.2455251216888428, + "learning_rate": 3.3945901300846586e-05, + "loss": 1.0175, + "step": 16490 + }, + { + "epoch": 0.6675435046539863, + "grad_norm": 1.1381094455718994, + "learning_rate": 3.3925252942391084e-05, + "loss": 1.0171, + "step": 16495 + }, + { + "epoch": 0.6677458518818292, + "grad_norm": 1.1386624574661255, + "learning_rate": 3.390460458393558e-05, + "loss": 1.0674, + "step": 16500 + }, + { + "epoch": 0.6679481991096722, + "grad_norm": 1.1103817224502563, + "learning_rate": 3.388395622548007e-05, + "loss": 0.9728, + "step": 16505 + }, + { + "epoch": 0.6681505463375151, + "grad_norm": 1.268618106842041, + "learning_rate": 3.3863307867024575e-05, + "loss": 1.006, + "step": 16510 + }, + { + "epoch": 0.6683528935653582, + "grad_norm": 1.1923140287399292, + "learning_rate": 3.3842659508569066e-05, + "loss": 0.9763, + "step": 16515 + }, + { + "epoch": 0.6685552407932012, + "grad_norm": 1.3593353033065796, + "learning_rate": 3.382201115011357e-05, + "loss": 1.0071, + "step": 16520 + }, + { + "epoch": 0.6687575880210441, + "grad_norm": 1.3046915531158447, + "learning_rate": 3.380136279165807e-05, + "loss": 1.0135, + "step": 16525 + }, + { + "epoch": 0.6689599352488871, + "grad_norm": 1.217355489730835, + "learning_rate": 3.378071443320256e-05, + "loss": 0.9486, + "step": 16530 + }, + { + "epoch": 0.66916228247673, + "grad_norm": 1.1550461053848267, + "learning_rate": 3.376006607474706e-05, + "loss": 0.9181, + "step": 16535 + }, + { + "epoch": 0.669364629704573, + "grad_norm": 1.2547290325164795, + "learning_rate": 3.373941771629155e-05, + "loss": 1.0339, + "step": 16540 + }, + { + "epoch": 0.6695669769324161, + "grad_norm": 1.3131098747253418, + "learning_rate": 3.3718769357836057e-05, + "loss": 0.9527, + "step": 16545 + }, + { + "epoch": 0.669769324160259, + "grad_norm": 1.1808663606643677, + "learning_rate": 3.369812099938055e-05, + "loss": 1.0197, + "step": 16550 + }, + { + "epoch": 0.669971671388102, + "grad_norm": 1.25763738155365, + "learning_rate": 3.367747264092505e-05, + "loss": 0.9644, + "step": 16555 + }, + { + "epoch": 0.6701740186159449, + "grad_norm": 1.1556329727172852, + "learning_rate": 3.365682428246955e-05, + "loss": 0.9838, + "step": 16560 + }, + { + "epoch": 0.6703763658437879, + "grad_norm": 1.2575442790985107, + "learning_rate": 3.363617592401404e-05, + "loss": 0.9805, + "step": 16565 + }, + { + "epoch": 0.670578713071631, + "grad_norm": 1.1899510622024536, + "learning_rate": 3.361552756555854e-05, + "loss": 0.9748, + "step": 16570 + }, + { + "epoch": 0.6707810602994739, + "grad_norm": 1.3026821613311768, + "learning_rate": 3.3594879207103034e-05, + "loss": 0.9706, + "step": 16575 + }, + { + "epoch": 0.6709834075273169, + "grad_norm": 1.2790369987487793, + "learning_rate": 3.357423084864754e-05, + "loss": 1.0038, + "step": 16580 + }, + { + "epoch": 0.6711857547551598, + "grad_norm": 1.2287906408309937, + "learning_rate": 3.355358249019203e-05, + "loss": 1.0712, + "step": 16585 + }, + { + "epoch": 0.6713881019830028, + "grad_norm": 1.1447017192840576, + "learning_rate": 3.3532934131736525e-05, + "loss": 1.0111, + "step": 16590 + }, + { + "epoch": 0.6715904492108458, + "grad_norm": 1.4328875541687012, + "learning_rate": 3.351228577328103e-05, + "loss": 1.0095, + "step": 16595 + }, + { + "epoch": 0.6717927964386888, + "grad_norm": 1.0844640731811523, + "learning_rate": 3.349163741482552e-05, + "loss": 0.9768, + "step": 16600 + }, + { + "epoch": 0.6719951436665318, + "grad_norm": 1.2308974266052246, + "learning_rate": 3.3470989056370024e-05, + "loss": 1.094, + "step": 16605 + }, + { + "epoch": 0.6721974908943747, + "grad_norm": 1.1745285987854004, + "learning_rate": 3.3450340697914515e-05, + "loss": 1.0043, + "step": 16610 + }, + { + "epoch": 0.6723998381222177, + "grad_norm": 1.2722959518432617, + "learning_rate": 3.342969233945901e-05, + "loss": 0.9747, + "step": 16615 + }, + { + "epoch": 0.6726021853500607, + "grad_norm": 1.318652868270874, + "learning_rate": 3.3409043981003516e-05, + "loss": 1.0501, + "step": 16620 + }, + { + "epoch": 0.6728045325779037, + "grad_norm": 1.0574398040771484, + "learning_rate": 3.3388395622548007e-05, + "loss": 0.9792, + "step": 16625 + }, + { + "epoch": 0.6730068798057467, + "grad_norm": 1.241368055343628, + "learning_rate": 3.336774726409251e-05, + "loss": 0.9847, + "step": 16630 + }, + { + "epoch": 0.6732092270335897, + "grad_norm": 1.2177891731262207, + "learning_rate": 3.3347098905637e-05, + "loss": 1.0106, + "step": 16635 + }, + { + "epoch": 0.6734115742614326, + "grad_norm": 1.1439677476882935, + "learning_rate": 3.33264505471815e-05, + "loss": 1.0239, + "step": 16640 + }, + { + "epoch": 0.6736139214892756, + "grad_norm": 1.3419727087020874, + "learning_rate": 3.3305802188725996e-05, + "loss": 0.9621, + "step": 16645 + }, + { + "epoch": 0.6738162687171185, + "grad_norm": 1.226435661315918, + "learning_rate": 3.328515383027049e-05, + "loss": 1.0139, + "step": 16650 + }, + { + "epoch": 0.6740186159449616, + "grad_norm": 1.2880449295043945, + "learning_rate": 3.3264505471815e-05, + "loss": 0.9398, + "step": 16655 + }, + { + "epoch": 0.6742209631728046, + "grad_norm": 1.1530487537384033, + "learning_rate": 3.324385711335949e-05, + "loss": 1.005, + "step": 16660 + }, + { + "epoch": 0.6744233104006475, + "grad_norm": 1.1972562074661255, + "learning_rate": 3.322320875490399e-05, + "loss": 0.9762, + "step": 16665 + }, + { + "epoch": 0.6746256576284905, + "grad_norm": 1.2207541465759277, + "learning_rate": 3.320256039644848e-05, + "loss": 0.9496, + "step": 16670 + }, + { + "epoch": 0.6748280048563334, + "grad_norm": 1.1928716897964478, + "learning_rate": 3.318191203799298e-05, + "loss": 1.0167, + "step": 16675 + }, + { + "epoch": 0.6750303520841765, + "grad_norm": 1.2012739181518555, + "learning_rate": 3.316126367953748e-05, + "loss": 0.9725, + "step": 16680 + }, + { + "epoch": 0.6752326993120195, + "grad_norm": 1.3488842248916626, + "learning_rate": 3.3140615321081974e-05, + "loss": 0.9615, + "step": 16685 + }, + { + "epoch": 0.6754350465398624, + "grad_norm": 1.1975066661834717, + "learning_rate": 3.311996696262648e-05, + "loss": 0.9524, + "step": 16690 + }, + { + "epoch": 0.6756373937677054, + "grad_norm": 1.2017061710357666, + "learning_rate": 3.309931860417097e-05, + "loss": 0.9714, + "step": 16695 + }, + { + "epoch": 0.6758397409955483, + "grad_norm": 1.1276466846466064, + "learning_rate": 3.3078670245715466e-05, + "loss": 0.9561, + "step": 16700 + }, + { + "epoch": 0.6760420882233913, + "grad_norm": 1.1216696500778198, + "learning_rate": 3.305802188725996e-05, + "loss": 0.9854, + "step": 16705 + }, + { + "epoch": 0.6762444354512344, + "grad_norm": 1.143851399421692, + "learning_rate": 3.303737352880446e-05, + "loss": 1.0552, + "step": 16710 + }, + { + "epoch": 0.6764467826790773, + "grad_norm": 1.0941919088363647, + "learning_rate": 3.301672517034896e-05, + "loss": 1.0036, + "step": 16715 + }, + { + "epoch": 0.6766491299069203, + "grad_norm": 1.1506640911102295, + "learning_rate": 3.2996076811893455e-05, + "loss": 0.9902, + "step": 16720 + }, + { + "epoch": 0.6768514771347632, + "grad_norm": 1.1210603713989258, + "learning_rate": 3.297542845343795e-05, + "loss": 0.9526, + "step": 16725 + }, + { + "epoch": 0.6770538243626062, + "grad_norm": 1.1224634647369385, + "learning_rate": 3.295478009498245e-05, + "loss": 0.9588, + "step": 16730 + }, + { + "epoch": 0.6772561715904493, + "grad_norm": 1.122049331665039, + "learning_rate": 3.293413173652695e-05, + "loss": 1.0602, + "step": 16735 + }, + { + "epoch": 0.6774585188182922, + "grad_norm": 1.176449179649353, + "learning_rate": 3.2913483378071444e-05, + "loss": 0.9677, + "step": 16740 + }, + { + "epoch": 0.6776608660461352, + "grad_norm": 1.2817918062210083, + "learning_rate": 3.289283501961594e-05, + "loss": 1.0685, + "step": 16745 + }, + { + "epoch": 0.6778632132739781, + "grad_norm": 1.1789745092391968, + "learning_rate": 3.287218666116044e-05, + "loss": 0.976, + "step": 16750 + }, + { + "epoch": 0.6780655605018211, + "grad_norm": 1.095564842224121, + "learning_rate": 3.2851538302704936e-05, + "loss": 0.9916, + "step": 16755 + }, + { + "epoch": 0.678267907729664, + "grad_norm": 1.1979174613952637, + "learning_rate": 3.2830889944249434e-05, + "loss": 1.0478, + "step": 16760 + }, + { + "epoch": 0.6784702549575071, + "grad_norm": 1.3009766340255737, + "learning_rate": 3.281024158579393e-05, + "loss": 0.9987, + "step": 16765 + }, + { + "epoch": 0.6786726021853501, + "grad_norm": 1.1497248411178589, + "learning_rate": 3.278959322733843e-05, + "loss": 0.9367, + "step": 16770 + }, + { + "epoch": 0.678874949413193, + "grad_norm": 1.120176076889038, + "learning_rate": 3.2768944868882925e-05, + "loss": 1.019, + "step": 16775 + }, + { + "epoch": 0.679077296641036, + "grad_norm": 1.2681643962860107, + "learning_rate": 3.274829651042742e-05, + "loss": 0.9813, + "step": 16780 + }, + { + "epoch": 0.679279643868879, + "grad_norm": 1.1057486534118652, + "learning_rate": 3.272764815197192e-05, + "loss": 0.9439, + "step": 16785 + }, + { + "epoch": 0.679481991096722, + "grad_norm": 1.1760663986206055, + "learning_rate": 3.270699979351642e-05, + "loss": 0.9532, + "step": 16790 + }, + { + "epoch": 0.679684338324565, + "grad_norm": 1.2541499137878418, + "learning_rate": 3.2686351435060915e-05, + "loss": 0.9787, + "step": 16795 + }, + { + "epoch": 0.6798866855524079, + "grad_norm": 1.2193784713745117, + "learning_rate": 3.266570307660541e-05, + "loss": 0.9828, + "step": 16800 + }, + { + "epoch": 0.6800890327802509, + "grad_norm": 1.1541152000427246, + "learning_rate": 3.264505471814991e-05, + "loss": 1.0061, + "step": 16805 + }, + { + "epoch": 0.6802913800080939, + "grad_norm": 1.0843942165374756, + "learning_rate": 3.2624406359694407e-05, + "loss": 0.9739, + "step": 16810 + }, + { + "epoch": 0.6804937272359368, + "grad_norm": 1.2362585067749023, + "learning_rate": 3.2603758001238904e-05, + "loss": 0.9515, + "step": 16815 + }, + { + "epoch": 0.6806960744637799, + "grad_norm": 1.2097837924957275, + "learning_rate": 3.25831096427834e-05, + "loss": 0.9912, + "step": 16820 + }, + { + "epoch": 0.6808984216916228, + "grad_norm": 1.1431483030319214, + "learning_rate": 3.25624612843279e-05, + "loss": 0.9931, + "step": 16825 + }, + { + "epoch": 0.6811007689194658, + "grad_norm": 1.1017531156539917, + "learning_rate": 3.2541812925872396e-05, + "loss": 0.9941, + "step": 16830 + }, + { + "epoch": 0.6813031161473088, + "grad_norm": 1.1678105592727661, + "learning_rate": 3.252116456741689e-05, + "loss": 0.9737, + "step": 16835 + }, + { + "epoch": 0.6815054633751517, + "grad_norm": 1.2099162340164185, + "learning_rate": 3.250051620896139e-05, + "loss": 0.9884, + "step": 16840 + }, + { + "epoch": 0.6817078106029948, + "grad_norm": 1.3309848308563232, + "learning_rate": 3.247986785050589e-05, + "loss": 1.0069, + "step": 16845 + }, + { + "epoch": 0.6819101578308377, + "grad_norm": 1.2974705696105957, + "learning_rate": 3.2459219492050385e-05, + "loss": 1.0386, + "step": 16850 + }, + { + "epoch": 0.6821125050586807, + "grad_norm": 1.214913249015808, + "learning_rate": 3.243857113359488e-05, + "loss": 1.0237, + "step": 16855 + }, + { + "epoch": 0.6823148522865237, + "grad_norm": 1.3377631902694702, + "learning_rate": 3.241792277513937e-05, + "loss": 1.0285, + "step": 16860 + }, + { + "epoch": 0.6825171995143666, + "grad_norm": 1.264298439025879, + "learning_rate": 3.239727441668388e-05, + "loss": 1.0209, + "step": 16865 + }, + { + "epoch": 0.6827195467422096, + "grad_norm": 1.1477164030075073, + "learning_rate": 3.2376626058228374e-05, + "loss": 0.9898, + "step": 16870 + }, + { + "epoch": 0.6829218939700527, + "grad_norm": 1.2462587356567383, + "learning_rate": 3.235597769977287e-05, + "loss": 0.9754, + "step": 16875 + }, + { + "epoch": 0.6831242411978956, + "grad_norm": 1.2337095737457275, + "learning_rate": 3.233532934131737e-05, + "loss": 0.9265, + "step": 16880 + }, + { + "epoch": 0.6833265884257386, + "grad_norm": 1.1416597366333008, + "learning_rate": 3.231468098286186e-05, + "loss": 1.0265, + "step": 16885 + }, + { + "epoch": 0.6835289356535815, + "grad_norm": 1.1606651544570923, + "learning_rate": 3.229403262440636e-05, + "loss": 0.9875, + "step": 16890 + }, + { + "epoch": 0.6837312828814245, + "grad_norm": 1.233574628829956, + "learning_rate": 3.2273384265950854e-05, + "loss": 0.9783, + "step": 16895 + }, + { + "epoch": 0.6839336301092676, + "grad_norm": 1.2189573049545288, + "learning_rate": 3.225273590749536e-05, + "loss": 1.0409, + "step": 16900 + }, + { + "epoch": 0.6841359773371105, + "grad_norm": 1.1458240747451782, + "learning_rate": 3.2232087549039855e-05, + "loss": 0.9379, + "step": 16905 + }, + { + "epoch": 0.6843383245649535, + "grad_norm": 1.2293412685394287, + "learning_rate": 3.221143919058435e-05, + "loss": 1.0584, + "step": 16910 + }, + { + "epoch": 0.6845406717927964, + "grad_norm": 1.1189359426498413, + "learning_rate": 3.219079083212885e-05, + "loss": 0.9614, + "step": 16915 + }, + { + "epoch": 0.6847430190206394, + "grad_norm": 1.1481940746307373, + "learning_rate": 3.217014247367334e-05, + "loss": 0.9941, + "step": 16920 + }, + { + "epoch": 0.6849453662484823, + "grad_norm": 1.26361882686615, + "learning_rate": 3.2149494115217844e-05, + "loss": 0.9524, + "step": 16925 + }, + { + "epoch": 0.6851477134763254, + "grad_norm": 1.1836000680923462, + "learning_rate": 3.2128845756762335e-05, + "loss": 0.9901, + "step": 16930 + }, + { + "epoch": 0.6853500607041684, + "grad_norm": 1.1145380735397339, + "learning_rate": 3.210819739830684e-05, + "loss": 0.9956, + "step": 16935 + }, + { + "epoch": 0.6855524079320113, + "grad_norm": 1.091498851776123, + "learning_rate": 3.2087549039851336e-05, + "loss": 1.0457, + "step": 16940 + }, + { + "epoch": 0.6857547551598543, + "grad_norm": 1.1746047735214233, + "learning_rate": 3.206690068139583e-05, + "loss": 0.9925, + "step": 16945 + }, + { + "epoch": 0.6859571023876972, + "grad_norm": 1.166630506515503, + "learning_rate": 3.204625232294033e-05, + "loss": 0.9864, + "step": 16950 + }, + { + "epoch": 0.6861594496155403, + "grad_norm": 1.2178932428359985, + "learning_rate": 3.202560396448482e-05, + "loss": 0.9179, + "step": 16955 + }, + { + "epoch": 0.6863617968433833, + "grad_norm": 1.2030609846115112, + "learning_rate": 3.2004955606029325e-05, + "loss": 1.0445, + "step": 16960 + }, + { + "epoch": 0.6865641440712262, + "grad_norm": 1.319400668144226, + "learning_rate": 3.1984307247573816e-05, + "loss": 0.977, + "step": 16965 + }, + { + "epoch": 0.6867664912990692, + "grad_norm": 1.1344331502914429, + "learning_rate": 3.196365888911831e-05, + "loss": 0.9348, + "step": 16970 + }, + { + "epoch": 0.6869688385269122, + "grad_norm": 1.2260794639587402, + "learning_rate": 3.194301053066282e-05, + "loss": 1.0605, + "step": 16975 + }, + { + "epoch": 0.6871711857547551, + "grad_norm": 1.1607056856155396, + "learning_rate": 3.192236217220731e-05, + "loss": 1.0135, + "step": 16980 + }, + { + "epoch": 0.6873735329825982, + "grad_norm": 1.086566686630249, + "learning_rate": 3.190171381375181e-05, + "loss": 0.9789, + "step": 16985 + }, + { + "epoch": 0.6875758802104411, + "grad_norm": 1.1593726873397827, + "learning_rate": 3.18810654552963e-05, + "loss": 0.9674, + "step": 16990 + }, + { + "epoch": 0.6877782274382841, + "grad_norm": 1.2725913524627686, + "learning_rate": 3.18604170968408e-05, + "loss": 0.9664, + "step": 16995 + }, + { + "epoch": 0.687980574666127, + "grad_norm": 1.1086002588272095, + "learning_rate": 3.1839768738385304e-05, + "loss": 0.9972, + "step": 17000 + }, + { + "epoch": 0.68818292189397, + "grad_norm": 1.1716859340667725, + "learning_rate": 3.1819120379929794e-05, + "loss": 0.9771, + "step": 17005 + }, + { + "epoch": 0.6883852691218131, + "grad_norm": 1.1765456199645996, + "learning_rate": 3.17984720214743e-05, + "loss": 0.9971, + "step": 17010 + }, + { + "epoch": 0.688587616349656, + "grad_norm": 1.078888177871704, + "learning_rate": 3.177782366301879e-05, + "loss": 0.9953, + "step": 17015 + }, + { + "epoch": 0.688789963577499, + "grad_norm": 1.1807310581207275, + "learning_rate": 3.175717530456329e-05, + "loss": 1.0347, + "step": 17020 + }, + { + "epoch": 0.688992310805342, + "grad_norm": 1.1671158075332642, + "learning_rate": 3.1736526946107784e-05, + "loss": 0.9966, + "step": 17025 + }, + { + "epoch": 0.6891946580331849, + "grad_norm": 1.1963889598846436, + "learning_rate": 3.171587858765228e-05, + "loss": 0.9959, + "step": 17030 + }, + { + "epoch": 0.6893970052610279, + "grad_norm": 1.1303328275680542, + "learning_rate": 3.1695230229196785e-05, + "loss": 0.9829, + "step": 17035 + }, + { + "epoch": 0.6895993524888709, + "grad_norm": 1.2900586128234863, + "learning_rate": 3.1674581870741275e-05, + "loss": 0.9906, + "step": 17040 + }, + { + "epoch": 0.6898016997167139, + "grad_norm": 1.205917477607727, + "learning_rate": 3.165393351228578e-05, + "loss": 0.9605, + "step": 17045 + }, + { + "epoch": 0.6900040469445569, + "grad_norm": 1.1977159976959229, + "learning_rate": 3.163328515383027e-05, + "loss": 1.015, + "step": 17050 + }, + { + "epoch": 0.6902063941723998, + "grad_norm": 1.1795580387115479, + "learning_rate": 3.161263679537477e-05, + "loss": 0.9695, + "step": 17055 + }, + { + "epoch": 0.6904087414002428, + "grad_norm": 1.1956350803375244, + "learning_rate": 3.1591988436919265e-05, + "loss": 0.9329, + "step": 17060 + }, + { + "epoch": 0.6906110886280858, + "grad_norm": 1.2197588682174683, + "learning_rate": 3.157134007846376e-05, + "loss": 0.9645, + "step": 17065 + }, + { + "epoch": 0.6908134358559288, + "grad_norm": 1.2725110054016113, + "learning_rate": 3.1550691720008266e-05, + "loss": 1.0407, + "step": 17070 + }, + { + "epoch": 0.6910157830837718, + "grad_norm": 1.2563390731811523, + "learning_rate": 3.1530043361552756e-05, + "loss": 0.9709, + "step": 17075 + }, + { + "epoch": 0.6912181303116147, + "grad_norm": 1.1819779872894287, + "learning_rate": 3.1509395003097254e-05, + "loss": 0.9295, + "step": 17080 + }, + { + "epoch": 0.6914204775394577, + "grad_norm": 1.2610588073730469, + "learning_rate": 3.148874664464175e-05, + "loss": 0.974, + "step": 17085 + }, + { + "epoch": 0.6916228247673006, + "grad_norm": 1.2875587940216064, + "learning_rate": 3.146809828618625e-05, + "loss": 0.975, + "step": 17090 + }, + { + "epoch": 0.6918251719951437, + "grad_norm": 1.1317520141601562, + "learning_rate": 3.1447449927730746e-05, + "loss": 1.036, + "step": 17095 + }, + { + "epoch": 0.6920275192229867, + "grad_norm": 1.2763997316360474, + "learning_rate": 3.142680156927524e-05, + "loss": 0.9996, + "step": 17100 + }, + { + "epoch": 0.6922298664508296, + "grad_norm": 1.1817160844802856, + "learning_rate": 3.140615321081974e-05, + "loss": 0.9727, + "step": 17105 + }, + { + "epoch": 0.6924322136786726, + "grad_norm": 1.3305127620697021, + "learning_rate": 3.138550485236424e-05, + "loss": 0.9357, + "step": 17110 + }, + { + "epoch": 0.6926345609065155, + "grad_norm": 1.2789640426635742, + "learning_rate": 3.1364856493908735e-05, + "loss": 1.0553, + "step": 17115 + }, + { + "epoch": 0.6928369081343586, + "grad_norm": 1.2852317094802856, + "learning_rate": 3.134420813545323e-05, + "loss": 0.9732, + "step": 17120 + }, + { + "epoch": 0.6930392553622016, + "grad_norm": 1.098250150680542, + "learning_rate": 3.132355977699773e-05, + "loss": 1.0155, + "step": 17125 + }, + { + "epoch": 0.6932416025900445, + "grad_norm": 1.3127838373184204, + "learning_rate": 3.130291141854223e-05, + "loss": 1.0226, + "step": 17130 + }, + { + "epoch": 0.6934439498178875, + "grad_norm": 1.1965336799621582, + "learning_rate": 3.1282263060086724e-05, + "loss": 1.0328, + "step": 17135 + }, + { + "epoch": 0.6936462970457304, + "grad_norm": 1.1536129713058472, + "learning_rate": 3.126161470163122e-05, + "loss": 0.9851, + "step": 17140 + }, + { + "epoch": 0.6938486442735734, + "grad_norm": 1.166200041770935, + "learning_rate": 3.124096634317572e-05, + "loss": 1.0096, + "step": 17145 + }, + { + "epoch": 0.6940509915014165, + "grad_norm": 1.1907601356506348, + "learning_rate": 3.1220317984720216e-05, + "loss": 0.9566, + "step": 17150 + }, + { + "epoch": 0.6942533387292594, + "grad_norm": 1.1805274486541748, + "learning_rate": 3.119966962626471e-05, + "loss": 1.004, + "step": 17155 + }, + { + "epoch": 0.6944556859571024, + "grad_norm": 1.168805718421936, + "learning_rate": 3.117902126780921e-05, + "loss": 0.9628, + "step": 17160 + }, + { + "epoch": 0.6946580331849453, + "grad_norm": 1.1943094730377197, + "learning_rate": 3.115837290935371e-05, + "loss": 0.9498, + "step": 17165 + }, + { + "epoch": 0.6948603804127883, + "grad_norm": 1.1253079175949097, + "learning_rate": 3.1137724550898205e-05, + "loss": 1.0004, + "step": 17170 + }, + { + "epoch": 0.6950627276406314, + "grad_norm": 1.2831798791885376, + "learning_rate": 3.11170761924427e-05, + "loss": 1.0009, + "step": 17175 + }, + { + "epoch": 0.6952650748684743, + "grad_norm": 1.1908453702926636, + "learning_rate": 3.10964278339872e-05, + "loss": 1.0469, + "step": 17180 + }, + { + "epoch": 0.6954674220963173, + "grad_norm": 1.4067375659942627, + "learning_rate": 3.10757794755317e-05, + "loss": 0.9516, + "step": 17185 + }, + { + "epoch": 0.6956697693241602, + "grad_norm": 1.1740498542785645, + "learning_rate": 3.1055131117076194e-05, + "loss": 0.986, + "step": 17190 + }, + { + "epoch": 0.6958721165520032, + "grad_norm": 1.1124236583709717, + "learning_rate": 3.103448275862069e-05, + "loss": 0.981, + "step": 17195 + }, + { + "epoch": 0.6960744637798462, + "grad_norm": 1.2706719636917114, + "learning_rate": 3.101383440016519e-05, + "loss": 0.9836, + "step": 17200 + }, + { + "epoch": 0.6962768110076892, + "grad_norm": 1.1875782012939453, + "learning_rate": 3.0993186041709686e-05, + "loss": 0.9679, + "step": 17205 + }, + { + "epoch": 0.6964791582355322, + "grad_norm": 1.1833171844482422, + "learning_rate": 3.0972537683254184e-05, + "loss": 0.9984, + "step": 17210 + }, + { + "epoch": 0.6966815054633752, + "grad_norm": 1.259817123413086, + "learning_rate": 3.095188932479868e-05, + "loss": 1.0075, + "step": 17215 + }, + { + "epoch": 0.6968838526912181, + "grad_norm": 1.2820730209350586, + "learning_rate": 3.093124096634318e-05, + "loss": 0.9491, + "step": 17220 + }, + { + "epoch": 0.6970861999190611, + "grad_norm": 1.149603247642517, + "learning_rate": 3.0910592607887675e-05, + "loss": 0.9498, + "step": 17225 + }, + { + "epoch": 0.6972885471469041, + "grad_norm": 1.2305572032928467, + "learning_rate": 3.088994424943217e-05, + "loss": 1.0212, + "step": 17230 + }, + { + "epoch": 0.6974908943747471, + "grad_norm": 1.204371690750122, + "learning_rate": 3.086929589097667e-05, + "loss": 0.9334, + "step": 17235 + }, + { + "epoch": 0.69769324160259, + "grad_norm": 1.2756214141845703, + "learning_rate": 3.084864753252117e-05, + "loss": 0.9952, + "step": 17240 + }, + { + "epoch": 0.697895588830433, + "grad_norm": 1.318031668663025, + "learning_rate": 3.0827999174065665e-05, + "loss": 1.0067, + "step": 17245 + }, + { + "epoch": 0.698097936058276, + "grad_norm": 1.2004657983779907, + "learning_rate": 3.080735081561016e-05, + "loss": 0.9927, + "step": 17250 + }, + { + "epoch": 0.6983002832861189, + "grad_norm": 1.1927980184555054, + "learning_rate": 3.078670245715466e-05, + "loss": 1.0391, + "step": 17255 + }, + { + "epoch": 0.698502630513962, + "grad_norm": 1.2973883152008057, + "learning_rate": 3.0766054098699156e-05, + "loss": 0.9631, + "step": 17260 + }, + { + "epoch": 0.698704977741805, + "grad_norm": 1.366729497909546, + "learning_rate": 3.0745405740243654e-05, + "loss": 0.9519, + "step": 17265 + }, + { + "epoch": 0.6989073249696479, + "grad_norm": 1.086188793182373, + "learning_rate": 3.072475738178815e-05, + "loss": 0.9726, + "step": 17270 + }, + { + "epoch": 0.6991096721974909, + "grad_norm": 1.1788781881332397, + "learning_rate": 3.070410902333264e-05, + "loss": 0.9806, + "step": 17275 + }, + { + "epoch": 0.6993120194253338, + "grad_norm": 1.1486843824386597, + "learning_rate": 3.0683460664877146e-05, + "loss": 0.9807, + "step": 17280 + }, + { + "epoch": 0.6995143666531769, + "grad_norm": 1.2319406270980835, + "learning_rate": 3.066281230642164e-05, + "loss": 0.9255, + "step": 17285 + }, + { + "epoch": 0.6997167138810199, + "grad_norm": 1.2333320379257202, + "learning_rate": 3.064216394796614e-05, + "loss": 0.933, + "step": 17290 + }, + { + "epoch": 0.6999190611088628, + "grad_norm": 1.2039984464645386, + "learning_rate": 3.062151558951064e-05, + "loss": 0.9584, + "step": 17295 + }, + { + "epoch": 0.7001214083367058, + "grad_norm": 1.171964406967163, + "learning_rate": 3.060086723105513e-05, + "loss": 1.0097, + "step": 17300 + }, + { + "epoch": 0.7003237555645487, + "grad_norm": 1.22491455078125, + "learning_rate": 3.058021887259963e-05, + "loss": 0.9734, + "step": 17305 + }, + { + "epoch": 0.7005261027923917, + "grad_norm": 1.3244630098342896, + "learning_rate": 3.055957051414412e-05, + "loss": 1.0062, + "step": 17310 + }, + { + "epoch": 0.7007284500202348, + "grad_norm": 1.1864243745803833, + "learning_rate": 3.053892215568863e-05, + "loss": 0.9506, + "step": 17315 + }, + { + "epoch": 0.7009307972480777, + "grad_norm": 1.193135380744934, + "learning_rate": 3.0518273797233124e-05, + "loss": 0.9727, + "step": 17320 + }, + { + "epoch": 0.7011331444759207, + "grad_norm": 1.3358057737350464, + "learning_rate": 3.0497625438777615e-05, + "loss": 0.9744, + "step": 17325 + }, + { + "epoch": 0.7013354917037636, + "grad_norm": 1.1899951696395874, + "learning_rate": 3.0476977080322115e-05, + "loss": 0.9446, + "step": 17330 + }, + { + "epoch": 0.7015378389316066, + "grad_norm": 1.2701103687286377, + "learning_rate": 3.0456328721866613e-05, + "loss": 1.033, + "step": 17335 + }, + { + "epoch": 0.7017401861594497, + "grad_norm": 1.3066133260726929, + "learning_rate": 3.0435680363411113e-05, + "loss": 1.0006, + "step": 17340 + }, + { + "epoch": 0.7019425333872926, + "grad_norm": 1.1671679019927979, + "learning_rate": 3.0415032004955607e-05, + "loss": 0.9184, + "step": 17345 + }, + { + "epoch": 0.7021448806151356, + "grad_norm": 1.250704050064087, + "learning_rate": 3.03943836465001e-05, + "loss": 1.0011, + "step": 17350 + }, + { + "epoch": 0.7023472278429785, + "grad_norm": 1.2362216711044312, + "learning_rate": 3.0373735288044602e-05, + "loss": 1.0189, + "step": 17355 + }, + { + "epoch": 0.7025495750708215, + "grad_norm": 1.1965128183364868, + "learning_rate": 3.03530869295891e-05, + "loss": 1.0365, + "step": 17360 + }, + { + "epoch": 0.7027519222986645, + "grad_norm": 1.0046998262405396, + "learning_rate": 3.0332438571133596e-05, + "loss": 1.0028, + "step": 17365 + }, + { + "epoch": 0.7029542695265075, + "grad_norm": 1.3213443756103516, + "learning_rate": 3.0311790212678094e-05, + "loss": 0.9042, + "step": 17370 + }, + { + "epoch": 0.7031566167543505, + "grad_norm": 1.106751799583435, + "learning_rate": 3.0291141854222594e-05, + "loss": 1.0172, + "step": 17375 + }, + { + "epoch": 0.7033589639821934, + "grad_norm": 1.1988613605499268, + "learning_rate": 3.0270493495767088e-05, + "loss": 0.9577, + "step": 17380 + }, + { + "epoch": 0.7035613112100364, + "grad_norm": 1.223128318786621, + "learning_rate": 3.0249845137311582e-05, + "loss": 1.0076, + "step": 17385 + }, + { + "epoch": 0.7037636584378794, + "grad_norm": 1.166667103767395, + "learning_rate": 3.0229196778856083e-05, + "loss": 1.0142, + "step": 17390 + }, + { + "epoch": 0.7039660056657224, + "grad_norm": 1.2497634887695312, + "learning_rate": 3.020854842040058e-05, + "loss": 1.0035, + "step": 17395 + }, + { + "epoch": 0.7041683528935654, + "grad_norm": 1.2435925006866455, + "learning_rate": 3.018790006194508e-05, + "loss": 0.9792, + "step": 17400 + }, + { + "epoch": 0.7043707001214083, + "grad_norm": 1.2664761543273926, + "learning_rate": 3.0167251703489575e-05, + "loss": 1.0018, + "step": 17405 + }, + { + "epoch": 0.7045730473492513, + "grad_norm": 1.1748237609863281, + "learning_rate": 3.014660334503407e-05, + "loss": 1.015, + "step": 17410 + }, + { + "epoch": 0.7047753945770943, + "grad_norm": 1.3572683334350586, + "learning_rate": 3.012595498657857e-05, + "loss": 0.9939, + "step": 17415 + }, + { + "epoch": 0.7049777418049372, + "grad_norm": 1.2530001401901245, + "learning_rate": 3.0105306628123063e-05, + "loss": 1.0323, + "step": 17420 + }, + { + "epoch": 0.7051800890327803, + "grad_norm": 1.1220327615737915, + "learning_rate": 3.0084658269667564e-05, + "loss": 1.0245, + "step": 17425 + }, + { + "epoch": 0.7053824362606232, + "grad_norm": 1.2212517261505127, + "learning_rate": 3.006400991121206e-05, + "loss": 1.0247, + "step": 17430 + }, + { + "epoch": 0.7055847834884662, + "grad_norm": 1.3376418352127075, + "learning_rate": 3.0043361552756555e-05, + "loss": 0.9992, + "step": 17435 + }, + { + "epoch": 0.7057871307163092, + "grad_norm": 1.1860454082489014, + "learning_rate": 3.0022713194301056e-05, + "loss": 1.0302, + "step": 17440 + }, + { + "epoch": 0.7059894779441521, + "grad_norm": 1.2094342708587646, + "learning_rate": 3.000206483584555e-05, + "loss": 1.0539, + "step": 17445 + }, + { + "epoch": 0.7061918251719952, + "grad_norm": 1.252813458442688, + "learning_rate": 2.998141647739005e-05, + "loss": 0.9874, + "step": 17450 + }, + { + "epoch": 0.7063941723998381, + "grad_norm": 1.2359617948532104, + "learning_rate": 2.9960768118934544e-05, + "loss": 0.9854, + "step": 17455 + }, + { + "epoch": 0.7065965196276811, + "grad_norm": 1.0895074605941772, + "learning_rate": 2.994011976047904e-05, + "loss": 0.9447, + "step": 17460 + }, + { + "epoch": 0.7067988668555241, + "grad_norm": 1.1701818704605103, + "learning_rate": 2.9919471402023542e-05, + "loss": 0.9801, + "step": 17465 + }, + { + "epoch": 0.707001214083367, + "grad_norm": 1.2293882369995117, + "learning_rate": 2.9898823043568036e-05, + "loss": 0.955, + "step": 17470 + }, + { + "epoch": 0.70720356131121, + "grad_norm": 1.1827373504638672, + "learning_rate": 2.9878174685112537e-05, + "loss": 1.0189, + "step": 17475 + }, + { + "epoch": 0.707405908539053, + "grad_norm": 1.2127366065979004, + "learning_rate": 2.985752632665703e-05, + "loss": 0.9989, + "step": 17480 + }, + { + "epoch": 0.707608255766896, + "grad_norm": 1.1264581680297852, + "learning_rate": 2.983687796820153e-05, + "loss": 0.9531, + "step": 17485 + }, + { + "epoch": 0.707810602994739, + "grad_norm": 1.2644881010055542, + "learning_rate": 2.9816229609746025e-05, + "loss": 1.0012, + "step": 17490 + }, + { + "epoch": 0.7080129502225819, + "grad_norm": 1.210516333580017, + "learning_rate": 2.9795581251290523e-05, + "loss": 0.9253, + "step": 17495 + }, + { + "epoch": 0.7082152974504249, + "grad_norm": 1.147161602973938, + "learning_rate": 2.9774932892835023e-05, + "loss": 1.0003, + "step": 17500 + }, + { + "epoch": 0.708417644678268, + "grad_norm": 1.2431273460388184, + "learning_rate": 2.9754284534379517e-05, + "loss": 0.9843, + "step": 17505 + }, + { + "epoch": 0.7086199919061109, + "grad_norm": 1.1441364288330078, + "learning_rate": 2.9733636175924018e-05, + "loss": 0.974, + "step": 17510 + }, + { + "epoch": 0.7088223391339539, + "grad_norm": 1.2725236415863037, + "learning_rate": 2.9712987817468512e-05, + "loss": 1.0117, + "step": 17515 + }, + { + "epoch": 0.7090246863617968, + "grad_norm": 1.130359172821045, + "learning_rate": 2.969233945901301e-05, + "loss": 0.9799, + "step": 17520 + }, + { + "epoch": 0.7092270335896398, + "grad_norm": 1.3188213109970093, + "learning_rate": 2.967169110055751e-05, + "loss": 0.9573, + "step": 17525 + }, + { + "epoch": 0.7094293808174827, + "grad_norm": 1.219534158706665, + "learning_rate": 2.9651042742102004e-05, + "loss": 0.987, + "step": 17530 + }, + { + "epoch": 0.7096317280453258, + "grad_norm": 1.1407498121261597, + "learning_rate": 2.9630394383646504e-05, + "loss": 0.9485, + "step": 17535 + }, + { + "epoch": 0.7098340752731688, + "grad_norm": 1.3663009405136108, + "learning_rate": 2.9609746025191e-05, + "loss": 0.9618, + "step": 17540 + }, + { + "epoch": 0.7100364225010117, + "grad_norm": 1.2362415790557861, + "learning_rate": 2.9589097666735492e-05, + "loss": 1.0356, + "step": 17545 + }, + { + "epoch": 0.7102387697288547, + "grad_norm": 1.11896812915802, + "learning_rate": 2.9568449308279993e-05, + "loss": 1.0108, + "step": 17550 + }, + { + "epoch": 0.7104411169566976, + "grad_norm": 1.3680909872055054, + "learning_rate": 2.954780094982449e-05, + "loss": 0.9726, + "step": 17555 + }, + { + "epoch": 0.7106434641845407, + "grad_norm": 1.1663466691970825, + "learning_rate": 2.952715259136899e-05, + "loss": 0.9811, + "step": 17560 + }, + { + "epoch": 0.7108458114123837, + "grad_norm": 1.2008883953094482, + "learning_rate": 2.9506504232913485e-05, + "loss": 0.9161, + "step": 17565 + }, + { + "epoch": 0.7110481586402266, + "grad_norm": 1.5105302333831787, + "learning_rate": 2.948585587445798e-05, + "loss": 1.0413, + "step": 17570 + }, + { + "epoch": 0.7112505058680696, + "grad_norm": 1.129315733909607, + "learning_rate": 2.946520751600248e-05, + "loss": 0.9704, + "step": 17575 + }, + { + "epoch": 0.7114528530959126, + "grad_norm": 1.1245055198669434, + "learning_rate": 2.9444559157546973e-05, + "loss": 1.0059, + "step": 17580 + }, + { + "epoch": 0.7116552003237555, + "grad_norm": 1.1994887590408325, + "learning_rate": 2.9423910799091474e-05, + "loss": 1.0028, + "step": 17585 + }, + { + "epoch": 0.7118575475515986, + "grad_norm": 1.2163746356964111, + "learning_rate": 2.940326244063597e-05, + "loss": 0.9749, + "step": 17590 + }, + { + "epoch": 0.7120598947794415, + "grad_norm": 1.201568841934204, + "learning_rate": 2.9382614082180472e-05, + "loss": 1.0046, + "step": 17595 + }, + { + "epoch": 0.7122622420072845, + "grad_norm": 1.2677850723266602, + "learning_rate": 2.9361965723724966e-05, + "loss": 1.0374, + "step": 17600 + }, + { + "epoch": 0.7124645892351275, + "grad_norm": 1.3034635782241821, + "learning_rate": 2.934131736526946e-05, + "loss": 0.9419, + "step": 17605 + }, + { + "epoch": 0.7126669364629704, + "grad_norm": 1.3210262060165405, + "learning_rate": 2.932066900681396e-05, + "loss": 0.9751, + "step": 17610 + }, + { + "epoch": 0.7128692836908135, + "grad_norm": 1.225956678390503, + "learning_rate": 2.9300020648358454e-05, + "loss": 1.042, + "step": 17615 + }, + { + "epoch": 0.7130716309186564, + "grad_norm": 1.2935391664505005, + "learning_rate": 2.9279372289902955e-05, + "loss": 0.9783, + "step": 17620 + }, + { + "epoch": 0.7132739781464994, + "grad_norm": 1.1585502624511719, + "learning_rate": 2.9258723931447452e-05, + "loss": 0.9657, + "step": 17625 + }, + { + "epoch": 0.7134763253743424, + "grad_norm": 1.2231570482254028, + "learning_rate": 2.9238075572991946e-05, + "loss": 0.9845, + "step": 17630 + }, + { + "epoch": 0.7136786726021853, + "grad_norm": 1.0635497570037842, + "learning_rate": 2.9217427214536447e-05, + "loss": 0.9963, + "step": 17635 + }, + { + "epoch": 0.7138810198300283, + "grad_norm": 1.2352100610733032, + "learning_rate": 2.919677885608094e-05, + "loss": 1.0434, + "step": 17640 + }, + { + "epoch": 0.7140833670578713, + "grad_norm": 1.2931196689605713, + "learning_rate": 2.917613049762544e-05, + "loss": 0.9682, + "step": 17645 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 1.2205458879470825, + "learning_rate": 2.915548213916994e-05, + "loss": 0.9681, + "step": 17650 + }, + { + "epoch": 0.7144880615135573, + "grad_norm": 1.1368838548660278, + "learning_rate": 2.9134833780714433e-05, + "loss": 1.0158, + "step": 17655 + }, + { + "epoch": 0.7146904087414002, + "grad_norm": 1.108027458190918, + "learning_rate": 2.9114185422258933e-05, + "loss": 1.0608, + "step": 17660 + }, + { + "epoch": 0.7148927559692432, + "grad_norm": 1.2166893482208252, + "learning_rate": 2.9093537063803427e-05, + "loss": 1.0056, + "step": 17665 + }, + { + "epoch": 0.7150951031970862, + "grad_norm": 1.1938624382019043, + "learning_rate": 2.9072888705347928e-05, + "loss": 0.9431, + "step": 17670 + }, + { + "epoch": 0.7152974504249292, + "grad_norm": 1.1462186574935913, + "learning_rate": 2.9052240346892422e-05, + "loss": 0.9415, + "step": 17675 + }, + { + "epoch": 0.7154997976527722, + "grad_norm": 1.102442979812622, + "learning_rate": 2.903159198843692e-05, + "loss": 0.9445, + "step": 17680 + }, + { + "epoch": 0.7157021448806151, + "grad_norm": 1.1912766695022583, + "learning_rate": 2.901094362998142e-05, + "loss": 0.9873, + "step": 17685 + }, + { + "epoch": 0.7159044921084581, + "grad_norm": 1.3539094924926758, + "learning_rate": 2.8990295271525914e-05, + "loss": 1.0329, + "step": 17690 + }, + { + "epoch": 0.716106839336301, + "grad_norm": 1.161729097366333, + "learning_rate": 2.8969646913070415e-05, + "loss": 0.9554, + "step": 17695 + }, + { + "epoch": 0.7163091865641441, + "grad_norm": 1.1892973184585571, + "learning_rate": 2.894899855461491e-05, + "loss": 0.9904, + "step": 17700 + }, + { + "epoch": 0.7165115337919871, + "grad_norm": 1.1171448230743408, + "learning_rate": 2.892835019615941e-05, + "loss": 0.9547, + "step": 17705 + }, + { + "epoch": 0.71671388101983, + "grad_norm": 1.1299140453338623, + "learning_rate": 2.8907701837703903e-05, + "loss": 1.0529, + "step": 17710 + }, + { + "epoch": 0.716916228247673, + "grad_norm": 1.1318342685699463, + "learning_rate": 2.88870534792484e-05, + "loss": 1.0226, + "step": 17715 + }, + { + "epoch": 0.7171185754755159, + "grad_norm": 1.190993070602417, + "learning_rate": 2.88664051207929e-05, + "loss": 0.9811, + "step": 17720 + }, + { + "epoch": 0.717320922703359, + "grad_norm": 1.2098536491394043, + "learning_rate": 2.8845756762337395e-05, + "loss": 0.9948, + "step": 17725 + }, + { + "epoch": 0.717523269931202, + "grad_norm": 1.2147682905197144, + "learning_rate": 2.8825108403881896e-05, + "loss": 1.0396, + "step": 17730 + }, + { + "epoch": 0.7177256171590449, + "grad_norm": 1.2443863153457642, + "learning_rate": 2.880446004542639e-05, + "loss": 0.9413, + "step": 17735 + }, + { + "epoch": 0.7179279643868879, + "grad_norm": 1.1268609762191772, + "learning_rate": 2.8783811686970887e-05, + "loss": 0.9783, + "step": 17740 + }, + { + "epoch": 0.7181303116147308, + "grad_norm": 1.2341854572296143, + "learning_rate": 2.8763163328515384e-05, + "loss": 0.9627, + "step": 17745 + }, + { + "epoch": 0.7183326588425738, + "grad_norm": 1.1590079069137573, + "learning_rate": 2.874251497005988e-05, + "loss": 1.0087, + "step": 17750 + }, + { + "epoch": 0.7185350060704169, + "grad_norm": 1.437095046043396, + "learning_rate": 2.8721866611604382e-05, + "loss": 0.9936, + "step": 17755 + }, + { + "epoch": 0.7187373532982598, + "grad_norm": 1.2555290460586548, + "learning_rate": 2.8701218253148876e-05, + "loss": 0.9719, + "step": 17760 + }, + { + "epoch": 0.7189397005261028, + "grad_norm": 1.1498318910598755, + "learning_rate": 2.868056989469337e-05, + "loss": 1.0202, + "step": 17765 + }, + { + "epoch": 0.7191420477539457, + "grad_norm": 1.3355313539505005, + "learning_rate": 2.865992153623787e-05, + "loss": 1.0173, + "step": 17770 + }, + { + "epoch": 0.7193443949817887, + "grad_norm": 1.366964340209961, + "learning_rate": 2.8639273177782368e-05, + "loss": 1.0336, + "step": 17775 + }, + { + "epoch": 0.7195467422096318, + "grad_norm": 1.0923272371292114, + "learning_rate": 2.861862481932687e-05, + "loss": 1.0227, + "step": 17780 + }, + { + "epoch": 0.7197490894374747, + "grad_norm": 1.246429443359375, + "learning_rate": 2.8597976460871362e-05, + "loss": 1.0321, + "step": 17785 + }, + { + "epoch": 0.7199514366653177, + "grad_norm": 1.1993050575256348, + "learning_rate": 2.8577328102415856e-05, + "loss": 0.975, + "step": 17790 + }, + { + "epoch": 0.7201537838931606, + "grad_norm": 1.178747534751892, + "learning_rate": 2.8556679743960357e-05, + "loss": 1.0033, + "step": 17795 + }, + { + "epoch": 0.7203561311210036, + "grad_norm": 1.3549706935882568, + "learning_rate": 2.853603138550485e-05, + "loss": 1.0397, + "step": 17800 + }, + { + "epoch": 0.7205584783488467, + "grad_norm": 1.1827131509780884, + "learning_rate": 2.851538302704935e-05, + "loss": 1.0578, + "step": 17805 + }, + { + "epoch": 0.7207608255766896, + "grad_norm": 1.2043248414993286, + "learning_rate": 2.849473466859385e-05, + "loss": 1.002, + "step": 17810 + }, + { + "epoch": 0.7209631728045326, + "grad_norm": 1.2782275676727295, + "learning_rate": 2.8474086310138343e-05, + "loss": 0.9673, + "step": 17815 + }, + { + "epoch": 0.7211655200323756, + "grad_norm": 1.095707893371582, + "learning_rate": 2.8453437951682844e-05, + "loss": 0.9683, + "step": 17820 + }, + { + "epoch": 0.7213678672602185, + "grad_norm": 1.1790738105773926, + "learning_rate": 2.8432789593227337e-05, + "loss": 0.9955, + "step": 17825 + }, + { + "epoch": 0.7215702144880615, + "grad_norm": 1.1360628604888916, + "learning_rate": 2.8412141234771838e-05, + "loss": 1.045, + "step": 17830 + }, + { + "epoch": 0.7217725617159045, + "grad_norm": 1.2507461309432983, + "learning_rate": 2.8391492876316332e-05, + "loss": 1.0048, + "step": 17835 + }, + { + "epoch": 0.7219749089437475, + "grad_norm": 1.1951102018356323, + "learning_rate": 2.8370844517860833e-05, + "loss": 1.059, + "step": 17840 + }, + { + "epoch": 0.7221772561715905, + "grad_norm": 1.2759116888046265, + "learning_rate": 2.835019615940533e-05, + "loss": 1.0177, + "step": 17845 + }, + { + "epoch": 0.7223796033994334, + "grad_norm": 1.1330106258392334, + "learning_rate": 2.8329547800949824e-05, + "loss": 1.0073, + "step": 17850 + }, + { + "epoch": 0.7225819506272764, + "grad_norm": 1.2677319049835205, + "learning_rate": 2.8308899442494325e-05, + "loss": 0.9473, + "step": 17855 + }, + { + "epoch": 0.7227842978551194, + "grad_norm": 1.075184941291809, + "learning_rate": 2.828825108403882e-05, + "loss": 1.0303, + "step": 17860 + }, + { + "epoch": 0.7229866450829624, + "grad_norm": 1.1539348363876343, + "learning_rate": 2.826760272558332e-05, + "loss": 0.9587, + "step": 17865 + }, + { + "epoch": 0.7231889923108054, + "grad_norm": 1.0828721523284912, + "learning_rate": 2.8246954367127813e-05, + "loss": 1.0265, + "step": 17870 + }, + { + "epoch": 0.7233913395386483, + "grad_norm": 1.3209598064422607, + "learning_rate": 2.822630600867231e-05, + "loss": 1.0518, + "step": 17875 + }, + { + "epoch": 0.7235936867664913, + "grad_norm": 1.2192697525024414, + "learning_rate": 2.820565765021681e-05, + "loss": 0.9714, + "step": 17880 + }, + { + "epoch": 0.7237960339943342, + "grad_norm": 1.2710083723068237, + "learning_rate": 2.8185009291761305e-05, + "loss": 1.0182, + "step": 17885 + }, + { + "epoch": 0.7239983812221773, + "grad_norm": 1.3627142906188965, + "learning_rate": 2.8164360933305806e-05, + "loss": 1.0093, + "step": 17890 + }, + { + "epoch": 0.7242007284500203, + "grad_norm": 1.2135127782821655, + "learning_rate": 2.81437125748503e-05, + "loss": 0.9915, + "step": 17895 + }, + { + "epoch": 0.7244030756778632, + "grad_norm": 1.2205002307891846, + "learning_rate": 2.8123064216394797e-05, + "loss": 0.9681, + "step": 17900 + }, + { + "epoch": 0.7246054229057062, + "grad_norm": 1.2374382019042969, + "learning_rate": 2.8102415857939298e-05, + "loss": 1.0354, + "step": 17905 + }, + { + "epoch": 0.7248077701335491, + "grad_norm": 1.304052710533142, + "learning_rate": 2.808176749948379e-05, + "loss": 1.0044, + "step": 17910 + }, + { + "epoch": 0.7250101173613922, + "grad_norm": 1.1533074378967285, + "learning_rate": 2.8061119141028292e-05, + "loss": 1.045, + "step": 17915 + }, + { + "epoch": 0.7252124645892352, + "grad_norm": 1.1206939220428467, + "learning_rate": 2.8040470782572786e-05, + "loss": 0.9843, + "step": 17920 + }, + { + "epoch": 0.7254148118170781, + "grad_norm": 1.1591408252716064, + "learning_rate": 2.801982242411728e-05, + "loss": 0.9715, + "step": 17925 + }, + { + "epoch": 0.7256171590449211, + "grad_norm": 1.144925832748413, + "learning_rate": 2.799917406566178e-05, + "loss": 0.9587, + "step": 17930 + }, + { + "epoch": 0.725819506272764, + "grad_norm": 1.1545476913452148, + "learning_rate": 2.7978525707206278e-05, + "loss": 0.9965, + "step": 17935 + }, + { + "epoch": 0.726021853500607, + "grad_norm": 1.253890037536621, + "learning_rate": 2.795787734875078e-05, + "loss": 0.9684, + "step": 17940 + }, + { + "epoch": 0.7262242007284501, + "grad_norm": 1.132378101348877, + "learning_rate": 2.7937228990295273e-05, + "loss": 0.9984, + "step": 17945 + }, + { + "epoch": 0.726426547956293, + "grad_norm": 1.8752743005752563, + "learning_rate": 2.7916580631839773e-05, + "loss": 0.9577, + "step": 17950 + }, + { + "epoch": 0.726628895184136, + "grad_norm": 1.2820810079574585, + "learning_rate": 2.7895932273384267e-05, + "loss": 0.9291, + "step": 17955 + }, + { + "epoch": 0.7268312424119789, + "grad_norm": 1.2255547046661377, + "learning_rate": 2.787528391492876e-05, + "loss": 1.0168, + "step": 17960 + }, + { + "epoch": 0.7270335896398219, + "grad_norm": 1.2439258098602295, + "learning_rate": 2.7854635556473262e-05, + "loss": 0.952, + "step": 17965 + }, + { + "epoch": 0.727235936867665, + "grad_norm": 1.1685419082641602, + "learning_rate": 2.783398719801776e-05, + "loss": 1.0325, + "step": 17970 + }, + { + "epoch": 0.7274382840955079, + "grad_norm": 1.1932899951934814, + "learning_rate": 2.781333883956226e-05, + "loss": 0.9937, + "step": 17975 + }, + { + "epoch": 0.7276406313233509, + "grad_norm": 1.2068145275115967, + "learning_rate": 2.7792690481106754e-05, + "loss": 0.9389, + "step": 17980 + }, + { + "epoch": 0.7278429785511938, + "grad_norm": 1.2051451206207275, + "learning_rate": 2.7772042122651248e-05, + "loss": 1.0095, + "step": 17985 + }, + { + "epoch": 0.7280453257790368, + "grad_norm": 1.140273094177246, + "learning_rate": 2.7751393764195748e-05, + "loss": 0.9967, + "step": 17990 + }, + { + "epoch": 0.7282476730068798, + "grad_norm": 1.3008310794830322, + "learning_rate": 2.7730745405740242e-05, + "loss": 1.0487, + "step": 17995 + }, + { + "epoch": 0.7284500202347228, + "grad_norm": 1.230934977531433, + "learning_rate": 2.7710097047284743e-05, + "loss": 1.0499, + "step": 18000 + }, + { + "epoch": 0.7286523674625658, + "grad_norm": 1.1574170589447021, + "learning_rate": 2.768944868882924e-05, + "loss": 0.9733, + "step": 18005 + }, + { + "epoch": 0.7288547146904087, + "grad_norm": 1.1711808443069458, + "learning_rate": 2.7668800330373734e-05, + "loss": 1.0224, + "step": 18010 + }, + { + "epoch": 0.7290570619182517, + "grad_norm": 1.1061688661575317, + "learning_rate": 2.7648151971918235e-05, + "loss": 0.9966, + "step": 18015 + }, + { + "epoch": 0.7292594091460947, + "grad_norm": 1.1513621807098389, + "learning_rate": 2.762750361346273e-05, + "loss": 0.9743, + "step": 18020 + }, + { + "epoch": 0.7294617563739377, + "grad_norm": 1.2922521829605103, + "learning_rate": 2.760685525500723e-05, + "loss": 0.9283, + "step": 18025 + }, + { + "epoch": 0.7296641036017807, + "grad_norm": 1.3262009620666504, + "learning_rate": 2.7586206896551727e-05, + "loss": 0.9928, + "step": 18030 + }, + { + "epoch": 0.7298664508296236, + "grad_norm": 1.1433358192443848, + "learning_rate": 2.756555853809622e-05, + "loss": 0.9375, + "step": 18035 + }, + { + "epoch": 0.7300687980574666, + "grad_norm": 1.1117414236068726, + "learning_rate": 2.754491017964072e-05, + "loss": 0.946, + "step": 18040 + }, + { + "epoch": 0.7302711452853096, + "grad_norm": 1.0909231901168823, + "learning_rate": 2.7524261821185215e-05, + "loss": 0.9309, + "step": 18045 + }, + { + "epoch": 0.7304734925131525, + "grad_norm": 1.2091577053070068, + "learning_rate": 2.7503613462729716e-05, + "loss": 0.9294, + "step": 18050 + }, + { + "epoch": 0.7306758397409956, + "grad_norm": 1.213517189025879, + "learning_rate": 2.748296510427421e-05, + "loss": 0.9991, + "step": 18055 + }, + { + "epoch": 0.7308781869688386, + "grad_norm": 1.196061134338379, + "learning_rate": 2.746231674581871e-05, + "loss": 0.9831, + "step": 18060 + }, + { + "epoch": 0.7310805341966815, + "grad_norm": 1.3827677965164185, + "learning_rate": 2.7441668387363208e-05, + "loss": 0.9718, + "step": 18065 + }, + { + "epoch": 0.7312828814245245, + "grad_norm": 1.1541972160339355, + "learning_rate": 2.74210200289077e-05, + "loss": 0.9847, + "step": 18070 + }, + { + "epoch": 0.7314852286523674, + "grad_norm": 1.4115660190582275, + "learning_rate": 2.7400371670452202e-05, + "loss": 1.039, + "step": 18075 + }, + { + "epoch": 0.7316875758802105, + "grad_norm": 1.2789621353149414, + "learning_rate": 2.7379723311996696e-05, + "loss": 0.9885, + "step": 18080 + }, + { + "epoch": 0.7318899231080535, + "grad_norm": 1.1122533082962036, + "learning_rate": 2.7359074953541197e-05, + "loss": 0.9761, + "step": 18085 + }, + { + "epoch": 0.7320922703358964, + "grad_norm": 1.1807162761688232, + "learning_rate": 2.733842659508569e-05, + "loss": 1.0065, + "step": 18090 + }, + { + "epoch": 0.7322946175637394, + "grad_norm": 1.2819427251815796, + "learning_rate": 2.7317778236630188e-05, + "loss": 1.0249, + "step": 18095 + }, + { + "epoch": 0.7324969647915823, + "grad_norm": 1.377754807472229, + "learning_rate": 2.729712987817469e-05, + "loss": 1.001, + "step": 18100 + }, + { + "epoch": 0.7326993120194253, + "grad_norm": 1.3264992237091064, + "learning_rate": 2.7276481519719183e-05, + "loss": 1.0539, + "step": 18105 + }, + { + "epoch": 0.7329016592472684, + "grad_norm": 1.3716049194335938, + "learning_rate": 2.7255833161263683e-05, + "loss": 0.9949, + "step": 18110 + }, + { + "epoch": 0.7331040064751113, + "grad_norm": 1.1768916845321655, + "learning_rate": 2.7235184802808177e-05, + "loss": 1.0183, + "step": 18115 + }, + { + "epoch": 0.7333063537029543, + "grad_norm": 1.0700510740280151, + "learning_rate": 2.7214536444352675e-05, + "loss": 0.9822, + "step": 18120 + }, + { + "epoch": 0.7335087009307972, + "grad_norm": 1.0094561576843262, + "learning_rate": 2.7193888085897172e-05, + "loss": 0.9612, + "step": 18125 + }, + { + "epoch": 0.7337110481586402, + "grad_norm": 1.1067116260528564, + "learning_rate": 2.717323972744167e-05, + "loss": 0.925, + "step": 18130 + }, + { + "epoch": 0.7339133953864833, + "grad_norm": 1.5170282125473022, + "learning_rate": 2.715259136898617e-05, + "loss": 0.9799, + "step": 18135 + }, + { + "epoch": 0.7341157426143262, + "grad_norm": 1.2024511098861694, + "learning_rate": 2.7131943010530664e-05, + "loss": 0.9495, + "step": 18140 + }, + { + "epoch": 0.7343180898421692, + "grad_norm": 1.2323782444000244, + "learning_rate": 2.7111294652075158e-05, + "loss": 0.9456, + "step": 18145 + }, + { + "epoch": 0.7345204370700121, + "grad_norm": 1.0937429666519165, + "learning_rate": 2.709064629361966e-05, + "loss": 1.0044, + "step": 18150 + }, + { + "epoch": 0.7347227842978551, + "grad_norm": 1.1940041780471802, + "learning_rate": 2.7069997935164156e-05, + "loss": 0.9797, + "step": 18155 + }, + { + "epoch": 0.734925131525698, + "grad_norm": 1.17705237865448, + "learning_rate": 2.7049349576708656e-05, + "loss": 0.9811, + "step": 18160 + }, + { + "epoch": 0.7351274787535411, + "grad_norm": 1.2329814434051514, + "learning_rate": 2.702870121825315e-05, + "loss": 0.9981, + "step": 18165 + }, + { + "epoch": 0.7353298259813841, + "grad_norm": 1.1110891103744507, + "learning_rate": 2.700805285979765e-05, + "loss": 0.9597, + "step": 18170 + }, + { + "epoch": 0.735532173209227, + "grad_norm": 1.1086004972457886, + "learning_rate": 2.6987404501342145e-05, + "loss": 1.0896, + "step": 18175 + }, + { + "epoch": 0.73573452043707, + "grad_norm": 1.1698123216629028, + "learning_rate": 2.696675614288664e-05, + "loss": 0.9755, + "step": 18180 + }, + { + "epoch": 0.735936867664913, + "grad_norm": 1.23163640499115, + "learning_rate": 2.694610778443114e-05, + "loss": 0.9895, + "step": 18185 + }, + { + "epoch": 0.736139214892756, + "grad_norm": 1.226365327835083, + "learning_rate": 2.6925459425975637e-05, + "loss": 0.9699, + "step": 18190 + }, + { + "epoch": 0.736341562120599, + "grad_norm": 1.3278625011444092, + "learning_rate": 2.6904811067520137e-05, + "loss": 0.9982, + "step": 18195 + }, + { + "epoch": 0.7365439093484419, + "grad_norm": 1.099804401397705, + "learning_rate": 2.688416270906463e-05, + "loss": 0.9951, + "step": 18200 + }, + { + "epoch": 0.7367462565762849, + "grad_norm": 1.2941951751708984, + "learning_rate": 2.6863514350609125e-05, + "loss": 0.9536, + "step": 18205 + }, + { + "epoch": 0.7369486038041279, + "grad_norm": 1.3249821662902832, + "learning_rate": 2.6842865992153626e-05, + "loss": 0.9945, + "step": 18210 + }, + { + "epoch": 0.7371509510319708, + "grad_norm": 1.1776769161224365, + "learning_rate": 2.682221763369812e-05, + "loss": 1.0315, + "step": 18215 + }, + { + "epoch": 0.7373532982598139, + "grad_norm": 1.1420999765396118, + "learning_rate": 2.680156927524262e-05, + "loss": 1.0363, + "step": 18220 + }, + { + "epoch": 0.7375556454876568, + "grad_norm": 1.2246534824371338, + "learning_rate": 2.6780920916787118e-05, + "loss": 0.9712, + "step": 18225 + }, + { + "epoch": 0.7377579927154998, + "grad_norm": 1.2218377590179443, + "learning_rate": 2.6760272558331612e-05, + "loss": 0.9366, + "step": 18230 + }, + { + "epoch": 0.7379603399433428, + "grad_norm": 1.1964023113250732, + "learning_rate": 2.6739624199876112e-05, + "loss": 1.0244, + "step": 18235 + }, + { + "epoch": 0.7381626871711857, + "grad_norm": 1.1200047731399536, + "learning_rate": 2.6718975841420606e-05, + "loss": 0.9352, + "step": 18240 + }, + { + "epoch": 0.7383650343990288, + "grad_norm": 1.3178179264068604, + "learning_rate": 2.6698327482965107e-05, + "loss": 0.9939, + "step": 18245 + }, + { + "epoch": 0.7385673816268717, + "grad_norm": 1.1094086170196533, + "learning_rate": 2.66776791245096e-05, + "loss": 1.0116, + "step": 18250 + }, + { + "epoch": 0.7387697288547147, + "grad_norm": 1.3109591007232666, + "learning_rate": 2.6657030766054098e-05, + "loss": 0.9971, + "step": 18255 + }, + { + "epoch": 0.7389720760825577, + "grad_norm": 1.1998182535171509, + "learning_rate": 2.66363824075986e-05, + "loss": 1.0246, + "step": 18260 + }, + { + "epoch": 0.7391744233104006, + "grad_norm": 1.2484065294265747, + "learning_rate": 2.6615734049143093e-05, + "loss": 0.9283, + "step": 18265 + }, + { + "epoch": 0.7393767705382436, + "grad_norm": 1.207814335823059, + "learning_rate": 2.6595085690687593e-05, + "loss": 0.9862, + "step": 18270 + }, + { + "epoch": 0.7395791177660866, + "grad_norm": 1.3312376737594604, + "learning_rate": 2.6574437332232087e-05, + "loss": 0.9849, + "step": 18275 + }, + { + "epoch": 0.7397814649939296, + "grad_norm": 1.3686442375183105, + "learning_rate": 2.6553788973776588e-05, + "loss": 1.017, + "step": 18280 + }, + { + "epoch": 0.7399838122217726, + "grad_norm": 1.3069236278533936, + "learning_rate": 2.6533140615321085e-05, + "loss": 0.9818, + "step": 18285 + }, + { + "epoch": 0.7401861594496155, + "grad_norm": 1.190818428993225, + "learning_rate": 2.651249225686558e-05, + "loss": 0.9691, + "step": 18290 + }, + { + "epoch": 0.7403885066774585, + "grad_norm": 1.130037784576416, + "learning_rate": 2.649184389841008e-05, + "loss": 0.9361, + "step": 18295 + }, + { + "epoch": 0.7405908539053015, + "grad_norm": 1.138445258140564, + "learning_rate": 2.6471195539954574e-05, + "loss": 1.0278, + "step": 18300 + }, + { + "epoch": 0.7407932011331445, + "grad_norm": 1.204694390296936, + "learning_rate": 2.6450547181499075e-05, + "loss": 0.9281, + "step": 18305 + }, + { + "epoch": 0.7409955483609875, + "grad_norm": 1.0873088836669922, + "learning_rate": 2.642989882304357e-05, + "loss": 0.9988, + "step": 18310 + }, + { + "epoch": 0.7411978955888304, + "grad_norm": 1.3609319925308228, + "learning_rate": 2.6409250464588066e-05, + "loss": 0.9882, + "step": 18315 + }, + { + "epoch": 0.7414002428166734, + "grad_norm": 1.2003790140151978, + "learning_rate": 2.6388602106132566e-05, + "loss": 0.9606, + "step": 18320 + }, + { + "epoch": 0.7416025900445163, + "grad_norm": 1.2015271186828613, + "learning_rate": 2.636795374767706e-05, + "loss": 0.9433, + "step": 18325 + }, + { + "epoch": 0.7418049372723594, + "grad_norm": 1.1974081993103027, + "learning_rate": 2.634730538922156e-05, + "loss": 0.9562, + "step": 18330 + }, + { + "epoch": 0.7420072845002024, + "grad_norm": 1.1607143878936768, + "learning_rate": 2.6326657030766055e-05, + "loss": 0.9258, + "step": 18335 + }, + { + "epoch": 0.7422096317280453, + "grad_norm": 1.2583789825439453, + "learning_rate": 2.630600867231055e-05, + "loss": 0.9446, + "step": 18340 + }, + { + "epoch": 0.7424119789558883, + "grad_norm": 1.2502634525299072, + "learning_rate": 2.628536031385505e-05, + "loss": 0.9587, + "step": 18345 + }, + { + "epoch": 0.7426143261837312, + "grad_norm": 1.1891366243362427, + "learning_rate": 2.6264711955399547e-05, + "loss": 0.9987, + "step": 18350 + }, + { + "epoch": 0.7428166734115743, + "grad_norm": 1.2171727418899536, + "learning_rate": 2.6244063596944048e-05, + "loss": 0.9488, + "step": 18355 + }, + { + "epoch": 0.7430190206394173, + "grad_norm": 1.162196159362793, + "learning_rate": 2.622341523848854e-05, + "loss": 0.9942, + "step": 18360 + }, + { + "epoch": 0.7432213678672602, + "grad_norm": 1.180525302886963, + "learning_rate": 2.6202766880033035e-05, + "loss": 0.9381, + "step": 18365 + }, + { + "epoch": 0.7434237150951032, + "grad_norm": 1.3493324518203735, + "learning_rate": 2.6182118521577536e-05, + "loss": 1.011, + "step": 18370 + }, + { + "epoch": 0.7436260623229461, + "grad_norm": 1.1396207809448242, + "learning_rate": 2.616147016312203e-05, + "loss": 1.005, + "step": 18375 + }, + { + "epoch": 0.7438284095507891, + "grad_norm": 1.1421303749084473, + "learning_rate": 2.614082180466653e-05, + "loss": 1.0281, + "step": 18380 + }, + { + "epoch": 0.7440307567786322, + "grad_norm": 1.2090312242507935, + "learning_rate": 2.6120173446211028e-05, + "loss": 1.0517, + "step": 18385 + }, + { + "epoch": 0.7442331040064751, + "grad_norm": 1.2150853872299194, + "learning_rate": 2.6099525087755522e-05, + "loss": 1.0239, + "step": 18390 + }, + { + "epoch": 0.7444354512343181, + "grad_norm": 1.1943917274475098, + "learning_rate": 2.6078876729300023e-05, + "loss": 1.0505, + "step": 18395 + }, + { + "epoch": 0.744637798462161, + "grad_norm": 1.2696770429611206, + "learning_rate": 2.6058228370844516e-05, + "loss": 0.9861, + "step": 18400 + }, + { + "epoch": 0.744840145690004, + "grad_norm": 1.2047666311264038, + "learning_rate": 2.6037580012389017e-05, + "loss": 1.029, + "step": 18405 + }, + { + "epoch": 0.7450424929178471, + "grad_norm": 1.2024877071380615, + "learning_rate": 2.6016931653933514e-05, + "loss": 1.0143, + "step": 18410 + }, + { + "epoch": 0.74524484014569, + "grad_norm": 1.1922520399093628, + "learning_rate": 2.5996283295478012e-05, + "loss": 0.9752, + "step": 18415 + }, + { + "epoch": 0.745447187373533, + "grad_norm": 1.1806377172470093, + "learning_rate": 2.597563493702251e-05, + "loss": 1.0268, + "step": 18420 + }, + { + "epoch": 0.745649534601376, + "grad_norm": 1.2532659769058228, + "learning_rate": 2.5954986578567003e-05, + "loss": 0.9856, + "step": 18425 + }, + { + "epoch": 0.7458518818292189, + "grad_norm": 1.15655517578125, + "learning_rate": 2.5934338220111504e-05, + "loss": 0.9504, + "step": 18430 + }, + { + "epoch": 0.7460542290570619, + "grad_norm": 1.2378734350204468, + "learning_rate": 2.5913689861655998e-05, + "loss": 0.9727, + "step": 18435 + }, + { + "epoch": 0.7462565762849049, + "grad_norm": 1.1206599473953247, + "learning_rate": 2.5893041503200498e-05, + "loss": 0.9938, + "step": 18440 + }, + { + "epoch": 0.7464589235127479, + "grad_norm": 1.2198963165283203, + "learning_rate": 2.5872393144744995e-05, + "loss": 1.053, + "step": 18445 + }, + { + "epoch": 0.7466612707405909, + "grad_norm": 1.2086889743804932, + "learning_rate": 2.585174478628949e-05, + "loss": 1.007, + "step": 18450 + }, + { + "epoch": 0.7468636179684338, + "grad_norm": 1.107006549835205, + "learning_rate": 2.583109642783399e-05, + "loss": 0.9291, + "step": 18455 + }, + { + "epoch": 0.7470659651962768, + "grad_norm": 1.2523084878921509, + "learning_rate": 2.5810448069378484e-05, + "loss": 0.9592, + "step": 18460 + }, + { + "epoch": 0.7472683124241198, + "grad_norm": 1.2677992582321167, + "learning_rate": 2.5789799710922985e-05, + "loss": 0.9707, + "step": 18465 + }, + { + "epoch": 0.7474706596519628, + "grad_norm": 1.2503540515899658, + "learning_rate": 2.576915135246748e-05, + "loss": 1.0012, + "step": 18470 + }, + { + "epoch": 0.7476730068798058, + "grad_norm": 1.2988989353179932, + "learning_rate": 2.5748502994011976e-05, + "loss": 0.9533, + "step": 18475 + }, + { + "epoch": 0.7478753541076487, + "grad_norm": 1.226706862449646, + "learning_rate": 2.5727854635556477e-05, + "loss": 0.9822, + "step": 18480 + }, + { + "epoch": 0.7480777013354917, + "grad_norm": 1.1212823390960693, + "learning_rate": 2.570720627710097e-05, + "loss": 0.9749, + "step": 18485 + }, + { + "epoch": 0.7482800485633346, + "grad_norm": 1.1801321506500244, + "learning_rate": 2.568655791864547e-05, + "loss": 0.9993, + "step": 18490 + }, + { + "epoch": 0.7484823957911777, + "grad_norm": 1.2597798109054565, + "learning_rate": 2.5665909560189965e-05, + "loss": 1.0327, + "step": 18495 + }, + { + "epoch": 0.7486847430190207, + "grad_norm": 1.2265557050704956, + "learning_rate": 2.5645261201734462e-05, + "loss": 0.9664, + "step": 18500 + }, + { + "epoch": 0.7488870902468636, + "grad_norm": 1.0432270765304565, + "learning_rate": 2.562461284327896e-05, + "loss": 0.9637, + "step": 18505 + }, + { + "epoch": 0.7490894374747066, + "grad_norm": 1.0922579765319824, + "learning_rate": 2.5603964484823457e-05, + "loss": 1.0562, + "step": 18510 + }, + { + "epoch": 0.7492917847025495, + "grad_norm": 1.1779067516326904, + "learning_rate": 2.5583316126367958e-05, + "loss": 0.9537, + "step": 18515 + }, + { + "epoch": 0.7494941319303926, + "grad_norm": 1.1106982231140137, + "learning_rate": 2.556266776791245e-05, + "loss": 1.0006, + "step": 18520 + }, + { + "epoch": 0.7496964791582356, + "grad_norm": 1.337260127067566, + "learning_rate": 2.5542019409456952e-05, + "loss": 1.0258, + "step": 18525 + }, + { + "epoch": 0.7498988263860785, + "grad_norm": 1.141217827796936, + "learning_rate": 2.5521371051001446e-05, + "loss": 1.0355, + "step": 18530 + }, + { + "epoch": 0.7501011736139215, + "grad_norm": 1.1664879322052002, + "learning_rate": 2.5500722692545943e-05, + "loss": 0.9495, + "step": 18535 + }, + { + "epoch": 0.7503035208417644, + "grad_norm": 1.19535231590271, + "learning_rate": 2.548007433409044e-05, + "loss": 0.9651, + "step": 18540 + }, + { + "epoch": 0.7505058680696074, + "grad_norm": 1.162934422492981, + "learning_rate": 2.5459425975634938e-05, + "loss": 1.0744, + "step": 18545 + }, + { + "epoch": 0.7507082152974505, + "grad_norm": 1.2206164598464966, + "learning_rate": 2.543877761717944e-05, + "loss": 0.9585, + "step": 18550 + }, + { + "epoch": 0.7509105625252934, + "grad_norm": 1.2517008781433105, + "learning_rate": 2.5418129258723933e-05, + "loss": 1.0579, + "step": 18555 + }, + { + "epoch": 0.7511129097531364, + "grad_norm": 1.2082105875015259, + "learning_rate": 2.5397480900268427e-05, + "loss": 1.0415, + "step": 18560 + }, + { + "epoch": 0.7513152569809793, + "grad_norm": 1.3701311349868774, + "learning_rate": 2.5376832541812927e-05, + "loss": 0.9334, + "step": 18565 + }, + { + "epoch": 0.7515176042088223, + "grad_norm": 1.1496940851211548, + "learning_rate": 2.5356184183357425e-05, + "loss": 1.0017, + "step": 18570 + }, + { + "epoch": 0.7517199514366654, + "grad_norm": 1.1699267625808716, + "learning_rate": 2.5335535824901925e-05, + "loss": 1.0257, + "step": 18575 + }, + { + "epoch": 0.7519222986645083, + "grad_norm": 1.2081458568572998, + "learning_rate": 2.531488746644642e-05, + "loss": 0.978, + "step": 18580 + }, + { + "epoch": 0.7521246458923513, + "grad_norm": 1.2505255937576294, + "learning_rate": 2.5294239107990913e-05, + "loss": 1.0217, + "step": 18585 + }, + { + "epoch": 0.7523269931201942, + "grad_norm": 1.1666648387908936, + "learning_rate": 2.5273590749535414e-05, + "loss": 1.0024, + "step": 18590 + }, + { + "epoch": 0.7525293403480372, + "grad_norm": 1.2679240703582764, + "learning_rate": 2.5252942391079908e-05, + "loss": 0.9983, + "step": 18595 + }, + { + "epoch": 0.7527316875758802, + "grad_norm": 1.1022908687591553, + "learning_rate": 2.5232294032624408e-05, + "loss": 0.9505, + "step": 18600 + }, + { + "epoch": 0.7529340348037232, + "grad_norm": 1.192799687385559, + "learning_rate": 2.5211645674168906e-05, + "loss": 1.0234, + "step": 18605 + }, + { + "epoch": 0.7531363820315662, + "grad_norm": 1.08863365650177, + "learning_rate": 2.51909973157134e-05, + "loss": 0.9991, + "step": 18610 + }, + { + "epoch": 0.7533387292594091, + "grad_norm": 1.1891133785247803, + "learning_rate": 2.51703489572579e-05, + "loss": 1.0023, + "step": 18615 + }, + { + "epoch": 0.7535410764872521, + "grad_norm": 1.1604515314102173, + "learning_rate": 2.5149700598802394e-05, + "loss": 1.0229, + "step": 18620 + }, + { + "epoch": 0.7537434237150951, + "grad_norm": 1.220996618270874, + "learning_rate": 2.5129052240346895e-05, + "loss": 0.9946, + "step": 18625 + }, + { + "epoch": 0.7539457709429381, + "grad_norm": 1.2904902696609497, + "learning_rate": 2.510840388189139e-05, + "loss": 0.9669, + "step": 18630 + }, + { + "epoch": 0.7541481181707811, + "grad_norm": 1.170644760131836, + "learning_rate": 2.508775552343589e-05, + "loss": 1.0126, + "step": 18635 + }, + { + "epoch": 0.754350465398624, + "grad_norm": 1.1381133794784546, + "learning_rate": 2.5067107164980387e-05, + "loss": 0.9748, + "step": 18640 + }, + { + "epoch": 0.754552812626467, + "grad_norm": 1.2390223741531372, + "learning_rate": 2.504645880652488e-05, + "loss": 0.9901, + "step": 18645 + }, + { + "epoch": 0.75475515985431, + "grad_norm": 1.189243197441101, + "learning_rate": 2.502581044806938e-05, + "loss": 0.9997, + "step": 18650 + }, + { + "epoch": 0.7549575070821529, + "grad_norm": 1.1826403141021729, + "learning_rate": 2.5005162089613875e-05, + "loss": 0.9858, + "step": 18655 + }, + { + "epoch": 0.755159854309996, + "grad_norm": 1.1260607242584229, + "learning_rate": 2.4984513731158372e-05, + "loss": 1.0355, + "step": 18660 + }, + { + "epoch": 0.755362201537839, + "grad_norm": 1.3457492589950562, + "learning_rate": 2.4963865372702873e-05, + "loss": 1.0246, + "step": 18665 + }, + { + "epoch": 0.7555645487656819, + "grad_norm": 1.0560047626495361, + "learning_rate": 2.494321701424737e-05, + "loss": 0.9743, + "step": 18670 + }, + { + "epoch": 0.7557668959935249, + "grad_norm": 1.07358980178833, + "learning_rate": 2.4922568655791868e-05, + "loss": 0.9708, + "step": 18675 + }, + { + "epoch": 0.7559692432213678, + "grad_norm": 1.1762304306030273, + "learning_rate": 2.490192029733636e-05, + "loss": 1.0239, + "step": 18680 + }, + { + "epoch": 0.7561715904492109, + "grad_norm": 1.247923731803894, + "learning_rate": 2.488127193888086e-05, + "loss": 0.9804, + "step": 18685 + }, + { + "epoch": 0.7563739376770539, + "grad_norm": 1.2488702535629272, + "learning_rate": 2.4860623580425356e-05, + "loss": 0.9402, + "step": 18690 + }, + { + "epoch": 0.7565762849048968, + "grad_norm": 1.3600049018859863, + "learning_rate": 2.4839975221969854e-05, + "loss": 1.0327, + "step": 18695 + }, + { + "epoch": 0.7567786321327398, + "grad_norm": 1.2319860458374023, + "learning_rate": 2.4819326863514354e-05, + "loss": 1.0062, + "step": 18700 + }, + { + "epoch": 0.7569809793605827, + "grad_norm": 1.2301151752471924, + "learning_rate": 2.4798678505058848e-05, + "loss": 0.9879, + "step": 18705 + }, + { + "epoch": 0.7571833265884257, + "grad_norm": 1.2247709035873413, + "learning_rate": 2.4778030146603345e-05, + "loss": 1.0125, + "step": 18710 + }, + { + "epoch": 0.7573856738162688, + "grad_norm": 1.2979135513305664, + "learning_rate": 2.4757381788147843e-05, + "loss": 0.9305, + "step": 18715 + }, + { + "epoch": 0.7575880210441117, + "grad_norm": 1.1458330154418945, + "learning_rate": 2.473673342969234e-05, + "loss": 0.9594, + "step": 18720 + }, + { + "epoch": 0.7577903682719547, + "grad_norm": 1.2261348962783813, + "learning_rate": 2.4716085071236837e-05, + "loss": 1.0207, + "step": 18725 + }, + { + "epoch": 0.7579927154997976, + "grad_norm": 1.2570581436157227, + "learning_rate": 2.4695436712781335e-05, + "loss": 1.0073, + "step": 18730 + }, + { + "epoch": 0.7581950627276406, + "grad_norm": 1.1916133165359497, + "learning_rate": 2.4674788354325832e-05, + "loss": 0.9727, + "step": 18735 + }, + { + "epoch": 0.7583974099554837, + "grad_norm": 1.1151262521743774, + "learning_rate": 2.465413999587033e-05, + "loss": 0.9785, + "step": 18740 + }, + { + "epoch": 0.7585997571833266, + "grad_norm": 1.2051528692245483, + "learning_rate": 2.4633491637414827e-05, + "loss": 0.9576, + "step": 18745 + }, + { + "epoch": 0.7588021044111696, + "grad_norm": 1.2141988277435303, + "learning_rate": 2.4612843278959324e-05, + "loss": 0.951, + "step": 18750 + }, + { + "epoch": 0.7590044516390125, + "grad_norm": 1.1166486740112305, + "learning_rate": 2.459219492050382e-05, + "loss": 1.0134, + "step": 18755 + }, + { + "epoch": 0.7592067988668555, + "grad_norm": 1.1245037317276, + "learning_rate": 2.457154656204832e-05, + "loss": 0.9814, + "step": 18760 + }, + { + "epoch": 0.7594091460946985, + "grad_norm": 1.2884442806243896, + "learning_rate": 2.4550898203592816e-05, + "loss": 0.9701, + "step": 18765 + }, + { + "epoch": 0.7596114933225415, + "grad_norm": 1.1675169467926025, + "learning_rate": 2.4530249845137313e-05, + "loss": 0.9914, + "step": 18770 + }, + { + "epoch": 0.7598138405503845, + "grad_norm": 1.2772468328475952, + "learning_rate": 2.450960148668181e-05, + "loss": 1.0261, + "step": 18775 + }, + { + "epoch": 0.7600161877782274, + "grad_norm": 1.1847087144851685, + "learning_rate": 2.4488953128226308e-05, + "loss": 0.983, + "step": 18780 + }, + { + "epoch": 0.7602185350060704, + "grad_norm": 1.2734636068344116, + "learning_rate": 2.4468304769770805e-05, + "loss": 0.9856, + "step": 18785 + }, + { + "epoch": 0.7604208822339134, + "grad_norm": 1.172006368637085, + "learning_rate": 2.4447656411315302e-05, + "loss": 1.0438, + "step": 18790 + }, + { + "epoch": 0.7606232294617564, + "grad_norm": 1.1985136270523071, + "learning_rate": 2.44270080528598e-05, + "loss": 0.9974, + "step": 18795 + }, + { + "epoch": 0.7608255766895994, + "grad_norm": 1.2858589887619019, + "learning_rate": 2.4406359694404297e-05, + "loss": 0.9356, + "step": 18800 + }, + { + "epoch": 0.7610279239174423, + "grad_norm": 1.2080332040786743, + "learning_rate": 2.4385711335948794e-05, + "loss": 0.9645, + "step": 18805 + }, + { + "epoch": 0.7612302711452853, + "grad_norm": 1.2445225715637207, + "learning_rate": 2.436506297749329e-05, + "loss": 1.0145, + "step": 18810 + }, + { + "epoch": 0.7614326183731283, + "grad_norm": 1.3031141757965088, + "learning_rate": 2.4344414619037785e-05, + "loss": 1.0283, + "step": 18815 + }, + { + "epoch": 0.7616349656009712, + "grad_norm": 1.1435433626174927, + "learning_rate": 2.4323766260582283e-05, + "loss": 0.9968, + "step": 18820 + }, + { + "epoch": 0.7618373128288143, + "grad_norm": 1.119293451309204, + "learning_rate": 2.4303117902126783e-05, + "loss": 1.0004, + "step": 18825 + }, + { + "epoch": 0.7620396600566572, + "grad_norm": 1.1149024963378906, + "learning_rate": 2.428246954367128e-05, + "loss": 1.0003, + "step": 18830 + }, + { + "epoch": 0.7622420072845002, + "grad_norm": 1.1140360832214355, + "learning_rate": 2.4261821185215778e-05, + "loss": 0.9941, + "step": 18835 + }, + { + "epoch": 0.7624443545123432, + "grad_norm": 1.1843329668045044, + "learning_rate": 2.4241172826760275e-05, + "loss": 1.0184, + "step": 18840 + }, + { + "epoch": 0.7626467017401861, + "grad_norm": 1.3155696392059326, + "learning_rate": 2.422052446830477e-05, + "loss": 0.94, + "step": 18845 + }, + { + "epoch": 0.7628490489680292, + "grad_norm": 1.4078176021575928, + "learning_rate": 2.4199876109849266e-05, + "loss": 0.9909, + "step": 18850 + }, + { + "epoch": 0.7630513961958721, + "grad_norm": 1.2036023139953613, + "learning_rate": 2.4179227751393767e-05, + "loss": 1.0051, + "step": 18855 + }, + { + "epoch": 0.7632537434237151, + "grad_norm": 1.0942049026489258, + "learning_rate": 2.4158579392938264e-05, + "loss": 0.9281, + "step": 18860 + }, + { + "epoch": 0.7634560906515581, + "grad_norm": 1.1924493312835693, + "learning_rate": 2.413793103448276e-05, + "loss": 1.004, + "step": 18865 + }, + { + "epoch": 0.763658437879401, + "grad_norm": 1.2332738637924194, + "learning_rate": 2.4117282676027256e-05, + "loss": 0.9553, + "step": 18870 + }, + { + "epoch": 0.763860785107244, + "grad_norm": 1.193235993385315, + "learning_rate": 2.4096634317571753e-05, + "loss": 0.9922, + "step": 18875 + }, + { + "epoch": 0.764063132335087, + "grad_norm": 1.0221155881881714, + "learning_rate": 2.407598595911625e-05, + "loss": 0.9689, + "step": 18880 + }, + { + "epoch": 0.76426547956293, + "grad_norm": 1.1542539596557617, + "learning_rate": 2.4055337600660747e-05, + "loss": 1.0137, + "step": 18885 + }, + { + "epoch": 0.764467826790773, + "grad_norm": 1.1557893753051758, + "learning_rate": 2.4034689242205248e-05, + "loss": 0.9654, + "step": 18890 + }, + { + "epoch": 0.7646701740186159, + "grad_norm": 1.3672646284103394, + "learning_rate": 2.4014040883749745e-05, + "loss": 1.0191, + "step": 18895 + }, + { + "epoch": 0.7648725212464589, + "grad_norm": 1.2017077207565308, + "learning_rate": 2.399339252529424e-05, + "loss": 0.9156, + "step": 18900 + }, + { + "epoch": 0.765074868474302, + "grad_norm": 1.2825926542282104, + "learning_rate": 2.3972744166838737e-05, + "loss": 1.0015, + "step": 18905 + }, + { + "epoch": 0.7652772157021449, + "grad_norm": 1.1321334838867188, + "learning_rate": 2.3952095808383234e-05, + "loss": 0.9771, + "step": 18910 + }, + { + "epoch": 0.7654795629299879, + "grad_norm": 1.218597412109375, + "learning_rate": 2.393144744992773e-05, + "loss": 0.9791, + "step": 18915 + }, + { + "epoch": 0.7656819101578308, + "grad_norm": 1.1967525482177734, + "learning_rate": 2.391079909147223e-05, + "loss": 0.9675, + "step": 18920 + }, + { + "epoch": 0.7658842573856738, + "grad_norm": 1.3359050750732422, + "learning_rate": 2.3890150733016726e-05, + "loss": 1.0157, + "step": 18925 + }, + { + "epoch": 0.7660866046135167, + "grad_norm": 1.2007734775543213, + "learning_rate": 2.3869502374561223e-05, + "loss": 0.9727, + "step": 18930 + }, + { + "epoch": 0.7662889518413598, + "grad_norm": 1.2456916570663452, + "learning_rate": 2.384885401610572e-05, + "loss": 0.9768, + "step": 18935 + }, + { + "epoch": 0.7664912990692028, + "grad_norm": 1.2281389236450195, + "learning_rate": 2.3828205657650218e-05, + "loss": 1.0122, + "step": 18940 + }, + { + "epoch": 0.7666936462970457, + "grad_norm": 1.1848171949386597, + "learning_rate": 2.3807557299194715e-05, + "loss": 1.04, + "step": 18945 + }, + { + "epoch": 0.7668959935248887, + "grad_norm": 1.1801064014434814, + "learning_rate": 2.3786908940739212e-05, + "loss": 0.9442, + "step": 18950 + }, + { + "epoch": 0.7670983407527316, + "grad_norm": 1.1730283498764038, + "learning_rate": 2.376626058228371e-05, + "loss": 1.001, + "step": 18955 + }, + { + "epoch": 0.7673006879805747, + "grad_norm": 1.118220567703247, + "learning_rate": 2.3745612223828207e-05, + "loss": 0.9982, + "step": 18960 + }, + { + "epoch": 0.7675030352084177, + "grad_norm": 1.1394208669662476, + "learning_rate": 2.3724963865372704e-05, + "loss": 1.0106, + "step": 18965 + }, + { + "epoch": 0.7677053824362606, + "grad_norm": 1.1889700889587402, + "learning_rate": 2.37043155069172e-05, + "loss": 1.0927, + "step": 18970 + }, + { + "epoch": 0.7679077296641036, + "grad_norm": 1.1230664253234863, + "learning_rate": 2.36836671484617e-05, + "loss": 1.0148, + "step": 18975 + }, + { + "epoch": 0.7681100768919465, + "grad_norm": 1.2890617847442627, + "learning_rate": 2.3663018790006196e-05, + "loss": 1.0196, + "step": 18980 + }, + { + "epoch": 0.7683124241197895, + "grad_norm": 1.144126057624817, + "learning_rate": 2.3642370431550693e-05, + "loss": 0.9868, + "step": 18985 + }, + { + "epoch": 0.7685147713476326, + "grad_norm": 1.1859136819839478, + "learning_rate": 2.362172207309519e-05, + "loss": 0.9755, + "step": 18990 + }, + { + "epoch": 0.7687171185754755, + "grad_norm": 1.289074182510376, + "learning_rate": 2.3601073714639688e-05, + "loss": 1.0042, + "step": 18995 + }, + { + "epoch": 0.7689194658033185, + "grad_norm": 1.0631299018859863, + "learning_rate": 2.3580425356184185e-05, + "loss": 0.9353, + "step": 19000 + }, + { + "epoch": 0.7691218130311614, + "grad_norm": 1.1243079900741577, + "learning_rate": 2.3559776997728683e-05, + "loss": 1.0122, + "step": 19005 + }, + { + "epoch": 0.7693241602590044, + "grad_norm": 1.182533860206604, + "learning_rate": 2.3539128639273176e-05, + "loss": 0.9918, + "step": 19010 + }, + { + "epoch": 0.7695265074868475, + "grad_norm": 1.144896149635315, + "learning_rate": 2.3518480280817677e-05, + "loss": 0.9295, + "step": 19015 + }, + { + "epoch": 0.7697288547146904, + "grad_norm": 1.1534823179244995, + "learning_rate": 2.3497831922362174e-05, + "loss": 0.9665, + "step": 19020 + }, + { + "epoch": 0.7699312019425334, + "grad_norm": 1.4073677062988281, + "learning_rate": 2.3477183563906672e-05, + "loss": 0.9633, + "step": 19025 + }, + { + "epoch": 0.7701335491703764, + "grad_norm": 1.3116756677627563, + "learning_rate": 2.345653520545117e-05, + "loss": 0.9783, + "step": 19030 + }, + { + "epoch": 0.7703358963982193, + "grad_norm": 1.2318789958953857, + "learning_rate": 2.3435886846995663e-05, + "loss": 0.9311, + "step": 19035 + }, + { + "epoch": 0.7705382436260623, + "grad_norm": 1.1697779893875122, + "learning_rate": 2.341523848854016e-05, + "loss": 0.913, + "step": 19040 + }, + { + "epoch": 0.7707405908539053, + "grad_norm": 1.149557113647461, + "learning_rate": 2.339459013008466e-05, + "loss": 0.944, + "step": 19045 + }, + { + "epoch": 0.7709429380817483, + "grad_norm": 1.1746495962142944, + "learning_rate": 2.3373941771629158e-05, + "loss": 0.9823, + "step": 19050 + }, + { + "epoch": 0.7711452853095913, + "grad_norm": 1.2647314071655273, + "learning_rate": 2.3353293413173656e-05, + "loss": 0.9882, + "step": 19055 + }, + { + "epoch": 0.7713476325374342, + "grad_norm": 1.1291464567184448, + "learning_rate": 2.333264505471815e-05, + "loss": 1.0234, + "step": 19060 + }, + { + "epoch": 0.7715499797652772, + "grad_norm": 1.2369909286499023, + "learning_rate": 2.3311996696262647e-05, + "loss": 0.9598, + "step": 19065 + }, + { + "epoch": 0.7717523269931202, + "grad_norm": 1.0962897539138794, + "learning_rate": 2.3291348337807144e-05, + "loss": 0.9624, + "step": 19070 + }, + { + "epoch": 0.7719546742209632, + "grad_norm": 1.1354165077209473, + "learning_rate": 2.327069997935164e-05, + "loss": 1.0069, + "step": 19075 + }, + { + "epoch": 0.7721570214488062, + "grad_norm": 1.2026914358139038, + "learning_rate": 2.3250051620896142e-05, + "loss": 1.0439, + "step": 19080 + }, + { + "epoch": 0.7723593686766491, + "grad_norm": 1.22047758102417, + "learning_rate": 2.322940326244064e-05, + "loss": 0.9916, + "step": 19085 + }, + { + "epoch": 0.7725617159044921, + "grad_norm": 1.1651899814605713, + "learning_rate": 2.3208754903985133e-05, + "loss": 1.0401, + "step": 19090 + }, + { + "epoch": 0.772764063132335, + "grad_norm": 1.252793550491333, + "learning_rate": 2.318810654552963e-05, + "loss": 0.976, + "step": 19095 + }, + { + "epoch": 0.7729664103601781, + "grad_norm": 1.2411421537399292, + "learning_rate": 2.3167458187074128e-05, + "loss": 0.9926, + "step": 19100 + }, + { + "epoch": 0.7731687575880211, + "grad_norm": 1.2860257625579834, + "learning_rate": 2.3146809828618625e-05, + "loss": 1.0801, + "step": 19105 + }, + { + "epoch": 0.773371104815864, + "grad_norm": 1.345198392868042, + "learning_rate": 2.3126161470163122e-05, + "loss": 0.9809, + "step": 19110 + }, + { + "epoch": 0.773573452043707, + "grad_norm": 1.2429323196411133, + "learning_rate": 2.310551311170762e-05, + "loss": 0.9996, + "step": 19115 + }, + { + "epoch": 0.7737757992715499, + "grad_norm": 1.1630768775939941, + "learning_rate": 2.3084864753252117e-05, + "loss": 0.9374, + "step": 19120 + }, + { + "epoch": 0.773978146499393, + "grad_norm": 1.2421936988830566, + "learning_rate": 2.3064216394796614e-05, + "loss": 0.9434, + "step": 19125 + }, + { + "epoch": 0.774180493727236, + "grad_norm": 1.2803579568862915, + "learning_rate": 2.304356803634111e-05, + "loss": 0.9821, + "step": 19130 + }, + { + "epoch": 0.7743828409550789, + "grad_norm": 1.2965850830078125, + "learning_rate": 2.302291967788561e-05, + "loss": 0.9934, + "step": 19135 + }, + { + "epoch": 0.7745851881829219, + "grad_norm": 1.1569753885269165, + "learning_rate": 2.3002271319430106e-05, + "loss": 0.9648, + "step": 19140 + }, + { + "epoch": 0.7747875354107648, + "grad_norm": 1.2425038814544678, + "learning_rate": 2.2981622960974603e-05, + "loss": 0.9707, + "step": 19145 + }, + { + "epoch": 0.7749898826386078, + "grad_norm": 1.103983759880066, + "learning_rate": 2.29609746025191e-05, + "loss": 0.9614, + "step": 19150 + }, + { + "epoch": 0.7751922298664509, + "grad_norm": 1.1858563423156738, + "learning_rate": 2.2940326244063598e-05, + "loss": 1.0251, + "step": 19155 + }, + { + "epoch": 0.7753945770942938, + "grad_norm": 1.1213321685791016, + "learning_rate": 2.2919677885608095e-05, + "loss": 0.9825, + "step": 19160 + }, + { + "epoch": 0.7755969243221368, + "grad_norm": 1.2138493061065674, + "learning_rate": 2.2899029527152593e-05, + "loss": 1.0044, + "step": 19165 + }, + { + "epoch": 0.7757992715499797, + "grad_norm": 1.0400367975234985, + "learning_rate": 2.287838116869709e-05, + "loss": 0.9729, + "step": 19170 + }, + { + "epoch": 0.7760016187778227, + "grad_norm": 1.2685893774032593, + "learning_rate": 2.2857732810241587e-05, + "loss": 0.9559, + "step": 19175 + }, + { + "epoch": 0.7762039660056658, + "grad_norm": 1.23403799533844, + "learning_rate": 2.2837084451786085e-05, + "loss": 0.9808, + "step": 19180 + }, + { + "epoch": 0.7764063132335087, + "grad_norm": 1.1738874912261963, + "learning_rate": 2.2816436093330582e-05, + "loss": 0.9195, + "step": 19185 + }, + { + "epoch": 0.7766086604613517, + "grad_norm": 1.0791808366775513, + "learning_rate": 2.279578773487508e-05, + "loss": 0.9871, + "step": 19190 + }, + { + "epoch": 0.7768110076891946, + "grad_norm": 1.1705849170684814, + "learning_rate": 2.2775139376419576e-05, + "loss": 1.0212, + "step": 19195 + }, + { + "epoch": 0.7770133549170376, + "grad_norm": 1.12641179561615, + "learning_rate": 2.275449101796407e-05, + "loss": 0.9875, + "step": 19200 + }, + { + "epoch": 0.7772157021448806, + "grad_norm": 1.144472360610962, + "learning_rate": 2.273384265950857e-05, + "loss": 1.0125, + "step": 19205 + }, + { + "epoch": 0.7774180493727236, + "grad_norm": 1.0404157638549805, + "learning_rate": 2.271319430105307e-05, + "loss": 0.9465, + "step": 19210 + }, + { + "epoch": 0.7776203966005666, + "grad_norm": 1.2508095502853394, + "learning_rate": 2.2692545942597566e-05, + "loss": 1.0029, + "step": 19215 + }, + { + "epoch": 0.7778227438284095, + "grad_norm": 1.2936362028121948, + "learning_rate": 2.2671897584142063e-05, + "loss": 0.9797, + "step": 19220 + }, + { + "epoch": 0.7780250910562525, + "grad_norm": 1.134230375289917, + "learning_rate": 2.2651249225686557e-05, + "loss": 0.9835, + "step": 19225 + }, + { + "epoch": 0.7782274382840955, + "grad_norm": 1.2599493265151978, + "learning_rate": 2.2630600867231054e-05, + "loss": 0.9651, + "step": 19230 + }, + { + "epoch": 0.7784297855119385, + "grad_norm": 1.1519733667373657, + "learning_rate": 2.2609952508775555e-05, + "loss": 1.0254, + "step": 19235 + }, + { + "epoch": 0.7786321327397815, + "grad_norm": 1.3085072040557861, + "learning_rate": 2.2589304150320052e-05, + "loss": 0.9717, + "step": 19240 + }, + { + "epoch": 0.7788344799676244, + "grad_norm": 1.1786202192306519, + "learning_rate": 2.256865579186455e-05, + "loss": 1.0672, + "step": 19245 + }, + { + "epoch": 0.7790368271954674, + "grad_norm": 1.0811524391174316, + "learning_rate": 2.2548007433409047e-05, + "loss": 0.9968, + "step": 19250 + }, + { + "epoch": 0.7792391744233104, + "grad_norm": 1.196601152420044, + "learning_rate": 2.252735907495354e-05, + "loss": 1.0126, + "step": 19255 + }, + { + "epoch": 0.7794415216511533, + "grad_norm": 1.213605523109436, + "learning_rate": 2.2506710716498038e-05, + "loss": 0.9326, + "step": 19260 + }, + { + "epoch": 0.7796438688789964, + "grad_norm": 1.202684760093689, + "learning_rate": 2.2486062358042535e-05, + "loss": 0.9132, + "step": 19265 + }, + { + "epoch": 0.7798462161068394, + "grad_norm": 1.096232533454895, + "learning_rate": 2.2465413999587036e-05, + "loss": 0.974, + "step": 19270 + }, + { + "epoch": 0.7800485633346823, + "grad_norm": 1.2750041484832764, + "learning_rate": 2.2444765641131533e-05, + "loss": 0.9448, + "step": 19275 + }, + { + "epoch": 0.7802509105625253, + "grad_norm": 1.304419755935669, + "learning_rate": 2.2424117282676027e-05, + "loss": 0.9944, + "step": 19280 + }, + { + "epoch": 0.7804532577903682, + "grad_norm": 1.141446828842163, + "learning_rate": 2.2403468924220524e-05, + "loss": 1.0035, + "step": 19285 + }, + { + "epoch": 0.7806556050182113, + "grad_norm": 1.2574546337127686, + "learning_rate": 2.2382820565765022e-05, + "loss": 1.0331, + "step": 19290 + }, + { + "epoch": 0.7808579522460543, + "grad_norm": 1.16201913356781, + "learning_rate": 2.236217220730952e-05, + "loss": 0.9733, + "step": 19295 + }, + { + "epoch": 0.7810602994738972, + "grad_norm": 1.1824009418487549, + "learning_rate": 2.2341523848854016e-05, + "loss": 0.9713, + "step": 19300 + }, + { + "epoch": 0.7812626467017402, + "grad_norm": 1.146752953529358, + "learning_rate": 2.2320875490398517e-05, + "loss": 1.039, + "step": 19305 + }, + { + "epoch": 0.7814649939295831, + "grad_norm": 1.1276365518569946, + "learning_rate": 2.230022713194301e-05, + "loss": 1.0075, + "step": 19310 + }, + { + "epoch": 0.7816673411574262, + "grad_norm": 1.2805054187774658, + "learning_rate": 2.2279578773487508e-05, + "loss": 1.0806, + "step": 19315 + }, + { + "epoch": 0.7818696883852692, + "grad_norm": 1.2467832565307617, + "learning_rate": 2.2258930415032005e-05, + "loss": 1.0169, + "step": 19320 + }, + { + "epoch": 0.7820720356131121, + "grad_norm": 1.325809121131897, + "learning_rate": 2.2238282056576503e-05, + "loss": 1.0255, + "step": 19325 + }, + { + "epoch": 0.7822743828409551, + "grad_norm": 1.2889117002487183, + "learning_rate": 2.2217633698121e-05, + "loss": 1.0637, + "step": 19330 + }, + { + "epoch": 0.782476730068798, + "grad_norm": 1.1927183866500854, + "learning_rate": 2.2196985339665497e-05, + "loss": 0.997, + "step": 19335 + }, + { + "epoch": 0.782679077296641, + "grad_norm": 1.2247055768966675, + "learning_rate": 2.2176336981209995e-05, + "loss": 1.0583, + "step": 19340 + }, + { + "epoch": 0.7828814245244841, + "grad_norm": 1.1576895713806152, + "learning_rate": 2.2155688622754492e-05, + "loss": 0.9673, + "step": 19345 + }, + { + "epoch": 0.783083771752327, + "grad_norm": 1.1851730346679688, + "learning_rate": 2.213504026429899e-05, + "loss": 1.0247, + "step": 19350 + }, + { + "epoch": 0.78328611898017, + "grad_norm": 1.150207757949829, + "learning_rate": 2.2114391905843487e-05, + "loss": 0.9963, + "step": 19355 + }, + { + "epoch": 0.7834884662080129, + "grad_norm": 1.1914957761764526, + "learning_rate": 2.2093743547387984e-05, + "loss": 1.0087, + "step": 19360 + }, + { + "epoch": 0.7836908134358559, + "grad_norm": 1.1222381591796875, + "learning_rate": 2.207309518893248e-05, + "loss": 0.9735, + "step": 19365 + }, + { + "epoch": 0.783893160663699, + "grad_norm": 1.2687716484069824, + "learning_rate": 2.205244683047698e-05, + "loss": 0.9979, + "step": 19370 + }, + { + "epoch": 0.7840955078915419, + "grad_norm": 1.1201457977294922, + "learning_rate": 2.2031798472021476e-05, + "loss": 0.9835, + "step": 19375 + }, + { + "epoch": 0.7842978551193849, + "grad_norm": 1.0393489599227905, + "learning_rate": 2.2011150113565973e-05, + "loss": 0.9536, + "step": 19380 + }, + { + "epoch": 0.7845002023472278, + "grad_norm": 1.2747719287872314, + "learning_rate": 2.199050175511047e-05, + "loss": 0.9486, + "step": 19385 + }, + { + "epoch": 0.7847025495750708, + "grad_norm": 1.3020176887512207, + "learning_rate": 2.1969853396654964e-05, + "loss": 1.0086, + "step": 19390 + }, + { + "epoch": 0.7849048968029138, + "grad_norm": 1.2747538089752197, + "learning_rate": 2.1949205038199465e-05, + "loss": 1.017, + "step": 19395 + }, + { + "epoch": 0.7851072440307568, + "grad_norm": 1.2838488817214966, + "learning_rate": 2.1928556679743962e-05, + "loss": 0.9527, + "step": 19400 + }, + { + "epoch": 0.7853095912585998, + "grad_norm": 1.147567629814148, + "learning_rate": 2.190790832128846e-05, + "loss": 0.9879, + "step": 19405 + }, + { + "epoch": 0.7855119384864427, + "grad_norm": 1.268541932106018, + "learning_rate": 2.1887259962832957e-05, + "loss": 0.973, + "step": 19410 + }, + { + "epoch": 0.7857142857142857, + "grad_norm": 1.1716783046722412, + "learning_rate": 2.1866611604377454e-05, + "loss": 0.9145, + "step": 19415 + }, + { + "epoch": 0.7859166329421287, + "grad_norm": 1.1512025594711304, + "learning_rate": 2.1845963245921948e-05, + "loss": 0.9786, + "step": 19420 + }, + { + "epoch": 0.7861189801699717, + "grad_norm": 1.2119401693344116, + "learning_rate": 2.182531488746645e-05, + "loss": 1.0029, + "step": 19425 + }, + { + "epoch": 0.7863213273978147, + "grad_norm": 1.2084400653839111, + "learning_rate": 2.1804666529010946e-05, + "loss": 0.9445, + "step": 19430 + }, + { + "epoch": 0.7865236746256576, + "grad_norm": 1.188834309577942, + "learning_rate": 2.1784018170555443e-05, + "loss": 1.0071, + "step": 19435 + }, + { + "epoch": 0.7867260218535006, + "grad_norm": 1.2610461711883545, + "learning_rate": 2.176336981209994e-05, + "loss": 1.0147, + "step": 19440 + }, + { + "epoch": 0.7869283690813436, + "grad_norm": 1.2548645734786987, + "learning_rate": 2.1742721453644435e-05, + "loss": 1.0113, + "step": 19445 + }, + { + "epoch": 0.7871307163091865, + "grad_norm": 1.177480936050415, + "learning_rate": 2.1722073095188932e-05, + "loss": 0.9405, + "step": 19450 + }, + { + "epoch": 0.7873330635370296, + "grad_norm": 1.2200291156768799, + "learning_rate": 2.170142473673343e-05, + "loss": 1.0175, + "step": 19455 + }, + { + "epoch": 0.7875354107648725, + "grad_norm": 1.196225643157959, + "learning_rate": 2.168077637827793e-05, + "loss": 1.0179, + "step": 19460 + }, + { + "epoch": 0.7877377579927155, + "grad_norm": 1.069580078125, + "learning_rate": 2.1660128019822427e-05, + "loss": 1.0087, + "step": 19465 + }, + { + "epoch": 0.7879401052205585, + "grad_norm": 1.1265087127685547, + "learning_rate": 2.1639479661366924e-05, + "loss": 0.969, + "step": 19470 + }, + { + "epoch": 0.7881424524484014, + "grad_norm": 1.1495747566223145, + "learning_rate": 2.161883130291142e-05, + "loss": 1.0225, + "step": 19475 + }, + { + "epoch": 0.7883447996762445, + "grad_norm": 1.1369599103927612, + "learning_rate": 2.1598182944455916e-05, + "loss": 0.9334, + "step": 19480 + }, + { + "epoch": 0.7885471469040874, + "grad_norm": 1.2167320251464844, + "learning_rate": 2.1577534586000413e-05, + "loss": 0.9984, + "step": 19485 + }, + { + "epoch": 0.7887494941319304, + "grad_norm": 1.1752022504806519, + "learning_rate": 2.155688622754491e-05, + "loss": 0.9765, + "step": 19490 + }, + { + "epoch": 0.7889518413597734, + "grad_norm": 1.2124340534210205, + "learning_rate": 2.153623786908941e-05, + "loss": 0.9868, + "step": 19495 + }, + { + "epoch": 0.7891541885876163, + "grad_norm": 1.1635890007019043, + "learning_rate": 2.1515589510633905e-05, + "loss": 0.9795, + "step": 19500 + }, + { + "epoch": 0.7893565358154593, + "grad_norm": 1.2324472665786743, + "learning_rate": 2.1494941152178402e-05, + "loss": 0.9402, + "step": 19505 + }, + { + "epoch": 0.7895588830433024, + "grad_norm": 1.2834147214889526, + "learning_rate": 2.14742927937229e-05, + "loss": 1.0143, + "step": 19510 + }, + { + "epoch": 0.7897612302711453, + "grad_norm": 1.2503693103790283, + "learning_rate": 2.1453644435267397e-05, + "loss": 0.9469, + "step": 19515 + }, + { + "epoch": 0.7899635774989883, + "grad_norm": 1.3614864349365234, + "learning_rate": 2.1432996076811894e-05, + "loss": 1.0108, + "step": 19520 + }, + { + "epoch": 0.7901659247268312, + "grad_norm": 1.2642440795898438, + "learning_rate": 2.141234771835639e-05, + "loss": 1.0366, + "step": 19525 + }, + { + "epoch": 0.7903682719546742, + "grad_norm": 1.1245198249816895, + "learning_rate": 2.139169935990089e-05, + "loss": 0.9645, + "step": 19530 + }, + { + "epoch": 0.7905706191825173, + "grad_norm": 1.3378099203109741, + "learning_rate": 2.1371051001445386e-05, + "loss": 1.0262, + "step": 19535 + }, + { + "epoch": 0.7907729664103602, + "grad_norm": 1.1884162425994873, + "learning_rate": 2.1350402642989883e-05, + "loss": 1.0256, + "step": 19540 + }, + { + "epoch": 0.7909753136382032, + "grad_norm": 1.0942997932434082, + "learning_rate": 2.132975428453438e-05, + "loss": 1.0105, + "step": 19545 + }, + { + "epoch": 0.7911776608660461, + "grad_norm": 1.1098393201828003, + "learning_rate": 2.1309105926078878e-05, + "loss": 0.9206, + "step": 19550 + }, + { + "epoch": 0.7913800080938891, + "grad_norm": 1.1302059888839722, + "learning_rate": 2.1288457567623375e-05, + "loss": 0.9331, + "step": 19555 + }, + { + "epoch": 0.791582355321732, + "grad_norm": 1.3323559761047363, + "learning_rate": 2.1267809209167872e-05, + "loss": 1.0318, + "step": 19560 + }, + { + "epoch": 0.7917847025495751, + "grad_norm": 1.1884725093841553, + "learning_rate": 2.124716085071237e-05, + "loss": 1.0405, + "step": 19565 + }, + { + "epoch": 0.7919870497774181, + "grad_norm": 1.2075164318084717, + "learning_rate": 2.1226512492256867e-05, + "loss": 1.0126, + "step": 19570 + }, + { + "epoch": 0.792189397005261, + "grad_norm": 1.1292778253555298, + "learning_rate": 2.1205864133801364e-05, + "loss": 1.0286, + "step": 19575 + }, + { + "epoch": 0.792391744233104, + "grad_norm": 1.2976049184799194, + "learning_rate": 2.1185215775345858e-05, + "loss": 1.0342, + "step": 19580 + }, + { + "epoch": 0.792594091460947, + "grad_norm": 1.214419960975647, + "learning_rate": 2.116456741689036e-05, + "loss": 1.0012, + "step": 19585 + }, + { + "epoch": 0.79279643868879, + "grad_norm": 1.1389003992080688, + "learning_rate": 2.1143919058434856e-05, + "loss": 1.0274, + "step": 19590 + }, + { + "epoch": 0.792998785916633, + "grad_norm": 1.192901372909546, + "learning_rate": 2.1123270699979353e-05, + "loss": 0.9447, + "step": 19595 + }, + { + "epoch": 0.7932011331444759, + "grad_norm": 1.3499056100845337, + "learning_rate": 2.110262234152385e-05, + "loss": 0.9936, + "step": 19600 + }, + { + "epoch": 0.7934034803723189, + "grad_norm": 1.1274263858795166, + "learning_rate": 2.1081973983068348e-05, + "loss": 0.9792, + "step": 19605 + }, + { + "epoch": 0.7936058276001619, + "grad_norm": 1.1653188467025757, + "learning_rate": 2.1061325624612842e-05, + "loss": 1.0546, + "step": 19610 + }, + { + "epoch": 0.7938081748280048, + "grad_norm": 1.2700755596160889, + "learning_rate": 2.104067726615734e-05, + "loss": 0.9734, + "step": 19615 + }, + { + "epoch": 0.7940105220558479, + "grad_norm": 1.2344692945480347, + "learning_rate": 2.102002890770184e-05, + "loss": 1.038, + "step": 19620 + }, + { + "epoch": 0.7942128692836908, + "grad_norm": 1.2672911882400513, + "learning_rate": 2.0999380549246337e-05, + "loss": 1.0092, + "step": 19625 + }, + { + "epoch": 0.7944152165115338, + "grad_norm": 1.196106195449829, + "learning_rate": 2.0978732190790835e-05, + "loss": 0.9841, + "step": 19630 + }, + { + "epoch": 0.7946175637393768, + "grad_norm": 1.1073206663131714, + "learning_rate": 2.095808383233533e-05, + "loss": 0.9167, + "step": 19635 + }, + { + "epoch": 0.7948199109672197, + "grad_norm": 1.2175101041793823, + "learning_rate": 2.0937435473879826e-05, + "loss": 0.9759, + "step": 19640 + }, + { + "epoch": 0.7950222581950628, + "grad_norm": 1.1923753023147583, + "learning_rate": 2.0916787115424323e-05, + "loss": 0.9698, + "step": 19645 + }, + { + "epoch": 0.7952246054229057, + "grad_norm": 1.3313026428222656, + "learning_rate": 2.0896138756968824e-05, + "loss": 1.0561, + "step": 19650 + }, + { + "epoch": 0.7954269526507487, + "grad_norm": 1.2001488208770752, + "learning_rate": 2.087549039851332e-05, + "loss": 0.9968, + "step": 19655 + }, + { + "epoch": 0.7956292998785917, + "grad_norm": 1.2324854135513306, + "learning_rate": 2.0854842040057818e-05, + "loss": 1.0547, + "step": 19660 + }, + { + "epoch": 0.7958316471064346, + "grad_norm": 1.3095812797546387, + "learning_rate": 2.0834193681602312e-05, + "loss": 0.9453, + "step": 19665 + }, + { + "epoch": 0.7960339943342776, + "grad_norm": 1.228178858757019, + "learning_rate": 2.081354532314681e-05, + "loss": 0.9482, + "step": 19670 + }, + { + "epoch": 0.7962363415621206, + "grad_norm": 1.1888363361358643, + "learning_rate": 2.0792896964691307e-05, + "loss": 1.0083, + "step": 19675 + }, + { + "epoch": 0.7964386887899636, + "grad_norm": 1.1231098175048828, + "learning_rate": 2.0772248606235804e-05, + "loss": 1.0199, + "step": 19680 + }, + { + "epoch": 0.7966410360178066, + "grad_norm": 1.189549207687378, + "learning_rate": 2.0751600247780305e-05, + "loss": 0.9818, + "step": 19685 + }, + { + "epoch": 0.7968433832456495, + "grad_norm": 1.248259425163269, + "learning_rate": 2.07309518893248e-05, + "loss": 0.9371, + "step": 19690 + }, + { + "epoch": 0.7970457304734925, + "grad_norm": 1.1903223991394043, + "learning_rate": 2.0710303530869296e-05, + "loss": 1.0038, + "step": 19695 + }, + { + "epoch": 0.7972480777013355, + "grad_norm": 1.0861550569534302, + "learning_rate": 2.0689655172413793e-05, + "loss": 0.9521, + "step": 19700 + }, + { + "epoch": 0.7974504249291785, + "grad_norm": 1.2282209396362305, + "learning_rate": 2.066900681395829e-05, + "loss": 0.9781, + "step": 19705 + }, + { + "epoch": 0.7976527721570215, + "grad_norm": 1.270195722579956, + "learning_rate": 2.0648358455502788e-05, + "loss": 1.0046, + "step": 19710 + }, + { + "epoch": 0.7978551193848644, + "grad_norm": 1.261534333229065, + "learning_rate": 2.062771009704729e-05, + "loss": 0.9783, + "step": 19715 + }, + { + "epoch": 0.7980574666127074, + "grad_norm": 1.1097315549850464, + "learning_rate": 2.0607061738591782e-05, + "loss": 0.9546, + "step": 19720 + }, + { + "epoch": 0.7982598138405503, + "grad_norm": 1.145125389099121, + "learning_rate": 2.058641338013628e-05, + "loss": 1.0244, + "step": 19725 + }, + { + "epoch": 0.7984621610683934, + "grad_norm": 1.2401931285858154, + "learning_rate": 2.0565765021680777e-05, + "loss": 0.9883, + "step": 19730 + }, + { + "epoch": 0.7986645082962364, + "grad_norm": 1.247721791267395, + "learning_rate": 2.0545116663225274e-05, + "loss": 1.0114, + "step": 19735 + }, + { + "epoch": 0.7988668555240793, + "grad_norm": 1.2011133432388306, + "learning_rate": 2.052446830476977e-05, + "loss": 0.989, + "step": 19740 + }, + { + "epoch": 0.7990692027519223, + "grad_norm": 1.2275869846343994, + "learning_rate": 2.050381994631427e-05, + "loss": 0.9969, + "step": 19745 + }, + { + "epoch": 0.7992715499797652, + "grad_norm": 1.237228512763977, + "learning_rate": 2.0483171587858766e-05, + "loss": 0.9614, + "step": 19750 + }, + { + "epoch": 0.7994738972076083, + "grad_norm": 1.1042230129241943, + "learning_rate": 2.0462523229403264e-05, + "loss": 1.0287, + "step": 19755 + }, + { + "epoch": 0.7996762444354513, + "grad_norm": 1.2946480512619019, + "learning_rate": 2.044187487094776e-05, + "loss": 0.994, + "step": 19760 + }, + { + "epoch": 0.7998785916632942, + "grad_norm": 1.187788963317871, + "learning_rate": 2.0421226512492258e-05, + "loss": 0.915, + "step": 19765 + }, + { + "epoch": 0.8000809388911372, + "grad_norm": 1.2385916709899902, + "learning_rate": 2.0400578154036755e-05, + "loss": 1.0061, + "step": 19770 + }, + { + "epoch": 0.8002832861189801, + "grad_norm": 1.1602866649627686, + "learning_rate": 2.0379929795581253e-05, + "loss": 0.9703, + "step": 19775 + }, + { + "epoch": 0.8004856333468231, + "grad_norm": 1.199293613433838, + "learning_rate": 2.035928143712575e-05, + "loss": 0.9906, + "step": 19780 + }, + { + "epoch": 0.8006879805746662, + "grad_norm": 1.4394046068191528, + "learning_rate": 2.0338633078670247e-05, + "loss": 0.9859, + "step": 19785 + }, + { + "epoch": 0.8008903278025091, + "grad_norm": 1.1568101644515991, + "learning_rate": 2.0317984720214745e-05, + "loss": 1.0067, + "step": 19790 + }, + { + "epoch": 0.8010926750303521, + "grad_norm": 1.117028832435608, + "learning_rate": 2.0297336361759242e-05, + "loss": 0.9826, + "step": 19795 + }, + { + "epoch": 0.801295022258195, + "grad_norm": 1.211776852607727, + "learning_rate": 2.0276688003303736e-05, + "loss": 1.0046, + "step": 19800 + }, + { + "epoch": 0.801497369486038, + "grad_norm": 1.2188165187835693, + "learning_rate": 2.0256039644848233e-05, + "loss": 1.0403, + "step": 19805 + }, + { + "epoch": 0.8016997167138811, + "grad_norm": 1.2479783296585083, + "learning_rate": 2.0235391286392734e-05, + "loss": 1.0141, + "step": 19810 + }, + { + "epoch": 0.801902063941724, + "grad_norm": 1.2970749139785767, + "learning_rate": 2.021474292793723e-05, + "loss": 0.9122, + "step": 19815 + }, + { + "epoch": 0.802104411169567, + "grad_norm": 1.168317198753357, + "learning_rate": 2.019409456948173e-05, + "loss": 0.9999, + "step": 19820 + }, + { + "epoch": 0.80230675839741, + "grad_norm": 1.144818663597107, + "learning_rate": 2.0173446211026226e-05, + "loss": 0.9321, + "step": 19825 + }, + { + "epoch": 0.8025091056252529, + "grad_norm": 1.1624873876571655, + "learning_rate": 2.015279785257072e-05, + "loss": 0.9882, + "step": 19830 + }, + { + "epoch": 0.8027114528530959, + "grad_norm": 1.2556883096694946, + "learning_rate": 2.0132149494115217e-05, + "loss": 1.0018, + "step": 19835 + }, + { + "epoch": 0.8029138000809389, + "grad_norm": 1.088374137878418, + "learning_rate": 2.0111501135659718e-05, + "loss": 0.9678, + "step": 19840 + }, + { + "epoch": 0.8031161473087819, + "grad_norm": 1.1550886631011963, + "learning_rate": 2.0090852777204215e-05, + "loss": 0.9705, + "step": 19845 + }, + { + "epoch": 0.8033184945366248, + "grad_norm": 1.2255133390426636, + "learning_rate": 2.0070204418748712e-05, + "loss": 1.0041, + "step": 19850 + }, + { + "epoch": 0.8035208417644678, + "grad_norm": 1.132543921470642, + "learning_rate": 2.0049556060293206e-05, + "loss": 0.9867, + "step": 19855 + }, + { + "epoch": 0.8037231889923108, + "grad_norm": 1.3151130676269531, + "learning_rate": 2.0028907701837703e-05, + "loss": 0.9502, + "step": 19860 + }, + { + "epoch": 0.8039255362201538, + "grad_norm": 1.1357910633087158, + "learning_rate": 2.00082593433822e-05, + "loss": 0.9613, + "step": 19865 + }, + { + "epoch": 0.8041278834479968, + "grad_norm": 1.2936217784881592, + "learning_rate": 1.9987610984926698e-05, + "loss": 0.9899, + "step": 19870 + }, + { + "epoch": 0.8043302306758398, + "grad_norm": 1.5135611295700073, + "learning_rate": 1.99669626264712e-05, + "loss": 0.9504, + "step": 19875 + }, + { + "epoch": 0.8045325779036827, + "grad_norm": 1.2707353830337524, + "learning_rate": 1.9946314268015696e-05, + "loss": 0.9918, + "step": 19880 + }, + { + "epoch": 0.8047349251315257, + "grad_norm": 1.180137276649475, + "learning_rate": 1.992566590956019e-05, + "loss": 1.0152, + "step": 19885 + }, + { + "epoch": 0.8049372723593686, + "grad_norm": 1.2100902795791626, + "learning_rate": 1.9905017551104687e-05, + "loss": 1.0173, + "step": 19890 + }, + { + "epoch": 0.8051396195872117, + "grad_norm": 1.2103663682937622, + "learning_rate": 1.9884369192649184e-05, + "loss": 0.9532, + "step": 19895 + }, + { + "epoch": 0.8053419668150547, + "grad_norm": 1.119215965270996, + "learning_rate": 1.9863720834193682e-05, + "loss": 1.0026, + "step": 19900 + }, + { + "epoch": 0.8055443140428976, + "grad_norm": 1.3070436716079712, + "learning_rate": 1.9843072475738182e-05, + "loss": 1.0098, + "step": 19905 + }, + { + "epoch": 0.8057466612707406, + "grad_norm": 1.1795378923416138, + "learning_rate": 1.9822424117282676e-05, + "loss": 1.0136, + "step": 19910 + }, + { + "epoch": 0.8059490084985835, + "grad_norm": 1.3635504245758057, + "learning_rate": 1.9801775758827174e-05, + "loss": 0.9739, + "step": 19915 + }, + { + "epoch": 0.8061513557264266, + "grad_norm": 1.170520305633545, + "learning_rate": 1.978112740037167e-05, + "loss": 1.0263, + "step": 19920 + }, + { + "epoch": 0.8063537029542696, + "grad_norm": 1.30668044090271, + "learning_rate": 1.9760479041916168e-05, + "loss": 0.9868, + "step": 19925 + }, + { + "epoch": 0.8065560501821125, + "grad_norm": 1.2427819967269897, + "learning_rate": 1.9739830683460666e-05, + "loss": 1.0359, + "step": 19930 + }, + { + "epoch": 0.8067583974099555, + "grad_norm": 1.1163017749786377, + "learning_rate": 1.9719182325005163e-05, + "loss": 0.9936, + "step": 19935 + }, + { + "epoch": 0.8069607446377984, + "grad_norm": 1.2274726629257202, + "learning_rate": 1.969853396654966e-05, + "loss": 0.9545, + "step": 19940 + }, + { + "epoch": 0.8071630918656414, + "grad_norm": 1.4208333492279053, + "learning_rate": 1.9677885608094157e-05, + "loss": 1.0062, + "step": 19945 + }, + { + "epoch": 0.8073654390934845, + "grad_norm": 1.2495779991149902, + "learning_rate": 1.9657237249638655e-05, + "loss": 1.0276, + "step": 19950 + }, + { + "epoch": 0.8075677863213274, + "grad_norm": 1.2706964015960693, + "learning_rate": 1.9636588891183152e-05, + "loss": 0.9762, + "step": 19955 + }, + { + "epoch": 0.8077701335491704, + "grad_norm": 1.169280767440796, + "learning_rate": 1.961594053272765e-05, + "loss": 0.9751, + "step": 19960 + }, + { + "epoch": 0.8079724807770133, + "grad_norm": 1.1972851753234863, + "learning_rate": 1.9595292174272147e-05, + "loss": 1.0169, + "step": 19965 + }, + { + "epoch": 0.8081748280048563, + "grad_norm": 1.202343225479126, + "learning_rate": 1.9574643815816644e-05, + "loss": 0.9836, + "step": 19970 + }, + { + "epoch": 0.8083771752326994, + "grad_norm": 1.2811102867126465, + "learning_rate": 1.955399545736114e-05, + "loss": 0.9891, + "step": 19975 + }, + { + "epoch": 0.8085795224605423, + "grad_norm": 1.1969845294952393, + "learning_rate": 1.953334709890564e-05, + "loss": 1.0155, + "step": 19980 + }, + { + "epoch": 0.8087818696883853, + "grad_norm": 1.2557622194290161, + "learning_rate": 1.9512698740450136e-05, + "loss": 0.9148, + "step": 19985 + }, + { + "epoch": 0.8089842169162282, + "grad_norm": 1.2543156147003174, + "learning_rate": 1.9492050381994633e-05, + "loss": 0.9626, + "step": 19990 + }, + { + "epoch": 0.8091865641440712, + "grad_norm": 1.1905945539474487, + "learning_rate": 1.9471402023539127e-05, + "loss": 1.0148, + "step": 19995 + }, + { + "epoch": 0.8093889113719142, + "grad_norm": 1.2504193782806396, + "learning_rate": 1.9450753665083628e-05, + "loss": 0.9622, + "step": 20000 + }, + { + "epoch": 0.8095912585997572, + "grad_norm": 1.1896506547927856, + "learning_rate": 1.9430105306628125e-05, + "loss": 0.9763, + "step": 20005 + }, + { + "epoch": 0.8097936058276002, + "grad_norm": 1.116180658340454, + "learning_rate": 1.9409456948172622e-05, + "loss": 0.9942, + "step": 20010 + }, + { + "epoch": 0.8099959530554431, + "grad_norm": 1.131353735923767, + "learning_rate": 1.938880858971712e-05, + "loss": 0.9795, + "step": 20015 + }, + { + "epoch": 0.8101983002832861, + "grad_norm": 1.2329113483428955, + "learning_rate": 1.9368160231261613e-05, + "loss": 1.0307, + "step": 20020 + }, + { + "epoch": 0.8104006475111291, + "grad_norm": 1.2695358991622925, + "learning_rate": 1.934751187280611e-05, + "loss": 1.039, + "step": 20025 + }, + { + "epoch": 0.8106029947389721, + "grad_norm": 1.142235517501831, + "learning_rate": 1.932686351435061e-05, + "loss": 1.0011, + "step": 20030 + }, + { + "epoch": 0.8108053419668151, + "grad_norm": 1.2903319597244263, + "learning_rate": 1.930621515589511e-05, + "loss": 0.981, + "step": 20035 + }, + { + "epoch": 0.811007689194658, + "grad_norm": 1.3007261753082275, + "learning_rate": 1.9285566797439606e-05, + "loss": 0.9658, + "step": 20040 + }, + { + "epoch": 0.811210036422501, + "grad_norm": 1.1206128597259521, + "learning_rate": 1.92649184389841e-05, + "loss": 1.0229, + "step": 20045 + }, + { + "epoch": 0.811412383650344, + "grad_norm": 1.1064411401748657, + "learning_rate": 1.9244270080528597e-05, + "loss": 1.0258, + "step": 20050 + }, + { + "epoch": 0.8116147308781869, + "grad_norm": 1.0990204811096191, + "learning_rate": 1.9223621722073095e-05, + "loss": 1.0013, + "step": 20055 + }, + { + "epoch": 0.81181707810603, + "grad_norm": 1.1460708379745483, + "learning_rate": 1.9202973363617592e-05, + "loss": 0.96, + "step": 20060 + }, + { + "epoch": 0.812019425333873, + "grad_norm": 1.2765713930130005, + "learning_rate": 1.9182325005162093e-05, + "loss": 1.0415, + "step": 20065 + }, + { + "epoch": 0.8122217725617159, + "grad_norm": 1.1970674991607666, + "learning_rate": 1.916167664670659e-05, + "loss": 1.0275, + "step": 20070 + }, + { + "epoch": 0.8124241197895589, + "grad_norm": 1.2413722276687622, + "learning_rate": 1.9141028288251084e-05, + "loss": 1.022, + "step": 20075 + }, + { + "epoch": 0.8126264670174018, + "grad_norm": 1.1680649518966675, + "learning_rate": 1.912037992979558e-05, + "loss": 0.9867, + "step": 20080 + }, + { + "epoch": 0.8128288142452449, + "grad_norm": 1.2049671411514282, + "learning_rate": 1.909973157134008e-05, + "loss": 1.0504, + "step": 20085 + }, + { + "epoch": 0.8130311614730878, + "grad_norm": 1.235776662826538, + "learning_rate": 1.9079083212884576e-05, + "loss": 1.0066, + "step": 20090 + }, + { + "epoch": 0.8132335087009308, + "grad_norm": 1.249600887298584, + "learning_rate": 1.9058434854429076e-05, + "loss": 0.995, + "step": 20095 + }, + { + "epoch": 0.8134358559287738, + "grad_norm": 1.2152076959609985, + "learning_rate": 1.903778649597357e-05, + "loss": 1.0192, + "step": 20100 + }, + { + "epoch": 0.8136382031566167, + "grad_norm": 1.2431395053863525, + "learning_rate": 1.9017138137518068e-05, + "loss": 0.9872, + "step": 20105 + }, + { + "epoch": 0.8138405503844597, + "grad_norm": 1.156506061553955, + "learning_rate": 1.8996489779062565e-05, + "loss": 1.0307, + "step": 20110 + }, + { + "epoch": 0.8140428976123028, + "grad_norm": 1.1493124961853027, + "learning_rate": 1.8975841420607062e-05, + "loss": 0.9804, + "step": 20115 + }, + { + "epoch": 0.8142452448401457, + "grad_norm": 1.266171932220459, + "learning_rate": 1.895519306215156e-05, + "loss": 0.9843, + "step": 20120 + }, + { + "epoch": 0.8144475920679887, + "grad_norm": 1.1572659015655518, + "learning_rate": 1.8934544703696057e-05, + "loss": 1.0514, + "step": 20125 + }, + { + "epoch": 0.8146499392958316, + "grad_norm": 1.2233699560165405, + "learning_rate": 1.8913896345240554e-05, + "loss": 0.9665, + "step": 20130 + }, + { + "epoch": 0.8148522865236746, + "grad_norm": 1.1813246011734009, + "learning_rate": 1.889324798678505e-05, + "loss": 0.9978, + "step": 20135 + }, + { + "epoch": 0.8150546337515177, + "grad_norm": 1.1056386232376099, + "learning_rate": 1.887259962832955e-05, + "loss": 0.9947, + "step": 20140 + }, + { + "epoch": 0.8152569809793606, + "grad_norm": 1.2665637731552124, + "learning_rate": 1.8851951269874046e-05, + "loss": 1.0013, + "step": 20145 + }, + { + "epoch": 0.8154593282072036, + "grad_norm": 1.3943125009536743, + "learning_rate": 1.8831302911418543e-05, + "loss": 0.9596, + "step": 20150 + }, + { + "epoch": 0.8156616754350465, + "grad_norm": 1.2365728616714478, + "learning_rate": 1.881065455296304e-05, + "loss": 0.9706, + "step": 20155 + }, + { + "epoch": 0.8158640226628895, + "grad_norm": 1.1528480052947998, + "learning_rate": 1.8790006194507538e-05, + "loss": 0.9637, + "step": 20160 + }, + { + "epoch": 0.8160663698907324, + "grad_norm": 1.5409117937088013, + "learning_rate": 1.8769357836052035e-05, + "loss": 0.9236, + "step": 20165 + }, + { + "epoch": 0.8162687171185755, + "grad_norm": 1.258923888206482, + "learning_rate": 1.8748709477596532e-05, + "loss": 0.9726, + "step": 20170 + }, + { + "epoch": 0.8164710643464185, + "grad_norm": 1.1251558065414429, + "learning_rate": 1.872806111914103e-05, + "loss": 1.0228, + "step": 20175 + }, + { + "epoch": 0.8166734115742614, + "grad_norm": 1.2459688186645508, + "learning_rate": 1.8707412760685527e-05, + "loss": 0.9896, + "step": 20180 + }, + { + "epoch": 0.8168757588021044, + "grad_norm": 1.2140971422195435, + "learning_rate": 1.868676440223002e-05, + "loss": 1.0325, + "step": 20185 + }, + { + "epoch": 0.8170781060299473, + "grad_norm": 1.1849210262298584, + "learning_rate": 1.866611604377452e-05, + "loss": 1.0246, + "step": 20190 + }, + { + "epoch": 0.8172804532577904, + "grad_norm": 1.1096030473709106, + "learning_rate": 1.864546768531902e-05, + "loss": 1.0264, + "step": 20195 + }, + { + "epoch": 0.8174828004856334, + "grad_norm": 1.3153741359710693, + "learning_rate": 1.8624819326863516e-05, + "loss": 0.9969, + "step": 20200 + }, + { + "epoch": 0.8176851477134763, + "grad_norm": 1.021003246307373, + "learning_rate": 1.8604170968408013e-05, + "loss": 0.992, + "step": 20205 + }, + { + "epoch": 0.8178874949413193, + "grad_norm": 1.2519673109054565, + "learning_rate": 1.8583522609952507e-05, + "loss": 1.0064, + "step": 20210 + }, + { + "epoch": 0.8180898421691623, + "grad_norm": 1.209879994392395, + "learning_rate": 1.8562874251497005e-05, + "loss": 0.9879, + "step": 20215 + }, + { + "epoch": 0.8182921893970052, + "grad_norm": 1.195630669593811, + "learning_rate": 1.8542225893041505e-05, + "loss": 1.0022, + "step": 20220 + }, + { + "epoch": 0.8184945366248483, + "grad_norm": 1.342789888381958, + "learning_rate": 1.8521577534586003e-05, + "loss": 0.9606, + "step": 20225 + }, + { + "epoch": 0.8186968838526912, + "grad_norm": 1.3297576904296875, + "learning_rate": 1.85009291761305e-05, + "loss": 1.0015, + "step": 20230 + }, + { + "epoch": 0.8188992310805342, + "grad_norm": 1.3977317810058594, + "learning_rate": 1.8480280817674997e-05, + "loss": 0.9668, + "step": 20235 + }, + { + "epoch": 0.8191015783083772, + "grad_norm": 1.2348957061767578, + "learning_rate": 1.845963245921949e-05, + "loss": 0.9717, + "step": 20240 + }, + { + "epoch": 0.8193039255362201, + "grad_norm": 1.2327253818511963, + "learning_rate": 1.843898410076399e-05, + "loss": 1.018, + "step": 20245 + }, + { + "epoch": 0.8195062727640632, + "grad_norm": 1.0961917638778687, + "learning_rate": 1.8418335742308486e-05, + "loss": 1.0093, + "step": 20250 + }, + { + "epoch": 0.8197086199919061, + "grad_norm": 1.3516782522201538, + "learning_rate": 1.8397687383852986e-05, + "loss": 1.0199, + "step": 20255 + }, + { + "epoch": 0.8199109672197491, + "grad_norm": 1.3711910247802734, + "learning_rate": 1.8377039025397484e-05, + "loss": 1.0221, + "step": 20260 + }, + { + "epoch": 0.8201133144475921, + "grad_norm": 1.2947044372558594, + "learning_rate": 1.8356390666941978e-05, + "loss": 0.9439, + "step": 20265 + }, + { + "epoch": 0.820315661675435, + "grad_norm": 1.1903367042541504, + "learning_rate": 1.8335742308486475e-05, + "loss": 1.0293, + "step": 20270 + }, + { + "epoch": 0.820518008903278, + "grad_norm": 1.2941244840621948, + "learning_rate": 1.8315093950030972e-05, + "loss": 0.9934, + "step": 20275 + }, + { + "epoch": 0.820720356131121, + "grad_norm": 1.1714632511138916, + "learning_rate": 1.829444559157547e-05, + "loss": 1.0082, + "step": 20280 + }, + { + "epoch": 0.820922703358964, + "grad_norm": 1.4111402034759521, + "learning_rate": 1.827379723311997e-05, + "loss": 0.9716, + "step": 20285 + }, + { + "epoch": 0.821125050586807, + "grad_norm": 1.1551733016967773, + "learning_rate": 1.8253148874664468e-05, + "loss": 0.9969, + "step": 20290 + }, + { + "epoch": 0.8213273978146499, + "grad_norm": 1.2137131690979004, + "learning_rate": 1.823250051620896e-05, + "loss": 0.9523, + "step": 20295 + }, + { + "epoch": 0.8215297450424929, + "grad_norm": 1.05117666721344, + "learning_rate": 1.821185215775346e-05, + "loss": 1.0274, + "step": 20300 + }, + { + "epoch": 0.821732092270336, + "grad_norm": 1.204932451248169, + "learning_rate": 1.8191203799297956e-05, + "loss": 0.9704, + "step": 20305 + }, + { + "epoch": 0.8219344394981789, + "grad_norm": 1.1883883476257324, + "learning_rate": 1.8170555440842453e-05, + "loss": 1.0267, + "step": 20310 + }, + { + "epoch": 0.8221367867260219, + "grad_norm": 1.2452350854873657, + "learning_rate": 1.814990708238695e-05, + "loss": 0.9956, + "step": 20315 + }, + { + "epoch": 0.8223391339538648, + "grad_norm": 1.1953701972961426, + "learning_rate": 1.8129258723931448e-05, + "loss": 1.0129, + "step": 20320 + }, + { + "epoch": 0.8225414811817078, + "grad_norm": 1.3015085458755493, + "learning_rate": 1.8108610365475945e-05, + "loss": 1.0441, + "step": 20325 + }, + { + "epoch": 0.8227438284095507, + "grad_norm": 1.2774007320404053, + "learning_rate": 1.8087962007020443e-05, + "loss": 1.0372, + "step": 20330 + }, + { + "epoch": 0.8229461756373938, + "grad_norm": 1.350160837173462, + "learning_rate": 1.806731364856494e-05, + "loss": 0.9749, + "step": 20335 + }, + { + "epoch": 0.8231485228652368, + "grad_norm": 1.2322993278503418, + "learning_rate": 1.8046665290109437e-05, + "loss": 0.9913, + "step": 20340 + }, + { + "epoch": 0.8233508700930797, + "grad_norm": 1.2659082412719727, + "learning_rate": 1.8026016931653934e-05, + "loss": 0.9835, + "step": 20345 + }, + { + "epoch": 0.8235532173209227, + "grad_norm": 1.2427709102630615, + "learning_rate": 1.8005368573198432e-05, + "loss": 0.9948, + "step": 20350 + }, + { + "epoch": 0.8237555645487656, + "grad_norm": 1.1547057628631592, + "learning_rate": 1.798472021474293e-05, + "loss": 1.0473, + "step": 20355 + }, + { + "epoch": 0.8239579117766087, + "grad_norm": 1.237739086151123, + "learning_rate": 1.7964071856287426e-05, + "loss": 0.9942, + "step": 20360 + }, + { + "epoch": 0.8241602590044517, + "grad_norm": 1.1475869417190552, + "learning_rate": 1.7943423497831924e-05, + "loss": 1.0079, + "step": 20365 + }, + { + "epoch": 0.8243626062322946, + "grad_norm": 1.3094673156738281, + "learning_rate": 1.792277513937642e-05, + "loss": 0.9764, + "step": 20370 + }, + { + "epoch": 0.8245649534601376, + "grad_norm": 1.2824714183807373, + "learning_rate": 1.7902126780920915e-05, + "loss": 1.0004, + "step": 20375 + }, + { + "epoch": 0.8247673006879805, + "grad_norm": 1.0989863872528076, + "learning_rate": 1.7881478422465415e-05, + "loss": 1.0083, + "step": 20380 + }, + { + "epoch": 0.8249696479158235, + "grad_norm": 1.2977863550186157, + "learning_rate": 1.7860830064009913e-05, + "loss": 0.9982, + "step": 20385 + }, + { + "epoch": 0.8251719951436666, + "grad_norm": 1.4900519847869873, + "learning_rate": 1.784018170555441e-05, + "loss": 0.9782, + "step": 20390 + }, + { + "epoch": 0.8253743423715095, + "grad_norm": 1.1205878257751465, + "learning_rate": 1.7819533347098907e-05, + "loss": 1.0262, + "step": 20395 + }, + { + "epoch": 0.8255766895993525, + "grad_norm": 1.1933003664016724, + "learning_rate": 1.7798884988643405e-05, + "loss": 0.9792, + "step": 20400 + }, + { + "epoch": 0.8257790368271954, + "grad_norm": 1.175807237625122, + "learning_rate": 1.77782366301879e-05, + "loss": 1.0551, + "step": 20405 + }, + { + "epoch": 0.8259813840550384, + "grad_norm": 1.477211356163025, + "learning_rate": 1.77575882717324e-05, + "loss": 0.9905, + "step": 20410 + }, + { + "epoch": 0.8261837312828815, + "grad_norm": 1.0942533016204834, + "learning_rate": 1.7736939913276897e-05, + "loss": 0.9547, + "step": 20415 + }, + { + "epoch": 0.8263860785107244, + "grad_norm": 1.2155954837799072, + "learning_rate": 1.7716291554821394e-05, + "loss": 0.999, + "step": 20420 + }, + { + "epoch": 0.8265884257385674, + "grad_norm": 1.205565333366394, + "learning_rate": 1.769564319636589e-05, + "loss": 0.9823, + "step": 20425 + }, + { + "epoch": 0.8267907729664103, + "grad_norm": 1.3723074197769165, + "learning_rate": 1.7674994837910385e-05, + "loss": 0.9774, + "step": 20430 + }, + { + "epoch": 0.8269931201942533, + "grad_norm": 1.211984395980835, + "learning_rate": 1.7654346479454882e-05, + "loss": 0.967, + "step": 20435 + }, + { + "epoch": 0.8271954674220963, + "grad_norm": 1.1118062734603882, + "learning_rate": 1.763369812099938e-05, + "loss": 0.9869, + "step": 20440 + }, + { + "epoch": 0.8273978146499393, + "grad_norm": 1.2070207595825195, + "learning_rate": 1.761304976254388e-05, + "loss": 1.0136, + "step": 20445 + }, + { + "epoch": 0.8276001618777823, + "grad_norm": 1.2678449153900146, + "learning_rate": 1.7592401404088378e-05, + "loss": 1.0071, + "step": 20450 + }, + { + "epoch": 0.8278025091056253, + "grad_norm": 1.0911531448364258, + "learning_rate": 1.7571753045632875e-05, + "loss": 1.0103, + "step": 20455 + }, + { + "epoch": 0.8280048563334682, + "grad_norm": 1.1416620016098022, + "learning_rate": 1.755110468717737e-05, + "loss": 0.9129, + "step": 20460 + }, + { + "epoch": 0.8282072035613112, + "grad_norm": 1.2879242897033691, + "learning_rate": 1.7530456328721866e-05, + "loss": 1.0324, + "step": 20465 + }, + { + "epoch": 0.8284095507891542, + "grad_norm": 1.1336830854415894, + "learning_rate": 1.7509807970266363e-05, + "loss": 0.9955, + "step": 20470 + }, + { + "epoch": 0.8286118980169972, + "grad_norm": 1.1080355644226074, + "learning_rate": 1.7489159611810864e-05, + "loss": 0.9883, + "step": 20475 + }, + { + "epoch": 0.8288142452448402, + "grad_norm": 1.2331814765930176, + "learning_rate": 1.746851125335536e-05, + "loss": 0.991, + "step": 20480 + }, + { + "epoch": 0.8290165924726831, + "grad_norm": 1.0507975816726685, + "learning_rate": 1.7447862894899855e-05, + "loss": 1.0184, + "step": 20485 + }, + { + "epoch": 0.8292189397005261, + "grad_norm": 1.3183051347732544, + "learning_rate": 1.7427214536444353e-05, + "loss": 0.9714, + "step": 20490 + }, + { + "epoch": 0.829421286928369, + "grad_norm": 1.249200463294983, + "learning_rate": 1.740656617798885e-05, + "loss": 0.9824, + "step": 20495 + }, + { + "epoch": 0.8296236341562121, + "grad_norm": 1.0716665983200073, + "learning_rate": 1.7385917819533347e-05, + "loss": 0.9555, + "step": 20500 + }, + { + "epoch": 0.8298259813840551, + "grad_norm": 1.250664472579956, + "learning_rate": 1.7365269461077845e-05, + "loss": 0.9864, + "step": 20505 + }, + { + "epoch": 0.830028328611898, + "grad_norm": 1.2126374244689941, + "learning_rate": 1.7344621102622342e-05, + "loss": 1.0792, + "step": 20510 + }, + { + "epoch": 0.830230675839741, + "grad_norm": 1.1063995361328125, + "learning_rate": 1.732397274416684e-05, + "loss": 1.0379, + "step": 20515 + }, + { + "epoch": 0.8304330230675839, + "grad_norm": 1.1586008071899414, + "learning_rate": 1.7303324385711336e-05, + "loss": 1.0363, + "step": 20520 + }, + { + "epoch": 0.830635370295427, + "grad_norm": 1.195989727973938, + "learning_rate": 1.7282676027255834e-05, + "loss": 0.9894, + "step": 20525 + }, + { + "epoch": 0.83083771752327, + "grad_norm": 1.1733986139297485, + "learning_rate": 1.726202766880033e-05, + "loss": 0.977, + "step": 20530 + }, + { + "epoch": 0.8310400647511129, + "grad_norm": 1.2116700410842896, + "learning_rate": 1.7241379310344828e-05, + "loss": 0.9803, + "step": 20535 + }, + { + "epoch": 0.8312424119789559, + "grad_norm": 1.207283616065979, + "learning_rate": 1.7220730951889326e-05, + "loss": 1.0284, + "step": 20540 + }, + { + "epoch": 0.8314447592067988, + "grad_norm": 1.0571120977401733, + "learning_rate": 1.7200082593433823e-05, + "loss": 1.0076, + "step": 20545 + }, + { + "epoch": 0.8316471064346418, + "grad_norm": 1.2281038761138916, + "learning_rate": 1.717943423497832e-05, + "loss": 1.0208, + "step": 20550 + }, + { + "epoch": 0.8318494536624849, + "grad_norm": 1.186851143836975, + "learning_rate": 1.7158785876522817e-05, + "loss": 1.0347, + "step": 20555 + }, + { + "epoch": 0.8320518008903278, + "grad_norm": 1.3567386865615845, + "learning_rate": 1.7138137518067315e-05, + "loss": 0.9777, + "step": 20560 + }, + { + "epoch": 0.8322541481181708, + "grad_norm": 1.3898897171020508, + "learning_rate": 1.711748915961181e-05, + "loss": 0.9936, + "step": 20565 + }, + { + "epoch": 0.8324564953460137, + "grad_norm": 1.253336787223816, + "learning_rate": 1.709684080115631e-05, + "loss": 0.966, + "step": 20570 + }, + { + "epoch": 0.8326588425738567, + "grad_norm": 1.2542872428894043, + "learning_rate": 1.7076192442700807e-05, + "loss": 0.9025, + "step": 20575 + }, + { + "epoch": 0.8328611898016998, + "grad_norm": 1.243143916130066, + "learning_rate": 1.7055544084245304e-05, + "loss": 0.93, + "step": 20580 + }, + { + "epoch": 0.8330635370295427, + "grad_norm": 1.2384790182113647, + "learning_rate": 1.70348957257898e-05, + "loss": 0.9604, + "step": 20585 + }, + { + "epoch": 0.8332658842573857, + "grad_norm": 1.2404310703277588, + "learning_rate": 1.70142473673343e-05, + "loss": 0.9954, + "step": 20590 + }, + { + "epoch": 0.8334682314852286, + "grad_norm": 1.1246404647827148, + "learning_rate": 1.6993599008878792e-05, + "loss": 0.9574, + "step": 20595 + }, + { + "epoch": 0.8336705787130716, + "grad_norm": 1.0322741270065308, + "learning_rate": 1.6972950650423293e-05, + "loss": 1.0129, + "step": 20600 + }, + { + "epoch": 0.8338729259409146, + "grad_norm": 1.1479917764663696, + "learning_rate": 1.695230229196779e-05, + "loss": 0.9937, + "step": 20605 + }, + { + "epoch": 0.8340752731687576, + "grad_norm": 1.127181053161621, + "learning_rate": 1.6931653933512288e-05, + "loss": 1.0409, + "step": 20610 + }, + { + "epoch": 0.8342776203966006, + "grad_norm": 1.1959785223007202, + "learning_rate": 1.6911005575056785e-05, + "loss": 0.9722, + "step": 20615 + }, + { + "epoch": 0.8344799676244435, + "grad_norm": 1.1904857158660889, + "learning_rate": 1.689035721660128e-05, + "loss": 0.9778, + "step": 20620 + }, + { + "epoch": 0.8346823148522865, + "grad_norm": 1.2734626531600952, + "learning_rate": 1.6869708858145776e-05, + "loss": 0.9056, + "step": 20625 + }, + { + "epoch": 0.8348846620801295, + "grad_norm": 1.2978413105010986, + "learning_rate": 1.6849060499690274e-05, + "loss": 1.001, + "step": 20630 + }, + { + "epoch": 0.8350870093079725, + "grad_norm": 1.2909311056137085, + "learning_rate": 1.6828412141234774e-05, + "loss": 0.9382, + "step": 20635 + }, + { + "epoch": 0.8352893565358155, + "grad_norm": 1.1042921543121338, + "learning_rate": 1.680776378277927e-05, + "loss": 0.967, + "step": 20640 + }, + { + "epoch": 0.8354917037636584, + "grad_norm": 1.1582454442977905, + "learning_rate": 1.678711542432377e-05, + "loss": 1.0093, + "step": 20645 + }, + { + "epoch": 0.8356940509915014, + "grad_norm": 1.2365312576293945, + "learning_rate": 1.6766467065868263e-05, + "loss": 1.0098, + "step": 20650 + }, + { + "epoch": 0.8358963982193444, + "grad_norm": 1.275153398513794, + "learning_rate": 1.674581870741276e-05, + "loss": 0.9795, + "step": 20655 + }, + { + "epoch": 0.8360987454471873, + "grad_norm": 1.328636884689331, + "learning_rate": 1.6725170348957257e-05, + "loss": 0.9655, + "step": 20660 + }, + { + "epoch": 0.8363010926750304, + "grad_norm": 1.2965084314346313, + "learning_rate": 1.6704521990501758e-05, + "loss": 0.9852, + "step": 20665 + }, + { + "epoch": 0.8365034399028733, + "grad_norm": 1.1821112632751465, + "learning_rate": 1.6683873632046255e-05, + "loss": 0.9975, + "step": 20670 + }, + { + "epoch": 0.8367057871307163, + "grad_norm": 1.143494725227356, + "learning_rate": 1.666322527359075e-05, + "loss": 0.9518, + "step": 20675 + }, + { + "epoch": 0.8369081343585593, + "grad_norm": 1.1797096729278564, + "learning_rate": 1.6642576915135247e-05, + "loss": 1.0115, + "step": 20680 + }, + { + "epoch": 0.8371104815864022, + "grad_norm": 1.2627525329589844, + "learning_rate": 1.6621928556679744e-05, + "loss": 1.006, + "step": 20685 + }, + { + "epoch": 0.8373128288142453, + "grad_norm": 1.2649866342544556, + "learning_rate": 1.660128019822424e-05, + "loss": 0.9186, + "step": 20690 + }, + { + "epoch": 0.8375151760420882, + "grad_norm": 1.2155084609985352, + "learning_rate": 1.658063183976874e-05, + "loss": 0.9853, + "step": 20695 + }, + { + "epoch": 0.8377175232699312, + "grad_norm": 1.1376574039459229, + "learning_rate": 1.655998348131324e-05, + "loss": 1.0875, + "step": 20700 + }, + { + "epoch": 0.8379198704977742, + "grad_norm": 1.1920067071914673, + "learning_rate": 1.6539335122857733e-05, + "loss": 0.9851, + "step": 20705 + }, + { + "epoch": 0.8381222177256171, + "grad_norm": 1.170380711555481, + "learning_rate": 1.651868676440223e-05, + "loss": 0.9402, + "step": 20710 + }, + { + "epoch": 0.8383245649534601, + "grad_norm": 1.1304261684417725, + "learning_rate": 1.6498038405946728e-05, + "loss": 0.9497, + "step": 20715 + }, + { + "epoch": 0.8385269121813032, + "grad_norm": 1.155203938484192, + "learning_rate": 1.6477390047491225e-05, + "loss": 0.9433, + "step": 20720 + }, + { + "epoch": 0.8387292594091461, + "grad_norm": 1.1753532886505127, + "learning_rate": 1.6456741689035722e-05, + "loss": 1.0132, + "step": 20725 + }, + { + "epoch": 0.8389316066369891, + "grad_norm": 1.2378957271575928, + "learning_rate": 1.643609333058022e-05, + "loss": 1.0111, + "step": 20730 + }, + { + "epoch": 0.839133953864832, + "grad_norm": 1.0762790441513062, + "learning_rate": 1.6415444972124717e-05, + "loss": 1.0016, + "step": 20735 + }, + { + "epoch": 0.839336301092675, + "grad_norm": 1.2411805391311646, + "learning_rate": 1.6394796613669214e-05, + "loss": 1.0326, + "step": 20740 + }, + { + "epoch": 0.839538648320518, + "grad_norm": 1.2038969993591309, + "learning_rate": 1.637414825521371e-05, + "loss": 1.0516, + "step": 20745 + }, + { + "epoch": 0.839740995548361, + "grad_norm": 1.1077581644058228, + "learning_rate": 1.635349989675821e-05, + "loss": 1.0015, + "step": 20750 + }, + { + "epoch": 0.839943342776204, + "grad_norm": 1.1759651899337769, + "learning_rate": 1.6332851538302706e-05, + "loss": 1.0217, + "step": 20755 + }, + { + "epoch": 0.8401456900040469, + "grad_norm": 1.138208270072937, + "learning_rate": 1.6312203179847203e-05, + "loss": 1.0095, + "step": 20760 + }, + { + "epoch": 0.8403480372318899, + "grad_norm": 1.1641725301742554, + "learning_rate": 1.62915548213917e-05, + "loss": 1.0087, + "step": 20765 + }, + { + "epoch": 0.8405503844597328, + "grad_norm": 1.243086814880371, + "learning_rate": 1.6270906462936198e-05, + "loss": 0.9965, + "step": 20770 + }, + { + "epoch": 0.8407527316875759, + "grad_norm": 1.2246869802474976, + "learning_rate": 1.6250258104480695e-05, + "loss": 1.0346, + "step": 20775 + }, + { + "epoch": 0.8409550789154189, + "grad_norm": 1.2264448404312134, + "learning_rate": 1.6229609746025192e-05, + "loss": 0.9911, + "step": 20780 + }, + { + "epoch": 0.8411574261432618, + "grad_norm": 1.58490788936615, + "learning_rate": 1.6208961387569686e-05, + "loss": 0.9854, + "step": 20785 + }, + { + "epoch": 0.8413597733711048, + "grad_norm": 1.1928730010986328, + "learning_rate": 1.6188313029114187e-05, + "loss": 1.0452, + "step": 20790 + }, + { + "epoch": 0.8415621205989477, + "grad_norm": 1.259055495262146, + "learning_rate": 1.6167664670658684e-05, + "loss": 0.9524, + "step": 20795 + }, + { + "epoch": 0.8417644678267908, + "grad_norm": 1.2072827816009521, + "learning_rate": 1.614701631220318e-05, + "loss": 0.9834, + "step": 20800 + }, + { + "epoch": 0.8419668150546338, + "grad_norm": 1.2875561714172363, + "learning_rate": 1.612636795374768e-05, + "loss": 0.9549, + "step": 20805 + }, + { + "epoch": 0.8421691622824767, + "grad_norm": 1.1406989097595215, + "learning_rate": 1.6105719595292176e-05, + "loss": 0.966, + "step": 20810 + }, + { + "epoch": 0.8423715095103197, + "grad_norm": 1.243126630783081, + "learning_rate": 1.608507123683667e-05, + "loss": 0.9518, + "step": 20815 + }, + { + "epoch": 0.8425738567381627, + "grad_norm": 1.1952537298202515, + "learning_rate": 1.6064422878381167e-05, + "loss": 1.0327, + "step": 20820 + }, + { + "epoch": 0.8427762039660056, + "grad_norm": 1.1402138471603394, + "learning_rate": 1.6043774519925668e-05, + "loss": 1.0137, + "step": 20825 + }, + { + "epoch": 0.8429785511938487, + "grad_norm": 1.1675366163253784, + "learning_rate": 1.6023126161470165e-05, + "loss": 0.9981, + "step": 20830 + }, + { + "epoch": 0.8431808984216916, + "grad_norm": 1.215316891670227, + "learning_rate": 1.6002477803014663e-05, + "loss": 1.0721, + "step": 20835 + }, + { + "epoch": 0.8433832456495346, + "grad_norm": 1.1594030857086182, + "learning_rate": 1.5981829444559157e-05, + "loss": 1.1031, + "step": 20840 + }, + { + "epoch": 0.8435855928773776, + "grad_norm": 1.2180711030960083, + "learning_rate": 1.5961181086103654e-05, + "loss": 0.9981, + "step": 20845 + }, + { + "epoch": 0.8437879401052205, + "grad_norm": 1.2870091199874878, + "learning_rate": 1.594053272764815e-05, + "loss": 0.9359, + "step": 20850 + }, + { + "epoch": 0.8439902873330636, + "grad_norm": 1.3216265439987183, + "learning_rate": 1.5919884369192652e-05, + "loss": 0.9999, + "step": 20855 + }, + { + "epoch": 0.8441926345609065, + "grad_norm": 1.2268571853637695, + "learning_rate": 1.589923601073715e-05, + "loss": 0.9365, + "step": 20860 + }, + { + "epoch": 0.8443949817887495, + "grad_norm": 1.215065836906433, + "learning_rate": 1.5878587652281646e-05, + "loss": 1.018, + "step": 20865 + }, + { + "epoch": 0.8445973290165925, + "grad_norm": 1.2267355918884277, + "learning_rate": 1.585793929382614e-05, + "loss": 0.9637, + "step": 20870 + }, + { + "epoch": 0.8447996762444354, + "grad_norm": 1.1260260343551636, + "learning_rate": 1.5837290935370638e-05, + "loss": 0.971, + "step": 20875 + }, + { + "epoch": 0.8450020234722785, + "grad_norm": 1.1754131317138672, + "learning_rate": 1.5816642576915135e-05, + "loss": 1.0398, + "step": 20880 + }, + { + "epoch": 0.8452043707001214, + "grad_norm": 1.0897783041000366, + "learning_rate": 1.5795994218459632e-05, + "loss": 0.9756, + "step": 20885 + }, + { + "epoch": 0.8454067179279644, + "grad_norm": 1.2261866331100464, + "learning_rate": 1.5775345860004133e-05, + "loss": 0.9294, + "step": 20890 + }, + { + "epoch": 0.8456090651558074, + "grad_norm": 1.1631525754928589, + "learning_rate": 1.5754697501548627e-05, + "loss": 1.0199, + "step": 20895 + }, + { + "epoch": 0.8458114123836503, + "grad_norm": 1.2241733074188232, + "learning_rate": 1.5734049143093124e-05, + "loss": 0.9786, + "step": 20900 + }, + { + "epoch": 0.8460137596114933, + "grad_norm": 1.2011727094650269, + "learning_rate": 1.571340078463762e-05, + "loss": 0.9845, + "step": 20905 + }, + { + "epoch": 0.8462161068393363, + "grad_norm": 1.3325366973876953, + "learning_rate": 1.569275242618212e-05, + "loss": 0.9528, + "step": 20910 + }, + { + "epoch": 0.8464184540671793, + "grad_norm": 1.1038151979446411, + "learning_rate": 1.5672104067726616e-05, + "loss": 0.9831, + "step": 20915 + }, + { + "epoch": 0.8466208012950223, + "grad_norm": 1.2316639423370361, + "learning_rate": 1.5651455709271113e-05, + "loss": 0.9599, + "step": 20920 + }, + { + "epoch": 0.8468231485228652, + "grad_norm": 1.2244083881378174, + "learning_rate": 1.563080735081561e-05, + "loss": 0.9444, + "step": 20925 + }, + { + "epoch": 0.8470254957507082, + "grad_norm": 1.1958881616592407, + "learning_rate": 1.5610158992360108e-05, + "loss": 0.9704, + "step": 20930 + }, + { + "epoch": 0.8472278429785512, + "grad_norm": 1.1687911748886108, + "learning_rate": 1.5589510633904605e-05, + "loss": 0.9594, + "step": 20935 + }, + { + "epoch": 0.8474301902063942, + "grad_norm": 1.2375454902648926, + "learning_rate": 1.5568862275449103e-05, + "loss": 0.9012, + "step": 20940 + }, + { + "epoch": 0.8476325374342372, + "grad_norm": 1.1065729856491089, + "learning_rate": 1.55482139169936e-05, + "loss": 0.9231, + "step": 20945 + }, + { + "epoch": 0.8478348846620801, + "grad_norm": 1.2403284311294556, + "learning_rate": 1.5527565558538097e-05, + "loss": 0.9543, + "step": 20950 + }, + { + "epoch": 0.8480372318899231, + "grad_norm": 1.197769284248352, + "learning_rate": 1.5506917200082594e-05, + "loss": 0.9887, + "step": 20955 + }, + { + "epoch": 0.848239579117766, + "grad_norm": 1.2262988090515137, + "learning_rate": 1.5486268841627092e-05, + "loss": 1.0221, + "step": 20960 + }, + { + "epoch": 0.8484419263456091, + "grad_norm": 1.2083128690719604, + "learning_rate": 1.546562048317159e-05, + "loss": 1.0031, + "step": 20965 + }, + { + "epoch": 0.8486442735734521, + "grad_norm": 1.160172462463379, + "learning_rate": 1.5444972124716086e-05, + "loss": 0.9793, + "step": 20970 + }, + { + "epoch": 0.848846620801295, + "grad_norm": 1.2570322751998901, + "learning_rate": 1.5424323766260584e-05, + "loss": 0.9949, + "step": 20975 + }, + { + "epoch": 0.849048968029138, + "grad_norm": 1.1473708152770996, + "learning_rate": 1.540367540780508e-05, + "loss": 0.9765, + "step": 20980 + }, + { + "epoch": 0.8492513152569809, + "grad_norm": 1.172593593597412, + "learning_rate": 1.5383027049349578e-05, + "loss": 0.954, + "step": 20985 + }, + { + "epoch": 0.849453662484824, + "grad_norm": 1.0129601955413818, + "learning_rate": 1.5362378690894076e-05, + "loss": 0.9232, + "step": 20990 + }, + { + "epoch": 0.849656009712667, + "grad_norm": 1.2935516834259033, + "learning_rate": 1.5341730332438573e-05, + "loss": 1.0266, + "step": 20995 + }, + { + "epoch": 0.8498583569405099, + "grad_norm": 1.2592288255691528, + "learning_rate": 1.532108197398307e-05, + "loss": 0.9743, + "step": 21000 + }, + { + "epoch": 0.8500607041683529, + "grad_norm": 1.1387940645217896, + "learning_rate": 1.5300433615527564e-05, + "loss": 0.9626, + "step": 21005 + }, + { + "epoch": 0.8502630513961958, + "grad_norm": 1.2443729639053345, + "learning_rate": 1.527978525707206e-05, + "loss": 1.0068, + "step": 21010 + }, + { + "epoch": 0.8504653986240388, + "grad_norm": 1.1891158819198608, + "learning_rate": 1.5259136898616562e-05, + "loss": 0.9737, + "step": 21015 + }, + { + "epoch": 0.8506677458518819, + "grad_norm": 1.2353068590164185, + "learning_rate": 1.5238488540161058e-05, + "loss": 1.0029, + "step": 21020 + }, + { + "epoch": 0.8508700930797248, + "grad_norm": 1.1497776508331299, + "learning_rate": 1.5217840181705557e-05, + "loss": 1.048, + "step": 21025 + }, + { + "epoch": 0.8510724403075678, + "grad_norm": 1.2351295948028564, + "learning_rate": 1.519719182325005e-05, + "loss": 0.9483, + "step": 21030 + }, + { + "epoch": 0.8512747875354107, + "grad_norm": 1.2149288654327393, + "learning_rate": 1.517654346479455e-05, + "loss": 0.9573, + "step": 21035 + }, + { + "epoch": 0.8514771347632537, + "grad_norm": 1.1933867931365967, + "learning_rate": 1.5155895106339047e-05, + "loss": 0.9847, + "step": 21040 + }, + { + "epoch": 0.8516794819910968, + "grad_norm": 1.0731860399246216, + "learning_rate": 1.5135246747883544e-05, + "loss": 0.9594, + "step": 21045 + }, + { + "epoch": 0.8518818292189397, + "grad_norm": 1.1704195737838745, + "learning_rate": 1.5114598389428041e-05, + "loss": 1.0353, + "step": 21050 + }, + { + "epoch": 0.8520841764467827, + "grad_norm": 1.1087898015975952, + "learning_rate": 1.509395003097254e-05, + "loss": 0.9491, + "step": 21055 + }, + { + "epoch": 0.8522865236746257, + "grad_norm": 1.2999986410140991, + "learning_rate": 1.5073301672517034e-05, + "loss": 0.9467, + "step": 21060 + }, + { + "epoch": 0.8524888709024686, + "grad_norm": 1.2045687437057495, + "learning_rate": 1.5052653314061532e-05, + "loss": 0.976, + "step": 21065 + }, + { + "epoch": 0.8526912181303116, + "grad_norm": 1.1971813440322876, + "learning_rate": 1.503200495560603e-05, + "loss": 1.0275, + "step": 21070 + }, + { + "epoch": 0.8528935653581546, + "grad_norm": 1.2469074726104736, + "learning_rate": 1.5011356597150528e-05, + "loss": 0.9444, + "step": 21075 + }, + { + "epoch": 0.8530959125859976, + "grad_norm": 1.1796278953552246, + "learning_rate": 1.4990708238695025e-05, + "loss": 1.0424, + "step": 21080 + }, + { + "epoch": 0.8532982598138406, + "grad_norm": 1.1762430667877197, + "learning_rate": 1.497005988023952e-05, + "loss": 1.0121, + "step": 21085 + }, + { + "epoch": 0.8535006070416835, + "grad_norm": 1.522106647491455, + "learning_rate": 1.4949411521784018e-05, + "loss": 0.9712, + "step": 21090 + }, + { + "epoch": 0.8537029542695265, + "grad_norm": 1.2373191118240356, + "learning_rate": 1.4928763163328515e-05, + "loss": 0.9827, + "step": 21095 + }, + { + "epoch": 0.8539053014973695, + "grad_norm": 1.1276428699493408, + "learning_rate": 1.4908114804873013e-05, + "loss": 1.001, + "step": 21100 + }, + { + "epoch": 0.8541076487252125, + "grad_norm": 1.234139084815979, + "learning_rate": 1.4887466446417512e-05, + "loss": 0.9759, + "step": 21105 + }, + { + "epoch": 0.8543099959530555, + "grad_norm": 1.157067894935608, + "learning_rate": 1.4866818087962009e-05, + "loss": 0.9726, + "step": 21110 + }, + { + "epoch": 0.8545123431808984, + "grad_norm": 1.2965073585510254, + "learning_rate": 1.4846169729506505e-05, + "loss": 0.969, + "step": 21115 + }, + { + "epoch": 0.8547146904087414, + "grad_norm": 1.2545359134674072, + "learning_rate": 1.4825521371051002e-05, + "loss": 0.9854, + "step": 21120 + }, + { + "epoch": 0.8549170376365843, + "grad_norm": 1.1478519439697266, + "learning_rate": 1.48048730125955e-05, + "loss": 0.9751, + "step": 21125 + }, + { + "epoch": 0.8551193848644274, + "grad_norm": 1.2274898290634155, + "learning_rate": 1.4784224654139996e-05, + "loss": 0.9848, + "step": 21130 + }, + { + "epoch": 0.8553217320922704, + "grad_norm": 1.1130954027175903, + "learning_rate": 1.4763576295684495e-05, + "loss": 0.9941, + "step": 21135 + }, + { + "epoch": 0.8555240793201133, + "grad_norm": 1.1170850992202759, + "learning_rate": 1.474292793722899e-05, + "loss": 1.0035, + "step": 21140 + }, + { + "epoch": 0.8557264265479563, + "grad_norm": 1.3653924465179443, + "learning_rate": 1.4722279578773487e-05, + "loss": 1.0266, + "step": 21145 + }, + { + "epoch": 0.8559287737757992, + "grad_norm": 1.193739414215088, + "learning_rate": 1.4701631220317986e-05, + "loss": 1.0277, + "step": 21150 + }, + { + "epoch": 0.8561311210036423, + "grad_norm": 1.2416847944259644, + "learning_rate": 1.4680982861862483e-05, + "loss": 1.0254, + "step": 21155 + }, + { + "epoch": 0.8563334682314853, + "grad_norm": 1.1676281690597534, + "learning_rate": 1.466033450340698e-05, + "loss": 0.9555, + "step": 21160 + }, + { + "epoch": 0.8565358154593282, + "grad_norm": 1.2324869632720947, + "learning_rate": 1.4639686144951478e-05, + "loss": 0.9764, + "step": 21165 + }, + { + "epoch": 0.8567381626871712, + "grad_norm": 1.2595820426940918, + "learning_rate": 1.4619037786495973e-05, + "loss": 1.0357, + "step": 21170 + }, + { + "epoch": 0.8569405099150141, + "grad_norm": 1.2351970672607422, + "learning_rate": 1.459838942804047e-05, + "loss": 1.0074, + "step": 21175 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 1.1445361375808716, + "learning_rate": 1.457774106958497e-05, + "loss": 0.9785, + "step": 21180 + }, + { + "epoch": 0.8573452043707002, + "grad_norm": 1.1912777423858643, + "learning_rate": 1.4557092711129467e-05, + "loss": 1.0694, + "step": 21185 + }, + { + "epoch": 0.8575475515985431, + "grad_norm": 1.3794888257980347, + "learning_rate": 1.4536444352673964e-05, + "loss": 1.008, + "step": 21190 + }, + { + "epoch": 0.8577498988263861, + "grad_norm": 1.210392713546753, + "learning_rate": 1.451579599421846e-05, + "loss": 1.0009, + "step": 21195 + }, + { + "epoch": 0.857952246054229, + "grad_norm": 1.1218934059143066, + "learning_rate": 1.4495147635762957e-05, + "loss": 0.9867, + "step": 21200 + }, + { + "epoch": 0.858154593282072, + "grad_norm": 1.0614824295043945, + "learning_rate": 1.4474499277307454e-05, + "loss": 0.872, + "step": 21205 + }, + { + "epoch": 0.8583569405099151, + "grad_norm": 1.2585033178329468, + "learning_rate": 1.4453850918851952e-05, + "loss": 0.9982, + "step": 21210 + }, + { + "epoch": 0.858559287737758, + "grad_norm": 1.2803871631622314, + "learning_rate": 1.443320256039645e-05, + "loss": 0.9609, + "step": 21215 + }, + { + "epoch": 0.858761634965601, + "grad_norm": 1.095826268196106, + "learning_rate": 1.4412554201940948e-05, + "loss": 1.0214, + "step": 21220 + }, + { + "epoch": 0.8589639821934439, + "grad_norm": 1.105059027671814, + "learning_rate": 1.4391905843485443e-05, + "loss": 0.964, + "step": 21225 + }, + { + "epoch": 0.8591663294212869, + "grad_norm": 1.4007976055145264, + "learning_rate": 1.437125748502994e-05, + "loss": 0.9648, + "step": 21230 + }, + { + "epoch": 0.8593686766491299, + "grad_norm": 1.1132739782333374, + "learning_rate": 1.4350609126574438e-05, + "loss": 0.9983, + "step": 21235 + }, + { + "epoch": 0.8595710238769729, + "grad_norm": 1.2380614280700684, + "learning_rate": 1.4329960768118935e-05, + "loss": 0.987, + "step": 21240 + }, + { + "epoch": 0.8597733711048159, + "grad_norm": 1.3055659532546997, + "learning_rate": 1.4309312409663434e-05, + "loss": 1.0415, + "step": 21245 + }, + { + "epoch": 0.8599757183326588, + "grad_norm": 1.1960859298706055, + "learning_rate": 1.4288664051207928e-05, + "loss": 0.995, + "step": 21250 + }, + { + "epoch": 0.8601780655605018, + "grad_norm": 1.2132443189620972, + "learning_rate": 1.4268015692752425e-05, + "loss": 1.005, + "step": 21255 + }, + { + "epoch": 0.8603804127883448, + "grad_norm": 1.3091709613800049, + "learning_rate": 1.4247367334296924e-05, + "loss": 0.9947, + "step": 21260 + }, + { + "epoch": 0.8605827600161878, + "grad_norm": 1.2340102195739746, + "learning_rate": 1.4226718975841422e-05, + "loss": 0.9763, + "step": 21265 + }, + { + "epoch": 0.8607851072440308, + "grad_norm": 1.1634894609451294, + "learning_rate": 1.4206070617385919e-05, + "loss": 0.9707, + "step": 21270 + }, + { + "epoch": 0.8609874544718737, + "grad_norm": 1.237231731414795, + "learning_rate": 1.4185422258930416e-05, + "loss": 0.9842, + "step": 21275 + }, + { + "epoch": 0.8611898016997167, + "grad_norm": 1.1723443269729614, + "learning_rate": 1.4164773900474912e-05, + "loss": 0.9692, + "step": 21280 + }, + { + "epoch": 0.8613921489275597, + "grad_norm": 1.1599555015563965, + "learning_rate": 1.414412554201941e-05, + "loss": 0.9125, + "step": 21285 + }, + { + "epoch": 0.8615944961554026, + "grad_norm": 1.109898567199707, + "learning_rate": 1.4123477183563907e-05, + "loss": 0.9263, + "step": 21290 + }, + { + "epoch": 0.8617968433832457, + "grad_norm": 1.231009602546692, + "learning_rate": 1.4102828825108406e-05, + "loss": 0.9312, + "step": 21295 + }, + { + "epoch": 0.8619991906110887, + "grad_norm": 1.1627553701400757, + "learning_rate": 1.4082180466652903e-05, + "loss": 1.0614, + "step": 21300 + }, + { + "epoch": 0.8622015378389316, + "grad_norm": 1.1727001667022705, + "learning_rate": 1.4061532108197398e-05, + "loss": 1.0312, + "step": 21305 + }, + { + "epoch": 0.8624038850667746, + "grad_norm": 1.2657217979431152, + "learning_rate": 1.4040883749741896e-05, + "loss": 0.947, + "step": 21310 + }, + { + "epoch": 0.8626062322946175, + "grad_norm": 1.2353177070617676, + "learning_rate": 1.4020235391286393e-05, + "loss": 0.9976, + "step": 21315 + }, + { + "epoch": 0.8628085795224606, + "grad_norm": 1.23910653591156, + "learning_rate": 1.399958703283089e-05, + "loss": 0.9671, + "step": 21320 + }, + { + "epoch": 0.8630109267503036, + "grad_norm": 1.2736573219299316, + "learning_rate": 1.397893867437539e-05, + "loss": 1.0209, + "step": 21325 + }, + { + "epoch": 0.8632132739781465, + "grad_norm": 1.2034873962402344, + "learning_rate": 1.3958290315919887e-05, + "loss": 1.0165, + "step": 21330 + }, + { + "epoch": 0.8634156212059895, + "grad_norm": 1.2223907709121704, + "learning_rate": 1.393764195746438e-05, + "loss": 0.9769, + "step": 21335 + }, + { + "epoch": 0.8636179684338324, + "grad_norm": 1.176323652267456, + "learning_rate": 1.391699359900888e-05, + "loss": 1.0038, + "step": 21340 + }, + { + "epoch": 0.8638203156616754, + "grad_norm": 1.2671642303466797, + "learning_rate": 1.3896345240553377e-05, + "loss": 1.0339, + "step": 21345 + }, + { + "epoch": 0.8640226628895185, + "grad_norm": 1.3125969171524048, + "learning_rate": 1.3875696882097874e-05, + "loss": 0.9145, + "step": 21350 + }, + { + "epoch": 0.8642250101173614, + "grad_norm": 1.3143863677978516, + "learning_rate": 1.3855048523642371e-05, + "loss": 1.0101, + "step": 21355 + }, + { + "epoch": 0.8644273573452044, + "grad_norm": 1.1887516975402832, + "learning_rate": 1.3834400165186867e-05, + "loss": 0.9995, + "step": 21360 + }, + { + "epoch": 0.8646297045730473, + "grad_norm": 1.230217456817627, + "learning_rate": 1.3813751806731364e-05, + "loss": 1.0089, + "step": 21365 + }, + { + "epoch": 0.8648320518008903, + "grad_norm": 1.3523839712142944, + "learning_rate": 1.3793103448275863e-05, + "loss": 0.9322, + "step": 21370 + }, + { + "epoch": 0.8650343990287334, + "grad_norm": 1.1450783014297485, + "learning_rate": 1.377245508982036e-05, + "loss": 0.9545, + "step": 21375 + }, + { + "epoch": 0.8652367462565763, + "grad_norm": 1.2504395246505737, + "learning_rate": 1.3751806731364858e-05, + "loss": 0.9773, + "step": 21380 + }, + { + "epoch": 0.8654390934844193, + "grad_norm": 1.2486871480941772, + "learning_rate": 1.3731158372909355e-05, + "loss": 0.9868, + "step": 21385 + }, + { + "epoch": 0.8656414407122622, + "grad_norm": 1.1254518032073975, + "learning_rate": 1.371051001445385e-05, + "loss": 1.024, + "step": 21390 + }, + { + "epoch": 0.8658437879401052, + "grad_norm": 1.1728692054748535, + "learning_rate": 1.3689861655998348e-05, + "loss": 0.9564, + "step": 21395 + }, + { + "epoch": 0.8660461351679482, + "grad_norm": 1.2263543605804443, + "learning_rate": 1.3669213297542845e-05, + "loss": 0.9728, + "step": 21400 + }, + { + "epoch": 0.8662484823957912, + "grad_norm": 1.131757140159607, + "learning_rate": 1.3648564939087344e-05, + "loss": 0.9845, + "step": 21405 + }, + { + "epoch": 0.8664508296236342, + "grad_norm": 1.223541021347046, + "learning_rate": 1.3627916580631842e-05, + "loss": 0.9843, + "step": 21410 + }, + { + "epoch": 0.8666531768514771, + "grad_norm": 1.2180362939834595, + "learning_rate": 1.3607268222176337e-05, + "loss": 1.0511, + "step": 21415 + }, + { + "epoch": 0.8668555240793201, + "grad_norm": 1.2126442193984985, + "learning_rate": 1.3586619863720835e-05, + "loss": 0.964, + "step": 21420 + }, + { + "epoch": 0.867057871307163, + "grad_norm": 1.350108027458191, + "learning_rate": 1.3565971505265332e-05, + "loss": 1.0417, + "step": 21425 + }, + { + "epoch": 0.8672602185350061, + "grad_norm": 1.229135274887085, + "learning_rate": 1.354532314680983e-05, + "loss": 1.0019, + "step": 21430 + }, + { + "epoch": 0.8674625657628491, + "grad_norm": 1.244516372680664, + "learning_rate": 1.3524674788354328e-05, + "loss": 0.9827, + "step": 21435 + }, + { + "epoch": 0.867664912990692, + "grad_norm": 1.2456797361373901, + "learning_rate": 1.3504026429898825e-05, + "loss": 0.9963, + "step": 21440 + }, + { + "epoch": 0.867867260218535, + "grad_norm": 1.1829853057861328, + "learning_rate": 1.348337807144332e-05, + "loss": 1.0354, + "step": 21445 + }, + { + "epoch": 0.868069607446378, + "grad_norm": 1.233486533164978, + "learning_rate": 1.3462729712987818e-05, + "loss": 0.964, + "step": 21450 + }, + { + "epoch": 0.8682719546742209, + "grad_norm": 1.2854801416397095, + "learning_rate": 1.3442081354532316e-05, + "loss": 0.9566, + "step": 21455 + }, + { + "epoch": 0.868474301902064, + "grad_norm": 1.2866089344024658, + "learning_rate": 1.3421432996076813e-05, + "loss": 0.9384, + "step": 21460 + }, + { + "epoch": 0.8686766491299069, + "grad_norm": 1.1400288343429565, + "learning_rate": 1.340078463762131e-05, + "loss": 1.0305, + "step": 21465 + }, + { + "epoch": 0.8688789963577499, + "grad_norm": 1.1516262292861938, + "learning_rate": 1.3380136279165806e-05, + "loss": 1.0067, + "step": 21470 + }, + { + "epoch": 0.8690813435855929, + "grad_norm": 1.212130069732666, + "learning_rate": 1.3359487920710303e-05, + "loss": 0.9806, + "step": 21475 + }, + { + "epoch": 0.8692836908134358, + "grad_norm": 1.2694216966629028, + "learning_rate": 1.33388395622548e-05, + "loss": 1.0292, + "step": 21480 + }, + { + "epoch": 0.8694860380412789, + "grad_norm": 1.2387278079986572, + "learning_rate": 1.33181912037993e-05, + "loss": 0.9744, + "step": 21485 + }, + { + "epoch": 0.8696883852691218, + "grad_norm": 1.2714673280715942, + "learning_rate": 1.3297542845343797e-05, + "loss": 1.0123, + "step": 21490 + }, + { + "epoch": 0.8698907324969648, + "grad_norm": 1.1535682678222656, + "learning_rate": 1.3276894486888294e-05, + "loss": 0.9502, + "step": 21495 + }, + { + "epoch": 0.8700930797248078, + "grad_norm": 1.1316004991531372, + "learning_rate": 1.325624612843279e-05, + "loss": 0.9473, + "step": 21500 + }, + { + "epoch": 0.8702954269526507, + "grad_norm": 1.2753766775131226, + "learning_rate": 1.3235597769977287e-05, + "loss": 1.0167, + "step": 21505 + }, + { + "epoch": 0.8704977741804937, + "grad_norm": 1.2434911727905273, + "learning_rate": 1.3214949411521784e-05, + "loss": 1.0, + "step": 21510 + }, + { + "epoch": 0.8707001214083367, + "grad_norm": 1.237639307975769, + "learning_rate": 1.3194301053066283e-05, + "loss": 0.9892, + "step": 21515 + }, + { + "epoch": 0.8709024686361797, + "grad_norm": 1.1442434787750244, + "learning_rate": 1.317365269461078e-05, + "loss": 0.968, + "step": 21520 + }, + { + "epoch": 0.8711048158640227, + "grad_norm": 1.21522057056427, + "learning_rate": 1.3153004336155274e-05, + "loss": 1.043, + "step": 21525 + }, + { + "epoch": 0.8713071630918656, + "grad_norm": 1.2270630598068237, + "learning_rate": 1.3132355977699773e-05, + "loss": 0.9797, + "step": 21530 + }, + { + "epoch": 0.8715095103197086, + "grad_norm": 1.1119053363800049, + "learning_rate": 1.311170761924427e-05, + "loss": 0.8997, + "step": 21535 + }, + { + "epoch": 0.8717118575475516, + "grad_norm": 1.2777276039123535, + "learning_rate": 1.3091059260788768e-05, + "loss": 1.0123, + "step": 21540 + }, + { + "epoch": 0.8719142047753946, + "grad_norm": 1.17526113986969, + "learning_rate": 1.3070410902333265e-05, + "loss": 1.0459, + "step": 21545 + }, + { + "epoch": 0.8721165520032376, + "grad_norm": 1.2038923501968384, + "learning_rate": 1.3049762543877761e-05, + "loss": 1.0099, + "step": 21550 + }, + { + "epoch": 0.8723188992310805, + "grad_norm": 1.293413519859314, + "learning_rate": 1.3029114185422258e-05, + "loss": 1.0036, + "step": 21555 + }, + { + "epoch": 0.8725212464589235, + "grad_norm": 1.163986325263977, + "learning_rate": 1.3008465826966757e-05, + "loss": 0.9912, + "step": 21560 + }, + { + "epoch": 0.8727235936867664, + "grad_norm": 1.279579520225525, + "learning_rate": 1.2987817468511255e-05, + "loss": 1.0417, + "step": 21565 + }, + { + "epoch": 0.8729259409146095, + "grad_norm": 1.1215415000915527, + "learning_rate": 1.2967169110055752e-05, + "loss": 0.9581, + "step": 21570 + }, + { + "epoch": 0.8731282881424525, + "grad_norm": 1.1366301774978638, + "learning_rate": 1.2946520751600249e-05, + "loss": 0.9994, + "step": 21575 + }, + { + "epoch": 0.8733306353702954, + "grad_norm": 1.1679660081863403, + "learning_rate": 1.2925872393144745e-05, + "loss": 0.9659, + "step": 21580 + }, + { + "epoch": 0.8735329825981384, + "grad_norm": 1.1928454637527466, + "learning_rate": 1.2905224034689242e-05, + "loss": 1.0112, + "step": 21585 + }, + { + "epoch": 0.8737353298259813, + "grad_norm": 1.2413065433502197, + "learning_rate": 1.288457567623374e-05, + "loss": 0.9623, + "step": 21590 + }, + { + "epoch": 0.8739376770538244, + "grad_norm": 1.2356163263320923, + "learning_rate": 1.2863927317778238e-05, + "loss": 1.0146, + "step": 21595 + }, + { + "epoch": 0.8741400242816674, + "grad_norm": 1.1208093166351318, + "learning_rate": 1.2843278959322736e-05, + "loss": 0.9413, + "step": 21600 + }, + { + "epoch": 0.8743423715095103, + "grad_norm": 1.1954208612442017, + "learning_rate": 1.2822630600867231e-05, + "loss": 1.0004, + "step": 21605 + }, + { + "epoch": 0.8745447187373533, + "grad_norm": 1.0967018604278564, + "learning_rate": 1.2801982242411728e-05, + "loss": 0.9661, + "step": 21610 + }, + { + "epoch": 0.8747470659651962, + "grad_norm": 1.3068009614944458, + "learning_rate": 1.2781333883956226e-05, + "loss": 0.9971, + "step": 21615 + }, + { + "epoch": 0.8749494131930392, + "grad_norm": 1.2453967332839966, + "learning_rate": 1.2760685525500723e-05, + "loss": 0.9947, + "step": 21620 + }, + { + "epoch": 0.8751517604208823, + "grad_norm": 1.2303450107574463, + "learning_rate": 1.274003716704522e-05, + "loss": 0.9848, + "step": 21625 + }, + { + "epoch": 0.8753541076487252, + "grad_norm": 1.1697107553482056, + "learning_rate": 1.271938880858972e-05, + "loss": 0.9864, + "step": 21630 + }, + { + "epoch": 0.8755564548765682, + "grad_norm": 1.1332446336746216, + "learning_rate": 1.2698740450134213e-05, + "loss": 0.9674, + "step": 21635 + }, + { + "epoch": 0.8757588021044111, + "grad_norm": 1.082084059715271, + "learning_rate": 1.2678092091678712e-05, + "loss": 0.9757, + "step": 21640 + }, + { + "epoch": 0.8759611493322541, + "grad_norm": 1.266560673713684, + "learning_rate": 1.265744373322321e-05, + "loss": 0.9536, + "step": 21645 + }, + { + "epoch": 0.8761634965600972, + "grad_norm": 1.2842439413070679, + "learning_rate": 1.2636795374767707e-05, + "loss": 1.0134, + "step": 21650 + }, + { + "epoch": 0.8763658437879401, + "grad_norm": 1.2898311614990234, + "learning_rate": 1.2616147016312204e-05, + "loss": 1.0112, + "step": 21655 + }, + { + "epoch": 0.8765681910157831, + "grad_norm": 1.2124890089035034, + "learning_rate": 1.25954986578567e-05, + "loss": 1.0626, + "step": 21660 + }, + { + "epoch": 0.876770538243626, + "grad_norm": 1.126383900642395, + "learning_rate": 1.2574850299401197e-05, + "loss": 0.9958, + "step": 21665 + }, + { + "epoch": 0.876972885471469, + "grad_norm": 1.1929011344909668, + "learning_rate": 1.2554201940945694e-05, + "loss": 1.0046, + "step": 21670 + }, + { + "epoch": 0.877175232699312, + "grad_norm": 1.068507194519043, + "learning_rate": 1.2533553582490193e-05, + "loss": 0.9997, + "step": 21675 + }, + { + "epoch": 0.877377579927155, + "grad_norm": 1.112916111946106, + "learning_rate": 1.251290522403469e-05, + "loss": 1.0405, + "step": 21680 + }, + { + "epoch": 0.877579927154998, + "grad_norm": 1.1991050243377686, + "learning_rate": 1.2492256865579186e-05, + "loss": 0.9949, + "step": 21685 + }, + { + "epoch": 0.877782274382841, + "grad_norm": 1.2340871095657349, + "learning_rate": 1.2471608507123685e-05, + "loss": 0.9546, + "step": 21690 + }, + { + "epoch": 0.8779846216106839, + "grad_norm": 1.2195053100585938, + "learning_rate": 1.245096014866818e-05, + "loss": 0.9761, + "step": 21695 + }, + { + "epoch": 0.8781869688385269, + "grad_norm": 1.2020443677902222, + "learning_rate": 1.2430311790212678e-05, + "loss": 0.9838, + "step": 21700 + }, + { + "epoch": 0.8783893160663699, + "grad_norm": 1.375296950340271, + "learning_rate": 1.2409663431757177e-05, + "loss": 0.9381, + "step": 21705 + }, + { + "epoch": 0.8785916632942129, + "grad_norm": 1.1228500604629517, + "learning_rate": 1.2389015073301673e-05, + "loss": 1.0171, + "step": 21710 + }, + { + "epoch": 0.8787940105220559, + "grad_norm": 1.1400810480117798, + "learning_rate": 1.236836671484617e-05, + "loss": 0.9277, + "step": 21715 + }, + { + "epoch": 0.8789963577498988, + "grad_norm": 1.3855520486831665, + "learning_rate": 1.2347718356390667e-05, + "loss": 0.9902, + "step": 21720 + }, + { + "epoch": 0.8791987049777418, + "grad_norm": 1.1449639797210693, + "learning_rate": 1.2327069997935165e-05, + "loss": 0.947, + "step": 21725 + }, + { + "epoch": 0.8794010522055847, + "grad_norm": 1.1496288776397705, + "learning_rate": 1.2306421639479662e-05, + "loss": 0.9917, + "step": 21730 + }, + { + "epoch": 0.8796033994334278, + "grad_norm": 1.1176990270614624, + "learning_rate": 1.228577328102416e-05, + "loss": 0.9002, + "step": 21735 + }, + { + "epoch": 0.8798057466612708, + "grad_norm": 1.286231279373169, + "learning_rate": 1.2265124922568657e-05, + "loss": 1.051, + "step": 21740 + }, + { + "epoch": 0.8800080938891137, + "grad_norm": 1.270430564880371, + "learning_rate": 1.2244476564113154e-05, + "loss": 1.0009, + "step": 21745 + }, + { + "epoch": 0.8802104411169567, + "grad_norm": 1.1725939512252808, + "learning_rate": 1.2223828205657651e-05, + "loss": 1.0042, + "step": 21750 + }, + { + "epoch": 0.8804127883447996, + "grad_norm": 1.2585035562515259, + "learning_rate": 1.2203179847202148e-05, + "loss": 1.0237, + "step": 21755 + }, + { + "epoch": 0.8806151355726427, + "grad_norm": 1.2797117233276367, + "learning_rate": 1.2182531488746646e-05, + "loss": 0.9502, + "step": 21760 + }, + { + "epoch": 0.8808174828004857, + "grad_norm": 1.1111143827438354, + "learning_rate": 1.2161883130291141e-05, + "loss": 0.9753, + "step": 21765 + }, + { + "epoch": 0.8810198300283286, + "grad_norm": 1.2845194339752197, + "learning_rate": 1.214123477183564e-05, + "loss": 0.9072, + "step": 21770 + }, + { + "epoch": 0.8812221772561716, + "grad_norm": 1.2153072357177734, + "learning_rate": 1.2120586413380138e-05, + "loss": 1.0065, + "step": 21775 + }, + { + "epoch": 0.8814245244840145, + "grad_norm": 1.1618738174438477, + "learning_rate": 1.2099938054924633e-05, + "loss": 1.0596, + "step": 21780 + }, + { + "epoch": 0.8816268717118575, + "grad_norm": 1.3450782299041748, + "learning_rate": 1.2079289696469132e-05, + "loss": 0.9763, + "step": 21785 + }, + { + "epoch": 0.8818292189397006, + "grad_norm": 1.1920745372772217, + "learning_rate": 1.2058641338013628e-05, + "loss": 1.0137, + "step": 21790 + }, + { + "epoch": 0.8820315661675435, + "grad_norm": 1.39998459815979, + "learning_rate": 1.2037992979558125e-05, + "loss": 0.9687, + "step": 21795 + }, + { + "epoch": 0.8822339133953865, + "grad_norm": 1.3136975765228271, + "learning_rate": 1.2017344621102624e-05, + "loss": 0.9613, + "step": 21800 + }, + { + "epoch": 0.8824362606232294, + "grad_norm": 1.1904999017715454, + "learning_rate": 1.199669626264712e-05, + "loss": 1.0179, + "step": 21805 + }, + { + "epoch": 0.8826386078510724, + "grad_norm": 1.0881669521331787, + "learning_rate": 1.1976047904191617e-05, + "loss": 0.9737, + "step": 21810 + }, + { + "epoch": 0.8828409550789155, + "grad_norm": 1.222800374031067, + "learning_rate": 1.1955399545736114e-05, + "loss": 1.0169, + "step": 21815 + }, + { + "epoch": 0.8830433023067584, + "grad_norm": 1.233533501625061, + "learning_rate": 1.1934751187280612e-05, + "loss": 1.0014, + "step": 21820 + }, + { + "epoch": 0.8832456495346014, + "grad_norm": 1.1993024349212646, + "learning_rate": 1.1914102828825109e-05, + "loss": 1.0282, + "step": 21825 + }, + { + "epoch": 0.8834479967624443, + "grad_norm": 1.3259748220443726, + "learning_rate": 1.1893454470369606e-05, + "loss": 1.0201, + "step": 21830 + }, + { + "epoch": 0.8836503439902873, + "grad_norm": 1.1832479238510132, + "learning_rate": 1.1872806111914103e-05, + "loss": 1.0084, + "step": 21835 + }, + { + "epoch": 0.8838526912181303, + "grad_norm": 1.4280952215194702, + "learning_rate": 1.18521577534586e-05, + "loss": 1.0573, + "step": 21840 + }, + { + "epoch": 0.8840550384459733, + "grad_norm": 1.2184512615203857, + "learning_rate": 1.1831509395003098e-05, + "loss": 0.9729, + "step": 21845 + }, + { + "epoch": 0.8842573856738163, + "grad_norm": 1.2103275060653687, + "learning_rate": 1.1810861036547595e-05, + "loss": 1.0354, + "step": 21850 + }, + { + "epoch": 0.8844597329016592, + "grad_norm": 1.17412269115448, + "learning_rate": 1.1790212678092093e-05, + "loss": 1.0041, + "step": 21855 + }, + { + "epoch": 0.8846620801295022, + "grad_norm": 1.2611116170883179, + "learning_rate": 1.1769564319636588e-05, + "loss": 1.0013, + "step": 21860 + }, + { + "epoch": 0.8848644273573452, + "grad_norm": 1.2616214752197266, + "learning_rate": 1.1748915961181087e-05, + "loss": 0.9902, + "step": 21865 + }, + { + "epoch": 0.8850667745851882, + "grad_norm": 1.2103784084320068, + "learning_rate": 1.1728267602725585e-05, + "loss": 1.0034, + "step": 21870 + }, + { + "epoch": 0.8852691218130312, + "grad_norm": 1.2653135061264038, + "learning_rate": 1.170761924427008e-05, + "loss": 0.9487, + "step": 21875 + }, + { + "epoch": 0.8854714690408741, + "grad_norm": 1.273098349571228, + "learning_rate": 1.1686970885814579e-05, + "loss": 0.9933, + "step": 21880 + }, + { + "epoch": 0.8856738162687171, + "grad_norm": 1.221599817276001, + "learning_rate": 1.1666322527359075e-05, + "loss": 1.0322, + "step": 21885 + }, + { + "epoch": 0.8858761634965601, + "grad_norm": 1.2934221029281616, + "learning_rate": 1.1645674168903572e-05, + "loss": 1.0064, + "step": 21890 + }, + { + "epoch": 0.886078510724403, + "grad_norm": 1.3288832902908325, + "learning_rate": 1.1625025810448071e-05, + "loss": 0.9758, + "step": 21895 + }, + { + "epoch": 0.8862808579522461, + "grad_norm": 1.1124895811080933, + "learning_rate": 1.1604377451992567e-05, + "loss": 0.9687, + "step": 21900 + }, + { + "epoch": 0.886483205180089, + "grad_norm": 1.308106780052185, + "learning_rate": 1.1583729093537064e-05, + "loss": 1.0038, + "step": 21905 + }, + { + "epoch": 0.886685552407932, + "grad_norm": 1.2233226299285889, + "learning_rate": 1.1563080735081561e-05, + "loss": 0.9868, + "step": 21910 + }, + { + "epoch": 0.886887899635775, + "grad_norm": 1.2253919839859009, + "learning_rate": 1.1542432376626059e-05, + "loss": 1.0083, + "step": 21915 + }, + { + "epoch": 0.8870902468636179, + "grad_norm": 1.287880778312683, + "learning_rate": 1.1521784018170556e-05, + "loss": 0.9998, + "step": 21920 + }, + { + "epoch": 0.887292594091461, + "grad_norm": 1.310227870941162, + "learning_rate": 1.1501135659715053e-05, + "loss": 1.0066, + "step": 21925 + }, + { + "epoch": 0.887494941319304, + "grad_norm": 1.285409688949585, + "learning_rate": 1.148048730125955e-05, + "loss": 0.9458, + "step": 21930 + }, + { + "epoch": 0.8876972885471469, + "grad_norm": 1.2010833024978638, + "learning_rate": 1.1459838942804048e-05, + "loss": 0.9599, + "step": 21935 + }, + { + "epoch": 0.8878996357749899, + "grad_norm": 1.2600255012512207, + "learning_rate": 1.1439190584348545e-05, + "loss": 1.0022, + "step": 21940 + }, + { + "epoch": 0.8881019830028328, + "grad_norm": 1.1470943689346313, + "learning_rate": 1.1418542225893042e-05, + "loss": 0.974, + "step": 21945 + }, + { + "epoch": 0.8883043302306758, + "grad_norm": 1.2417147159576416, + "learning_rate": 1.139789386743754e-05, + "loss": 0.9644, + "step": 21950 + }, + { + "epoch": 0.8885066774585189, + "grad_norm": 1.3502781391143799, + "learning_rate": 1.1377245508982035e-05, + "loss": 0.9731, + "step": 21955 + }, + { + "epoch": 0.8887090246863618, + "grad_norm": 1.2079834938049316, + "learning_rate": 1.1356597150526534e-05, + "loss": 0.9691, + "step": 21960 + }, + { + "epoch": 0.8889113719142048, + "grad_norm": 1.176963210105896, + "learning_rate": 1.1335948792071031e-05, + "loss": 0.9759, + "step": 21965 + }, + { + "epoch": 0.8891137191420477, + "grad_norm": 1.263867974281311, + "learning_rate": 1.1315300433615527e-05, + "loss": 0.9369, + "step": 21970 + }, + { + "epoch": 0.8893160663698907, + "grad_norm": 1.1320356130599976, + "learning_rate": 1.1294652075160026e-05, + "loss": 0.9553, + "step": 21975 + }, + { + "epoch": 0.8895184135977338, + "grad_norm": 1.204413652420044, + "learning_rate": 1.1274003716704523e-05, + "loss": 1.0072, + "step": 21980 + }, + { + "epoch": 0.8897207608255767, + "grad_norm": 1.1795121431350708, + "learning_rate": 1.1253355358249019e-05, + "loss": 0.984, + "step": 21985 + }, + { + "epoch": 0.8899231080534197, + "grad_norm": 1.2841216325759888, + "learning_rate": 1.1232706999793518e-05, + "loss": 0.9346, + "step": 21990 + }, + { + "epoch": 0.8901254552812626, + "grad_norm": 1.228280782699585, + "learning_rate": 1.1212058641338014e-05, + "loss": 0.9995, + "step": 21995 + }, + { + "epoch": 0.8903278025091056, + "grad_norm": 1.1534844636917114, + "learning_rate": 1.1191410282882511e-05, + "loss": 0.9765, + "step": 22000 + }, + { + "epoch": 0.8905301497369486, + "grad_norm": 1.3221265077590942, + "learning_rate": 1.1170761924427008e-05, + "loss": 0.9167, + "step": 22005 + }, + { + "epoch": 0.8907324969647916, + "grad_norm": 1.1518945693969727, + "learning_rate": 1.1150113565971505e-05, + "loss": 0.9555, + "step": 22010 + }, + { + "epoch": 0.8909348441926346, + "grad_norm": 1.3317203521728516, + "learning_rate": 1.1129465207516003e-05, + "loss": 0.975, + "step": 22015 + }, + { + "epoch": 0.8911371914204775, + "grad_norm": 1.214125394821167, + "learning_rate": 1.11088168490605e-05, + "loss": 0.9902, + "step": 22020 + }, + { + "epoch": 0.8913395386483205, + "grad_norm": 1.1284027099609375, + "learning_rate": 1.1088168490604997e-05, + "loss": 0.9531, + "step": 22025 + }, + { + "epoch": 0.8915418858761635, + "grad_norm": 1.1702163219451904, + "learning_rate": 1.1067520132149495e-05, + "loss": 0.9817, + "step": 22030 + }, + { + "epoch": 0.8917442331040065, + "grad_norm": 1.3232663869857788, + "learning_rate": 1.1046871773693992e-05, + "loss": 1.0207, + "step": 22035 + }, + { + "epoch": 0.8919465803318495, + "grad_norm": 1.061571717262268, + "learning_rate": 1.102622341523849e-05, + "loss": 1.0115, + "step": 22040 + }, + { + "epoch": 0.8921489275596924, + "grad_norm": 1.2872508764266968, + "learning_rate": 1.1005575056782987e-05, + "loss": 0.9768, + "step": 22045 + }, + { + "epoch": 0.8923512747875354, + "grad_norm": 1.2478415966033936, + "learning_rate": 1.0984926698327482e-05, + "loss": 0.9753, + "step": 22050 + }, + { + "epoch": 0.8925536220153784, + "grad_norm": 1.207758903503418, + "learning_rate": 1.0964278339871981e-05, + "loss": 0.9842, + "step": 22055 + }, + { + "epoch": 0.8927559692432213, + "grad_norm": 1.3182350397109985, + "learning_rate": 1.0943629981416478e-05, + "loss": 0.9764, + "step": 22060 + }, + { + "epoch": 0.8929583164710644, + "grad_norm": 1.3606094121932983, + "learning_rate": 1.0922981622960974e-05, + "loss": 1.009, + "step": 22065 + }, + { + "epoch": 0.8931606636989073, + "grad_norm": 1.20542311668396, + "learning_rate": 1.0902333264505473e-05, + "loss": 0.9638, + "step": 22070 + }, + { + "epoch": 0.8933630109267503, + "grad_norm": 1.2064385414123535, + "learning_rate": 1.088168490604997e-05, + "loss": 0.9451, + "step": 22075 + }, + { + "epoch": 0.8935653581545933, + "grad_norm": 1.1925119161605835, + "learning_rate": 1.0861036547594466e-05, + "loss": 0.985, + "step": 22080 + }, + { + "epoch": 0.8937677053824362, + "grad_norm": 1.3347357511520386, + "learning_rate": 1.0840388189138965e-05, + "loss": 0.9557, + "step": 22085 + }, + { + "epoch": 0.8939700526102793, + "grad_norm": 1.3416115045547485, + "learning_rate": 1.0819739830683462e-05, + "loss": 1.0071, + "step": 22090 + }, + { + "epoch": 0.8941723998381222, + "grad_norm": 1.234076738357544, + "learning_rate": 1.0799091472227958e-05, + "loss": 0.9865, + "step": 22095 + }, + { + "epoch": 0.8943747470659652, + "grad_norm": 1.362473487854004, + "learning_rate": 1.0778443113772455e-05, + "loss": 0.9891, + "step": 22100 + }, + { + "epoch": 0.8945770942938082, + "grad_norm": 1.2424629926681519, + "learning_rate": 1.0757794755316952e-05, + "loss": 0.9554, + "step": 22105 + }, + { + "epoch": 0.8947794415216511, + "grad_norm": 1.1338199377059937, + "learning_rate": 1.073714639686145e-05, + "loss": 0.9994, + "step": 22110 + }, + { + "epoch": 0.8949817887494941, + "grad_norm": 1.2993570566177368, + "learning_rate": 1.0716498038405947e-05, + "loss": 0.9799, + "step": 22115 + }, + { + "epoch": 0.8951841359773371, + "grad_norm": 1.1700685024261475, + "learning_rate": 1.0695849679950444e-05, + "loss": 1.0206, + "step": 22120 + }, + { + "epoch": 0.8953864832051801, + "grad_norm": 1.2185229063034058, + "learning_rate": 1.0675201321494942e-05, + "loss": 1.016, + "step": 22125 + }, + { + "epoch": 0.8955888304330231, + "grad_norm": 1.333257794380188, + "learning_rate": 1.0654552963039439e-05, + "loss": 0.9748, + "step": 22130 + }, + { + "epoch": 0.895791177660866, + "grad_norm": 1.2211995124816895, + "learning_rate": 1.0633904604583936e-05, + "loss": 0.9494, + "step": 22135 + }, + { + "epoch": 0.895993524888709, + "grad_norm": 1.155814528465271, + "learning_rate": 1.0613256246128433e-05, + "loss": 0.9847, + "step": 22140 + }, + { + "epoch": 0.896195872116552, + "grad_norm": 1.1218489408493042, + "learning_rate": 1.0592607887672929e-05, + "loss": 1.0325, + "step": 22145 + }, + { + "epoch": 0.896398219344395, + "grad_norm": 1.223176121711731, + "learning_rate": 1.0571959529217428e-05, + "loss": 1.0434, + "step": 22150 + }, + { + "epoch": 0.896600566572238, + "grad_norm": 1.2436774969100952, + "learning_rate": 1.0551311170761925e-05, + "loss": 0.9439, + "step": 22155 + }, + { + "epoch": 0.8968029138000809, + "grad_norm": 1.2586272954940796, + "learning_rate": 1.0530662812306421e-05, + "loss": 0.9873, + "step": 22160 + }, + { + "epoch": 0.8970052610279239, + "grad_norm": 1.3155723810195923, + "learning_rate": 1.051001445385092e-05, + "loss": 0.9646, + "step": 22165 + }, + { + "epoch": 0.8972076082557668, + "grad_norm": 1.1274899244308472, + "learning_rate": 1.0489366095395417e-05, + "loss": 1.0054, + "step": 22170 + }, + { + "epoch": 0.8974099554836099, + "grad_norm": 1.1293829679489136, + "learning_rate": 1.0468717736939913e-05, + "loss": 0.9475, + "step": 22175 + }, + { + "epoch": 0.8976123027114529, + "grad_norm": 1.286868691444397, + "learning_rate": 1.0448069378484412e-05, + "loss": 0.9467, + "step": 22180 + }, + { + "epoch": 0.8978146499392958, + "grad_norm": 1.1316075325012207, + "learning_rate": 1.0427421020028909e-05, + "loss": 0.9477, + "step": 22185 + }, + { + "epoch": 0.8980169971671388, + "grad_norm": 1.0803990364074707, + "learning_rate": 1.0406772661573405e-05, + "loss": 0.9651, + "step": 22190 + }, + { + "epoch": 0.8982193443949817, + "grad_norm": 1.2619987726211548, + "learning_rate": 1.0386124303117902e-05, + "loss": 1.0046, + "step": 22195 + }, + { + "epoch": 0.8984216916228248, + "grad_norm": 1.2577831745147705, + "learning_rate": 1.03654759446624e-05, + "loss": 1.0353, + "step": 22200 + }, + { + "epoch": 0.8986240388506678, + "grad_norm": 1.1763091087341309, + "learning_rate": 1.0344827586206897e-05, + "loss": 1.0354, + "step": 22205 + }, + { + "epoch": 0.8988263860785107, + "grad_norm": 1.2089554071426392, + "learning_rate": 1.0324179227751394e-05, + "loss": 0.9622, + "step": 22210 + }, + { + "epoch": 0.8990287333063537, + "grad_norm": 1.1004221439361572, + "learning_rate": 1.0303530869295891e-05, + "loss": 0.9883, + "step": 22215 + }, + { + "epoch": 0.8992310805341966, + "grad_norm": 1.1818112134933472, + "learning_rate": 1.0282882510840389e-05, + "loss": 1.0136, + "step": 22220 + }, + { + "epoch": 0.8994334277620396, + "grad_norm": 1.1124714612960815, + "learning_rate": 1.0262234152384886e-05, + "loss": 0.9735, + "step": 22225 + }, + { + "epoch": 0.8996357749898827, + "grad_norm": 1.2221177816390991, + "learning_rate": 1.0241585793929383e-05, + "loss": 0.9817, + "step": 22230 + }, + { + "epoch": 0.8998381222177256, + "grad_norm": 1.1876314878463745, + "learning_rate": 1.022093743547388e-05, + "loss": 0.9975, + "step": 22235 + }, + { + "epoch": 0.9000404694455686, + "grad_norm": 1.233405351638794, + "learning_rate": 1.0200289077018378e-05, + "loss": 0.9749, + "step": 22240 + }, + { + "epoch": 0.9002428166734115, + "grad_norm": 1.1681106090545654, + "learning_rate": 1.0179640718562875e-05, + "loss": 0.8572, + "step": 22245 + }, + { + "epoch": 0.9004451639012545, + "grad_norm": 1.2301884889602661, + "learning_rate": 1.0158992360107372e-05, + "loss": 0.9903, + "step": 22250 + }, + { + "epoch": 0.9006475111290976, + "grad_norm": 1.1902940273284912, + "learning_rate": 1.0138344001651868e-05, + "loss": 1.0435, + "step": 22255 + }, + { + "epoch": 0.9008498583569405, + "grad_norm": 1.2263290882110596, + "learning_rate": 1.0117695643196367e-05, + "loss": 0.9561, + "step": 22260 + }, + { + "epoch": 0.9010522055847835, + "grad_norm": 1.2020978927612305, + "learning_rate": 1.0097047284740864e-05, + "loss": 0.9672, + "step": 22265 + }, + { + "epoch": 0.9012545528126265, + "grad_norm": 1.1788512468338013, + "learning_rate": 1.007639892628536e-05, + "loss": 0.945, + "step": 22270 + }, + { + "epoch": 0.9014569000404694, + "grad_norm": 1.294374704360962, + "learning_rate": 1.0055750567829859e-05, + "loss": 0.9965, + "step": 22275 + }, + { + "epoch": 0.9016592472683124, + "grad_norm": 1.2951778173446655, + "learning_rate": 1.0035102209374356e-05, + "loss": 0.9357, + "step": 22280 + }, + { + "epoch": 0.9018615944961554, + "grad_norm": 1.068011999130249, + "learning_rate": 1.0014453850918852e-05, + "loss": 0.9796, + "step": 22285 + }, + { + "epoch": 0.9020639417239984, + "grad_norm": 1.2775375843048096, + "learning_rate": 9.993805492463349e-06, + "loss": 0.9574, + "step": 22290 + }, + { + "epoch": 0.9022662889518414, + "grad_norm": 1.3473182916641235, + "learning_rate": 9.973157134007848e-06, + "loss": 0.9608, + "step": 22295 + }, + { + "epoch": 0.9024686361796843, + "grad_norm": 1.080181360244751, + "learning_rate": 9.952508775552344e-06, + "loss": 1.0191, + "step": 22300 + }, + { + "epoch": 0.9026709834075273, + "grad_norm": 1.2121566534042358, + "learning_rate": 9.931860417096841e-06, + "loss": 1.0055, + "step": 22305 + }, + { + "epoch": 0.9028733306353703, + "grad_norm": 1.2975614070892334, + "learning_rate": 9.911212058641338e-06, + "loss": 1.0079, + "step": 22310 + }, + { + "epoch": 0.9030756778632133, + "grad_norm": 1.1123826503753662, + "learning_rate": 9.890563700185835e-06, + "loss": 1.0012, + "step": 22315 + }, + { + "epoch": 0.9032780250910563, + "grad_norm": 1.1919848918914795, + "learning_rate": 9.869915341730333e-06, + "loss": 0.9943, + "step": 22320 + }, + { + "epoch": 0.9034803723188992, + "grad_norm": 1.231441617012024, + "learning_rate": 9.84926698327483e-06, + "loss": 1.0508, + "step": 22325 + }, + { + "epoch": 0.9036827195467422, + "grad_norm": 1.2481560707092285, + "learning_rate": 9.828618624819327e-06, + "loss": 1.0405, + "step": 22330 + }, + { + "epoch": 0.9038850667745851, + "grad_norm": 1.2379087209701538, + "learning_rate": 9.807970266363825e-06, + "loss": 1.0126, + "step": 22335 + }, + { + "epoch": 0.9040874140024282, + "grad_norm": 1.2807567119598389, + "learning_rate": 9.787321907908322e-06, + "loss": 1.0009, + "step": 22340 + }, + { + "epoch": 0.9042897612302712, + "grad_norm": 1.19586181640625, + "learning_rate": 9.76667354945282e-06, + "loss": 0.9922, + "step": 22345 + }, + { + "epoch": 0.9044921084581141, + "grad_norm": 1.257541298866272, + "learning_rate": 9.746025190997317e-06, + "loss": 1.0157, + "step": 22350 + }, + { + "epoch": 0.9046944556859571, + "grad_norm": 1.1988418102264404, + "learning_rate": 9.725376832541814e-06, + "loss": 0.9513, + "step": 22355 + }, + { + "epoch": 0.9048968029138, + "grad_norm": 1.268105149269104, + "learning_rate": 9.704728474086311e-06, + "loss": 1.0328, + "step": 22360 + }, + { + "epoch": 0.9050991501416431, + "grad_norm": 1.1456495523452759, + "learning_rate": 9.684080115630807e-06, + "loss": 1.0015, + "step": 22365 + }, + { + "epoch": 0.9053014973694861, + "grad_norm": 1.2869234085083008, + "learning_rate": 9.663431757175306e-06, + "loss": 1.0053, + "step": 22370 + }, + { + "epoch": 0.905503844597329, + "grad_norm": 1.133236050605774, + "learning_rate": 9.642783398719803e-06, + "loss": 0.9603, + "step": 22375 + }, + { + "epoch": 0.905706191825172, + "grad_norm": 1.200329303741455, + "learning_rate": 9.622135040264299e-06, + "loss": 0.9874, + "step": 22380 + }, + { + "epoch": 0.9059085390530149, + "grad_norm": 1.145104169845581, + "learning_rate": 9.601486681808796e-06, + "loss": 0.9957, + "step": 22385 + }, + { + "epoch": 0.9061108862808579, + "grad_norm": 1.1849706172943115, + "learning_rate": 9.580838323353295e-06, + "loss": 1.0053, + "step": 22390 + }, + { + "epoch": 0.906313233508701, + "grad_norm": 1.083436131477356, + "learning_rate": 9.56018996489779e-06, + "loss": 0.9284, + "step": 22395 + }, + { + "epoch": 0.9065155807365439, + "grad_norm": 1.2961195707321167, + "learning_rate": 9.539541606442288e-06, + "loss": 1.0143, + "step": 22400 + }, + { + "epoch": 0.9067179279643869, + "grad_norm": 1.2798185348510742, + "learning_rate": 9.518893247986785e-06, + "loss": 0.9898, + "step": 22405 + }, + { + "epoch": 0.9069202751922298, + "grad_norm": 1.2063452005386353, + "learning_rate": 9.498244889531282e-06, + "loss": 1.013, + "step": 22410 + }, + { + "epoch": 0.9071226224200728, + "grad_norm": 1.2208327054977417, + "learning_rate": 9.47759653107578e-06, + "loss": 0.9956, + "step": 22415 + }, + { + "epoch": 0.9073249696479159, + "grad_norm": 2.438999652862549, + "learning_rate": 9.456948172620277e-06, + "loss": 1.0163, + "step": 22420 + }, + { + "epoch": 0.9075273168757588, + "grad_norm": 1.2716456651687622, + "learning_rate": 9.436299814164774e-06, + "loss": 1.0141, + "step": 22425 + }, + { + "epoch": 0.9077296641036018, + "grad_norm": 1.2127629518508911, + "learning_rate": 9.415651455709272e-06, + "loss": 0.982, + "step": 22430 + }, + { + "epoch": 0.9079320113314447, + "grad_norm": 1.2110321521759033, + "learning_rate": 9.395003097253769e-06, + "loss": 0.9764, + "step": 22435 + }, + { + "epoch": 0.9081343585592877, + "grad_norm": 1.1984643936157227, + "learning_rate": 9.374354738798266e-06, + "loss": 0.9687, + "step": 22440 + }, + { + "epoch": 0.9083367057871308, + "grad_norm": 1.256187915802002, + "learning_rate": 9.353706380342763e-06, + "loss": 0.9754, + "step": 22445 + }, + { + "epoch": 0.9085390530149737, + "grad_norm": 1.2054247856140137, + "learning_rate": 9.33305802188726e-06, + "loss": 0.9847, + "step": 22450 + }, + { + "epoch": 0.9087414002428167, + "grad_norm": 1.2009963989257812, + "learning_rate": 9.312409663431758e-06, + "loss": 0.9836, + "step": 22455 + }, + { + "epoch": 0.9089437474706596, + "grad_norm": 1.1755682229995728, + "learning_rate": 9.291761304976254e-06, + "loss": 1.0035, + "step": 22460 + }, + { + "epoch": 0.9091460946985026, + "grad_norm": 1.2057689428329468, + "learning_rate": 9.271112946520753e-06, + "loss": 0.9971, + "step": 22465 + }, + { + "epoch": 0.9093484419263456, + "grad_norm": 1.2346726655960083, + "learning_rate": 9.25046458806525e-06, + "loss": 0.9771, + "step": 22470 + }, + { + "epoch": 0.9095507891541886, + "grad_norm": 1.230569839477539, + "learning_rate": 9.229816229609746e-06, + "loss": 0.9842, + "step": 22475 + }, + { + "epoch": 0.9097531363820316, + "grad_norm": 1.242022156715393, + "learning_rate": 9.209167871154243e-06, + "loss": 0.9792, + "step": 22480 + }, + { + "epoch": 0.9099554836098745, + "grad_norm": 1.308518648147583, + "learning_rate": 9.188519512698742e-06, + "loss": 0.9571, + "step": 22485 + }, + { + "epoch": 0.9101578308377175, + "grad_norm": 1.1113145351409912, + "learning_rate": 9.167871154243237e-06, + "loss": 0.9663, + "step": 22490 + }, + { + "epoch": 0.9103601780655605, + "grad_norm": 1.3198206424713135, + "learning_rate": 9.147222795787735e-06, + "loss": 1.0134, + "step": 22495 + }, + { + "epoch": 0.9105625252934035, + "grad_norm": 1.285475492477417, + "learning_rate": 9.126574437332234e-06, + "loss": 0.9834, + "step": 22500 + }, + { + "epoch": 0.9107648725212465, + "grad_norm": 1.3377816677093506, + "learning_rate": 9.10592607887673e-06, + "loss": 1.0289, + "step": 22505 + }, + { + "epoch": 0.9109672197490895, + "grad_norm": 1.1217929124832153, + "learning_rate": 9.085277720421227e-06, + "loss": 0.9706, + "step": 22510 + }, + { + "epoch": 0.9111695669769324, + "grad_norm": 1.3376717567443848, + "learning_rate": 9.064629361965724e-06, + "loss": 1.0767, + "step": 22515 + }, + { + "epoch": 0.9113719142047754, + "grad_norm": 1.2571460008621216, + "learning_rate": 9.043981003510221e-06, + "loss": 0.9529, + "step": 22520 + }, + { + "epoch": 0.9115742614326183, + "grad_norm": 1.1524591445922852, + "learning_rate": 9.023332645054719e-06, + "loss": 0.9613, + "step": 22525 + }, + { + "epoch": 0.9117766086604614, + "grad_norm": 1.3369122743606567, + "learning_rate": 9.002684286599216e-06, + "loss": 1.009, + "step": 22530 + }, + { + "epoch": 0.9119789558883044, + "grad_norm": 1.2407011985778809, + "learning_rate": 8.982035928143713e-06, + "loss": 0.9616, + "step": 22535 + }, + { + "epoch": 0.9121813031161473, + "grad_norm": 1.1766000986099243, + "learning_rate": 8.96138756968821e-06, + "loss": 1.0649, + "step": 22540 + }, + { + "epoch": 0.9123836503439903, + "grad_norm": 1.2149438858032227, + "learning_rate": 8.940739211232708e-06, + "loss": 0.9535, + "step": 22545 + }, + { + "epoch": 0.9125859975718332, + "grad_norm": 1.3015145063400269, + "learning_rate": 8.920090852777205e-06, + "loss": 0.961, + "step": 22550 + }, + { + "epoch": 0.9127883447996763, + "grad_norm": 1.2790932655334473, + "learning_rate": 8.899442494321702e-06, + "loss": 1.0255, + "step": 22555 + }, + { + "epoch": 0.9129906920275193, + "grad_norm": 1.3546644449234009, + "learning_rate": 8.8787941358662e-06, + "loss": 1.0645, + "step": 22560 + }, + { + "epoch": 0.9131930392553622, + "grad_norm": 1.135512113571167, + "learning_rate": 8.858145777410697e-06, + "loss": 0.9898, + "step": 22565 + }, + { + "epoch": 0.9133953864832052, + "grad_norm": 1.1631687879562378, + "learning_rate": 8.837497418955193e-06, + "loss": 0.9646, + "step": 22570 + }, + { + "epoch": 0.9135977337110481, + "grad_norm": 1.1153877973556519, + "learning_rate": 8.81684906049969e-06, + "loss": 0.9689, + "step": 22575 + }, + { + "epoch": 0.9138000809388911, + "grad_norm": 1.1794513463974, + "learning_rate": 8.796200702044189e-06, + "loss": 1.0014, + "step": 22580 + }, + { + "epoch": 0.9140024281667342, + "grad_norm": 1.1592798233032227, + "learning_rate": 8.775552343588684e-06, + "loss": 0.9923, + "step": 22585 + }, + { + "epoch": 0.9142047753945771, + "grad_norm": 1.2975784540176392, + "learning_rate": 8.754903985133182e-06, + "loss": 0.9922, + "step": 22590 + }, + { + "epoch": 0.9144071226224201, + "grad_norm": 1.25165593624115, + "learning_rate": 8.73425562667768e-06, + "loss": 0.9499, + "step": 22595 + }, + { + "epoch": 0.914609469850263, + "grad_norm": 1.1658289432525635, + "learning_rate": 8.713607268222176e-06, + "loss": 0.963, + "step": 22600 + }, + { + "epoch": 0.914811817078106, + "grad_norm": 1.2364903688430786, + "learning_rate": 8.692958909766674e-06, + "loss": 0.9883, + "step": 22605 + }, + { + "epoch": 0.9150141643059491, + "grad_norm": 1.1751139163970947, + "learning_rate": 8.672310551311171e-06, + "loss": 0.9371, + "step": 22610 + }, + { + "epoch": 0.915216511533792, + "grad_norm": 1.239706039428711, + "learning_rate": 8.651662192855668e-06, + "loss": 0.986, + "step": 22615 + }, + { + "epoch": 0.915418858761635, + "grad_norm": 1.1792747974395752, + "learning_rate": 8.631013834400165e-06, + "loss": 0.9937, + "step": 22620 + }, + { + "epoch": 0.9156212059894779, + "grad_norm": 1.2934255599975586, + "learning_rate": 8.610365475944663e-06, + "loss": 1.0028, + "step": 22625 + }, + { + "epoch": 0.9158235532173209, + "grad_norm": 1.1981898546218872, + "learning_rate": 8.58971711748916e-06, + "loss": 0.9849, + "step": 22630 + }, + { + "epoch": 0.9160259004451639, + "grad_norm": 1.1199761629104614, + "learning_rate": 8.569068759033657e-06, + "loss": 0.9912, + "step": 22635 + }, + { + "epoch": 0.9162282476730069, + "grad_norm": 1.2269060611724854, + "learning_rate": 8.548420400578155e-06, + "loss": 1.002, + "step": 22640 + }, + { + "epoch": 0.9164305949008499, + "grad_norm": 1.19041907787323, + "learning_rate": 8.527772042122652e-06, + "loss": 0.9154, + "step": 22645 + }, + { + "epoch": 0.9166329421286928, + "grad_norm": 1.2220449447631836, + "learning_rate": 8.50712368366715e-06, + "loss": 0.9968, + "step": 22650 + }, + { + "epoch": 0.9168352893565358, + "grad_norm": 1.2628720998764038, + "learning_rate": 8.486475325211647e-06, + "loss": 1.062, + "step": 22655 + }, + { + "epoch": 0.9170376365843788, + "grad_norm": 1.2970205545425415, + "learning_rate": 8.465826966756144e-06, + "loss": 0.9405, + "step": 22660 + }, + { + "epoch": 0.9172399838122218, + "grad_norm": 1.187365174293518, + "learning_rate": 8.44517860830064e-06, + "loss": 0.9738, + "step": 22665 + }, + { + "epoch": 0.9174423310400648, + "grad_norm": 1.2985329627990723, + "learning_rate": 8.424530249845137e-06, + "loss": 1.0047, + "step": 22670 + }, + { + "epoch": 0.9176446782679077, + "grad_norm": 1.143072247505188, + "learning_rate": 8.403881891389636e-06, + "loss": 0.9638, + "step": 22675 + }, + { + "epoch": 0.9178470254957507, + "grad_norm": 1.1585158109664917, + "learning_rate": 8.383233532934131e-06, + "loss": 0.9175, + "step": 22680 + }, + { + "epoch": 0.9180493727235937, + "grad_norm": 1.1999739408493042, + "learning_rate": 8.362585174478629e-06, + "loss": 1.0941, + "step": 22685 + }, + { + "epoch": 0.9182517199514366, + "grad_norm": 1.2880604267120361, + "learning_rate": 8.341936816023128e-06, + "loss": 0.9812, + "step": 22690 + }, + { + "epoch": 0.9184540671792797, + "grad_norm": 1.1725088357925415, + "learning_rate": 8.321288457567623e-06, + "loss": 0.9726, + "step": 22695 + }, + { + "epoch": 0.9186564144071226, + "grad_norm": 1.2655597925186157, + "learning_rate": 8.30064009911212e-06, + "loss": 0.955, + "step": 22700 + }, + { + "epoch": 0.9188587616349656, + "grad_norm": 1.1834826469421387, + "learning_rate": 8.27999174065662e-06, + "loss": 0.9409, + "step": 22705 + }, + { + "epoch": 0.9190611088628086, + "grad_norm": 1.135432243347168, + "learning_rate": 8.259343382201115e-06, + "loss": 0.9644, + "step": 22710 + }, + { + "epoch": 0.9192634560906515, + "grad_norm": 1.1146769523620605, + "learning_rate": 8.238695023745612e-06, + "loss": 0.9816, + "step": 22715 + }, + { + "epoch": 0.9194658033184946, + "grad_norm": 1.2879973649978638, + "learning_rate": 8.21804666529011e-06, + "loss": 1.0648, + "step": 22720 + }, + { + "epoch": 0.9196681505463375, + "grad_norm": 1.2010254859924316, + "learning_rate": 8.197398306834607e-06, + "loss": 0.9895, + "step": 22725 + }, + { + "epoch": 0.9198704977741805, + "grad_norm": 1.1273995637893677, + "learning_rate": 8.176749948379104e-06, + "loss": 1.0098, + "step": 22730 + }, + { + "epoch": 0.9200728450020235, + "grad_norm": 1.1119916439056396, + "learning_rate": 8.156101589923602e-06, + "loss": 1.0431, + "step": 22735 + }, + { + "epoch": 0.9202751922298664, + "grad_norm": 1.2416926622390747, + "learning_rate": 8.135453231468099e-06, + "loss": 1.04, + "step": 22740 + }, + { + "epoch": 0.9204775394577094, + "grad_norm": 1.101529598236084, + "learning_rate": 8.114804873012596e-06, + "loss": 0.9656, + "step": 22745 + }, + { + "epoch": 0.9206798866855525, + "grad_norm": 1.2064924240112305, + "learning_rate": 8.094156514557094e-06, + "loss": 0.9936, + "step": 22750 + }, + { + "epoch": 0.9208822339133954, + "grad_norm": 1.202167272567749, + "learning_rate": 8.07350815610159e-06, + "loss": 0.9247, + "step": 22755 + }, + { + "epoch": 0.9210845811412384, + "grad_norm": 1.2607907056808472, + "learning_rate": 8.052859797646088e-06, + "loss": 1.0179, + "step": 22760 + }, + { + "epoch": 0.9212869283690813, + "grad_norm": 1.1044496297836304, + "learning_rate": 8.032211439190584e-06, + "loss": 0.9685, + "step": 22765 + }, + { + "epoch": 0.9214892755969243, + "grad_norm": 1.3169591426849365, + "learning_rate": 8.011563080735083e-06, + "loss": 1.0145, + "step": 22770 + }, + { + "epoch": 0.9216916228247674, + "grad_norm": 1.160158634185791, + "learning_rate": 7.990914722279578e-06, + "loss": 0.9693, + "step": 22775 + }, + { + "epoch": 0.9218939700526103, + "grad_norm": 1.3280168771743774, + "learning_rate": 7.970266363824076e-06, + "loss": 0.9799, + "step": 22780 + }, + { + "epoch": 0.9220963172804533, + "grad_norm": 1.2774527072906494, + "learning_rate": 7.949618005368575e-06, + "loss": 0.9861, + "step": 22785 + }, + { + "epoch": 0.9222986645082962, + "grad_norm": 1.2364764213562012, + "learning_rate": 7.92896964691307e-06, + "loss": 0.929, + "step": 22790 + }, + { + "epoch": 0.9225010117361392, + "grad_norm": 1.221522569656372, + "learning_rate": 7.908321288457568e-06, + "loss": 0.9077, + "step": 22795 + }, + { + "epoch": 0.9227033589639821, + "grad_norm": 1.2038472890853882, + "learning_rate": 7.887672930002066e-06, + "loss": 1.0101, + "step": 22800 + }, + { + "epoch": 0.9229057061918252, + "grad_norm": 1.1088597774505615, + "learning_rate": 7.867024571546562e-06, + "loss": 0.9654, + "step": 22805 + }, + { + "epoch": 0.9231080534196682, + "grad_norm": 1.1928927898406982, + "learning_rate": 7.84637621309106e-06, + "loss": 0.9374, + "step": 22810 + }, + { + "epoch": 0.9233104006475111, + "grad_norm": 1.1673182249069214, + "learning_rate": 7.825727854635557e-06, + "loss": 1.0107, + "step": 22815 + }, + { + "epoch": 0.9235127478753541, + "grad_norm": 1.2621965408325195, + "learning_rate": 7.805079496180054e-06, + "loss": 0.9653, + "step": 22820 + }, + { + "epoch": 0.923715095103197, + "grad_norm": 1.2311797142028809, + "learning_rate": 7.784431137724551e-06, + "loss": 0.9723, + "step": 22825 + }, + { + "epoch": 0.9239174423310401, + "grad_norm": 1.1744613647460938, + "learning_rate": 7.763782779269049e-06, + "loss": 0.9745, + "step": 22830 + }, + { + "epoch": 0.9241197895588831, + "grad_norm": 1.2949620485305786, + "learning_rate": 7.743134420813546e-06, + "loss": 0.9944, + "step": 22835 + }, + { + "epoch": 0.924322136786726, + "grad_norm": 1.1407514810562134, + "learning_rate": 7.722486062358043e-06, + "loss": 0.9649, + "step": 22840 + }, + { + "epoch": 0.924524484014569, + "grad_norm": 1.2529412508010864, + "learning_rate": 7.70183770390254e-06, + "loss": 1.0489, + "step": 22845 + }, + { + "epoch": 0.924726831242412, + "grad_norm": 1.30082106590271, + "learning_rate": 7.681189345447038e-06, + "loss": 0.9873, + "step": 22850 + }, + { + "epoch": 0.9249291784702549, + "grad_norm": 1.1744364500045776, + "learning_rate": 7.660540986991535e-06, + "loss": 0.955, + "step": 22855 + }, + { + "epoch": 0.925131525698098, + "grad_norm": 1.1827532052993774, + "learning_rate": 7.63989262853603e-06, + "loss": 0.9811, + "step": 22860 + }, + { + "epoch": 0.9253338729259409, + "grad_norm": 1.402942419052124, + "learning_rate": 7.619244270080529e-06, + "loss": 1.0014, + "step": 22865 + }, + { + "epoch": 0.9255362201537839, + "grad_norm": 1.2489560842514038, + "learning_rate": 7.598595911625025e-06, + "loss": 0.9448, + "step": 22870 + }, + { + "epoch": 0.9257385673816269, + "grad_norm": 1.270635962486267, + "learning_rate": 7.577947553169523e-06, + "loss": 0.9908, + "step": 22875 + }, + { + "epoch": 0.9259409146094698, + "grad_norm": 1.2608011960983276, + "learning_rate": 7.557299194714021e-06, + "loss": 0.9864, + "step": 22880 + }, + { + "epoch": 0.9261432618373129, + "grad_norm": 1.2094098329544067, + "learning_rate": 7.536650836258517e-06, + "loss": 0.965, + "step": 22885 + }, + { + "epoch": 0.9263456090651558, + "grad_norm": 1.3155937194824219, + "learning_rate": 7.516002477803015e-06, + "loss": 0.9344, + "step": 22890 + }, + { + "epoch": 0.9265479562929988, + "grad_norm": 1.2582231760025024, + "learning_rate": 7.495354119347513e-06, + "loss": 0.9776, + "step": 22895 + }, + { + "epoch": 0.9267503035208418, + "grad_norm": 1.2340329885482788, + "learning_rate": 7.474705760892009e-06, + "loss": 1.0597, + "step": 22900 + }, + { + "epoch": 0.9269526507486847, + "grad_norm": 1.1896069049835205, + "learning_rate": 7.454057402436506e-06, + "loss": 1.0278, + "step": 22905 + }, + { + "epoch": 0.9271549979765277, + "grad_norm": 1.203372836112976, + "learning_rate": 7.4334090439810045e-06, + "loss": 1.0645, + "step": 22910 + }, + { + "epoch": 0.9273573452043707, + "grad_norm": 1.2288404703140259, + "learning_rate": 7.412760685525501e-06, + "loss": 0.9511, + "step": 22915 + }, + { + "epoch": 0.9275596924322137, + "grad_norm": 1.1734936237335205, + "learning_rate": 7.392112327069998e-06, + "loss": 0.9714, + "step": 22920 + }, + { + "epoch": 0.9277620396600567, + "grad_norm": 1.307991623878479, + "learning_rate": 7.371463968614495e-06, + "loss": 0.9713, + "step": 22925 + }, + { + "epoch": 0.9279643868878996, + "grad_norm": 1.3006573915481567, + "learning_rate": 7.350815610158993e-06, + "loss": 1.0358, + "step": 22930 + }, + { + "epoch": 0.9281667341157426, + "grad_norm": 1.288576364517212, + "learning_rate": 7.33016725170349e-06, + "loss": 1.0295, + "step": 22935 + }, + { + "epoch": 0.9283690813435856, + "grad_norm": 1.1439498662948608, + "learning_rate": 7.3095188932479866e-06, + "loss": 1.0337, + "step": 22940 + }, + { + "epoch": 0.9285714285714286, + "grad_norm": 1.1806100606918335, + "learning_rate": 7.288870534792485e-06, + "loss": 0.9687, + "step": 22945 + }, + { + "epoch": 0.9287737757992716, + "grad_norm": 1.2200596332550049, + "learning_rate": 7.268222176336982e-06, + "loss": 1.0372, + "step": 22950 + }, + { + "epoch": 0.9289761230271145, + "grad_norm": 1.3245278596878052, + "learning_rate": 7.2475738178814785e-06, + "loss": 1.0367, + "step": 22955 + }, + { + "epoch": 0.9291784702549575, + "grad_norm": 1.2013788223266602, + "learning_rate": 7.226925459425976e-06, + "loss": 0.9974, + "step": 22960 + }, + { + "epoch": 0.9293808174828004, + "grad_norm": 1.2571405172348022, + "learning_rate": 7.206277100970474e-06, + "loss": 0.9595, + "step": 22965 + }, + { + "epoch": 0.9295831647106435, + "grad_norm": 1.1836342811584473, + "learning_rate": 7.18562874251497e-06, + "loss": 1.0234, + "step": 22970 + }, + { + "epoch": 0.9297855119384865, + "grad_norm": 1.231541633605957, + "learning_rate": 7.164980384059468e-06, + "loss": 1.0239, + "step": 22975 + }, + { + "epoch": 0.9299878591663294, + "grad_norm": 1.2750858068466187, + "learning_rate": 7.144332025603964e-06, + "loss": 0.9454, + "step": 22980 + }, + { + "epoch": 0.9301902063941724, + "grad_norm": 1.145346999168396, + "learning_rate": 7.123683667148462e-06, + "loss": 1.0459, + "step": 22985 + }, + { + "epoch": 0.9303925536220153, + "grad_norm": 1.2808715105056763, + "learning_rate": 7.1030353086929595e-06, + "loss": 0.9821, + "step": 22990 + }, + { + "epoch": 0.9305949008498584, + "grad_norm": 1.3871564865112305, + "learning_rate": 7.082386950237456e-06, + "loss": 1.0195, + "step": 22995 + }, + { + "epoch": 0.9307972480777014, + "grad_norm": 1.1897286176681519, + "learning_rate": 7.061738591781953e-06, + "loss": 1.0062, + "step": 23000 + }, + { + "epoch": 0.9309995953055443, + "grad_norm": 1.2897855043411255, + "learning_rate": 7.0410902333264514e-06, + "loss": 0.9726, + "step": 23005 + }, + { + "epoch": 0.9312019425333873, + "grad_norm": 1.158328652381897, + "learning_rate": 7.020441874870948e-06, + "loss": 0.9388, + "step": 23010 + }, + { + "epoch": 0.9314042897612302, + "grad_norm": 1.2545818090438843, + "learning_rate": 6.999793516415445e-06, + "loss": 1.073, + "step": 23015 + }, + { + "epoch": 0.9316066369890732, + "grad_norm": 1.1400245428085327, + "learning_rate": 6.979145157959943e-06, + "loss": 0.98, + "step": 23020 + }, + { + "epoch": 0.9318089842169163, + "grad_norm": 1.1022324562072754, + "learning_rate": 6.95849679950444e-06, + "loss": 1.0085, + "step": 23025 + }, + { + "epoch": 0.9320113314447592, + "grad_norm": 1.2606849670410156, + "learning_rate": 6.937848441048937e-06, + "loss": 1.0295, + "step": 23030 + }, + { + "epoch": 0.9322136786726022, + "grad_norm": 1.250056266784668, + "learning_rate": 6.9172000825934335e-06, + "loss": 0.9885, + "step": 23035 + }, + { + "epoch": 0.9324160259004451, + "grad_norm": 1.1324617862701416, + "learning_rate": 6.896551724137932e-06, + "loss": 0.9748, + "step": 23040 + }, + { + "epoch": 0.9326183731282881, + "grad_norm": 1.2661421298980713, + "learning_rate": 6.875903365682429e-06, + "loss": 0.9775, + "step": 23045 + }, + { + "epoch": 0.9328207203561312, + "grad_norm": 1.3545794486999512, + "learning_rate": 6.855255007226925e-06, + "loss": 1.0137, + "step": 23050 + }, + { + "epoch": 0.9330230675839741, + "grad_norm": 1.1687790155410767, + "learning_rate": 6.834606648771423e-06, + "loss": 1.0448, + "step": 23055 + }, + { + "epoch": 0.9332254148118171, + "grad_norm": 1.1806179285049438, + "learning_rate": 6.813958290315921e-06, + "loss": 0.988, + "step": 23060 + }, + { + "epoch": 0.93342776203966, + "grad_norm": 1.2068326473236084, + "learning_rate": 6.793309931860417e-06, + "loss": 0.9895, + "step": 23065 + }, + { + "epoch": 0.933630109267503, + "grad_norm": 1.2086092233657837, + "learning_rate": 6.772661573404915e-06, + "loss": 1.0398, + "step": 23070 + }, + { + "epoch": 0.933832456495346, + "grad_norm": 1.2049978971481323, + "learning_rate": 6.752013214949413e-06, + "loss": 1.0056, + "step": 23075 + }, + { + "epoch": 0.934034803723189, + "grad_norm": 1.3573262691497803, + "learning_rate": 6.731364856493909e-06, + "loss": 0.9986, + "step": 23080 + }, + { + "epoch": 0.934237150951032, + "grad_norm": 1.2404375076293945, + "learning_rate": 6.7107164980384065e-06, + "loss": 0.9761, + "step": 23085 + }, + { + "epoch": 0.934439498178875, + "grad_norm": 1.2016634941101074, + "learning_rate": 6.690068139582903e-06, + "loss": 0.9445, + "step": 23090 + }, + { + "epoch": 0.9346418454067179, + "grad_norm": 1.174824833869934, + "learning_rate": 6.6694197811274e-06, + "loss": 0.9682, + "step": 23095 + }, + { + "epoch": 0.9348441926345609, + "grad_norm": 1.2408758401870728, + "learning_rate": 6.648771422671898e-06, + "loss": 0.9577, + "step": 23100 + }, + { + "epoch": 0.9350465398624039, + "grad_norm": 1.238100528717041, + "learning_rate": 6.628123064216395e-06, + "loss": 0.9912, + "step": 23105 + }, + { + "epoch": 0.9352488870902469, + "grad_norm": 1.2727497816085815, + "learning_rate": 6.607474705760892e-06, + "loss": 0.9427, + "step": 23110 + }, + { + "epoch": 0.9354512343180899, + "grad_norm": 1.2046585083007812, + "learning_rate": 6.58682634730539e-06, + "loss": 1.0214, + "step": 23115 + }, + { + "epoch": 0.9356535815459328, + "grad_norm": 1.1359328031539917, + "learning_rate": 6.566177988849887e-06, + "loss": 0.9797, + "step": 23120 + }, + { + "epoch": 0.9358559287737758, + "grad_norm": 1.1375129222869873, + "learning_rate": 6.545529630394384e-06, + "loss": 1.0091, + "step": 23125 + }, + { + "epoch": 0.9360582760016187, + "grad_norm": 1.1417800188064575, + "learning_rate": 6.5248812719388805e-06, + "loss": 0.9363, + "step": 23130 + }, + { + "epoch": 0.9362606232294618, + "grad_norm": 1.1177046298980713, + "learning_rate": 6.504232913483379e-06, + "loss": 1.0011, + "step": 23135 + }, + { + "epoch": 0.9364629704573048, + "grad_norm": 1.1646171808242798, + "learning_rate": 6.483584555027876e-06, + "loss": 1.0475, + "step": 23140 + }, + { + "epoch": 0.9366653176851477, + "grad_norm": 1.3040295839309692, + "learning_rate": 6.462936196572372e-06, + "loss": 0.9766, + "step": 23145 + }, + { + "epoch": 0.9368676649129907, + "grad_norm": 1.1640188694000244, + "learning_rate": 6.44228783811687e-06, + "loss": 1.0204, + "step": 23150 + }, + { + "epoch": 0.9370700121408336, + "grad_norm": 1.2561315298080444, + "learning_rate": 6.421639479661368e-06, + "loss": 0.9952, + "step": 23155 + }, + { + "epoch": 0.9372723593686767, + "grad_norm": 1.1376277208328247, + "learning_rate": 6.400991121205864e-06, + "loss": 1.002, + "step": 23160 + }, + { + "epoch": 0.9374747065965197, + "grad_norm": 1.2699038982391357, + "learning_rate": 6.3803427627503615e-06, + "loss": 1.0086, + "step": 23165 + }, + { + "epoch": 0.9376770538243626, + "grad_norm": 1.3435354232788086, + "learning_rate": 6.35969440429486e-06, + "loss": 0.9559, + "step": 23170 + }, + { + "epoch": 0.9378794010522056, + "grad_norm": 1.3385918140411377, + "learning_rate": 6.339046045839356e-06, + "loss": 0.9236, + "step": 23175 + }, + { + "epoch": 0.9380817482800485, + "grad_norm": 1.1412479877471924, + "learning_rate": 6.3183976873838534e-06, + "loss": 0.9457, + "step": 23180 + }, + { + "epoch": 0.9382840955078915, + "grad_norm": 1.2468445301055908, + "learning_rate": 6.29774932892835e-06, + "loss": 1.0548, + "step": 23185 + }, + { + "epoch": 0.9384864427357346, + "grad_norm": 1.169424057006836, + "learning_rate": 6.277100970472847e-06, + "loss": 0.9992, + "step": 23190 + }, + { + "epoch": 0.9386887899635775, + "grad_norm": 1.1182925701141357, + "learning_rate": 6.256452612017345e-06, + "loss": 0.965, + "step": 23195 + }, + { + "epoch": 0.9388911371914205, + "grad_norm": 1.1153651475906372, + "learning_rate": 6.235804253561843e-06, + "loss": 0.9414, + "step": 23200 + }, + { + "epoch": 0.9390934844192634, + "grad_norm": 1.1454898118972778, + "learning_rate": 6.215155895106339e-06, + "loss": 0.9536, + "step": 23205 + }, + { + "epoch": 0.9392958316471064, + "grad_norm": 1.1971384286880493, + "learning_rate": 6.194507536650836e-06, + "loss": 0.9632, + "step": 23210 + }, + { + "epoch": 0.9394981788749495, + "grad_norm": 1.2329403162002563, + "learning_rate": 6.173859178195334e-06, + "loss": 1.0392, + "step": 23215 + }, + { + "epoch": 0.9397005261027924, + "grad_norm": 1.113457202911377, + "learning_rate": 6.153210819739831e-06, + "loss": 0.9807, + "step": 23220 + }, + { + "epoch": 0.9399028733306354, + "grad_norm": 1.0697240829467773, + "learning_rate": 6.132562461284328e-06, + "loss": 0.9878, + "step": 23225 + }, + { + "epoch": 0.9401052205584783, + "grad_norm": 1.0905632972717285, + "learning_rate": 6.1119141028288255e-06, + "loss": 1.0008, + "step": 23230 + }, + { + "epoch": 0.9403075677863213, + "grad_norm": 1.2598730325698853, + "learning_rate": 6.091265744373323e-06, + "loss": 0.9747, + "step": 23235 + }, + { + "epoch": 0.9405099150141643, + "grad_norm": 1.3575570583343506, + "learning_rate": 6.07061738591782e-06, + "loss": 0.9928, + "step": 23240 + }, + { + "epoch": 0.9407122622420073, + "grad_norm": 1.1281064748764038, + "learning_rate": 6.049969027462317e-06, + "loss": 0.9636, + "step": 23245 + }, + { + "epoch": 0.9409146094698503, + "grad_norm": 1.2783358097076416, + "learning_rate": 6.029320669006814e-06, + "loss": 0.9372, + "step": 23250 + }, + { + "epoch": 0.9411169566976932, + "grad_norm": 1.2213220596313477, + "learning_rate": 6.008672310551312e-06, + "loss": 0.955, + "step": 23255 + }, + { + "epoch": 0.9413193039255362, + "grad_norm": 1.3591943979263306, + "learning_rate": 5.9880239520958085e-06, + "loss": 1.0201, + "step": 23260 + }, + { + "epoch": 0.9415216511533792, + "grad_norm": 1.3074491024017334, + "learning_rate": 5.967375593640306e-06, + "loss": 1.0069, + "step": 23265 + }, + { + "epoch": 0.9417239983812222, + "grad_norm": 1.256322979927063, + "learning_rate": 5.946727235184803e-06, + "loss": 0.9375, + "step": 23270 + }, + { + "epoch": 0.9419263456090652, + "grad_norm": 1.349203109741211, + "learning_rate": 5.9260788767293e-06, + "loss": 0.9614, + "step": 23275 + }, + { + "epoch": 0.9421286928369081, + "grad_norm": 1.1078602075576782, + "learning_rate": 5.905430518273798e-06, + "loss": 0.9919, + "step": 23280 + }, + { + "epoch": 0.9423310400647511, + "grad_norm": 1.2093766927719116, + "learning_rate": 5.884782159818294e-06, + "loss": 1.0046, + "step": 23285 + }, + { + "epoch": 0.9425333872925941, + "grad_norm": 1.3467715978622437, + "learning_rate": 5.864133801362792e-06, + "loss": 1.0882, + "step": 23290 + }, + { + "epoch": 0.942735734520437, + "grad_norm": 1.3225382566452026, + "learning_rate": 5.8434854429072896e-06, + "loss": 0.9484, + "step": 23295 + }, + { + "epoch": 0.9429380817482801, + "grad_norm": 1.210727334022522, + "learning_rate": 5.822837084451786e-06, + "loss": 1.0186, + "step": 23300 + }, + { + "epoch": 0.943140428976123, + "grad_norm": 1.1972028017044067, + "learning_rate": 5.802188725996283e-06, + "loss": 0.9711, + "step": 23305 + }, + { + "epoch": 0.943342776203966, + "grad_norm": 1.1847857236862183, + "learning_rate": 5.781540367540781e-06, + "loss": 1.015, + "step": 23310 + }, + { + "epoch": 0.943545123431809, + "grad_norm": 1.140752911567688, + "learning_rate": 5.760892009085278e-06, + "loss": 0.9548, + "step": 23315 + }, + { + "epoch": 0.9437474706596519, + "grad_norm": 1.1426345109939575, + "learning_rate": 5.740243650629775e-06, + "loss": 1.0281, + "step": 23320 + }, + { + "epoch": 0.943949817887495, + "grad_norm": 1.2390629053115845, + "learning_rate": 5.7195952921742725e-06, + "loss": 1.0057, + "step": 23325 + }, + { + "epoch": 0.944152165115338, + "grad_norm": 1.1874552965164185, + "learning_rate": 5.69894693371877e-06, + "loss": 1.0439, + "step": 23330 + }, + { + "epoch": 0.9443545123431809, + "grad_norm": 1.174991488456726, + "learning_rate": 5.678298575263267e-06, + "loss": 0.9842, + "step": 23335 + }, + { + "epoch": 0.9445568595710239, + "grad_norm": 1.2053190469741821, + "learning_rate": 5.6576502168077635e-06, + "loss": 0.995, + "step": 23340 + }, + { + "epoch": 0.9447592067988668, + "grad_norm": 1.2012766599655151, + "learning_rate": 5.637001858352262e-06, + "loss": 0.9801, + "step": 23345 + }, + { + "epoch": 0.9449615540267098, + "grad_norm": 1.151880145072937, + "learning_rate": 5.616353499896759e-06, + "loss": 0.9699, + "step": 23350 + }, + { + "epoch": 0.9451639012545529, + "grad_norm": 1.3854516744613647, + "learning_rate": 5.5957051414412554e-06, + "loss": 1.023, + "step": 23355 + }, + { + "epoch": 0.9453662484823958, + "grad_norm": 1.3277459144592285, + "learning_rate": 5.575056782985753e-06, + "loss": 0.9664, + "step": 23360 + }, + { + "epoch": 0.9455685957102388, + "grad_norm": 1.2124056816101074, + "learning_rate": 5.55440842453025e-06, + "loss": 0.9919, + "step": 23365 + }, + { + "epoch": 0.9457709429380817, + "grad_norm": 1.1614476442337036, + "learning_rate": 5.533760066074747e-06, + "loss": 0.9156, + "step": 23370 + }, + { + "epoch": 0.9459732901659247, + "grad_norm": 1.1627764701843262, + "learning_rate": 5.513111707619245e-06, + "loss": 1.0013, + "step": 23375 + }, + { + "epoch": 0.9461756373937678, + "grad_norm": 1.1513303518295288, + "learning_rate": 5.492463349163741e-06, + "loss": 0.9588, + "step": 23380 + }, + { + "epoch": 0.9463779846216107, + "grad_norm": 1.2399709224700928, + "learning_rate": 5.471814990708239e-06, + "loss": 1.0207, + "step": 23385 + }, + { + "epoch": 0.9465803318494537, + "grad_norm": 1.1727218627929688, + "learning_rate": 5.4511666322527365e-06, + "loss": 1.002, + "step": 23390 + }, + { + "epoch": 0.9467826790772966, + "grad_norm": 1.1629681587219238, + "learning_rate": 5.430518273797233e-06, + "loss": 0.9888, + "step": 23395 + }, + { + "epoch": 0.9469850263051396, + "grad_norm": 1.2181727886199951, + "learning_rate": 5.409869915341731e-06, + "loss": 0.9615, + "step": 23400 + }, + { + "epoch": 0.9471873735329825, + "grad_norm": 1.2515959739685059, + "learning_rate": 5.3892215568862275e-06, + "loss": 0.9624, + "step": 23405 + }, + { + "epoch": 0.9473897207608256, + "grad_norm": 1.210911750793457, + "learning_rate": 5.368573198430725e-06, + "loss": 1.018, + "step": 23410 + }, + { + "epoch": 0.9475920679886686, + "grad_norm": 1.2728792428970337, + "learning_rate": 5.347924839975222e-06, + "loss": 0.9122, + "step": 23415 + }, + { + "epoch": 0.9477944152165115, + "grad_norm": 1.1684070825576782, + "learning_rate": 5.3272764815197194e-06, + "loss": 0.9388, + "step": 23420 + }, + { + "epoch": 0.9479967624443545, + "grad_norm": 1.2218414545059204, + "learning_rate": 5.306628123064217e-06, + "loss": 0.9215, + "step": 23425 + }, + { + "epoch": 0.9481991096721974, + "grad_norm": 1.234809160232544, + "learning_rate": 5.285979764608714e-06, + "loss": 0.9451, + "step": 23430 + }, + { + "epoch": 0.9484014569000405, + "grad_norm": 1.2986630201339722, + "learning_rate": 5.2653314061532105e-06, + "loss": 0.9546, + "step": 23435 + }, + { + "epoch": 0.9486038041278835, + "grad_norm": 1.1890127658843994, + "learning_rate": 5.244683047697709e-06, + "loss": 0.9962, + "step": 23440 + }, + { + "epoch": 0.9488061513557264, + "grad_norm": 1.1800432205200195, + "learning_rate": 5.224034689242206e-06, + "loss": 0.9911, + "step": 23445 + }, + { + "epoch": 0.9490084985835694, + "grad_norm": 1.3987566232681274, + "learning_rate": 5.203386330786702e-06, + "loss": 0.9909, + "step": 23450 + }, + { + "epoch": 0.9492108458114124, + "grad_norm": 1.1060357093811035, + "learning_rate": 5.1827379723312e-06, + "loss": 0.9269, + "step": 23455 + }, + { + "epoch": 0.9494131930392553, + "grad_norm": 1.1613500118255615, + "learning_rate": 5.162089613875697e-06, + "loss": 0.9696, + "step": 23460 + }, + { + "epoch": 0.9496155402670984, + "grad_norm": 1.2415356636047363, + "learning_rate": 5.141441255420194e-06, + "loss": 0.9699, + "step": 23465 + }, + { + "epoch": 0.9498178874949413, + "grad_norm": 1.0619957447052002, + "learning_rate": 5.1207928969646916e-06, + "loss": 0.9851, + "step": 23470 + }, + { + "epoch": 0.9500202347227843, + "grad_norm": 1.276246428489685, + "learning_rate": 5.100144538509189e-06, + "loss": 1.0033, + "step": 23475 + }, + { + "epoch": 0.9502225819506273, + "grad_norm": 1.1619969606399536, + "learning_rate": 5.079496180053686e-06, + "loss": 0.9582, + "step": 23480 + }, + { + "epoch": 0.9504249291784702, + "grad_norm": 1.3111114501953125, + "learning_rate": 5.0588478215981835e-06, + "loss": 1.0037, + "step": 23485 + }, + { + "epoch": 0.9506272764063133, + "grad_norm": 1.2124131917953491, + "learning_rate": 5.03819946314268e-06, + "loss": 1.0154, + "step": 23490 + }, + { + "epoch": 0.9508296236341562, + "grad_norm": 1.3681647777557373, + "learning_rate": 5.017551104687178e-06, + "loss": 0.9945, + "step": 23495 + }, + { + "epoch": 0.9510319708619992, + "grad_norm": 1.2329882383346558, + "learning_rate": 4.9969027462316745e-06, + "loss": 0.9812, + "step": 23500 + }, + { + "epoch": 0.9512343180898422, + "grad_norm": 1.2132128477096558, + "learning_rate": 4.976254387776172e-06, + "loss": 0.9996, + "step": 23505 + }, + { + "epoch": 0.9514366653176851, + "grad_norm": 1.2256234884262085, + "learning_rate": 4.955606029320669e-06, + "loss": 0.979, + "step": 23510 + }, + { + "epoch": 0.9516390125455281, + "grad_norm": 1.2053040266036987, + "learning_rate": 4.934957670865166e-06, + "loss": 0.9884, + "step": 23515 + }, + { + "epoch": 0.9518413597733711, + "grad_norm": 1.1597155332565308, + "learning_rate": 4.914309312409664e-06, + "loss": 0.9326, + "step": 23520 + }, + { + "epoch": 0.9520437070012141, + "grad_norm": 1.3003026247024536, + "learning_rate": 4.893660953954161e-06, + "loss": 1.0425, + "step": 23525 + }, + { + "epoch": 0.9522460542290571, + "grad_norm": 1.2210971117019653, + "learning_rate": 4.873012595498658e-06, + "loss": 0.9526, + "step": 23530 + }, + { + "epoch": 0.9524484014569, + "grad_norm": 1.221471905708313, + "learning_rate": 4.8523642370431556e-06, + "loss": 0.9584, + "step": 23535 + }, + { + "epoch": 0.952650748684743, + "grad_norm": 1.158247470855713, + "learning_rate": 4.831715878587653e-06, + "loss": 1.0348, + "step": 23540 + }, + { + "epoch": 0.952853095912586, + "grad_norm": 1.1828258037567139, + "learning_rate": 4.811067520132149e-06, + "loss": 0.9556, + "step": 23545 + }, + { + "epoch": 0.953055443140429, + "grad_norm": 1.292490005493164, + "learning_rate": 4.7904191616766475e-06, + "loss": 1.0561, + "step": 23550 + }, + { + "epoch": 0.953257790368272, + "grad_norm": 1.8282053470611572, + "learning_rate": 4.769770803221144e-06, + "loss": 0.9852, + "step": 23555 + }, + { + "epoch": 0.9534601375961149, + "grad_norm": 1.2209551334381104, + "learning_rate": 4.749122444765641e-06, + "loss": 0.9675, + "step": 23560 + }, + { + "epoch": 0.9536624848239579, + "grad_norm": 1.193122148513794, + "learning_rate": 4.7284740863101385e-06, + "loss": 0.9932, + "step": 23565 + }, + { + "epoch": 0.9538648320518008, + "grad_norm": 1.1373188495635986, + "learning_rate": 4.707825727854636e-06, + "loss": 0.9883, + "step": 23570 + }, + { + "epoch": 0.9540671792796439, + "grad_norm": 1.124559760093689, + "learning_rate": 4.687177369399133e-06, + "loss": 0.9834, + "step": 23575 + }, + { + "epoch": 0.9542695265074869, + "grad_norm": 1.2636216878890991, + "learning_rate": 4.66652901094363e-06, + "loss": 0.9846, + "step": 23580 + }, + { + "epoch": 0.9544718737353298, + "grad_norm": 1.1844679117202759, + "learning_rate": 4.645880652488127e-06, + "loss": 1.0643, + "step": 23585 + }, + { + "epoch": 0.9546742209631728, + "grad_norm": 1.1882669925689697, + "learning_rate": 4.625232294032625e-06, + "loss": 1.0647, + "step": 23590 + }, + { + "epoch": 0.9548765681910157, + "grad_norm": 1.2093403339385986, + "learning_rate": 4.6045839355771214e-06, + "loss": 0.9515, + "step": 23595 + }, + { + "epoch": 0.9550789154188588, + "grad_norm": 1.1817821264266968, + "learning_rate": 4.583935577121619e-06, + "loss": 1.02, + "step": 23600 + }, + { + "epoch": 0.9552812626467018, + "grad_norm": 1.1955592632293701, + "learning_rate": 4.563287218666117e-06, + "loss": 1.0052, + "step": 23605 + }, + { + "epoch": 0.9554836098745447, + "grad_norm": 1.329101800918579, + "learning_rate": 4.542638860210613e-06, + "loss": 1.0078, + "step": 23610 + }, + { + "epoch": 0.9556859571023877, + "grad_norm": 1.2466204166412354, + "learning_rate": 4.521990501755111e-06, + "loss": 0.9645, + "step": 23615 + }, + { + "epoch": 0.9558883043302306, + "grad_norm": 1.1778624057769775, + "learning_rate": 4.501342143299608e-06, + "loss": 0.9081, + "step": 23620 + }, + { + "epoch": 0.9560906515580736, + "grad_norm": 1.2266567945480347, + "learning_rate": 4.480693784844105e-06, + "loss": 0.9674, + "step": 23625 + }, + { + "epoch": 0.9562929987859167, + "grad_norm": 1.1299916505813599, + "learning_rate": 4.4600454263886025e-06, + "loss": 0.9353, + "step": 23630 + }, + { + "epoch": 0.9564953460137596, + "grad_norm": 1.2154983282089233, + "learning_rate": 4.4393970679331e-06, + "loss": 0.9808, + "step": 23635 + }, + { + "epoch": 0.9566976932416026, + "grad_norm": 1.2375404834747314, + "learning_rate": 4.418748709477596e-06, + "loss": 1.0393, + "step": 23640 + }, + { + "epoch": 0.9569000404694455, + "grad_norm": 1.2174794673919678, + "learning_rate": 4.398100351022094e-06, + "loss": 1.0399, + "step": 23645 + }, + { + "epoch": 0.9571023876972885, + "grad_norm": 1.078600525856018, + "learning_rate": 4.377451992566591e-06, + "loss": 0.9643, + "step": 23650 + }, + { + "epoch": 0.9573047349251316, + "grad_norm": 1.3520904779434204, + "learning_rate": 4.356803634111088e-06, + "loss": 0.9751, + "step": 23655 + }, + { + "epoch": 0.9575070821529745, + "grad_norm": 1.3326326608657837, + "learning_rate": 4.3361552756555855e-06, + "loss": 0.9358, + "step": 23660 + }, + { + "epoch": 0.9577094293808175, + "grad_norm": 1.2878941297531128, + "learning_rate": 4.315506917200083e-06, + "loss": 0.9768, + "step": 23665 + }, + { + "epoch": 0.9579117766086604, + "grad_norm": 1.404470682144165, + "learning_rate": 4.29485855874458e-06, + "loss": 1.011, + "step": 23670 + }, + { + "epoch": 0.9581141238365034, + "grad_norm": 1.32426917552948, + "learning_rate": 4.274210200289077e-06, + "loss": 0.9365, + "step": 23675 + }, + { + "epoch": 0.9583164710643464, + "grad_norm": 1.253981590270996, + "learning_rate": 4.253561841833575e-06, + "loss": 0.9478, + "step": 23680 + }, + { + "epoch": 0.9585188182921894, + "grad_norm": 1.2419568300247192, + "learning_rate": 4.232913483378072e-06, + "loss": 1.0353, + "step": 23685 + }, + { + "epoch": 0.9587211655200324, + "grad_norm": 1.22864830493927, + "learning_rate": 4.212265124922568e-06, + "loss": 0.9522, + "step": 23690 + }, + { + "epoch": 0.9589235127478754, + "grad_norm": 1.2241621017456055, + "learning_rate": 4.191616766467066e-06, + "loss": 0.955, + "step": 23695 + }, + { + "epoch": 0.9591258599757183, + "grad_norm": 1.1657403707504272, + "learning_rate": 4.170968408011564e-06, + "loss": 1.0208, + "step": 23700 + }, + { + "epoch": 0.9593282072035613, + "grad_norm": 1.2061777114868164, + "learning_rate": 4.15032004955606e-06, + "loss": 0.9613, + "step": 23705 + }, + { + "epoch": 0.9595305544314043, + "grad_norm": 1.3198875188827515, + "learning_rate": 4.1296716911005576e-06, + "loss": 0.9877, + "step": 23710 + }, + { + "epoch": 0.9597329016592473, + "grad_norm": 1.3837649822235107, + "learning_rate": 4.109023332645055e-06, + "loss": 1.0236, + "step": 23715 + }, + { + "epoch": 0.9599352488870903, + "grad_norm": 1.0314319133758545, + "learning_rate": 4.088374974189552e-06, + "loss": 0.9813, + "step": 23720 + }, + { + "epoch": 0.9601375961149332, + "grad_norm": 1.3057734966278076, + "learning_rate": 4.0677266157340495e-06, + "loss": 0.9563, + "step": 23725 + }, + { + "epoch": 0.9603399433427762, + "grad_norm": 1.2327723503112793, + "learning_rate": 4.047078257278547e-06, + "loss": 0.9636, + "step": 23730 + }, + { + "epoch": 0.9605422905706191, + "grad_norm": 1.1845272779464722, + "learning_rate": 4.026429898823044e-06, + "loss": 1.0171, + "step": 23735 + }, + { + "epoch": 0.9607446377984622, + "grad_norm": 1.091482162475586, + "learning_rate": 4.005781540367541e-06, + "loss": 1.0038, + "step": 23740 + }, + { + "epoch": 0.9609469850263052, + "grad_norm": 1.1078810691833496, + "learning_rate": 3.985133181912038e-06, + "loss": 0.9537, + "step": 23745 + }, + { + "epoch": 0.9611493322541481, + "grad_norm": 1.3094723224639893, + "learning_rate": 3.964484823456535e-06, + "loss": 0.9207, + "step": 23750 + }, + { + "epoch": 0.9613516794819911, + "grad_norm": 1.218976616859436, + "learning_rate": 3.943836465001033e-06, + "loss": 0.9726, + "step": 23755 + }, + { + "epoch": 0.961554026709834, + "grad_norm": 1.2138930559158325, + "learning_rate": 3.92318810654553e-06, + "loss": 1.0626, + "step": 23760 + }, + { + "epoch": 0.9617563739376771, + "grad_norm": 1.143302321434021, + "learning_rate": 3.902539748090027e-06, + "loss": 0.9807, + "step": 23765 + }, + { + "epoch": 0.9619587211655201, + "grad_norm": 1.1158267259597778, + "learning_rate": 3.881891389634524e-06, + "loss": 0.9533, + "step": 23770 + }, + { + "epoch": 0.962161068393363, + "grad_norm": 1.12842857837677, + "learning_rate": 3.861243031179022e-06, + "loss": 0.9658, + "step": 23775 + }, + { + "epoch": 0.962363415621206, + "grad_norm": 1.2312090396881104, + "learning_rate": 3.840594672723519e-06, + "loss": 0.9806, + "step": 23780 + }, + { + "epoch": 0.9625657628490489, + "grad_norm": 1.1958683729171753, + "learning_rate": 3.819946314268015e-06, + "loss": 0.9881, + "step": 23785 + }, + { + "epoch": 0.9627681100768919, + "grad_norm": 1.2550935745239258, + "learning_rate": 3.7992979558125126e-06, + "loss": 1.006, + "step": 23790 + }, + { + "epoch": 0.962970457304735, + "grad_norm": 1.1829735040664673, + "learning_rate": 3.7786495973570103e-06, + "loss": 1.0019, + "step": 23795 + }, + { + "epoch": 0.9631728045325779, + "grad_norm": 1.201974630355835, + "learning_rate": 3.7580012389015076e-06, + "loss": 0.9364, + "step": 23800 + }, + { + "epoch": 0.9633751517604209, + "grad_norm": 1.1843925714492798, + "learning_rate": 3.7373528804460045e-06, + "loss": 0.9625, + "step": 23805 + }, + { + "epoch": 0.9635774989882638, + "grad_norm": 1.2058708667755127, + "learning_rate": 3.7167045219905022e-06, + "loss": 0.9961, + "step": 23810 + }, + { + "epoch": 0.9637798462161068, + "grad_norm": 1.3171906471252441, + "learning_rate": 3.696056163534999e-06, + "loss": 1.0389, + "step": 23815 + }, + { + "epoch": 0.9639821934439499, + "grad_norm": 1.1943846940994263, + "learning_rate": 3.6754078050794964e-06, + "loss": 1.0027, + "step": 23820 + }, + { + "epoch": 0.9641845406717928, + "grad_norm": 1.185172438621521, + "learning_rate": 3.6547594466239933e-06, + "loss": 0.9723, + "step": 23825 + }, + { + "epoch": 0.9643868878996358, + "grad_norm": 1.1887675523757935, + "learning_rate": 3.634111088168491e-06, + "loss": 0.9918, + "step": 23830 + }, + { + "epoch": 0.9645892351274787, + "grad_norm": 1.2175261974334717, + "learning_rate": 3.613462729712988e-06, + "loss": 0.9914, + "step": 23835 + }, + { + "epoch": 0.9647915823553217, + "grad_norm": 1.08931303024292, + "learning_rate": 3.592814371257485e-06, + "loss": 0.9862, + "step": 23840 + }, + { + "epoch": 0.9649939295831647, + "grad_norm": 1.2798324823379517, + "learning_rate": 3.572166012801982e-06, + "loss": 0.979, + "step": 23845 + }, + { + "epoch": 0.9651962768110077, + "grad_norm": 1.1880285739898682, + "learning_rate": 3.5515176543464798e-06, + "loss": 0.9834, + "step": 23850 + }, + { + "epoch": 0.9653986240388507, + "grad_norm": 1.332466959953308, + "learning_rate": 3.5308692958909766e-06, + "loss": 1.019, + "step": 23855 + }, + { + "epoch": 0.9656009712666936, + "grad_norm": 1.1690194606781006, + "learning_rate": 3.510220937435474e-06, + "loss": 1.0431, + "step": 23860 + }, + { + "epoch": 0.9658033184945366, + "grad_norm": 1.1507174968719482, + "learning_rate": 3.4895725789799717e-06, + "loss": 1.0088, + "step": 23865 + }, + { + "epoch": 0.9660056657223796, + "grad_norm": 1.1203949451446533, + "learning_rate": 3.4689242205244685e-06, + "loss": 0.9703, + "step": 23870 + }, + { + "epoch": 0.9662080129502226, + "grad_norm": 1.2323098182678223, + "learning_rate": 3.448275862068966e-06, + "loss": 1.0264, + "step": 23875 + }, + { + "epoch": 0.9664103601780656, + "grad_norm": 1.2957608699798584, + "learning_rate": 3.4276275036134627e-06, + "loss": 0.9298, + "step": 23880 + }, + { + "epoch": 0.9666127074059085, + "grad_norm": 1.2158442735671997, + "learning_rate": 3.4069791451579604e-06, + "loss": 0.971, + "step": 23885 + }, + { + "epoch": 0.9668150546337515, + "grad_norm": 1.1678370237350464, + "learning_rate": 3.3863307867024573e-06, + "loss": 1.0172, + "step": 23890 + }, + { + "epoch": 0.9670174018615945, + "grad_norm": 1.0520416498184204, + "learning_rate": 3.3656824282469546e-06, + "loss": 1.0031, + "step": 23895 + }, + { + "epoch": 0.9672197490894374, + "grad_norm": 1.2183130979537964, + "learning_rate": 3.3450340697914515e-06, + "loss": 0.9367, + "step": 23900 + }, + { + "epoch": 0.9674220963172805, + "grad_norm": 1.152599811553955, + "learning_rate": 3.324385711335949e-06, + "loss": 0.9871, + "step": 23905 + }, + { + "epoch": 0.9676244435451234, + "grad_norm": 1.1636003255844116, + "learning_rate": 3.303737352880446e-06, + "loss": 1.0386, + "step": 23910 + }, + { + "epoch": 0.9678267907729664, + "grad_norm": 1.1853296756744385, + "learning_rate": 3.2830889944249434e-06, + "loss": 1.0082, + "step": 23915 + }, + { + "epoch": 0.9680291380008094, + "grad_norm": 1.2935311794281006, + "learning_rate": 3.2624406359694402e-06, + "loss": 1.0014, + "step": 23920 + }, + { + "epoch": 0.9682314852286523, + "grad_norm": 1.0872714519500732, + "learning_rate": 3.241792277513938e-06, + "loss": 1.0006, + "step": 23925 + }, + { + "epoch": 0.9684338324564954, + "grad_norm": 1.1798073053359985, + "learning_rate": 3.221143919058435e-06, + "loss": 1.0605, + "step": 23930 + }, + { + "epoch": 0.9686361796843383, + "grad_norm": 1.2358653545379639, + "learning_rate": 3.200495560602932e-06, + "loss": 0.9557, + "step": 23935 + }, + { + "epoch": 0.9688385269121813, + "grad_norm": 1.3411706686019897, + "learning_rate": 3.17984720214743e-06, + "loss": 1.0711, + "step": 23940 + }, + { + "epoch": 0.9690408741400243, + "grad_norm": 1.0622667074203491, + "learning_rate": 3.1591988436919267e-06, + "loss": 1.0514, + "step": 23945 + }, + { + "epoch": 0.9692432213678672, + "grad_norm": 1.4040849208831787, + "learning_rate": 3.1385504852364236e-06, + "loss": 0.9848, + "step": 23950 + }, + { + "epoch": 0.9694455685957103, + "grad_norm": 1.2281724214553833, + "learning_rate": 3.1179021267809213e-06, + "loss": 1.0132, + "step": 23955 + }, + { + "epoch": 0.9696479158235533, + "grad_norm": 1.2061702013015747, + "learning_rate": 3.097253768325418e-06, + "loss": 0.9329, + "step": 23960 + }, + { + "epoch": 0.9698502630513962, + "grad_norm": 1.2600946426391602, + "learning_rate": 3.0766054098699155e-06, + "loss": 1.011, + "step": 23965 + }, + { + "epoch": 0.9700526102792392, + "grad_norm": 1.2559062242507935, + "learning_rate": 3.0559570514144128e-06, + "loss": 0.9967, + "step": 23970 + }, + { + "epoch": 0.9702549575070821, + "grad_norm": 1.216749906539917, + "learning_rate": 3.03530869295891e-06, + "loss": 1.0337, + "step": 23975 + }, + { + "epoch": 0.9704573047349251, + "grad_norm": 1.1401817798614502, + "learning_rate": 3.014660334503407e-06, + "loss": 0.9632, + "step": 23980 + }, + { + "epoch": 0.9706596519627682, + "grad_norm": 1.1647950410842896, + "learning_rate": 2.9940119760479042e-06, + "loss": 0.9736, + "step": 23985 + }, + { + "epoch": 0.9708619991906111, + "grad_norm": 1.3680803775787354, + "learning_rate": 2.9733636175924015e-06, + "loss": 1.0719, + "step": 23990 + }, + { + "epoch": 0.9710643464184541, + "grad_norm": 1.322601079940796, + "learning_rate": 2.952715259136899e-06, + "loss": 0.9928, + "step": 23995 + }, + { + "epoch": 0.971266693646297, + "grad_norm": 1.2319669723510742, + "learning_rate": 2.932066900681396e-06, + "loss": 0.959, + "step": 24000 + }, + { + "epoch": 0.97146904087414, + "grad_norm": 1.271428108215332, + "learning_rate": 2.911418542225893e-06, + "loss": 0.9896, + "step": 24005 + }, + { + "epoch": 0.9716713881019831, + "grad_norm": 1.088760256767273, + "learning_rate": 2.8907701837703903e-06, + "loss": 0.9785, + "step": 24010 + }, + { + "epoch": 0.971873735329826, + "grad_norm": 1.2278944253921509, + "learning_rate": 2.8701218253148876e-06, + "loss": 0.9492, + "step": 24015 + }, + { + "epoch": 0.972076082557669, + "grad_norm": 1.3369373083114624, + "learning_rate": 2.849473466859385e-06, + "loss": 1.0074, + "step": 24020 + }, + { + "epoch": 0.9722784297855119, + "grad_norm": 1.1910151243209839, + "learning_rate": 2.8288251084038818e-06, + "loss": 0.938, + "step": 24025 + }, + { + "epoch": 0.9724807770133549, + "grad_norm": 1.1880441904067993, + "learning_rate": 2.8081767499483795e-06, + "loss": 1.0394, + "step": 24030 + }, + { + "epoch": 0.9726831242411978, + "grad_norm": 1.1743971109390259, + "learning_rate": 2.7875283914928764e-06, + "loss": 1.0416, + "step": 24035 + }, + { + "epoch": 0.9728854714690409, + "grad_norm": 1.2691752910614014, + "learning_rate": 2.7668800330373737e-06, + "loss": 0.9933, + "step": 24040 + }, + { + "epoch": 0.9730878186968839, + "grad_norm": 1.296424388885498, + "learning_rate": 2.7462316745818705e-06, + "loss": 1.0496, + "step": 24045 + }, + { + "epoch": 0.9732901659247268, + "grad_norm": 1.0720136165618896, + "learning_rate": 2.7255833161263683e-06, + "loss": 0.9824, + "step": 24050 + }, + { + "epoch": 0.9734925131525698, + "grad_norm": 1.2150864601135254, + "learning_rate": 2.7049349576708655e-06, + "loss": 0.9835, + "step": 24055 + }, + { + "epoch": 0.9736948603804128, + "grad_norm": 1.1679117679595947, + "learning_rate": 2.6842865992153624e-06, + "loss": 1.0128, + "step": 24060 + }, + { + "epoch": 0.9738972076082558, + "grad_norm": 1.2796560525894165, + "learning_rate": 2.6636382407598597e-06, + "loss": 0.9919, + "step": 24065 + }, + { + "epoch": 0.9740995548360988, + "grad_norm": 1.163400411605835, + "learning_rate": 2.642989882304357e-06, + "loss": 0.9694, + "step": 24070 + }, + { + "epoch": 0.9743019020639417, + "grad_norm": 1.3392345905303955, + "learning_rate": 2.6223415238488543e-06, + "loss": 0.9879, + "step": 24075 + }, + { + "epoch": 0.9745042492917847, + "grad_norm": 2.100224018096924, + "learning_rate": 2.601693165393351e-06, + "loss": 1.0275, + "step": 24080 + }, + { + "epoch": 0.9747065965196277, + "grad_norm": 1.163314938545227, + "learning_rate": 2.5810448069378485e-06, + "loss": 1.007, + "step": 24085 + }, + { + "epoch": 0.9749089437474706, + "grad_norm": 1.1846504211425781, + "learning_rate": 2.5603964484823458e-06, + "loss": 0.9755, + "step": 24090 + }, + { + "epoch": 0.9751112909753137, + "grad_norm": 1.2549328804016113, + "learning_rate": 2.539748090026843e-06, + "loss": 0.9705, + "step": 24095 + }, + { + "epoch": 0.9753136382031566, + "grad_norm": 1.228094220161438, + "learning_rate": 2.51909973157134e-06, + "loss": 1.0204, + "step": 24100 + }, + { + "epoch": 0.9755159854309996, + "grad_norm": 1.2853307723999023, + "learning_rate": 2.4984513731158372e-06, + "loss": 0.9794, + "step": 24105 + }, + { + "epoch": 0.9757183326588426, + "grad_norm": 1.2282099723815918, + "learning_rate": 2.4778030146603345e-06, + "loss": 1.059, + "step": 24110 + }, + { + "epoch": 0.9759206798866855, + "grad_norm": 1.2545039653778076, + "learning_rate": 2.457154656204832e-06, + "loss": 0.9809, + "step": 24115 + }, + { + "epoch": 0.9761230271145286, + "grad_norm": 1.141188383102417, + "learning_rate": 2.436506297749329e-06, + "loss": 0.9898, + "step": 24120 + }, + { + "epoch": 0.9763253743423715, + "grad_norm": 1.217935562133789, + "learning_rate": 2.4158579392938264e-06, + "loss": 1.0491, + "step": 24125 + }, + { + "epoch": 0.9765277215702145, + "grad_norm": 1.1647645235061646, + "learning_rate": 2.3952095808383237e-06, + "loss": 1.0066, + "step": 24130 + }, + { + "epoch": 0.9767300687980575, + "grad_norm": 1.2348576784133911, + "learning_rate": 2.3745612223828206e-06, + "loss": 1.058, + "step": 24135 + }, + { + "epoch": 0.9769324160259004, + "grad_norm": 1.3031387329101562, + "learning_rate": 2.353912863927318e-06, + "loss": 0.9852, + "step": 24140 + }, + { + "epoch": 0.9771347632537434, + "grad_norm": 1.1181423664093018, + "learning_rate": 2.333264505471815e-06, + "loss": 0.943, + "step": 24145 + }, + { + "epoch": 0.9773371104815864, + "grad_norm": 1.23284113407135, + "learning_rate": 2.3126161470163125e-06, + "loss": 0.989, + "step": 24150 + }, + { + "epoch": 0.9775394577094294, + "grad_norm": 1.132831335067749, + "learning_rate": 2.2919677885608094e-06, + "loss": 0.9345, + "step": 24155 + }, + { + "epoch": 0.9777418049372724, + "grad_norm": 1.279350757598877, + "learning_rate": 2.2713194301053067e-06, + "loss": 0.9246, + "step": 24160 + }, + { + "epoch": 0.9779441521651153, + "grad_norm": 1.315624475479126, + "learning_rate": 2.250671071649804e-06, + "loss": 1.0077, + "step": 24165 + }, + { + "epoch": 0.9781464993929583, + "grad_norm": 1.2561452388763428, + "learning_rate": 2.2300227131943013e-06, + "loss": 1.0589, + "step": 24170 + }, + { + "epoch": 0.9783488466208013, + "grad_norm": 1.9925191402435303, + "learning_rate": 2.209374354738798e-06, + "loss": 1.0016, + "step": 24175 + }, + { + "epoch": 0.9785511938486443, + "grad_norm": 1.1933507919311523, + "learning_rate": 2.1887259962832954e-06, + "loss": 0.9412, + "step": 24180 + }, + { + "epoch": 0.9787535410764873, + "grad_norm": 1.336834192276001, + "learning_rate": 2.1680776378277927e-06, + "loss": 0.9794, + "step": 24185 + }, + { + "epoch": 0.9789558883043302, + "grad_norm": 1.2687063217163086, + "learning_rate": 2.14742927937229e-06, + "loss": 1.0073, + "step": 24190 + }, + { + "epoch": 0.9791582355321732, + "grad_norm": 1.1436597108840942, + "learning_rate": 2.1267809209167873e-06, + "loss": 0.9973, + "step": 24195 + }, + { + "epoch": 0.9793605827600161, + "grad_norm": 1.2121599912643433, + "learning_rate": 2.106132562461284e-06, + "loss": 0.9389, + "step": 24200 + }, + { + "epoch": 0.9795629299878592, + "grad_norm": 1.1339515447616577, + "learning_rate": 2.085484204005782e-06, + "loss": 0.974, + "step": 24205 + }, + { + "epoch": 0.9797652772157022, + "grad_norm": 1.2546398639678955, + "learning_rate": 2.0648358455502788e-06, + "loss": 0.99, + "step": 24210 + }, + { + "epoch": 0.9799676244435451, + "grad_norm": 1.3780025243759155, + "learning_rate": 2.044187487094776e-06, + "loss": 1.0159, + "step": 24215 + }, + { + "epoch": 0.9801699716713881, + "grad_norm": 1.1643260717391968, + "learning_rate": 2.0235391286392734e-06, + "loss": 0.9544, + "step": 24220 + }, + { + "epoch": 0.980372318899231, + "grad_norm": 1.1889039278030396, + "learning_rate": 2.0028907701837707e-06, + "loss": 0.992, + "step": 24225 + }, + { + "epoch": 0.9805746661270741, + "grad_norm": 1.2637370824813843, + "learning_rate": 1.9822424117282676e-06, + "loss": 0.9729, + "step": 24230 + }, + { + "epoch": 0.9807770133549171, + "grad_norm": 1.2505862712860107, + "learning_rate": 1.961594053272765e-06, + "loss": 1.0078, + "step": 24235 + }, + { + "epoch": 0.98097936058276, + "grad_norm": 1.205688714981079, + "learning_rate": 1.940945694817262e-06, + "loss": 0.9961, + "step": 24240 + }, + { + "epoch": 0.981181707810603, + "grad_norm": 1.193853497505188, + "learning_rate": 1.9202973363617594e-06, + "loss": 0.9857, + "step": 24245 + }, + { + "epoch": 0.981384055038446, + "grad_norm": 1.2576864957809448, + "learning_rate": 1.8996489779062563e-06, + "loss": 1.0902, + "step": 24250 + }, + { + "epoch": 0.9815864022662889, + "grad_norm": 1.190581202507019, + "learning_rate": 1.8790006194507538e-06, + "loss": 0.9813, + "step": 24255 + }, + { + "epoch": 0.981788749494132, + "grad_norm": 1.1921381950378418, + "learning_rate": 1.8583522609952511e-06, + "loss": 0.957, + "step": 24260 + }, + { + "epoch": 0.9819910967219749, + "grad_norm": 1.3175832033157349, + "learning_rate": 1.8377039025397482e-06, + "loss": 0.9555, + "step": 24265 + }, + { + "epoch": 0.9821934439498179, + "grad_norm": 1.2868329286575317, + "learning_rate": 1.8170555440842455e-06, + "loss": 1.0136, + "step": 24270 + }, + { + "epoch": 0.9823957911776608, + "grad_norm": 1.213088870048523, + "learning_rate": 1.7964071856287426e-06, + "loss": 0.9871, + "step": 24275 + }, + { + "epoch": 0.9825981384055038, + "grad_norm": 1.3250539302825928, + "learning_rate": 1.7757588271732399e-06, + "loss": 0.96, + "step": 24280 + }, + { + "epoch": 0.9828004856333469, + "grad_norm": 1.2626069784164429, + "learning_rate": 1.755110468717737e-06, + "loss": 0.9576, + "step": 24285 + }, + { + "epoch": 0.9830028328611898, + "grad_norm": 1.156372308731079, + "learning_rate": 1.7344621102622343e-06, + "loss": 0.9399, + "step": 24290 + }, + { + "epoch": 0.9832051800890328, + "grad_norm": 1.209682822227478, + "learning_rate": 1.7138137518067314e-06, + "loss": 0.9698, + "step": 24295 + }, + { + "epoch": 0.9834075273168758, + "grad_norm": 1.3231494426727295, + "learning_rate": 1.6931653933512286e-06, + "loss": 0.9901, + "step": 24300 + }, + { + "epoch": 0.9836098745447187, + "grad_norm": 1.1929429769515991, + "learning_rate": 1.6725170348957257e-06, + "loss": 0.99, + "step": 24305 + }, + { + "epoch": 0.9838122217725617, + "grad_norm": 1.1918138265609741, + "learning_rate": 1.651868676440223e-06, + "loss": 1.035, + "step": 24310 + }, + { + "epoch": 0.9840145690004047, + "grad_norm": 1.1524841785430908, + "learning_rate": 1.6312203179847201e-06, + "loss": 0.9669, + "step": 24315 + }, + { + "epoch": 0.9842169162282477, + "grad_norm": 1.2520349025726318, + "learning_rate": 1.6105719595292174e-06, + "loss": 0.9889, + "step": 24320 + }, + { + "epoch": 0.9844192634560907, + "grad_norm": 1.2931617498397827, + "learning_rate": 1.589923601073715e-06, + "loss": 1.0173, + "step": 24325 + }, + { + "epoch": 0.9846216106839336, + "grad_norm": 1.1261266469955444, + "learning_rate": 1.5692752426182118e-06, + "loss": 0.9884, + "step": 24330 + }, + { + "epoch": 0.9848239579117766, + "grad_norm": 1.300706386566162, + "learning_rate": 1.548626884162709e-06, + "loss": 0.9546, + "step": 24335 + }, + { + "epoch": 0.9850263051396196, + "grad_norm": 1.229163408279419, + "learning_rate": 1.5279785257072064e-06, + "loss": 0.9687, + "step": 24340 + }, + { + "epoch": 0.9852286523674626, + "grad_norm": 1.2274683713912964, + "learning_rate": 1.5073301672517035e-06, + "loss": 1.0014, + "step": 24345 + }, + { + "epoch": 0.9854309995953056, + "grad_norm": 1.395676612854004, + "learning_rate": 1.4866818087962008e-06, + "loss": 1.0205, + "step": 24350 + }, + { + "epoch": 0.9856333468231485, + "grad_norm": 1.0325241088867188, + "learning_rate": 1.466033450340698e-06, + "loss": 0.9521, + "step": 24355 + }, + { + "epoch": 0.9858356940509915, + "grad_norm": 1.2387555837631226, + "learning_rate": 1.4453850918851952e-06, + "loss": 0.9868, + "step": 24360 + }, + { + "epoch": 0.9860380412788344, + "grad_norm": 1.153342604637146, + "learning_rate": 1.4247367334296924e-06, + "loss": 0.9706, + "step": 24365 + }, + { + "epoch": 0.9862403885066775, + "grad_norm": 1.3121017217636108, + "learning_rate": 1.4040883749741897e-06, + "loss": 1.0178, + "step": 24370 + }, + { + "epoch": 0.9864427357345205, + "grad_norm": 1.3076057434082031, + "learning_rate": 1.3834400165186868e-06, + "loss": 1.0004, + "step": 24375 + }, + { + "epoch": 0.9866450829623634, + "grad_norm": 1.2197092771530151, + "learning_rate": 1.3627916580631841e-06, + "loss": 1.0053, + "step": 24380 + }, + { + "epoch": 0.9868474301902064, + "grad_norm": 1.2461926937103271, + "learning_rate": 1.3421432996076812e-06, + "loss": 0.966, + "step": 24385 + }, + { + "epoch": 0.9870497774180493, + "grad_norm": 1.2928906679153442, + "learning_rate": 1.3214949411521785e-06, + "loss": 0.94, + "step": 24390 + }, + { + "epoch": 0.9872521246458924, + "grad_norm": 1.213357925415039, + "learning_rate": 1.3008465826966756e-06, + "loss": 1.0062, + "step": 24395 + }, + { + "epoch": 0.9874544718737354, + "grad_norm": 1.2100154161453247, + "learning_rate": 1.2801982242411729e-06, + "loss": 0.9827, + "step": 24400 + }, + { + "epoch": 0.9876568191015783, + "grad_norm": 1.1670055389404297, + "learning_rate": 1.25954986578567e-06, + "loss": 0.9837, + "step": 24405 + }, + { + "epoch": 0.9878591663294213, + "grad_norm": 1.1580208539962769, + "learning_rate": 1.2389015073301673e-06, + "loss": 0.9521, + "step": 24410 + }, + { + "epoch": 0.9880615135572642, + "grad_norm": 1.1305630207061768, + "learning_rate": 1.2182531488746646e-06, + "loss": 1.0028, + "step": 24415 + }, + { + "epoch": 0.9882638607851072, + "grad_norm": 1.2192747592926025, + "learning_rate": 1.1976047904191619e-06, + "loss": 0.9609, + "step": 24420 + }, + { + "epoch": 0.9884662080129503, + "grad_norm": 1.265036702156067, + "learning_rate": 1.176956431963659e-06, + "loss": 0.9222, + "step": 24425 + }, + { + "epoch": 0.9886685552407932, + "grad_norm": 1.2127940654754639, + "learning_rate": 1.1563080735081562e-06, + "loss": 0.9798, + "step": 24430 + }, + { + "epoch": 0.9888709024686362, + "grad_norm": 1.2852286100387573, + "learning_rate": 1.1356597150526533e-06, + "loss": 1.0294, + "step": 24435 + }, + { + "epoch": 0.9890732496964791, + "grad_norm": 1.1611634492874146, + "learning_rate": 1.1150113565971506e-06, + "loss": 0.9352, + "step": 24440 + }, + { + "epoch": 0.9892755969243221, + "grad_norm": 1.1146661043167114, + "learning_rate": 1.0943629981416477e-06, + "loss": 1.0145, + "step": 24445 + }, + { + "epoch": 0.9894779441521652, + "grad_norm": 1.195049524307251, + "learning_rate": 1.073714639686145e-06, + "loss": 1.0102, + "step": 24450 + }, + { + "epoch": 0.9896802913800081, + "grad_norm": 1.1704808473587036, + "learning_rate": 1.053066281230642e-06, + "loss": 1.024, + "step": 24455 + }, + { + "epoch": 0.9898826386078511, + "grad_norm": 1.1268508434295654, + "learning_rate": 1.0324179227751394e-06, + "loss": 1.014, + "step": 24460 + }, + { + "epoch": 0.990084985835694, + "grad_norm": 1.3437390327453613, + "learning_rate": 1.0117695643196367e-06, + "loss": 1.0233, + "step": 24465 + }, + { + "epoch": 0.990287333063537, + "grad_norm": 1.204566478729248, + "learning_rate": 9.911212058641338e-07, + "loss": 1.0272, + "step": 24470 + }, + { + "epoch": 0.99048968029138, + "grad_norm": 1.1397830247879028, + "learning_rate": 9.70472847408631e-07, + "loss": 0.9275, + "step": 24475 + }, + { + "epoch": 0.990692027519223, + "grad_norm": 1.2095698118209839, + "learning_rate": 9.498244889531282e-07, + "loss": 0.9429, + "step": 24480 + }, + { + "epoch": 0.990894374747066, + "grad_norm": 1.149962067604065, + "learning_rate": 9.291761304976256e-07, + "loss": 1.035, + "step": 24485 + }, + { + "epoch": 0.991096721974909, + "grad_norm": 1.2392507791519165, + "learning_rate": 9.085277720421228e-07, + "loss": 1.0362, + "step": 24490 + }, + { + "epoch": 0.9912990692027519, + "grad_norm": 1.2684184312820435, + "learning_rate": 8.878794135866199e-07, + "loss": 0.9692, + "step": 24495 + }, + { + "epoch": 0.9915014164305949, + "grad_norm": 1.1505355834960938, + "learning_rate": 8.672310551311171e-07, + "loss": 1.0345, + "step": 24500 + }, + { + "epoch": 0.9917037636584379, + "grad_norm": 1.2026902437210083, + "learning_rate": 8.465826966756143e-07, + "loss": 0.9959, + "step": 24505 + }, + { + "epoch": 0.9919061108862809, + "grad_norm": 1.0985236167907715, + "learning_rate": 8.259343382201115e-07, + "loss": 0.9768, + "step": 24510 + }, + { + "epoch": 0.9921084581141238, + "grad_norm": 1.1708601713180542, + "learning_rate": 8.052859797646087e-07, + "loss": 1.0179, + "step": 24515 + }, + { + "epoch": 0.9923108053419668, + "grad_norm": 1.1726040840148926, + "learning_rate": 7.846376213091059e-07, + "loss": 0.9639, + "step": 24520 + }, + { + "epoch": 0.9925131525698098, + "grad_norm": 1.179294228553772, + "learning_rate": 7.639892628536032e-07, + "loss": 0.9699, + "step": 24525 + }, + { + "epoch": 0.9927154997976527, + "grad_norm": 1.2063021659851074, + "learning_rate": 7.433409043981004e-07, + "loss": 0.9642, + "step": 24530 + }, + { + "epoch": 0.9929178470254958, + "grad_norm": 1.2376971244812012, + "learning_rate": 7.226925459425976e-07, + "loss": 0.961, + "step": 24535 + }, + { + "epoch": 0.9931201942533388, + "grad_norm": 1.3272475004196167, + "learning_rate": 7.020441874870949e-07, + "loss": 0.9758, + "step": 24540 + }, + { + "epoch": 0.9933225414811817, + "grad_norm": 1.1692842245101929, + "learning_rate": 6.813958290315921e-07, + "loss": 1.0068, + "step": 24545 + }, + { + "epoch": 0.9935248887090247, + "grad_norm": 1.1925163269042969, + "learning_rate": 6.607474705760893e-07, + "loss": 1.0023, + "step": 24550 + }, + { + "epoch": 0.9937272359368676, + "grad_norm": 1.2250268459320068, + "learning_rate": 6.400991121205864e-07, + "loss": 1.0437, + "step": 24555 + }, + { + "epoch": 0.9939295831647107, + "grad_norm": 1.1780215501785278, + "learning_rate": 6.194507536650836e-07, + "loss": 0.9488, + "step": 24560 + }, + { + "epoch": 0.9941319303925537, + "grad_norm": 1.102590799331665, + "learning_rate": 5.988023952095809e-07, + "loss": 0.9794, + "step": 24565 + }, + { + "epoch": 0.9943342776203966, + "grad_norm": 1.2239985466003418, + "learning_rate": 5.781540367540781e-07, + "loss": 0.9694, + "step": 24570 + }, + { + "epoch": 0.9945366248482396, + "grad_norm": 1.2165189981460571, + "learning_rate": 5.575056782985753e-07, + "loss": 0.999, + "step": 24575 + }, + { + "epoch": 0.9947389720760825, + "grad_norm": 1.164983868598938, + "learning_rate": 5.368573198430725e-07, + "loss": 1.0329, + "step": 24580 + }, + { + "epoch": 0.9949413193039255, + "grad_norm": 1.333021640777588, + "learning_rate": 5.162089613875697e-07, + "loss": 1.048, + "step": 24585 + }, + { + "epoch": 0.9951436665317686, + "grad_norm": 1.3476011753082275, + "learning_rate": 4.955606029320669e-07, + "loss": 0.9855, + "step": 24590 + }, + { + "epoch": 0.9953460137596115, + "grad_norm": 1.2735216617584229, + "learning_rate": 4.749122444765641e-07, + "loss": 1.0284, + "step": 24595 + }, + { + "epoch": 0.9955483609874545, + "grad_norm": 1.1732523441314697, + "learning_rate": 4.542638860210614e-07, + "loss": 1.0321, + "step": 24600 + }, + { + "epoch": 0.9957507082152974, + "grad_norm": 1.0077344179153442, + "learning_rate": 4.3361552756555857e-07, + "loss": 0.9924, + "step": 24605 + }, + { + "epoch": 0.9959530554431404, + "grad_norm": 1.174232840538025, + "learning_rate": 4.1296716911005576e-07, + "loss": 0.9537, + "step": 24610 + }, + { + "epoch": 0.9961554026709835, + "grad_norm": 1.2381494045257568, + "learning_rate": 3.9231881065455295e-07, + "loss": 0.9593, + "step": 24615 + }, + { + "epoch": 0.9963577498988264, + "grad_norm": 1.3087583780288696, + "learning_rate": 3.716704521990502e-07, + "loss": 0.967, + "step": 24620 + }, + { + "epoch": 0.9965600971266694, + "grad_norm": 1.222768783569336, + "learning_rate": 3.5102209374354744e-07, + "loss": 1.0279, + "step": 24625 + }, + { + "epoch": 0.9967624443545123, + "grad_norm": 1.2564250230789185, + "learning_rate": 3.3037373528804463e-07, + "loss": 1.034, + "step": 24630 + }, + { + "epoch": 0.9969647915823553, + "grad_norm": 1.0893826484680176, + "learning_rate": 3.097253768325418e-07, + "loss": 0.9938, + "step": 24635 + }, + { + "epoch": 0.9971671388101983, + "grad_norm": 1.1113886833190918, + "learning_rate": 2.8907701837703906e-07, + "loss": 0.9568, + "step": 24640 + }, + { + "epoch": 0.9973694860380413, + "grad_norm": 1.1226166486740112, + "learning_rate": 2.6842865992153625e-07, + "loss": 1.0333, + "step": 24645 + }, + { + "epoch": 0.9975718332658843, + "grad_norm": 1.3192880153656006, + "learning_rate": 2.4778030146603344e-07, + "loss": 0.913, + "step": 24650 + }, + { + "epoch": 0.9977741804937272, + "grad_norm": 1.1434147357940674, + "learning_rate": 2.271319430105307e-07, + "loss": 1.0118, + "step": 24655 + }, + { + "epoch": 0.9979765277215702, + "grad_norm": 1.2090116739273071, + "learning_rate": 2.0648358455502788e-07, + "loss": 0.9657, + "step": 24660 + }, + { + "epoch": 0.9981788749494132, + "grad_norm": 1.2893544435501099, + "learning_rate": 1.858352260995251e-07, + "loss": 0.9526, + "step": 24665 + }, + { + "epoch": 0.9983812221772562, + "grad_norm": 1.1249070167541504, + "learning_rate": 1.6518686764402231e-07, + "loss": 0.9653, + "step": 24670 + }, + { + "epoch": 0.9985835694050992, + "grad_norm": 1.1773556470870972, + "learning_rate": 1.4453850918851953e-07, + "loss": 1.0098, + "step": 24675 + }, + { + "epoch": 0.9987859166329421, + "grad_norm": 1.2133076190948486, + "learning_rate": 1.2389015073301672e-07, + "loss": 1.0156, + "step": 24680 + }, + { + "epoch": 0.9989882638607851, + "grad_norm": 1.2049704790115356, + "learning_rate": 1.0324179227751394e-07, + "loss": 0.9781, + "step": 24685 + }, + { + "epoch": 0.9991906110886281, + "grad_norm": 1.1690733432769775, + "learning_rate": 8.259343382201116e-08, + "loss": 0.9829, + "step": 24690 + }, + { + "epoch": 0.999392958316471, + "grad_norm": 1.3740712404251099, + "learning_rate": 6.194507536650836e-08, + "loss": 0.9842, + "step": 24695 + }, + { + "epoch": 0.9995953055443141, + "grad_norm": 1.194843053817749, + "learning_rate": 4.129671691100558e-08, + "loss": 1.0084, + "step": 24700 + }, + { + "epoch": 0.999797652772157, + "grad_norm": 1.3295986652374268, + "learning_rate": 2.064835845550279e-08, + "loss": 0.9322, + "step": 24705 + }, + { + "epoch": 1.0, + "grad_norm": 1.1768206357955933, + "learning_rate": 0.0, + "loss": 0.9516, + "step": 24710 + }, + { + "epoch": 1.0, + "step": 24710, + "total_flos": 4.050169848999287e+19, + "train_loss": 1.0059279920214441, + "train_runtime": 51238.2737, + "train_samples_per_second": 15.432, + "train_steps_per_second": 0.482 + } + ], + "logging_steps": 5, + "max_steps": 24710, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.050169848999287e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}