{ "best_metric": 0.9517125210555868, "best_model_checkpoint": "/mnt/data2/weizhi/checkpoints/unifilter_llava_qwen2.5_1.5b_instruct_caption_only_mse_loss_siglip_384_mmtoken_144_40k_data/checkpoint-5138", "epoch": 10.0, "eval_steps": 500, "global_step": 7340, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0013623978201634877, "grad_norm": 2317.7174681119245, "learning_rate": 9.04977375565611e-08, "loss": 8.627, "step": 1 }, { "epoch": 0.0027247956403269754, "grad_norm": 2181.211732152344, "learning_rate": 1.809954751131222e-07, "loss": 7.9336, "step": 2 }, { "epoch": 0.004087193460490463, "grad_norm": 2171.4355702276084, "learning_rate": 2.7149321266968326e-07, "loss": 7.9629, "step": 3 }, { "epoch": 0.005449591280653951, "grad_norm": 2244.6018879167855, "learning_rate": 3.619909502262444e-07, "loss": 8.2988, "step": 4 }, { "epoch": 0.006811989100817439, "grad_norm": 2315.086812498874, "learning_rate": 4.5248868778280546e-07, "loss": 8.5488, "step": 5 }, { "epoch": 0.008174386920980926, "grad_norm": 2176.8333541518605, "learning_rate": 5.429864253393665e-07, "loss": 8.0801, "step": 6 }, { "epoch": 0.009536784741144414, "grad_norm": 2174.9669683617267, "learning_rate": 6.334841628959276e-07, "loss": 7.9473, "step": 7 }, { "epoch": 0.010899182561307902, "grad_norm": 2156.9876781526864, "learning_rate": 7.239819004524888e-07, "loss": 7.8672, "step": 8 }, { "epoch": 0.01226158038147139, "grad_norm": 1908.232204987104, "learning_rate": 8.144796380090498e-07, "loss": 6.626, "step": 9 }, { "epoch": 0.013623978201634877, "grad_norm": 1807.8204716999592, "learning_rate": 9.049773755656109e-07, "loss": 5.9785, "step": 10 }, { "epoch": 0.014986376021798364, "grad_norm": 987.8267408520322, "learning_rate": 9.954751131221719e-07, "loss": 3.001, "step": 11 }, { "epoch": 0.01634877384196185, "grad_norm": 943.6121605756692, "learning_rate": 1.085972850678733e-06, "loss": 2.8496, "step": 12 }, { "epoch": 0.017711171662125342, "grad_norm": 903.0996774560291, "learning_rate": 1.1764705882352942e-06, "loss": 2.8564, "step": 13 }, { "epoch": 0.01907356948228883, "grad_norm": 684.0537531187284, "learning_rate": 1.2669683257918552e-06, "loss": 2.458, "step": 14 }, { "epoch": 0.020435967302452316, "grad_norm": 1633.0465393940271, "learning_rate": 1.3574660633484164e-06, "loss": 3.8301, "step": 15 }, { "epoch": 0.021798365122615803, "grad_norm": 3017.5624148376664, "learning_rate": 1.4479638009049775e-06, "loss": 10.5195, "step": 16 }, { "epoch": 0.02316076294277929, "grad_norm": 3248.6708043823423, "learning_rate": 1.5384615384615387e-06, "loss": 10.8555, "step": 17 }, { "epoch": 0.02452316076294278, "grad_norm": 3568.7307420430284, "learning_rate": 1.6289592760180997e-06, "loss": 11.6641, "step": 18 }, { "epoch": 0.025885558583106268, "grad_norm": 2716.2278695415425, "learning_rate": 1.7194570135746609e-06, "loss": 9.2227, "step": 19 }, { "epoch": 0.027247956403269755, "grad_norm": 2065.7388592860975, "learning_rate": 1.8099547511312218e-06, "loss": 6.2246, "step": 20 }, { "epoch": 0.02861035422343324, "grad_norm": 1294.9989758432484, "learning_rate": 1.9004524886877828e-06, "loss": 3.8633, "step": 21 }, { "epoch": 0.02997275204359673, "grad_norm": 478.11691924535, "learning_rate": 1.9909502262443437e-06, "loss": 1.8315, "step": 22 }, { "epoch": 0.031335149863760216, "grad_norm": 59.09137902683453, "learning_rate": 2.0814479638009053e-06, "loss": 1.5396, "step": 23 }, { "epoch": 0.0326975476839237, "grad_norm": 207.67004038449753, "learning_rate": 2.171945701357466e-06, "loss": 1.5728, "step": 24 }, { "epoch": 0.0340599455040872, "grad_norm": 223.16076668795733, "learning_rate": 2.2624434389140273e-06, "loss": 1.561, "step": 25 }, { "epoch": 0.035422343324250684, "grad_norm": 377.66731284215575, "learning_rate": 2.3529411764705885e-06, "loss": 1.874, "step": 26 }, { "epoch": 0.03678474114441417, "grad_norm": 398.2520826468206, "learning_rate": 2.4434389140271496e-06, "loss": 1.9438, "step": 27 }, { "epoch": 0.03814713896457766, "grad_norm": 363.36189127540956, "learning_rate": 2.5339366515837104e-06, "loss": 1.9136, "step": 28 }, { "epoch": 0.039509536784741145, "grad_norm": 399.77335085004427, "learning_rate": 2.624434389140272e-06, "loss": 2.0908, "step": 29 }, { "epoch": 0.04087193460490463, "grad_norm": 404.8355349152147, "learning_rate": 2.7149321266968327e-06, "loss": 1.8516, "step": 30 }, { "epoch": 0.04223433242506812, "grad_norm": 272.8978272335609, "learning_rate": 2.805429864253394e-06, "loss": 2.0317, "step": 31 }, { "epoch": 0.043596730245231606, "grad_norm": 216.33965790567197, "learning_rate": 2.895927601809955e-06, "loss": 1.5093, "step": 32 }, { "epoch": 0.04495912806539509, "grad_norm": 121.29196248363046, "learning_rate": 2.986425339366516e-06, "loss": 1.498, "step": 33 }, { "epoch": 0.04632152588555858, "grad_norm": 65.85461804840891, "learning_rate": 3.0769230769230774e-06, "loss": 1.3154, "step": 34 }, { "epoch": 0.047683923705722074, "grad_norm": 11.719149990046713, "learning_rate": 3.167420814479638e-06, "loss": 1.25, "step": 35 }, { "epoch": 0.04904632152588556, "grad_norm": 67.36909534813579, "learning_rate": 3.2579185520361994e-06, "loss": 1.3687, "step": 36 }, { "epoch": 0.05040871934604905, "grad_norm": 108.02880303096136, "learning_rate": 3.34841628959276e-06, "loss": 1.3784, "step": 37 }, { "epoch": 0.051771117166212535, "grad_norm": 108.19028301105189, "learning_rate": 3.4389140271493217e-06, "loss": 1.4824, "step": 38 }, { "epoch": 0.05313351498637602, "grad_norm": 127.10226013977743, "learning_rate": 3.529411764705883e-06, "loss": 1.5088, "step": 39 }, { "epoch": 0.05449591280653951, "grad_norm": 100.80024291664486, "learning_rate": 3.6199095022624436e-06, "loss": 1.3843, "step": 40 }, { "epoch": 0.055858310626702996, "grad_norm": 97.15111442794525, "learning_rate": 3.710407239819005e-06, "loss": 1.395, "step": 41 }, { "epoch": 0.05722070844686648, "grad_norm": 57.41853859369711, "learning_rate": 3.8009049773755656e-06, "loss": 1.3157, "step": 42 }, { "epoch": 0.05858310626702997, "grad_norm": 43.50766138531282, "learning_rate": 3.891402714932127e-06, "loss": 1.1626, "step": 43 }, { "epoch": 0.05994550408719346, "grad_norm": 6.633476632185626, "learning_rate": 3.9819004524886875e-06, "loss": 1.1453, "step": 44 }, { "epoch": 0.06130790190735695, "grad_norm": 23.9388132376298, "learning_rate": 4.072398190045249e-06, "loss": 1.2498, "step": 45 }, { "epoch": 0.06267029972752043, "grad_norm": 38.07423176784401, "learning_rate": 4.162895927601811e-06, "loss": 1.208, "step": 46 }, { "epoch": 0.06403269754768393, "grad_norm": 71.18926330926837, "learning_rate": 4.2533936651583714e-06, "loss": 1.2788, "step": 47 }, { "epoch": 0.0653950953678474, "grad_norm": 70.71580004276015, "learning_rate": 4.343891402714932e-06, "loss": 1.2649, "step": 48 }, { "epoch": 0.0667574931880109, "grad_norm": 49.64716478685839, "learning_rate": 4.434389140271493e-06, "loss": 1.2505, "step": 49 }, { "epoch": 0.0681198910081744, "grad_norm": 40.076912563487475, "learning_rate": 4.5248868778280546e-06, "loss": 1.1807, "step": 50 }, { "epoch": 0.06948228882833787, "grad_norm": 34.916318154536334, "learning_rate": 4.615384615384616e-06, "loss": 1.3262, "step": 51 }, { "epoch": 0.07084468664850137, "grad_norm": 19.199476044444395, "learning_rate": 4.705882352941177e-06, "loss": 1.2043, "step": 52 }, { "epoch": 0.07220708446866485, "grad_norm": 7.72446807225803, "learning_rate": 4.7963800904977385e-06, "loss": 1.1401, "step": 53 }, { "epoch": 0.07356948228882834, "grad_norm": 17.050014305580405, "learning_rate": 4.886877828054299e-06, "loss": 1.1343, "step": 54 }, { "epoch": 0.07493188010899182, "grad_norm": 42.29546474346193, "learning_rate": 4.97737556561086e-06, "loss": 1.1956, "step": 55 }, { "epoch": 0.07629427792915532, "grad_norm": 36.74378096140129, "learning_rate": 5.067873303167421e-06, "loss": 1.1465, "step": 56 }, { "epoch": 0.0776566757493188, "grad_norm": 37.90434711921804, "learning_rate": 5.158371040723983e-06, "loss": 1.126, "step": 57 }, { "epoch": 0.07901907356948229, "grad_norm": 33.154714724264956, "learning_rate": 5.248868778280544e-06, "loss": 1.0708, "step": 58 }, { "epoch": 0.08038147138964577, "grad_norm": 30.6764279692487, "learning_rate": 5.339366515837105e-06, "loss": 1.125, "step": 59 }, { "epoch": 0.08174386920980926, "grad_norm": 4.575570043436172, "learning_rate": 5.4298642533936655e-06, "loss": 1.1255, "step": 60 }, { "epoch": 0.08310626702997276, "grad_norm": 24.67326159697653, "learning_rate": 5.520361990950227e-06, "loss": 1.0879, "step": 61 }, { "epoch": 0.08446866485013624, "grad_norm": 19.878201336267427, "learning_rate": 5.610859728506788e-06, "loss": 1.1001, "step": 62 }, { "epoch": 0.08583106267029973, "grad_norm": 23.914299167372416, "learning_rate": 5.7013574660633486e-06, "loss": 1.0059, "step": 63 }, { "epoch": 0.08719346049046321, "grad_norm": 16.189762050161043, "learning_rate": 5.79185520361991e-06, "loss": 1.0945, "step": 64 }, { "epoch": 0.0885558583106267, "grad_norm": 11.308019371387259, "learning_rate": 5.882352941176471e-06, "loss": 1.0344, "step": 65 }, { "epoch": 0.08991825613079019, "grad_norm": 7.6470683437723395, "learning_rate": 5.972850678733032e-06, "loss": 0.9832, "step": 66 }, { "epoch": 0.09128065395095368, "grad_norm": 13.464683774951638, "learning_rate": 6.0633484162895924e-06, "loss": 1.0645, "step": 67 }, { "epoch": 0.09264305177111716, "grad_norm": 17.823212140768625, "learning_rate": 6.153846153846155e-06, "loss": 1.0452, "step": 68 }, { "epoch": 0.09400544959128065, "grad_norm": 21.114202049617, "learning_rate": 6.244343891402716e-06, "loss": 0.9932, "step": 69 }, { "epoch": 0.09536784741144415, "grad_norm": 25.37102303271769, "learning_rate": 6.334841628959276e-06, "loss": 1.0059, "step": 70 }, { "epoch": 0.09673024523160763, "grad_norm": 4.885715319796453, "learning_rate": 6.425339366515838e-06, "loss": 0.95, "step": 71 }, { "epoch": 0.09809264305177112, "grad_norm": 16.940429801587726, "learning_rate": 6.515837104072399e-06, "loss": 0.9331, "step": 72 }, { "epoch": 0.0994550408719346, "grad_norm": 20.360972277956055, "learning_rate": 6.6063348416289595e-06, "loss": 0.9155, "step": 73 }, { "epoch": 0.1008174386920981, "grad_norm": 5.887606000027208, "learning_rate": 6.69683257918552e-06, "loss": 0.8604, "step": 74 }, { "epoch": 0.10217983651226158, "grad_norm": 15.708423247558327, "learning_rate": 6.787330316742083e-06, "loss": 0.9578, "step": 75 }, { "epoch": 0.10354223433242507, "grad_norm": 5.630760860398915, "learning_rate": 6.8778280542986434e-06, "loss": 0.9011, "step": 76 }, { "epoch": 0.10490463215258855, "grad_norm": 12.227729671255569, "learning_rate": 6.968325791855204e-06, "loss": 0.9026, "step": 77 }, { "epoch": 0.10626702997275204, "grad_norm": 20.833005075011165, "learning_rate": 7.058823529411766e-06, "loss": 0.8997, "step": 78 }, { "epoch": 0.10762942779291552, "grad_norm": 6.108315874372768, "learning_rate": 7.1493212669683265e-06, "loss": 0.8467, "step": 79 }, { "epoch": 0.10899182561307902, "grad_norm": 4.572826569692256, "learning_rate": 7.239819004524887e-06, "loss": 0.8057, "step": 80 }, { "epoch": 0.11035422343324251, "grad_norm": 23.455299194236385, "learning_rate": 7.330316742081448e-06, "loss": 0.7963, "step": 81 }, { "epoch": 0.11171662125340599, "grad_norm": 7.510020697514017, "learning_rate": 7.42081447963801e-06, "loss": 0.7983, "step": 82 }, { "epoch": 0.11307901907356949, "grad_norm": 11.43206873287313, "learning_rate": 7.51131221719457e-06, "loss": 0.7712, "step": 83 }, { "epoch": 0.11444141689373297, "grad_norm": 22.049278312129488, "learning_rate": 7.601809954751131e-06, "loss": 0.8193, "step": 84 }, { "epoch": 0.11580381471389646, "grad_norm": 16.520418235733054, "learning_rate": 7.692307692307694e-06, "loss": 0.7156, "step": 85 }, { "epoch": 0.11716621253405994, "grad_norm": 6.7260679903436085, "learning_rate": 7.782805429864253e-06, "loss": 0.6829, "step": 86 }, { "epoch": 0.11852861035422343, "grad_norm": 18.186107099679525, "learning_rate": 7.873303167420815e-06, "loss": 0.725, "step": 87 }, { "epoch": 0.11989100817438691, "grad_norm": 5.649287337402105, "learning_rate": 7.963800904977375e-06, "loss": 0.6428, "step": 88 }, { "epoch": 0.12125340599455041, "grad_norm": 23.78210592808567, "learning_rate": 8.054298642533938e-06, "loss": 0.6324, "step": 89 }, { "epoch": 0.1226158038147139, "grad_norm": 12.721086289912732, "learning_rate": 8.144796380090498e-06, "loss": 0.6636, "step": 90 }, { "epoch": 0.12397820163487738, "grad_norm": 5.642830176804904, "learning_rate": 8.23529411764706e-06, "loss": 0.5786, "step": 91 }, { "epoch": 0.12534059945504086, "grad_norm": 25.850026469728597, "learning_rate": 8.325791855203621e-06, "loss": 0.6138, "step": 92 }, { "epoch": 0.12670299727520437, "grad_norm": 7.307054166590461, "learning_rate": 8.416289592760181e-06, "loss": 0.6033, "step": 93 }, { "epoch": 0.12806539509536785, "grad_norm": 14.9988527380283, "learning_rate": 8.506787330316743e-06, "loss": 0.5609, "step": 94 }, { "epoch": 0.12942779291553133, "grad_norm": 16.536047510481964, "learning_rate": 8.597285067873304e-06, "loss": 0.5165, "step": 95 }, { "epoch": 0.1307901907356948, "grad_norm": 5.066474529742607, "learning_rate": 8.687782805429864e-06, "loss": 0.5249, "step": 96 }, { "epoch": 0.13215258855585832, "grad_norm": 4.01426975916683, "learning_rate": 8.778280542986426e-06, "loss": 0.4414, "step": 97 }, { "epoch": 0.1335149863760218, "grad_norm": 8.797603329317086, "learning_rate": 8.868778280542986e-06, "loss": 0.5166, "step": 98 }, { "epoch": 0.13487738419618528, "grad_norm": 9.54857246080667, "learning_rate": 8.95927601809955e-06, "loss": 0.4696, "step": 99 }, { "epoch": 0.1362397820163488, "grad_norm": 13.371714642334242, "learning_rate": 9.049773755656109e-06, "loss": 0.4055, "step": 100 }, { "epoch": 0.13760217983651227, "grad_norm": 29.48619459218086, "learning_rate": 9.14027149321267e-06, "loss": 0.5056, "step": 101 }, { "epoch": 0.13896457765667575, "grad_norm": 26.129977592080206, "learning_rate": 9.230769230769232e-06, "loss": 0.4768, "step": 102 }, { "epoch": 0.14032697547683923, "grad_norm": 30.403365218178966, "learning_rate": 9.321266968325792e-06, "loss": 0.4431, "step": 103 }, { "epoch": 0.14168937329700274, "grad_norm": 22.858347415039315, "learning_rate": 9.411764705882354e-06, "loss": 0.4926, "step": 104 }, { "epoch": 0.14305177111716622, "grad_norm": 30.23212441712278, "learning_rate": 9.502262443438914e-06, "loss": 0.5309, "step": 105 }, { "epoch": 0.1444141689373297, "grad_norm": 5.72606339419515, "learning_rate": 9.592760180995477e-06, "loss": 0.3876, "step": 106 }, { "epoch": 0.14577656675749318, "grad_norm": 23.748293181305367, "learning_rate": 9.683257918552037e-06, "loss": 0.4589, "step": 107 }, { "epoch": 0.14713896457765668, "grad_norm": 30.621534479902472, "learning_rate": 9.773755656108599e-06, "loss": 0.4152, "step": 108 }, { "epoch": 0.14850136239782016, "grad_norm": 12.460117279914495, "learning_rate": 9.86425339366516e-06, "loss": 0.381, "step": 109 }, { "epoch": 0.14986376021798364, "grad_norm": 22.656767272133862, "learning_rate": 9.95475113122172e-06, "loss": 0.4273, "step": 110 }, { "epoch": 0.15122615803814715, "grad_norm": 16.01669670889553, "learning_rate": 1.0045248868778282e-05, "loss": 0.4391, "step": 111 }, { "epoch": 0.15258855585831063, "grad_norm": 13.379112073022817, "learning_rate": 1.0135746606334842e-05, "loss": 0.4576, "step": 112 }, { "epoch": 0.1539509536784741, "grad_norm": 36.45825519288979, "learning_rate": 1.0226244343891403e-05, "loss": 0.4545, "step": 113 }, { "epoch": 0.1553133514986376, "grad_norm": 5.065351977560801, "learning_rate": 1.0316742081447966e-05, "loss": 0.4717, "step": 114 }, { "epoch": 0.1566757493188011, "grad_norm": 37.03464131494869, "learning_rate": 1.0407239819004526e-05, "loss": 0.4019, "step": 115 }, { "epoch": 0.15803814713896458, "grad_norm": 25.41496974907178, "learning_rate": 1.0497737556561088e-05, "loss": 0.4105, "step": 116 }, { "epoch": 0.15940054495912806, "grad_norm": 33.1089547623003, "learning_rate": 1.0588235294117648e-05, "loss": 0.4111, "step": 117 }, { "epoch": 0.16076294277929154, "grad_norm": 33.80884094601913, "learning_rate": 1.067873303167421e-05, "loss": 0.3854, "step": 118 }, { "epoch": 0.16212534059945505, "grad_norm": 3.1699311137329147, "learning_rate": 1.076923076923077e-05, "loss": 0.4211, "step": 119 }, { "epoch": 0.16348773841961853, "grad_norm": 27.807169827758063, "learning_rate": 1.0859728506787331e-05, "loss": 0.3588, "step": 120 }, { "epoch": 0.164850136239782, "grad_norm": 40.72492293228591, "learning_rate": 1.0950226244343893e-05, "loss": 0.4016, "step": 121 }, { "epoch": 0.16621253405994552, "grad_norm": 18.66528420011834, "learning_rate": 1.1040723981900454e-05, "loss": 0.3539, "step": 122 }, { "epoch": 0.167574931880109, "grad_norm": 47.06383014986689, "learning_rate": 1.1131221719457016e-05, "loss": 0.3519, "step": 123 }, { "epoch": 0.16893732970027248, "grad_norm": 6.766647548694476, "learning_rate": 1.1221719457013576e-05, "loss": 0.2905, "step": 124 }, { "epoch": 0.17029972752043596, "grad_norm": 27.354725296086038, "learning_rate": 1.1312217194570137e-05, "loss": 0.3338, "step": 125 }, { "epoch": 0.17166212534059946, "grad_norm": 20.08488402851003, "learning_rate": 1.1402714932126697e-05, "loss": 0.3536, "step": 126 }, { "epoch": 0.17302452316076294, "grad_norm": 2.5330112813787222, "learning_rate": 1.1493212669683259e-05, "loss": 0.3578, "step": 127 }, { "epoch": 0.17438692098092642, "grad_norm": 11.782052469192699, "learning_rate": 1.158371040723982e-05, "loss": 0.2934, "step": 128 }, { "epoch": 0.17574931880108993, "grad_norm": 4.7481763593797055, "learning_rate": 1.1674208144796382e-05, "loss": 0.3511, "step": 129 }, { "epoch": 0.1771117166212534, "grad_norm": 20.121392168933575, "learning_rate": 1.1764705882352942e-05, "loss": 0.3062, "step": 130 }, { "epoch": 0.1784741144414169, "grad_norm": 8.374760670910819, "learning_rate": 1.1855203619909503e-05, "loss": 0.2798, "step": 131 }, { "epoch": 0.17983651226158037, "grad_norm": 9.535486199772704, "learning_rate": 1.1945701357466063e-05, "loss": 0.3932, "step": 132 }, { "epoch": 0.18119891008174388, "grad_norm": 22.91592932172471, "learning_rate": 1.2036199095022625e-05, "loss": 0.3471, "step": 133 }, { "epoch": 0.18256130790190736, "grad_norm": 6.448652725991125, "learning_rate": 1.2126696832579185e-05, "loss": 0.2986, "step": 134 }, { "epoch": 0.18392370572207084, "grad_norm": 8.124584144833126, "learning_rate": 1.2217194570135748e-05, "loss": 0.3035, "step": 135 }, { "epoch": 0.18528610354223432, "grad_norm": 32.49522128744185, "learning_rate": 1.230769230769231e-05, "loss": 0.336, "step": 136 }, { "epoch": 0.18664850136239783, "grad_norm": 3.0095958886531355, "learning_rate": 1.239819004524887e-05, "loss": 0.3541, "step": 137 }, { "epoch": 0.1880108991825613, "grad_norm": 39.2880535556399, "learning_rate": 1.2488687782805431e-05, "loss": 0.3607, "step": 138 }, { "epoch": 0.1893732970027248, "grad_norm": 22.878770197785514, "learning_rate": 1.2579185520361991e-05, "loss": 0.2619, "step": 139 }, { "epoch": 0.1907356948228883, "grad_norm": 31.723804115319314, "learning_rate": 1.2669683257918553e-05, "loss": 0.3801, "step": 140 }, { "epoch": 0.19209809264305178, "grad_norm": 45.52049384005728, "learning_rate": 1.2760180995475113e-05, "loss": 0.3503, "step": 141 }, { "epoch": 0.19346049046321526, "grad_norm": 20.730995771002792, "learning_rate": 1.2850678733031676e-05, "loss": 0.3008, "step": 142 }, { "epoch": 0.19482288828337874, "grad_norm": 31.106631672941145, "learning_rate": 1.2941176470588238e-05, "loss": 0.3646, "step": 143 }, { "epoch": 0.19618528610354224, "grad_norm": 20.317441917754184, "learning_rate": 1.3031674208144797e-05, "loss": 0.343, "step": 144 }, { "epoch": 0.19754768392370572, "grad_norm": 27.751278425087072, "learning_rate": 1.3122171945701359e-05, "loss": 0.308, "step": 145 }, { "epoch": 0.1989100817438692, "grad_norm": 25.70828821679154, "learning_rate": 1.3212669683257919e-05, "loss": 0.2853, "step": 146 }, { "epoch": 0.20027247956403268, "grad_norm": 4.544951074175324, "learning_rate": 1.330316742081448e-05, "loss": 0.2701, "step": 147 }, { "epoch": 0.2016348773841962, "grad_norm": 17.188788662030387, "learning_rate": 1.339366515837104e-05, "loss": 0.3471, "step": 148 }, { "epoch": 0.20299727520435967, "grad_norm": 28.19548756559875, "learning_rate": 1.3484162895927604e-05, "loss": 0.3133, "step": 149 }, { "epoch": 0.20435967302452315, "grad_norm": 4.752739930910506, "learning_rate": 1.3574660633484165e-05, "loss": 0.2819, "step": 150 }, { "epoch": 0.20572207084468666, "grad_norm": 28.15843042587047, "learning_rate": 1.3665158371040725e-05, "loss": 0.3116, "step": 151 }, { "epoch": 0.20708446866485014, "grad_norm": 9.683960810546239, "learning_rate": 1.3755656108597287e-05, "loss": 0.2978, "step": 152 }, { "epoch": 0.20844686648501362, "grad_norm": 13.021940454764978, "learning_rate": 1.3846153846153847e-05, "loss": 0.2785, "step": 153 }, { "epoch": 0.2098092643051771, "grad_norm": 27.828341457554075, "learning_rate": 1.3936651583710408e-05, "loss": 0.2721, "step": 154 }, { "epoch": 0.2111716621253406, "grad_norm": 8.24249672549878, "learning_rate": 1.4027149321266968e-05, "loss": 0.2509, "step": 155 }, { "epoch": 0.2125340599455041, "grad_norm": 16.30702389570065, "learning_rate": 1.4117647058823532e-05, "loss": 0.2313, "step": 156 }, { "epoch": 0.21389645776566757, "grad_norm": 27.911357667483017, "learning_rate": 1.4208144796380091e-05, "loss": 0.2988, "step": 157 }, { "epoch": 0.21525885558583105, "grad_norm": 25.91316858669762, "learning_rate": 1.4298642533936653e-05, "loss": 0.2267, "step": 158 }, { "epoch": 0.21662125340599456, "grad_norm": 38.860421455635965, "learning_rate": 1.4389140271493213e-05, "loss": 0.3076, "step": 159 }, { "epoch": 0.21798365122615804, "grad_norm": 19.422413339593305, "learning_rate": 1.4479638009049775e-05, "loss": 0.2737, "step": 160 }, { "epoch": 0.21934604904632152, "grad_norm": 39.9817346895607, "learning_rate": 1.4570135746606336e-05, "loss": 0.3149, "step": 161 }, { "epoch": 0.22070844686648503, "grad_norm": 9.536402707243134, "learning_rate": 1.4660633484162896e-05, "loss": 0.2839, "step": 162 }, { "epoch": 0.2220708446866485, "grad_norm": 42.552378713994486, "learning_rate": 1.475113122171946e-05, "loss": 0.3077, "step": 163 }, { "epoch": 0.22343324250681199, "grad_norm": 13.390876065752051, "learning_rate": 1.484162895927602e-05, "loss": 0.3168, "step": 164 }, { "epoch": 0.22479564032697547, "grad_norm": 36.0993178880162, "learning_rate": 1.4932126696832581e-05, "loss": 0.2955, "step": 165 }, { "epoch": 0.22615803814713897, "grad_norm": 15.622502446541485, "learning_rate": 1.502262443438914e-05, "loss": 0.2795, "step": 166 }, { "epoch": 0.22752043596730245, "grad_norm": 24.846198865492113, "learning_rate": 1.5113122171945702e-05, "loss": 0.2688, "step": 167 }, { "epoch": 0.22888283378746593, "grad_norm": 2.466000047454243, "learning_rate": 1.5203619909502262e-05, "loss": 0.2984, "step": 168 }, { "epoch": 0.23024523160762944, "grad_norm": 8.132893273415087, "learning_rate": 1.5294117647058822e-05, "loss": 0.2559, "step": 169 }, { "epoch": 0.23160762942779292, "grad_norm": 15.19184666643854, "learning_rate": 1.5384615384615387e-05, "loss": 0.2602, "step": 170 }, { "epoch": 0.2329700272479564, "grad_norm": 3.7562222009094293, "learning_rate": 1.547511312217195e-05, "loss": 0.2585, "step": 171 }, { "epoch": 0.23433242506811988, "grad_norm": 4.730837525251791, "learning_rate": 1.5565610859728507e-05, "loss": 0.2214, "step": 172 }, { "epoch": 0.2356948228882834, "grad_norm": 8.25680953944904, "learning_rate": 1.565610859728507e-05, "loss": 0.2552, "step": 173 }, { "epoch": 0.23705722070844687, "grad_norm": 5.402341055548126, "learning_rate": 1.574660633484163e-05, "loss": 0.2468, "step": 174 }, { "epoch": 0.23841961852861035, "grad_norm": 18.604707633669207, "learning_rate": 1.5837104072398192e-05, "loss": 0.2944, "step": 175 }, { "epoch": 0.23978201634877383, "grad_norm": 16.420587364958596, "learning_rate": 1.592760180995475e-05, "loss": 0.2162, "step": 176 }, { "epoch": 0.24114441416893734, "grad_norm": 23.87567249219404, "learning_rate": 1.6018099547511315e-05, "loss": 0.2521, "step": 177 }, { "epoch": 0.24250681198910082, "grad_norm": 21.981769476305107, "learning_rate": 1.6108597285067877e-05, "loss": 0.2667, "step": 178 }, { "epoch": 0.2438692098092643, "grad_norm": 24.25265523641252, "learning_rate": 1.6199095022624435e-05, "loss": 0.3238, "step": 179 }, { "epoch": 0.2452316076294278, "grad_norm": 1.915056990783849, "learning_rate": 1.6289592760180996e-05, "loss": 0.1965, "step": 180 }, { "epoch": 0.24659400544959129, "grad_norm": 7.17330530808822, "learning_rate": 1.6380090497737558e-05, "loss": 0.2243, "step": 181 }, { "epoch": 0.24795640326975477, "grad_norm": 5.114593293859993, "learning_rate": 1.647058823529412e-05, "loss": 0.2161, "step": 182 }, { "epoch": 0.24931880108991825, "grad_norm": 9.627177771049459, "learning_rate": 1.6561085972850678e-05, "loss": 0.245, "step": 183 }, { "epoch": 0.2506811989100817, "grad_norm": 3.1174754974732046, "learning_rate": 1.6651583710407243e-05, "loss": 0.2254, "step": 184 }, { "epoch": 0.25204359673024523, "grad_norm": 7.502093308148323, "learning_rate": 1.6742081447963804e-05, "loss": 0.255, "step": 185 }, { "epoch": 0.25340599455040874, "grad_norm": 10.987363812812953, "learning_rate": 1.6832579185520363e-05, "loss": 0.2043, "step": 186 }, { "epoch": 0.2547683923705722, "grad_norm": 4.764703531524343, "learning_rate": 1.6923076923076924e-05, "loss": 0.1935, "step": 187 }, { "epoch": 0.2561307901907357, "grad_norm": 3.894069671721759, "learning_rate": 1.7013574660633486e-05, "loss": 0.217, "step": 188 }, { "epoch": 0.2574931880108992, "grad_norm": 20.312612064307128, "learning_rate": 1.7104072398190047e-05, "loss": 0.2495, "step": 189 }, { "epoch": 0.25885558583106266, "grad_norm": 8.052021953842951, "learning_rate": 1.719457013574661e-05, "loss": 0.2515, "step": 190 }, { "epoch": 0.26021798365122617, "grad_norm": 24.78605468050746, "learning_rate": 1.728506787330317e-05, "loss": 0.2285, "step": 191 }, { "epoch": 0.2615803814713896, "grad_norm": 14.034291570547666, "learning_rate": 1.737556561085973e-05, "loss": 0.2568, "step": 192 }, { "epoch": 0.26294277929155313, "grad_norm": 15.16057050504555, "learning_rate": 1.746606334841629e-05, "loss": 0.1612, "step": 193 }, { "epoch": 0.26430517711171664, "grad_norm": 3.383695227242225, "learning_rate": 1.7556561085972852e-05, "loss": 0.2689, "step": 194 }, { "epoch": 0.2656675749318801, "grad_norm": 17.44258283588901, "learning_rate": 1.7647058823529414e-05, "loss": 0.2654, "step": 195 }, { "epoch": 0.2670299727520436, "grad_norm": 8.144336089842438, "learning_rate": 1.7737556561085972e-05, "loss": 0.235, "step": 196 }, { "epoch": 0.2683923705722071, "grad_norm": 12.829289781875827, "learning_rate": 1.7828054298642537e-05, "loss": 0.2036, "step": 197 }, { "epoch": 0.26975476839237056, "grad_norm": 8.824485068709249, "learning_rate": 1.79185520361991e-05, "loss": 0.2225, "step": 198 }, { "epoch": 0.27111716621253407, "grad_norm": 2.682530369981412, "learning_rate": 1.8009049773755657e-05, "loss": 0.2559, "step": 199 }, { "epoch": 0.2724795640326976, "grad_norm": 19.88385663218789, "learning_rate": 1.8099547511312218e-05, "loss": 0.2899, "step": 200 }, { "epoch": 0.273841961852861, "grad_norm": 2.87618428607935, "learning_rate": 1.819004524886878e-05, "loss": 0.2118, "step": 201 }, { "epoch": 0.27520435967302453, "grad_norm": 29.347637139900037, "learning_rate": 1.828054298642534e-05, "loss": 0.2541, "step": 202 }, { "epoch": 0.276566757493188, "grad_norm": 4.110112367812807, "learning_rate": 1.83710407239819e-05, "loss": 0.2092, "step": 203 }, { "epoch": 0.2779291553133515, "grad_norm": 16.305122493170625, "learning_rate": 1.8461538461538465e-05, "loss": 0.2371, "step": 204 }, { "epoch": 0.279291553133515, "grad_norm": 16.080280521895688, "learning_rate": 1.8552036199095026e-05, "loss": 0.2026, "step": 205 }, { "epoch": 0.28065395095367845, "grad_norm": 35.0516654818093, "learning_rate": 1.8642533936651584e-05, "loss": 0.2853, "step": 206 }, { "epoch": 0.28201634877384196, "grad_norm": 6.4466105739807285, "learning_rate": 1.8733031674208146e-05, "loss": 0.2337, "step": 207 }, { "epoch": 0.28337874659400547, "grad_norm": 28.413802219630394, "learning_rate": 1.8823529411764708e-05, "loss": 0.1931, "step": 208 }, { "epoch": 0.2847411444141689, "grad_norm": 10.88093990533548, "learning_rate": 1.891402714932127e-05, "loss": 0.1997, "step": 209 }, { "epoch": 0.28610354223433243, "grad_norm": 17.797170933147296, "learning_rate": 1.9004524886877827e-05, "loss": 0.2045, "step": 210 }, { "epoch": 0.28746594005449594, "grad_norm": 25.488591385127677, "learning_rate": 1.9095022624434392e-05, "loss": 0.3003, "step": 211 }, { "epoch": 0.2888283378746594, "grad_norm": 26.185019229353607, "learning_rate": 1.9185520361990954e-05, "loss": 0.2643, "step": 212 }, { "epoch": 0.2901907356948229, "grad_norm": 17.709566584341708, "learning_rate": 1.9276018099547512e-05, "loss": 0.1959, "step": 213 }, { "epoch": 0.29155313351498635, "grad_norm": 20.07458363395164, "learning_rate": 1.9366515837104074e-05, "loss": 0.2371, "step": 214 }, { "epoch": 0.29291553133514986, "grad_norm": 25.904681883947816, "learning_rate": 1.9457013574660635e-05, "loss": 0.286, "step": 215 }, { "epoch": 0.29427792915531337, "grad_norm": 30.353699119034427, "learning_rate": 1.9547511312217197e-05, "loss": 0.2536, "step": 216 }, { "epoch": 0.2956403269754768, "grad_norm": 27.052849705542794, "learning_rate": 1.9638009049773755e-05, "loss": 0.2262, "step": 217 }, { "epoch": 0.2970027247956403, "grad_norm": 27.00773411734249, "learning_rate": 1.972850678733032e-05, "loss": 0.2128, "step": 218 }, { "epoch": 0.29836512261580383, "grad_norm": 36.3201418614354, "learning_rate": 1.981900452488688e-05, "loss": 0.1873, "step": 219 }, { "epoch": 0.2997275204359673, "grad_norm": 16.589170276255135, "learning_rate": 1.990950226244344e-05, "loss": 0.2645, "step": 220 }, { "epoch": 0.3010899182561308, "grad_norm": 52.078842885713826, "learning_rate": 2e-05, "loss": 0.2468, "step": 221 }, { "epoch": 0.3024523160762943, "grad_norm": 19.760298954403133, "learning_rate": 1.9999999026285234e-05, "loss": 0.2733, "step": 222 }, { "epoch": 0.30381471389645776, "grad_norm": 47.28609657916579, "learning_rate": 1.999999610514112e-05, "loss": 0.2496, "step": 223 }, { "epoch": 0.30517711171662126, "grad_norm": 16.351574396878664, "learning_rate": 1.999999123656823e-05, "loss": 0.1923, "step": 224 }, { "epoch": 0.3065395095367847, "grad_norm": 45.47122489491713, "learning_rate": 1.9999984420567506e-05, "loss": 0.2775, "step": 225 }, { "epoch": 0.3079019073569482, "grad_norm": 4.041973315047657, "learning_rate": 1.9999975657140286e-05, "loss": 0.2206, "step": 226 }, { "epoch": 0.30926430517711173, "grad_norm": 49.46529454303562, "learning_rate": 1.9999964946288268e-05, "loss": 0.2119, "step": 227 }, { "epoch": 0.3106267029972752, "grad_norm": 8.778679748066391, "learning_rate": 1.9999952288013544e-05, "loss": 0.228, "step": 228 }, { "epoch": 0.3119891008174387, "grad_norm": 56.87605812939803, "learning_rate": 1.999993768231857e-05, "loss": 0.2606, "step": 229 }, { "epoch": 0.3133514986376022, "grad_norm": 13.877900134463925, "learning_rate": 1.9999921129206198e-05, "loss": 0.1719, "step": 230 }, { "epoch": 0.31471389645776565, "grad_norm": 40.2906530279512, "learning_rate": 1.999990262867965e-05, "loss": 0.2724, "step": 231 }, { "epoch": 0.31607629427792916, "grad_norm": 24.06023499019854, "learning_rate": 1.9999882180742532e-05, "loss": 0.1892, "step": 232 }, { "epoch": 0.31743869209809267, "grad_norm": 27.693162611497556, "learning_rate": 1.999985978539882e-05, "loss": 0.2182, "step": 233 }, { "epoch": 0.3188010899182561, "grad_norm": 36.3559650399867, "learning_rate": 1.999983544265288e-05, "loss": 0.2184, "step": 234 }, { "epoch": 0.3201634877384196, "grad_norm": 21.229854670875316, "learning_rate": 1.9999809152509447e-05, "loss": 0.1479, "step": 235 }, { "epoch": 0.3215258855585831, "grad_norm": 36.165511711622564, "learning_rate": 1.9999780914973646e-05, "loss": 0.1794, "step": 236 }, { "epoch": 0.3228882833787466, "grad_norm": 6.069575240249152, "learning_rate": 1.9999750730050976e-05, "loss": 0.2021, "step": 237 }, { "epoch": 0.3242506811989101, "grad_norm": 43.59141438253358, "learning_rate": 1.9999718597747312e-05, "loss": 0.2368, "step": 238 }, { "epoch": 0.32561307901907355, "grad_norm": 6.1201840918282455, "learning_rate": 1.9999684518068916e-05, "loss": 0.1174, "step": 239 }, { "epoch": 0.32697547683923706, "grad_norm": 34.69018115063426, "learning_rate": 1.999964849102242e-05, "loss": 0.2525, "step": 240 }, { "epoch": 0.32833787465940056, "grad_norm": 24.276596351541226, "learning_rate": 1.999961051661484e-05, "loss": 0.2313, "step": 241 }, { "epoch": 0.329700272479564, "grad_norm": 36.16119563591004, "learning_rate": 1.9999570594853575e-05, "loss": 0.2474, "step": 242 }, { "epoch": 0.3310626702997275, "grad_norm": 26.806108502699534, "learning_rate": 1.9999528725746398e-05, "loss": 0.1921, "step": 243 }, { "epoch": 0.33242506811989103, "grad_norm": 12.944651866333887, "learning_rate": 1.9999484909301463e-05, "loss": 0.1564, "step": 244 }, { "epoch": 0.3337874659400545, "grad_norm": 43.639735673806605, "learning_rate": 1.9999439145527305e-05, "loss": 0.2417, "step": 245 }, { "epoch": 0.335149863760218, "grad_norm": 3.909368149861113, "learning_rate": 1.999939143443283e-05, "loss": 0.1451, "step": 246 }, { "epoch": 0.33651226158038144, "grad_norm": 42.74205839883459, "learning_rate": 1.9999341776027333e-05, "loss": 0.2467, "step": 247 }, { "epoch": 0.33787465940054495, "grad_norm": 11.751348349836602, "learning_rate": 1.9999290170320487e-05, "loss": 0.2507, "step": 248 }, { "epoch": 0.33923705722070846, "grad_norm": 20.70939982580437, "learning_rate": 1.999923661732234e-05, "loss": 0.163, "step": 249 }, { "epoch": 0.3405994550408719, "grad_norm": 23.981623207369633, "learning_rate": 1.999918111704332e-05, "loss": 0.2423, "step": 250 }, { "epoch": 0.3419618528610354, "grad_norm": 9.455616422194403, "learning_rate": 1.999912366949423e-05, "loss": 0.2328, "step": 251 }, { "epoch": 0.34332425068119893, "grad_norm": 20.476050791506854, "learning_rate": 1.999906427468627e-05, "loss": 0.2149, "step": 252 }, { "epoch": 0.3446866485013624, "grad_norm": 18.497503497293682, "learning_rate": 1.9999002932631e-05, "loss": 0.2093, "step": 253 }, { "epoch": 0.3460490463215259, "grad_norm": 22.424615093848278, "learning_rate": 1.9998939643340364e-05, "loss": 0.1862, "step": 254 }, { "epoch": 0.3474114441416894, "grad_norm": 28.507132028466827, "learning_rate": 1.999887440682669e-05, "loss": 0.2152, "step": 255 }, { "epoch": 0.34877384196185285, "grad_norm": 9.517231715119904, "learning_rate": 1.9998807223102683e-05, "loss": 0.1537, "step": 256 }, { "epoch": 0.35013623978201636, "grad_norm": 16.295292609078796, "learning_rate": 1.9998738092181423e-05, "loss": 0.1827, "step": 257 }, { "epoch": 0.35149863760217986, "grad_norm": 17.54369855421946, "learning_rate": 1.9998667014076375e-05, "loss": 0.2051, "step": 258 }, { "epoch": 0.3528610354223433, "grad_norm": 4.5087830914382785, "learning_rate": 1.9998593988801383e-05, "loss": 0.1529, "step": 259 }, { "epoch": 0.3542234332425068, "grad_norm": 22.27040567793319, "learning_rate": 1.999851901637066e-05, "loss": 0.1842, "step": 260 }, { "epoch": 0.3555858310626703, "grad_norm": 16.819462059312258, "learning_rate": 1.999844209679882e-05, "loss": 0.1671, "step": 261 }, { "epoch": 0.3569482288828338, "grad_norm": 9.539844396374185, "learning_rate": 1.999836323010083e-05, "loss": 0.2029, "step": 262 }, { "epoch": 0.3583106267029973, "grad_norm": 33.643054127404376, "learning_rate": 1.9998282416292057e-05, "loss": 0.1792, "step": 263 }, { "epoch": 0.35967302452316074, "grad_norm": 3.6447089743640806, "learning_rate": 1.9998199655388235e-05, "loss": 0.171, "step": 264 }, { "epoch": 0.36103542234332425, "grad_norm": 20.571329988250397, "learning_rate": 1.9998114947405476e-05, "loss": 0.156, "step": 265 }, { "epoch": 0.36239782016348776, "grad_norm": 18.347262949828604, "learning_rate": 1.9998028292360287e-05, "loss": 0.1406, "step": 266 }, { "epoch": 0.3637602179836512, "grad_norm": 2.342085575226063, "learning_rate": 1.9997939690269538e-05, "loss": 0.146, "step": 267 }, { "epoch": 0.3651226158038147, "grad_norm": 23.719939105174703, "learning_rate": 1.9997849141150486e-05, "loss": 0.1458, "step": 268 }, { "epoch": 0.36648501362397823, "grad_norm": 19.68658737585981, "learning_rate": 1.999775664502076e-05, "loss": 0.1762, "step": 269 }, { "epoch": 0.3678474114441417, "grad_norm": 23.052845865524855, "learning_rate": 1.9997662201898376e-05, "loss": 0.1825, "step": 270 }, { "epoch": 0.3692098092643052, "grad_norm": 29.62378650135902, "learning_rate": 1.9997565811801727e-05, "loss": 0.2579, "step": 271 }, { "epoch": 0.37057220708446864, "grad_norm": 15.640419816467231, "learning_rate": 1.9997467474749583e-05, "loss": 0.1912, "step": 272 }, { "epoch": 0.37193460490463215, "grad_norm": 18.55452922336791, "learning_rate": 1.9997367190761092e-05, "loss": 0.2262, "step": 273 }, { "epoch": 0.37329700272479566, "grad_norm": 8.161098113585062, "learning_rate": 1.9997264959855788e-05, "loss": 0.1988, "step": 274 }, { "epoch": 0.3746594005449591, "grad_norm": 4.17061675922946, "learning_rate": 1.999716078205358e-05, "loss": 0.154, "step": 275 }, { "epoch": 0.3760217983651226, "grad_norm": 22.594089587873942, "learning_rate": 1.999705465737475e-05, "loss": 0.1756, "step": 276 }, { "epoch": 0.3773841961852861, "grad_norm": 12.901811630444842, "learning_rate": 1.9996946585839974e-05, "loss": 0.1964, "step": 277 }, { "epoch": 0.3787465940054496, "grad_norm": 12.941726617280857, "learning_rate": 1.999683656747029e-05, "loss": 0.1661, "step": 278 }, { "epoch": 0.3801089918256131, "grad_norm": 20.988975630607136, "learning_rate": 1.9996724602287124e-05, "loss": 0.1871, "step": 279 }, { "epoch": 0.3814713896457766, "grad_norm": 2.5389340754593217, "learning_rate": 1.9996610690312287e-05, "loss": 0.1508, "step": 280 }, { "epoch": 0.38283378746594005, "grad_norm": 22.207492283803514, "learning_rate": 1.999649483156796e-05, "loss": 0.1481, "step": 281 }, { "epoch": 0.38419618528610355, "grad_norm": 8.869863398897406, "learning_rate": 1.99963770260767e-05, "loss": 0.168, "step": 282 }, { "epoch": 0.385558583106267, "grad_norm": 9.622857788623147, "learning_rate": 1.9996257273861452e-05, "loss": 0.153, "step": 283 }, { "epoch": 0.3869209809264305, "grad_norm": 11.092541354703858, "learning_rate": 1.9996135574945543e-05, "loss": 0.1791, "step": 284 }, { "epoch": 0.388283378746594, "grad_norm": 2.0317698140529865, "learning_rate": 1.9996011929352664e-05, "loss": 0.153, "step": 285 }, { "epoch": 0.3896457765667575, "grad_norm": 13.315528016134003, "learning_rate": 1.9995886337106897e-05, "loss": 0.1538, "step": 286 }, { "epoch": 0.391008174386921, "grad_norm": 4.413349847156077, "learning_rate": 1.9995758798232704e-05, "loss": 0.1447, "step": 287 }, { "epoch": 0.3923705722070845, "grad_norm": 3.06059716352152, "learning_rate": 1.9995629312754916e-05, "loss": 0.1457, "step": 288 }, { "epoch": 0.39373297002724794, "grad_norm": 18.872231582344046, "learning_rate": 1.9995497880698757e-05, "loss": 0.1838, "step": 289 }, { "epoch": 0.39509536784741145, "grad_norm": 6.478502370783577, "learning_rate": 1.9995364502089815e-05, "loss": 0.1465, "step": 290 }, { "epoch": 0.39645776566757496, "grad_norm": 9.521492637407968, "learning_rate": 1.9995229176954067e-05, "loss": 0.1346, "step": 291 }, { "epoch": 0.3978201634877384, "grad_norm": 23.06996740248916, "learning_rate": 1.999509190531787e-05, "loss": 0.1685, "step": 292 }, { "epoch": 0.3991825613079019, "grad_norm": 10.011392986293085, "learning_rate": 1.9994952687207957e-05, "loss": 0.139, "step": 293 }, { "epoch": 0.40054495912806537, "grad_norm": 16.922972586508934, "learning_rate": 1.999481152265143e-05, "loss": 0.1842, "step": 294 }, { "epoch": 0.4019073569482289, "grad_norm": 4.387521557704083, "learning_rate": 1.999466841167579e-05, "loss": 0.2129, "step": 295 }, { "epoch": 0.4032697547683924, "grad_norm": 6.509298218524515, "learning_rate": 1.9994523354308904e-05, "loss": 0.1324, "step": 296 }, { "epoch": 0.40463215258855584, "grad_norm": 21.47662533490085, "learning_rate": 1.999437635057902e-05, "loss": 0.1961, "step": 297 }, { "epoch": 0.40599455040871935, "grad_norm": 12.813961190287216, "learning_rate": 1.9994227400514765e-05, "loss": 0.1974, "step": 298 }, { "epoch": 0.40735694822888285, "grad_norm": 9.468915775070345, "learning_rate": 1.9994076504145147e-05, "loss": 0.1463, "step": 299 }, { "epoch": 0.4087193460490463, "grad_norm": 4.7184285584275605, "learning_rate": 1.9993923661499553e-05, "loss": 0.1629, "step": 300 }, { "epoch": 0.4100817438692098, "grad_norm": 7.3019388453574905, "learning_rate": 1.999376887260775e-05, "loss": 0.1244, "step": 301 }, { "epoch": 0.4114441416893733, "grad_norm": 2.0574228289271805, "learning_rate": 1.9993612137499875e-05, "loss": 0.1234, "step": 302 }, { "epoch": 0.4128065395095368, "grad_norm": 3.0035266999606987, "learning_rate": 1.999345345620646e-05, "loss": 0.163, "step": 303 }, { "epoch": 0.4141689373297003, "grad_norm": 4.350485290237615, "learning_rate": 1.99932928287584e-05, "loss": 0.1708, "step": 304 }, { "epoch": 0.41553133514986373, "grad_norm": 2.8469542349860406, "learning_rate": 1.999313025518698e-05, "loss": 0.1564, "step": 305 }, { "epoch": 0.41689373297002724, "grad_norm": 2.844112349042407, "learning_rate": 1.9992965735523856e-05, "loss": 0.1372, "step": 306 }, { "epoch": 0.41825613079019075, "grad_norm": 4.102468103822996, "learning_rate": 1.999279926980107e-05, "loss": 0.1551, "step": 307 }, { "epoch": 0.4196185286103542, "grad_norm": 3.6991917399040872, "learning_rate": 1.999263085805104e-05, "loss": 0.1042, "step": 308 }, { "epoch": 0.4209809264305177, "grad_norm": 3.5110733297611847, "learning_rate": 1.9992460500306566e-05, "loss": 0.1555, "step": 309 }, { "epoch": 0.4223433242506812, "grad_norm": 3.657549705581189, "learning_rate": 1.9992288196600818e-05, "loss": 0.1388, "step": 310 }, { "epoch": 0.42370572207084467, "grad_norm": 2.23165245188526, "learning_rate": 1.999211394696735e-05, "loss": 0.1307, "step": 311 }, { "epoch": 0.4250681198910082, "grad_norm": 3.2922795067527044, "learning_rate": 1.9991937751440104e-05, "loss": 0.1723, "step": 312 }, { "epoch": 0.4264305177111717, "grad_norm": 9.933533929950572, "learning_rate": 1.9991759610053388e-05, "loss": 0.1491, "step": 313 }, { "epoch": 0.42779291553133514, "grad_norm": 2.366503923887021, "learning_rate": 1.9991579522841894e-05, "loss": 0.1333, "step": 314 }, { "epoch": 0.42915531335149865, "grad_norm": 3.350079145556657, "learning_rate": 1.9991397489840693e-05, "loss": 0.1596, "step": 315 }, { "epoch": 0.4305177111716621, "grad_norm": 11.391861100295085, "learning_rate": 1.9991213511085235e-05, "loss": 0.1453, "step": 316 }, { "epoch": 0.4318801089918256, "grad_norm": 3.98589288560827, "learning_rate": 1.9991027586611346e-05, "loss": 0.1523, "step": 317 }, { "epoch": 0.4332425068119891, "grad_norm": 10.146938709189222, "learning_rate": 1.9990839716455234e-05, "loss": 0.1568, "step": 318 }, { "epoch": 0.43460490463215257, "grad_norm": 13.901656567870011, "learning_rate": 1.9990649900653493e-05, "loss": 0.1208, "step": 319 }, { "epoch": 0.4359673024523161, "grad_norm": 4.44072866453597, "learning_rate": 1.9990458139243075e-05, "loss": 0.1674, "step": 320 }, { "epoch": 0.4373297002724796, "grad_norm": 25.2176444993629, "learning_rate": 1.9990264432261335e-05, "loss": 0.1588, "step": 321 }, { "epoch": 0.43869209809264303, "grad_norm": 7.457391613724245, "learning_rate": 1.9990068779745994e-05, "loss": 0.1444, "step": 322 }, { "epoch": 0.44005449591280654, "grad_norm": 17.42440036553369, "learning_rate": 1.998987118173515e-05, "loss": 0.1539, "step": 323 }, { "epoch": 0.44141689373297005, "grad_norm": 26.926572963214955, "learning_rate": 1.9989671638267288e-05, "loss": 0.1182, "step": 324 }, { "epoch": 0.4427792915531335, "grad_norm": 1.6908727359567979, "learning_rate": 1.9989470149381264e-05, "loss": 0.1261, "step": 325 }, { "epoch": 0.444141689373297, "grad_norm": 31.633074816894613, "learning_rate": 1.9989266715116316e-05, "loss": 0.1606, "step": 326 }, { "epoch": 0.44550408719346046, "grad_norm": 16.825034583061512, "learning_rate": 1.998906133551207e-05, "loss": 0.1352, "step": 327 }, { "epoch": 0.44686648501362397, "grad_norm": 21.34876835760916, "learning_rate": 1.9988854010608514e-05, "loss": 0.194, "step": 328 }, { "epoch": 0.4482288828337875, "grad_norm": 26.63807387473857, "learning_rate": 1.9988644740446022e-05, "loss": 0.187, "step": 329 }, { "epoch": 0.44959128065395093, "grad_norm": 1.6834728630809066, "learning_rate": 1.9988433525065353e-05, "loss": 0.1127, "step": 330 }, { "epoch": 0.45095367847411444, "grad_norm": 21.275570279672497, "learning_rate": 1.998822036450764e-05, "loss": 0.1798, "step": 331 }, { "epoch": 0.45231607629427795, "grad_norm": 16.606279349657836, "learning_rate": 1.998800525881439e-05, "loss": 0.155, "step": 332 }, { "epoch": 0.4536784741144414, "grad_norm": 11.820883583841447, "learning_rate": 1.9987788208027496e-05, "loss": 0.1551, "step": 333 }, { "epoch": 0.4550408719346049, "grad_norm": 12.843570591813421, "learning_rate": 1.9987569212189224e-05, "loss": 0.1266, "step": 334 }, { "epoch": 0.4564032697547684, "grad_norm": 14.015650857231682, "learning_rate": 1.998734827134223e-05, "loss": 0.1217, "step": 335 }, { "epoch": 0.45776566757493187, "grad_norm": 16.63417245794833, "learning_rate": 1.9987125385529528e-05, "loss": 0.1358, "step": 336 }, { "epoch": 0.4591280653950954, "grad_norm": 8.708761384690547, "learning_rate": 1.9986900554794536e-05, "loss": 0.1703, "step": 337 }, { "epoch": 0.4604904632152589, "grad_norm": 3.5220142796511227, "learning_rate": 1.9986673779181033e-05, "loss": 0.1545, "step": 338 }, { "epoch": 0.46185286103542234, "grad_norm": 5.386029207326769, "learning_rate": 1.9986445058733182e-05, "loss": 0.1446, "step": 339 }, { "epoch": 0.46321525885558584, "grad_norm": 3.8224615338309627, "learning_rate": 1.998621439349552e-05, "loss": 0.1254, "step": 340 }, { "epoch": 0.4645776566757493, "grad_norm": 4.229895605225783, "learning_rate": 1.9985981783512977e-05, "loss": 0.1149, "step": 341 }, { "epoch": 0.4659400544959128, "grad_norm": 6.907299321432419, "learning_rate": 1.9985747228830846e-05, "loss": 0.1476, "step": 342 }, { "epoch": 0.4673024523160763, "grad_norm": 4.236825078896579, "learning_rate": 1.9985510729494804e-05, "loss": 0.1537, "step": 343 }, { "epoch": 0.46866485013623976, "grad_norm": 6.427276665204539, "learning_rate": 1.998527228555091e-05, "loss": 0.1454, "step": 344 }, { "epoch": 0.47002724795640327, "grad_norm": 2.78233372516159, "learning_rate": 1.9985031897045602e-05, "loss": 0.1308, "step": 345 }, { "epoch": 0.4713896457765668, "grad_norm": 8.12677809979955, "learning_rate": 1.9984789564025686e-05, "loss": 0.1688, "step": 346 }, { "epoch": 0.47275204359673023, "grad_norm": 8.298665805418251, "learning_rate": 1.9984545286538362e-05, "loss": 0.1465, "step": 347 }, { "epoch": 0.47411444141689374, "grad_norm": 7.118330533120063, "learning_rate": 1.9984299064631197e-05, "loss": 0.1307, "step": 348 }, { "epoch": 0.47547683923705725, "grad_norm": 20.32989469288522, "learning_rate": 1.9984050898352147e-05, "loss": 0.1454, "step": 349 }, { "epoch": 0.4768392370572207, "grad_norm": 3.435902174073289, "learning_rate": 1.998380078774953e-05, "loss": 0.1158, "step": 350 }, { "epoch": 0.4782016348773842, "grad_norm": 17.53430680768268, "learning_rate": 1.9983548732872067e-05, "loss": 0.1885, "step": 351 }, { "epoch": 0.47956403269754766, "grad_norm": 5.594618365372545, "learning_rate": 1.9983294733768832e-05, "loss": 0.1957, "step": 352 }, { "epoch": 0.48092643051771117, "grad_norm": 6.794837679423735, "learning_rate": 1.9983038790489296e-05, "loss": 0.1461, "step": 353 }, { "epoch": 0.4822888283378747, "grad_norm": 11.842089067366512, "learning_rate": 1.99827809030833e-05, "loss": 0.149, "step": 354 }, { "epoch": 0.48365122615803813, "grad_norm": 2.4678243941112212, "learning_rate": 1.9982521071601062e-05, "loss": 0.1207, "step": 355 }, { "epoch": 0.48501362397820164, "grad_norm": 9.551928115947277, "learning_rate": 1.9982259296093192e-05, "loss": 0.1385, "step": 356 }, { "epoch": 0.48637602179836514, "grad_norm": 6.736955471127401, "learning_rate": 1.998199557661066e-05, "loss": 0.1116, "step": 357 }, { "epoch": 0.4877384196185286, "grad_norm": 3.204488247161076, "learning_rate": 1.998172991320483e-05, "loss": 0.1397, "step": 358 }, { "epoch": 0.4891008174386921, "grad_norm": 8.507848794620468, "learning_rate": 1.998146230592743e-05, "loss": 0.1472, "step": 359 }, { "epoch": 0.4904632152588556, "grad_norm": 7.437446783555749, "learning_rate": 1.9981192754830584e-05, "loss": 0.177, "step": 360 }, { "epoch": 0.49182561307901906, "grad_norm": 7.482581535691006, "learning_rate": 1.998092125996678e-05, "loss": 0.0955, "step": 361 }, { "epoch": 0.49318801089918257, "grad_norm": 3.4675704623794825, "learning_rate": 1.9980647821388888e-05, "loss": 0.1287, "step": 362 }, { "epoch": 0.494550408719346, "grad_norm": 7.428140684372201, "learning_rate": 1.998037243915016e-05, "loss": 0.1165, "step": 363 }, { "epoch": 0.49591280653950953, "grad_norm": 3.9897506060299963, "learning_rate": 1.9980095113304228e-05, "loss": 0.1374, "step": 364 }, { "epoch": 0.49727520435967304, "grad_norm": 2.3727819418853193, "learning_rate": 1.9979815843905098e-05, "loss": 0.1237, "step": 365 }, { "epoch": 0.4986376021798365, "grad_norm": 3.5731366817952064, "learning_rate": 1.9979534631007152e-05, "loss": 0.2147, "step": 366 }, { "epoch": 0.5, "grad_norm": 6.822793002836057, "learning_rate": 1.9979251474665154e-05, "loss": 0.1272, "step": 367 }, { "epoch": 0.5013623978201635, "grad_norm": 5.193705996670866, "learning_rate": 1.9978966374934255e-05, "loss": 0.111, "step": 368 }, { "epoch": 0.502724795640327, "grad_norm": 7.363585310523062, "learning_rate": 1.9978679331869967e-05, "loss": 0.1387, "step": 369 }, { "epoch": 0.5040871934604905, "grad_norm": 9.672179548580063, "learning_rate": 1.9978390345528193e-05, "loss": 0.1218, "step": 370 }, { "epoch": 0.5054495912806539, "grad_norm": 5.874223610286406, "learning_rate": 1.9978099415965214e-05, "loss": 0.1734, "step": 371 }, { "epoch": 0.5068119891008175, "grad_norm": 10.553378561289023, "learning_rate": 1.9977806543237683e-05, "loss": 0.1156, "step": 372 }, { "epoch": 0.5081743869209809, "grad_norm": 2.266872373361413, "learning_rate": 1.997751172740263e-05, "loss": 0.1314, "step": 373 }, { "epoch": 0.5095367847411444, "grad_norm": 11.567803253314707, "learning_rate": 1.997721496851748e-05, "loss": 0.1563, "step": 374 }, { "epoch": 0.510899182561308, "grad_norm": 9.09368246969193, "learning_rate": 1.9976916266640018e-05, "loss": 0.1605, "step": 375 }, { "epoch": 0.5122615803814714, "grad_norm": 11.726799977599324, "learning_rate": 1.9976615621828414e-05, "loss": 0.1563, "step": 376 }, { "epoch": 0.5136239782016349, "grad_norm": 8.372016295883585, "learning_rate": 1.9976313034141214e-05, "loss": 0.1297, "step": 377 }, { "epoch": 0.5149863760217984, "grad_norm": 3.2012828056412768, "learning_rate": 1.9976008503637352e-05, "loss": 0.1145, "step": 378 }, { "epoch": 0.5163487738419619, "grad_norm": 6.4263113623927515, "learning_rate": 1.9975702030376127e-05, "loss": 0.1061, "step": 379 }, { "epoch": 0.5177111716621253, "grad_norm": 8.945022899754084, "learning_rate": 1.9975393614417225e-05, "loss": 0.1086, "step": 380 }, { "epoch": 0.5190735694822888, "grad_norm": 2.22081405346318, "learning_rate": 1.9975083255820706e-05, "loss": 0.0805, "step": 381 }, { "epoch": 0.5204359673024523, "grad_norm": 4.878634742489735, "learning_rate": 1.9974770954647015e-05, "loss": 0.1821, "step": 382 }, { "epoch": 0.5217983651226158, "grad_norm": 7.792371348619723, "learning_rate": 1.9974456710956965e-05, "loss": 0.1472, "step": 383 }, { "epoch": 0.5231607629427792, "grad_norm": 10.07427117729835, "learning_rate": 1.9974140524811757e-05, "loss": 0.1469, "step": 384 }, { "epoch": 0.5245231607629428, "grad_norm": 9.70747767459945, "learning_rate": 1.997382239627296e-05, "loss": 0.1688, "step": 385 }, { "epoch": 0.5258855585831063, "grad_norm": 16.18325941915235, "learning_rate": 1.9973502325402533e-05, "loss": 0.2079, "step": 386 }, { "epoch": 0.5272479564032697, "grad_norm": 4.02884491185807, "learning_rate": 1.99731803122628e-05, "loss": 0.1292, "step": 387 }, { "epoch": 0.5286103542234333, "grad_norm": 10.955102272971343, "learning_rate": 1.9972856356916485e-05, "loss": 0.1527, "step": 388 }, { "epoch": 0.5299727520435967, "grad_norm": 10.648611452333519, "learning_rate": 1.9972530459426663e-05, "loss": 0.1108, "step": 389 }, { "epoch": 0.5313351498637602, "grad_norm": 8.32112489836537, "learning_rate": 1.9972202619856807e-05, "loss": 0.1522, "step": 390 }, { "epoch": 0.5326975476839237, "grad_norm": 18.19279406356402, "learning_rate": 1.9971872838270762e-05, "loss": 0.1547, "step": 391 }, { "epoch": 0.5340599455040872, "grad_norm": 3.2550088983814454, "learning_rate": 1.9971541114732743e-05, "loss": 0.1454, "step": 392 }, { "epoch": 0.5354223433242506, "grad_norm": 6.339701277299768, "learning_rate": 1.9971207449307355e-05, "loss": 0.1578, "step": 393 }, { "epoch": 0.5367847411444142, "grad_norm": 4.967931590528822, "learning_rate": 1.997087184205958e-05, "loss": 0.117, "step": 394 }, { "epoch": 0.5381471389645777, "grad_norm": 7.150017860669629, "learning_rate": 1.997053429305477e-05, "loss": 0.124, "step": 395 }, { "epoch": 0.5395095367847411, "grad_norm": 13.392270639538584, "learning_rate": 1.9970194802358665e-05, "loss": 0.1953, "step": 396 }, { "epoch": 0.5408719346049047, "grad_norm": 5.4027493703915574, "learning_rate": 1.9969853370037376e-05, "loss": 0.1636, "step": 397 }, { "epoch": 0.5422343324250681, "grad_norm": 15.12164156281947, "learning_rate": 1.9969509996157398e-05, "loss": 0.1564, "step": 398 }, { "epoch": 0.5435967302452316, "grad_norm": 15.239362630817231, "learning_rate": 1.9969164680785594e-05, "loss": 0.1326, "step": 399 }, { "epoch": 0.5449591280653951, "grad_norm": 13.745316105464838, "learning_rate": 1.9968817423989217e-05, "loss": 0.1484, "step": 400 }, { "epoch": 0.5463215258855586, "grad_norm": 17.585575143329187, "learning_rate": 1.996846822583589e-05, "loss": 0.1392, "step": 401 }, { "epoch": 0.547683923705722, "grad_norm": 5.491639275845271, "learning_rate": 1.9968117086393616e-05, "loss": 0.1087, "step": 402 }, { "epoch": 0.5490463215258855, "grad_norm": 26.127210951330166, "learning_rate": 1.9967764005730785e-05, "loss": 0.1831, "step": 403 }, { "epoch": 0.5504087193460491, "grad_norm": 9.14860323431144, "learning_rate": 1.9967408983916145e-05, "loss": 0.1529, "step": 404 }, { "epoch": 0.5517711171662125, "grad_norm": 12.70575517976176, "learning_rate": 1.996705202101884e-05, "loss": 0.1791, "step": 405 }, { "epoch": 0.553133514986376, "grad_norm": 14.76701839997641, "learning_rate": 1.996669311710839e-05, "loss": 0.1369, "step": 406 }, { "epoch": 0.5544959128065395, "grad_norm": 5.993756887694709, "learning_rate": 1.9966332272254685e-05, "loss": 0.1069, "step": 407 }, { "epoch": 0.555858310626703, "grad_norm": 7.311995202823501, "learning_rate": 1.9965969486527993e-05, "loss": 0.1245, "step": 408 }, { "epoch": 0.5572207084468664, "grad_norm": 18.665142488901704, "learning_rate": 1.9965604759998972e-05, "loss": 0.121, "step": 409 }, { "epoch": 0.55858310626703, "grad_norm": 3.8021189990759243, "learning_rate": 1.9965238092738643e-05, "loss": 0.1031, "step": 410 }, { "epoch": 0.5599455040871935, "grad_norm": 15.358736463791406, "learning_rate": 1.9964869484818418e-05, "loss": 0.1134, "step": 411 }, { "epoch": 0.5613079019073569, "grad_norm": 4.416732497966334, "learning_rate": 1.9964498936310074e-05, "loss": 0.1254, "step": 412 }, { "epoch": 0.5626702997275205, "grad_norm": 3.3985795690198715, "learning_rate": 1.996412644728578e-05, "loss": 0.0986, "step": 413 }, { "epoch": 0.5640326975476839, "grad_norm": 3.7673126499762546, "learning_rate": 1.996375201781807e-05, "loss": 0.1381, "step": 414 }, { "epoch": 0.5653950953678474, "grad_norm": 4.645145765939826, "learning_rate": 1.996337564797986e-05, "loss": 0.1228, "step": 415 }, { "epoch": 0.5667574931880109, "grad_norm": 2.909966107726487, "learning_rate": 1.996299733784445e-05, "loss": 0.1255, "step": 416 }, { "epoch": 0.5681198910081744, "grad_norm": 3.2156460649978174, "learning_rate": 1.9962617087485518e-05, "loss": 0.1565, "step": 417 }, { "epoch": 0.5694822888283378, "grad_norm": 6.936898801135618, "learning_rate": 1.9962234896977103e-05, "loss": 0.1093, "step": 418 }, { "epoch": 0.5708446866485014, "grad_norm": 2.655704904682405, "learning_rate": 1.996185076639364e-05, "loss": 0.1095, "step": 419 }, { "epoch": 0.5722070844686649, "grad_norm": 7.784895790248206, "learning_rate": 1.996146469580994e-05, "loss": 0.1229, "step": 420 }, { "epoch": 0.5735694822888283, "grad_norm": 8.645629734772418, "learning_rate": 1.9961076685301176e-05, "loss": 0.1079, "step": 421 }, { "epoch": 0.5749318801089919, "grad_norm": 8.58034092395465, "learning_rate": 1.996068673494292e-05, "loss": 0.1614, "step": 422 }, { "epoch": 0.5762942779291553, "grad_norm": 6.162096255568699, "learning_rate": 1.9960294844811115e-05, "loss": 0.1083, "step": 423 }, { "epoch": 0.5776566757493188, "grad_norm": 1.5844720454378722, "learning_rate": 1.995990101498207e-05, "loss": 0.1101, "step": 424 }, { "epoch": 0.5790190735694822, "grad_norm": 5.8363151543798315, "learning_rate": 1.995950524553248e-05, "loss": 0.119, "step": 425 }, { "epoch": 0.5803814713896458, "grad_norm": 7.836622655678672, "learning_rate": 1.995910753653943e-05, "loss": 0.1289, "step": 426 }, { "epoch": 0.5817438692098093, "grad_norm": 4.427126469670073, "learning_rate": 1.995870788808036e-05, "loss": 0.1077, "step": 427 }, { "epoch": 0.5831062670299727, "grad_norm": 16.35709208908178, "learning_rate": 1.99583063002331e-05, "loss": 0.1386, "step": 428 }, { "epoch": 0.5844686648501363, "grad_norm": 5.258434654887555, "learning_rate": 1.9957902773075857e-05, "loss": 0.1003, "step": 429 }, { "epoch": 0.5858310626702997, "grad_norm": 10.435669505762167, "learning_rate": 1.995749730668722e-05, "loss": 0.1082, "step": 430 }, { "epoch": 0.5871934604904632, "grad_norm": 6.483331298279957, "learning_rate": 1.995708990114615e-05, "loss": 0.127, "step": 431 }, { "epoch": 0.5885558583106267, "grad_norm": 7.091865096501059, "learning_rate": 1.995668055653198e-05, "loss": 0.2085, "step": 432 }, { "epoch": 0.5899182561307902, "grad_norm": 15.166326951750268, "learning_rate": 1.995626927292443e-05, "loss": 0.1703, "step": 433 }, { "epoch": 0.5912806539509536, "grad_norm": 10.195839649152935, "learning_rate": 1.9955856050403596e-05, "loss": 0.1316, "step": 434 }, { "epoch": 0.5926430517711172, "grad_norm": 5.122897846300403, "learning_rate": 1.9955440889049947e-05, "loss": 0.1296, "step": 435 }, { "epoch": 0.5940054495912807, "grad_norm": 12.522607481352297, "learning_rate": 1.995502378894434e-05, "loss": 0.1118, "step": 436 }, { "epoch": 0.5953678474114441, "grad_norm": 8.052198582100564, "learning_rate": 1.9954604750167995e-05, "loss": 0.1285, "step": 437 }, { "epoch": 0.5967302452316077, "grad_norm": 8.421754451531918, "learning_rate": 1.995418377280252e-05, "loss": 0.1069, "step": 438 }, { "epoch": 0.5980926430517711, "grad_norm": 8.974005579704874, "learning_rate": 1.9953760856929896e-05, "loss": 0.1252, "step": 439 }, { "epoch": 0.5994550408719346, "grad_norm": 1.8943139509270512, "learning_rate": 1.9953336002632483e-05, "loss": 0.1254, "step": 440 }, { "epoch": 0.6008174386920981, "grad_norm": 7.2920719791201325, "learning_rate": 1.9952909209993018e-05, "loss": 0.134, "step": 441 }, { "epoch": 0.6021798365122616, "grad_norm": 5.389043794865126, "learning_rate": 1.995248047909462e-05, "loss": 0.0974, "step": 442 }, { "epoch": 0.603542234332425, "grad_norm": 1.9232739589693977, "learning_rate": 1.9952049810020774e-05, "loss": 0.0759, "step": 443 }, { "epoch": 0.6049046321525886, "grad_norm": 6.8673882319101835, "learning_rate": 1.9951617202855355e-05, "loss": 0.1157, "step": 444 }, { "epoch": 0.6062670299727521, "grad_norm": 8.982094518444482, "learning_rate": 1.9951182657682612e-05, "loss": 0.1198, "step": 445 }, { "epoch": 0.6076294277929155, "grad_norm": 12.222561002587218, "learning_rate": 1.9950746174587163e-05, "loss": 0.1181, "step": 446 }, { "epoch": 0.6089918256130791, "grad_norm": 6.180240415427373, "learning_rate": 1.9950307753654016e-05, "loss": 0.1176, "step": 447 }, { "epoch": 0.6103542234332425, "grad_norm": 3.7457001133676733, "learning_rate": 1.9949867394968546e-05, "loss": 0.1329, "step": 448 }, { "epoch": 0.611716621253406, "grad_norm": 9.360054821549662, "learning_rate": 1.9949425098616513e-05, "loss": 0.0755, "step": 449 }, { "epoch": 0.6130790190735694, "grad_norm": 6.845586233163633, "learning_rate": 1.994898086468405e-05, "loss": 0.1021, "step": 450 }, { "epoch": 0.614441416893733, "grad_norm": 10.048331631965604, "learning_rate": 1.994853469325767e-05, "loss": 0.1198, "step": 451 }, { "epoch": 0.6158038147138964, "grad_norm": 14.994671019544425, "learning_rate": 1.9948086584424258e-05, "loss": 0.114, "step": 452 }, { "epoch": 0.6171662125340599, "grad_norm": 3.949222229103057, "learning_rate": 1.994763653827108e-05, "loss": 0.1065, "step": 453 }, { "epoch": 0.6185286103542235, "grad_norm": 13.577339729639544, "learning_rate": 1.994718455488579e-05, "loss": 0.1228, "step": 454 }, { "epoch": 0.6198910081743869, "grad_norm": 4.687941665206499, "learning_rate": 1.994673063435639e-05, "loss": 0.1008, "step": 455 }, { "epoch": 0.6212534059945504, "grad_norm": 6.38300539127633, "learning_rate": 1.994627477677129e-05, "loss": 0.1012, "step": 456 }, { "epoch": 0.6226158038147139, "grad_norm": 11.262794265531486, "learning_rate": 1.994581698221927e-05, "loss": 0.1103, "step": 457 }, { "epoch": 0.6239782016348774, "grad_norm": 3.9721586541168774, "learning_rate": 1.9945357250789467e-05, "loss": 0.1442, "step": 458 }, { "epoch": 0.6253405994550408, "grad_norm": 3.8813445439240413, "learning_rate": 1.994489558257142e-05, "loss": 0.1094, "step": 459 }, { "epoch": 0.6267029972752044, "grad_norm": 8.410172037830446, "learning_rate": 1.9944431977655036e-05, "loss": 0.0875, "step": 460 }, { "epoch": 0.6280653950953679, "grad_norm": 4.594861933073476, "learning_rate": 1.9943966436130598e-05, "loss": 0.1403, "step": 461 }, { "epoch": 0.6294277929155313, "grad_norm": 4.889974026702112, "learning_rate": 1.9943498958088763e-05, "loss": 0.1096, "step": 462 }, { "epoch": 0.6307901907356949, "grad_norm": 3.7867923059743536, "learning_rate": 1.9943029543620573e-05, "loss": 0.129, "step": 463 }, { "epoch": 0.6321525885558583, "grad_norm": 3.7506993064226566, "learning_rate": 1.994255819281744e-05, "loss": 0.0971, "step": 464 }, { "epoch": 0.6335149863760218, "grad_norm": 6.379333532075522, "learning_rate": 1.9942084905771165e-05, "loss": 0.1136, "step": 465 }, { "epoch": 0.6348773841961853, "grad_norm": 7.651285729610336, "learning_rate": 1.9941609682573908e-05, "loss": 0.1732, "step": 466 }, { "epoch": 0.6362397820163488, "grad_norm": 2.109025744264349, "learning_rate": 1.9941132523318217e-05, "loss": 0.1122, "step": 467 }, { "epoch": 0.6376021798365122, "grad_norm": 2.0014291236698467, "learning_rate": 1.9940653428097016e-05, "loss": 0.1448, "step": 468 }, { "epoch": 0.6389645776566758, "grad_norm": 5.2539801363178915, "learning_rate": 1.9940172397003606e-05, "loss": 0.1232, "step": 469 }, { "epoch": 0.6403269754768393, "grad_norm": 3.0748204411972395, "learning_rate": 1.9939689430131667e-05, "loss": 0.0883, "step": 470 }, { "epoch": 0.6416893732970027, "grad_norm": 9.703660146472542, "learning_rate": 1.9939204527575245e-05, "loss": 0.1098, "step": 471 }, { "epoch": 0.6430517711171662, "grad_norm": 2.4795179131660166, "learning_rate": 1.9938717689428782e-05, "loss": 0.1173, "step": 472 }, { "epoch": 0.6444141689373297, "grad_norm": 5.673696055270973, "learning_rate": 1.993822891578708e-05, "loss": 0.1321, "step": 473 }, { "epoch": 0.6457765667574932, "grad_norm": 4.681685204442489, "learning_rate": 1.9937738206745324e-05, "loss": 0.1218, "step": 474 }, { "epoch": 0.6471389645776566, "grad_norm": 5.65324187891576, "learning_rate": 1.9937245562399078e-05, "loss": 0.1085, "step": 475 }, { "epoch": 0.6485013623978202, "grad_norm": 5.279477133965485, "learning_rate": 1.9936750982844284e-05, "loss": 0.112, "step": 476 }, { "epoch": 0.6498637602179836, "grad_norm": 4.39362205147257, "learning_rate": 1.9936254468177253e-05, "loss": 0.1076, "step": 477 }, { "epoch": 0.6512261580381471, "grad_norm": 4.409214082134216, "learning_rate": 1.993575601849468e-05, "loss": 0.0998, "step": 478 }, { "epoch": 0.6525885558583107, "grad_norm": 5.6414234031031105, "learning_rate": 1.9935255633893632e-05, "loss": 0.124, "step": 479 }, { "epoch": 0.6539509536784741, "grad_norm": 5.308310320890762, "learning_rate": 1.993475331447156e-05, "loss": 0.1747, "step": 480 }, { "epoch": 0.6553133514986376, "grad_norm": 7.011871224327945, "learning_rate": 1.9934249060326283e-05, "loss": 0.082, "step": 481 }, { "epoch": 0.6566757493188011, "grad_norm": 9.214442868293748, "learning_rate": 1.9933742871556e-05, "loss": 0.1291, "step": 482 }, { "epoch": 0.6580381471389646, "grad_norm": 2.887088431130938, "learning_rate": 1.9933234748259294e-05, "loss": 0.1116, "step": 483 }, { "epoch": 0.659400544959128, "grad_norm": 8.741344144309567, "learning_rate": 1.993272469053511e-05, "loss": 0.1334, "step": 484 }, { "epoch": 0.6607629427792916, "grad_norm": 6.487028185456236, "learning_rate": 1.9932212698482787e-05, "loss": 0.0918, "step": 485 }, { "epoch": 0.662125340599455, "grad_norm": 3.0621404717269955, "learning_rate": 1.9931698772202027e-05, "loss": 0.1316, "step": 486 }, { "epoch": 0.6634877384196185, "grad_norm": 15.182854125363813, "learning_rate": 1.9931182911792913e-05, "loss": 0.1178, "step": 487 }, { "epoch": 0.6648501362397821, "grad_norm": 3.0880663554777517, "learning_rate": 1.9930665117355906e-05, "loss": 0.1026, "step": 488 }, { "epoch": 0.6662125340599455, "grad_norm": 3.7890636335757346, "learning_rate": 1.9930145388991845e-05, "loss": 0.0756, "step": 489 }, { "epoch": 0.667574931880109, "grad_norm": 8.68036555967227, "learning_rate": 1.992962372680194e-05, "loss": 0.0611, "step": 490 }, { "epoch": 0.6689373297002725, "grad_norm": 5.235861786563815, "learning_rate": 1.9929100130887782e-05, "loss": 0.111, "step": 491 }, { "epoch": 0.670299727520436, "grad_norm": 6.286788328140697, "learning_rate": 1.992857460135134e-05, "loss": 0.1108, "step": 492 }, { "epoch": 0.6716621253405994, "grad_norm": 2.477746752687283, "learning_rate": 1.9928047138294952e-05, "loss": 0.1034, "step": 493 }, { "epoch": 0.6730245231607629, "grad_norm": 2.5934882918808575, "learning_rate": 1.9927517741821343e-05, "loss": 0.0965, "step": 494 }, { "epoch": 0.6743869209809265, "grad_norm": 11.954458015049578, "learning_rate": 1.9926986412033606e-05, "loss": 0.0864, "step": 495 }, { "epoch": 0.6757493188010899, "grad_norm": 3.2788412116949535, "learning_rate": 1.9926453149035218e-05, "loss": 0.0781, "step": 496 }, { "epoch": 0.6771117166212534, "grad_norm": 13.053997420063189, "learning_rate": 1.9925917952930024e-05, "loss": 0.1017, "step": 497 }, { "epoch": 0.6784741144414169, "grad_norm": 6.017966962537192, "learning_rate": 1.9925380823822248e-05, "loss": 0.0992, "step": 498 }, { "epoch": 0.6798365122615804, "grad_norm": 9.838393798017258, "learning_rate": 1.99248417618165e-05, "loss": 0.1781, "step": 499 }, { "epoch": 0.6811989100817438, "grad_norm": 2.7951987365579583, "learning_rate": 1.992430076701775e-05, "loss": 0.1103, "step": 500 }, { "epoch": 0.6825613079019074, "grad_norm": 3.1311006294407053, "learning_rate": 1.9923757839531355e-05, "loss": 0.0691, "step": 501 }, { "epoch": 0.6839237057220708, "grad_norm": 5.459359408406407, "learning_rate": 1.992321297946305e-05, "loss": 0.1055, "step": 502 }, { "epoch": 0.6852861035422343, "grad_norm": 4.57872558385905, "learning_rate": 1.9922666186918943e-05, "loss": 0.0923, "step": 503 }, { "epoch": 0.6866485013623979, "grad_norm": 3.6071282567298084, "learning_rate": 1.9922117462005512e-05, "loss": 0.1333, "step": 504 }, { "epoch": 0.6880108991825613, "grad_norm": 9.09294470339563, "learning_rate": 1.992156680482962e-05, "loss": 0.1204, "step": 505 }, { "epoch": 0.6893732970027248, "grad_norm": 11.715783633724318, "learning_rate": 1.9921014215498507e-05, "loss": 0.1248, "step": 506 }, { "epoch": 0.6907356948228883, "grad_norm": 11.613015125598086, "learning_rate": 1.992045969411978e-05, "loss": 0.1199, "step": 507 }, { "epoch": 0.6920980926430518, "grad_norm": 8.287916437671468, "learning_rate": 1.991990324080144e-05, "loss": 0.1519, "step": 508 }, { "epoch": 0.6934604904632152, "grad_norm": 2.882387304668855, "learning_rate": 1.9919344855651834e-05, "loss": 0.098, "step": 509 }, { "epoch": 0.6948228882833788, "grad_norm": 5.9807674794944745, "learning_rate": 1.9918784538779717e-05, "loss": 0.101, "step": 510 }, { "epoch": 0.6961852861035422, "grad_norm": 8.835441470618486, "learning_rate": 1.99182222902942e-05, "loss": 0.156, "step": 511 }, { "epoch": 0.6975476839237057, "grad_norm": 4.443385095247063, "learning_rate": 1.9917658110304783e-05, "loss": 0.1079, "step": 512 }, { "epoch": 0.6989100817438693, "grad_norm": 10.527160378009139, "learning_rate": 1.991709199892133e-05, "loss": 0.1422, "step": 513 }, { "epoch": 0.7002724795640327, "grad_norm": 13.0809343689448, "learning_rate": 1.9916523956254096e-05, "loss": 0.1389, "step": 514 }, { "epoch": 0.7016348773841962, "grad_norm": 2.9124086070086617, "learning_rate": 1.9915953982413693e-05, "loss": 0.1303, "step": 515 }, { "epoch": 0.7029972752043597, "grad_norm": 7.419790902953772, "learning_rate": 1.9915382077511124e-05, "loss": 0.1544, "step": 516 }, { "epoch": 0.7043596730245232, "grad_norm": 13.945529675060742, "learning_rate": 1.9914808241657764e-05, "loss": 0.0835, "step": 517 }, { "epoch": 0.7057220708446866, "grad_norm": 5.865260838342199, "learning_rate": 1.9914232474965364e-05, "loss": 0.1183, "step": 518 }, { "epoch": 0.7070844686648501, "grad_norm": 10.460007132623662, "learning_rate": 1.991365477754605e-05, "loss": 0.1193, "step": 519 }, { "epoch": 0.7084468664850136, "grad_norm": 5.4301080889496385, "learning_rate": 1.9913075149512322e-05, "loss": 0.132, "step": 520 }, { "epoch": 0.7098092643051771, "grad_norm": 4.137307786660812, "learning_rate": 1.991249359097706e-05, "loss": 0.1468, "step": 521 }, { "epoch": 0.7111716621253406, "grad_norm": 14.043367684076102, "learning_rate": 1.991191010205352e-05, "loss": 0.1543, "step": 522 }, { "epoch": 0.7125340599455041, "grad_norm": 5.354884540964004, "learning_rate": 1.9911324682855332e-05, "loss": 0.0808, "step": 523 }, { "epoch": 0.7138964577656676, "grad_norm": 7.67984011714943, "learning_rate": 1.99107373334965e-05, "loss": 0.0857, "step": 524 }, { "epoch": 0.715258855585831, "grad_norm": 3.041243705422017, "learning_rate": 1.9910148054091408e-05, "loss": 0.1472, "step": 525 }, { "epoch": 0.7166212534059946, "grad_norm": 8.833751296252926, "learning_rate": 1.990955684475481e-05, "loss": 0.1303, "step": 526 }, { "epoch": 0.717983651226158, "grad_norm": 8.70654275967376, "learning_rate": 1.9908963705601848e-05, "loss": 0.1385, "step": 527 }, { "epoch": 0.7193460490463215, "grad_norm": 5.58947931070462, "learning_rate": 1.990836863674803e-05, "loss": 0.0955, "step": 528 }, { "epoch": 0.720708446866485, "grad_norm": 9.264236543000571, "learning_rate": 1.990777163830923e-05, "loss": 0.1168, "step": 529 }, { "epoch": 0.7220708446866485, "grad_norm": 6.6422060529385965, "learning_rate": 1.9907172710401723e-05, "loss": 0.1045, "step": 530 }, { "epoch": 0.723433242506812, "grad_norm": 3.8967689484849526, "learning_rate": 1.990657185314214e-05, "loss": 0.0982, "step": 531 }, { "epoch": 0.7247956403269755, "grad_norm": 13.809996714550872, "learning_rate": 1.9905969066647494e-05, "loss": 0.1354, "step": 532 }, { "epoch": 0.726158038147139, "grad_norm": 3.3376770324466514, "learning_rate": 1.9905364351035176e-05, "loss": 0.0788, "step": 533 }, { "epoch": 0.7275204359673024, "grad_norm": 10.160779824569389, "learning_rate": 1.9904757706422946e-05, "loss": 0.1145, "step": 534 }, { "epoch": 0.728882833787466, "grad_norm": 7.164798982471702, "learning_rate": 1.990414913292894e-05, "loss": 0.1111, "step": 535 }, { "epoch": 0.7302452316076294, "grad_norm": 6.51673992387129, "learning_rate": 1.9903538630671687e-05, "loss": 0.1283, "step": 536 }, { "epoch": 0.7316076294277929, "grad_norm": 10.73499860824276, "learning_rate": 1.990292619977007e-05, "loss": 0.1402, "step": 537 }, { "epoch": 0.7329700272479565, "grad_norm": 4.9152785918847, "learning_rate": 1.9902311840343356e-05, "loss": 0.1489, "step": 538 }, { "epoch": 0.7343324250681199, "grad_norm": 16.128365492205365, "learning_rate": 1.9901695552511185e-05, "loss": 0.1023, "step": 539 }, { "epoch": 0.7356948228882834, "grad_norm": 10.66437211352579, "learning_rate": 1.9901077336393574e-05, "loss": 0.1325, "step": 540 }, { "epoch": 0.7370572207084468, "grad_norm": 5.77152685750255, "learning_rate": 1.9900457192110924e-05, "loss": 0.1131, "step": 541 }, { "epoch": 0.7384196185286104, "grad_norm": 12.504446405436338, "learning_rate": 1.9899835119783996e-05, "loss": 0.1186, "step": 542 }, { "epoch": 0.7397820163487738, "grad_norm": 5.2655191528761875, "learning_rate": 1.9899211119533938e-05, "loss": 0.0753, "step": 543 }, { "epoch": 0.7411444141689373, "grad_norm": 3.424532067397211, "learning_rate": 1.9898585191482266e-05, "loss": 0.1097, "step": 544 }, { "epoch": 0.7425068119891008, "grad_norm": 11.349354817451053, "learning_rate": 1.989795733575088e-05, "loss": 0.0996, "step": 545 }, { "epoch": 0.7438692098092643, "grad_norm": 3.6181806940425965, "learning_rate": 1.9897327552462048e-05, "loss": 0.0872, "step": 546 }, { "epoch": 0.7452316076294278, "grad_norm": 7.5887173728642034, "learning_rate": 1.9896695841738417e-05, "loss": 0.1313, "step": 547 }, { "epoch": 0.7465940054495913, "grad_norm": 7.272500006151328, "learning_rate": 1.9896062203703004e-05, "loss": 0.095, "step": 548 }, { "epoch": 0.7479564032697548, "grad_norm": 3.335056383399394, "learning_rate": 1.989542663847921e-05, "loss": 0.1186, "step": 549 }, { "epoch": 0.7493188010899182, "grad_norm": 3.7009993171560858, "learning_rate": 1.9894789146190807e-05, "loss": 0.0878, "step": 550 }, { "epoch": 0.7506811989100818, "grad_norm": 3.3276761367298375, "learning_rate": 1.9894149726961938e-05, "loss": 0.071, "step": 551 }, { "epoch": 0.7520435967302452, "grad_norm": 5.336073873251393, "learning_rate": 1.989350838091713e-05, "loss": 0.1126, "step": 552 }, { "epoch": 0.7534059945504087, "grad_norm": 9.0745246593574, "learning_rate": 1.9892865108181278e-05, "loss": 0.1202, "step": 553 }, { "epoch": 0.7547683923705722, "grad_norm": 3.391495751907436, "learning_rate": 1.9892219908879652e-05, "loss": 0.1179, "step": 554 }, { "epoch": 0.7561307901907357, "grad_norm": 5.64677566519112, "learning_rate": 1.989157278313791e-05, "loss": 0.0915, "step": 555 }, { "epoch": 0.7574931880108992, "grad_norm": 5.582658652683965, "learning_rate": 1.9890923731082064e-05, "loss": 0.1282, "step": 556 }, { "epoch": 0.7588555858310627, "grad_norm": 5.793061826167819, "learning_rate": 1.989027275283852e-05, "loss": 0.1304, "step": 557 }, { "epoch": 0.7602179836512262, "grad_norm": 11.821203152481113, "learning_rate": 1.9889619848534044e-05, "loss": 0.0991, "step": 558 }, { "epoch": 0.7615803814713896, "grad_norm": 5.951711066213401, "learning_rate": 1.9888965018295793e-05, "loss": 0.1158, "step": 559 }, { "epoch": 0.7629427792915532, "grad_norm": 3.758500022590574, "learning_rate": 1.9888308262251286e-05, "loss": 0.1095, "step": 560 }, { "epoch": 0.7643051771117166, "grad_norm": 12.466755213145907, "learning_rate": 1.9887649580528423e-05, "loss": 0.1165, "step": 561 }, { "epoch": 0.7656675749318801, "grad_norm": 2.5781844808561827, "learning_rate": 1.9886988973255478e-05, "loss": 0.1003, "step": 562 }, { "epoch": 0.7670299727520435, "grad_norm": 14.458697382632879, "learning_rate": 1.988632644056109e-05, "loss": 0.1351, "step": 563 }, { "epoch": 0.7683923705722071, "grad_norm": 5.526177444855453, "learning_rate": 1.9885661982574297e-05, "loss": 0.1333, "step": 564 }, { "epoch": 0.7697547683923706, "grad_norm": 2.166403154450634, "learning_rate": 1.9884995599424495e-05, "loss": 0.0893, "step": 565 }, { "epoch": 0.771117166212534, "grad_norm": 7.294681813504716, "learning_rate": 1.9884327291241446e-05, "loss": 0.1205, "step": 566 }, { "epoch": 0.7724795640326976, "grad_norm": 2.8892120081005674, "learning_rate": 1.9883657058155313e-05, "loss": 0.1231, "step": 567 }, { "epoch": 0.773841961852861, "grad_norm": 3.31355584193036, "learning_rate": 1.988298490029661e-05, "loss": 0.1222, "step": 568 }, { "epoch": 0.7752043596730245, "grad_norm": 6.139201009634124, "learning_rate": 1.9882310817796235e-05, "loss": 0.0917, "step": 569 }, { "epoch": 0.776566757493188, "grad_norm": 3.043959922459174, "learning_rate": 1.9881634810785466e-05, "loss": 0.0929, "step": 570 }, { "epoch": 0.7779291553133515, "grad_norm": 6.344233746577644, "learning_rate": 1.9880956879395946e-05, "loss": 0.1089, "step": 571 }, { "epoch": 0.779291553133515, "grad_norm": 8.629891544703293, "learning_rate": 1.98802770237597e-05, "loss": 0.1315, "step": 572 }, { "epoch": 0.7806539509536785, "grad_norm": 5.315460768650875, "learning_rate": 1.9879595244009123e-05, "loss": 0.0943, "step": 573 }, { "epoch": 0.782016348773842, "grad_norm": 9.861562172402012, "learning_rate": 1.987891154027699e-05, "loss": 0.092, "step": 574 }, { "epoch": 0.7833787465940054, "grad_norm": 13.757055844671335, "learning_rate": 1.9878225912696446e-05, "loss": 0.0995, "step": 575 }, { "epoch": 0.784741144414169, "grad_norm": 7.332367568455173, "learning_rate": 1.987753836140101e-05, "loss": 0.1047, "step": 576 }, { "epoch": 0.7861035422343324, "grad_norm": 12.09828330958034, "learning_rate": 1.987684888652458e-05, "loss": 0.1123, "step": 577 }, { "epoch": 0.7874659400544959, "grad_norm": 3.5645133137225145, "learning_rate": 1.9876157488201426e-05, "loss": 0.1346, "step": 578 }, { "epoch": 0.7888283378746594, "grad_norm": 4.804598294192389, "learning_rate": 1.987546416656619e-05, "loss": 0.1238, "step": 579 }, { "epoch": 0.7901907356948229, "grad_norm": 11.562529626411777, "learning_rate": 1.98747689217539e-05, "loss": 0.1142, "step": 580 }, { "epoch": 0.7915531335149864, "grad_norm": 2.7211052451965596, "learning_rate": 1.987407175389994e-05, "loss": 0.1077, "step": 581 }, { "epoch": 0.7929155313351499, "grad_norm": 17.549926425113398, "learning_rate": 1.9873372663140084e-05, "loss": 0.1408, "step": 582 }, { "epoch": 0.7942779291553134, "grad_norm": 3.1716741809185813, "learning_rate": 1.9872671649610473e-05, "loss": 0.1056, "step": 583 }, { "epoch": 0.7956403269754768, "grad_norm": 6.8318725062431405, "learning_rate": 1.9871968713447626e-05, "loss": 0.1152, "step": 584 }, { "epoch": 0.7970027247956403, "grad_norm": 6.722749549817554, "learning_rate": 1.987126385478843e-05, "loss": 0.133, "step": 585 }, { "epoch": 0.7983651226158038, "grad_norm": 12.168541620095287, "learning_rate": 1.987055707377016e-05, "loss": 0.1015, "step": 586 }, { "epoch": 0.7997275204359673, "grad_norm": 2.823619984676503, "learning_rate": 1.9869848370530455e-05, "loss": 0.1245, "step": 587 }, { "epoch": 0.8010899182561307, "grad_norm": 17.487956880881747, "learning_rate": 1.9869137745207322e-05, "loss": 0.1313, "step": 588 }, { "epoch": 0.8024523160762943, "grad_norm": 4.921507288195476, "learning_rate": 1.9868425197939155e-05, "loss": 0.0762, "step": 589 }, { "epoch": 0.8038147138964578, "grad_norm": 7.675422748228981, "learning_rate": 1.9867710728864723e-05, "loss": 0.1278, "step": 590 }, { "epoch": 0.8051771117166212, "grad_norm": 13.34897378929093, "learning_rate": 1.9866994338123154e-05, "loss": 0.0917, "step": 591 }, { "epoch": 0.8065395095367848, "grad_norm": 3.327210256969917, "learning_rate": 1.9866276025853964e-05, "loss": 0.0854, "step": 592 }, { "epoch": 0.8079019073569482, "grad_norm": 7.768072545177833, "learning_rate": 1.9865555792197042e-05, "loss": 0.103, "step": 593 }, { "epoch": 0.8092643051771117, "grad_norm": 11.883077038545881, "learning_rate": 1.9864833637292645e-05, "loss": 0.1072, "step": 594 }, { "epoch": 0.8106267029972752, "grad_norm": 3.850736145408535, "learning_rate": 1.9864109561281412e-05, "loss": 0.0893, "step": 595 }, { "epoch": 0.8119891008174387, "grad_norm": 13.878087733480315, "learning_rate": 1.9863383564304344e-05, "loss": 0.0743, "step": 596 }, { "epoch": 0.8133514986376021, "grad_norm": 5.125503300301578, "learning_rate": 1.9862655646502832e-05, "loss": 0.1031, "step": 597 }, { "epoch": 0.8147138964577657, "grad_norm": 15.07466129480369, "learning_rate": 1.986192580801863e-05, "loss": 0.1431, "step": 598 }, { "epoch": 0.8160762942779292, "grad_norm": 9.672720226780346, "learning_rate": 1.9861194048993865e-05, "loss": 0.129, "step": 599 }, { "epoch": 0.8174386920980926, "grad_norm": 10.847776606068464, "learning_rate": 1.9860460369571047e-05, "loss": 0.0816, "step": 600 }, { "epoch": 0.8188010899182562, "grad_norm": 11.425850322294254, "learning_rate": 1.9859724769893048e-05, "loss": 0.1096, "step": 601 }, { "epoch": 0.8201634877384196, "grad_norm": 4.775896875663684, "learning_rate": 1.9858987250103132e-05, "loss": 0.1259, "step": 602 }, { "epoch": 0.8215258855585831, "grad_norm": 5.655912599190316, "learning_rate": 1.985824781034492e-05, "loss": 0.0952, "step": 603 }, { "epoch": 0.8228882833787466, "grad_norm": 8.477291426325083, "learning_rate": 1.985750645076241e-05, "loss": 0.1422, "step": 604 }, { "epoch": 0.8242506811989101, "grad_norm": 12.534626193242909, "learning_rate": 1.985676317149998e-05, "loss": 0.1184, "step": 605 }, { "epoch": 0.8256130790190735, "grad_norm": 8.016559284270597, "learning_rate": 1.9856017972702375e-05, "loss": 0.0671, "step": 606 }, { "epoch": 0.8269754768392371, "grad_norm": 11.751713618224231, "learning_rate": 1.985527085451472e-05, "loss": 0.0813, "step": 607 }, { "epoch": 0.8283378746594006, "grad_norm": 6.695613199567505, "learning_rate": 1.985452181708251e-05, "loss": 0.1006, "step": 608 }, { "epoch": 0.829700272479564, "grad_norm": 18.59578564671291, "learning_rate": 1.9853770860551614e-05, "loss": 0.115, "step": 609 }, { "epoch": 0.8310626702997275, "grad_norm": 8.09244091754091, "learning_rate": 1.985301798506828e-05, "loss": 0.0858, "step": 610 }, { "epoch": 0.832425068119891, "grad_norm": 3.178055512411893, "learning_rate": 1.9852263190779123e-05, "loss": 0.1155, "step": 611 }, { "epoch": 0.8337874659400545, "grad_norm": 19.568900679710357, "learning_rate": 1.985150647783113e-05, "loss": 0.1483, "step": 612 }, { "epoch": 0.8351498637602179, "grad_norm": 5.599583500303736, "learning_rate": 1.985074784637167e-05, "loss": 0.087, "step": 613 }, { "epoch": 0.8365122615803815, "grad_norm": 17.123095146658162, "learning_rate": 1.9849987296548477e-05, "loss": 0.1187, "step": 614 }, { "epoch": 0.837874659400545, "grad_norm": 11.746638555733302, "learning_rate": 1.984922482850967e-05, "loss": 0.1212, "step": 615 }, { "epoch": 0.8392370572207084, "grad_norm": 12.061696551642386, "learning_rate": 1.9848460442403725e-05, "loss": 0.0544, "step": 616 }, { "epoch": 0.840599455040872, "grad_norm": 11.709707973797329, "learning_rate": 1.9847694138379508e-05, "loss": 0.1084, "step": 617 }, { "epoch": 0.8419618528610354, "grad_norm": 2.3795385587018454, "learning_rate": 1.9846925916586248e-05, "loss": 0.1152, "step": 618 }, { "epoch": 0.8433242506811989, "grad_norm": 7.847416460673561, "learning_rate": 1.984615577717355e-05, "loss": 0.13, "step": 619 }, { "epoch": 0.8446866485013624, "grad_norm": 10.482379516397806, "learning_rate": 1.9845383720291392e-05, "loss": 0.0824, "step": 620 }, { "epoch": 0.8460490463215259, "grad_norm": 7.680827390936155, "learning_rate": 1.9844609746090135e-05, "loss": 0.0983, "step": 621 }, { "epoch": 0.8474114441416893, "grad_norm": 8.73421681234181, "learning_rate": 1.9843833854720495e-05, "loss": 0.1049, "step": 622 }, { "epoch": 0.8487738419618529, "grad_norm": 14.517908995765119, "learning_rate": 1.984305604633358e-05, "loss": 0.1033, "step": 623 }, { "epoch": 0.8501362397820164, "grad_norm": 2.5536040776530076, "learning_rate": 1.9842276321080855e-05, "loss": 0.0832, "step": 624 }, { "epoch": 0.8514986376021798, "grad_norm": 7.626499277770217, "learning_rate": 1.9841494679114168e-05, "loss": 0.1033, "step": 625 }, { "epoch": 0.8528610354223434, "grad_norm": 9.509350489610403, "learning_rate": 1.984071112058574e-05, "loss": 0.0763, "step": 626 }, { "epoch": 0.8542234332425068, "grad_norm": 3.3283981465282224, "learning_rate": 1.9839925645648165e-05, "loss": 0.1051, "step": 627 }, { "epoch": 0.8555858310626703, "grad_norm": 5.441481235894, "learning_rate": 1.9839138254454405e-05, "loss": 0.0958, "step": 628 }, { "epoch": 0.8569482288828338, "grad_norm": 9.684750748328259, "learning_rate": 1.9838348947157802e-05, "loss": 0.0864, "step": 629 }, { "epoch": 0.8583106267029973, "grad_norm": 5.931423422030774, "learning_rate": 1.9837557723912067e-05, "loss": 0.0813, "step": 630 }, { "epoch": 0.8596730245231607, "grad_norm": 11.205505791668443, "learning_rate": 1.9836764584871284e-05, "loss": 0.101, "step": 631 }, { "epoch": 0.8610354223433242, "grad_norm": 15.036877476165683, "learning_rate": 1.9835969530189912e-05, "loss": 0.1357, "step": 632 }, { "epoch": 0.8623978201634878, "grad_norm": 1.9831456932983518, "learning_rate": 1.9835172560022785e-05, "loss": 0.079, "step": 633 }, { "epoch": 0.8637602179836512, "grad_norm": 14.150952993564132, "learning_rate": 1.9834373674525103e-05, "loss": 0.0989, "step": 634 }, { "epoch": 0.8651226158038147, "grad_norm": 11.96428648407164, "learning_rate": 1.9833572873852446e-05, "loss": 0.1605, "step": 635 }, { "epoch": 0.8664850136239782, "grad_norm": 4.505209568497574, "learning_rate": 1.983277015816076e-05, "loss": 0.1163, "step": 636 }, { "epoch": 0.8678474114441417, "grad_norm": 12.108741482678669, "learning_rate": 1.9831965527606374e-05, "loss": 0.0765, "step": 637 }, { "epoch": 0.8692098092643051, "grad_norm": 9.570624858234021, "learning_rate": 1.9831158982345982e-05, "loss": 0.0786, "step": 638 }, { "epoch": 0.8705722070844687, "grad_norm": 3.1007514633404134, "learning_rate": 1.9830350522536654e-05, "loss": 0.1087, "step": 639 }, { "epoch": 0.8719346049046321, "grad_norm": 8.215590937013689, "learning_rate": 1.9829540148335825e-05, "loss": 0.064, "step": 640 }, { "epoch": 0.8732970027247956, "grad_norm": 10.350663529024304, "learning_rate": 1.982872785990132e-05, "loss": 0.1116, "step": 641 }, { "epoch": 0.8746594005449592, "grad_norm": 4.151579820720529, "learning_rate": 1.982791365739132e-05, "loss": 0.1156, "step": 642 }, { "epoch": 0.8760217983651226, "grad_norm": 9.901717715682008, "learning_rate": 1.9827097540964385e-05, "loss": 0.0955, "step": 643 }, { "epoch": 0.8773841961852861, "grad_norm": 4.183314426653228, "learning_rate": 1.9826279510779454e-05, "loss": 0.0973, "step": 644 }, { "epoch": 0.8787465940054496, "grad_norm": 2.152863117057348, "learning_rate": 1.9825459566995825e-05, "loss": 0.094, "step": 645 }, { "epoch": 0.8801089918256131, "grad_norm": 5.56307836516068, "learning_rate": 1.9824637709773184e-05, "loss": 0.0795, "step": 646 }, { "epoch": 0.8814713896457765, "grad_norm": 8.270787124022455, "learning_rate": 1.9823813939271573e-05, "loss": 0.0714, "step": 647 }, { "epoch": 0.8828337874659401, "grad_norm": 3.223316171411904, "learning_rate": 1.982298825565142e-05, "loss": 0.1313, "step": 648 }, { "epoch": 0.8841961852861036, "grad_norm": 5.519936595935207, "learning_rate": 1.9822160659073523e-05, "loss": 0.1668, "step": 649 }, { "epoch": 0.885558583106267, "grad_norm": 2.4046527443469383, "learning_rate": 1.982133114969905e-05, "loss": 0.0989, "step": 650 }, { "epoch": 0.8869209809264306, "grad_norm": 3.0538973823308506, "learning_rate": 1.982049972768954e-05, "loss": 0.0793, "step": 651 }, { "epoch": 0.888283378746594, "grad_norm": 3.329136397514447, "learning_rate": 1.9819666393206907e-05, "loss": 0.0912, "step": 652 }, { "epoch": 0.8896457765667575, "grad_norm": 4.550426960974938, "learning_rate": 1.9818831146413435e-05, "loss": 0.111, "step": 653 }, { "epoch": 0.8910081743869209, "grad_norm": 6.295474701485815, "learning_rate": 1.9817993987471786e-05, "loss": 0.0936, "step": 654 }, { "epoch": 0.8923705722070845, "grad_norm": 3.0487116344090053, "learning_rate": 1.9817154916544992e-05, "loss": 0.0989, "step": 655 }, { "epoch": 0.8937329700272479, "grad_norm": 1.886118985939914, "learning_rate": 1.981631393379645e-05, "loss": 0.0868, "step": 656 }, { "epoch": 0.8950953678474114, "grad_norm": 14.409052387597535, "learning_rate": 1.9815471039389943e-05, "loss": 0.1017, "step": 657 }, { "epoch": 0.896457765667575, "grad_norm": 3.3002334325728, "learning_rate": 1.9814626233489615e-05, "loss": 0.1377, "step": 658 }, { "epoch": 0.8978201634877384, "grad_norm": 16.629116906940485, "learning_rate": 1.9813779516259987e-05, "loss": 0.0629, "step": 659 }, { "epoch": 0.8991825613079019, "grad_norm": 3.7791056345695018, "learning_rate": 1.9812930887865946e-05, "loss": 0.141, "step": 660 }, { "epoch": 0.9005449591280654, "grad_norm": 2.351841886287997, "learning_rate": 1.9812080348472765e-05, "loss": 0.1185, "step": 661 }, { "epoch": 0.9019073569482289, "grad_norm": 5.0679341972654655, "learning_rate": 1.9811227898246072e-05, "loss": 0.1145, "step": 662 }, { "epoch": 0.9032697547683923, "grad_norm": 2.684098657609121, "learning_rate": 1.981037353735188e-05, "loss": 0.0847, "step": 663 }, { "epoch": 0.9046321525885559, "grad_norm": 3.0762549603660894, "learning_rate": 1.9809517265956572e-05, "loss": 0.1025, "step": 664 }, { "epoch": 0.9059945504087193, "grad_norm": 2.536324495101548, "learning_rate": 1.98086590842269e-05, "loss": 0.0973, "step": 665 }, { "epoch": 0.9073569482288828, "grad_norm": 4.34355802767225, "learning_rate": 1.9807798992329985e-05, "loss": 0.1149, "step": 666 }, { "epoch": 0.9087193460490464, "grad_norm": 2.152586452120689, "learning_rate": 1.9806936990433328e-05, "loss": 0.0967, "step": 667 }, { "epoch": 0.9100817438692098, "grad_norm": 6.158470166358607, "learning_rate": 1.9806073078704793e-05, "loss": 0.1239, "step": 668 }, { "epoch": 0.9114441416893733, "grad_norm": 3.225562758724376, "learning_rate": 1.9805207257312625e-05, "loss": 0.0763, "step": 669 }, { "epoch": 0.9128065395095368, "grad_norm": 4.027291347545397, "learning_rate": 1.9804339526425437e-05, "loss": 0.0948, "step": 670 }, { "epoch": 0.9141689373297003, "grad_norm": 8.301190725364147, "learning_rate": 1.980346988621221e-05, "loss": 0.0752, "step": 671 }, { "epoch": 0.9155313351498637, "grad_norm": 4.6740858036363555, "learning_rate": 1.98025983368423e-05, "loss": 0.106, "step": 672 }, { "epoch": 0.9168937329700273, "grad_norm": 1.6963225354473412, "learning_rate": 1.9801724878485438e-05, "loss": 0.0701, "step": 673 }, { "epoch": 0.9182561307901907, "grad_norm": 4.931252834344089, "learning_rate": 1.9800849511311726e-05, "loss": 0.1283, "step": 674 }, { "epoch": 0.9196185286103542, "grad_norm": 7.27740087773244, "learning_rate": 1.9799972235491633e-05, "loss": 0.107, "step": 675 }, { "epoch": 0.9209809264305178, "grad_norm": 5.217319904755641, "learning_rate": 1.9799093051195995e-05, "loss": 0.1104, "step": 676 }, { "epoch": 0.9223433242506812, "grad_norm": 12.228834863336068, "learning_rate": 1.979821195859604e-05, "loss": 0.0914, "step": 677 }, { "epoch": 0.9237057220708447, "grad_norm": 4.829936478671115, "learning_rate": 1.979732895786335e-05, "loss": 0.0778, "step": 678 }, { "epoch": 0.9250681198910081, "grad_norm": 14.574465989302462, "learning_rate": 1.9796444049169876e-05, "loss": 0.123, "step": 679 }, { "epoch": 0.9264305177111717, "grad_norm": 3.319193178220925, "learning_rate": 1.9795557232687956e-05, "loss": 0.0916, "step": 680 }, { "epoch": 0.9277929155313351, "grad_norm": 1.7211689707996876, "learning_rate": 1.979466850859029e-05, "loss": 0.1042, "step": 681 }, { "epoch": 0.9291553133514986, "grad_norm": 6.651251465195421, "learning_rate": 1.9793777877049946e-05, "loss": 0.0836, "step": 682 }, { "epoch": 0.9305177111716622, "grad_norm": 7.230251555866428, "learning_rate": 1.9792885338240375e-05, "loss": 0.0891, "step": 683 }, { "epoch": 0.9318801089918256, "grad_norm": 3.4622689067172763, "learning_rate": 1.979199089233539e-05, "loss": 0.066, "step": 684 }, { "epoch": 0.9332425068119891, "grad_norm": 9.684990744773982, "learning_rate": 1.9791094539509173e-05, "loss": 0.0545, "step": 685 }, { "epoch": 0.9346049046321526, "grad_norm": 5.323347855106655, "learning_rate": 1.9790196279936286e-05, "loss": 0.1422, "step": 686 }, { "epoch": 0.9359673024523161, "grad_norm": 4.64805108069019, "learning_rate": 1.9789296113791663e-05, "loss": 0.1167, "step": 687 }, { "epoch": 0.9373297002724795, "grad_norm": 12.291834272933015, "learning_rate": 1.97883940412506e-05, "loss": 0.1178, "step": 688 }, { "epoch": 0.9386920980926431, "grad_norm": 2.5198266086250873, "learning_rate": 1.978749006248877e-05, "loss": 0.075, "step": 689 }, { "epoch": 0.9400544959128065, "grad_norm": 8.289943156944041, "learning_rate": 1.9786584177682217e-05, "loss": 0.1229, "step": 690 }, { "epoch": 0.94141689373297, "grad_norm": 4.972865294296792, "learning_rate": 1.978567638700736e-05, "loss": 0.141, "step": 691 }, { "epoch": 0.9427792915531336, "grad_norm": 3.5670089898745525, "learning_rate": 1.9784766690640976e-05, "loss": 0.0889, "step": 692 }, { "epoch": 0.944141689373297, "grad_norm": 3.481463841366847, "learning_rate": 1.9783855088760226e-05, "loss": 0.0936, "step": 693 }, { "epoch": 0.9455040871934605, "grad_norm": 4.6027776818062085, "learning_rate": 1.9782941581542642e-05, "loss": 0.0869, "step": 694 }, { "epoch": 0.946866485013624, "grad_norm": 2.0667757755164113, "learning_rate": 1.9782026169166116e-05, "loss": 0.1099, "step": 695 }, { "epoch": 0.9482288828337875, "grad_norm": 6.2762004591654454, "learning_rate": 1.9781108851808927e-05, "loss": 0.1102, "step": 696 }, { "epoch": 0.9495912806539509, "grad_norm": 4.136620201107661, "learning_rate": 1.9780189629649708e-05, "loss": 0.0635, "step": 697 }, { "epoch": 0.9509536784741145, "grad_norm": 4.629516617081171, "learning_rate": 1.9779268502867474e-05, "loss": 0.0634, "step": 698 }, { "epoch": 0.952316076294278, "grad_norm": 6.407352602362753, "learning_rate": 1.9778345471641606e-05, "loss": 0.0771, "step": 699 }, { "epoch": 0.9536784741144414, "grad_norm": 4.1071323995083695, "learning_rate": 1.9777420536151865e-05, "loss": 0.1405, "step": 700 }, { "epoch": 0.9550408719346049, "grad_norm": 7.68836383007041, "learning_rate": 1.9776493696578365e-05, "loss": 0.1108, "step": 701 }, { "epoch": 0.9564032697547684, "grad_norm": 3.869874787694522, "learning_rate": 1.977556495310161e-05, "loss": 0.0652, "step": 702 }, { "epoch": 0.9577656675749319, "grad_norm": 2.9481800972428065, "learning_rate": 1.977463430590246e-05, "loss": 0.0785, "step": 703 }, { "epoch": 0.9591280653950953, "grad_norm": 5.4671621996547035, "learning_rate": 1.9773701755162157e-05, "loss": 0.1088, "step": 704 }, { "epoch": 0.9604904632152589, "grad_norm": 9.630602732954069, "learning_rate": 1.9772767301062307e-05, "loss": 0.1102, "step": 705 }, { "epoch": 0.9618528610354223, "grad_norm": 9.005874728621155, "learning_rate": 1.9771830943784887e-05, "loss": 0.0748, "step": 706 }, { "epoch": 0.9632152588555858, "grad_norm": 12.713931651679992, "learning_rate": 1.977089268351225e-05, "loss": 0.1058, "step": 707 }, { "epoch": 0.9645776566757494, "grad_norm": 3.6844550797037288, "learning_rate": 1.976995252042711e-05, "loss": 0.1334, "step": 708 }, { "epoch": 0.9659400544959128, "grad_norm": 12.318313478347163, "learning_rate": 1.9769010454712562e-05, "loss": 0.1158, "step": 709 }, { "epoch": 0.9673024523160763, "grad_norm": 12.650580400431513, "learning_rate": 1.9768066486552064e-05, "loss": 0.0911, "step": 710 }, { "epoch": 0.9686648501362398, "grad_norm": 2.9914816646246516, "learning_rate": 1.9767120616129446e-05, "loss": 0.0474, "step": 711 }, { "epoch": 0.9700272479564033, "grad_norm": 9.330236827274874, "learning_rate": 1.9766172843628912e-05, "loss": 0.0872, "step": 712 }, { "epoch": 0.9713896457765667, "grad_norm": 11.024631890651028, "learning_rate": 1.9765223169235036e-05, "loss": 0.1072, "step": 713 }, { "epoch": 0.9727520435967303, "grad_norm": 4.424198939996877, "learning_rate": 1.976427159313275e-05, "loss": 0.1353, "step": 714 }, { "epoch": 0.9741144414168937, "grad_norm": 3.9554650331640087, "learning_rate": 1.9763318115507383e-05, "loss": 0.0916, "step": 715 }, { "epoch": 0.9754768392370572, "grad_norm": 4.301090173783539, "learning_rate": 1.9762362736544606e-05, "loss": 0.0852, "step": 716 }, { "epoch": 0.9768392370572208, "grad_norm": 2.5470918383822205, "learning_rate": 1.9761405456430477e-05, "loss": 0.0986, "step": 717 }, { "epoch": 0.9782016348773842, "grad_norm": 1.6406483628161972, "learning_rate": 1.9760446275351418e-05, "loss": 0.0802, "step": 718 }, { "epoch": 0.9795640326975477, "grad_norm": 3.330802649536649, "learning_rate": 1.975948519349422e-05, "loss": 0.0624, "step": 719 }, { "epoch": 0.9809264305177112, "grad_norm": 2.4287523865254057, "learning_rate": 1.9758522211046056e-05, "loss": 0.0682, "step": 720 }, { "epoch": 0.9822888283378747, "grad_norm": 3.299954217228674, "learning_rate": 1.975755732819445e-05, "loss": 0.1018, "step": 721 }, { "epoch": 0.9836512261580381, "grad_norm": 1.792328908885687, "learning_rate": 1.975659054512731e-05, "loss": 0.101, "step": 722 }, { "epoch": 0.9850136239782016, "grad_norm": 5.116608054060357, "learning_rate": 1.9755621862032913e-05, "loss": 0.092, "step": 723 }, { "epoch": 0.9863760217983651, "grad_norm": 5.138281045245954, "learning_rate": 1.9754651279099896e-05, "loss": 0.0959, "step": 724 }, { "epoch": 0.9877384196185286, "grad_norm": 4.316360440915721, "learning_rate": 1.975367879651728e-05, "loss": 0.0929, "step": 725 }, { "epoch": 0.989100817438692, "grad_norm": 5.430144176994376, "learning_rate": 1.975270441447445e-05, "loss": 0.0941, "step": 726 }, { "epoch": 0.9904632152588556, "grad_norm": 5.988930701796256, "learning_rate": 1.9751728133161154e-05, "loss": 0.1148, "step": 727 }, { "epoch": 0.9918256130790191, "grad_norm": 2.203738918111364, "learning_rate": 1.975074995276752e-05, "loss": 0.0915, "step": 728 }, { "epoch": 0.9931880108991825, "grad_norm": 4.601025960766328, "learning_rate": 1.974976987348404e-05, "loss": 0.1033, "step": 729 }, { "epoch": 0.9945504087193461, "grad_norm": 3.3411188067402966, "learning_rate": 1.974878789550158e-05, "loss": 0.0574, "step": 730 }, { "epoch": 0.9959128065395095, "grad_norm": 3.6244998624487006, "learning_rate": 1.974780401901137e-05, "loss": 0.0912, "step": 731 }, { "epoch": 0.997275204359673, "grad_norm": 2.3985773788856766, "learning_rate": 1.974681824420501e-05, "loss": 0.1027, "step": 732 }, { "epoch": 0.9986376021798365, "grad_norm": 2.508708475788871, "learning_rate": 1.9745830571274482e-05, "loss": 0.0659, "step": 733 }, { "epoch": 1.0, "grad_norm": 3.6779663766294513, "learning_rate": 1.9744841000412122e-05, "loss": 0.1122, "step": 734 }, { "epoch": 1.0, "eval_accuracy": 0.9185850645704661, "eval_f1": 0.8974614748268672, "eval_loss": 0.13009630143642426, "eval_precision": 0.8966329968974593, "eval_recall": 0.9176198878010202, "eval_runtime": 16.9697, "eval_samples_per_second": 104.952, "eval_steps_per_second": 0.825, "step": 734 }, { "epoch": 1.0013623978201636, "grad_norm": 3.018564234384353, "learning_rate": 1.9743849531810645e-05, "loss": 0.1005, "step": 735 }, { "epoch": 1.002724795640327, "grad_norm": 3.0382995897013654, "learning_rate": 1.974285616566313e-05, "loss": 0.1025, "step": 736 }, { "epoch": 1.0040871934604905, "grad_norm": 9.450204631450147, "learning_rate": 1.9741860902163027e-05, "loss": 0.0877, "step": 737 }, { "epoch": 1.005449591280654, "grad_norm": 6.216491999191689, "learning_rate": 1.9740863741504163e-05, "loss": 0.1133, "step": 738 }, { "epoch": 1.0068119891008174, "grad_norm": 8.548927449944483, "learning_rate": 1.973986468388072e-05, "loss": 0.0852, "step": 739 }, { "epoch": 1.008174386920981, "grad_norm": 8.826280317310417, "learning_rate": 1.973886372948726e-05, "loss": 0.1401, "step": 740 }, { "epoch": 1.0095367847411445, "grad_norm": 3.858422955097934, "learning_rate": 1.9737860878518717e-05, "loss": 0.1128, "step": 741 }, { "epoch": 1.0108991825613078, "grad_norm": 7.330869969271805, "learning_rate": 1.9736856131170384e-05, "loss": 0.1417, "step": 742 }, { "epoch": 1.0122615803814714, "grad_norm": 10.728317553617151, "learning_rate": 1.9735849487637927e-05, "loss": 0.1565, "step": 743 }, { "epoch": 1.013623978201635, "grad_norm": 4.156037144750265, "learning_rate": 1.9734840948117387e-05, "loss": 0.1043, "step": 744 }, { "epoch": 1.0149863760217983, "grad_norm": 15.364627618426749, "learning_rate": 1.973383051280517e-05, "loss": 0.0847, "step": 745 }, { "epoch": 1.0163487738419619, "grad_norm": 13.087358235951982, "learning_rate": 1.9732818181898046e-05, "loss": 0.1119, "step": 746 }, { "epoch": 1.0177111716621254, "grad_norm": 8.528830665748938, "learning_rate": 1.9731803955593163e-05, "loss": 0.057, "step": 747 }, { "epoch": 1.0190735694822888, "grad_norm": 9.01349394694473, "learning_rate": 1.9730787834088033e-05, "loss": 0.1154, "step": 748 }, { "epoch": 1.0204359673024523, "grad_norm": 9.24716934330472, "learning_rate": 1.972976981758054e-05, "loss": 0.1002, "step": 749 }, { "epoch": 1.021798365122616, "grad_norm": 1.9145178486907997, "learning_rate": 1.972874990626894e-05, "loss": 0.0707, "step": 750 }, { "epoch": 1.0231607629427792, "grad_norm": 9.755239390680499, "learning_rate": 1.9727728100351843e-05, "loss": 0.0983, "step": 751 }, { "epoch": 1.0245231607629428, "grad_norm": 8.210536603189132, "learning_rate": 1.972670440002825e-05, "loss": 0.1053, "step": 752 }, { "epoch": 1.0258855585831064, "grad_norm": 10.057261578766777, "learning_rate": 1.9725678805497507e-05, "loss": 0.0619, "step": 753 }, { "epoch": 1.0272479564032697, "grad_norm": 8.448166848130661, "learning_rate": 1.972465131695935e-05, "loss": 0.0873, "step": 754 }, { "epoch": 1.0286103542234333, "grad_norm": 2.312659773522477, "learning_rate": 1.9723621934613873e-05, "loss": 0.0665, "step": 755 }, { "epoch": 1.0299727520435966, "grad_norm": 8.29983017495308, "learning_rate": 1.9722590658661543e-05, "loss": 0.0765, "step": 756 }, { "epoch": 1.0313351498637602, "grad_norm": 5.366105513190883, "learning_rate": 1.972155748930319e-05, "loss": 0.0907, "step": 757 }, { "epoch": 1.0326975476839237, "grad_norm": 2.0733461124523243, "learning_rate": 1.9720522426740022e-05, "loss": 0.112, "step": 758 }, { "epoch": 1.034059945504087, "grad_norm": 4.124242655735262, "learning_rate": 1.9719485471173602e-05, "loss": 0.0569, "step": 759 }, { "epoch": 1.0354223433242506, "grad_norm": 5.81128194360495, "learning_rate": 1.9718446622805874e-05, "loss": 0.0666, "step": 760 }, { "epoch": 1.0367847411444142, "grad_norm": 5.482877970494191, "learning_rate": 1.9717405881839146e-05, "loss": 0.0974, "step": 761 }, { "epoch": 1.0381471389645776, "grad_norm": 4.47286563900795, "learning_rate": 1.9716363248476097e-05, "loss": 0.0796, "step": 762 }, { "epoch": 1.0395095367847411, "grad_norm": 1.649852266374812, "learning_rate": 1.971531872291977e-05, "loss": 0.0534, "step": 763 }, { "epoch": 1.0408719346049047, "grad_norm": 3.0749355223128396, "learning_rate": 1.9714272305373584e-05, "loss": 0.1026, "step": 764 }, { "epoch": 1.042234332425068, "grad_norm": 1.642471875415744, "learning_rate": 1.9713223996041312e-05, "loss": 0.0725, "step": 765 }, { "epoch": 1.0435967302452316, "grad_norm": 4.918134289119915, "learning_rate": 1.9712173795127114e-05, "loss": 0.0977, "step": 766 }, { "epoch": 1.0449591280653951, "grad_norm": 4.149019800694025, "learning_rate": 1.9711121702835504e-05, "loss": 0.0817, "step": 767 }, { "epoch": 1.0463215258855585, "grad_norm": 1.7517615671588622, "learning_rate": 1.971006771937137e-05, "loss": 0.082, "step": 768 }, { "epoch": 1.047683923705722, "grad_norm": 3.289507562063404, "learning_rate": 1.970901184493997e-05, "loss": 0.08, "step": 769 }, { "epoch": 1.0490463215258856, "grad_norm": 2.0845237746599823, "learning_rate": 1.9707954079746928e-05, "loss": 0.0718, "step": 770 }, { "epoch": 1.050408719346049, "grad_norm": 2.1482959746242978, "learning_rate": 1.970689442399823e-05, "loss": 0.0894, "step": 771 }, { "epoch": 1.0517711171662125, "grad_norm": 5.139138224279835, "learning_rate": 1.970583287790025e-05, "loss": 0.0504, "step": 772 }, { "epoch": 1.053133514986376, "grad_norm": 3.563374881993434, "learning_rate": 1.9704769441659703e-05, "loss": 0.1156, "step": 773 }, { "epoch": 1.0544959128065394, "grad_norm": 1.9266319575372137, "learning_rate": 1.9703704115483693e-05, "loss": 0.0853, "step": 774 }, { "epoch": 1.055858310626703, "grad_norm": 3.0794214015544252, "learning_rate": 1.9702636899579685e-05, "loss": 0.0754, "step": 775 }, { "epoch": 1.0572207084468666, "grad_norm": 3.6220419496204594, "learning_rate": 1.9701567794155504e-05, "loss": 0.0536, "step": 776 }, { "epoch": 1.05858310626703, "grad_norm": 3.3702220620887147, "learning_rate": 1.970049679941936e-05, "loss": 0.0802, "step": 777 }, { "epoch": 1.0599455040871935, "grad_norm": 8.392601331960222, "learning_rate": 1.969942391557982e-05, "loss": 0.0786, "step": 778 }, { "epoch": 1.061307901907357, "grad_norm": 1.7982886466505392, "learning_rate": 1.9698349142845813e-05, "loss": 0.0803, "step": 779 }, { "epoch": 1.0626702997275204, "grad_norm": 5.002991214254812, "learning_rate": 1.9697272481426655e-05, "loss": 0.0728, "step": 780 }, { "epoch": 1.064032697547684, "grad_norm": 8.415924067946152, "learning_rate": 1.969619393153201e-05, "loss": 0.0878, "step": 781 }, { "epoch": 1.0653950953678475, "grad_norm": 2.728287167107441, "learning_rate": 1.9695113493371917e-05, "loss": 0.0641, "step": 782 }, { "epoch": 1.0667574931880108, "grad_norm": 6.211182591338857, "learning_rate": 1.969403116715679e-05, "loss": 0.0799, "step": 783 }, { "epoch": 1.0681198910081744, "grad_norm": 10.980510091970453, "learning_rate": 1.96929469530974e-05, "loss": 0.0763, "step": 784 }, { "epoch": 1.069482288828338, "grad_norm": 1.6408557533853416, "learning_rate": 1.9691860851404895e-05, "loss": 0.0701, "step": 785 }, { "epoch": 1.0708446866485013, "grad_norm": 8.74465597755419, "learning_rate": 1.969077286229078e-05, "loss": 0.0465, "step": 786 }, { "epoch": 1.0722070844686649, "grad_norm": 5.631686877178989, "learning_rate": 1.9689682985966937e-05, "loss": 0.0835, "step": 787 }, { "epoch": 1.0735694822888284, "grad_norm": 1.0201320548828408, "learning_rate": 1.9688591222645607e-05, "loss": 0.054, "step": 788 }, { "epoch": 1.0749318801089918, "grad_norm": 6.860727874901393, "learning_rate": 1.9687497572539408e-05, "loss": 0.0817, "step": 789 }, { "epoch": 1.0762942779291553, "grad_norm": 7.724560549723722, "learning_rate": 1.968640203586132e-05, "loss": 0.0836, "step": 790 }, { "epoch": 1.077656675749319, "grad_norm": 5.197858536701243, "learning_rate": 1.968530461282469e-05, "loss": 0.0607, "step": 791 }, { "epoch": 1.0790190735694822, "grad_norm": 7.525417380096277, "learning_rate": 1.968420530364323e-05, "loss": 0.1007, "step": 792 }, { "epoch": 1.0803814713896458, "grad_norm": 2.5011680016373994, "learning_rate": 1.9683104108531032e-05, "loss": 0.0474, "step": 793 }, { "epoch": 1.0817438692098094, "grad_norm": 5.44997566494954, "learning_rate": 1.9682001027702533e-05, "loss": 0.0899, "step": 794 }, { "epoch": 1.0831062670299727, "grad_norm": 3.606158791951456, "learning_rate": 1.968089606137256e-05, "loss": 0.0682, "step": 795 }, { "epoch": 1.0844686648501363, "grad_norm": 1.3354056730897002, "learning_rate": 1.9679789209756296e-05, "loss": 0.0629, "step": 796 }, { "epoch": 1.0858310626702998, "grad_norm": 3.1518698001482113, "learning_rate": 1.9678680473069294e-05, "loss": 0.0757, "step": 797 }, { "epoch": 1.0871934604904632, "grad_norm": 3.1262655807468827, "learning_rate": 1.9677569851527465e-05, "loss": 0.1045, "step": 798 }, { "epoch": 1.0885558583106267, "grad_norm": 2.0073955470561313, "learning_rate": 1.9676457345347102e-05, "loss": 0.0695, "step": 799 }, { "epoch": 1.0899182561307903, "grad_norm": 1.246234227133435, "learning_rate": 1.9675342954744855e-05, "loss": 0.0941, "step": 800 }, { "epoch": 1.0912806539509536, "grad_norm": 4.097498400562215, "learning_rate": 1.967422667993774e-05, "loss": 0.0929, "step": 801 }, { "epoch": 1.0926430517711172, "grad_norm": 7.4356621923899375, "learning_rate": 1.9673108521143155e-05, "loss": 0.0736, "step": 802 }, { "epoch": 1.0940054495912808, "grad_norm": 2.693241170492599, "learning_rate": 1.9671988478578844e-05, "loss": 0.0682, "step": 803 }, { "epoch": 1.095367847411444, "grad_norm": 6.6382046874949046, "learning_rate": 1.9670866552462927e-05, "loss": 0.0826, "step": 804 }, { "epoch": 1.0967302452316077, "grad_norm": 6.085050838790314, "learning_rate": 1.9669742743013897e-05, "loss": 0.0646, "step": 805 }, { "epoch": 1.0980926430517712, "grad_norm": 8.257305484834275, "learning_rate": 1.9668617050450604e-05, "loss": 0.0574, "step": 806 }, { "epoch": 1.0994550408719346, "grad_norm": 5.547724075777579, "learning_rate": 1.9667489474992266e-05, "loss": 0.0907, "step": 807 }, { "epoch": 1.1008174386920981, "grad_norm": 10.332857508015614, "learning_rate": 1.966636001685848e-05, "loss": 0.0952, "step": 808 }, { "epoch": 1.1021798365122615, "grad_norm": 3.258893079158831, "learning_rate": 1.966522867626919e-05, "loss": 0.0841, "step": 809 }, { "epoch": 1.103542234332425, "grad_norm": 6.5983631791408675, "learning_rate": 1.9664095453444723e-05, "loss": 0.058, "step": 810 }, { "epoch": 1.1049046321525886, "grad_norm": 3.3475988669467815, "learning_rate": 1.9662960348605763e-05, "loss": 0.0815, "step": 811 }, { "epoch": 1.106267029972752, "grad_norm": 1.448643901784885, "learning_rate": 1.9661823361973362e-05, "loss": 0.0573, "step": 812 }, { "epoch": 1.1076294277929155, "grad_norm": 3.5178416252291513, "learning_rate": 1.9660684493768945e-05, "loss": 0.0626, "step": 813 }, { "epoch": 1.108991825613079, "grad_norm": 1.59629983573851, "learning_rate": 1.9659543744214296e-05, "loss": 0.0634, "step": 814 }, { "epoch": 1.1103542234332424, "grad_norm": 2.698758075732113, "learning_rate": 1.9658401113531567e-05, "loss": 0.1055, "step": 815 }, { "epoch": 1.111716621253406, "grad_norm": 3.6902121348865116, "learning_rate": 1.9657256601943278e-05, "loss": 0.0985, "step": 816 }, { "epoch": 1.1130790190735695, "grad_norm": 1.1217158958595603, "learning_rate": 1.9656110209672317e-05, "loss": 0.0716, "step": 817 }, { "epoch": 1.1144414168937329, "grad_norm": 2.941063908108425, "learning_rate": 1.965496193694193e-05, "loss": 0.0763, "step": 818 }, { "epoch": 1.1158038147138964, "grad_norm": 1.9318136605244218, "learning_rate": 1.9653811783975744e-05, "loss": 0.0473, "step": 819 }, { "epoch": 1.11716621253406, "grad_norm": 7.272198855237265, "learning_rate": 1.9652659750997734e-05, "loss": 0.1005, "step": 820 }, { "epoch": 1.1185286103542234, "grad_norm": 2.3179103599868505, "learning_rate": 1.9651505838232255e-05, "loss": 0.0698, "step": 821 }, { "epoch": 1.119891008174387, "grad_norm": 10.113233329004329, "learning_rate": 1.9650350045904023e-05, "loss": 0.106, "step": 822 }, { "epoch": 1.1212534059945505, "grad_norm": 2.3778287320026728, "learning_rate": 1.964919237423812e-05, "loss": 0.0758, "step": 823 }, { "epoch": 1.1226158038147138, "grad_norm": 9.044775532790679, "learning_rate": 1.9648032823459996e-05, "loss": 0.0706, "step": 824 }, { "epoch": 1.1239782016348774, "grad_norm": 3.380208222057502, "learning_rate": 1.964687139379546e-05, "loss": 0.0669, "step": 825 }, { "epoch": 1.125340599455041, "grad_norm": 2.7577400460004067, "learning_rate": 1.96457080854707e-05, "loss": 0.069, "step": 826 }, { "epoch": 1.1267029972752043, "grad_norm": 8.419876566158703, "learning_rate": 1.9644542898712253e-05, "loss": 0.0595, "step": 827 }, { "epoch": 1.1280653950953679, "grad_norm": 1.8661821188924146, "learning_rate": 1.964337583374704e-05, "loss": 0.0652, "step": 828 }, { "epoch": 1.1294277929155314, "grad_norm": 4.504922733283438, "learning_rate": 1.9642206890802334e-05, "loss": 0.0601, "step": 829 }, { "epoch": 1.1307901907356948, "grad_norm": 2.1430330194913063, "learning_rate": 1.964103607010578e-05, "loss": 0.0832, "step": 830 }, { "epoch": 1.1321525885558583, "grad_norm": 3.463952724060379, "learning_rate": 1.9639863371885384e-05, "loss": 0.0861, "step": 831 }, { "epoch": 1.1335149863760219, "grad_norm": 5.491910745512407, "learning_rate": 1.9638688796369524e-05, "loss": 0.0647, "step": 832 }, { "epoch": 1.1348773841961852, "grad_norm": 1.2077320310634236, "learning_rate": 1.963751234378694e-05, "loss": 0.0835, "step": 833 }, { "epoch": 1.1362397820163488, "grad_norm": 1.4583610537783986, "learning_rate": 1.9636334014366736e-05, "loss": 0.06, "step": 834 }, { "epoch": 1.1376021798365124, "grad_norm": 5.568408933025322, "learning_rate": 1.9635153808338384e-05, "loss": 0.0835, "step": 835 }, { "epoch": 1.1389645776566757, "grad_norm": 1.3735676097281666, "learning_rate": 1.963397172593172e-05, "loss": 0.0748, "step": 836 }, { "epoch": 1.1403269754768393, "grad_norm": 3.899564349384629, "learning_rate": 1.963278776737695e-05, "loss": 0.087, "step": 837 }, { "epoch": 1.1416893732970028, "grad_norm": 11.016621002312109, "learning_rate": 1.963160193290464e-05, "loss": 0.1099, "step": 838 }, { "epoch": 1.1430517711171662, "grad_norm": 7.565026907375338, "learning_rate": 1.963041422274572e-05, "loss": 0.0532, "step": 839 }, { "epoch": 1.1444141689373297, "grad_norm": 12.228255793603704, "learning_rate": 1.962922463713149e-05, "loss": 0.0943, "step": 840 }, { "epoch": 1.145776566757493, "grad_norm": 2.7431619422970757, "learning_rate": 1.9628033176293614e-05, "loss": 0.0948, "step": 841 }, { "epoch": 1.1471389645776566, "grad_norm": 10.192464182252476, "learning_rate": 1.962683984046412e-05, "loss": 0.0759, "step": 842 }, { "epoch": 1.1485013623978202, "grad_norm": 10.445511060628924, "learning_rate": 1.96256446298754e-05, "loss": 0.0787, "step": 843 }, { "epoch": 1.1498637602179835, "grad_norm": 3.685991132359721, "learning_rate": 1.9624447544760214e-05, "loss": 0.0961, "step": 844 }, { "epoch": 1.151226158038147, "grad_norm": 6.34108901765363, "learning_rate": 1.962324858535169e-05, "loss": 0.0952, "step": 845 }, { "epoch": 1.1525885558583107, "grad_norm": 3.8453314470413913, "learning_rate": 1.9622047751883312e-05, "loss": 0.0858, "step": 846 }, { "epoch": 1.153950953678474, "grad_norm": 2.788319728832846, "learning_rate": 1.9620845044588936e-05, "loss": 0.0487, "step": 847 }, { "epoch": 1.1553133514986376, "grad_norm": 2.1487234075312855, "learning_rate": 1.961964046370278e-05, "loss": 0.0724, "step": 848 }, { "epoch": 1.1566757493188011, "grad_norm": 4.920340677509674, "learning_rate": 1.9618434009459428e-05, "loss": 0.0554, "step": 849 }, { "epoch": 1.1580381471389645, "grad_norm": 3.555333062143099, "learning_rate": 1.9617225682093827e-05, "loss": 0.1032, "step": 850 }, { "epoch": 1.159400544959128, "grad_norm": 3.407926552618953, "learning_rate": 1.9616015481841293e-05, "loss": 0.1203, "step": 851 }, { "epoch": 1.1607629427792916, "grad_norm": 7.4412440624566845, "learning_rate": 1.96148034089375e-05, "loss": 0.0908, "step": 852 }, { "epoch": 1.162125340599455, "grad_norm": 2.5558555529431586, "learning_rate": 1.9613589463618498e-05, "loss": 0.0658, "step": 853 }, { "epoch": 1.1634877384196185, "grad_norm": 4.895906784891113, "learning_rate": 1.9612373646120684e-05, "loss": 0.074, "step": 854 }, { "epoch": 1.164850136239782, "grad_norm": 2.307797339034615, "learning_rate": 1.9611155956680837e-05, "loss": 0.0846, "step": 855 }, { "epoch": 1.1662125340599454, "grad_norm": 2.5684851470013195, "learning_rate": 1.960993639553609e-05, "loss": 0.0754, "step": 856 }, { "epoch": 1.167574931880109, "grad_norm": 8.69933226073892, "learning_rate": 1.960871496292395e-05, "loss": 0.0631, "step": 857 }, { "epoch": 1.1689373297002725, "grad_norm": 2.6821840586204106, "learning_rate": 1.9607491659082276e-05, "loss": 0.0563, "step": 858 }, { "epoch": 1.1702997275204359, "grad_norm": 5.2835458108009865, "learning_rate": 1.9606266484249298e-05, "loss": 0.0605, "step": 859 }, { "epoch": 1.1716621253405994, "grad_norm": 2.8485931506092492, "learning_rate": 1.9605039438663614e-05, "loss": 0.0886, "step": 860 }, { "epoch": 1.173024523160763, "grad_norm": 2.796350301751833, "learning_rate": 1.960381052256418e-05, "loss": 0.1059, "step": 861 }, { "epoch": 1.1743869209809263, "grad_norm": 3.3720715925767424, "learning_rate": 1.9602579736190318e-05, "loss": 0.0691, "step": 862 }, { "epoch": 1.17574931880109, "grad_norm": 2.136247521270464, "learning_rate": 1.9601347079781718e-05, "loss": 0.0749, "step": 863 }, { "epoch": 1.1771117166212535, "grad_norm": 3.5046056941812616, "learning_rate": 1.960011255357843e-05, "loss": 0.1129, "step": 864 }, { "epoch": 1.1784741144414168, "grad_norm": 3.1410557238685786, "learning_rate": 1.9598876157820867e-05, "loss": 0.0778, "step": 865 }, { "epoch": 1.1798365122615804, "grad_norm": 1.0607885741790217, "learning_rate": 1.959763789274981e-05, "loss": 0.0911, "step": 866 }, { "epoch": 1.181198910081744, "grad_norm": 2.556008671464506, "learning_rate": 1.9596397758606403e-05, "loss": 0.0909, "step": 867 }, { "epoch": 1.1825613079019073, "grad_norm": 2.045680579751207, "learning_rate": 1.959515575563215e-05, "loss": 0.1009, "step": 868 }, { "epoch": 1.1839237057220708, "grad_norm": 5.74815444666223, "learning_rate": 1.959391188406893e-05, "loss": 0.0721, "step": 869 }, { "epoch": 1.1852861035422344, "grad_norm": 3.007815469771393, "learning_rate": 1.959266614415897e-05, "loss": 0.0818, "step": 870 }, { "epoch": 1.1866485013623977, "grad_norm": 6.187515162005115, "learning_rate": 1.9591418536144875e-05, "loss": 0.0768, "step": 871 }, { "epoch": 1.1880108991825613, "grad_norm": 5.743700833028989, "learning_rate": 1.9590169060269602e-05, "loss": 0.0893, "step": 872 }, { "epoch": 1.1893732970027249, "grad_norm": 1.861027090963107, "learning_rate": 1.9588917716776486e-05, "loss": 0.1036, "step": 873 }, { "epoch": 1.1907356948228882, "grad_norm": 10.170510578028368, "learning_rate": 1.9587664505909205e-05, "loss": 0.109, "step": 874 }, { "epoch": 1.1920980926430518, "grad_norm": 1.1590491527722486, "learning_rate": 1.9586409427911827e-05, "loss": 0.0946, "step": 875 }, { "epoch": 1.1934604904632153, "grad_norm": 6.7379915217634485, "learning_rate": 1.958515248302876e-05, "loss": 0.0854, "step": 876 }, { "epoch": 1.1948228882833787, "grad_norm": 3.1704907206746804, "learning_rate": 1.9583893671504784e-05, "loss": 0.0521, "step": 877 }, { "epoch": 1.1961852861035422, "grad_norm": 1.4569765238836787, "learning_rate": 1.9582632993585055e-05, "loss": 0.0793, "step": 878 }, { "epoch": 1.1975476839237058, "grad_norm": 3.0304821098228705, "learning_rate": 1.958137044951507e-05, "loss": 0.0863, "step": 879 }, { "epoch": 1.1989100817438691, "grad_norm": 8.613318257913436, "learning_rate": 1.9580106039540703e-05, "loss": 0.0923, "step": 880 }, { "epoch": 1.2002724795640327, "grad_norm": 2.407218453125926, "learning_rate": 1.9578839763908193e-05, "loss": 0.1066, "step": 881 }, { "epoch": 1.2016348773841963, "grad_norm": 3.0112224952489037, "learning_rate": 1.9577571622864134e-05, "loss": 0.0745, "step": 882 }, { "epoch": 1.2029972752043596, "grad_norm": 4.305880458496674, "learning_rate": 1.957630161665549e-05, "loss": 0.0629, "step": 883 }, { "epoch": 1.2043596730245232, "grad_norm": 3.6675510130474054, "learning_rate": 1.9575029745529585e-05, "loss": 0.1005, "step": 884 }, { "epoch": 1.2057220708446867, "grad_norm": 2.017332238888648, "learning_rate": 1.9573756009734104e-05, "loss": 0.0813, "step": 885 }, { "epoch": 1.20708446866485, "grad_norm": 2.390061403533871, "learning_rate": 1.9572480409517104e-05, "loss": 0.0797, "step": 886 }, { "epoch": 1.2084468664850136, "grad_norm": 2.0443035551106314, "learning_rate": 1.9571202945126995e-05, "loss": 0.0884, "step": 887 }, { "epoch": 1.2098092643051772, "grad_norm": 1.4629195823197758, "learning_rate": 1.9569923616812553e-05, "loss": 0.0819, "step": 888 }, { "epoch": 1.2111716621253406, "grad_norm": 2.1513551788838368, "learning_rate": 1.9568642424822926e-05, "loss": 0.0587, "step": 889 }, { "epoch": 1.2125340599455041, "grad_norm": 1.935822219811811, "learning_rate": 1.9567359369407605e-05, "loss": 0.1019, "step": 890 }, { "epoch": 1.2138964577656677, "grad_norm": 1.8201907604373893, "learning_rate": 1.956607445081647e-05, "loss": 0.0798, "step": 891 }, { "epoch": 1.215258855585831, "grad_norm": 3.4492490493451538, "learning_rate": 1.9564787669299733e-05, "loss": 0.0889, "step": 892 }, { "epoch": 1.2166212534059946, "grad_norm": 3.243519050755273, "learning_rate": 1.9563499025108e-05, "loss": 0.087, "step": 893 }, { "epoch": 1.2179836512261581, "grad_norm": 5.43397508699375, "learning_rate": 1.956220851849222e-05, "loss": 0.0895, "step": 894 }, { "epoch": 1.2193460490463215, "grad_norm": 3.1163049520720354, "learning_rate": 1.956091614970371e-05, "loss": 0.0963, "step": 895 }, { "epoch": 1.220708446866485, "grad_norm": 8.385849250758312, "learning_rate": 1.955962191899415e-05, "loss": 0.0648, "step": 896 }, { "epoch": 1.2220708446866486, "grad_norm": 2.300009017001881, "learning_rate": 1.9558325826615584e-05, "loss": 0.0985, "step": 897 }, { "epoch": 1.223433242506812, "grad_norm": 2.9354722577319023, "learning_rate": 1.9557027872820415e-05, "loss": 0.0966, "step": 898 }, { "epoch": 1.2247956403269755, "grad_norm": 4.14904194307782, "learning_rate": 1.955572805786141e-05, "loss": 0.1302, "step": 899 }, { "epoch": 1.226158038147139, "grad_norm": 6.589874054301804, "learning_rate": 1.9554426381991698e-05, "loss": 0.0745, "step": 900 }, { "epoch": 1.2275204359673024, "grad_norm": 6.170122642333146, "learning_rate": 1.9553122845464775e-05, "loss": 0.0984, "step": 901 }, { "epoch": 1.228882833787466, "grad_norm": 6.453717688507663, "learning_rate": 1.955181744853449e-05, "loss": 0.0957, "step": 902 }, { "epoch": 1.2302452316076296, "grad_norm": 3.8491440490363114, "learning_rate": 1.9550510191455066e-05, "loss": 0.0755, "step": 903 }, { "epoch": 1.231607629427793, "grad_norm": 8.381300974505669, "learning_rate": 1.954920107448108e-05, "loss": 0.0767, "step": 904 }, { "epoch": 1.2329700272479565, "grad_norm": 2.8716550057866908, "learning_rate": 1.9547890097867466e-05, "loss": 0.0671, "step": 905 }, { "epoch": 1.2343324250681198, "grad_norm": 9.226675156932588, "learning_rate": 1.954657726186954e-05, "loss": 0.0738, "step": 906 }, { "epoch": 1.2356948228882834, "grad_norm": 2.902761440718107, "learning_rate": 1.9545262566742962e-05, "loss": 0.0991, "step": 907 }, { "epoch": 1.237057220708447, "grad_norm": 1.7343076897662277, "learning_rate": 1.9543946012743757e-05, "loss": 0.0703, "step": 908 }, { "epoch": 1.2384196185286103, "grad_norm": 6.386698160860144, "learning_rate": 1.9542627600128314e-05, "loss": 0.1038, "step": 909 }, { "epoch": 1.2397820163487738, "grad_norm": 3.662089765077342, "learning_rate": 1.9541307329153393e-05, "loss": 0.0574, "step": 910 }, { "epoch": 1.2411444141689374, "grad_norm": 2.3408144376985276, "learning_rate": 1.9539985200076096e-05, "loss": 0.0648, "step": 911 }, { "epoch": 1.2425068119891007, "grad_norm": 7.757355480889283, "learning_rate": 1.953866121315391e-05, "loss": 0.1105, "step": 912 }, { "epoch": 1.2438692098092643, "grad_norm": 5.513677605713362, "learning_rate": 1.9537335368644662e-05, "loss": 0.1093, "step": 913 }, { "epoch": 1.2452316076294279, "grad_norm": 9.71834081538234, "learning_rate": 1.9536007666806555e-05, "loss": 0.0693, "step": 914 }, { "epoch": 1.2465940054495912, "grad_norm": 5.258032365500282, "learning_rate": 1.953467810789815e-05, "loss": 0.0897, "step": 915 }, { "epoch": 1.2479564032697548, "grad_norm": 8.628666896348612, "learning_rate": 1.953334669217837e-05, "loss": 0.088, "step": 916 }, { "epoch": 1.2493188010899183, "grad_norm": 2.2519929012030935, "learning_rate": 1.9532013419906498e-05, "loss": 0.065, "step": 917 }, { "epoch": 1.2506811989100817, "grad_norm": 6.398719612320797, "learning_rate": 1.953067829134218e-05, "loss": 0.0935, "step": 918 }, { "epoch": 1.2520435967302452, "grad_norm": 6.776199750851196, "learning_rate": 1.9529341306745418e-05, "loss": 0.0837, "step": 919 }, { "epoch": 1.2534059945504088, "grad_norm": 2.5658602732850437, "learning_rate": 1.9528002466376587e-05, "loss": 0.0655, "step": 920 }, { "epoch": 1.2547683923705721, "grad_norm": 4.82909598597987, "learning_rate": 1.9526661770496413e-05, "loss": 0.0877, "step": 921 }, { "epoch": 1.2561307901907357, "grad_norm": 5.688627882087246, "learning_rate": 1.9525319219365988e-05, "loss": 0.0643, "step": 922 }, { "epoch": 1.2574931880108993, "grad_norm": 3.355448869513845, "learning_rate": 1.9523974813246765e-05, "loss": 0.0763, "step": 923 }, { "epoch": 1.2588555858310626, "grad_norm": 1.5905909525170288, "learning_rate": 1.9522628552400558e-05, "loss": 0.0746, "step": 924 }, { "epoch": 1.2602179836512262, "grad_norm": 2.4758415266190728, "learning_rate": 1.952128043708954e-05, "loss": 0.0899, "step": 925 }, { "epoch": 1.2615803814713895, "grad_norm": 1.624968195556145, "learning_rate": 1.9519930467576246e-05, "loss": 0.082, "step": 926 }, { "epoch": 1.262942779291553, "grad_norm": 3.3146529337738118, "learning_rate": 1.951857864412358e-05, "loss": 0.1114, "step": 927 }, { "epoch": 1.2643051771117166, "grad_norm": 2.0335936530992784, "learning_rate": 1.951722496699479e-05, "loss": 0.0789, "step": 928 }, { "epoch": 1.26566757493188, "grad_norm": 1.6923295260032463, "learning_rate": 1.9515869436453502e-05, "loss": 0.0885, "step": 929 }, { "epoch": 1.2670299727520435, "grad_norm": 3.432587156531951, "learning_rate": 1.9514512052763695e-05, "loss": 0.0915, "step": 930 }, { "epoch": 1.268392370572207, "grad_norm": 5.45916783880437, "learning_rate": 1.9513152816189706e-05, "loss": 0.0847, "step": 931 }, { "epoch": 1.2697547683923704, "grad_norm": 2.250570300131813, "learning_rate": 1.9511791726996243e-05, "loss": 0.0863, "step": 932 }, { "epoch": 1.271117166212534, "grad_norm": 4.434257181497001, "learning_rate": 1.9510428785448362e-05, "loss": 0.0532, "step": 933 }, { "epoch": 1.2724795640326976, "grad_norm": 3.1140421326818397, "learning_rate": 1.9509063991811493e-05, "loss": 0.0589, "step": 934 }, { "epoch": 1.273841961852861, "grad_norm": 2.56782095205393, "learning_rate": 1.9507697346351414e-05, "loss": 0.054, "step": 935 }, { "epoch": 1.2752043596730245, "grad_norm": 5.1532051768192275, "learning_rate": 1.950632884933427e-05, "loss": 0.0674, "step": 936 }, { "epoch": 1.276566757493188, "grad_norm": 1.239347826512451, "learning_rate": 1.9504958501026574e-05, "loss": 0.0507, "step": 937 }, { "epoch": 1.2779291553133514, "grad_norm": 1.3629710462277211, "learning_rate": 1.9503586301695185e-05, "loss": 0.071, "step": 938 }, { "epoch": 1.279291553133515, "grad_norm": 1.2612181034984569, "learning_rate": 1.9502212251607326e-05, "loss": 0.0639, "step": 939 }, { "epoch": 1.2806539509536785, "grad_norm": 4.316338298587417, "learning_rate": 1.950083635103059e-05, "loss": 0.0569, "step": 940 }, { "epoch": 1.2820163487738419, "grad_norm": 4.450378291619361, "learning_rate": 1.949945860023292e-05, "loss": 0.0609, "step": 941 }, { "epoch": 1.2833787465940054, "grad_norm": 6.666268160025328, "learning_rate": 1.9498078999482627e-05, "loss": 0.1164, "step": 942 }, { "epoch": 1.284741144414169, "grad_norm": 4.775215209330589, "learning_rate": 1.9496697549048377e-05, "loss": 0.081, "step": 943 }, { "epoch": 1.2861035422343323, "grad_norm": 1.3200095375621372, "learning_rate": 1.949531424919919e-05, "loss": 0.0938, "step": 944 }, { "epoch": 1.2874659400544959, "grad_norm": 1.612306322272251, "learning_rate": 1.949392910020447e-05, "loss": 0.0813, "step": 945 }, { "epoch": 1.2888283378746594, "grad_norm": 7.774486023848463, "learning_rate": 1.9492542102333952e-05, "loss": 0.0799, "step": 946 }, { "epoch": 1.2901907356948228, "grad_norm": 2.875543427553611, "learning_rate": 1.949115325585775e-05, "loss": 0.0741, "step": 947 }, { "epoch": 1.2915531335149864, "grad_norm": 3.974917413827015, "learning_rate": 1.9489762561046327e-05, "loss": 0.0749, "step": 948 }, { "epoch": 1.29291553133515, "grad_norm": 6.229949576240109, "learning_rate": 1.9488370018170516e-05, "loss": 0.091, "step": 949 }, { "epoch": 1.2942779291553133, "grad_norm": 3.1629955952877506, "learning_rate": 1.9486975627501503e-05, "loss": 0.0755, "step": 950 }, { "epoch": 1.2956403269754768, "grad_norm": 7.242403446429549, "learning_rate": 1.9485579389310835e-05, "loss": 0.0518, "step": 951 }, { "epoch": 1.2970027247956404, "grad_norm": 5.261227647392252, "learning_rate": 1.948418130387042e-05, "loss": 0.0764, "step": 952 }, { "epoch": 1.2983651226158037, "grad_norm": 6.490860109843713, "learning_rate": 1.9482781371452527e-05, "loss": 0.0729, "step": 953 }, { "epoch": 1.2997275204359673, "grad_norm": 4.908360155284981, "learning_rate": 1.948137959232978e-05, "loss": 0.0901, "step": 954 }, { "epoch": 1.3010899182561309, "grad_norm": 2.913259078495356, "learning_rate": 1.9479975966775167e-05, "loss": 0.0723, "step": 955 }, { "epoch": 1.3024523160762942, "grad_norm": 3.439924924985139, "learning_rate": 1.9478570495062038e-05, "loss": 0.0933, "step": 956 }, { "epoch": 1.3038147138964578, "grad_norm": 1.4633967895996525, "learning_rate": 1.947716317746409e-05, "loss": 0.0882, "step": 957 }, { "epoch": 1.3051771117166213, "grad_norm": 4.581314275203176, "learning_rate": 1.9475754014255397e-05, "loss": 0.1188, "step": 958 }, { "epoch": 1.3065395095367847, "grad_norm": 7.827678206424413, "learning_rate": 1.947434300571038e-05, "loss": 0.0676, "step": 959 }, { "epoch": 1.3079019073569482, "grad_norm": 6.266253137531944, "learning_rate": 1.947293015210382e-05, "loss": 0.0762, "step": 960 }, { "epoch": 1.3092643051771118, "grad_norm": 4.778949543214348, "learning_rate": 1.9471515453710866e-05, "loss": 0.0693, "step": 961 }, { "epoch": 1.3106267029972751, "grad_norm": 11.935150348156627, "learning_rate": 1.9470098910807016e-05, "loss": 0.1027, "step": 962 }, { "epoch": 1.3119891008174387, "grad_norm": 2.3841209667243137, "learning_rate": 1.9468680523668136e-05, "loss": 0.0539, "step": 963 }, { "epoch": 1.3133514986376023, "grad_norm": 11.176131048170644, "learning_rate": 1.946726029257044e-05, "loss": 0.1068, "step": 964 }, { "epoch": 1.3147138964577656, "grad_norm": 3.506422600497175, "learning_rate": 1.9465838217790517e-05, "loss": 0.0737, "step": 965 }, { "epoch": 1.3160762942779292, "grad_norm": 6.818905864894303, "learning_rate": 1.9464414299605295e-05, "loss": 0.0776, "step": 966 }, { "epoch": 1.3174386920980927, "grad_norm": 12.648201635438513, "learning_rate": 1.9462988538292084e-05, "loss": 0.0945, "step": 967 }, { "epoch": 1.318801089918256, "grad_norm": 3.1357791831584847, "learning_rate": 1.9461560934128535e-05, "loss": 0.0602, "step": 968 }, { "epoch": 1.3201634877384196, "grad_norm": 11.540328430211982, "learning_rate": 1.9460131487392663e-05, "loss": 0.0806, "step": 969 }, { "epoch": 1.3215258855585832, "grad_norm": 8.79191621923743, "learning_rate": 1.945870019836284e-05, "loss": 0.0815, "step": 970 }, { "epoch": 1.3228882833787465, "grad_norm": 7.887196854927041, "learning_rate": 1.945726706731781e-05, "loss": 0.0668, "step": 971 }, { "epoch": 1.32425068119891, "grad_norm": 12.399397795500258, "learning_rate": 1.9455832094536658e-05, "loss": 0.0745, "step": 972 }, { "epoch": 1.3256130790190737, "grad_norm": 3.6909823377419215, "learning_rate": 1.945439528029883e-05, "loss": 0.0982, "step": 973 }, { "epoch": 1.326975476839237, "grad_norm": 9.378051780362448, "learning_rate": 1.9452956624884145e-05, "loss": 0.1087, "step": 974 }, { "epoch": 1.3283378746594006, "grad_norm": 3.061331885260218, "learning_rate": 1.9451516128572766e-05, "loss": 0.0685, "step": 975 }, { "epoch": 1.3297002724795641, "grad_norm": 3.4011609053427834, "learning_rate": 1.945007379164522e-05, "loss": 0.0875, "step": 976 }, { "epoch": 1.3310626702997275, "grad_norm": 1.9334467683733956, "learning_rate": 1.9448629614382394e-05, "loss": 0.0614, "step": 977 }, { "epoch": 1.332425068119891, "grad_norm": 3.7798988387754795, "learning_rate": 1.9447183597065528e-05, "loss": 0.0764, "step": 978 }, { "epoch": 1.3337874659400546, "grad_norm": 2.7456862103619457, "learning_rate": 1.9445735739976224e-05, "loss": 0.0735, "step": 979 }, { "epoch": 1.335149863760218, "grad_norm": 8.739886086057323, "learning_rate": 1.9444286043396447e-05, "loss": 0.1152, "step": 980 }, { "epoch": 1.3365122615803815, "grad_norm": 1.6741703245011028, "learning_rate": 1.9442834507608505e-05, "loss": 0.0609, "step": 981 }, { "epoch": 1.337874659400545, "grad_norm": 5.440953606591588, "learning_rate": 1.9441381132895086e-05, "loss": 0.0561, "step": 982 }, { "epoch": 1.3392370572207084, "grad_norm": 5.665892442260272, "learning_rate": 1.943992591953922e-05, "loss": 0.096, "step": 983 }, { "epoch": 1.340599455040872, "grad_norm": 2.5353481812605314, "learning_rate": 1.9438468867824295e-05, "loss": 0.0613, "step": 984 }, { "epoch": 1.3419618528610355, "grad_norm": 6.879393049910721, "learning_rate": 1.943700997803407e-05, "loss": 0.0787, "step": 985 }, { "epoch": 1.3433242506811989, "grad_norm": 2.420356096043858, "learning_rate": 1.9435549250452644e-05, "loss": 0.0642, "step": 986 }, { "epoch": 1.3446866485013624, "grad_norm": 6.154515865527683, "learning_rate": 1.9434086685364494e-05, "loss": 0.0756, "step": 987 }, { "epoch": 1.346049046321526, "grad_norm": 6.407171608695472, "learning_rate": 1.9432622283054434e-05, "loss": 0.0618, "step": 988 }, { "epoch": 1.3474114441416893, "grad_norm": 3.2561867762573486, "learning_rate": 1.9431156043807654e-05, "loss": 0.0457, "step": 989 }, { "epoch": 1.348773841961853, "grad_norm": 4.973935118150674, "learning_rate": 1.9429687967909688e-05, "loss": 0.1158, "step": 990 }, { "epoch": 1.3501362397820165, "grad_norm": 2.4675003284286796, "learning_rate": 1.9428218055646437e-05, "loss": 0.0388, "step": 991 }, { "epoch": 1.3514986376021798, "grad_norm": 1.09168583997992, "learning_rate": 1.9426746307304154e-05, "loss": 0.0529, "step": 992 }, { "epoch": 1.3528610354223434, "grad_norm": 2.077700110899031, "learning_rate": 1.942527272316945e-05, "loss": 0.0583, "step": 993 }, { "epoch": 1.354223433242507, "grad_norm": 2.3320554194959002, "learning_rate": 1.94237973035293e-05, "loss": 0.0777, "step": 994 }, { "epoch": 1.3555858310626703, "grad_norm": 4.142108347769864, "learning_rate": 1.942232004867103e-05, "loss": 0.0478, "step": 995 }, { "epoch": 1.3569482288828338, "grad_norm": 3.0136916344609905, "learning_rate": 1.9420840958882325e-05, "loss": 0.1015, "step": 996 }, { "epoch": 1.3583106267029974, "grad_norm": 4.8472255414608565, "learning_rate": 1.9419360034451225e-05, "loss": 0.0624, "step": 997 }, { "epoch": 1.3596730245231607, "grad_norm": 4.320831058697092, "learning_rate": 1.941787727566613e-05, "loss": 0.0933, "step": 998 }, { "epoch": 1.3610354223433243, "grad_norm": 2.509424958447901, "learning_rate": 1.94163926828158e-05, "loss": 0.0764, "step": 999 }, { "epoch": 1.3623978201634879, "grad_norm": 10.17179292165287, "learning_rate": 1.9414906256189346e-05, "loss": 0.0745, "step": 1000 }, { "epoch": 1.3637602179836512, "grad_norm": 4.071783480348075, "learning_rate": 1.9413417996076244e-05, "loss": 0.0808, "step": 1001 }, { "epoch": 1.3651226158038148, "grad_norm": 11.316994415729797, "learning_rate": 1.941192790276631e-05, "loss": 0.0634, "step": 1002 }, { "epoch": 1.3664850136239783, "grad_norm": 6.599581443778056, "learning_rate": 1.9410435976549742e-05, "loss": 0.0693, "step": 1003 }, { "epoch": 1.3678474114441417, "grad_norm": 3.369059904867828, "learning_rate": 1.940894221771708e-05, "loss": 0.124, "step": 1004 }, { "epoch": 1.3692098092643052, "grad_norm": 11.300261786091623, "learning_rate": 1.9407446626559215e-05, "loss": 0.0423, "step": 1005 }, { "epoch": 1.3705722070844686, "grad_norm": 3.2538299335280834, "learning_rate": 1.940594920336741e-05, "loss": 0.0561, "step": 1006 }, { "epoch": 1.3719346049046321, "grad_norm": 6.701450974391131, "learning_rate": 1.9404449948433278e-05, "loss": 0.0908, "step": 1007 }, { "epoch": 1.3732970027247957, "grad_norm": 5.0789700987791955, "learning_rate": 1.9402948862048786e-05, "loss": 0.0398, "step": 1008 }, { "epoch": 1.374659400544959, "grad_norm": 1.3815095457597593, "learning_rate": 1.940144594450626e-05, "loss": 0.0964, "step": 1009 }, { "epoch": 1.3760217983651226, "grad_norm": 4.844601842860153, "learning_rate": 1.9399941196098382e-05, "loss": 0.0517, "step": 1010 }, { "epoch": 1.3773841961852862, "grad_norm": 2.627093434705396, "learning_rate": 1.939843461711819e-05, "loss": 0.0626, "step": 1011 }, { "epoch": 1.3787465940054495, "grad_norm": 4.610454519968288, "learning_rate": 1.9396926207859085e-05, "loss": 0.0596, "step": 1012 }, { "epoch": 1.380108991825613, "grad_norm": 4.803724674192375, "learning_rate": 1.9395415968614815e-05, "loss": 0.0621, "step": 1013 }, { "epoch": 1.3814713896457766, "grad_norm": 12.792755601856667, "learning_rate": 1.939390389967949e-05, "loss": 0.0729, "step": 1014 }, { "epoch": 1.38283378746594, "grad_norm": 3.1568628579741325, "learning_rate": 1.9392390001347572e-05, "loss": 0.0431, "step": 1015 }, { "epoch": 1.3841961852861036, "grad_norm": 10.527428639586748, "learning_rate": 1.9390874273913884e-05, "loss": 0.0763, "step": 1016 }, { "epoch": 1.385558583106267, "grad_norm": 9.903042552230566, "learning_rate": 1.9389356717673605e-05, "loss": 0.074, "step": 1017 }, { "epoch": 1.3869209809264305, "grad_norm": 6.878464337379117, "learning_rate": 1.9387837332922267e-05, "loss": 0.0674, "step": 1018 }, { "epoch": 1.388283378746594, "grad_norm": 12.633600097057782, "learning_rate": 1.9386316119955757e-05, "loss": 0.06, "step": 1019 }, { "epoch": 1.3896457765667574, "grad_norm": 2.8874763020058656, "learning_rate": 1.938479307907032e-05, "loss": 0.038, "step": 1020 }, { "epoch": 1.391008174386921, "grad_norm": 9.066150312936784, "learning_rate": 1.9383268210562565e-05, "loss": 0.0586, "step": 1021 }, { "epoch": 1.3923705722070845, "grad_norm": 10.724284356110974, "learning_rate": 1.9381741514729443e-05, "loss": 0.0776, "step": 1022 }, { "epoch": 1.3937329700272478, "grad_norm": 2.1833849869227944, "learning_rate": 1.938021299186827e-05, "loss": 0.0736, "step": 1023 }, { "epoch": 1.3950953678474114, "grad_norm": 14.645493190072827, "learning_rate": 1.937868264227671e-05, "loss": 0.1159, "step": 1024 }, { "epoch": 1.396457765667575, "grad_norm": 3.4525851825346607, "learning_rate": 1.9377150466252797e-05, "loss": 0.0664, "step": 1025 }, { "epoch": 1.3978201634877383, "grad_norm": 4.324798000038021, "learning_rate": 1.9375616464094903e-05, "loss": 0.0583, "step": 1026 }, { "epoch": 1.3991825613079019, "grad_norm": 9.775562345482763, "learning_rate": 1.9374080636101764e-05, "loss": 0.066, "step": 1027 }, { "epoch": 1.4005449591280654, "grad_norm": 2.259457262623326, "learning_rate": 1.937254298257248e-05, "loss": 0.0417, "step": 1028 }, { "epoch": 1.4019073569482288, "grad_norm": 5.958684221491279, "learning_rate": 1.9371003503806492e-05, "loss": 0.0703, "step": 1029 }, { "epoch": 1.4032697547683923, "grad_norm": 5.518237121860196, "learning_rate": 1.9369462200103603e-05, "loss": 0.1029, "step": 1030 }, { "epoch": 1.404632152588556, "grad_norm": 7.6865126960740895, "learning_rate": 1.936791907176397e-05, "loss": 0.0815, "step": 1031 }, { "epoch": 1.4059945504087192, "grad_norm": 5.579664870178218, "learning_rate": 1.936637411908811e-05, "loss": 0.0943, "step": 1032 }, { "epoch": 1.4073569482288828, "grad_norm": 2.9794319630334107, "learning_rate": 1.936482734237689e-05, "loss": 0.0959, "step": 1033 }, { "epoch": 1.4087193460490464, "grad_norm": 6.353682589809565, "learning_rate": 1.9363278741931532e-05, "loss": 0.0997, "step": 1034 }, { "epoch": 1.4100817438692097, "grad_norm": 3.5433053209209775, "learning_rate": 1.9361728318053616e-05, "loss": 0.0875, "step": 1035 }, { "epoch": 1.4114441416893733, "grad_norm": 3.159260517169329, "learning_rate": 1.9360176071045077e-05, "loss": 0.0498, "step": 1036 }, { "epoch": 1.4128065395095368, "grad_norm": 8.65094316681781, "learning_rate": 1.9358622001208206e-05, "loss": 0.0919, "step": 1037 }, { "epoch": 1.4141689373297002, "grad_norm": 4.441675970574813, "learning_rate": 1.9357066108845646e-05, "loss": 0.047, "step": 1038 }, { "epoch": 1.4155313351498637, "grad_norm": 6.634190403941709, "learning_rate": 1.9355508394260393e-05, "loss": 0.0851, "step": 1039 }, { "epoch": 1.4168937329700273, "grad_norm": 1.190508586581111, "learning_rate": 1.93539488577558e-05, "loss": 0.0643, "step": 1040 }, { "epoch": 1.4182561307901906, "grad_norm": 4.432596224701426, "learning_rate": 1.9352387499635586e-05, "loss": 0.0591, "step": 1041 }, { "epoch": 1.4196185286103542, "grad_norm": 3.37383142008818, "learning_rate": 1.9350824320203804e-05, "loss": 0.0456, "step": 1042 }, { "epoch": 1.4209809264305178, "grad_norm": 2.4015979961789324, "learning_rate": 1.9349259319764874e-05, "loss": 0.0813, "step": 1043 }, { "epoch": 1.422343324250681, "grad_norm": 5.648600087813398, "learning_rate": 1.9347692498623572e-05, "loss": 0.0661, "step": 1044 }, { "epoch": 1.4237057220708447, "grad_norm": 2.601365349017416, "learning_rate": 1.9346123857085023e-05, "loss": 0.0687, "step": 1045 }, { "epoch": 1.4250681198910082, "grad_norm": 6.929132089240374, "learning_rate": 1.934455339545471e-05, "loss": 0.0573, "step": 1046 }, { "epoch": 1.4264305177111716, "grad_norm": 1.77193023616635, "learning_rate": 1.9342981114038467e-05, "loss": 0.0681, "step": 1047 }, { "epoch": 1.4277929155313351, "grad_norm": 3.184102746772765, "learning_rate": 1.934140701314249e-05, "loss": 0.0828, "step": 1048 }, { "epoch": 1.4291553133514987, "grad_norm": 2.6520911608416324, "learning_rate": 1.9339831093073317e-05, "loss": 0.0678, "step": 1049 }, { "epoch": 1.430517711171662, "grad_norm": 1.956117473446215, "learning_rate": 1.9338253354137854e-05, "loss": 0.0684, "step": 1050 }, { "epoch": 1.4318801089918256, "grad_norm": 2.696991473548868, "learning_rate": 1.933667379664335e-05, "loss": 0.0828, "step": 1051 }, { "epoch": 1.4332425068119892, "grad_norm": 2.1490456971949943, "learning_rate": 1.9335092420897416e-05, "loss": 0.0468, "step": 1052 }, { "epoch": 1.4346049046321525, "grad_norm": 3.251517804647737, "learning_rate": 1.933350922720801e-05, "loss": 0.0475, "step": 1053 }, { "epoch": 1.435967302452316, "grad_norm": 2.5715131243608544, "learning_rate": 1.933192421588345e-05, "loss": 0.0429, "step": 1054 }, { "epoch": 1.4373297002724796, "grad_norm": 1.2475227892116032, "learning_rate": 1.9330337387232406e-05, "loss": 0.0681, "step": 1055 }, { "epoch": 1.438692098092643, "grad_norm": 2.4311265218816813, "learning_rate": 1.9328748741563903e-05, "loss": 0.051, "step": 1056 }, { "epoch": 1.4400544959128065, "grad_norm": 1.5786841214771572, "learning_rate": 1.9327158279187313e-05, "loss": 0.0801, "step": 1057 }, { "epoch": 1.44141689373297, "grad_norm": 2.019962392496165, "learning_rate": 1.9325566000412377e-05, "loss": 0.0771, "step": 1058 }, { "epoch": 1.4427792915531334, "grad_norm": 4.148866303088696, "learning_rate": 1.932397190554917e-05, "loss": 0.0737, "step": 1059 }, { "epoch": 1.444141689373297, "grad_norm": 1.435665202779798, "learning_rate": 1.9322375994908137e-05, "loss": 0.0684, "step": 1060 }, { "epoch": 1.4455040871934606, "grad_norm": 3.7969858392095097, "learning_rate": 1.9320778268800068e-05, "loss": 0.0558, "step": 1061 }, { "epoch": 1.446866485013624, "grad_norm": 1.968993894608525, "learning_rate": 1.9319178727536107e-05, "loss": 0.0902, "step": 1062 }, { "epoch": 1.4482288828337875, "grad_norm": 2.241456653606234, "learning_rate": 1.931757737142776e-05, "loss": 0.0614, "step": 1063 }, { "epoch": 1.449591280653951, "grad_norm": 3.830948062783184, "learning_rate": 1.931597420078687e-05, "loss": 0.0929, "step": 1064 }, { "epoch": 1.4509536784741144, "grad_norm": 4.462705551372664, "learning_rate": 1.9314369215925654e-05, "loss": 0.059, "step": 1065 }, { "epoch": 1.452316076294278, "grad_norm": 1.35838640773932, "learning_rate": 1.9312762417156664e-05, "loss": 0.0431, "step": 1066 }, { "epoch": 1.4536784741144415, "grad_norm": 4.377794890795123, "learning_rate": 1.9311153804792814e-05, "loss": 0.0818, "step": 1067 }, { "epoch": 1.4550408719346049, "grad_norm": 2.2752743321078426, "learning_rate": 1.930954337914737e-05, "loss": 0.0892, "step": 1068 }, { "epoch": 1.4564032697547684, "grad_norm": 2.2203719933894206, "learning_rate": 1.9307931140533954e-05, "loss": 0.0779, "step": 1069 }, { "epoch": 1.457765667574932, "grad_norm": 4.55944473481953, "learning_rate": 1.9306317089266534e-05, "loss": 0.0972, "step": 1070 }, { "epoch": 1.4591280653950953, "grad_norm": 4.486950149302442, "learning_rate": 1.9304701225659438e-05, "loss": 0.0514, "step": 1071 }, { "epoch": 1.4604904632152589, "grad_norm": 1.9897692229150379, "learning_rate": 1.9303083550027344e-05, "loss": 0.045, "step": 1072 }, { "epoch": 1.4618528610354224, "grad_norm": 3.2525988363646783, "learning_rate": 1.9301464062685286e-05, "loss": 0.0613, "step": 1073 }, { "epoch": 1.4632152588555858, "grad_norm": 1.602633880970361, "learning_rate": 1.9299842763948637e-05, "loss": 0.0189, "step": 1074 }, { "epoch": 1.4645776566757494, "grad_norm": 2.3450567958005615, "learning_rate": 1.9298219654133143e-05, "loss": 0.0448, "step": 1075 }, { "epoch": 1.465940054495913, "grad_norm": 1.847703777481898, "learning_rate": 1.929659473355489e-05, "loss": 0.0863, "step": 1076 }, { "epoch": 1.4673024523160763, "grad_norm": 4.5836895549380126, "learning_rate": 1.9294968002530324e-05, "loss": 0.0502, "step": 1077 }, { "epoch": 1.4686648501362398, "grad_norm": 2.2748824574143263, "learning_rate": 1.929333946137623e-05, "loss": 0.0867, "step": 1078 }, { "epoch": 1.4700272479564034, "grad_norm": 4.296740850280554, "learning_rate": 1.9291709110409764e-05, "loss": 0.0936, "step": 1079 }, { "epoch": 1.4713896457765667, "grad_norm": 1.2895404733135514, "learning_rate": 1.929007694994842e-05, "loss": 0.0679, "step": 1080 }, { "epoch": 1.4727520435967303, "grad_norm": 1.8584058647473625, "learning_rate": 1.9288442980310056e-05, "loss": 0.0503, "step": 1081 }, { "epoch": 1.4741144414168939, "grad_norm": 2.8072100010581953, "learning_rate": 1.9286807201812866e-05, "loss": 0.0797, "step": 1082 }, { "epoch": 1.4754768392370572, "grad_norm": 5.25116273824778, "learning_rate": 1.9285169614775417e-05, "loss": 0.0442, "step": 1083 }, { "epoch": 1.4768392370572208, "grad_norm": 5.471950993481448, "learning_rate": 1.928353021951661e-05, "loss": 0.0721, "step": 1084 }, { "epoch": 1.4782016348773843, "grad_norm": 2.42628026619108, "learning_rate": 1.928188901635571e-05, "loss": 0.0656, "step": 1085 }, { "epoch": 1.4795640326975477, "grad_norm": 3.476813597000433, "learning_rate": 1.928024600561233e-05, "loss": 0.1308, "step": 1086 }, { "epoch": 1.4809264305177112, "grad_norm": 2.2437538494680647, "learning_rate": 1.927860118760643e-05, "loss": 0.1087, "step": 1087 }, { "epoch": 1.4822888283378748, "grad_norm": 5.506667227166872, "learning_rate": 1.9276954562658328e-05, "loss": 0.0712, "step": 1088 }, { "epoch": 1.4836512261580381, "grad_norm": 2.169498861128896, "learning_rate": 1.9275306131088697e-05, "loss": 0.0958, "step": 1089 }, { "epoch": 1.4850136239782017, "grad_norm": 5.247864543637921, "learning_rate": 1.9273655893218555e-05, "loss": 0.0433, "step": 1090 }, { "epoch": 1.4863760217983653, "grad_norm": 2.9795080527210764, "learning_rate": 1.9272003849369274e-05, "loss": 0.056, "step": 1091 }, { "epoch": 1.4877384196185286, "grad_norm": 3.718911866511636, "learning_rate": 1.9270349999862575e-05, "loss": 0.0761, "step": 1092 }, { "epoch": 1.4891008174386922, "grad_norm": 7.218607861832872, "learning_rate": 1.926869434502054e-05, "loss": 0.0417, "step": 1093 }, { "epoch": 1.4904632152588557, "grad_norm": 2.4356486254405683, "learning_rate": 1.926703688516559e-05, "loss": 0.0793, "step": 1094 }, { "epoch": 1.491825613079019, "grad_norm": 5.0182144175531205, "learning_rate": 1.9265377620620506e-05, "loss": 0.1136, "step": 1095 }, { "epoch": 1.4931880108991826, "grad_norm": 11.664561686072451, "learning_rate": 1.926371655170842e-05, "loss": 0.0911, "step": 1096 }, { "epoch": 1.494550408719346, "grad_norm": 3.718991492861475, "learning_rate": 1.926205367875281e-05, "loss": 0.0679, "step": 1097 }, { "epoch": 1.4959128065395095, "grad_norm": 6.440636840374398, "learning_rate": 1.926038900207751e-05, "loss": 0.0854, "step": 1098 }, { "epoch": 1.497275204359673, "grad_norm": 8.779864423293883, "learning_rate": 1.9258722522006706e-05, "loss": 0.0878, "step": 1099 }, { "epoch": 1.4986376021798364, "grad_norm": 5.49141957653807, "learning_rate": 1.925705423886493e-05, "loss": 0.069, "step": 1100 }, { "epoch": 1.5, "grad_norm": 8.582617887969814, "learning_rate": 1.9255384152977076e-05, "loss": 0.0959, "step": 1101 }, { "epoch": 1.5013623978201633, "grad_norm": 6.408710729159438, "learning_rate": 1.9253712264668368e-05, "loss": 0.0775, "step": 1102 }, { "epoch": 1.5027247956403271, "grad_norm": 3.7764982346699956, "learning_rate": 1.9252038574264403e-05, "loss": 0.0804, "step": 1103 }, { "epoch": 1.5040871934604905, "grad_norm": 4.749077929146231, "learning_rate": 1.9250363082091123e-05, "loss": 0.0594, "step": 1104 }, { "epoch": 1.5054495912806538, "grad_norm": 1.7357650675196996, "learning_rate": 1.924868578847481e-05, "loss": 0.053, "step": 1105 }, { "epoch": 1.5068119891008176, "grad_norm": 1.9583240360169947, "learning_rate": 1.9247006693742115e-05, "loss": 0.1107, "step": 1106 }, { "epoch": 1.508174386920981, "grad_norm": 4.1144092383330255, "learning_rate": 1.9245325798220023e-05, "loss": 0.0586, "step": 1107 }, { "epoch": 1.5095367847411443, "grad_norm": 4.793732971216867, "learning_rate": 1.9243643102235878e-05, "loss": 0.0769, "step": 1108 }, { "epoch": 1.510899182561308, "grad_norm": 1.2163306277912274, "learning_rate": 1.924195860611737e-05, "loss": 0.0532, "step": 1109 }, { "epoch": 1.5122615803814714, "grad_norm": 7.5907816491339615, "learning_rate": 1.9240272310192552e-05, "loss": 0.1092, "step": 1110 }, { "epoch": 1.5136239782016347, "grad_norm": 2.033379836190613, "learning_rate": 1.923858421478981e-05, "loss": 0.057, "step": 1111 }, { "epoch": 1.5149863760217985, "grad_norm": 2.5525502091585563, "learning_rate": 1.9236894320237894e-05, "loss": 0.0872, "step": 1112 }, { "epoch": 1.5163487738419619, "grad_norm": 4.102867902422709, "learning_rate": 1.923520262686589e-05, "loss": 0.0757, "step": 1113 }, { "epoch": 1.5177111716621252, "grad_norm": 4.7175044850074315, "learning_rate": 1.9233509135003255e-05, "loss": 0.0682, "step": 1114 }, { "epoch": 1.5190735694822888, "grad_norm": 1.9664081987579383, "learning_rate": 1.9231813844979777e-05, "loss": 0.0413, "step": 1115 }, { "epoch": 1.5204359673024523, "grad_norm": 12.159423943792516, "learning_rate": 1.9230116757125602e-05, "loss": 0.0812, "step": 1116 }, { "epoch": 1.5217983651226157, "grad_norm": 2.6060490197462154, "learning_rate": 1.922841787177123e-05, "loss": 0.0952, "step": 1117 }, { "epoch": 1.5231607629427792, "grad_norm": 7.34940234241608, "learning_rate": 1.9226717189247503e-05, "loss": 0.0704, "step": 1118 }, { "epoch": 1.5245231607629428, "grad_norm": 6.141723591723154, "learning_rate": 1.9225014709885623e-05, "loss": 0.0754, "step": 1119 }, { "epoch": 1.5258855585831061, "grad_norm": 1.333412973917593, "learning_rate": 1.9223310434017128e-05, "loss": 0.0642, "step": 1120 }, { "epoch": 1.5272479564032697, "grad_norm": 4.173306356100395, "learning_rate": 1.922160436197392e-05, "loss": 0.0856, "step": 1121 }, { "epoch": 1.5286103542234333, "grad_norm": 3.721757553014528, "learning_rate": 1.9219896494088238e-05, "loss": 0.0467, "step": 1122 }, { "epoch": 1.5299727520435966, "grad_norm": 1.6006912402465536, "learning_rate": 1.9218186830692686e-05, "loss": 0.1014, "step": 1123 }, { "epoch": 1.5313351498637602, "grad_norm": 4.584993955026748, "learning_rate": 1.9216475372120198e-05, "loss": 0.078, "step": 1124 }, { "epoch": 1.5326975476839237, "grad_norm": 1.003266701987148, "learning_rate": 1.921476211870408e-05, "loss": 0.0714, "step": 1125 }, { "epoch": 1.534059945504087, "grad_norm": 2.893926976611405, "learning_rate": 1.9213047070777967e-05, "loss": 0.0745, "step": 1126 }, { "epoch": 1.5354223433242506, "grad_norm": 1.1642159894694806, "learning_rate": 1.9211330228675855e-05, "loss": 0.0547, "step": 1127 }, { "epoch": 1.5367847411444142, "grad_norm": 1.6315066486334384, "learning_rate": 1.920961159273209e-05, "loss": 0.098, "step": 1128 }, { "epoch": 1.5381471389645776, "grad_norm": 1.4680007710501515, "learning_rate": 1.9207891163281364e-05, "loss": 0.0704, "step": 1129 }, { "epoch": 1.5395095367847411, "grad_norm": 0.9386908325268055, "learning_rate": 1.9206168940658713e-05, "loss": 0.059, "step": 1130 }, { "epoch": 1.5408719346049047, "grad_norm": 4.483261398992498, "learning_rate": 1.9204444925199535e-05, "loss": 0.0697, "step": 1131 }, { "epoch": 1.542234332425068, "grad_norm": 2.9059501525267435, "learning_rate": 1.9202719117239562e-05, "loss": 0.0859, "step": 1132 }, { "epoch": 1.5435967302452316, "grad_norm": 3.4805430926188694, "learning_rate": 1.920099151711489e-05, "loss": 0.0679, "step": 1133 }, { "epoch": 1.5449591280653951, "grad_norm": 3.7361684411530627, "learning_rate": 1.9199262125161956e-05, "loss": 0.0487, "step": 1134 }, { "epoch": 1.5463215258855585, "grad_norm": 2.6330674221447823, "learning_rate": 1.9197530941717542e-05, "loss": 0.0699, "step": 1135 }, { "epoch": 1.547683923705722, "grad_norm": 4.211876780690089, "learning_rate": 1.9195797967118785e-05, "loss": 0.0831, "step": 1136 }, { "epoch": 1.5490463215258856, "grad_norm": 4.00304413320829, "learning_rate": 1.9194063201703175e-05, "loss": 0.0785, "step": 1137 }, { "epoch": 1.550408719346049, "grad_norm": 1.641735555960252, "learning_rate": 1.919232664580854e-05, "loss": 0.0498, "step": 1138 }, { "epoch": 1.5517711171662125, "grad_norm": 4.594305273372598, "learning_rate": 1.9190588299773064e-05, "loss": 0.0461, "step": 1139 }, { "epoch": 1.553133514986376, "grad_norm": 5.753592464625482, "learning_rate": 1.9188848163935278e-05, "loss": 0.0749, "step": 1140 }, { "epoch": 1.5544959128065394, "grad_norm": 7.033294014757925, "learning_rate": 1.9187106238634058e-05, "loss": 0.1401, "step": 1141 }, { "epoch": 1.555858310626703, "grad_norm": 1.4093212153557455, "learning_rate": 1.918536252420864e-05, "loss": 0.076, "step": 1142 }, { "epoch": 1.5572207084468666, "grad_norm": 5.555948941726576, "learning_rate": 1.9183617020998587e-05, "loss": 0.0905, "step": 1143 }, { "epoch": 1.55858310626703, "grad_norm": 2.458330805624209, "learning_rate": 1.9181869729343837e-05, "loss": 0.0804, "step": 1144 }, { "epoch": 1.5599455040871935, "grad_norm": 10.294825314345825, "learning_rate": 1.9180120649584655e-05, "loss": 0.0717, "step": 1145 }, { "epoch": 1.561307901907357, "grad_norm": 1.5289871007930118, "learning_rate": 1.9178369782061662e-05, "loss": 0.0541, "step": 1146 }, { "epoch": 1.5626702997275204, "grad_norm": 3.7363143141312736, "learning_rate": 1.9176617127115832e-05, "loss": 0.0648, "step": 1147 }, { "epoch": 1.564032697547684, "grad_norm": 6.817941184154604, "learning_rate": 1.9174862685088472e-05, "loss": 0.0814, "step": 1148 }, { "epoch": 1.5653950953678475, "grad_norm": 2.061749534754831, "learning_rate": 1.917310645632126e-05, "loss": 0.0665, "step": 1149 }, { "epoch": 1.5667574931880108, "grad_norm": 5.8198372921695665, "learning_rate": 1.91713484411562e-05, "loss": 0.0483, "step": 1150 }, { "epoch": 1.5681198910081744, "grad_norm": 7.176100290133193, "learning_rate": 1.916958863993566e-05, "loss": 0.0861, "step": 1151 }, { "epoch": 1.569482288828338, "grad_norm": 5.5897867736459546, "learning_rate": 1.916782705300234e-05, "loss": 0.0859, "step": 1152 }, { "epoch": 1.5708446866485013, "grad_norm": 6.75051961712491, "learning_rate": 1.9166063680699303e-05, "loss": 0.0724, "step": 1153 }, { "epoch": 1.5722070844686649, "grad_norm": 2.2866510202043187, "learning_rate": 1.916429852336996e-05, "loss": 0.091, "step": 1154 }, { "epoch": 1.5735694822888284, "grad_norm": 2.2231654298327026, "learning_rate": 1.9162531581358044e-05, "loss": 0.0659, "step": 1155 }, { "epoch": 1.5749318801089918, "grad_norm": 2.5890778138621573, "learning_rate": 1.916076285500767e-05, "loss": 0.0614, "step": 1156 }, { "epoch": 1.5762942779291553, "grad_norm": 1.0213470138175962, "learning_rate": 1.9158992344663282e-05, "loss": 0.0353, "step": 1157 }, { "epoch": 1.577656675749319, "grad_norm": 1.1556921218923781, "learning_rate": 1.9157220050669672e-05, "loss": 0.0554, "step": 1158 }, { "epoch": 1.5790190735694822, "grad_norm": 4.400219915029384, "learning_rate": 1.9155445973371983e-05, "loss": 0.0948, "step": 1159 }, { "epoch": 1.5803814713896458, "grad_norm": 1.5446842390950373, "learning_rate": 1.9153670113115704e-05, "loss": 0.0725, "step": 1160 }, { "epoch": 1.5817438692098094, "grad_norm": 2.014602842033978, "learning_rate": 1.915189247024667e-05, "loss": 0.0738, "step": 1161 }, { "epoch": 1.5831062670299727, "grad_norm": 3.1430354785964227, "learning_rate": 1.9150113045111068e-05, "loss": 0.0695, "step": 1162 }, { "epoch": 1.5844686648501363, "grad_norm": 2.337493083187442, "learning_rate": 1.9148331838055424e-05, "loss": 0.0934, "step": 1163 }, { "epoch": 1.5858310626702998, "grad_norm": 3.1373690165429706, "learning_rate": 1.9146548849426617e-05, "loss": 0.0537, "step": 1164 }, { "epoch": 1.5871934604904632, "grad_norm": 1.4328280211985103, "learning_rate": 1.9144764079571872e-05, "loss": 0.0799, "step": 1165 }, { "epoch": 1.5885558583106267, "grad_norm": 1.6670862032434128, "learning_rate": 1.9142977528838763e-05, "loss": 0.0626, "step": 1166 }, { "epoch": 1.5899182561307903, "grad_norm": 2.5317418062619845, "learning_rate": 1.9141189197575204e-05, "loss": 0.0492, "step": 1167 }, { "epoch": 1.5912806539509536, "grad_norm": 2.0033930621120635, "learning_rate": 1.913939908612946e-05, "loss": 0.0491, "step": 1168 }, { "epoch": 1.5926430517711172, "grad_norm": 4.1764350051477575, "learning_rate": 1.9137607194850145e-05, "loss": 0.0908, "step": 1169 }, { "epoch": 1.5940054495912808, "grad_norm": 3.6826296523094695, "learning_rate": 1.9135813524086216e-05, "loss": 0.1077, "step": 1170 }, { "epoch": 1.595367847411444, "grad_norm": 2.344643710329624, "learning_rate": 1.913401807418698e-05, "loss": 0.1003, "step": 1171 }, { "epoch": 1.5967302452316077, "grad_norm": 2.1311599942983674, "learning_rate": 1.9132220845502085e-05, "loss": 0.0801, "step": 1172 }, { "epoch": 1.5980926430517712, "grad_norm": 1.5155640110504676, "learning_rate": 1.913042183838153e-05, "loss": 0.0952, "step": 1173 }, { "epoch": 1.5994550408719346, "grad_norm": 1.8200699398788815, "learning_rate": 1.9128621053175657e-05, "loss": 0.0946, "step": 1174 }, { "epoch": 1.6008174386920981, "grad_norm": 3.39090523249687, "learning_rate": 1.912681849023516e-05, "loss": 0.0462, "step": 1175 }, { "epoch": 1.6021798365122617, "grad_norm": 4.059662313777833, "learning_rate": 1.912501414991107e-05, "loss": 0.0874, "step": 1176 }, { "epoch": 1.603542234332425, "grad_norm": 6.563688030411592, "learning_rate": 1.9123208032554778e-05, "loss": 0.076, "step": 1177 }, { "epoch": 1.6049046321525886, "grad_norm": 1.8106251999195755, "learning_rate": 1.9121400138518008e-05, "loss": 0.0976, "step": 1178 }, { "epoch": 1.6062670299727522, "grad_norm": 4.348360296536959, "learning_rate": 1.9119590468152828e-05, "loss": 0.0656, "step": 1179 }, { "epoch": 1.6076294277929155, "grad_norm": 4.371533263019011, "learning_rate": 1.9117779021811672e-05, "loss": 0.115, "step": 1180 }, { "epoch": 1.608991825613079, "grad_norm": 6.03580592207305, "learning_rate": 1.9115965799847295e-05, "loss": 0.1039, "step": 1181 }, { "epoch": 1.6103542234332426, "grad_norm": 1.8637866812680248, "learning_rate": 1.9114150802612812e-05, "loss": 0.0856, "step": 1182 }, { "epoch": 1.611716621253406, "grad_norm": 3.1211786306942892, "learning_rate": 1.9112334030461686e-05, "loss": 0.0571, "step": 1183 }, { "epoch": 1.6130790190735693, "grad_norm": 2.734987223876583, "learning_rate": 1.9110515483747716e-05, "loss": 0.0668, "step": 1184 }, { "epoch": 1.614441416893733, "grad_norm": 2.0838535967873515, "learning_rate": 1.9108695162825047e-05, "loss": 0.0544, "step": 1185 }, { "epoch": 1.6158038147138964, "grad_norm": 6.420260468212693, "learning_rate": 1.9106873068048185e-05, "loss": 0.1105, "step": 1186 }, { "epoch": 1.6171662125340598, "grad_norm": 4.370130291625123, "learning_rate": 1.9105049199771963e-05, "loss": 0.0373, "step": 1187 }, { "epoch": 1.6185286103542236, "grad_norm": 3.505886652995688, "learning_rate": 1.9103223558351566e-05, "loss": 0.0872, "step": 1188 }, { "epoch": 1.619891008174387, "grad_norm": 1.8909945584051173, "learning_rate": 1.9101396144142526e-05, "loss": 0.0698, "step": 1189 }, { "epoch": 1.6212534059945503, "grad_norm": 4.531436696762758, "learning_rate": 1.909956695750072e-05, "loss": 0.063, "step": 1190 }, { "epoch": 1.622615803814714, "grad_norm": 4.642002848123684, "learning_rate": 1.909773599878237e-05, "loss": 0.066, "step": 1191 }, { "epoch": 1.6239782016348774, "grad_norm": 2.125626880437426, "learning_rate": 1.909590326834404e-05, "loss": 0.0575, "step": 1192 }, { "epoch": 1.6253405994550407, "grad_norm": 4.514854668878075, "learning_rate": 1.9094068766542642e-05, "loss": 0.1032, "step": 1193 }, { "epoch": 1.6267029972752045, "grad_norm": 3.126711585884134, "learning_rate": 1.909223249373543e-05, "loss": 0.0773, "step": 1194 }, { "epoch": 1.6280653950953679, "grad_norm": 6.945932516159866, "learning_rate": 1.9090394450280007e-05, "loss": 0.0656, "step": 1195 }, { "epoch": 1.6294277929155312, "grad_norm": 3.0333726007844732, "learning_rate": 1.9088554636534323e-05, "loss": 0.0914, "step": 1196 }, { "epoch": 1.630790190735695, "grad_norm": 5.985820186177696, "learning_rate": 1.9086713052856665e-05, "loss": 0.0705, "step": 1197 }, { "epoch": 1.6321525885558583, "grad_norm": 10.52389797199701, "learning_rate": 1.9084869699605666e-05, "loss": 0.0915, "step": 1198 }, { "epoch": 1.6335149863760217, "grad_norm": 8.729881728355439, "learning_rate": 1.908302457714031e-05, "loss": 0.051, "step": 1199 }, { "epoch": 1.6348773841961854, "grad_norm": 5.939487875710481, "learning_rate": 1.9081177685819924e-05, "loss": 0.0869, "step": 1200 }, { "epoch": 1.6362397820163488, "grad_norm": 2.4667303360653463, "learning_rate": 1.907932902600417e-05, "loss": 0.0291, "step": 1201 }, { "epoch": 1.6376021798365121, "grad_norm": 4.5656591490719025, "learning_rate": 1.9077478598053062e-05, "loss": 0.064, "step": 1202 }, { "epoch": 1.638964577656676, "grad_norm": 8.884325151227685, "learning_rate": 1.9075626402326967e-05, "loss": 0.0592, "step": 1203 }, { "epoch": 1.6403269754768393, "grad_norm": 1.499415514007833, "learning_rate": 1.9073772439186577e-05, "loss": 0.052, "step": 1204 }, { "epoch": 1.6416893732970026, "grad_norm": 6.308317138354004, "learning_rate": 1.9071916708992943e-05, "loss": 0.0745, "step": 1205 }, { "epoch": 1.6430517711171662, "grad_norm": 7.278752737635547, "learning_rate": 1.9070059212107455e-05, "loss": 0.0679, "step": 1206 }, { "epoch": 1.6444141689373297, "grad_norm": 2.2928300672610273, "learning_rate": 1.9068199948891846e-05, "loss": 0.0435, "step": 1207 }, { "epoch": 1.645776566757493, "grad_norm": 9.18692262911308, "learning_rate": 1.90663389197082e-05, "loss": 0.0772, "step": 1208 }, { "epoch": 1.6471389645776566, "grad_norm": 2.1325145772931537, "learning_rate": 1.906447612491893e-05, "loss": 0.0638, "step": 1209 }, { "epoch": 1.6485013623978202, "grad_norm": 7.078965549314047, "learning_rate": 1.9062611564886807e-05, "loss": 0.0841, "step": 1210 }, { "epoch": 1.6498637602179835, "grad_norm": 1.1462781849616666, "learning_rate": 1.906074523997494e-05, "loss": 0.0897, "step": 1211 }, { "epoch": 1.651226158038147, "grad_norm": 3.004811035913179, "learning_rate": 1.9058877150546783e-05, "loss": 0.0625, "step": 1212 }, { "epoch": 1.6525885558583107, "grad_norm": 8.839907975155453, "learning_rate": 1.9057007296966136e-05, "loss": 0.1016, "step": 1213 }, { "epoch": 1.653950953678474, "grad_norm": 2.391541088282247, "learning_rate": 1.9055135679597138e-05, "loss": 0.0783, "step": 1214 }, { "epoch": 1.6553133514986376, "grad_norm": 1.1852057626770958, "learning_rate": 1.905326229880427e-05, "loss": 0.0979, "step": 1215 }, { "epoch": 1.6566757493188011, "grad_norm": 7.213118013796951, "learning_rate": 1.905138715495236e-05, "loss": 0.0676, "step": 1216 }, { "epoch": 1.6580381471389645, "grad_norm": 1.5530951229731718, "learning_rate": 1.9049510248406586e-05, "loss": 0.0797, "step": 1217 }, { "epoch": 1.659400544959128, "grad_norm": 2.5859321076281816, "learning_rate": 1.9047631579532457e-05, "loss": 0.0638, "step": 1218 }, { "epoch": 1.6607629427792916, "grad_norm": 5.195119300103237, "learning_rate": 1.9045751148695833e-05, "loss": 0.0478, "step": 1219 }, { "epoch": 1.662125340599455, "grad_norm": 1.510168905555469, "learning_rate": 1.9043868956262913e-05, "loss": 0.0343, "step": 1220 }, { "epoch": 1.6634877384196185, "grad_norm": 6.406927264541047, "learning_rate": 1.9041985002600236e-05, "loss": 0.0557, "step": 1221 }, { "epoch": 1.664850136239782, "grad_norm": 3.464503357305483, "learning_rate": 1.9040099288074697e-05, "loss": 0.0968, "step": 1222 }, { "epoch": 1.6662125340599454, "grad_norm": 3.1782035044454804, "learning_rate": 1.903821181305352e-05, "loss": 0.0797, "step": 1223 }, { "epoch": 1.667574931880109, "grad_norm": 7.303436394423944, "learning_rate": 1.9036322577904284e-05, "loss": 0.0812, "step": 1224 }, { "epoch": 1.6689373297002725, "grad_norm": 1.1352616338407333, "learning_rate": 1.9034431582994895e-05, "loss": 0.0561, "step": 1225 }, { "epoch": 1.6702997275204359, "grad_norm": 4.959054660419871, "learning_rate": 1.9032538828693615e-05, "loss": 0.0632, "step": 1226 }, { "epoch": 1.6716621253405994, "grad_norm": 4.244523151413489, "learning_rate": 1.9030644315369046e-05, "loss": 0.0643, "step": 1227 }, { "epoch": 1.673024523160763, "grad_norm": 2.695550626121489, "learning_rate": 1.9028748043390134e-05, "loss": 0.074, "step": 1228 }, { "epoch": 1.6743869209809263, "grad_norm": 4.850825873010897, "learning_rate": 1.902685001312616e-05, "loss": 0.0692, "step": 1229 }, { "epoch": 1.67574931880109, "grad_norm": 3.8015035140509967, "learning_rate": 1.902495022494675e-05, "loss": 0.0748, "step": 1230 }, { "epoch": 1.6771117166212535, "grad_norm": 3.191736943814497, "learning_rate": 1.902304867922188e-05, "loss": 0.102, "step": 1231 }, { "epoch": 1.6784741144414168, "grad_norm": 6.08922912791499, "learning_rate": 1.9021145376321857e-05, "loss": 0.0562, "step": 1232 }, { "epoch": 1.6798365122615804, "grad_norm": 1.1564762060129405, "learning_rate": 1.901924031661734e-05, "loss": 0.0462, "step": 1233 }, { "epoch": 1.681198910081744, "grad_norm": 5.2260116602165585, "learning_rate": 1.9017333500479326e-05, "loss": 0.073, "step": 1234 }, { "epoch": 1.6825613079019073, "grad_norm": 2.6824141672043957, "learning_rate": 1.901542492827915e-05, "loss": 0.0602, "step": 1235 }, { "epoch": 1.6839237057220708, "grad_norm": 0.9331904397160945, "learning_rate": 1.9013514600388498e-05, "loss": 0.039, "step": 1236 }, { "epoch": 1.6852861035422344, "grad_norm": 3.9682889711330294, "learning_rate": 1.901160251717939e-05, "loss": 0.0775, "step": 1237 }, { "epoch": 1.6866485013623977, "grad_norm": 3.0347357377228663, "learning_rate": 1.900968867902419e-05, "loss": 0.0818, "step": 1238 }, { "epoch": 1.6880108991825613, "grad_norm": 4.0494425250290815, "learning_rate": 1.900777308629561e-05, "loss": 0.0766, "step": 1239 }, { "epoch": 1.6893732970027249, "grad_norm": 2.383890214030557, "learning_rate": 1.9005855739366688e-05, "loss": 0.0706, "step": 1240 }, { "epoch": 1.6907356948228882, "grad_norm": 2.7838197781849807, "learning_rate": 1.9003936638610826e-05, "loss": 0.0564, "step": 1241 }, { "epoch": 1.6920980926430518, "grad_norm": 1.5061302358937252, "learning_rate": 1.9002015784401745e-05, "loss": 0.0752, "step": 1242 }, { "epoch": 1.6934604904632153, "grad_norm": 4.027892027485079, "learning_rate": 1.9000093177113524e-05, "loss": 0.0667, "step": 1243 }, { "epoch": 1.6948228882833787, "grad_norm": 1.7672567214680333, "learning_rate": 1.8998168817120578e-05, "loss": 0.069, "step": 1244 }, { "epoch": 1.6961852861035422, "grad_norm": 2.208881759008045, "learning_rate": 1.8996242704797655e-05, "loss": 0.0677, "step": 1245 }, { "epoch": 1.6975476839237058, "grad_norm": 2.654724327585999, "learning_rate": 1.8994314840519857e-05, "loss": 0.0591, "step": 1246 }, { "epoch": 1.6989100817438691, "grad_norm": 1.6856974735953616, "learning_rate": 1.8992385224662625e-05, "loss": 0.077, "step": 1247 }, { "epoch": 1.7002724795640327, "grad_norm": 2.9660152839968474, "learning_rate": 1.899045385760173e-05, "loss": 0.0805, "step": 1248 }, { "epoch": 1.7016348773841963, "grad_norm": 2.5098009653341555, "learning_rate": 1.8988520739713298e-05, "loss": 0.1029, "step": 1249 }, { "epoch": 1.7029972752043596, "grad_norm": 3.7428948891556857, "learning_rate": 1.8986585871373792e-05, "loss": 0.0888, "step": 1250 }, { "epoch": 1.7043596730245232, "grad_norm": 2.249722190727873, "learning_rate": 1.8984649252960005e-05, "loss": 0.0461, "step": 1251 }, { "epoch": 1.7057220708446867, "grad_norm": 3.2404348284328273, "learning_rate": 1.898271088484909e-05, "loss": 0.0955, "step": 1252 }, { "epoch": 1.70708446866485, "grad_norm": 4.87239374021039, "learning_rate": 1.898077076741853e-05, "loss": 0.0705, "step": 1253 }, { "epoch": 1.7084468664850136, "grad_norm": 3.963278511619165, "learning_rate": 1.897882890104614e-05, "loss": 0.0578, "step": 1254 }, { "epoch": 1.7098092643051772, "grad_norm": 5.845226015193683, "learning_rate": 1.8976885286110088e-05, "loss": 0.0804, "step": 1255 }, { "epoch": 1.7111716621253406, "grad_norm": 2.7908135281486475, "learning_rate": 1.8974939922988886e-05, "loss": 0.0987, "step": 1256 }, { "epoch": 1.7125340599455041, "grad_norm": 4.468780045100261, "learning_rate": 1.8972992812061374e-05, "loss": 0.048, "step": 1257 }, { "epoch": 1.7138964577656677, "grad_norm": 4.08070357578494, "learning_rate": 1.8971043953706738e-05, "loss": 0.0591, "step": 1258 }, { "epoch": 1.715258855585831, "grad_norm": 2.8988956254988234, "learning_rate": 1.8969093348304507e-05, "loss": 0.0661, "step": 1259 }, { "epoch": 1.7166212534059946, "grad_norm": 5.495316547825978, "learning_rate": 1.8967140996234544e-05, "loss": 0.0522, "step": 1260 }, { "epoch": 1.7179836512261581, "grad_norm": 5.403679460835175, "learning_rate": 1.8965186897877063e-05, "loss": 0.0547, "step": 1261 }, { "epoch": 1.7193460490463215, "grad_norm": 2.592627783480177, "learning_rate": 1.89632310536126e-05, "loss": 0.0806, "step": 1262 }, { "epoch": 1.720708446866485, "grad_norm": 5.3752960297312, "learning_rate": 1.8961273463822052e-05, "loss": 0.0751, "step": 1263 }, { "epoch": 1.7220708446866486, "grad_norm": 2.83987166914508, "learning_rate": 1.8959314128886642e-05, "loss": 0.038, "step": 1264 }, { "epoch": 1.723433242506812, "grad_norm": 1.0563313829490937, "learning_rate": 1.8957353049187936e-05, "loss": 0.0677, "step": 1265 }, { "epoch": 1.7247956403269755, "grad_norm": 2.9977497323352003, "learning_rate": 1.895539022510784e-05, "loss": 0.0794, "step": 1266 }, { "epoch": 1.726158038147139, "grad_norm": 3.855587597659426, "learning_rate": 1.8953425657028602e-05, "loss": 0.0732, "step": 1267 }, { "epoch": 1.7275204359673024, "grad_norm": 1.4653441319974778, "learning_rate": 1.895145934533281e-05, "loss": 0.0626, "step": 1268 }, { "epoch": 1.728882833787466, "grad_norm": 9.000219742711064, "learning_rate": 1.894949129040338e-05, "loss": 0.1111, "step": 1269 }, { "epoch": 1.7302452316076296, "grad_norm": 1.2772509475669296, "learning_rate": 1.8947521492623586e-05, "loss": 0.063, "step": 1270 }, { "epoch": 1.731607629427793, "grad_norm": 8.500998356618018, "learning_rate": 1.894554995237703e-05, "loss": 0.0667, "step": 1271 }, { "epoch": 1.7329700272479565, "grad_norm": 5.296515743601424, "learning_rate": 1.8943576670047653e-05, "loss": 0.0807, "step": 1272 }, { "epoch": 1.73433242506812, "grad_norm": 3.294693004880645, "learning_rate": 1.894160164601974e-05, "loss": 0.0613, "step": 1273 }, { "epoch": 1.7356948228882834, "grad_norm": 10.37821131604562, "learning_rate": 1.893962488067792e-05, "loss": 0.0623, "step": 1274 }, { "epoch": 1.7370572207084467, "grad_norm": 4.6770918553512875, "learning_rate": 1.893764637440714e-05, "loss": 0.0913, "step": 1275 }, { "epoch": 1.7384196185286105, "grad_norm": 8.405157252134615, "learning_rate": 1.8935666127592706e-05, "loss": 0.0774, "step": 1276 }, { "epoch": 1.7397820163487738, "grad_norm": 6.687584006356847, "learning_rate": 1.893368414062026e-05, "loss": 0.0534, "step": 1277 }, { "epoch": 1.7411444141689372, "grad_norm": 3.3289842775287357, "learning_rate": 1.893170041387578e-05, "loss": 0.0987, "step": 1278 }, { "epoch": 1.742506811989101, "grad_norm": 11.252074693532194, "learning_rate": 1.8929714947745576e-05, "loss": 0.0842, "step": 1279 }, { "epoch": 1.7438692098092643, "grad_norm": 7.356790382203387, "learning_rate": 1.892772774261631e-05, "loss": 0.0695, "step": 1280 }, { "epoch": 1.7452316076294276, "grad_norm": 3.320928489649177, "learning_rate": 1.892573879887498e-05, "loss": 0.0527, "step": 1281 }, { "epoch": 1.7465940054495914, "grad_norm": 12.13353006182578, "learning_rate": 1.8923748116908908e-05, "loss": 0.082, "step": 1282 }, { "epoch": 1.7479564032697548, "grad_norm": 4.429141558217945, "learning_rate": 1.892175569710577e-05, "loss": 0.0758, "step": 1283 }, { "epoch": 1.749318801089918, "grad_norm": 6.740071913828972, "learning_rate": 1.891976153985358e-05, "loss": 0.0484, "step": 1284 }, { "epoch": 1.750681198910082, "grad_norm": 11.417336063319658, "learning_rate": 1.8917765645540683e-05, "loss": 0.094, "step": 1285 }, { "epoch": 1.7520435967302452, "grad_norm": 1.6912991683769267, "learning_rate": 1.891576801455576e-05, "loss": 0.0413, "step": 1286 }, { "epoch": 1.7534059945504086, "grad_norm": 3.8818673736151776, "learning_rate": 1.8913768647287844e-05, "loss": 0.1018, "step": 1287 }, { "epoch": 1.7547683923705724, "grad_norm": 12.073627461140125, "learning_rate": 1.8911767544126295e-05, "loss": 0.0634, "step": 1288 }, { "epoch": 1.7561307901907357, "grad_norm": 3.309780515336322, "learning_rate": 1.890976470546081e-05, "loss": 0.074, "step": 1289 }, { "epoch": 1.757493188010899, "grad_norm": 8.399970861106047, "learning_rate": 1.8907760131681435e-05, "loss": 0.0804, "step": 1290 }, { "epoch": 1.7588555858310628, "grad_norm": 4.9223844997497626, "learning_rate": 1.8905753823178537e-05, "loss": 0.0435, "step": 1291 }, { "epoch": 1.7602179836512262, "grad_norm": 4.545786815077727, "learning_rate": 1.8903745780342838e-05, "loss": 0.0706, "step": 1292 }, { "epoch": 1.7615803814713895, "grad_norm": 7.926017999302076, "learning_rate": 1.8901736003565386e-05, "loss": 0.0692, "step": 1293 }, { "epoch": 1.7629427792915533, "grad_norm": 8.270856753032966, "learning_rate": 1.8899724493237575e-05, "loss": 0.0712, "step": 1294 }, { "epoch": 1.7643051771117166, "grad_norm": 1.686728716370382, "learning_rate": 1.889771124975113e-05, "loss": 0.0402, "step": 1295 }, { "epoch": 1.76566757493188, "grad_norm": 9.404142774821247, "learning_rate": 1.8895696273498118e-05, "loss": 0.0906, "step": 1296 }, { "epoch": 1.7670299727520435, "grad_norm": 10.222614443161254, "learning_rate": 1.8893679564870935e-05, "loss": 0.0522, "step": 1297 }, { "epoch": 1.768392370572207, "grad_norm": 4.768790291413092, "learning_rate": 1.889166112426233e-05, "loss": 0.0538, "step": 1298 }, { "epoch": 1.7697547683923704, "grad_norm": 10.108637297367705, "learning_rate": 1.8889640952065374e-05, "loss": 0.0869, "step": 1299 }, { "epoch": 1.771117166212534, "grad_norm": 8.923495348483279, "learning_rate": 1.888761904867348e-05, "loss": 0.0523, "step": 1300 }, { "epoch": 1.7724795640326976, "grad_norm": 8.75630294670783, "learning_rate": 1.8885595414480408e-05, "loss": 0.077, "step": 1301 }, { "epoch": 1.773841961852861, "grad_norm": 10.889128531796095, "learning_rate": 1.8883570049880237e-05, "loss": 0.1207, "step": 1302 }, { "epoch": 1.7752043596730245, "grad_norm": 2.99683647624533, "learning_rate": 1.8881542955267394e-05, "loss": 0.0749, "step": 1303 }, { "epoch": 1.776566757493188, "grad_norm": 11.645431960277412, "learning_rate": 1.8879514131036643e-05, "loss": 0.0675, "step": 1304 }, { "epoch": 1.7779291553133514, "grad_norm": 12.141431010316671, "learning_rate": 1.8877483577583087e-05, "loss": 0.0692, "step": 1305 }, { "epoch": 1.779291553133515, "grad_norm": 5.42509126756315, "learning_rate": 1.887545129530216e-05, "loss": 0.0853, "step": 1306 }, { "epoch": 1.7806539509536785, "grad_norm": 14.214416143419374, "learning_rate": 1.887341728458963e-05, "loss": 0.0677, "step": 1307 }, { "epoch": 1.7820163487738419, "grad_norm": 6.417505205656769, "learning_rate": 1.8871381545841613e-05, "loss": 0.0955, "step": 1308 }, { "epoch": 1.7833787465940054, "grad_norm": 9.746226145062176, "learning_rate": 1.886934407945455e-05, "loss": 0.0865, "step": 1309 }, { "epoch": 1.784741144414169, "grad_norm": 12.118725789141427, "learning_rate": 1.8867304885825222e-05, "loss": 0.0892, "step": 1310 }, { "epoch": 1.7861035422343323, "grad_norm": 3.5169661526889877, "learning_rate": 1.8865263965350754e-05, "loss": 0.0336, "step": 1311 }, { "epoch": 1.7874659400544959, "grad_norm": 8.346645941764702, "learning_rate": 1.8863221318428596e-05, "loss": 0.0548, "step": 1312 }, { "epoch": 1.7888283378746594, "grad_norm": 13.580495937899494, "learning_rate": 1.8861176945456542e-05, "loss": 0.073, "step": 1313 }, { "epoch": 1.7901907356948228, "grad_norm": 2.3816390475873828, "learning_rate": 1.885913084683271e-05, "loss": 0.0575, "step": 1314 }, { "epoch": 1.7915531335149864, "grad_norm": 11.22370238844941, "learning_rate": 1.885708302295558e-05, "loss": 0.0818, "step": 1315 }, { "epoch": 1.79291553133515, "grad_norm": 10.523648677594192, "learning_rate": 1.8855033474223937e-05, "loss": 0.0423, "step": 1316 }, { "epoch": 1.7942779291553133, "grad_norm": 3.254952526189065, "learning_rate": 1.8852982201036922e-05, "loss": 0.0688, "step": 1317 }, { "epoch": 1.7956403269754768, "grad_norm": 6.27488889008601, "learning_rate": 1.8850929203794005e-05, "loss": 0.0351, "step": 1318 }, { "epoch": 1.7970027247956404, "grad_norm": 8.509858817415752, "learning_rate": 1.8848874482894995e-05, "loss": 0.0676, "step": 1319 }, { "epoch": 1.7983651226158037, "grad_norm": 2.072035473157954, "learning_rate": 1.884681803874003e-05, "loss": 0.042, "step": 1320 }, { "epoch": 1.7997275204359673, "grad_norm": 8.46063605445282, "learning_rate": 1.8844759871729593e-05, "loss": 0.0676, "step": 1321 }, { "epoch": 1.8010899182561309, "grad_norm": 5.8133354432565785, "learning_rate": 1.884269998226449e-05, "loss": 0.0584, "step": 1322 }, { "epoch": 1.8024523160762942, "grad_norm": 5.482140693120729, "learning_rate": 1.884063837074588e-05, "loss": 0.0412, "step": 1323 }, { "epoch": 1.8038147138964578, "grad_norm": 5.161332536445433, "learning_rate": 1.8838575037575243e-05, "loss": 0.0528, "step": 1324 }, { "epoch": 1.8051771117166213, "grad_norm": 2.5871775465545563, "learning_rate": 1.883650998315439e-05, "loss": 0.087, "step": 1325 }, { "epoch": 1.8065395095367847, "grad_norm": 2.9874428540734415, "learning_rate": 1.8834443207885492e-05, "loss": 0.0809, "step": 1326 }, { "epoch": 1.8079019073569482, "grad_norm": 5.400490618521038, "learning_rate": 1.8832374712171026e-05, "loss": 0.0541, "step": 1327 }, { "epoch": 1.8092643051771118, "grad_norm": 1.6670053181022322, "learning_rate": 1.8830304496413822e-05, "loss": 0.1055, "step": 1328 }, { "epoch": 1.8106267029972751, "grad_norm": 2.059743026095188, "learning_rate": 1.882823256101704e-05, "loss": 0.0387, "step": 1329 }, { "epoch": 1.8119891008174387, "grad_norm": 3.9482969802684846, "learning_rate": 1.8826158906384174e-05, "loss": 0.049, "step": 1330 }, { "epoch": 1.8133514986376023, "grad_norm": 2.2701593982063106, "learning_rate": 1.8824083532919054e-05, "loss": 0.0421, "step": 1331 }, { "epoch": 1.8147138964577656, "grad_norm": 1.3398325178144301, "learning_rate": 1.8822006441025844e-05, "loss": 0.0985, "step": 1332 }, { "epoch": 1.8160762942779292, "grad_norm": 3.8601449078066783, "learning_rate": 1.8819927631109043e-05, "loss": 0.081, "step": 1333 }, { "epoch": 1.8174386920980927, "grad_norm": 2.6455679832468952, "learning_rate": 1.8817847103573484e-05, "loss": 0.0865, "step": 1334 }, { "epoch": 1.818801089918256, "grad_norm": 2.389389419473169, "learning_rate": 1.881576485882434e-05, "loss": 0.1039, "step": 1335 }, { "epoch": 1.8201634877384196, "grad_norm": 5.52299250082807, "learning_rate": 1.8813680897267105e-05, "loss": 0.0568, "step": 1336 }, { "epoch": 1.8215258855585832, "grad_norm": 10.476215090575495, "learning_rate": 1.8811595219307622e-05, "loss": 0.1028, "step": 1337 }, { "epoch": 1.8228882833787465, "grad_norm": 5.848066664245126, "learning_rate": 1.880950782535206e-05, "loss": 0.0502, "step": 1338 }, { "epoch": 1.82425068119891, "grad_norm": 11.140270935648179, "learning_rate": 1.880741871580692e-05, "loss": 0.0858, "step": 1339 }, { "epoch": 1.8256130790190737, "grad_norm": 3.037791680020436, "learning_rate": 1.8805327891079055e-05, "loss": 0.1136, "step": 1340 }, { "epoch": 1.826975476839237, "grad_norm": 8.299731935791385, "learning_rate": 1.8803235351575623e-05, "loss": 0.0575, "step": 1341 }, { "epoch": 1.8283378746594006, "grad_norm": 4.739263147254709, "learning_rate": 1.880114109770414e-05, "loss": 0.0848, "step": 1342 }, { "epoch": 1.8297002724795641, "grad_norm": 4.44851471633581, "learning_rate": 1.8799045129872442e-05, "loss": 0.0795, "step": 1343 }, { "epoch": 1.8310626702997275, "grad_norm": 5.11719551317294, "learning_rate": 1.879694744848871e-05, "loss": 0.0554, "step": 1344 }, { "epoch": 1.832425068119891, "grad_norm": 7.7110417524822665, "learning_rate": 1.8794848053961447e-05, "loss": 0.0637, "step": 1345 }, { "epoch": 1.8337874659400546, "grad_norm": 1.2987586348723479, "learning_rate": 1.87927469466995e-05, "loss": 0.0682, "step": 1346 }, { "epoch": 1.835149863760218, "grad_norm": 3.8181944174159836, "learning_rate": 1.8790644127112042e-05, "loss": 0.0654, "step": 1347 }, { "epoch": 1.8365122615803815, "grad_norm": 7.1726643801082135, "learning_rate": 1.878853959560858e-05, "loss": 0.0813, "step": 1348 }, { "epoch": 1.837874659400545, "grad_norm": 2.1597619267451758, "learning_rate": 1.8786433352598963e-05, "loss": 0.0675, "step": 1349 }, { "epoch": 1.8392370572207084, "grad_norm": 4.9902846896620865, "learning_rate": 1.8784325398493363e-05, "loss": 0.0438, "step": 1350 }, { "epoch": 1.840599455040872, "grad_norm": 6.012448306997265, "learning_rate": 1.8782215733702286e-05, "loss": 0.0689, "step": 1351 }, { "epoch": 1.8419618528610355, "grad_norm": 4.192078458301871, "learning_rate": 1.8780104358636583e-05, "loss": 0.052, "step": 1352 }, { "epoch": 1.8433242506811989, "grad_norm": 9.527417947720329, "learning_rate": 1.8777991273707422e-05, "loss": 0.0855, "step": 1353 }, { "epoch": 1.8446866485013624, "grad_norm": 2.8282495623342387, "learning_rate": 1.8775876479326314e-05, "loss": 0.0599, "step": 1354 }, { "epoch": 1.846049046321526, "grad_norm": 5.19910250061087, "learning_rate": 1.8773759975905098e-05, "loss": 0.0342, "step": 1355 }, { "epoch": 1.8474114441416893, "grad_norm": 1.8387660802416748, "learning_rate": 1.8771641763855956e-05, "loss": 0.0854, "step": 1356 }, { "epoch": 1.848773841961853, "grad_norm": 3.425639663763872, "learning_rate": 1.8769521843591386e-05, "loss": 0.0847, "step": 1357 }, { "epoch": 1.8501362397820165, "grad_norm": 2.309959634210905, "learning_rate": 1.876740021552423e-05, "loss": 0.0996, "step": 1358 }, { "epoch": 1.8514986376021798, "grad_norm": 2.2227310099337756, "learning_rate": 1.876527688006766e-05, "loss": 0.0385, "step": 1359 }, { "epoch": 1.8528610354223434, "grad_norm": 1.1675962684266998, "learning_rate": 1.8763151837635184e-05, "loss": 0.0492, "step": 1360 }, { "epoch": 1.854223433242507, "grad_norm": 1.9217471113071307, "learning_rate": 1.8761025088640633e-05, "loss": 0.0722, "step": 1361 }, { "epoch": 1.8555858310626703, "grad_norm": 1.7591760536645444, "learning_rate": 1.8758896633498184e-05, "loss": 0.0916, "step": 1362 }, { "epoch": 1.8569482288828338, "grad_norm": 4.58235279945573, "learning_rate": 1.875676647262233e-05, "loss": 0.0963, "step": 1363 }, { "epoch": 1.8583106267029974, "grad_norm": 4.149028749579163, "learning_rate": 1.8754634606427914e-05, "loss": 0.0505, "step": 1364 }, { "epoch": 1.8596730245231607, "grad_norm": 3.0411543700960952, "learning_rate": 1.8752501035330096e-05, "loss": 0.0555, "step": 1365 }, { "epoch": 1.861035422343324, "grad_norm": 0.9851834230015283, "learning_rate": 1.8750365759744373e-05, "loss": 0.0626, "step": 1366 }, { "epoch": 1.8623978201634879, "grad_norm": 4.516240892230305, "learning_rate": 1.8748228780086578e-05, "loss": 0.0814, "step": 1367 }, { "epoch": 1.8637602179836512, "grad_norm": 3.087386750584683, "learning_rate": 1.8746090096772877e-05, "loss": 0.0696, "step": 1368 }, { "epoch": 1.8651226158038146, "grad_norm": 2.1738666202887003, "learning_rate": 1.874394971021975e-05, "loss": 0.0589, "step": 1369 }, { "epoch": 1.8664850136239783, "grad_norm": 2.5747885475597094, "learning_rate": 1.8741807620844037e-05, "loss": 0.0909, "step": 1370 }, { "epoch": 1.8678474114441417, "grad_norm": 2.6049163029901736, "learning_rate": 1.8739663829062884e-05, "loss": 0.0753, "step": 1371 }, { "epoch": 1.869209809264305, "grad_norm": 1.6542345226026436, "learning_rate": 1.8737518335293785e-05, "loss": 0.0565, "step": 1372 }, { "epoch": 1.8705722070844688, "grad_norm": 1.8486571646866976, "learning_rate": 1.873537113995456e-05, "loss": 0.0807, "step": 1373 }, { "epoch": 1.8719346049046321, "grad_norm": 1.6079670593679305, "learning_rate": 1.8733222243463357e-05, "loss": 0.0755, "step": 1374 }, { "epoch": 1.8732970027247955, "grad_norm": 3.042085729837508, "learning_rate": 1.873107164623866e-05, "loss": 0.0538, "step": 1375 }, { "epoch": 1.8746594005449593, "grad_norm": 1.425371902739451, "learning_rate": 1.8728919348699285e-05, "loss": 0.0795, "step": 1376 }, { "epoch": 1.8760217983651226, "grad_norm": 3.664411453146259, "learning_rate": 1.872676535126437e-05, "loss": 0.068, "step": 1377 }, { "epoch": 1.877384196185286, "grad_norm": 6.820119883721684, "learning_rate": 1.8724609654353397e-05, "loss": 0.0963, "step": 1378 }, { "epoch": 1.8787465940054497, "grad_norm": 7.379645980400941, "learning_rate": 1.8722452258386172e-05, "loss": 0.0685, "step": 1379 }, { "epoch": 1.880108991825613, "grad_norm": 4.678853802880201, "learning_rate": 1.872029316378283e-05, "loss": 0.0711, "step": 1380 }, { "epoch": 1.8814713896457764, "grad_norm": 1.0156137725654273, "learning_rate": 1.8718132370963843e-05, "loss": 0.0702, "step": 1381 }, { "epoch": 1.8828337874659402, "grad_norm": 4.716804850455912, "learning_rate": 1.8715969880350013e-05, "loss": 0.0689, "step": 1382 }, { "epoch": 1.8841961852861036, "grad_norm": 3.677897659292998, "learning_rate": 1.8713805692362458e-05, "loss": 0.0754, "step": 1383 }, { "epoch": 1.885558583106267, "grad_norm": 1.5026292046609349, "learning_rate": 1.8711639807422646e-05, "loss": 0.0535, "step": 1384 }, { "epoch": 1.8869209809264307, "grad_norm": 6.6277953293916365, "learning_rate": 1.8709472225952372e-05, "loss": 0.0698, "step": 1385 }, { "epoch": 1.888283378746594, "grad_norm": 1.8513688015942964, "learning_rate": 1.870730294837375e-05, "loss": 0.0589, "step": 1386 }, { "epoch": 1.8896457765667574, "grad_norm": 4.7207323885504335, "learning_rate": 1.8705131975109235e-05, "loss": 0.0658, "step": 1387 }, { "epoch": 1.891008174386921, "grad_norm": 1.972440460033183, "learning_rate": 1.8702959306581605e-05, "loss": 0.0579, "step": 1388 }, { "epoch": 1.8923705722070845, "grad_norm": 0.9916319423210729, "learning_rate": 1.8700784943213977e-05, "loss": 0.0791, "step": 1389 }, { "epoch": 1.8937329700272478, "grad_norm": 1.5398177095966037, "learning_rate": 1.869860888542979e-05, "loss": 0.0496, "step": 1390 }, { "epoch": 1.8950953678474114, "grad_norm": 2.1197673028193926, "learning_rate": 1.8696431133652818e-05, "loss": 0.0854, "step": 1391 }, { "epoch": 1.896457765667575, "grad_norm": 2.508668947361539, "learning_rate": 1.869425168830716e-05, "loss": 0.0764, "step": 1392 }, { "epoch": 1.8978201634877383, "grad_norm": 2.1364173285282826, "learning_rate": 1.8692070549817253e-05, "loss": 0.061, "step": 1393 }, { "epoch": 1.8991825613079019, "grad_norm": 1.1645604016532523, "learning_rate": 1.868988771860785e-05, "loss": 0.0847, "step": 1394 }, { "epoch": 1.9005449591280654, "grad_norm": 1.0442278173491528, "learning_rate": 1.8687703195104047e-05, "loss": 0.0658, "step": 1395 }, { "epoch": 1.9019073569482288, "grad_norm": 1.5599270251302404, "learning_rate": 1.8685516979731266e-05, "loss": 0.0394, "step": 1396 }, { "epoch": 1.9032697547683923, "grad_norm": 1.8813814617681057, "learning_rate": 1.8683329072915253e-05, "loss": 0.0428, "step": 1397 }, { "epoch": 1.904632152588556, "grad_norm": 1.4255531384233378, "learning_rate": 1.868113947508209e-05, "loss": 0.0447, "step": 1398 }, { "epoch": 1.9059945504087192, "grad_norm": 3.421934447960191, "learning_rate": 1.8678948186658187e-05, "loss": 0.0564, "step": 1399 }, { "epoch": 1.9073569482288828, "grad_norm": 1.3901222605704024, "learning_rate": 1.8676755208070276e-05, "loss": 0.0975, "step": 1400 }, { "epoch": 1.9087193460490464, "grad_norm": 5.3937725140502035, "learning_rate": 1.867456053974543e-05, "loss": 0.0738, "step": 1401 }, { "epoch": 1.9100817438692097, "grad_norm": 2.5158538289459664, "learning_rate": 1.8672364182111046e-05, "loss": 0.0621, "step": 1402 }, { "epoch": 1.9114441416893733, "grad_norm": 3.9737569191454587, "learning_rate": 1.8670166135594843e-05, "loss": 0.0625, "step": 1403 }, { "epoch": 1.9128065395095368, "grad_norm": 5.292179251683473, "learning_rate": 1.866796640062488e-05, "loss": 0.0548, "step": 1404 }, { "epoch": 1.9141689373297002, "grad_norm": 1.8746752442492007, "learning_rate": 1.8665764977629542e-05, "loss": 0.0707, "step": 1405 }, { "epoch": 1.9155313351498637, "grad_norm": 5.352024180031912, "learning_rate": 1.8663561867037533e-05, "loss": 0.0698, "step": 1406 }, { "epoch": 1.9168937329700273, "grad_norm": 1.6605120322445373, "learning_rate": 1.8661357069277903e-05, "loss": 0.0755, "step": 1407 }, { "epoch": 1.9182561307901906, "grad_norm": 1.4553093835676199, "learning_rate": 1.865915058478001e-05, "loss": 0.0384, "step": 1408 }, { "epoch": 1.9196185286103542, "grad_norm": 6.225977308112692, "learning_rate": 1.8656942413973556e-05, "loss": 0.0641, "step": 1409 }, { "epoch": 1.9209809264305178, "grad_norm": 1.4016502346683686, "learning_rate": 1.865473255728857e-05, "loss": 0.0627, "step": 1410 }, { "epoch": 1.922343324250681, "grad_norm": 2.634589716264124, "learning_rate": 1.8652521015155404e-05, "loss": 0.0493, "step": 1411 }, { "epoch": 1.9237057220708447, "grad_norm": 3.563322593079842, "learning_rate": 1.8650307788004735e-05, "loss": 0.089, "step": 1412 }, { "epoch": 1.9250681198910082, "grad_norm": 5.536147378318501, "learning_rate": 1.864809287626758e-05, "loss": 0.0671, "step": 1413 }, { "epoch": 1.9264305177111716, "grad_norm": 2.2620533444138116, "learning_rate": 1.864587628037528e-05, "loss": 0.0752, "step": 1414 }, { "epoch": 1.9277929155313351, "grad_norm": 1.8156474117793373, "learning_rate": 1.8643658000759495e-05, "loss": 0.0533, "step": 1415 }, { "epoch": 1.9291553133514987, "grad_norm": 6.400312777255736, "learning_rate": 1.864143803785222e-05, "loss": 0.06, "step": 1416 }, { "epoch": 1.930517711171662, "grad_norm": 4.966116851759666, "learning_rate": 1.8639216392085778e-05, "loss": 0.0755, "step": 1417 }, { "epoch": 1.9318801089918256, "grad_norm": 2.2580903783657997, "learning_rate": 1.8636993063892822e-05, "loss": 0.0652, "step": 1418 }, { "epoch": 1.9332425068119892, "grad_norm": 8.717799880264476, "learning_rate": 1.8634768053706323e-05, "loss": 0.0637, "step": 1419 }, { "epoch": 1.9346049046321525, "grad_norm": 3.9691823222398526, "learning_rate": 1.863254136195959e-05, "loss": 0.0792, "step": 1420 }, { "epoch": 1.935967302452316, "grad_norm": 4.493892183769901, "learning_rate": 1.8630312989086257e-05, "loss": 0.0494, "step": 1421 }, { "epoch": 1.9373297002724796, "grad_norm": 9.226731402213641, "learning_rate": 1.862808293552028e-05, "loss": 0.0803, "step": 1422 }, { "epoch": 1.938692098092643, "grad_norm": 3.063767659495139, "learning_rate": 1.862585120169595e-05, "loss": 0.0839, "step": 1423 }, { "epoch": 1.9400544959128065, "grad_norm": 9.229730985676593, "learning_rate": 1.8623617788047882e-05, "loss": 0.1124, "step": 1424 }, { "epoch": 1.94141689373297, "grad_norm": 4.105806343651029, "learning_rate": 1.8621382695011013e-05, "loss": 0.0413, "step": 1425 }, { "epoch": 1.9427792915531334, "grad_norm": 3.5881294059357387, "learning_rate": 1.8619145923020614e-05, "loss": 0.0839, "step": 1426 }, { "epoch": 1.944141689373297, "grad_norm": 7.1317053082359925, "learning_rate": 1.8616907472512282e-05, "loss": 0.0588, "step": 1427 }, { "epoch": 1.9455040871934606, "grad_norm": 2.041437932369394, "learning_rate": 1.8614667343921936e-05, "loss": 0.057, "step": 1428 }, { "epoch": 1.946866485013624, "grad_norm": 6.138044887357014, "learning_rate": 1.861242553768583e-05, "loss": 0.0511, "step": 1429 }, { "epoch": 1.9482288828337875, "grad_norm": 2.445509901960952, "learning_rate": 1.861018205424053e-05, "loss": 0.0724, "step": 1430 }, { "epoch": 1.949591280653951, "grad_norm": 2.907797302309853, "learning_rate": 1.8607936894022954e-05, "loss": 0.071, "step": 1431 }, { "epoch": 1.9509536784741144, "grad_norm": 1.7261112406961867, "learning_rate": 1.860569005747032e-05, "loss": 0.0502, "step": 1432 }, { "epoch": 1.952316076294278, "grad_norm": 2.76837630146208, "learning_rate": 1.860344154502019e-05, "loss": 0.102, "step": 1433 }, { "epoch": 1.9536784741144415, "grad_norm": 4.63567647181488, "learning_rate": 1.860119135711044e-05, "loss": 0.0451, "step": 1434 }, { "epoch": 1.9550408719346049, "grad_norm": 4.38546094805022, "learning_rate": 1.859893949417928e-05, "loss": 0.0839, "step": 1435 }, { "epoch": 1.9564032697547684, "grad_norm": 3.0997499076142248, "learning_rate": 1.8596685956665245e-05, "loss": 0.1093, "step": 1436 }, { "epoch": 1.957765667574932, "grad_norm": 6.037939359066803, "learning_rate": 1.8594430745007197e-05, "loss": 0.0502, "step": 1437 }, { "epoch": 1.9591280653950953, "grad_norm": 5.474566401238722, "learning_rate": 1.8592173859644323e-05, "loss": 0.0711, "step": 1438 }, { "epoch": 1.9604904632152589, "grad_norm": 6.889691823646975, "learning_rate": 1.858991530101613e-05, "loss": 0.0544, "step": 1439 }, { "epoch": 1.9618528610354224, "grad_norm": 2.11327004611694, "learning_rate": 1.8587655069562464e-05, "loss": 0.1179, "step": 1440 }, { "epoch": 1.9632152588555858, "grad_norm": 4.063146037948928, "learning_rate": 1.8585393165723484e-05, "loss": 0.0943, "step": 1441 }, { "epoch": 1.9645776566757494, "grad_norm": 4.563086517712793, "learning_rate": 1.8583129589939682e-05, "loss": 0.0737, "step": 1442 }, { "epoch": 1.965940054495913, "grad_norm": 2.1841318141735364, "learning_rate": 1.8580864342651873e-05, "loss": 0.1153, "step": 1443 }, { "epoch": 1.9673024523160763, "grad_norm": 2.855032728092222, "learning_rate": 1.8578597424301192e-05, "loss": 0.0989, "step": 1444 }, { "epoch": 1.9686648501362398, "grad_norm": 2.1768585788027184, "learning_rate": 1.8576328835329117e-05, "loss": 0.0574, "step": 1445 }, { "epoch": 1.9700272479564034, "grad_norm": 6.60506426483279, "learning_rate": 1.8574058576177432e-05, "loss": 0.0986, "step": 1446 }, { "epoch": 1.9713896457765667, "grad_norm": 3.064543033912941, "learning_rate": 1.8571786647288256e-05, "loss": 0.0743, "step": 1447 }, { "epoch": 1.9727520435967303, "grad_norm": 3.532545256151224, "learning_rate": 1.8569513049104033e-05, "loss": 0.0431, "step": 1448 }, { "epoch": 1.9741144414168939, "grad_norm": 1.7625218462224634, "learning_rate": 1.8567237782067528e-05, "loss": 0.0552, "step": 1449 }, { "epoch": 1.9754768392370572, "grad_norm": 0.7571103528384893, "learning_rate": 1.8564960846621828e-05, "loss": 0.033, "step": 1450 }, { "epoch": 1.9768392370572208, "grad_norm": 5.370120679414657, "learning_rate": 1.856268224321036e-05, "loss": 0.0862, "step": 1451 }, { "epoch": 1.9782016348773843, "grad_norm": 1.8167447564943795, "learning_rate": 1.856040197227686e-05, "loss": 0.0703, "step": 1452 }, { "epoch": 1.9795640326975477, "grad_norm": 6.125187683527702, "learning_rate": 1.8558120034265396e-05, "loss": 0.0774, "step": 1453 }, { "epoch": 1.9809264305177112, "grad_norm": 2.8254034182437016, "learning_rate": 1.855583642962036e-05, "loss": 0.0682, "step": 1454 }, { "epoch": 1.9822888283378748, "grad_norm": 2.317282583054705, "learning_rate": 1.8553551158786467e-05, "loss": 0.052, "step": 1455 }, { "epoch": 1.9836512261580381, "grad_norm": 3.2765125702095035, "learning_rate": 1.8551264222208758e-05, "loss": 0.0804, "step": 1456 }, { "epoch": 1.9850136239782015, "grad_norm": 2.304134842927453, "learning_rate": 1.85489756203326e-05, "loss": 0.0339, "step": 1457 }, { "epoch": 1.9863760217983653, "grad_norm": 1.1262494142101707, "learning_rate": 1.8546685353603674e-05, "loss": 0.0705, "step": 1458 }, { "epoch": 1.9877384196185286, "grad_norm": 1.6769191277390894, "learning_rate": 1.8544393422468002e-05, "loss": 0.0769, "step": 1459 }, { "epoch": 1.989100817438692, "grad_norm": 2.5016357408430934, "learning_rate": 1.854209982737192e-05, "loss": 0.0719, "step": 1460 }, { "epoch": 1.9904632152588557, "grad_norm": 2.1790225920277955, "learning_rate": 1.8539804568762084e-05, "loss": 0.0621, "step": 1461 }, { "epoch": 1.991825613079019, "grad_norm": 0.9783220168580641, "learning_rate": 1.853750764708549e-05, "loss": 0.0475, "step": 1462 }, { "epoch": 1.9931880108991824, "grad_norm": 5.311155429591129, "learning_rate": 1.8535209062789434e-05, "loss": 0.0724, "step": 1463 }, { "epoch": 1.9945504087193462, "grad_norm": 0.7754404943723984, "learning_rate": 1.8532908816321557e-05, "loss": 0.0453, "step": 1464 }, { "epoch": 1.9959128065395095, "grad_norm": 3.7812630126140623, "learning_rate": 1.8530606908129818e-05, "loss": 0.0686, "step": 1465 }, { "epoch": 1.9972752043596729, "grad_norm": 3.319983207709068, "learning_rate": 1.852830333866249e-05, "loss": 0.0733, "step": 1466 }, { "epoch": 1.9986376021798367, "grad_norm": 3.3683948131882357, "learning_rate": 1.8525998108368185e-05, "loss": 0.0733, "step": 1467 }, { "epoch": 2.0, "grad_norm": 2.3007472525162393, "learning_rate": 1.852369121769582e-05, "loss": 0.0659, "step": 1468 }, { "epoch": 2.0, "eval_accuracy": 0.9376754632229085, "eval_f1": 0.92119898731627, "eval_loss": 0.08309026807546616, "eval_precision": 0.9158363283159896, "eval_recall": 0.9373803607849304, "eval_runtime": 16.9085, "eval_samples_per_second": 105.332, "eval_steps_per_second": 0.828, "step": 1468 }, { "epoch": 2.0013623978201633, "grad_norm": 3.5310712705826925, "learning_rate": 1.8521382667094658e-05, "loss": 0.0754, "step": 1469 }, { "epoch": 2.002724795640327, "grad_norm": 2.1716863659554986, "learning_rate": 1.8519072457014262e-05, "loss": 0.0792, "step": 1470 }, { "epoch": 2.0040871934604905, "grad_norm": 1.7611172653422311, "learning_rate": 1.8516760587904538e-05, "loss": 0.0663, "step": 1471 }, { "epoch": 2.005449591280654, "grad_norm": 3.9385898746926395, "learning_rate": 1.85144470602157e-05, "loss": 0.074, "step": 1472 }, { "epoch": 2.0068119891008176, "grad_norm": 4.749432094200595, "learning_rate": 1.851213187439829e-05, "loss": 0.0423, "step": 1473 }, { "epoch": 2.008174386920981, "grad_norm": 3.49288832349029, "learning_rate": 1.8509815030903185e-05, "loss": 0.0522, "step": 1474 }, { "epoch": 2.0095367847411443, "grad_norm": 2.639760741477273, "learning_rate": 1.850749653018156e-05, "loss": 0.0431, "step": 1475 }, { "epoch": 2.010899182561308, "grad_norm": 2.0858403832413024, "learning_rate": 1.8505176372684937e-05, "loss": 0.046, "step": 1476 }, { "epoch": 2.0122615803814714, "grad_norm": 2.419278160280498, "learning_rate": 1.8502854558865145e-05, "loss": 0.0581, "step": 1477 }, { "epoch": 2.0136239782016347, "grad_norm": 2.352455069674849, "learning_rate": 1.8500531089174343e-05, "loss": 0.0358, "step": 1478 }, { "epoch": 2.0149863760217985, "grad_norm": 0.9024912520057123, "learning_rate": 1.8498205964065012e-05, "loss": 0.0556, "step": 1479 }, { "epoch": 2.016348773841962, "grad_norm": 4.126257235089047, "learning_rate": 1.8495879183989945e-05, "loss": 0.0571, "step": 1480 }, { "epoch": 2.017711171662125, "grad_norm": 2.1781946685002893, "learning_rate": 1.8493550749402278e-05, "loss": 0.0438, "step": 1481 }, { "epoch": 2.019073569482289, "grad_norm": 3.4905009516166077, "learning_rate": 1.8491220660755452e-05, "loss": 0.0272, "step": 1482 }, { "epoch": 2.0204359673024523, "grad_norm": 4.70996844214945, "learning_rate": 1.848888891850323e-05, "loss": 0.1118, "step": 1483 }, { "epoch": 2.0217983651226157, "grad_norm": 1.234773719140926, "learning_rate": 1.8486555523099712e-05, "loss": 0.0276, "step": 1484 }, { "epoch": 2.0231607629427795, "grad_norm": 6.182604393483949, "learning_rate": 1.8484220474999305e-05, "loss": 0.0803, "step": 1485 }, { "epoch": 2.024523160762943, "grad_norm": 1.3387207673062245, "learning_rate": 1.8481883774656743e-05, "loss": 0.0406, "step": 1486 }, { "epoch": 2.025885558583106, "grad_norm": 4.734504506170737, "learning_rate": 1.8479545422527083e-05, "loss": 0.0578, "step": 1487 }, { "epoch": 2.02724795640327, "grad_norm": 2.3379978128683363, "learning_rate": 1.8477205419065702e-05, "loss": 0.0634, "step": 1488 }, { "epoch": 2.0286103542234333, "grad_norm": 2.862332043129353, "learning_rate": 1.8474863764728298e-05, "loss": 0.0498, "step": 1489 }, { "epoch": 2.0299727520435966, "grad_norm": 5.015218411268047, "learning_rate": 1.8472520459970896e-05, "loss": 0.0584, "step": 1490 }, { "epoch": 2.0313351498637604, "grad_norm": 1.8844984728912393, "learning_rate": 1.8470175505249837e-05, "loss": 0.071, "step": 1491 }, { "epoch": 2.0326975476839237, "grad_norm": 3.6817105487345607, "learning_rate": 1.846782890102178e-05, "loss": 0.0351, "step": 1492 }, { "epoch": 2.034059945504087, "grad_norm": 4.404992799306951, "learning_rate": 1.8465480647743713e-05, "loss": 0.0539, "step": 1493 }, { "epoch": 2.035422343324251, "grad_norm": 1.6812014792736998, "learning_rate": 1.8463130745872942e-05, "loss": 0.0524, "step": 1494 }, { "epoch": 2.036784741144414, "grad_norm": 2.6452037007625986, "learning_rate": 1.846077919586709e-05, "loss": 0.0675, "step": 1495 }, { "epoch": 2.0381471389645776, "grad_norm": 5.270897460332697, "learning_rate": 1.8458425998184114e-05, "loss": 0.0388, "step": 1496 }, { "epoch": 2.0395095367847413, "grad_norm": 1.6163472892088488, "learning_rate": 1.845607115328227e-05, "loss": 0.0543, "step": 1497 }, { "epoch": 2.0408719346049047, "grad_norm": 4.045988396244454, "learning_rate": 1.8453714661620153e-05, "loss": 0.0689, "step": 1498 }, { "epoch": 2.042234332425068, "grad_norm": 2.6033967760109586, "learning_rate": 1.845135652365668e-05, "loss": 0.0388, "step": 1499 }, { "epoch": 2.043596730245232, "grad_norm": 1.1796535347005792, "learning_rate": 1.8448996739851073e-05, "loss": 0.0434, "step": 1500 }, { "epoch": 2.044959128065395, "grad_norm": 3.429078445925941, "learning_rate": 1.8446635310662886e-05, "loss": 0.0588, "step": 1501 }, { "epoch": 2.0463215258855585, "grad_norm": 7.155270999725693, "learning_rate": 1.844427223655199e-05, "loss": 0.0316, "step": 1502 }, { "epoch": 2.0476839237057223, "grad_norm": 2.0832059732664976, "learning_rate": 1.844190751797858e-05, "loss": 0.0867, "step": 1503 }, { "epoch": 2.0490463215258856, "grad_norm": 12.152674185578988, "learning_rate": 1.8439541155403167e-05, "loss": 0.0608, "step": 1504 }, { "epoch": 2.050408719346049, "grad_norm": 6.406515478471289, "learning_rate": 1.8437173149286582e-05, "loss": 0.0601, "step": 1505 }, { "epoch": 2.0517711171662127, "grad_norm": 9.305624803191671, "learning_rate": 1.8434803500089977e-05, "loss": 0.0438, "step": 1506 }, { "epoch": 2.053133514986376, "grad_norm": 10.265916693553008, "learning_rate": 1.8432432208274825e-05, "loss": 0.0457, "step": 1507 }, { "epoch": 2.0544959128065394, "grad_norm": 1.5806065648924963, "learning_rate": 1.843005927430292e-05, "loss": 0.0457, "step": 1508 }, { "epoch": 2.055858310626703, "grad_norm": 10.931676778609788, "learning_rate": 1.842768469863637e-05, "loss": 0.0741, "step": 1509 }, { "epoch": 2.0572207084468666, "grad_norm": 7.704582704376154, "learning_rate": 1.8425308481737615e-05, "loss": 0.073, "step": 1510 }, { "epoch": 2.05858310626703, "grad_norm": 5.660406128657401, "learning_rate": 1.8422930624069398e-05, "loss": 0.0395, "step": 1511 }, { "epoch": 2.0599455040871932, "grad_norm": 12.784568017180431, "learning_rate": 1.8420551126094793e-05, "loss": 0.0606, "step": 1512 }, { "epoch": 2.061307901907357, "grad_norm": 2.6776976904939773, "learning_rate": 1.8418169988277193e-05, "loss": 0.045, "step": 1513 }, { "epoch": 2.0626702997275204, "grad_norm": 11.2939512549339, "learning_rate": 1.8415787211080304e-05, "loss": 0.0445, "step": 1514 }, { "epoch": 2.0640326975476837, "grad_norm": 9.134973938081826, "learning_rate": 1.8413402794968157e-05, "loss": 0.0383, "step": 1515 }, { "epoch": 2.0653950953678475, "grad_norm": 2.7385949372624117, "learning_rate": 1.8411016740405103e-05, "loss": 0.0455, "step": 1516 }, { "epoch": 2.066757493188011, "grad_norm": 10.414605901753756, "learning_rate": 1.8408629047855804e-05, "loss": 0.0383, "step": 1517 }, { "epoch": 2.068119891008174, "grad_norm": 5.903661651220329, "learning_rate": 1.8406239717785247e-05, "loss": 0.0433, "step": 1518 }, { "epoch": 2.069482288828338, "grad_norm": 3.966702606266931, "learning_rate": 1.8403848750658744e-05, "loss": 0.0258, "step": 1519 }, { "epoch": 2.0708446866485013, "grad_norm": 8.950831351969423, "learning_rate": 1.840145614694191e-05, "loss": 0.0713, "step": 1520 }, { "epoch": 2.0722070844686646, "grad_norm": 2.670258329354054, "learning_rate": 1.8399061907100693e-05, "loss": 0.0456, "step": 1521 }, { "epoch": 2.0735694822888284, "grad_norm": 4.403387609788619, "learning_rate": 1.8396666031601352e-05, "loss": 0.0586, "step": 1522 }, { "epoch": 2.0749318801089918, "grad_norm": 7.47385932269498, "learning_rate": 1.8394268520910467e-05, "loss": 0.0495, "step": 1523 }, { "epoch": 2.076294277929155, "grad_norm": 2.3392605873740364, "learning_rate": 1.8391869375494938e-05, "loss": 0.0464, "step": 1524 }, { "epoch": 2.077656675749319, "grad_norm": 8.870155043281976, "learning_rate": 1.8389468595821986e-05, "loss": 0.0827, "step": 1525 }, { "epoch": 2.0790190735694822, "grad_norm": 7.495172480029597, "learning_rate": 1.8387066182359135e-05, "loss": 0.0678, "step": 1526 }, { "epoch": 2.0803814713896456, "grad_norm": 4.0606163423885375, "learning_rate": 1.8384662135574243e-05, "loss": 0.0409, "step": 1527 }, { "epoch": 2.0817438692098094, "grad_norm": 7.991340757585969, "learning_rate": 1.8382256455935482e-05, "loss": 0.0835, "step": 1528 }, { "epoch": 2.0831062670299727, "grad_norm": 5.8002322131147865, "learning_rate": 1.8379849143911342e-05, "loss": 0.0535, "step": 1529 }, { "epoch": 2.084468664850136, "grad_norm": 6.967028061988623, "learning_rate": 1.8377440199970632e-05, "loss": 0.0627, "step": 1530 }, { "epoch": 2.0858310626703, "grad_norm": 10.487697614454092, "learning_rate": 1.8375029624582473e-05, "loss": 0.0507, "step": 1531 }, { "epoch": 2.087193460490463, "grad_norm": 3.5817660379959904, "learning_rate": 1.8372617418216308e-05, "loss": 0.0185, "step": 1532 }, { "epoch": 2.0885558583106265, "grad_norm": 4.959903490124983, "learning_rate": 1.8370203581341894e-05, "loss": 0.048, "step": 1533 }, { "epoch": 2.0899182561307903, "grad_norm": 5.550345900737738, "learning_rate": 1.8367788114429317e-05, "loss": 0.0985, "step": 1534 }, { "epoch": 2.0912806539509536, "grad_norm": 1.290382488876572, "learning_rate": 1.8365371017948966e-05, "loss": 0.0513, "step": 1535 }, { "epoch": 2.092643051771117, "grad_norm": 6.406319226770183, "learning_rate": 1.8362952292371557e-05, "loss": 0.0378, "step": 1536 }, { "epoch": 2.0940054495912808, "grad_norm": 4.954462332494713, "learning_rate": 1.8360531938168115e-05, "loss": 0.0222, "step": 1537 }, { "epoch": 2.095367847411444, "grad_norm": 2.182817349753275, "learning_rate": 1.8358109955809993e-05, "loss": 0.0567, "step": 1538 }, { "epoch": 2.0967302452316074, "grad_norm": 4.175809409040535, "learning_rate": 1.8355686345768847e-05, "loss": 0.0488, "step": 1539 }, { "epoch": 2.0980926430517712, "grad_norm": 3.5652759760675865, "learning_rate": 1.835326110851667e-05, "loss": 0.0428, "step": 1540 }, { "epoch": 2.0994550408719346, "grad_norm": 2.8864956729167117, "learning_rate": 1.8350834244525747e-05, "loss": 0.043, "step": 1541 }, { "epoch": 2.100817438692098, "grad_norm": 7.1139118806424095, "learning_rate": 1.83484057542687e-05, "loss": 0.0759, "step": 1542 }, { "epoch": 2.1021798365122617, "grad_norm": 4.7552118549079445, "learning_rate": 1.834597563821846e-05, "loss": 0.0385, "step": 1543 }, { "epoch": 2.103542234332425, "grad_norm": 2.3159142790543634, "learning_rate": 1.8343543896848275e-05, "loss": 0.0595, "step": 1544 }, { "epoch": 2.1049046321525884, "grad_norm": 7.0123290276034025, "learning_rate": 1.8341110530631705e-05, "loss": 0.0491, "step": 1545 }, { "epoch": 2.106267029972752, "grad_norm": 2.4059760147026665, "learning_rate": 1.8338675540042633e-05, "loss": 0.063, "step": 1546 }, { "epoch": 2.1076294277929155, "grad_norm": 6.370941070954723, "learning_rate": 1.8336238925555263e-05, "loss": 0.0687, "step": 1547 }, { "epoch": 2.108991825613079, "grad_norm": 2.100260263342582, "learning_rate": 1.83338006876441e-05, "loss": 0.0611, "step": 1548 }, { "epoch": 2.1103542234332426, "grad_norm": 3.85682822716398, "learning_rate": 1.8331360826783973e-05, "loss": 0.0638, "step": 1549 }, { "epoch": 2.111716621253406, "grad_norm": 2.114826252762876, "learning_rate": 1.8328919343450036e-05, "loss": 0.0323, "step": 1550 }, { "epoch": 2.1130790190735693, "grad_norm": 2.1949745953773117, "learning_rate": 1.8326476238117745e-05, "loss": 0.0431, "step": 1551 }, { "epoch": 2.114441416893733, "grad_norm": 7.222876439884929, "learning_rate": 1.8324031511262877e-05, "loss": 0.0635, "step": 1552 }, { "epoch": 2.1158038147138964, "grad_norm": 2.466265460592054, "learning_rate": 1.832158516336153e-05, "loss": 0.0417, "step": 1553 }, { "epoch": 2.11716621253406, "grad_norm": 1.8937137734614162, "learning_rate": 1.8319137194890104e-05, "loss": 0.0728, "step": 1554 }, { "epoch": 2.1185286103542236, "grad_norm": 2.8093874401897914, "learning_rate": 1.8316687606325335e-05, "loss": 0.0349, "step": 1555 }, { "epoch": 2.119891008174387, "grad_norm": 3.223824727464659, "learning_rate": 1.8314236398144256e-05, "loss": 0.0933, "step": 1556 }, { "epoch": 2.1212534059945503, "grad_norm": 1.5688734862094293, "learning_rate": 1.831178357082422e-05, "loss": 0.0307, "step": 1557 }, { "epoch": 2.122615803814714, "grad_norm": 0.9716526298080255, "learning_rate": 1.8309329124842906e-05, "loss": 0.0696, "step": 1558 }, { "epoch": 2.1239782016348774, "grad_norm": 2.3897558771968197, "learning_rate": 1.8306873060678295e-05, "loss": 0.0233, "step": 1559 }, { "epoch": 2.1253405994550407, "grad_norm": 4.622507672870106, "learning_rate": 1.8304415378808694e-05, "loss": 0.0426, "step": 1560 }, { "epoch": 2.1267029972752045, "grad_norm": 1.3222755031256663, "learning_rate": 1.8301956079712707e-05, "loss": 0.0278, "step": 1561 }, { "epoch": 2.128065395095368, "grad_norm": 0.8979424426503565, "learning_rate": 1.8299495163869277e-05, "loss": 0.0381, "step": 1562 }, { "epoch": 2.129427792915531, "grad_norm": 1.6538360545392423, "learning_rate": 1.8297032631757642e-05, "loss": 0.0307, "step": 1563 }, { "epoch": 2.130790190735695, "grad_norm": 1.8469295449114536, "learning_rate": 1.8294568483857368e-05, "loss": 0.0593, "step": 1564 }, { "epoch": 2.1321525885558583, "grad_norm": 1.8724960251781648, "learning_rate": 1.8292102720648333e-05, "loss": 0.0585, "step": 1565 }, { "epoch": 2.1335149863760217, "grad_norm": 1.427419338800114, "learning_rate": 1.828963534261072e-05, "loss": 0.0434, "step": 1566 }, { "epoch": 2.1348773841961854, "grad_norm": 1.5552783540545705, "learning_rate": 1.8287166350225034e-05, "loss": 0.048, "step": 1567 }, { "epoch": 2.136239782016349, "grad_norm": 1.0915619901836089, "learning_rate": 1.8284695743972095e-05, "loss": 0.0373, "step": 1568 }, { "epoch": 2.137602179836512, "grad_norm": 3.855934628249622, "learning_rate": 1.828222352433304e-05, "loss": 0.0644, "step": 1569 }, { "epoch": 2.138964577656676, "grad_norm": 2.798628811228492, "learning_rate": 1.8279749691789316e-05, "loss": 0.0454, "step": 1570 }, { "epoch": 2.1403269754768393, "grad_norm": 3.514754744033713, "learning_rate": 1.827727424682268e-05, "loss": 0.054, "step": 1571 }, { "epoch": 2.1416893732970026, "grad_norm": 0.9824336086972407, "learning_rate": 1.827479718991521e-05, "loss": 0.0317, "step": 1572 }, { "epoch": 2.1430517711171664, "grad_norm": 4.809350452829025, "learning_rate": 1.8272318521549293e-05, "loss": 0.0536, "step": 1573 }, { "epoch": 2.1444141689373297, "grad_norm": 2.5520947243871617, "learning_rate": 1.8269838242207634e-05, "loss": 0.0936, "step": 1574 }, { "epoch": 2.145776566757493, "grad_norm": 2.7089263079790493, "learning_rate": 1.8267356352373252e-05, "loss": 0.029, "step": 1575 }, { "epoch": 2.147138964577657, "grad_norm": 2.2909472681716645, "learning_rate": 1.8264872852529478e-05, "loss": 0.031, "step": 1576 }, { "epoch": 2.14850136239782, "grad_norm": 2.764953119839949, "learning_rate": 1.826238774315995e-05, "loss": 0.0414, "step": 1577 }, { "epoch": 2.1498637602179835, "grad_norm": 3.6810542045119865, "learning_rate": 1.825990102474863e-05, "loss": 0.0439, "step": 1578 }, { "epoch": 2.1512261580381473, "grad_norm": 2.415077118485967, "learning_rate": 1.825741269777979e-05, "loss": 0.0581, "step": 1579 }, { "epoch": 2.1525885558583107, "grad_norm": 0.9663050178131266, "learning_rate": 1.825492276273801e-05, "loss": 0.029, "step": 1580 }, { "epoch": 2.153950953678474, "grad_norm": 3.2892358961308465, "learning_rate": 1.8252431220108192e-05, "loss": 0.0591, "step": 1581 }, { "epoch": 2.155313351498638, "grad_norm": 2.2455575803369396, "learning_rate": 1.8249938070375543e-05, "loss": 0.0609, "step": 1582 }, { "epoch": 2.156675749318801, "grad_norm": 4.222483182312733, "learning_rate": 1.8247443314025585e-05, "loss": 0.0255, "step": 1583 }, { "epoch": 2.1580381471389645, "grad_norm": 5.106176062902703, "learning_rate": 1.824494695154416e-05, "loss": 0.0461, "step": 1584 }, { "epoch": 2.1594005449591283, "grad_norm": 3.193644368144876, "learning_rate": 1.824244898341741e-05, "loss": 0.0693, "step": 1585 }, { "epoch": 2.1607629427792916, "grad_norm": 5.195791091531736, "learning_rate": 1.8239949410131803e-05, "loss": 0.0327, "step": 1586 }, { "epoch": 2.162125340599455, "grad_norm": 2.0019842325718615, "learning_rate": 1.823744823217411e-05, "loss": 0.0667, "step": 1587 }, { "epoch": 2.1634877384196187, "grad_norm": 2.0925282360909927, "learning_rate": 1.8234945450031417e-05, "loss": 0.0395, "step": 1588 }, { "epoch": 2.164850136239782, "grad_norm": 1.7913478193909123, "learning_rate": 1.8232441064191125e-05, "loss": 0.0679, "step": 1589 }, { "epoch": 2.1662125340599454, "grad_norm": 1.6927733934122644, "learning_rate": 1.822993507514095e-05, "loss": 0.0272, "step": 1590 }, { "epoch": 2.167574931880109, "grad_norm": 3.4741945318383447, "learning_rate": 1.8227427483368904e-05, "loss": 0.0413, "step": 1591 }, { "epoch": 2.1689373297002725, "grad_norm": 1.651410169394946, "learning_rate": 1.8224918289363333e-05, "loss": 0.0414, "step": 1592 }, { "epoch": 2.170299727520436, "grad_norm": 1.1644509136847645, "learning_rate": 1.8222407493612878e-05, "loss": 0.0412, "step": 1593 }, { "epoch": 2.1716621253405997, "grad_norm": 3.25900386758134, "learning_rate": 1.8219895096606504e-05, "loss": 0.0339, "step": 1594 }, { "epoch": 2.173024523160763, "grad_norm": 3.6281693134180775, "learning_rate": 1.8217381098833482e-05, "loss": 0.052, "step": 1595 }, { "epoch": 2.1743869209809263, "grad_norm": 1.8730129562208802, "learning_rate": 1.8214865500783393e-05, "loss": 0.046, "step": 1596 }, { "epoch": 2.17574931880109, "grad_norm": 2.676301852137592, "learning_rate": 1.8212348302946134e-05, "loss": 0.0526, "step": 1597 }, { "epoch": 2.1771117166212535, "grad_norm": 2.908069737319617, "learning_rate": 1.820982950581191e-05, "loss": 0.0626, "step": 1598 }, { "epoch": 2.178474114441417, "grad_norm": 3.565710390108232, "learning_rate": 1.820730910987124e-05, "loss": 0.0407, "step": 1599 }, { "epoch": 2.1798365122615806, "grad_norm": 2.636794150332545, "learning_rate": 1.8204787115614952e-05, "loss": 0.0389, "step": 1600 }, { "epoch": 2.181198910081744, "grad_norm": 1.9165501891517467, "learning_rate": 1.820226352353419e-05, "loss": 0.0268, "step": 1601 }, { "epoch": 2.1825613079019073, "grad_norm": 1.6601554731933046, "learning_rate": 1.81997383341204e-05, "loss": 0.055, "step": 1602 }, { "epoch": 2.183923705722071, "grad_norm": 2.0312935567686266, "learning_rate": 1.819721154786535e-05, "loss": 0.0339, "step": 1603 }, { "epoch": 2.1852861035422344, "grad_norm": 2.188039159253517, "learning_rate": 1.8194683165261116e-05, "loss": 0.0698, "step": 1604 }, { "epoch": 2.1866485013623977, "grad_norm": 1.99929310049688, "learning_rate": 1.8192153186800075e-05, "loss": 0.0422, "step": 1605 }, { "epoch": 2.1880108991825615, "grad_norm": 0.9146042419039005, "learning_rate": 1.8189621612974925e-05, "loss": 0.014, "step": 1606 }, { "epoch": 2.189373297002725, "grad_norm": 1.1986231039423512, "learning_rate": 1.8187088444278675e-05, "loss": 0.0323, "step": 1607 }, { "epoch": 2.190735694822888, "grad_norm": 3.195579434289185, "learning_rate": 1.818455368120464e-05, "loss": 0.071, "step": 1608 }, { "epoch": 2.192098092643052, "grad_norm": 5.5502412238461645, "learning_rate": 1.8182017324246448e-05, "loss": 0.0577, "step": 1609 }, { "epoch": 2.1934604904632153, "grad_norm": 3.4686473326515825, "learning_rate": 1.8179479373898036e-05, "loss": 0.0438, "step": 1610 }, { "epoch": 2.1948228882833787, "grad_norm": 1.977329810047122, "learning_rate": 1.817693983065365e-05, "loss": 0.0545, "step": 1611 }, { "epoch": 2.1961852861035425, "grad_norm": 4.717139794960698, "learning_rate": 1.817439869500785e-05, "loss": 0.0719, "step": 1612 }, { "epoch": 2.197547683923706, "grad_norm": 2.856474396877721, "learning_rate": 1.8171855967455508e-05, "loss": 0.0609, "step": 1613 }, { "epoch": 2.198910081743869, "grad_norm": 1.3374204082051835, "learning_rate": 1.8169311648491798e-05, "loss": 0.031, "step": 1614 }, { "epoch": 2.2002724795640325, "grad_norm": 4.012696166906004, "learning_rate": 1.8166765738612206e-05, "loss": 0.0269, "step": 1615 }, { "epoch": 2.2016348773841963, "grad_norm": 1.1853636368244977, "learning_rate": 1.8164218238312534e-05, "loss": 0.0507, "step": 1616 }, { "epoch": 2.2029972752043596, "grad_norm": 3.5111048588344387, "learning_rate": 1.816166914808889e-05, "loss": 0.0623, "step": 1617 }, { "epoch": 2.204359673024523, "grad_norm": 2.2334530137640725, "learning_rate": 1.815911846843769e-05, "loss": 0.0756, "step": 1618 }, { "epoch": 2.2057220708446867, "grad_norm": 3.3826635441107458, "learning_rate": 1.8156566199855657e-05, "loss": 0.0385, "step": 1619 }, { "epoch": 2.20708446866485, "grad_norm": 2.5436411400431433, "learning_rate": 1.8154012342839837e-05, "loss": 0.0473, "step": 1620 }, { "epoch": 2.2084468664850134, "grad_norm": 2.154889059656485, "learning_rate": 1.8151456897887566e-05, "loss": 0.0289, "step": 1621 }, { "epoch": 2.209809264305177, "grad_norm": 3.93263349638782, "learning_rate": 1.8148899865496503e-05, "loss": 0.0746, "step": 1622 }, { "epoch": 2.2111716621253406, "grad_norm": 2.3618243079693304, "learning_rate": 1.814634124616461e-05, "loss": 0.0259, "step": 1623 }, { "epoch": 2.212534059945504, "grad_norm": 1.6439041061397535, "learning_rate": 1.814378104039017e-05, "loss": 0.0475, "step": 1624 }, { "epoch": 2.2138964577656677, "grad_norm": 1.0867812620665578, "learning_rate": 1.8141219248671747e-05, "loss": 0.0439, "step": 1625 }, { "epoch": 2.215258855585831, "grad_norm": 1.9402618008324641, "learning_rate": 1.8138655871508245e-05, "loss": 0.0456, "step": 1626 }, { "epoch": 2.2166212534059944, "grad_norm": 1.158248486875599, "learning_rate": 1.8136090909398862e-05, "loss": 0.0507, "step": 1627 }, { "epoch": 2.217983651226158, "grad_norm": 4.0836107333058385, "learning_rate": 1.8133524362843105e-05, "loss": 0.0727, "step": 1628 }, { "epoch": 2.2193460490463215, "grad_norm": 3.8440367867358782, "learning_rate": 1.8130956232340788e-05, "loss": 0.0615, "step": 1629 }, { "epoch": 2.220708446866485, "grad_norm": 4.1617444041846445, "learning_rate": 1.8128386518392044e-05, "loss": 0.095, "step": 1630 }, { "epoch": 2.2220708446866486, "grad_norm": 3.021043516348499, "learning_rate": 1.8125815221497296e-05, "loss": 0.053, "step": 1631 }, { "epoch": 2.223433242506812, "grad_norm": 3.344391239733685, "learning_rate": 1.8123242342157293e-05, "loss": 0.0667, "step": 1632 }, { "epoch": 2.2247956403269753, "grad_norm": 2.5596504230562673, "learning_rate": 1.8120667880873086e-05, "loss": 0.076, "step": 1633 }, { "epoch": 2.226158038147139, "grad_norm": 3.4880693981502704, "learning_rate": 1.811809183814603e-05, "loss": 0.0514, "step": 1634 }, { "epoch": 2.2275204359673024, "grad_norm": 4.442253708879706, "learning_rate": 1.8115514214477793e-05, "loss": 0.035, "step": 1635 }, { "epoch": 2.2288828337874658, "grad_norm": 1.6842235665659215, "learning_rate": 1.8112935010370343e-05, "loss": 0.0363, "step": 1636 }, { "epoch": 2.2302452316076296, "grad_norm": 6.223740135434686, "learning_rate": 1.811035422632597e-05, "loss": 0.05, "step": 1637 }, { "epoch": 2.231607629427793, "grad_norm": 1.4119496676438688, "learning_rate": 1.810777186284726e-05, "loss": 0.0409, "step": 1638 }, { "epoch": 2.2329700272479562, "grad_norm": 4.554345978605188, "learning_rate": 1.8105187920437112e-05, "loss": 0.0823, "step": 1639 }, { "epoch": 2.23433242506812, "grad_norm": 3.9141134133656412, "learning_rate": 1.810260239959873e-05, "loss": 0.0401, "step": 1640 }, { "epoch": 2.2356948228882834, "grad_norm": 1.423274849759003, "learning_rate": 1.8100015300835624e-05, "loss": 0.043, "step": 1641 }, { "epoch": 2.2370572207084467, "grad_norm": 5.516859864954428, "learning_rate": 1.809742662465161e-05, "loss": 0.0498, "step": 1642 }, { "epoch": 2.2384196185286105, "grad_norm": 2.7034748003760445, "learning_rate": 1.8094836371550823e-05, "loss": 0.0253, "step": 1643 }, { "epoch": 2.239782016348774, "grad_norm": 5.887434101089413, "learning_rate": 1.8092244542037694e-05, "loss": 0.0406, "step": 1644 }, { "epoch": 2.241144414168937, "grad_norm": 5.03894177985759, "learning_rate": 1.8089651136616958e-05, "loss": 0.0642, "step": 1645 }, { "epoch": 2.242506811989101, "grad_norm": 2.16699832983453, "learning_rate": 1.808705615579367e-05, "loss": 0.0724, "step": 1646 }, { "epoch": 2.2438692098092643, "grad_norm": 4.42091949049522, "learning_rate": 1.8084459600073177e-05, "loss": 0.0566, "step": 1647 }, { "epoch": 2.2452316076294276, "grad_norm": 3.394306983014253, "learning_rate": 1.8081861469961144e-05, "loss": 0.0547, "step": 1648 }, { "epoch": 2.2465940054495914, "grad_norm": 2.013795910000289, "learning_rate": 1.8079261765963537e-05, "loss": 0.0651, "step": 1649 }, { "epoch": 2.2479564032697548, "grad_norm": 3.491200359317834, "learning_rate": 1.8076660488586632e-05, "loss": 0.0372, "step": 1650 }, { "epoch": 2.249318801089918, "grad_norm": 3.545513965551666, "learning_rate": 1.8074057638337012e-05, "loss": 0.0435, "step": 1651 }, { "epoch": 2.250681198910082, "grad_norm": 1.883688786186045, "learning_rate": 1.8071453215721554e-05, "loss": 0.044, "step": 1652 }, { "epoch": 2.2520435967302452, "grad_norm": 3.2297219152818335, "learning_rate": 1.806884722124746e-05, "loss": 0.0339, "step": 1653 }, { "epoch": 2.2534059945504086, "grad_norm": 1.9859118158411422, "learning_rate": 1.806623965542223e-05, "loss": 0.0701, "step": 1654 }, { "epoch": 2.2547683923705724, "grad_norm": 2.503310950289426, "learning_rate": 1.806363051875366e-05, "loss": 0.0397, "step": 1655 }, { "epoch": 2.2561307901907357, "grad_norm": 2.2116119621534858, "learning_rate": 1.806101981174987e-05, "loss": 0.0468, "step": 1656 }, { "epoch": 2.257493188010899, "grad_norm": 1.239459133551071, "learning_rate": 1.805840753491927e-05, "loss": 0.0397, "step": 1657 }, { "epoch": 2.258855585831063, "grad_norm": 2.8673724717065565, "learning_rate": 1.8055793688770586e-05, "loss": 0.0515, "step": 1658 }, { "epoch": 2.260217983651226, "grad_norm": 1.8285159971300684, "learning_rate": 1.8053178273812845e-05, "loss": 0.0609, "step": 1659 }, { "epoch": 2.2615803814713895, "grad_norm": 3.077087951687627, "learning_rate": 1.805056129055538e-05, "loss": 0.0527, "step": 1660 }, { "epoch": 2.2629427792915533, "grad_norm": 1.1171719856360427, "learning_rate": 1.8047942739507836e-05, "loss": 0.0258, "step": 1661 }, { "epoch": 2.2643051771117166, "grad_norm": 1.1786915914808678, "learning_rate": 1.804532262118015e-05, "loss": 0.0314, "step": 1662 }, { "epoch": 2.26566757493188, "grad_norm": 2.6028662707544195, "learning_rate": 1.8042700936082574e-05, "loss": 0.0506, "step": 1663 }, { "epoch": 2.2670299727520438, "grad_norm": 1.9664782298294337, "learning_rate": 1.8040077684725667e-05, "loss": 0.056, "step": 1664 }, { "epoch": 2.268392370572207, "grad_norm": 1.4288593815965616, "learning_rate": 1.8037452867620276e-05, "loss": 0.0394, "step": 1665 }, { "epoch": 2.2697547683923704, "grad_norm": 4.101391529490831, "learning_rate": 1.8034826485277583e-05, "loss": 0.0873, "step": 1666 }, { "epoch": 2.2711171662125342, "grad_norm": 2.8940827137113474, "learning_rate": 1.8032198538209042e-05, "loss": 0.0381, "step": 1667 }, { "epoch": 2.2724795640326976, "grad_norm": 2.9587784333111364, "learning_rate": 1.8029569026926438e-05, "loss": 0.049, "step": 1668 }, { "epoch": 2.273841961852861, "grad_norm": 3.4854690557301735, "learning_rate": 1.8026937951941847e-05, "loss": 0.0673, "step": 1669 }, { "epoch": 2.2752043596730247, "grad_norm": 2.313119131412926, "learning_rate": 1.8024305313767648e-05, "loss": 0.047, "step": 1670 }, { "epoch": 2.276566757493188, "grad_norm": 1.5685895007956907, "learning_rate": 1.8021671112916534e-05, "loss": 0.0419, "step": 1671 }, { "epoch": 2.2779291553133514, "grad_norm": 1.84299664876171, "learning_rate": 1.801903534990149e-05, "loss": 0.0553, "step": 1672 }, { "epoch": 2.279291553133515, "grad_norm": 1.1550583316984968, "learning_rate": 1.8016398025235822e-05, "loss": 0.039, "step": 1673 }, { "epoch": 2.2806539509536785, "grad_norm": 3.4811743839487046, "learning_rate": 1.8013759139433124e-05, "loss": 0.0662, "step": 1674 }, { "epoch": 2.282016348773842, "grad_norm": 4.983804820430985, "learning_rate": 1.8011118693007304e-05, "loss": 0.0945, "step": 1675 }, { "epoch": 2.2833787465940056, "grad_norm": 3.817520864095382, "learning_rate": 1.8008476686472563e-05, "loss": 0.0472, "step": 1676 }, { "epoch": 2.284741144414169, "grad_norm": 2.5772912474046965, "learning_rate": 1.8005833120343426e-05, "loss": 0.0311, "step": 1677 }, { "epoch": 2.2861035422343323, "grad_norm": 4.959584858788865, "learning_rate": 1.8003187995134698e-05, "loss": 0.0949, "step": 1678 }, { "epoch": 2.287465940054496, "grad_norm": 2.328236281021207, "learning_rate": 1.80005413113615e-05, "loss": 0.038, "step": 1679 }, { "epoch": 2.2888283378746594, "grad_norm": 3.364407709637316, "learning_rate": 1.7997893069539257e-05, "loss": 0.0645, "step": 1680 }, { "epoch": 2.290190735694823, "grad_norm": 2.1861367553087194, "learning_rate": 1.7995243270183694e-05, "loss": 0.041, "step": 1681 }, { "epoch": 2.291553133514986, "grad_norm": 1.9254151817723089, "learning_rate": 1.7992591913810846e-05, "loss": 0.0605, "step": 1682 }, { "epoch": 2.29291553133515, "grad_norm": 3.4688602857259014, "learning_rate": 1.798993900093704e-05, "loss": 0.0412, "step": 1683 }, { "epoch": 2.2942779291553133, "grad_norm": 2.2009757427091996, "learning_rate": 1.7987284532078914e-05, "loss": 0.064, "step": 1684 }, { "epoch": 2.2956403269754766, "grad_norm": 2.541538040723765, "learning_rate": 1.7984628507753406e-05, "loss": 0.0674, "step": 1685 }, { "epoch": 2.2970027247956404, "grad_norm": 1.4416037698763204, "learning_rate": 1.798197092847776e-05, "loss": 0.0287, "step": 1686 }, { "epoch": 2.2983651226158037, "grad_norm": 2.6910532299527614, "learning_rate": 1.797931179476952e-05, "loss": 0.0381, "step": 1687 }, { "epoch": 2.299727520435967, "grad_norm": 0.9393939798344205, "learning_rate": 1.7976651107146533e-05, "loss": 0.0341, "step": 1688 }, { "epoch": 2.301089918256131, "grad_norm": 1.2280976929736367, "learning_rate": 1.7973988866126952e-05, "loss": 0.0805, "step": 1689 }, { "epoch": 2.302452316076294, "grad_norm": 1.4332749023247393, "learning_rate": 1.7971325072229227e-05, "loss": 0.0432, "step": 1690 }, { "epoch": 2.3038147138964575, "grad_norm": 1.5083079094676, "learning_rate": 1.7968659725972113e-05, "loss": 0.0533, "step": 1691 }, { "epoch": 2.3051771117166213, "grad_norm": 2.041590445781099, "learning_rate": 1.7965992827874666e-05, "loss": 0.044, "step": 1692 }, { "epoch": 2.3065395095367847, "grad_norm": 1.214951138369112, "learning_rate": 1.796332437845625e-05, "loss": 0.0445, "step": 1693 }, { "epoch": 2.307901907356948, "grad_norm": 2.5289375345532115, "learning_rate": 1.796065437823652e-05, "loss": 0.0377, "step": 1694 }, { "epoch": 2.309264305177112, "grad_norm": 2.6763184663634734, "learning_rate": 1.795798282773545e-05, "loss": 0.0754, "step": 1695 }, { "epoch": 2.310626702997275, "grad_norm": 1.4948363421418127, "learning_rate": 1.7955309727473297e-05, "loss": 0.0512, "step": 1696 }, { "epoch": 2.3119891008174385, "grad_norm": 3.3645433361359216, "learning_rate": 1.795263507797063e-05, "loss": 0.0545, "step": 1697 }, { "epoch": 2.3133514986376023, "grad_norm": 1.4537290319222835, "learning_rate": 1.7949958879748322e-05, "loss": 0.0612, "step": 1698 }, { "epoch": 2.3147138964577656, "grad_norm": 2.6239334204297164, "learning_rate": 1.7947281133327538e-05, "loss": 0.0253, "step": 1699 }, { "epoch": 2.316076294277929, "grad_norm": 3.8307866581941634, "learning_rate": 1.7944601839229755e-05, "loss": 0.044, "step": 1700 }, { "epoch": 2.3174386920980927, "grad_norm": 3.2611273712346525, "learning_rate": 1.7941920997976742e-05, "loss": 0.056, "step": 1701 }, { "epoch": 2.318801089918256, "grad_norm": 1.459451898007314, "learning_rate": 1.7939238610090578e-05, "loss": 0.0814, "step": 1702 }, { "epoch": 2.3201634877384194, "grad_norm": 1.765964210188064, "learning_rate": 1.793655467609364e-05, "loss": 0.0602, "step": 1703 }, { "epoch": 2.321525885558583, "grad_norm": 3.3586272307887457, "learning_rate": 1.7933869196508604e-05, "loss": 0.0567, "step": 1704 }, { "epoch": 2.3228882833787465, "grad_norm": 1.704497875394047, "learning_rate": 1.7931182171858444e-05, "loss": 0.0641, "step": 1705 }, { "epoch": 2.32425068119891, "grad_norm": 3.2952320314229238, "learning_rate": 1.7928493602666446e-05, "loss": 0.0494, "step": 1706 }, { "epoch": 2.3256130790190737, "grad_norm": 3.2277300248561476, "learning_rate": 1.792580348945618e-05, "loss": 0.0353, "step": 1707 }, { "epoch": 2.326975476839237, "grad_norm": 1.2507695701060335, "learning_rate": 1.792311183275154e-05, "loss": 0.0416, "step": 1708 }, { "epoch": 2.3283378746594003, "grad_norm": 2.4326374646599085, "learning_rate": 1.79204186330767e-05, "loss": 0.0477, "step": 1709 }, { "epoch": 2.329700272479564, "grad_norm": 4.205805086396204, "learning_rate": 1.7917723890956135e-05, "loss": 0.0479, "step": 1710 }, { "epoch": 2.3310626702997275, "grad_norm": 3.5158574963040445, "learning_rate": 1.791502760691464e-05, "loss": 0.0423, "step": 1711 }, { "epoch": 2.332425068119891, "grad_norm": 5.937518366888001, "learning_rate": 1.7912329781477287e-05, "loss": 0.0672, "step": 1712 }, { "epoch": 2.3337874659400546, "grad_norm": 4.225112868080086, "learning_rate": 1.7909630415169466e-05, "loss": 0.0576, "step": 1713 }, { "epoch": 2.335149863760218, "grad_norm": 2.2460821484188687, "learning_rate": 1.7906929508516856e-05, "loss": 0.0665, "step": 1714 }, { "epoch": 2.3365122615803813, "grad_norm": 2.9935006630414707, "learning_rate": 1.790422706204544e-05, "loss": 0.0503, "step": 1715 }, { "epoch": 2.337874659400545, "grad_norm": 3.165861474955794, "learning_rate": 1.7901523076281498e-05, "loss": 0.0643, "step": 1716 }, { "epoch": 2.3392370572207084, "grad_norm": 2.568677195488107, "learning_rate": 1.7898817551751616e-05, "loss": 0.0396, "step": 1717 }, { "epoch": 2.3405994550408717, "grad_norm": 3.464077749080659, "learning_rate": 1.789611048898267e-05, "loss": 0.0646, "step": 1718 }, { "epoch": 2.3419618528610355, "grad_norm": 1.7058549466157757, "learning_rate": 1.789340188850185e-05, "loss": 0.0645, "step": 1719 }, { "epoch": 2.343324250681199, "grad_norm": 3.016697793612342, "learning_rate": 1.7890691750836632e-05, "loss": 0.0509, "step": 1720 }, { "epoch": 2.344686648501362, "grad_norm": 3.307595687792988, "learning_rate": 1.7887980076514798e-05, "loss": 0.0594, "step": 1721 }, { "epoch": 2.346049046321526, "grad_norm": 3.7748635575009355, "learning_rate": 1.7885266866064422e-05, "loss": 0.0869, "step": 1722 }, { "epoch": 2.3474114441416893, "grad_norm": 1.9644489368423814, "learning_rate": 1.7882552120013892e-05, "loss": 0.045, "step": 1723 }, { "epoch": 2.3487738419618527, "grad_norm": 3.058946616203421, "learning_rate": 1.7879835838891877e-05, "loss": 0.0567, "step": 1724 }, { "epoch": 2.3501362397820165, "grad_norm": 5.067747116236859, "learning_rate": 1.787711802322736e-05, "loss": 0.0689, "step": 1725 }, { "epoch": 2.35149863760218, "grad_norm": 2.45039005243007, "learning_rate": 1.787439867354961e-05, "loss": 0.0626, "step": 1726 }, { "epoch": 2.352861035422343, "grad_norm": 2.3694567254237215, "learning_rate": 1.7871677790388206e-05, "loss": 0.0537, "step": 1727 }, { "epoch": 2.354223433242507, "grad_norm": 3.2393564739181637, "learning_rate": 1.786895537427302e-05, "loss": 0.0973, "step": 1728 }, { "epoch": 2.3555858310626703, "grad_norm": 3.288952340858231, "learning_rate": 1.786623142573422e-05, "loss": 0.0347, "step": 1729 }, { "epoch": 2.3569482288828336, "grad_norm": 1.109238844595395, "learning_rate": 1.786350594530228e-05, "loss": 0.031, "step": 1730 }, { "epoch": 2.3583106267029974, "grad_norm": 2.7302133420779433, "learning_rate": 1.7860778933507967e-05, "loss": 0.0517, "step": 1731 }, { "epoch": 2.3596730245231607, "grad_norm": 2.865094704275373, "learning_rate": 1.7858050390882348e-05, "loss": 0.0661, "step": 1732 }, { "epoch": 2.361035422343324, "grad_norm": 1.525738582395576, "learning_rate": 1.7855320317956785e-05, "loss": 0.0623, "step": 1733 }, { "epoch": 2.362397820163488, "grad_norm": 3.2495769930797396, "learning_rate": 1.7852588715262945e-05, "loss": 0.0421, "step": 1734 }, { "epoch": 2.363760217983651, "grad_norm": 3.3669033870890765, "learning_rate": 1.7849855583332783e-05, "loss": 0.0524, "step": 1735 }, { "epoch": 2.3651226158038146, "grad_norm": 2.515857369186589, "learning_rate": 1.784712092269856e-05, "loss": 0.0347, "step": 1736 }, { "epoch": 2.3664850136239783, "grad_norm": 4.415186099832343, "learning_rate": 1.7844384733892833e-05, "loss": 0.0341, "step": 1737 }, { "epoch": 2.3678474114441417, "grad_norm": 1.8809735894499855, "learning_rate": 1.7841647017448452e-05, "loss": 0.0434, "step": 1738 }, { "epoch": 2.369209809264305, "grad_norm": 3.887186218911135, "learning_rate": 1.783890777389857e-05, "loss": 0.047, "step": 1739 }, { "epoch": 2.370572207084469, "grad_norm": 1.623021413565826, "learning_rate": 1.7836167003776635e-05, "loss": 0.0644, "step": 1740 }, { "epoch": 2.371934604904632, "grad_norm": 3.6557146285931856, "learning_rate": 1.7833424707616393e-05, "loss": 0.0313, "step": 1741 }, { "epoch": 2.3732970027247955, "grad_norm": 4.089892711848627, "learning_rate": 1.783068088595189e-05, "loss": 0.0339, "step": 1742 }, { "epoch": 2.3746594005449593, "grad_norm": 3.4158992594213955, "learning_rate": 1.7827935539317456e-05, "loss": 0.062, "step": 1743 }, { "epoch": 2.3760217983651226, "grad_norm": 3.748653589956908, "learning_rate": 1.7825188668247742e-05, "loss": 0.0278, "step": 1744 }, { "epoch": 2.377384196185286, "grad_norm": 4.671415101573751, "learning_rate": 1.7822440273277674e-05, "loss": 0.0361, "step": 1745 }, { "epoch": 2.3787465940054497, "grad_norm": 1.411990193161829, "learning_rate": 1.781969035494248e-05, "loss": 0.0518, "step": 1746 }, { "epoch": 2.380108991825613, "grad_norm": 3.528577157477356, "learning_rate": 1.781693891377769e-05, "loss": 0.0269, "step": 1747 }, { "epoch": 2.3814713896457764, "grad_norm": 2.7692503577592555, "learning_rate": 1.7814185950319127e-05, "loss": 0.0532, "step": 1748 }, { "epoch": 2.38283378746594, "grad_norm": 1.5396875373854382, "learning_rate": 1.7811431465102913e-05, "loss": 0.0342, "step": 1749 }, { "epoch": 2.3841961852861036, "grad_norm": 1.3577061747415202, "learning_rate": 1.7808675458665464e-05, "loss": 0.0329, "step": 1750 }, { "epoch": 2.385558583106267, "grad_norm": 4.611977663863237, "learning_rate": 1.7805917931543493e-05, "loss": 0.0441, "step": 1751 }, { "epoch": 2.3869209809264307, "grad_norm": 2.7390052835273795, "learning_rate": 1.780315888427401e-05, "loss": 0.0736, "step": 1752 }, { "epoch": 2.388283378746594, "grad_norm": 7.781037425533446, "learning_rate": 1.7800398317394315e-05, "loss": 0.0436, "step": 1753 }, { "epoch": 2.3896457765667574, "grad_norm": 5.607909054945247, "learning_rate": 1.7797636231442018e-05, "loss": 0.0575, "step": 1754 }, { "epoch": 2.391008174386921, "grad_norm": 3.503270579148962, "learning_rate": 1.7794872626955005e-05, "loss": 0.059, "step": 1755 }, { "epoch": 2.3923705722070845, "grad_norm": 12.152005699162729, "learning_rate": 1.7792107504471476e-05, "loss": 0.0447, "step": 1756 }, { "epoch": 2.393732970027248, "grad_norm": 1.197360963322096, "learning_rate": 1.778934086452992e-05, "loss": 0.0786, "step": 1757 }, { "epoch": 2.3950953678474116, "grad_norm": 7.058860524394337, "learning_rate": 1.7786572707669112e-05, "loss": 0.0602, "step": 1758 }, { "epoch": 2.396457765667575, "grad_norm": 5.985894055658338, "learning_rate": 1.778380303442814e-05, "loss": 0.056, "step": 1759 }, { "epoch": 2.3978201634877383, "grad_norm": 4.855950215875714, "learning_rate": 1.7781031845346375e-05, "loss": 0.0249, "step": 1760 }, { "epoch": 2.399182561307902, "grad_norm": 4.523007263889931, "learning_rate": 1.7778259140963484e-05, "loss": 0.0508, "step": 1761 }, { "epoch": 2.4005449591280654, "grad_norm": 2.8087908868759968, "learning_rate": 1.777548492181944e-05, "loss": 0.0421, "step": 1762 }, { "epoch": 2.4019073569482288, "grad_norm": 1.9712452230641198, "learning_rate": 1.777270918845449e-05, "loss": 0.0544, "step": 1763 }, { "epoch": 2.4032697547683926, "grad_norm": 5.199155708328499, "learning_rate": 1.77699319414092e-05, "loss": 0.0584, "step": 1764 }, { "epoch": 2.404632152588556, "grad_norm": 3.504333907507121, "learning_rate": 1.7767153181224413e-05, "loss": 0.0348, "step": 1765 }, { "epoch": 2.4059945504087192, "grad_norm": 2.1640904443970594, "learning_rate": 1.7764372908441276e-05, "loss": 0.0563, "step": 1766 }, { "epoch": 2.407356948228883, "grad_norm": 3.1023149327835604, "learning_rate": 1.7761591123601224e-05, "loss": 0.0469, "step": 1767 }, { "epoch": 2.4087193460490464, "grad_norm": 5.3094150746149005, "learning_rate": 1.7758807827245993e-05, "loss": 0.0371, "step": 1768 }, { "epoch": 2.4100817438692097, "grad_norm": 1.1375956354946417, "learning_rate": 1.7756023019917607e-05, "loss": 0.0515, "step": 1769 }, { "epoch": 2.4114441416893735, "grad_norm": 4.870638332385271, "learning_rate": 1.7753236702158394e-05, "loss": 0.0976, "step": 1770 }, { "epoch": 2.412806539509537, "grad_norm": 2.8385421266335134, "learning_rate": 1.7750448874510967e-05, "loss": 0.0464, "step": 1771 }, { "epoch": 2.4141689373297, "grad_norm": 2.800841984288107, "learning_rate": 1.774765953751823e-05, "loss": 0.056, "step": 1772 }, { "epoch": 2.415531335149864, "grad_norm": 5.162460476062454, "learning_rate": 1.774486869172339e-05, "loss": 0.0334, "step": 1773 }, { "epoch": 2.4168937329700273, "grad_norm": 2.773494970102417, "learning_rate": 1.7742076337669954e-05, "loss": 0.0346, "step": 1774 }, { "epoch": 2.4182561307901906, "grad_norm": 4.047503380454023, "learning_rate": 1.77392824759017e-05, "loss": 0.0262, "step": 1775 }, { "epoch": 2.4196185286103544, "grad_norm": 3.7813372815680206, "learning_rate": 1.7736487106962717e-05, "loss": 0.0343, "step": 1776 }, { "epoch": 2.4209809264305178, "grad_norm": 1.7417694944494708, "learning_rate": 1.7733690231397385e-05, "loss": 0.0555, "step": 1777 }, { "epoch": 2.422343324250681, "grad_norm": 2.1229618895472595, "learning_rate": 1.7730891849750377e-05, "loss": 0.0977, "step": 1778 }, { "epoch": 2.423705722070845, "grad_norm": 4.6001830127273085, "learning_rate": 1.7728091962566655e-05, "loss": 0.0528, "step": 1779 }, { "epoch": 2.4250681198910082, "grad_norm": 3.0464847535570385, "learning_rate": 1.772529057039148e-05, "loss": 0.0578, "step": 1780 }, { "epoch": 2.4264305177111716, "grad_norm": 4.739078794636771, "learning_rate": 1.7722487673770403e-05, "loss": 0.0827, "step": 1781 }, { "epoch": 2.4277929155313354, "grad_norm": 2.2735334486302268, "learning_rate": 1.7719683273249264e-05, "loss": 0.015, "step": 1782 }, { "epoch": 2.4291553133514987, "grad_norm": 1.9382870846622888, "learning_rate": 1.7716877369374206e-05, "loss": 0.0352, "step": 1783 }, { "epoch": 2.430517711171662, "grad_norm": 2.2517673233738296, "learning_rate": 1.7714069962691656e-05, "loss": 0.0746, "step": 1784 }, { "epoch": 2.431880108991826, "grad_norm": 2.1189377979393873, "learning_rate": 1.7711261053748338e-05, "loss": 0.0257, "step": 1785 }, { "epoch": 2.433242506811989, "grad_norm": 2.650369883545915, "learning_rate": 1.7708450643091268e-05, "loss": 0.0587, "step": 1786 }, { "epoch": 2.4346049046321525, "grad_norm": 2.4090584691826056, "learning_rate": 1.7705638731267747e-05, "loss": 0.0552, "step": 1787 }, { "epoch": 2.4359673024523163, "grad_norm": 1.9465683773101712, "learning_rate": 1.7702825318825384e-05, "loss": 0.0651, "step": 1788 }, { "epoch": 2.4373297002724796, "grad_norm": 3.6176792565539704, "learning_rate": 1.770001040631207e-05, "loss": 0.0579, "step": 1789 }, { "epoch": 2.438692098092643, "grad_norm": 3.2350167943445434, "learning_rate": 1.7697193994275983e-05, "loss": 0.0573, "step": 1790 }, { "epoch": 2.4400544959128068, "grad_norm": 4.097965604692832, "learning_rate": 1.7694376083265604e-05, "loss": 0.0515, "step": 1791 }, { "epoch": 2.44141689373297, "grad_norm": 1.7007555520626647, "learning_rate": 1.76915566738297e-05, "loss": 0.0346, "step": 1792 }, { "epoch": 2.4427792915531334, "grad_norm": 3.2621277608983865, "learning_rate": 1.7688735766517334e-05, "loss": 0.0383, "step": 1793 }, { "epoch": 2.4441416893732972, "grad_norm": 1.8423072292206026, "learning_rate": 1.7685913361877854e-05, "loss": 0.1038, "step": 1794 }, { "epoch": 2.4455040871934606, "grad_norm": 2.458152613734443, "learning_rate": 1.7683089460460907e-05, "loss": 0.0446, "step": 1795 }, { "epoch": 2.446866485013624, "grad_norm": 3.045700844868449, "learning_rate": 1.768026406281642e-05, "loss": 0.088, "step": 1796 }, { "epoch": 2.4482288828337877, "grad_norm": 1.859997320368701, "learning_rate": 1.7677437169494632e-05, "loss": 0.048, "step": 1797 }, { "epoch": 2.449591280653951, "grad_norm": 1.7128010170153112, "learning_rate": 1.7674608781046052e-05, "loss": 0.0657, "step": 1798 }, { "epoch": 2.4509536784741144, "grad_norm": 1.170926680469709, "learning_rate": 1.7671778898021487e-05, "loss": 0.0541, "step": 1799 }, { "epoch": 2.452316076294278, "grad_norm": 1.5683489813555325, "learning_rate": 1.7668947520972044e-05, "loss": 0.0448, "step": 1800 }, { "epoch": 2.4536784741144415, "grad_norm": 1.5863920042195991, "learning_rate": 1.7666114650449108e-05, "loss": 0.0482, "step": 1801 }, { "epoch": 2.455040871934605, "grad_norm": 1.2149076200584596, "learning_rate": 1.7663280287004364e-05, "loss": 0.0607, "step": 1802 }, { "epoch": 2.4564032697547686, "grad_norm": 1.8925555308507243, "learning_rate": 1.766044443118978e-05, "loss": 0.0637, "step": 1803 }, { "epoch": 2.457765667574932, "grad_norm": 2.610192205537942, "learning_rate": 1.7657607083557626e-05, "loss": 0.0465, "step": 1804 }, { "epoch": 2.4591280653950953, "grad_norm": 1.1786051209436719, "learning_rate": 1.765476824466045e-05, "loss": 0.0374, "step": 1805 }, { "epoch": 2.460490463215259, "grad_norm": 1.7771094587617153, "learning_rate": 1.7651927915051096e-05, "loss": 0.0413, "step": 1806 }, { "epoch": 2.4618528610354224, "grad_norm": 2.5071101689195046, "learning_rate": 1.76490860952827e-05, "loss": 0.0632, "step": 1807 }, { "epoch": 2.463215258855586, "grad_norm": 1.2248808163736278, "learning_rate": 1.7646242785908684e-05, "loss": 0.0468, "step": 1808 }, { "epoch": 2.464577656675749, "grad_norm": 1.036335327526233, "learning_rate": 1.7643397987482763e-05, "loss": 0.0408, "step": 1809 }, { "epoch": 2.465940054495913, "grad_norm": 2.902960268083666, "learning_rate": 1.7640551700558946e-05, "loss": 0.0574, "step": 1810 }, { "epoch": 2.4673024523160763, "grad_norm": 4.028436958770635, "learning_rate": 1.763770392569152e-05, "loss": 0.0342, "step": 1811 }, { "epoch": 2.4686648501362396, "grad_norm": 4.355848549214753, "learning_rate": 1.7634854663435077e-05, "loss": 0.025, "step": 1812 }, { "epoch": 2.4700272479564034, "grad_norm": 3.0760679715757355, "learning_rate": 1.7632003914344485e-05, "loss": 0.0406, "step": 1813 }, { "epoch": 2.4713896457765667, "grad_norm": 3.715016354068798, "learning_rate": 1.762915167897491e-05, "loss": 0.0583, "step": 1814 }, { "epoch": 2.47275204359673, "grad_norm": 1.4192737229506018, "learning_rate": 1.76262979578818e-05, "loss": 0.0368, "step": 1815 }, { "epoch": 2.474114441416894, "grad_norm": 2.0195147525403447, "learning_rate": 1.7623442751620906e-05, "loss": 0.0309, "step": 1816 }, { "epoch": 2.475476839237057, "grad_norm": 6.261598949914292, "learning_rate": 1.762058606074825e-05, "loss": 0.0413, "step": 1817 }, { "epoch": 2.4768392370572205, "grad_norm": 3.4702599237629173, "learning_rate": 1.761772788582016e-05, "loss": 0.049, "step": 1818 }, { "epoch": 2.4782016348773843, "grad_norm": 3.571839991237613, "learning_rate": 1.761486822739324e-05, "loss": 0.0546, "step": 1819 }, { "epoch": 2.4795640326975477, "grad_norm": 1.5110005087052927, "learning_rate": 1.761200708602439e-05, "loss": 0.0465, "step": 1820 }, { "epoch": 2.480926430517711, "grad_norm": 1.2471052797554194, "learning_rate": 1.7609144462270797e-05, "loss": 0.0545, "step": 1821 }, { "epoch": 2.482288828337875, "grad_norm": 1.9652522500951206, "learning_rate": 1.7606280356689938e-05, "loss": 0.0376, "step": 1822 }, { "epoch": 2.483651226158038, "grad_norm": 2.6780034873153165, "learning_rate": 1.760341476983958e-05, "loss": 0.0372, "step": 1823 }, { "epoch": 2.4850136239782015, "grad_norm": 4.320849076891749, "learning_rate": 1.7600547702277765e-05, "loss": 0.0567, "step": 1824 }, { "epoch": 2.4863760217983653, "grad_norm": 1.2928980398087802, "learning_rate": 1.7597679154562847e-05, "loss": 0.035, "step": 1825 }, { "epoch": 2.4877384196185286, "grad_norm": 3.8991164535814016, "learning_rate": 1.759480912725345e-05, "loss": 0.0627, "step": 1826 }, { "epoch": 2.489100817438692, "grad_norm": 2.741667736974448, "learning_rate": 1.759193762090849e-05, "loss": 0.0498, "step": 1827 }, { "epoch": 2.4904632152588557, "grad_norm": 1.3114649647573893, "learning_rate": 1.7589064636087177e-05, "loss": 0.0517, "step": 1828 }, { "epoch": 2.491825613079019, "grad_norm": 2.700622606665766, "learning_rate": 1.7586190173349e-05, "loss": 0.0202, "step": 1829 }, { "epoch": 2.4931880108991824, "grad_norm": 3.0281849986090927, "learning_rate": 1.758331423325374e-05, "loss": 0.0394, "step": 1830 }, { "epoch": 2.494550408719346, "grad_norm": 3.066574203454447, "learning_rate": 1.758043681636147e-05, "loss": 0.0719, "step": 1831 }, { "epoch": 2.4959128065395095, "grad_norm": 3.688715858437725, "learning_rate": 1.7577557923232548e-05, "loss": 0.0535, "step": 1832 }, { "epoch": 2.497275204359673, "grad_norm": 2.603599072144188, "learning_rate": 1.757467755442761e-05, "loss": 0.054, "step": 1833 }, { "epoch": 2.4986376021798367, "grad_norm": 4.590014235135179, "learning_rate": 1.7571795710507594e-05, "loss": 0.0316, "step": 1834 }, { "epoch": 2.5, "grad_norm": 5.723012483102207, "learning_rate": 1.7568912392033723e-05, "loss": 0.0314, "step": 1835 }, { "epoch": 2.5013623978201633, "grad_norm": 2.400850355393799, "learning_rate": 1.7566027599567492e-05, "loss": 0.0331, "step": 1836 }, { "epoch": 2.502724795640327, "grad_norm": 6.215105996016498, "learning_rate": 1.75631413336707e-05, "loss": 0.0611, "step": 1837 }, { "epoch": 2.5040871934604905, "grad_norm": 5.06902687538766, "learning_rate": 1.7560253594905425e-05, "loss": 0.0487, "step": 1838 }, { "epoch": 2.505449591280654, "grad_norm": 3.8365625685869404, "learning_rate": 1.7557364383834038e-05, "loss": 0.0359, "step": 1839 }, { "epoch": 2.5068119891008176, "grad_norm": 7.8181351541090205, "learning_rate": 1.7554473701019187e-05, "loss": 0.037, "step": 1840 }, { "epoch": 2.508174386920981, "grad_norm": 1.5025084569337062, "learning_rate": 1.755158154702382e-05, "loss": 0.0378, "step": 1841 }, { "epoch": 2.5095367847411443, "grad_norm": 4.0423841208009135, "learning_rate": 1.7548687922411153e-05, "loss": 0.0247, "step": 1842 }, { "epoch": 2.510899182561308, "grad_norm": 5.220056403208977, "learning_rate": 1.7545792827744708e-05, "loss": 0.0771, "step": 1843 }, { "epoch": 2.5122615803814714, "grad_norm": 1.3141791658073267, "learning_rate": 1.7542896263588274e-05, "loss": 0.0473, "step": 1844 }, { "epoch": 2.5136239782016347, "grad_norm": 4.991966160147327, "learning_rate": 1.7539998230505952e-05, "loss": 0.0253, "step": 1845 }, { "epoch": 2.5149863760217985, "grad_norm": 4.352910401840358, "learning_rate": 1.7537098729062098e-05, "loss": 0.05, "step": 1846 }, { "epoch": 2.516348773841962, "grad_norm": 2.0416337065237067, "learning_rate": 1.753419775982138e-05, "loss": 0.0457, "step": 1847 }, { "epoch": 2.517711171662125, "grad_norm": 6.809359879606753, "learning_rate": 1.7531295323348735e-05, "loss": 0.0596, "step": 1848 }, { "epoch": 2.5190735694822886, "grad_norm": 1.3534412509585754, "learning_rate": 1.7528391420209398e-05, "loss": 0.0495, "step": 1849 }, { "epoch": 2.5204359673024523, "grad_norm": 6.367976045935764, "learning_rate": 1.7525486050968875e-05, "loss": 0.0662, "step": 1850 }, { "epoch": 2.5217983651226157, "grad_norm": 3.6507737163801974, "learning_rate": 1.7522579216192976e-05, "loss": 0.0475, "step": 1851 }, { "epoch": 2.523160762942779, "grad_norm": 2.957934379444933, "learning_rate": 1.7519670916447777e-05, "loss": 0.0565, "step": 1852 }, { "epoch": 2.524523160762943, "grad_norm": 5.837105737978653, "learning_rate": 1.7516761152299656e-05, "loss": 0.0526, "step": 1853 }, { "epoch": 2.525885558583106, "grad_norm": 2.304307340871374, "learning_rate": 1.7513849924315267e-05, "loss": 0.0693, "step": 1854 }, { "epoch": 2.5272479564032695, "grad_norm": 4.611558109455836, "learning_rate": 1.7510937233061556e-05, "loss": 0.044, "step": 1855 }, { "epoch": 2.5286103542234333, "grad_norm": 5.194657409165454, "learning_rate": 1.7508023079105737e-05, "loss": 0.033, "step": 1856 }, { "epoch": 2.5299727520435966, "grad_norm": 0.8013063999690163, "learning_rate": 1.750510746301533e-05, "loss": 0.0321, "step": 1857 }, { "epoch": 2.53133514986376, "grad_norm": 4.170181234850391, "learning_rate": 1.750219038535813e-05, "loss": 0.0315, "step": 1858 }, { "epoch": 2.5326975476839237, "grad_norm": 2.8776390031158483, "learning_rate": 1.7499271846702216e-05, "loss": 0.075, "step": 1859 }, { "epoch": 2.534059945504087, "grad_norm": 2.2016389856673375, "learning_rate": 1.749635184761595e-05, "loss": 0.0974, "step": 1860 }, { "epoch": 2.5354223433242504, "grad_norm": 5.1955714231484516, "learning_rate": 1.749343038866799e-05, "loss": 0.0238, "step": 1861 }, { "epoch": 2.536784741144414, "grad_norm": 2.736439266168756, "learning_rate": 1.749050747042726e-05, "loss": 0.0912, "step": 1862 }, { "epoch": 2.5381471389645776, "grad_norm": 2.201591187791689, "learning_rate": 1.748758309346298e-05, "loss": 0.0586, "step": 1863 }, { "epoch": 2.539509536784741, "grad_norm": 5.445437356732867, "learning_rate": 1.7484657258344654e-05, "loss": 0.0258, "step": 1864 }, { "epoch": 2.5408719346049047, "grad_norm": 5.803259763404338, "learning_rate": 1.7481729965642068e-05, "loss": 0.042, "step": 1865 }, { "epoch": 2.542234332425068, "grad_norm": 4.069746842450507, "learning_rate": 1.747880121592529e-05, "loss": 0.0639, "step": 1866 }, { "epoch": 2.5435967302452314, "grad_norm": 9.208634978579006, "learning_rate": 1.7475871009764674e-05, "loss": 0.0431, "step": 1867 }, { "epoch": 2.544959128065395, "grad_norm": 4.367439239665345, "learning_rate": 1.7472939347730857e-05, "loss": 0.0586, "step": 1868 }, { "epoch": 2.5463215258855585, "grad_norm": 8.562116208193645, "learning_rate": 1.747000623039476e-05, "loss": 0.0401, "step": 1869 }, { "epoch": 2.547683923705722, "grad_norm": 7.9613343810122394, "learning_rate": 1.7467071658327588e-05, "loss": 0.0388, "step": 1870 }, { "epoch": 2.5490463215258856, "grad_norm": 5.0458642793547055, "learning_rate": 1.7464135632100824e-05, "loss": 0.0595, "step": 1871 }, { "epoch": 2.550408719346049, "grad_norm": 7.25630942670956, "learning_rate": 1.7461198152286244e-05, "loss": 0.0506, "step": 1872 }, { "epoch": 2.5517711171662123, "grad_norm": 3.5727098171269485, "learning_rate": 1.7458259219455896e-05, "loss": 0.0541, "step": 1873 }, { "epoch": 2.553133514986376, "grad_norm": 7.765735676305512, "learning_rate": 1.7455318834182117e-05, "loss": 0.0788, "step": 1874 }, { "epoch": 2.5544959128065394, "grad_norm": 4.63946867280992, "learning_rate": 1.7452376997037532e-05, "loss": 0.0618, "step": 1875 }, { "epoch": 2.5558583106267028, "grad_norm": 4.309310407223953, "learning_rate": 1.7449433708595037e-05, "loss": 0.0547, "step": 1876 }, { "epoch": 2.5572207084468666, "grad_norm": 3.784990345556633, "learning_rate": 1.7446488969427823e-05, "loss": 0.0683, "step": 1877 }, { "epoch": 2.55858310626703, "grad_norm": 2.1314371494312505, "learning_rate": 1.7443542780109353e-05, "loss": 0.0626, "step": 1878 }, { "epoch": 2.5599455040871932, "grad_norm": 1.9078940228988306, "learning_rate": 1.7440595141213372e-05, "loss": 0.0452, "step": 1879 }, { "epoch": 2.561307901907357, "grad_norm": 1.7960660995935362, "learning_rate": 1.743764605331392e-05, "loss": 0.0389, "step": 1880 }, { "epoch": 2.5626702997275204, "grad_norm": 1.4506281001346917, "learning_rate": 1.7434695516985306e-05, "loss": 0.057, "step": 1881 }, { "epoch": 2.5640326975476837, "grad_norm": 2.350824356458497, "learning_rate": 1.7431743532802134e-05, "loss": 0.0837, "step": 1882 }, { "epoch": 2.5653950953678475, "grad_norm": 2.7653749287425105, "learning_rate": 1.742879010133927e-05, "loss": 0.0573, "step": 1883 }, { "epoch": 2.566757493188011, "grad_norm": 2.676325773769293, "learning_rate": 1.742583522317188e-05, "loss": 0.067, "step": 1884 }, { "epoch": 2.568119891008174, "grad_norm": 2.5785738010429413, "learning_rate": 1.742287889887541e-05, "loss": 0.0789, "step": 1885 }, { "epoch": 2.569482288828338, "grad_norm": 4.001386223136175, "learning_rate": 1.7419921129025578e-05, "loss": 0.048, "step": 1886 }, { "epoch": 2.5708446866485013, "grad_norm": 3.0581355794146243, "learning_rate": 1.741696191419839e-05, "loss": 0.051, "step": 1887 }, { "epoch": 2.5722070844686646, "grad_norm": 1.4879964004908015, "learning_rate": 1.741400125497013e-05, "loss": 0.0512, "step": 1888 }, { "epoch": 2.5735694822888284, "grad_norm": 2.761532238956789, "learning_rate": 1.7411039151917368e-05, "loss": 0.0277, "step": 1889 }, { "epoch": 2.5749318801089918, "grad_norm": 2.6703477299451923, "learning_rate": 1.7408075605616955e-05, "loss": 0.0619, "step": 1890 }, { "epoch": 2.576294277929155, "grad_norm": 1.448170042547423, "learning_rate": 1.7405110616646018e-05, "loss": 0.0319, "step": 1891 }, { "epoch": 2.577656675749319, "grad_norm": 3.5919926555018025, "learning_rate": 1.7402144185581965e-05, "loss": 0.0607, "step": 1892 }, { "epoch": 2.5790190735694822, "grad_norm": 1.7742648844184015, "learning_rate": 1.739917631300249e-05, "loss": 0.0339, "step": 1893 }, { "epoch": 2.5803814713896456, "grad_norm": 1.8169634830698018, "learning_rate": 1.739620699948557e-05, "loss": 0.0429, "step": 1894 }, { "epoch": 2.5817438692098094, "grad_norm": 2.3330303750286063, "learning_rate": 1.739323624560945e-05, "loss": 0.0412, "step": 1895 }, { "epoch": 2.5831062670299727, "grad_norm": 2.3770834862622507, "learning_rate": 1.739026405195267e-05, "loss": 0.0451, "step": 1896 }, { "epoch": 2.584468664850136, "grad_norm": 1.3010980761725421, "learning_rate": 1.738729041909404e-05, "loss": 0.0407, "step": 1897 }, { "epoch": 2.5858310626703, "grad_norm": 1.0302838832795467, "learning_rate": 1.7384315347612655e-05, "loss": 0.0423, "step": 1898 }, { "epoch": 2.587193460490463, "grad_norm": 2.6076606306203693, "learning_rate": 1.738133883808789e-05, "loss": 0.0596, "step": 1899 }, { "epoch": 2.5885558583106265, "grad_norm": 2.6559785237050018, "learning_rate": 1.7378360891099396e-05, "loss": 0.0336, "step": 1900 }, { "epoch": 2.5899182561307903, "grad_norm": 0.8916217787155367, "learning_rate": 1.7375381507227108e-05, "loss": 0.0284, "step": 1901 }, { "epoch": 2.5912806539509536, "grad_norm": 1.0929498868927001, "learning_rate": 1.7372400687051246e-05, "loss": 0.051, "step": 1902 }, { "epoch": 2.592643051771117, "grad_norm": 1.4024338615891319, "learning_rate": 1.7369418431152295e-05, "loss": 0.0424, "step": 1903 }, { "epoch": 2.5940054495912808, "grad_norm": 4.326356525193581, "learning_rate": 1.7366434740111036e-05, "loss": 0.0522, "step": 1904 }, { "epoch": 2.595367847411444, "grad_norm": 1.9395227272236277, "learning_rate": 1.7363449614508516e-05, "loss": 0.0617, "step": 1905 }, { "epoch": 2.5967302452316074, "grad_norm": 1.952814940387924, "learning_rate": 1.736046305492607e-05, "loss": 0.0271, "step": 1906 }, { "epoch": 2.5980926430517712, "grad_norm": 3.263334555037609, "learning_rate": 1.735747506194531e-05, "loss": 0.0314, "step": 1907 }, { "epoch": 2.5994550408719346, "grad_norm": 1.9358274319435675, "learning_rate": 1.7354485636148127e-05, "loss": 0.0533, "step": 1908 }, { "epoch": 2.600817438692098, "grad_norm": 3.6663781040360686, "learning_rate": 1.7351494778116685e-05, "loss": 0.0482, "step": 1909 }, { "epoch": 2.6021798365122617, "grad_norm": 2.0910818942293385, "learning_rate": 1.7348502488433442e-05, "loss": 0.0585, "step": 1910 }, { "epoch": 2.603542234332425, "grad_norm": 1.2580935808269955, "learning_rate": 1.7345508767681116e-05, "loss": 0.0269, "step": 1911 }, { "epoch": 2.6049046321525884, "grad_norm": 1.2985970803458824, "learning_rate": 1.734251361644272e-05, "loss": 0.019, "step": 1912 }, { "epoch": 2.606267029972752, "grad_norm": 3.66019053844415, "learning_rate": 1.7339517035301533e-05, "loss": 0.0449, "step": 1913 }, { "epoch": 2.6076294277929155, "grad_norm": 1.4833041380790406, "learning_rate": 1.733651902484112e-05, "loss": 0.0489, "step": 1914 }, { "epoch": 2.608991825613079, "grad_norm": 4.963012853234388, "learning_rate": 1.7333519585645323e-05, "loss": 0.0682, "step": 1915 }, { "epoch": 2.6103542234332426, "grad_norm": 2.1343584341083117, "learning_rate": 1.7330518718298263e-05, "loss": 0.0306, "step": 1916 }, { "epoch": 2.611716621253406, "grad_norm": 1.9156979614967435, "learning_rate": 1.732751642338434e-05, "loss": 0.0386, "step": 1917 }, { "epoch": 2.6130790190735693, "grad_norm": 2.9208073853156757, "learning_rate": 1.7324512701488224e-05, "loss": 0.0355, "step": 1918 }, { "epoch": 2.614441416893733, "grad_norm": 1.485565789065312, "learning_rate": 1.7321507553194866e-05, "loss": 0.0611, "step": 1919 }, { "epoch": 2.6158038147138964, "grad_norm": 1.4133431111484192, "learning_rate": 1.731850097908951e-05, "loss": 0.0501, "step": 1920 }, { "epoch": 2.61716621253406, "grad_norm": 3.56496717259159, "learning_rate": 1.7315492979757652e-05, "loss": 0.0278, "step": 1921 }, { "epoch": 2.6185286103542236, "grad_norm": 1.3069607249297979, "learning_rate": 1.7312483555785087e-05, "loss": 0.0378, "step": 1922 }, { "epoch": 2.619891008174387, "grad_norm": 1.5838133442908768, "learning_rate": 1.730947270775788e-05, "loss": 0.0392, "step": 1923 }, { "epoch": 2.6212534059945503, "grad_norm": 4.6403039236491495, "learning_rate": 1.7306460436262363e-05, "loss": 0.0955, "step": 1924 }, { "epoch": 2.622615803814714, "grad_norm": 1.1766176028622424, "learning_rate": 1.730344674188516e-05, "loss": 0.0527, "step": 1925 }, { "epoch": 2.6239782016348774, "grad_norm": 2.9919136243607243, "learning_rate": 1.7300431625213168e-05, "loss": 0.0442, "step": 1926 }, { "epoch": 2.6253405994550407, "grad_norm": 2.311319892834096, "learning_rate": 1.729741508683356e-05, "loss": 0.0395, "step": 1927 }, { "epoch": 2.6267029972752045, "grad_norm": 2.132256173488098, "learning_rate": 1.7294397127333785e-05, "loss": 0.0667, "step": 1928 }, { "epoch": 2.628065395095368, "grad_norm": 5.7529099195430735, "learning_rate": 1.729137774730157e-05, "loss": 0.1167, "step": 1929 }, { "epoch": 2.629427792915531, "grad_norm": 2.956334414290412, "learning_rate": 1.7288356947324916e-05, "loss": 0.0374, "step": 1930 }, { "epoch": 2.630790190735695, "grad_norm": 2.6385456243982532, "learning_rate": 1.7285334727992104e-05, "loss": 0.0507, "step": 1931 }, { "epoch": 2.6321525885558583, "grad_norm": 4.5150285055450485, "learning_rate": 1.7282311089891686e-05, "loss": 0.0515, "step": 1932 }, { "epoch": 2.6335149863760217, "grad_norm": 2.0324123842689428, "learning_rate": 1.7279286033612498e-05, "loss": 0.0469, "step": 1933 }, { "epoch": 2.6348773841961854, "grad_norm": 3.9521319919959557, "learning_rate": 1.727625955974365e-05, "loss": 0.0475, "step": 1934 }, { "epoch": 2.636239782016349, "grad_norm": 4.037426710336796, "learning_rate": 1.7273231668874526e-05, "loss": 0.0738, "step": 1935 }, { "epoch": 2.637602179836512, "grad_norm": 6.93633426133973, "learning_rate": 1.7270202361594782e-05, "loss": 0.0236, "step": 1936 }, { "epoch": 2.638964577656676, "grad_norm": 4.096737351557914, "learning_rate": 1.7267171638494358e-05, "loss": 0.1178, "step": 1937 }, { "epoch": 2.6403269754768393, "grad_norm": 3.777732664416906, "learning_rate": 1.726413950016346e-05, "loss": 0.0569, "step": 1938 }, { "epoch": 2.6416893732970026, "grad_norm": 6.2682865362918125, "learning_rate": 1.7261105947192588e-05, "loss": 0.0681, "step": 1939 }, { "epoch": 2.6430517711171664, "grad_norm": 0.9574349246434692, "learning_rate": 1.7258070980172494e-05, "loss": 0.0442, "step": 1940 }, { "epoch": 2.6444141689373297, "grad_norm": 5.2479980314135, "learning_rate": 1.725503459969422e-05, "loss": 0.0331, "step": 1941 }, { "epoch": 2.645776566757493, "grad_norm": 3.1240546693527946, "learning_rate": 1.725199680634908e-05, "loss": 0.0179, "step": 1942 }, { "epoch": 2.647138964577657, "grad_norm": 1.8790678744493006, "learning_rate": 1.7248957600728664e-05, "loss": 0.041, "step": 1943 }, { "epoch": 2.64850136239782, "grad_norm": 4.013298706234086, "learning_rate": 1.7245916983424833e-05, "loss": 0.0287, "step": 1944 }, { "epoch": 2.6498637602179835, "grad_norm": 3.9307126088049493, "learning_rate": 1.724287495502973e-05, "loss": 0.0985, "step": 1945 }, { "epoch": 2.6512261580381473, "grad_norm": 3.518886814962557, "learning_rate": 1.723983151613576e-05, "loss": 0.0463, "step": 1946 }, { "epoch": 2.6525885558583107, "grad_norm": 2.723653076673351, "learning_rate": 1.7236786667335625e-05, "loss": 0.0429, "step": 1947 }, { "epoch": 2.653950953678474, "grad_norm": 5.311281191829222, "learning_rate": 1.723374040922227e-05, "loss": 0.0569, "step": 1948 }, { "epoch": 2.655313351498638, "grad_norm": 3.023882629289252, "learning_rate": 1.723069274238895e-05, "loss": 0.036, "step": 1949 }, { "epoch": 2.656675749318801, "grad_norm": 7.416760497625288, "learning_rate": 1.7227643667429167e-05, "loss": 0.0387, "step": 1950 }, { "epoch": 2.6580381471389645, "grad_norm": 2.999928799057134, "learning_rate": 1.722459318493671e-05, "loss": 0.0601, "step": 1951 }, { "epoch": 2.6594005449591283, "grad_norm": 6.340269865533081, "learning_rate": 1.7221541295505638e-05, "loss": 0.0298, "step": 1952 }, { "epoch": 2.6607629427792916, "grad_norm": 3.7487549939993747, "learning_rate": 1.7218487999730283e-05, "loss": 0.0797, "step": 1953 }, { "epoch": 2.662125340599455, "grad_norm": 2.8513014905629674, "learning_rate": 1.7215433298205252e-05, "loss": 0.0488, "step": 1954 }, { "epoch": 2.6634877384196187, "grad_norm": 2.6287365781382337, "learning_rate": 1.7212377191525434e-05, "loss": 0.0294, "step": 1955 }, { "epoch": 2.664850136239782, "grad_norm": 5.535229299988281, "learning_rate": 1.7209319680285977e-05, "loss": 0.0375, "step": 1956 }, { "epoch": 2.6662125340599454, "grad_norm": 1.598333627340122, "learning_rate": 1.7206260765082312e-05, "loss": 0.0492, "step": 1957 }, { "epoch": 2.667574931880109, "grad_norm": 3.2882969199295005, "learning_rate": 1.7203200446510143e-05, "loss": 0.0568, "step": 1958 }, { "epoch": 2.6689373297002725, "grad_norm": 2.2731575977006586, "learning_rate": 1.7200138725165444e-05, "loss": 0.0229, "step": 1959 }, { "epoch": 2.670299727520436, "grad_norm": 3.325756613366335, "learning_rate": 1.7197075601644463e-05, "loss": 0.0762, "step": 1960 }, { "epoch": 2.6716621253405997, "grad_norm": 1.4503880334894985, "learning_rate": 1.7194011076543716e-05, "loss": 0.0357, "step": 1961 }, { "epoch": 2.673024523160763, "grad_norm": 0.964761943486052, "learning_rate": 1.7190945150460012e-05, "loss": 0.0392, "step": 1962 }, { "epoch": 2.6743869209809263, "grad_norm": 0.6077492182792883, "learning_rate": 1.7187877823990408e-05, "loss": 0.0342, "step": 1963 }, { "epoch": 2.67574931880109, "grad_norm": 1.0485951255377615, "learning_rate": 1.7184809097732247e-05, "loss": 0.0298, "step": 1964 }, { "epoch": 2.6771117166212535, "grad_norm": 1.4186166807993412, "learning_rate": 1.7181738972283142e-05, "loss": 0.0287, "step": 1965 }, { "epoch": 2.678474114441417, "grad_norm": 2.0644083017607033, "learning_rate": 1.7178667448240974e-05, "loss": 0.0464, "step": 1966 }, { "epoch": 2.6798365122615806, "grad_norm": 4.399744899915729, "learning_rate": 1.7175594526203906e-05, "loss": 0.0814, "step": 1967 }, { "epoch": 2.681198910081744, "grad_norm": 1.1174949813879298, "learning_rate": 1.7172520206770368e-05, "loss": 0.0297, "step": 1968 }, { "epoch": 2.6825613079019073, "grad_norm": 2.268421742832191, "learning_rate": 1.716944449053906e-05, "loss": 0.0562, "step": 1969 }, { "epoch": 2.683923705722071, "grad_norm": 4.91514203458925, "learning_rate": 1.7166367378108954e-05, "loss": 0.0557, "step": 1970 }, { "epoch": 2.6852861035422344, "grad_norm": 1.5059606176486051, "learning_rate": 1.7163288870079304e-05, "loss": 0.0498, "step": 1971 }, { "epoch": 2.6866485013623977, "grad_norm": 2.367466107938998, "learning_rate": 1.716020896704962e-05, "loss": 0.0275, "step": 1972 }, { "epoch": 2.6880108991825615, "grad_norm": 3.8933637229193843, "learning_rate": 1.715712766961969e-05, "loss": 0.0512, "step": 1973 }, { "epoch": 2.689373297002725, "grad_norm": 1.537861649724166, "learning_rate": 1.715404497838958e-05, "loss": 0.0312, "step": 1974 }, { "epoch": 2.690735694822888, "grad_norm": 1.3419271195135032, "learning_rate": 1.715096089395962e-05, "loss": 0.0724, "step": 1975 }, { "epoch": 2.692098092643052, "grad_norm": 3.892372916975282, "learning_rate": 1.7147875416930414e-05, "loss": 0.0495, "step": 1976 }, { "epoch": 2.6934604904632153, "grad_norm": 1.9626090055476586, "learning_rate": 1.714478854790284e-05, "loss": 0.0371, "step": 1977 }, { "epoch": 2.6948228882833787, "grad_norm": 2.490714782881143, "learning_rate": 1.7141700287478045e-05, "loss": 0.0332, "step": 1978 }, { "epoch": 2.6961852861035425, "grad_norm": 4.448935096067083, "learning_rate": 1.7138610636257434e-05, "loss": 0.0182, "step": 1979 }, { "epoch": 2.697547683923706, "grad_norm": 2.3292197947228224, "learning_rate": 1.713551959484271e-05, "loss": 0.0394, "step": 1980 }, { "epoch": 2.698910081743869, "grad_norm": 2.5044463135692014, "learning_rate": 1.7132427163835822e-05, "loss": 0.0347, "step": 1981 }, { "epoch": 2.700272479564033, "grad_norm": 2.8897875365504975, "learning_rate": 1.7129333343839004e-05, "loss": 0.0673, "step": 1982 }, { "epoch": 2.7016348773841963, "grad_norm": 2.9314007304521126, "learning_rate": 1.712623813545475e-05, "loss": 0.0469, "step": 1983 }, { "epoch": 2.7029972752043596, "grad_norm": 2.801785929132762, "learning_rate": 1.7123141539285834e-05, "loss": 0.0349, "step": 1984 }, { "epoch": 2.7043596730245234, "grad_norm": 3.9861644654479096, "learning_rate": 1.71200435559353e-05, "loss": 0.0538, "step": 1985 }, { "epoch": 2.7057220708446867, "grad_norm": 0.9018018210833185, "learning_rate": 1.711694418600645e-05, "loss": 0.0511, "step": 1986 }, { "epoch": 2.70708446866485, "grad_norm": 3.4348876533718298, "learning_rate": 1.7113843430102873e-05, "loss": 0.0679, "step": 1987 }, { "epoch": 2.708446866485014, "grad_norm": 3.202588376276205, "learning_rate": 1.711074128882841e-05, "loss": 0.0486, "step": 1988 }, { "epoch": 2.709809264305177, "grad_norm": 2.5707161678439774, "learning_rate": 1.710763776278719e-05, "loss": 0.0416, "step": 1989 }, { "epoch": 2.7111716621253406, "grad_norm": 4.046761995194761, "learning_rate": 1.7104532852583598e-05, "loss": 0.0476, "step": 1990 }, { "epoch": 2.7125340599455043, "grad_norm": 2.2850557128431532, "learning_rate": 1.7101426558822292e-05, "loss": 0.0727, "step": 1991 }, { "epoch": 2.7138964577656677, "grad_norm": 1.2936457309586797, "learning_rate": 1.709831888210821e-05, "loss": 0.0577, "step": 1992 }, { "epoch": 2.715258855585831, "grad_norm": 2.9926438285285517, "learning_rate": 1.7095209823046536e-05, "loss": 0.0218, "step": 1993 }, { "epoch": 2.716621253405995, "grad_norm": 1.8531819672947958, "learning_rate": 1.709209938224275e-05, "loss": 0.0454, "step": 1994 }, { "epoch": 2.717983651226158, "grad_norm": 1.5655042894416342, "learning_rate": 1.708898756030258e-05, "loss": 0.0458, "step": 1995 }, { "epoch": 2.7193460490463215, "grad_norm": 3.3823131906798705, "learning_rate": 1.7085874357832034e-05, "loss": 0.0519, "step": 1996 }, { "epoch": 2.7207084468664853, "grad_norm": 2.24403338248265, "learning_rate": 1.7082759775437392e-05, "loss": 0.046, "step": 1997 }, { "epoch": 2.7220708446866486, "grad_norm": 1.3561445931549292, "learning_rate": 1.707964381372519e-05, "loss": 0.0517, "step": 1998 }, { "epoch": 2.723433242506812, "grad_norm": 5.136896157999796, "learning_rate": 1.7076526473302237e-05, "loss": 0.0525, "step": 1999 }, { "epoch": 2.7247956403269757, "grad_norm": 1.6906897182983731, "learning_rate": 1.707340775477562e-05, "loss": 0.0394, "step": 2000 }, { "epoch": 2.726158038147139, "grad_norm": 2.3717552708407976, "learning_rate": 1.7070287658752688e-05, "loss": 0.0399, "step": 2001 }, { "epoch": 2.7275204359673024, "grad_norm": 5.689887673028591, "learning_rate": 1.706716618584105e-05, "loss": 0.0474, "step": 2002 }, { "epoch": 2.728882833787466, "grad_norm": 1.5916150946659713, "learning_rate": 1.70640433366486e-05, "loss": 0.0647, "step": 2003 }, { "epoch": 2.7302452316076296, "grad_norm": 5.765807886367997, "learning_rate": 1.7060919111783483e-05, "loss": 0.0496, "step": 2004 }, { "epoch": 2.731607629427793, "grad_norm": 3.5667639751781386, "learning_rate": 1.7057793511854125e-05, "loss": 0.0358, "step": 2005 }, { "epoch": 2.7329700272479567, "grad_norm": 5.8150774135888845, "learning_rate": 1.7054666537469213e-05, "loss": 0.0534, "step": 2006 }, { "epoch": 2.73433242506812, "grad_norm": 3.1799408251983943, "learning_rate": 1.7051538189237703e-05, "loss": 0.0442, "step": 2007 }, { "epoch": 2.7356948228882834, "grad_norm": 1.3079353353924077, "learning_rate": 1.704840846776882e-05, "loss": 0.0587, "step": 2008 }, { "epoch": 2.7370572207084467, "grad_norm": 3.9964283202757103, "learning_rate": 1.704527737367205e-05, "loss": 0.0434, "step": 2009 }, { "epoch": 2.7384196185286105, "grad_norm": 3.833112769976248, "learning_rate": 1.7042144907557156e-05, "loss": 0.0404, "step": 2010 }, { "epoch": 2.739782016348774, "grad_norm": 2.4494202350628873, "learning_rate": 1.7039011070034165e-05, "loss": 0.0177, "step": 2011 }, { "epoch": 2.741144414168937, "grad_norm": 5.885163302972065, "learning_rate": 1.703587586171337e-05, "loss": 0.0305, "step": 2012 }, { "epoch": 2.742506811989101, "grad_norm": 2.1009042120526478, "learning_rate": 1.7032739283205324e-05, "loss": 0.0457, "step": 2013 }, { "epoch": 2.7438692098092643, "grad_norm": 1.2504368148934588, "learning_rate": 1.702960133512086e-05, "loss": 0.0274, "step": 2014 }, { "epoch": 2.7452316076294276, "grad_norm": 3.2439620667115885, "learning_rate": 1.702646201807107e-05, "loss": 0.0572, "step": 2015 }, { "epoch": 2.7465940054495914, "grad_norm": 3.711451978665321, "learning_rate": 1.7023321332667312e-05, "loss": 0.0777, "step": 2016 }, { "epoch": 2.7479564032697548, "grad_norm": 1.4008314314228392, "learning_rate": 1.7020179279521218e-05, "loss": 0.039, "step": 2017 }, { "epoch": 2.749318801089918, "grad_norm": 2.740451611701938, "learning_rate": 1.7017035859244674e-05, "loss": 0.046, "step": 2018 }, { "epoch": 2.750681198910082, "grad_norm": 2.2694398839936896, "learning_rate": 1.701389107244984e-05, "loss": 0.0611, "step": 2019 }, { "epoch": 2.7520435967302452, "grad_norm": 1.455415178186302, "learning_rate": 1.7010744919749143e-05, "loss": 0.0331, "step": 2020 }, { "epoch": 2.7534059945504086, "grad_norm": 3.6539841884373323, "learning_rate": 1.7007597401755277e-05, "loss": 0.0227, "step": 2021 }, { "epoch": 2.7547683923705724, "grad_norm": 2.2344681523101717, "learning_rate": 1.7004448519081188e-05, "loss": 0.0403, "step": 2022 }, { "epoch": 2.7561307901907357, "grad_norm": 3.449268890637575, "learning_rate": 1.7001298272340114e-05, "loss": 0.0546, "step": 2023 }, { "epoch": 2.757493188010899, "grad_norm": 5.09812465313549, "learning_rate": 1.699814666214553e-05, "loss": 0.0422, "step": 2024 }, { "epoch": 2.758855585831063, "grad_norm": 5.223112680937299, "learning_rate": 1.69949936891112e-05, "loss": 0.0227, "step": 2025 }, { "epoch": 2.760217983651226, "grad_norm": 2.9700351938124214, "learning_rate": 1.6991839353851136e-05, "loss": 0.0306, "step": 2026 }, { "epoch": 2.7615803814713895, "grad_norm": 7.11046017272177, "learning_rate": 1.6988683656979624e-05, "loss": 0.0288, "step": 2027 }, { "epoch": 2.7629427792915533, "grad_norm": 2.2275141814282287, "learning_rate": 1.6985526599111218e-05, "loss": 0.0258, "step": 2028 }, { "epoch": 2.7643051771117166, "grad_norm": 4.644920651063114, "learning_rate": 1.698236818086073e-05, "loss": 0.0678, "step": 2029 }, { "epoch": 2.76566757493188, "grad_norm": 6.2067530577804915, "learning_rate": 1.6979208402843235e-05, "loss": 0.0504, "step": 2030 }, { "epoch": 2.7670299727520433, "grad_norm": 4.115655758950556, "learning_rate": 1.6976047265674086e-05, "loss": 0.047, "step": 2031 }, { "epoch": 2.768392370572207, "grad_norm": 5.0638419046741445, "learning_rate": 1.697288476996889e-05, "loss": 0.046, "step": 2032 }, { "epoch": 2.7697547683923704, "grad_norm": 2.2820861448951404, "learning_rate": 1.6969720916343515e-05, "loss": 0.0832, "step": 2033 }, { "epoch": 2.771117166212534, "grad_norm": 4.971192443365605, "learning_rate": 1.6966555705414105e-05, "loss": 0.0886, "step": 2034 }, { "epoch": 2.7724795640326976, "grad_norm": 3.4403109843137, "learning_rate": 1.6963389137797062e-05, "loss": 0.0393, "step": 2035 }, { "epoch": 2.773841961852861, "grad_norm": 3.216996119205095, "learning_rate": 1.6960221214109046e-05, "loss": 0.0673, "step": 2036 }, { "epoch": 2.7752043596730243, "grad_norm": 3.1478113991014687, "learning_rate": 1.6957051934967e-05, "loss": 0.0553, "step": 2037 }, { "epoch": 2.776566757493188, "grad_norm": 5.35423193349116, "learning_rate": 1.6953881300988108e-05, "loss": 0.0412, "step": 2038 }, { "epoch": 2.7779291553133514, "grad_norm": 1.7993446286817179, "learning_rate": 1.695070931278983e-05, "loss": 0.0791, "step": 2039 }, { "epoch": 2.7792915531335147, "grad_norm": 2.8841380805905623, "learning_rate": 1.69475359709899e-05, "loss": 0.0774, "step": 2040 }, { "epoch": 2.7806539509536785, "grad_norm": 6.658959293126175, "learning_rate": 1.6944361276206287e-05, "loss": 0.062, "step": 2041 }, { "epoch": 2.782016348773842, "grad_norm": 3.1967896316587416, "learning_rate": 1.694118522905725e-05, "loss": 0.0914, "step": 2042 }, { "epoch": 2.783378746594005, "grad_norm": 2.04751048636536, "learning_rate": 1.6938007830161303e-05, "loss": 0.0706, "step": 2043 }, { "epoch": 2.784741144414169, "grad_norm": 6.734695742985276, "learning_rate": 1.6934829080137216e-05, "loss": 0.067, "step": 2044 }, { "epoch": 2.7861035422343323, "grad_norm": 1.1287240917052062, "learning_rate": 1.693164897960403e-05, "loss": 0.0271, "step": 2045 }, { "epoch": 2.7874659400544957, "grad_norm": 3.545642831657671, "learning_rate": 1.692846752918105e-05, "loss": 0.0787, "step": 2046 }, { "epoch": 2.7888283378746594, "grad_norm": 5.447650683628712, "learning_rate": 1.6925284729487842e-05, "loss": 0.0563, "step": 2047 }, { "epoch": 2.790190735694823, "grad_norm": 1.6436354311588612, "learning_rate": 1.6922100581144228e-05, "loss": 0.0507, "step": 2048 }, { "epoch": 2.791553133514986, "grad_norm": 5.61206936513275, "learning_rate": 1.6918915084770308e-05, "loss": 0.0649, "step": 2049 }, { "epoch": 2.79291553133515, "grad_norm": 3.62798544911997, "learning_rate": 1.691572824098642e-05, "loss": 0.0499, "step": 2050 }, { "epoch": 2.7942779291553133, "grad_norm": 5.000230818300183, "learning_rate": 1.6912540050413196e-05, "loss": 0.0966, "step": 2051 }, { "epoch": 2.7956403269754766, "grad_norm": 2.7944299565013564, "learning_rate": 1.6909350513671504e-05, "loss": 0.0381, "step": 2052 }, { "epoch": 2.7970027247956404, "grad_norm": 2.269574042473445, "learning_rate": 1.690615963138248e-05, "loss": 0.0242, "step": 2053 }, { "epoch": 2.7983651226158037, "grad_norm": 2.2331614023755386, "learning_rate": 1.6902967404167538e-05, "loss": 0.0361, "step": 2054 }, { "epoch": 2.799727520435967, "grad_norm": 3.370198611867538, "learning_rate": 1.6899773832648335e-05, "loss": 0.0591, "step": 2055 }, { "epoch": 2.801089918256131, "grad_norm": 3.9081930842943096, "learning_rate": 1.6896578917446793e-05, "loss": 0.052, "step": 2056 }, { "epoch": 2.802452316076294, "grad_norm": 2.397853442968233, "learning_rate": 1.6893382659185106e-05, "loss": 0.0513, "step": 2057 }, { "epoch": 2.8038147138964575, "grad_norm": 3.6545185130200353, "learning_rate": 1.689018505848572e-05, "loss": 0.0601, "step": 2058 }, { "epoch": 2.8051771117166213, "grad_norm": 1.6495309051318494, "learning_rate": 1.688698611597134e-05, "loss": 0.0431, "step": 2059 }, { "epoch": 2.8065395095367847, "grad_norm": 3.7452118853419574, "learning_rate": 1.688378583226495e-05, "loss": 0.0553, "step": 2060 }, { "epoch": 2.807901907356948, "grad_norm": 2.613747728862983, "learning_rate": 1.688058420798977e-05, "loss": 0.0446, "step": 2061 }, { "epoch": 2.809264305177112, "grad_norm": 3.9221941910133498, "learning_rate": 1.68773812437693e-05, "loss": 0.0622, "step": 2062 }, { "epoch": 2.810626702997275, "grad_norm": 3.8189892319411887, "learning_rate": 1.6874176940227296e-05, "loss": 0.0411, "step": 2063 }, { "epoch": 2.8119891008174385, "grad_norm": 1.835644469148427, "learning_rate": 1.687097129798777e-05, "loss": 0.0666, "step": 2064 }, { "epoch": 2.8133514986376023, "grad_norm": 6.610743859866733, "learning_rate": 1.6867764317674997e-05, "loss": 0.0644, "step": 2065 }, { "epoch": 2.8147138964577656, "grad_norm": 2.8865018955297077, "learning_rate": 1.686455599991352e-05, "loss": 0.1054, "step": 2066 }, { "epoch": 2.816076294277929, "grad_norm": 2.932571397035475, "learning_rate": 1.6861346345328133e-05, "loss": 0.0619, "step": 2067 }, { "epoch": 2.8174386920980927, "grad_norm": 2.608150481696515, "learning_rate": 1.6858135354543888e-05, "loss": 0.0694, "step": 2068 }, { "epoch": 2.818801089918256, "grad_norm": 2.3014287895196377, "learning_rate": 1.6854923028186112e-05, "loss": 0.0641, "step": 2069 }, { "epoch": 2.8201634877384194, "grad_norm": 4.179530561949199, "learning_rate": 1.685170936688038e-05, "loss": 0.115, "step": 2070 }, { "epoch": 2.821525885558583, "grad_norm": 5.225889501850796, "learning_rate": 1.6848494371252525e-05, "loss": 0.0276, "step": 2071 }, { "epoch": 2.8228882833787465, "grad_norm": 2.997003554057292, "learning_rate": 1.684527804192865e-05, "loss": 0.0327, "step": 2072 }, { "epoch": 2.82425068119891, "grad_norm": 2.340624279038803, "learning_rate": 1.684206037953511e-05, "loss": 0.0485, "step": 2073 }, { "epoch": 2.8256130790190737, "grad_norm": 5.322141637604027, "learning_rate": 1.6838841384698527e-05, "loss": 0.0298, "step": 2074 }, { "epoch": 2.826975476839237, "grad_norm": 4.071096004784868, "learning_rate": 1.683562105804577e-05, "loss": 0.0415, "step": 2075 }, { "epoch": 2.8283378746594003, "grad_norm": 2.5412010889481333, "learning_rate": 1.683239940020398e-05, "loss": 0.0496, "step": 2076 }, { "epoch": 2.829700272479564, "grad_norm": 5.77716314452948, "learning_rate": 1.682917641180055e-05, "loss": 0.0354, "step": 2077 }, { "epoch": 2.8310626702997275, "grad_norm": 6.639560526883214, "learning_rate": 1.6825952093463137e-05, "loss": 0.0693, "step": 2078 }, { "epoch": 2.832425068119891, "grad_norm": 4.039420115294934, "learning_rate": 1.682272644581965e-05, "loss": 0.0497, "step": 2079 }, { "epoch": 2.8337874659400546, "grad_norm": 8.222133591234737, "learning_rate": 1.6819499469498265e-05, "loss": 0.0597, "step": 2080 }, { "epoch": 2.835149863760218, "grad_norm": 2.5701315107955343, "learning_rate": 1.681627116512741e-05, "loss": 0.0263, "step": 2081 }, { "epoch": 2.8365122615803813, "grad_norm": 7.862163310727281, "learning_rate": 1.6813041533335778e-05, "loss": 0.0329, "step": 2082 }, { "epoch": 2.837874659400545, "grad_norm": 5.809833359582509, "learning_rate": 1.6809810574752316e-05, "loss": 0.0768, "step": 2083 }, { "epoch": 2.8392370572207084, "grad_norm": 1.3247319563338709, "learning_rate": 1.6806578290006225e-05, "loss": 0.0308, "step": 2084 }, { "epoch": 2.8405994550408717, "grad_norm": 6.083358817245227, "learning_rate": 1.6803344679726975e-05, "loss": 0.0224, "step": 2085 }, { "epoch": 2.8419618528610355, "grad_norm": 2.7588020116444234, "learning_rate": 1.680010974454429e-05, "loss": 0.0605, "step": 2086 }, { "epoch": 2.843324250681199, "grad_norm": 4.943216453213996, "learning_rate": 1.6796873485088147e-05, "loss": 0.0528, "step": 2087 }, { "epoch": 2.844686648501362, "grad_norm": 5.386923147950004, "learning_rate": 1.6793635901988787e-05, "loss": 0.081, "step": 2088 }, { "epoch": 2.846049046321526, "grad_norm": 2.631355890653041, "learning_rate": 1.6790396995876707e-05, "loss": 0.0594, "step": 2089 }, { "epoch": 2.8474114441416893, "grad_norm": 5.966967416001427, "learning_rate": 1.678715676738266e-05, "loss": 0.0618, "step": 2090 }, { "epoch": 2.8487738419618527, "grad_norm": 3.6783104002542304, "learning_rate": 1.6783915217137656e-05, "loss": 0.0265, "step": 2091 }, { "epoch": 2.8501362397820165, "grad_norm": 4.366367017714518, "learning_rate": 1.6780672345772968e-05, "loss": 0.0384, "step": 2092 }, { "epoch": 2.85149863760218, "grad_norm": 3.465496573620984, "learning_rate": 1.677742815392012e-05, "loss": 0.0348, "step": 2093 }, { "epoch": 2.852861035422343, "grad_norm": 3.8065541114035013, "learning_rate": 1.6774182642210894e-05, "loss": 0.0534, "step": 2094 }, { "epoch": 2.854223433242507, "grad_norm": 2.3354060899539926, "learning_rate": 1.6770935811277333e-05, "loss": 0.0717, "step": 2095 }, { "epoch": 2.8555858310626703, "grad_norm": 1.8091581784104636, "learning_rate": 1.6767687661751734e-05, "loss": 0.0409, "step": 2096 }, { "epoch": 2.8569482288828336, "grad_norm": 0.9701600621741694, "learning_rate": 1.6764438194266646e-05, "loss": 0.0318, "step": 2097 }, { "epoch": 2.8583106267029974, "grad_norm": 2.309169716492112, "learning_rate": 1.6761187409454892e-05, "loss": 0.0425, "step": 2098 }, { "epoch": 2.8596730245231607, "grad_norm": 3.3300621863277313, "learning_rate": 1.675793530794953e-05, "loss": 0.0535, "step": 2099 }, { "epoch": 2.861035422343324, "grad_norm": 5.129865426640599, "learning_rate": 1.6754681890383886e-05, "loss": 0.0806, "step": 2100 }, { "epoch": 2.862397820163488, "grad_norm": 6.1300070385563385, "learning_rate": 1.675142715739154e-05, "loss": 0.0638, "step": 2101 }, { "epoch": 2.863760217983651, "grad_norm": 3.5224009473369997, "learning_rate": 1.674817110960633e-05, "loss": 0.0661, "step": 2102 }, { "epoch": 2.8651226158038146, "grad_norm": 2.256115593306094, "learning_rate": 1.674491374766234e-05, "loss": 0.031, "step": 2103 }, { "epoch": 2.8664850136239783, "grad_norm": 6.023573917966272, "learning_rate": 1.6741655072193932e-05, "loss": 0.0365, "step": 2104 }, { "epoch": 2.8678474114441417, "grad_norm": 4.427653230711098, "learning_rate": 1.67383950838357e-05, "loss": 0.0383, "step": 2105 }, { "epoch": 2.869209809264305, "grad_norm": 3.935100100783993, "learning_rate": 1.6735133783222508e-05, "loss": 0.0678, "step": 2106 }, { "epoch": 2.870572207084469, "grad_norm": 3.6343741885132457, "learning_rate": 1.6731871170989473e-05, "loss": 0.0412, "step": 2107 }, { "epoch": 2.871934604904632, "grad_norm": 2.319836112359916, "learning_rate": 1.6728607247771957e-05, "loss": 0.0633, "step": 2108 }, { "epoch": 2.8732970027247955, "grad_norm": 2.895092835160175, "learning_rate": 1.6725342014205595e-05, "loss": 0.0411, "step": 2109 }, { "epoch": 2.8746594005449593, "grad_norm": 1.5012616354080845, "learning_rate": 1.672207547092627e-05, "loss": 0.0402, "step": 2110 }, { "epoch": 2.8760217983651226, "grad_norm": 2.5241397089321556, "learning_rate": 1.671880761857011e-05, "loss": 0.0267, "step": 2111 }, { "epoch": 2.877384196185286, "grad_norm": 2.191059710770033, "learning_rate": 1.671553845777351e-05, "loss": 0.0318, "step": 2112 }, { "epoch": 2.8787465940054497, "grad_norm": 2.041632575345814, "learning_rate": 1.6712267989173115e-05, "loss": 0.0465, "step": 2113 }, { "epoch": 2.880108991825613, "grad_norm": 1.5703923815478444, "learning_rate": 1.6708996213405826e-05, "loss": 0.0578, "step": 2114 }, { "epoch": 2.8814713896457764, "grad_norm": 1.4621207316197358, "learning_rate": 1.67057231311088e-05, "loss": 0.0393, "step": 2115 }, { "epoch": 2.88283378746594, "grad_norm": 1.3093106021775662, "learning_rate": 1.6702448742919444e-05, "loss": 0.0425, "step": 2116 }, { "epoch": 2.8841961852861036, "grad_norm": 2.0894430559307, "learning_rate": 1.6699173049475425e-05, "loss": 0.06, "step": 2117 }, { "epoch": 2.885558583106267, "grad_norm": 1.6865125814732764, "learning_rate": 1.6695896051414662e-05, "loss": 0.054, "step": 2118 }, { "epoch": 2.8869209809264307, "grad_norm": 2.1894726922314787, "learning_rate": 1.669261774937532e-05, "loss": 0.0177, "step": 2119 }, { "epoch": 2.888283378746594, "grad_norm": 0.9749751603962568, "learning_rate": 1.6689338143995835e-05, "loss": 0.0221, "step": 2120 }, { "epoch": 2.8896457765667574, "grad_norm": 1.9798526021187537, "learning_rate": 1.668605723591488e-05, "loss": 0.028, "step": 2121 }, { "epoch": 2.891008174386921, "grad_norm": 2.938502324943887, "learning_rate": 1.6682775025771395e-05, "loss": 0.0511, "step": 2122 }, { "epoch": 2.8923705722070845, "grad_norm": 3.8031925081922413, "learning_rate": 1.6679491514204555e-05, "loss": 0.036, "step": 2123 }, { "epoch": 2.893732970027248, "grad_norm": 1.4119106317518522, "learning_rate": 1.6676206701853815e-05, "loss": 0.0239, "step": 2124 }, { "epoch": 2.8950953678474116, "grad_norm": 4.061536570984042, "learning_rate": 1.6672920589358863e-05, "loss": 0.0693, "step": 2125 }, { "epoch": 2.896457765667575, "grad_norm": 3.2211843776369697, "learning_rate": 1.6669633177359647e-05, "loss": 0.0428, "step": 2126 }, { "epoch": 2.8978201634877383, "grad_norm": 2.9110536420090742, "learning_rate": 1.6666344466496366e-05, "loss": 0.0381, "step": 2127 }, { "epoch": 2.899182561307902, "grad_norm": 3.896280551978932, "learning_rate": 1.6663054457409474e-05, "loss": 0.0571, "step": 2128 }, { "epoch": 2.9005449591280654, "grad_norm": 8.7195606519591, "learning_rate": 1.6659763150739675e-05, "loss": 0.0536, "step": 2129 }, { "epoch": 2.9019073569482288, "grad_norm": 4.1519611524284095, "learning_rate": 1.6656470547127932e-05, "loss": 0.0506, "step": 2130 }, { "epoch": 2.9032697547683926, "grad_norm": 3.4715502424911007, "learning_rate": 1.6653176647215455e-05, "loss": 0.0425, "step": 2131 }, { "epoch": 2.904632152588556, "grad_norm": 9.321607064182086, "learning_rate": 1.6649881451643706e-05, "loss": 0.0526, "step": 2132 }, { "epoch": 2.9059945504087192, "grad_norm": 1.6672663420702338, "learning_rate": 1.66465849610544e-05, "loss": 0.0348, "step": 2133 }, { "epoch": 2.907356948228883, "grad_norm": 8.17958727279081, "learning_rate": 1.6643287176089508e-05, "loss": 0.0642, "step": 2134 }, { "epoch": 2.9087193460490464, "grad_norm": 6.071793029215629, "learning_rate": 1.6639988097391252e-05, "loss": 0.0599, "step": 2135 }, { "epoch": 2.9100817438692097, "grad_norm": 2.7586524754892667, "learning_rate": 1.6636687725602105e-05, "loss": 0.0302, "step": 2136 }, { "epoch": 2.9114441416893735, "grad_norm": 4.804112634348829, "learning_rate": 1.6633386061364783e-05, "loss": 0.0165, "step": 2137 }, { "epoch": 2.912806539509537, "grad_norm": 3.349664673161962, "learning_rate": 1.6630083105322267e-05, "loss": 0.0323, "step": 2138 }, { "epoch": 2.9141689373297, "grad_norm": 3.6328258439304415, "learning_rate": 1.662677885811779e-05, "loss": 0.045, "step": 2139 }, { "epoch": 2.915531335149864, "grad_norm": 6.226167233783499, "learning_rate": 1.662347332039482e-05, "loss": 0.071, "step": 2140 }, { "epoch": 2.9168937329700273, "grad_norm": 2.8630506442354426, "learning_rate": 1.6620166492797095e-05, "loss": 0.0448, "step": 2141 }, { "epoch": 2.9182561307901906, "grad_norm": 4.473958533972163, "learning_rate": 1.6616858375968596e-05, "loss": 0.0329, "step": 2142 }, { "epoch": 2.9196185286103544, "grad_norm": 2.423072724755843, "learning_rate": 1.661354897055355e-05, "loss": 0.0867, "step": 2143 }, { "epoch": 2.9209809264305178, "grad_norm": 2.0791826217744798, "learning_rate": 1.6610238277196448e-05, "loss": 0.0429, "step": 2144 }, { "epoch": 2.922343324250681, "grad_norm": 2.072895551556551, "learning_rate": 1.6606926296542016e-05, "loss": 0.0779, "step": 2145 }, { "epoch": 2.923705722070845, "grad_norm": 4.2778931350101175, "learning_rate": 1.6603613029235244e-05, "loss": 0.0325, "step": 2146 }, { "epoch": 2.9250681198910082, "grad_norm": 3.1577286182594118, "learning_rate": 1.6600298475921367e-05, "loss": 0.0653, "step": 2147 }, { "epoch": 2.9264305177111716, "grad_norm": 2.111126716756337, "learning_rate": 1.659698263724587e-05, "loss": 0.0466, "step": 2148 }, { "epoch": 2.9277929155313354, "grad_norm": 3.0534505486582777, "learning_rate": 1.659366551385449e-05, "loss": 0.0477, "step": 2149 }, { "epoch": 2.9291553133514987, "grad_norm": 1.8887786119861711, "learning_rate": 1.659034710639321e-05, "loss": 0.0504, "step": 2150 }, { "epoch": 2.930517711171662, "grad_norm": 1.9265008489960882, "learning_rate": 1.6587027415508275e-05, "loss": 0.0555, "step": 2151 }, { "epoch": 2.931880108991826, "grad_norm": 1.4450313706024298, "learning_rate": 1.658370644184616e-05, "loss": 0.0255, "step": 2152 }, { "epoch": 2.933242506811989, "grad_norm": 2.347245396484914, "learning_rate": 1.658038418605361e-05, "loss": 0.06, "step": 2153 }, { "epoch": 2.9346049046321525, "grad_norm": 2.0554430742544714, "learning_rate": 1.6577060648777606e-05, "loss": 0.0311, "step": 2154 }, { "epoch": 2.9359673024523163, "grad_norm": 3.1168143641060886, "learning_rate": 1.657373583066539e-05, "loss": 0.0597, "step": 2155 }, { "epoch": 2.9373297002724796, "grad_norm": 4.201578752870151, "learning_rate": 1.657040973236444e-05, "loss": 0.0439, "step": 2156 }, { "epoch": 2.938692098092643, "grad_norm": 2.5612183878150505, "learning_rate": 1.656708235452249e-05, "loss": 0.0453, "step": 2157 }, { "epoch": 2.9400544959128068, "grad_norm": 1.492786117880557, "learning_rate": 1.6563753697787528e-05, "loss": 0.0941, "step": 2158 }, { "epoch": 2.94141689373297, "grad_norm": 1.0061590700601621, "learning_rate": 1.6560423762807783e-05, "loss": 0.0354, "step": 2159 }, { "epoch": 2.9427792915531334, "grad_norm": 3.6707579644151123, "learning_rate": 1.655709255023174e-05, "loss": 0.0451, "step": 2160 }, { "epoch": 2.9441416893732972, "grad_norm": 2.4772642866075305, "learning_rate": 1.6553760060708123e-05, "loss": 0.0561, "step": 2161 }, { "epoch": 2.9455040871934606, "grad_norm": 2.244906773457703, "learning_rate": 1.655042629488592e-05, "loss": 0.0456, "step": 2162 }, { "epoch": 2.946866485013624, "grad_norm": 1.7361308779336093, "learning_rate": 1.654709125341435e-05, "loss": 0.0231, "step": 2163 }, { "epoch": 2.9482288828337877, "grad_norm": 2.7320589810490388, "learning_rate": 1.654375493694289e-05, "loss": 0.0607, "step": 2164 }, { "epoch": 2.949591280653951, "grad_norm": 2.80284997910979, "learning_rate": 1.6540417346121272e-05, "loss": 0.0574, "step": 2165 }, { "epoch": 2.9509536784741144, "grad_norm": 3.6042051108553306, "learning_rate": 1.653707848159946e-05, "loss": 0.0595, "step": 2166 }, { "epoch": 2.952316076294278, "grad_norm": 3.8283231632055266, "learning_rate": 1.653373834402767e-05, "loss": 0.0488, "step": 2167 }, { "epoch": 2.9536784741144415, "grad_norm": 2.3224056151533006, "learning_rate": 1.6530396934056386e-05, "loss": 0.0627, "step": 2168 }, { "epoch": 2.955040871934605, "grad_norm": 5.141537179203237, "learning_rate": 1.6527054252336313e-05, "loss": 0.033, "step": 2169 }, { "epoch": 2.9564032697547686, "grad_norm": 3.8988014925508008, "learning_rate": 1.6523710299518416e-05, "loss": 0.0329, "step": 2170 }, { "epoch": 2.957765667574932, "grad_norm": 3.576052512702297, "learning_rate": 1.6520365076253907e-05, "loss": 0.0195, "step": 2171 }, { "epoch": 2.9591280653950953, "grad_norm": 3.9938021516081332, "learning_rate": 1.6517018583194243e-05, "loss": 0.0607, "step": 2172 }, { "epoch": 2.960490463215259, "grad_norm": 2.695275412995254, "learning_rate": 1.651367082099113e-05, "loss": 0.0662, "step": 2173 }, { "epoch": 2.9618528610354224, "grad_norm": 1.2432186252133735, "learning_rate": 1.6510321790296527e-05, "loss": 0.0435, "step": 2174 }, { "epoch": 2.963215258855586, "grad_norm": 2.6259635680567066, "learning_rate": 1.6506971491762627e-05, "loss": 0.0432, "step": 2175 }, { "epoch": 2.9645776566757496, "grad_norm": 5.293160974515883, "learning_rate": 1.650361992604188e-05, "loss": 0.038, "step": 2176 }, { "epoch": 2.965940054495913, "grad_norm": 1.3409732273104897, "learning_rate": 1.6500267093786983e-05, "loss": 0.0398, "step": 2177 }, { "epoch": 2.9673024523160763, "grad_norm": 4.645856878855443, "learning_rate": 1.649691299565087e-05, "loss": 0.0471, "step": 2178 }, { "epoch": 2.96866485013624, "grad_norm": 1.741087805316016, "learning_rate": 1.649355763228673e-05, "loss": 0.0622, "step": 2179 }, { "epoch": 2.9700272479564034, "grad_norm": 2.2688402628188493, "learning_rate": 1.6490201004348003e-05, "loss": 0.0711, "step": 2180 }, { "epoch": 2.9713896457765667, "grad_norm": 1.3371622268876022, "learning_rate": 1.6486843112488358e-05, "loss": 0.0488, "step": 2181 }, { "epoch": 2.9727520435967305, "grad_norm": 2.237209357597437, "learning_rate": 1.648348395736173e-05, "loss": 0.0733, "step": 2182 }, { "epoch": 2.974114441416894, "grad_norm": 2.5853440898126996, "learning_rate": 1.6480123539622283e-05, "loss": 0.0441, "step": 2183 }, { "epoch": 2.975476839237057, "grad_norm": 3.256849862188337, "learning_rate": 1.6476761859924438e-05, "loss": 0.0657, "step": 2184 }, { "epoch": 2.976839237057221, "grad_norm": 2.020735831938007, "learning_rate": 1.647339891892286e-05, "loss": 0.0336, "step": 2185 }, { "epoch": 2.9782016348773843, "grad_norm": 2.889417038211386, "learning_rate": 1.6470034717272456e-05, "loss": 0.0502, "step": 2186 }, { "epoch": 2.9795640326975477, "grad_norm": 1.825302738362664, "learning_rate": 1.6466669255628382e-05, "loss": 0.0386, "step": 2187 }, { "epoch": 2.9809264305177114, "grad_norm": 2.91497477096749, "learning_rate": 1.646330253464604e-05, "loss": 0.0453, "step": 2188 }, { "epoch": 2.982288828337875, "grad_norm": 3.5431533385059817, "learning_rate": 1.6459934554981067e-05, "loss": 0.037, "step": 2189 }, { "epoch": 2.983651226158038, "grad_norm": 1.5935872786289913, "learning_rate": 1.6456565317289362e-05, "loss": 0.0419, "step": 2190 }, { "epoch": 2.9850136239782015, "grad_norm": 1.4465326462086938, "learning_rate": 1.645319482222706e-05, "loss": 0.0706, "step": 2191 }, { "epoch": 2.9863760217983653, "grad_norm": 2.2755831702819322, "learning_rate": 1.644982307045053e-05, "loss": 0.0505, "step": 2192 }, { "epoch": 2.9877384196185286, "grad_norm": 4.621111919084465, "learning_rate": 1.6446450062616415e-05, "loss": 0.0331, "step": 2193 }, { "epoch": 2.989100817438692, "grad_norm": 3.92686076783441, "learning_rate": 1.644307579938157e-05, "loss": 0.0716, "step": 2194 }, { "epoch": 2.9904632152588557, "grad_norm": 4.640135651414589, "learning_rate": 1.6439700281403113e-05, "loss": 0.0629, "step": 2195 }, { "epoch": 2.991825613079019, "grad_norm": 4.747269176960156, "learning_rate": 1.6436323509338404e-05, "loss": 0.0413, "step": 2196 }, { "epoch": 2.9931880108991824, "grad_norm": 2.4696941325572133, "learning_rate": 1.6432945483845046e-05, "loss": 0.0566, "step": 2197 }, { "epoch": 2.994550408719346, "grad_norm": 5.440590278527277, "learning_rate": 1.6429566205580886e-05, "loss": 0.0645, "step": 2198 }, { "epoch": 2.9959128065395095, "grad_norm": 1.737299152035284, "learning_rate": 1.6426185675204007e-05, "loss": 0.0257, "step": 2199 }, { "epoch": 2.997275204359673, "grad_norm": 3.467261819949941, "learning_rate": 1.6422803893372754e-05, "loss": 0.0342, "step": 2200 }, { "epoch": 2.9986376021798367, "grad_norm": 1.1198534452114886, "learning_rate": 1.6419420860745698e-05, "loss": 0.0341, "step": 2201 }, { "epoch": 3.0, "grad_norm": 4.04389535426201, "learning_rate": 1.6416036577981665e-05, "loss": 0.0589, "step": 2202 }, { "epoch": 3.0, "eval_accuracy": 0.9421673217293656, "eval_f1": 0.9274360142579641, "eval_loss": 0.11350680142641068, "eval_precision": 0.9215700409832888, "eval_recall": 0.9435020210615892, "eval_runtime": 16.6533, "eval_samples_per_second": 106.946, "eval_steps_per_second": 0.841, "step": 2202 }, { "epoch": 3.0013623978201633, "grad_norm": 1.4878329689382837, "learning_rate": 1.6412651045739717e-05, "loss": 0.0264, "step": 2203 }, { "epoch": 3.002724795640327, "grad_norm": 2.359112161621986, "learning_rate": 1.6409264264679165e-05, "loss": 0.024, "step": 2204 }, { "epoch": 3.0040871934604905, "grad_norm": 2.2273902981128497, "learning_rate": 1.640587623545956e-05, "loss": 0.0348, "step": 2205 }, { "epoch": 3.005449591280654, "grad_norm": 1.194709859609329, "learning_rate": 1.64024869587407e-05, "loss": 0.0403, "step": 2206 }, { "epoch": 3.0068119891008176, "grad_norm": 2.0862045499686146, "learning_rate": 1.6399096435182614e-05, "loss": 0.0493, "step": 2207 }, { "epoch": 3.008174386920981, "grad_norm": 1.6600641456069203, "learning_rate": 1.6395704665445587e-05, "loss": 0.0303, "step": 2208 }, { "epoch": 3.0095367847411443, "grad_norm": 2.24519381288842, "learning_rate": 1.6392311650190146e-05, "loss": 0.0306, "step": 2209 }, { "epoch": 3.010899182561308, "grad_norm": 1.2323432319184504, "learning_rate": 1.6388917390077054e-05, "loss": 0.0197, "step": 2210 }, { "epoch": 3.0122615803814714, "grad_norm": 1.73381846728279, "learning_rate": 1.638552188576732e-05, "loss": 0.0129, "step": 2211 }, { "epoch": 3.0136239782016347, "grad_norm": 1.7764472899382209, "learning_rate": 1.6382125137922194e-05, "loss": 0.029, "step": 2212 }, { "epoch": 3.0149863760217985, "grad_norm": 2.1407352877828054, "learning_rate": 1.6378727147203166e-05, "loss": 0.0191, "step": 2213 }, { "epoch": 3.016348773841962, "grad_norm": 1.3940834621006382, "learning_rate": 1.6375327914271977e-05, "loss": 0.0287, "step": 2214 }, { "epoch": 3.017711171662125, "grad_norm": 2.8167188182783223, "learning_rate": 1.6371927439790598e-05, "loss": 0.0567, "step": 2215 }, { "epoch": 3.019073569482289, "grad_norm": 1.8507733435418032, "learning_rate": 1.6368525724421248e-05, "loss": 0.0248, "step": 2216 }, { "epoch": 3.0204359673024523, "grad_norm": 2.5528728322598004, "learning_rate": 1.6365122768826392e-05, "loss": 0.0497, "step": 2217 }, { "epoch": 3.0217983651226157, "grad_norm": 2.638182793829113, "learning_rate": 1.6361718573668722e-05, "loss": 0.0253, "step": 2218 }, { "epoch": 3.0231607629427795, "grad_norm": 1.6674403667755313, "learning_rate": 1.6358313139611194e-05, "loss": 0.0271, "step": 2219 }, { "epoch": 3.024523160762943, "grad_norm": 4.1892911725167785, "learning_rate": 1.6354906467316986e-05, "loss": 0.0235, "step": 2220 }, { "epoch": 3.025885558583106, "grad_norm": 2.2798493821294645, "learning_rate": 1.6351498557449515e-05, "loss": 0.0189, "step": 2221 }, { "epoch": 3.02724795640327, "grad_norm": 2.6013747250488466, "learning_rate": 1.634808941067246e-05, "loss": 0.0394, "step": 2222 }, { "epoch": 3.0286103542234333, "grad_norm": 3.7379317337310445, "learning_rate": 1.6344679027649726e-05, "loss": 0.0317, "step": 2223 }, { "epoch": 3.0299727520435966, "grad_norm": 1.404092771023934, "learning_rate": 1.6341267409045453e-05, "loss": 0.0173, "step": 2224 }, { "epoch": 3.0313351498637604, "grad_norm": 2.369135461980632, "learning_rate": 1.633785455552404e-05, "loss": 0.0182, "step": 2225 }, { "epoch": 3.0326975476839237, "grad_norm": 1.183685730113961, "learning_rate": 1.633444046775011e-05, "loss": 0.0267, "step": 2226 }, { "epoch": 3.034059945504087, "grad_norm": 1.362068136724179, "learning_rate": 1.6331025146388532e-05, "loss": 0.0107, "step": 2227 }, { "epoch": 3.035422343324251, "grad_norm": 2.908877015360229, "learning_rate": 1.632760859210442e-05, "loss": 0.0236, "step": 2228 }, { "epoch": 3.036784741144414, "grad_norm": 2.509371240260807, "learning_rate": 1.632419080556312e-05, "loss": 0.0395, "step": 2229 }, { "epoch": 3.0381471389645776, "grad_norm": 2.006146642813896, "learning_rate": 1.6320771787430226e-05, "loss": 0.0216, "step": 2230 }, { "epoch": 3.0395095367847413, "grad_norm": 1.2863393651854338, "learning_rate": 1.6317351538371563e-05, "loss": 0.0229, "step": 2231 }, { "epoch": 3.0408719346049047, "grad_norm": 1.5923504338444685, "learning_rate": 1.6313930059053204e-05, "loss": 0.0295, "step": 2232 }, { "epoch": 3.042234332425068, "grad_norm": 2.018137241765579, "learning_rate": 1.6310507350141457e-05, "loss": 0.0155, "step": 2233 }, { "epoch": 3.043596730245232, "grad_norm": 3.7992251227406615, "learning_rate": 1.630708341230287e-05, "loss": 0.0289, "step": 2234 }, { "epoch": 3.044959128065395, "grad_norm": 2.3076235051118883, "learning_rate": 1.630365824620423e-05, "loss": 0.0411, "step": 2235 }, { "epoch": 3.0463215258855585, "grad_norm": 2.861332354471765, "learning_rate": 1.6300231852512564e-05, "loss": 0.0217, "step": 2236 }, { "epoch": 3.0476839237057223, "grad_norm": 5.018234695837732, "learning_rate": 1.629680423189514e-05, "loss": 0.0259, "step": 2237 }, { "epoch": 3.0490463215258856, "grad_norm": 3.9141439314404285, "learning_rate": 1.6293375385019464e-05, "loss": 0.0186, "step": 2238 }, { "epoch": 3.050408719346049, "grad_norm": 5.105179309553415, "learning_rate": 1.6289945312553275e-05, "loss": 0.029, "step": 2239 }, { "epoch": 3.0517711171662127, "grad_norm": 3.3163982506116745, "learning_rate": 1.628651401516456e-05, "loss": 0.0348, "step": 2240 }, { "epoch": 3.053133514986376, "grad_norm": 5.19160269320665, "learning_rate": 1.6283081493521536e-05, "loss": 0.0116, "step": 2241 }, { "epoch": 3.0544959128065394, "grad_norm": 1.1279425947884794, "learning_rate": 1.627964774829266e-05, "loss": 0.0485, "step": 2242 }, { "epoch": 3.055858310626703, "grad_norm": 2.3696409980494244, "learning_rate": 1.627621278014664e-05, "loss": 0.0195, "step": 2243 }, { "epoch": 3.0572207084468666, "grad_norm": 2.3996820246707204, "learning_rate": 1.6272776589752407e-05, "loss": 0.0365, "step": 2244 }, { "epoch": 3.05858310626703, "grad_norm": 2.302873093213755, "learning_rate": 1.6269339177779128e-05, "loss": 0.0437, "step": 2245 }, { "epoch": 3.0599455040871932, "grad_norm": 2.9931246986717857, "learning_rate": 1.6265900544896223e-05, "loss": 0.0472, "step": 2246 }, { "epoch": 3.061307901907357, "grad_norm": 1.3515733229577862, "learning_rate": 1.6262460691773342e-05, "loss": 0.0283, "step": 2247 }, { "epoch": 3.0626702997275204, "grad_norm": 4.924597549668401, "learning_rate": 1.625901961908037e-05, "loss": 0.0315, "step": 2248 }, { "epoch": 3.0640326975476837, "grad_norm": 1.201686673518136, "learning_rate": 1.6255577327487425e-05, "loss": 0.0217, "step": 2249 }, { "epoch": 3.0653950953678475, "grad_norm": 2.0457446277300217, "learning_rate": 1.625213381766488e-05, "loss": 0.0327, "step": 2250 }, { "epoch": 3.066757493188011, "grad_norm": 3.8359617705782934, "learning_rate": 1.6248689090283327e-05, "loss": 0.0132, "step": 2251 }, { "epoch": 3.068119891008174, "grad_norm": 3.8366698092394818, "learning_rate": 1.6245243146013602e-05, "loss": 0.0353, "step": 2252 }, { "epoch": 3.069482288828338, "grad_norm": 3.5044835613536867, "learning_rate": 1.6241795985526785e-05, "loss": 0.0406, "step": 2253 }, { "epoch": 3.0708446866485013, "grad_norm": 5.023266093079782, "learning_rate": 1.6238347609494177e-05, "loss": 0.0369, "step": 2254 }, { "epoch": 3.0722070844686646, "grad_norm": 1.4578618743340115, "learning_rate": 1.6234898018587336e-05, "loss": 0.0225, "step": 2255 }, { "epoch": 3.0735694822888284, "grad_norm": 5.42542207179071, "learning_rate": 1.623144721347804e-05, "loss": 0.0153, "step": 2256 }, { "epoch": 3.0749318801089918, "grad_norm": 3.8792113020807353, "learning_rate": 1.6227995194838304e-05, "loss": 0.0343, "step": 2257 }, { "epoch": 3.076294277929155, "grad_norm": 6.498640504208021, "learning_rate": 1.6224541963340392e-05, "loss": 0.0938, "step": 2258 }, { "epoch": 3.077656675749319, "grad_norm": 2.8433339463288516, "learning_rate": 1.622108751965679e-05, "loss": 0.0211, "step": 2259 }, { "epoch": 3.0790190735694822, "grad_norm": 3.481886503999588, "learning_rate": 1.6217631864460234e-05, "loss": 0.0264, "step": 2260 }, { "epoch": 3.0803814713896456, "grad_norm": 3.8764613508898687, "learning_rate": 1.621417499842368e-05, "loss": 0.0397, "step": 2261 }, { "epoch": 3.0817438692098094, "grad_norm": 2.525403199369836, "learning_rate": 1.6210716922220336e-05, "loss": 0.0279, "step": 2262 }, { "epoch": 3.0831062670299727, "grad_norm": 4.315091650434302, "learning_rate": 1.6207257636523636e-05, "loss": 0.0417, "step": 2263 }, { "epoch": 3.084468664850136, "grad_norm": 2.0367958345411585, "learning_rate": 1.620379714200725e-05, "loss": 0.0469, "step": 2264 }, { "epoch": 3.0858310626703, "grad_norm": 2.116774658238664, "learning_rate": 1.620033543934508e-05, "loss": 0.0368, "step": 2265 }, { "epoch": 3.087193460490463, "grad_norm": 3.259685679536692, "learning_rate": 1.6196872529211282e-05, "loss": 0.0141, "step": 2266 }, { "epoch": 3.0885558583106265, "grad_norm": 1.131152842191504, "learning_rate": 1.6193408412280217e-05, "loss": 0.0184, "step": 2267 }, { "epoch": 3.0899182561307903, "grad_norm": 4.744847313970165, "learning_rate": 1.6189943089226508e-05, "loss": 0.0336, "step": 2268 }, { "epoch": 3.0912806539509536, "grad_norm": 4.256412754292794, "learning_rate": 1.6186476560725e-05, "loss": 0.0709, "step": 2269 }, { "epoch": 3.092643051771117, "grad_norm": 4.8785634267549245, "learning_rate": 1.618300882745077e-05, "loss": 0.0442, "step": 2270 }, { "epoch": 3.0940054495912808, "grad_norm": 6.762466013124348, "learning_rate": 1.6179539890079145e-05, "loss": 0.0346, "step": 2271 }, { "epoch": 3.095367847411444, "grad_norm": 2.555192417297478, "learning_rate": 1.6176069749285668e-05, "loss": 0.0625, "step": 2272 }, { "epoch": 3.0967302452316074, "grad_norm": 6.522878118159352, "learning_rate": 1.6172598405746125e-05, "loss": 0.0401, "step": 2273 }, { "epoch": 3.0980926430517712, "grad_norm": 5.623186159676121, "learning_rate": 1.616912586013654e-05, "loss": 0.0362, "step": 2274 }, { "epoch": 3.0994550408719346, "grad_norm": 3.7798065296950734, "learning_rate": 1.616565211313316e-05, "loss": 0.0453, "step": 2275 }, { "epoch": 3.100817438692098, "grad_norm": 7.457233143882241, "learning_rate": 1.616217716541248e-05, "loss": 0.0399, "step": 2276 }, { "epoch": 3.1021798365122617, "grad_norm": 1.8557809520847544, "learning_rate": 1.6158701017651216e-05, "loss": 0.0678, "step": 2277 }, { "epoch": 3.103542234332425, "grad_norm": 5.712554650723511, "learning_rate": 1.6155223670526328e-05, "loss": 0.0417, "step": 2278 }, { "epoch": 3.1049046321525884, "grad_norm": 3.226789043964936, "learning_rate": 1.6151745124715003e-05, "loss": 0.0283, "step": 2279 }, { "epoch": 3.106267029972752, "grad_norm": 2.7031037880596926, "learning_rate": 1.614826538089466e-05, "loss": 0.0344, "step": 2280 }, { "epoch": 3.1076294277929155, "grad_norm": 2.0134499677203186, "learning_rate": 1.6144784439742956e-05, "loss": 0.0323, "step": 2281 }, { "epoch": 3.108991825613079, "grad_norm": 0.762908727732797, "learning_rate": 1.6141302301937785e-05, "loss": 0.0166, "step": 2282 }, { "epoch": 3.1103542234332426, "grad_norm": 3.333665208691447, "learning_rate": 1.6137818968157265e-05, "loss": 0.0215, "step": 2283 }, { "epoch": 3.111716621253406, "grad_norm": 1.8917849608310102, "learning_rate": 1.613433443907975e-05, "loss": 0.0206, "step": 2284 }, { "epoch": 3.1130790190735693, "grad_norm": 2.4247827873095833, "learning_rate": 1.6130848715383827e-05, "loss": 0.0178, "step": 2285 }, { "epoch": 3.114441416893733, "grad_norm": 4.292718696386189, "learning_rate": 1.6127361797748316e-05, "loss": 0.0162, "step": 2286 }, { "epoch": 3.1158038147138964, "grad_norm": 1.7699691555926702, "learning_rate": 1.6123873686852274e-05, "loss": 0.0253, "step": 2287 }, { "epoch": 3.11716621253406, "grad_norm": 4.157052932524091, "learning_rate": 1.612038438337498e-05, "loss": 0.0364, "step": 2288 }, { "epoch": 3.1185286103542236, "grad_norm": 2.353685514077468, "learning_rate": 1.6116893887995954e-05, "loss": 0.0179, "step": 2289 }, { "epoch": 3.119891008174387, "grad_norm": 2.989150772907531, "learning_rate": 1.611340220139495e-05, "loss": 0.0323, "step": 2290 }, { "epoch": 3.1212534059945503, "grad_norm": 0.9154299485823997, "learning_rate": 1.610990932425194e-05, "loss": 0.0496, "step": 2291 }, { "epoch": 3.122615803814714, "grad_norm": 1.023089133654152, "learning_rate": 1.610641525724714e-05, "loss": 0.0367, "step": 2292 }, { "epoch": 3.1239782016348774, "grad_norm": 1.745438761125955, "learning_rate": 1.6102920001061003e-05, "loss": 0.0306, "step": 2293 }, { "epoch": 3.1253405994550407, "grad_norm": 2.367836274326053, "learning_rate": 1.6099423556374198e-05, "loss": 0.0257, "step": 2294 }, { "epoch": 3.1267029972752045, "grad_norm": 1.8043280258305339, "learning_rate": 1.6095925923867636e-05, "loss": 0.0349, "step": 2295 }, { "epoch": 3.128065395095368, "grad_norm": 4.447893832197556, "learning_rate": 1.6092427104222453e-05, "loss": 0.0623, "step": 2296 }, { "epoch": 3.129427792915531, "grad_norm": 1.9191393134841894, "learning_rate": 1.608892709812002e-05, "loss": 0.0257, "step": 2297 }, { "epoch": 3.130790190735695, "grad_norm": 2.678810714833835, "learning_rate": 1.608542590624194e-05, "loss": 0.0262, "step": 2298 }, { "epoch": 3.1321525885558583, "grad_norm": 2.3835246791047644, "learning_rate": 1.608192352927005e-05, "loss": 0.0315, "step": 2299 }, { "epoch": 3.1335149863760217, "grad_norm": 0.8454691973045277, "learning_rate": 1.6078419967886402e-05, "loss": 0.0412, "step": 2300 }, { "epoch": 3.1348773841961854, "grad_norm": 3.1362382684003722, "learning_rate": 1.60749152227733e-05, "loss": 0.0457, "step": 2301 }, { "epoch": 3.136239782016349, "grad_norm": 1.620846074352802, "learning_rate": 1.6071409294613263e-05, "loss": 0.0295, "step": 2302 }, { "epoch": 3.137602179836512, "grad_norm": 2.183961694773792, "learning_rate": 1.606790218408905e-05, "loss": 0.0256, "step": 2303 }, { "epoch": 3.138964577656676, "grad_norm": 2.1715505550270486, "learning_rate": 1.606439389188364e-05, "loss": 0.014, "step": 2304 }, { "epoch": 3.1403269754768393, "grad_norm": 2.183706650476771, "learning_rate": 1.6060884418680255e-05, "loss": 0.0359, "step": 2305 }, { "epoch": 3.1416893732970026, "grad_norm": 3.1452304969490115, "learning_rate": 1.6057373765162333e-05, "loss": 0.0194, "step": 2306 }, { "epoch": 3.1430517711171664, "grad_norm": 1.0348262999545172, "learning_rate": 1.6053861932013556e-05, "loss": 0.0095, "step": 2307 }, { "epoch": 3.1444141689373297, "grad_norm": 2.07523135450355, "learning_rate": 1.6050348919917828e-05, "loss": 0.0222, "step": 2308 }, { "epoch": 3.145776566757493, "grad_norm": 2.7868395566965485, "learning_rate": 1.604683472955928e-05, "loss": 0.0256, "step": 2309 }, { "epoch": 3.147138964577657, "grad_norm": 1.293301532296923, "learning_rate": 1.6043319361622277e-05, "loss": 0.0173, "step": 2310 }, { "epoch": 3.14850136239782, "grad_norm": 1.7476466180786407, "learning_rate": 1.603980281679141e-05, "loss": 0.0096, "step": 2311 }, { "epoch": 3.1498637602179835, "grad_norm": 1.8841748138690264, "learning_rate": 1.60362850957515e-05, "loss": 0.0407, "step": 2312 }, { "epoch": 3.1512261580381473, "grad_norm": 0.9518992364586545, "learning_rate": 1.603276619918761e-05, "loss": 0.0231, "step": 2313 }, { "epoch": 3.1525885558583107, "grad_norm": 1.5673401378081047, "learning_rate": 1.6029246127785008e-05, "loss": 0.0301, "step": 2314 }, { "epoch": 3.153950953678474, "grad_norm": 1.8860782641334153, "learning_rate": 1.6025724882229206e-05, "loss": 0.0156, "step": 2315 }, { "epoch": 3.155313351498638, "grad_norm": 2.1261126523089957, "learning_rate": 1.6022202463205947e-05, "loss": 0.0291, "step": 2316 }, { "epoch": 3.156675749318801, "grad_norm": 1.7559114268256255, "learning_rate": 1.6018678871401187e-05, "loss": 0.0255, "step": 2317 }, { "epoch": 3.1580381471389645, "grad_norm": 1.1838279755436951, "learning_rate": 1.6015154107501132e-05, "loss": 0.0241, "step": 2318 }, { "epoch": 3.1594005449591283, "grad_norm": 2.126825227903771, "learning_rate": 1.60116281721922e-05, "loss": 0.0277, "step": 2319 }, { "epoch": 3.1607629427792916, "grad_norm": 1.8677564771560302, "learning_rate": 1.6008101066161038e-05, "loss": 0.0182, "step": 2320 }, { "epoch": 3.162125340599455, "grad_norm": 2.1857838165317514, "learning_rate": 1.6004572790094535e-05, "loss": 0.0467, "step": 2321 }, { "epoch": 3.1634877384196187, "grad_norm": 1.7116831846514293, "learning_rate": 1.600104334467979e-05, "loss": 0.0328, "step": 2322 }, { "epoch": 3.164850136239782, "grad_norm": 2.2825609496482078, "learning_rate": 1.5997512730604135e-05, "loss": 0.0331, "step": 2323 }, { "epoch": 3.1662125340599454, "grad_norm": 2.6013806122095455, "learning_rate": 1.599398094855514e-05, "loss": 0.0158, "step": 2324 }, { "epoch": 3.167574931880109, "grad_norm": 0.956656934334016, "learning_rate": 1.599044799922059e-05, "loss": 0.0152, "step": 2325 }, { "epoch": 3.1689373297002725, "grad_norm": 0.7258859308865774, "learning_rate": 1.5986913883288505e-05, "loss": 0.0185, "step": 2326 }, { "epoch": 3.170299727520436, "grad_norm": 2.0300294516270214, "learning_rate": 1.5983378601447128e-05, "loss": 0.0307, "step": 2327 }, { "epoch": 3.1716621253405997, "grad_norm": 0.7347530373998817, "learning_rate": 1.5979842154384928e-05, "loss": 0.0242, "step": 2328 }, { "epoch": 3.173024523160763, "grad_norm": 1.6591359936534682, "learning_rate": 1.5976304542790607e-05, "loss": 0.0254, "step": 2329 }, { "epoch": 3.1743869209809263, "grad_norm": 1.8736130068423082, "learning_rate": 1.5972765767353088e-05, "loss": 0.0516, "step": 2330 }, { "epoch": 3.17574931880109, "grad_norm": 2.656323275954217, "learning_rate": 1.5969225828761516e-05, "loss": 0.0628, "step": 2331 }, { "epoch": 3.1771117166212535, "grad_norm": 2.360632130481381, "learning_rate": 1.596568472770528e-05, "loss": 0.025, "step": 2332 }, { "epoch": 3.178474114441417, "grad_norm": 2.385524110372595, "learning_rate": 1.5962142464873985e-05, "loss": 0.0255, "step": 2333 }, { "epoch": 3.1798365122615806, "grad_norm": 1.328176169745769, "learning_rate": 1.595859904095745e-05, "loss": 0.0255, "step": 2334 }, { "epoch": 3.181198910081744, "grad_norm": 0.7883049600440815, "learning_rate": 1.595505445664574e-05, "loss": 0.0065, "step": 2335 }, { "epoch": 3.1825613079019073, "grad_norm": 2.9039073337326817, "learning_rate": 1.595150871262914e-05, "loss": 0.0336, "step": 2336 }, { "epoch": 3.183923705722071, "grad_norm": 3.185226474539221, "learning_rate": 1.594796180959815e-05, "loss": 0.025, "step": 2337 }, { "epoch": 3.1852861035422344, "grad_norm": 2.335530541196924, "learning_rate": 1.5944413748243513e-05, "loss": 0.0342, "step": 2338 }, { "epoch": 3.1866485013623977, "grad_norm": 1.6431678240353997, "learning_rate": 1.5940864529256187e-05, "loss": 0.0237, "step": 2339 }, { "epoch": 3.1880108991825615, "grad_norm": 2.637667757113222, "learning_rate": 1.593731415332735e-05, "loss": 0.0229, "step": 2340 }, { "epoch": 3.189373297002725, "grad_norm": 2.915811522720921, "learning_rate": 1.593376262114842e-05, "loss": 0.0363, "step": 2341 }, { "epoch": 3.190735694822888, "grad_norm": 4.699744112820189, "learning_rate": 1.5930209933411036e-05, "loss": 0.0326, "step": 2342 }, { "epoch": 3.192098092643052, "grad_norm": 1.3479916820612803, "learning_rate": 1.592665609080705e-05, "loss": 0.0262, "step": 2343 }, { "epoch": 3.1934604904632153, "grad_norm": 2.999876971962308, "learning_rate": 1.592310109402855e-05, "loss": 0.0673, "step": 2344 }, { "epoch": 3.1948228882833787, "grad_norm": 1.2938179732527118, "learning_rate": 1.5919544943767856e-05, "loss": 0.0145, "step": 2345 }, { "epoch": 3.1961852861035425, "grad_norm": 2.0461624523617283, "learning_rate": 1.591598764071749e-05, "loss": 0.0078, "step": 2346 }, { "epoch": 3.197547683923706, "grad_norm": 1.7640302888383013, "learning_rate": 1.591242918557022e-05, "loss": 0.0401, "step": 2347 }, { "epoch": 3.198910081743869, "grad_norm": 0.5749510912293452, "learning_rate": 1.5908869579019025e-05, "loss": 0.0297, "step": 2348 }, { "epoch": 3.2002724795640325, "grad_norm": 1.7167232809178605, "learning_rate": 1.590530882175712e-05, "loss": 0.0193, "step": 2349 }, { "epoch": 3.2016348773841963, "grad_norm": 2.3943682192414033, "learning_rate": 1.590174691447793e-05, "loss": 0.0477, "step": 2350 }, { "epoch": 3.2029972752043596, "grad_norm": 1.4667734695302068, "learning_rate": 1.5898183857875115e-05, "loss": 0.0311, "step": 2351 }, { "epoch": 3.204359673024523, "grad_norm": 2.8654206758120555, "learning_rate": 1.5894619652642552e-05, "loss": 0.0454, "step": 2352 }, { "epoch": 3.2057220708446867, "grad_norm": 2.0074163715059905, "learning_rate": 1.5891054299474352e-05, "loss": 0.0343, "step": 2353 }, { "epoch": 3.20708446866485, "grad_norm": 2.220112254952992, "learning_rate": 1.588748779906484e-05, "loss": 0.0404, "step": 2354 }, { "epoch": 3.2084468664850134, "grad_norm": 2.959636455309174, "learning_rate": 1.5883920152108557e-05, "loss": 0.017, "step": 2355 }, { "epoch": 3.209809264305177, "grad_norm": 2.2394702407758147, "learning_rate": 1.588035135930029e-05, "loss": 0.0288, "step": 2356 }, { "epoch": 3.2111716621253406, "grad_norm": 2.5642639883086624, "learning_rate": 1.5876781421335033e-05, "loss": 0.0228, "step": 2357 }, { "epoch": 3.212534059945504, "grad_norm": 3.5181560265013205, "learning_rate": 1.5873210338908002e-05, "loss": 0.0238, "step": 2358 }, { "epoch": 3.2138964577656677, "grad_norm": 3.794410068453196, "learning_rate": 1.5869638112714642e-05, "loss": 0.0538, "step": 2359 }, { "epoch": 3.215258855585831, "grad_norm": 3.115253248197455, "learning_rate": 1.5866064743450618e-05, "loss": 0.0324, "step": 2360 }, { "epoch": 3.2166212534059944, "grad_norm": 2.072752633884402, "learning_rate": 1.5862490231811825e-05, "loss": 0.042, "step": 2361 }, { "epoch": 3.217983651226158, "grad_norm": 2.0431178693273084, "learning_rate": 1.5858914578494365e-05, "loss": 0.0117, "step": 2362 }, { "epoch": 3.2193460490463215, "grad_norm": 4.147963966962615, "learning_rate": 1.5855337784194576e-05, "loss": 0.0262, "step": 2363 }, { "epoch": 3.220708446866485, "grad_norm": 3.433638739127044, "learning_rate": 1.5851759849609016e-05, "loss": 0.0359, "step": 2364 }, { "epoch": 3.2220708446866486, "grad_norm": 3.815565704803568, "learning_rate": 1.5848180775434455e-05, "loss": 0.0378, "step": 2365 }, { "epoch": 3.223433242506812, "grad_norm": 2.130869870957259, "learning_rate": 1.58446005623679e-05, "loss": 0.0223, "step": 2366 }, { "epoch": 3.2247956403269753, "grad_norm": 1.5827682128606926, "learning_rate": 1.584101921110657e-05, "loss": 0.0254, "step": 2367 }, { "epoch": 3.226158038147139, "grad_norm": 2.377472323583393, "learning_rate": 1.5837436722347902e-05, "loss": 0.0354, "step": 2368 }, { "epoch": 3.2275204359673024, "grad_norm": 1.0703907953309508, "learning_rate": 1.5833853096789566e-05, "loss": 0.027, "step": 2369 }, { "epoch": 3.2288828337874658, "grad_norm": 2.841911528981791, "learning_rate": 1.583026833512945e-05, "loss": 0.0361, "step": 2370 }, { "epoch": 3.2302452316076296, "grad_norm": 2.3793093189025143, "learning_rate": 1.5826682438065657e-05, "loss": 0.0227, "step": 2371 }, { "epoch": 3.231607629427793, "grad_norm": 1.5122261191845194, "learning_rate": 1.5823095406296515e-05, "loss": 0.0378, "step": 2372 }, { "epoch": 3.2329700272479562, "grad_norm": 1.9274170683870444, "learning_rate": 1.5819507240520574e-05, "loss": 0.0245, "step": 2373 }, { "epoch": 3.23433242506812, "grad_norm": 3.01826692999813, "learning_rate": 1.581591794143661e-05, "loss": 0.0272, "step": 2374 }, { "epoch": 3.2356948228882834, "grad_norm": 1.4673190711493889, "learning_rate": 1.5812327509743603e-05, "loss": 0.0424, "step": 2375 }, { "epoch": 3.2370572207084467, "grad_norm": 4.404199726627562, "learning_rate": 1.580873594614077e-05, "loss": 0.0116, "step": 2376 }, { "epoch": 3.2384196185286105, "grad_norm": 1.6478833515800002, "learning_rate": 1.5805143251327547e-05, "loss": 0.0486, "step": 2377 }, { "epoch": 3.239782016348774, "grad_norm": 1.6095261545483872, "learning_rate": 1.5801549426003578e-05, "loss": 0.0268, "step": 2378 }, { "epoch": 3.241144414168937, "grad_norm": 1.8957572315298281, "learning_rate": 1.5797954470868737e-05, "loss": 0.036, "step": 2379 }, { "epoch": 3.242506811989101, "grad_norm": 0.7617618631552572, "learning_rate": 1.579435838662312e-05, "loss": 0.0251, "step": 2380 }, { "epoch": 3.2438692098092643, "grad_norm": 1.5224491345466657, "learning_rate": 1.5790761173967036e-05, "loss": 0.0245, "step": 2381 }, { "epoch": 3.2452316076294276, "grad_norm": 2.6653104678936055, "learning_rate": 1.5787162833601017e-05, "loss": 0.0362, "step": 2382 }, { "epoch": 3.2465940054495914, "grad_norm": 0.804825687907921, "learning_rate": 1.578356336622582e-05, "loss": 0.0129, "step": 2383 }, { "epoch": 3.2479564032697548, "grad_norm": 1.8118592899257093, "learning_rate": 1.5779962772542404e-05, "loss": 0.0114, "step": 2384 }, { "epoch": 3.249318801089918, "grad_norm": 2.9065433910388805, "learning_rate": 1.577636105325197e-05, "loss": 0.0335, "step": 2385 }, { "epoch": 3.250681198910082, "grad_norm": 2.187241561692798, "learning_rate": 1.5772758209055918e-05, "loss": 0.0157, "step": 2386 }, { "epoch": 3.2520435967302452, "grad_norm": 2.259768837532865, "learning_rate": 1.5769154240655886e-05, "loss": 0.0162, "step": 2387 }, { "epoch": 3.2534059945504086, "grad_norm": 2.8370412820159157, "learning_rate": 1.5765549148753718e-05, "loss": 0.0308, "step": 2388 }, { "epoch": 3.2547683923705724, "grad_norm": 0.7833525314206444, "learning_rate": 1.576194293405148e-05, "loss": 0.0258, "step": 2389 }, { "epoch": 3.2561307901907357, "grad_norm": 5.406438923185175, "learning_rate": 1.5758335597251456e-05, "loss": 0.0394, "step": 2390 }, { "epoch": 3.257493188010899, "grad_norm": 1.307293752844784, "learning_rate": 1.5754727139056153e-05, "loss": 0.0338, "step": 2391 }, { "epoch": 3.258855585831063, "grad_norm": 4.242195853676302, "learning_rate": 1.5751117560168284e-05, "loss": 0.0053, "step": 2392 }, { "epoch": 3.260217983651226, "grad_norm": 2.1366022226355064, "learning_rate": 1.5747506861290796e-05, "loss": 0.039, "step": 2393 }, { "epoch": 3.2615803814713895, "grad_norm": 2.20974391560119, "learning_rate": 1.5743895043126852e-05, "loss": 0.0269, "step": 2394 }, { "epoch": 3.2629427792915533, "grad_norm": 5.984766883528873, "learning_rate": 1.5740282106379816e-05, "loss": 0.0359, "step": 2395 }, { "epoch": 3.2643051771117166, "grad_norm": 2.1807473219387385, "learning_rate": 1.573666805175329e-05, "loss": 0.0355, "step": 2396 }, { "epoch": 3.26566757493188, "grad_norm": 5.346623580227264, "learning_rate": 1.5733052879951086e-05, "loss": 0.0414, "step": 2397 }, { "epoch": 3.2670299727520438, "grad_norm": 3.72375532943194, "learning_rate": 1.572943659167723e-05, "loss": 0.0171, "step": 2398 }, { "epoch": 3.268392370572207, "grad_norm": 3.115823339407583, "learning_rate": 1.5725819187635968e-05, "loss": 0.0217, "step": 2399 }, { "epoch": 3.2697547683923704, "grad_norm": 6.430473785709965, "learning_rate": 1.572220066853177e-05, "loss": 0.0409, "step": 2400 }, { "epoch": 3.2711171662125342, "grad_norm": 3.3488244202390396, "learning_rate": 1.571858103506931e-05, "loss": 0.0499, "step": 2401 }, { "epoch": 3.2724795640326976, "grad_norm": 7.204712876581532, "learning_rate": 1.5714960287953487e-05, "loss": 0.0244, "step": 2402 }, { "epoch": 3.273841961852861, "grad_norm": 6.541048843064083, "learning_rate": 1.571133842788942e-05, "loss": 0.0405, "step": 2403 }, { "epoch": 3.2752043596730247, "grad_norm": 4.231382848996052, "learning_rate": 1.570771545558244e-05, "loss": 0.0665, "step": 2404 }, { "epoch": 3.276566757493188, "grad_norm": 5.914098254421983, "learning_rate": 1.570409137173809e-05, "loss": 0.0237, "step": 2405 }, { "epoch": 3.2779291553133514, "grad_norm": 2.2058823492813144, "learning_rate": 1.5700466177062145e-05, "loss": 0.0144, "step": 2406 }, { "epoch": 3.279291553133515, "grad_norm": 4.26009862733648, "learning_rate": 1.5696839872260575e-05, "loss": 0.0263, "step": 2407 }, { "epoch": 3.2806539509536785, "grad_norm": 4.269394131187087, "learning_rate": 1.5693212458039585e-05, "loss": 0.0273, "step": 2408 }, { "epoch": 3.282016348773842, "grad_norm": 1.3467754615634782, "learning_rate": 1.5689583935105583e-05, "loss": 0.0178, "step": 2409 }, { "epoch": 3.2833787465940056, "grad_norm": 4.708548137490227, "learning_rate": 1.56859543041652e-05, "loss": 0.0351, "step": 2410 }, { "epoch": 3.284741144414169, "grad_norm": 2.3200549824420804, "learning_rate": 1.5682323565925287e-05, "loss": 0.0348, "step": 2411 }, { "epoch": 3.2861035422343323, "grad_norm": 3.83169066219053, "learning_rate": 1.5678691721092895e-05, "loss": 0.0194, "step": 2412 }, { "epoch": 3.287465940054496, "grad_norm": 3.756579657876847, "learning_rate": 1.5675058770375307e-05, "loss": 0.0516, "step": 2413 }, { "epoch": 3.2888283378746594, "grad_norm": 1.460482942011385, "learning_rate": 1.5671424714480012e-05, "loss": 0.0245, "step": 2414 }, { "epoch": 3.290190735694823, "grad_norm": 1.6856140432649043, "learning_rate": 1.5667789554114713e-05, "loss": 0.0086, "step": 2415 }, { "epoch": 3.291553133514986, "grad_norm": 3.0403569517584, "learning_rate": 1.5664153289987343e-05, "loss": 0.0413, "step": 2416 }, { "epoch": 3.29291553133515, "grad_norm": 1.3101891408728825, "learning_rate": 1.566051592280603e-05, "loss": 0.0365, "step": 2417 }, { "epoch": 3.2942779291553133, "grad_norm": 3.18722261470205, "learning_rate": 1.5656877453279124e-05, "loss": 0.0255, "step": 2418 }, { "epoch": 3.2956403269754766, "grad_norm": 4.313703129003527, "learning_rate": 1.5653237882115194e-05, "loss": 0.0235, "step": 2419 }, { "epoch": 3.2970027247956404, "grad_norm": 4.115186887876881, "learning_rate": 1.5649597210023027e-05, "loss": 0.0336, "step": 2420 }, { "epoch": 3.2983651226158037, "grad_norm": 5.814672800186344, "learning_rate": 1.564595543771161e-05, "loss": 0.0368, "step": 2421 }, { "epoch": 3.299727520435967, "grad_norm": 2.577283442198953, "learning_rate": 1.5642312565890153e-05, "loss": 0.0299, "step": 2422 }, { "epoch": 3.301089918256131, "grad_norm": 5.864406851596754, "learning_rate": 1.5638668595268086e-05, "loss": 0.0332, "step": 2423 }, { "epoch": 3.302452316076294, "grad_norm": 2.667083171689745, "learning_rate": 1.563502352655504e-05, "loss": 0.0221, "step": 2424 }, { "epoch": 3.3038147138964575, "grad_norm": 4.287701495713412, "learning_rate": 1.5631377360460872e-05, "loss": 0.0183, "step": 2425 }, { "epoch": 3.3051771117166213, "grad_norm": 2.290899462410005, "learning_rate": 1.562773009769564e-05, "loss": 0.0302, "step": 2426 }, { "epoch": 3.3065395095367847, "grad_norm": 1.9363903756441958, "learning_rate": 1.562408173896963e-05, "loss": 0.0432, "step": 2427 }, { "epoch": 3.307901907356948, "grad_norm": 1.0438037031803018, "learning_rate": 1.5620432284993327e-05, "loss": 0.0224, "step": 2428 }, { "epoch": 3.309264305177112, "grad_norm": 1.2167300911805408, "learning_rate": 1.5616781736477445e-05, "loss": 0.0241, "step": 2429 }, { "epoch": 3.310626702997275, "grad_norm": 1.3269615749225045, "learning_rate": 1.5613130094132896e-05, "loss": 0.0189, "step": 2430 }, { "epoch": 3.3119891008174385, "grad_norm": 1.9681686778415852, "learning_rate": 1.5609477358670813e-05, "loss": 0.0405, "step": 2431 }, { "epoch": 3.3133514986376023, "grad_norm": 3.157415333608894, "learning_rate": 1.5605823530802543e-05, "loss": 0.0238, "step": 2432 }, { "epoch": 3.3147138964577656, "grad_norm": 3.4874205331016674, "learning_rate": 1.560216861123964e-05, "loss": 0.0573, "step": 2433 }, { "epoch": 3.316076294277929, "grad_norm": 1.74904796091721, "learning_rate": 1.5598512600693875e-05, "loss": 0.0267, "step": 2434 }, { "epoch": 3.3174386920980927, "grad_norm": 1.9224411248004871, "learning_rate": 1.559485549987723e-05, "loss": 0.0164, "step": 2435 }, { "epoch": 3.318801089918256, "grad_norm": 2.8643543032497685, "learning_rate": 1.55911973095019e-05, "loss": 0.0288, "step": 2436 }, { "epoch": 3.3201634877384194, "grad_norm": 1.3600975111838072, "learning_rate": 1.5587538030280293e-05, "loss": 0.0129, "step": 2437 }, { "epoch": 3.321525885558583, "grad_norm": 2.173530973687945, "learning_rate": 1.5583877662925032e-05, "loss": 0.0638, "step": 2438 }, { "epoch": 3.3228882833787465, "grad_norm": 2.8623679985070525, "learning_rate": 1.5580216208148935e-05, "loss": 0.0429, "step": 2439 }, { "epoch": 3.32425068119891, "grad_norm": 4.743574053014761, "learning_rate": 1.5576553666665054e-05, "loss": 0.0174, "step": 2440 }, { "epoch": 3.3256130790190737, "grad_norm": 1.459517788146225, "learning_rate": 1.557289003918664e-05, "loss": 0.0168, "step": 2441 }, { "epoch": 3.326975476839237, "grad_norm": 2.3465118750534857, "learning_rate": 1.5569225326427163e-05, "loss": 0.0268, "step": 2442 }, { "epoch": 3.3283378746594003, "grad_norm": 2.371375319878403, "learning_rate": 1.5565559529100293e-05, "loss": 0.0206, "step": 2443 }, { "epoch": 3.329700272479564, "grad_norm": 2.9128117267047178, "learning_rate": 1.5561892647919922e-05, "loss": 0.0498, "step": 2444 }, { "epoch": 3.3310626702997275, "grad_norm": 1.7441723208291828, "learning_rate": 1.5558224683600154e-05, "loss": 0.0255, "step": 2445 }, { "epoch": 3.332425068119891, "grad_norm": 2.7384361086580347, "learning_rate": 1.5554555636855293e-05, "loss": 0.0264, "step": 2446 }, { "epoch": 3.3337874659400546, "grad_norm": 1.1393275623049952, "learning_rate": 1.5550885508399857e-05, "loss": 0.0147, "step": 2447 }, { "epoch": 3.335149863760218, "grad_norm": 2.2306136616935675, "learning_rate": 1.5547214298948587e-05, "loss": 0.0395, "step": 2448 }, { "epoch": 3.3365122615803813, "grad_norm": 2.305701529706429, "learning_rate": 1.554354200921642e-05, "loss": 0.0198, "step": 2449 }, { "epoch": 3.337874659400545, "grad_norm": 2.3986694077349173, "learning_rate": 1.5539868639918507e-05, "loss": 0.0306, "step": 2450 }, { "epoch": 3.3392370572207084, "grad_norm": 2.705267907819014, "learning_rate": 1.5536194191770213e-05, "loss": 0.0288, "step": 2451 }, { "epoch": 3.3405994550408717, "grad_norm": 1.5732123434313738, "learning_rate": 1.5532518665487112e-05, "loss": 0.0497, "step": 2452 }, { "epoch": 3.3419618528610355, "grad_norm": 1.6828011865734345, "learning_rate": 1.552884206178498e-05, "loss": 0.0237, "step": 2453 }, { "epoch": 3.343324250681199, "grad_norm": 2.2395523340852383, "learning_rate": 1.5525164381379823e-05, "loss": 0.0549, "step": 2454 }, { "epoch": 3.344686648501362, "grad_norm": 1.5933713889164955, "learning_rate": 1.552148562498783e-05, "loss": 0.0284, "step": 2455 }, { "epoch": 3.346049046321526, "grad_norm": 1.003529536728463, "learning_rate": 1.551780579332542e-05, "loss": 0.0433, "step": 2456 }, { "epoch": 3.3474114441416893, "grad_norm": 1.5197235685138346, "learning_rate": 1.5514124887109213e-05, "loss": 0.042, "step": 2457 }, { "epoch": 3.3487738419618527, "grad_norm": 2.314342798745496, "learning_rate": 1.5510442907056038e-05, "loss": 0.0442, "step": 2458 }, { "epoch": 3.3501362397820165, "grad_norm": 4.258083855539177, "learning_rate": 1.5506759853882936e-05, "loss": 0.0413, "step": 2459 }, { "epoch": 3.35149863760218, "grad_norm": 0.8782570760588443, "learning_rate": 1.5503075728307157e-05, "loss": 0.0189, "step": 2460 }, { "epoch": 3.352861035422343, "grad_norm": 2.3702598062056928, "learning_rate": 1.5499390531046153e-05, "loss": 0.0342, "step": 2461 }, { "epoch": 3.354223433242507, "grad_norm": 4.117101535995388, "learning_rate": 1.5495704262817595e-05, "loss": 0.0353, "step": 2462 }, { "epoch": 3.3555858310626703, "grad_norm": 1.484382216471467, "learning_rate": 1.549201692433936e-05, "loss": 0.0179, "step": 2463 }, { "epoch": 3.3569482288828336, "grad_norm": 2.3459424785354726, "learning_rate": 1.5488328516329528e-05, "loss": 0.0381, "step": 2464 }, { "epoch": 3.3583106267029974, "grad_norm": 4.924634175309881, "learning_rate": 1.548463903950639e-05, "loss": 0.018, "step": 2465 }, { "epoch": 3.3596730245231607, "grad_norm": 1.306653155039036, "learning_rate": 1.548094849458844e-05, "loss": 0.0276, "step": 2466 }, { "epoch": 3.361035422343324, "grad_norm": 3.009254141941765, "learning_rate": 1.54772568822944e-05, "loss": 0.0469, "step": 2467 }, { "epoch": 3.362397820163488, "grad_norm": 5.269294964975472, "learning_rate": 1.5473564203343173e-05, "loss": 0.0426, "step": 2468 }, { "epoch": 3.363760217983651, "grad_norm": 2.465330836621245, "learning_rate": 1.546987045845389e-05, "loss": 0.0349, "step": 2469 }, { "epoch": 3.3651226158038146, "grad_norm": 5.535297664286983, "learning_rate": 1.5466175648345875e-05, "loss": 0.029, "step": 2470 }, { "epoch": 3.3664850136239783, "grad_norm": 3.004230870278467, "learning_rate": 1.546247977373867e-05, "loss": 0.0369, "step": 2471 }, { "epoch": 3.3678474114441417, "grad_norm": 5.445947519771516, "learning_rate": 1.545878283535202e-05, "loss": 0.0262, "step": 2472 }, { "epoch": 3.369209809264305, "grad_norm": 8.15009719767928, "learning_rate": 1.5455084833905875e-05, "loss": 0.0397, "step": 2473 }, { "epoch": 3.370572207084469, "grad_norm": 3.710759239164355, "learning_rate": 1.5451385770120402e-05, "loss": 0.0369, "step": 2474 }, { "epoch": 3.371934604904632, "grad_norm": 7.532491471890521, "learning_rate": 1.5447685644715963e-05, "loss": 0.0353, "step": 2475 }, { "epoch": 3.3732970027247955, "grad_norm": 2.8658917879176955, "learning_rate": 1.544398445841313e-05, "loss": 0.0126, "step": 2476 }, { "epoch": 3.3746594005449593, "grad_norm": 5.47874710710165, "learning_rate": 1.544028221193268e-05, "loss": 0.0204, "step": 2477 }, { "epoch": 3.3760217983651226, "grad_norm": 5.175731949020486, "learning_rate": 1.5436578905995612e-05, "loss": 0.0226, "step": 2478 }, { "epoch": 3.377384196185286, "grad_norm": 2.0421133046972213, "learning_rate": 1.5432874541323105e-05, "loss": 0.0238, "step": 2479 }, { "epoch": 3.3787465940054497, "grad_norm": 5.4689262966417225, "learning_rate": 1.5429169118636566e-05, "loss": 0.0263, "step": 2480 }, { "epoch": 3.380108991825613, "grad_norm": 5.096448757637276, "learning_rate": 1.5425462638657597e-05, "loss": 0.0079, "step": 2481 }, { "epoch": 3.3814713896457764, "grad_norm": 1.1369108465169553, "learning_rate": 1.5421755102108008e-05, "loss": 0.0236, "step": 2482 }, { "epoch": 3.38283378746594, "grad_norm": 5.763663801180963, "learning_rate": 1.5418046509709817e-05, "loss": 0.0256, "step": 2483 }, { "epoch": 3.3841961852861036, "grad_norm": 2.486036774852712, "learning_rate": 1.5414336862185246e-05, "loss": 0.0066, "step": 2484 }, { "epoch": 3.385558583106267, "grad_norm": 0.9494608331946462, "learning_rate": 1.541062616025672e-05, "loss": 0.0035, "step": 2485 }, { "epoch": 3.3869209809264307, "grad_norm": 2.281362421800216, "learning_rate": 1.540691440464688e-05, "loss": 0.0209, "step": 2486 }, { "epoch": 3.388283378746594, "grad_norm": 3.755841344876947, "learning_rate": 1.540320159607856e-05, "loss": 0.0341, "step": 2487 }, { "epoch": 3.3896457765667574, "grad_norm": 1.5787235226194292, "learning_rate": 1.5399487735274795e-05, "loss": 0.01, "step": 2488 }, { "epoch": 3.391008174386921, "grad_norm": 4.899575462024144, "learning_rate": 1.5395772822958844e-05, "loss": 0.0195, "step": 2489 }, { "epoch": 3.3923705722070845, "grad_norm": 2.735356394290371, "learning_rate": 1.539205685985416e-05, "loss": 0.0228, "step": 2490 }, { "epoch": 3.393732970027248, "grad_norm": 2.854519042976746, "learning_rate": 1.5388339846684396e-05, "loss": 0.0346, "step": 2491 }, { "epoch": 3.3950953678474116, "grad_norm": 6.015976184400875, "learning_rate": 1.5384621784173415e-05, "loss": 0.0208, "step": 2492 }, { "epoch": 3.396457765667575, "grad_norm": 1.3762342855384542, "learning_rate": 1.5380902673045284e-05, "loss": 0.0099, "step": 2493 }, { "epoch": 3.3978201634877383, "grad_norm": 5.8912261808416755, "learning_rate": 1.5377182514024268e-05, "loss": 0.0807, "step": 2494 }, { "epoch": 3.399182561307902, "grad_norm": 3.768450385699161, "learning_rate": 1.5373461307834854e-05, "loss": 0.0334, "step": 2495 }, { "epoch": 3.4005449591280654, "grad_norm": 2.3131783196689897, "learning_rate": 1.5369739055201713e-05, "loss": 0.0203, "step": 2496 }, { "epoch": 3.4019073569482288, "grad_norm": 4.570171263043837, "learning_rate": 1.5366015756849728e-05, "loss": 0.0231, "step": 2497 }, { "epoch": 3.4032697547683926, "grad_norm": 3.4986443206153486, "learning_rate": 1.5362291413503984e-05, "loss": 0.0282, "step": 2498 }, { "epoch": 3.404632152588556, "grad_norm": 3.8263102744310853, "learning_rate": 1.5358566025889775e-05, "loss": 0.0095, "step": 2499 }, { "epoch": 3.4059945504087192, "grad_norm": 5.999290854495908, "learning_rate": 1.535483959473259e-05, "loss": 0.0388, "step": 2500 }, { "epoch": 3.407356948228883, "grad_norm": 2.2032138428391432, "learning_rate": 1.5351112120758122e-05, "loss": 0.0165, "step": 2501 }, { "epoch": 3.4087193460490464, "grad_norm": 4.005796050361744, "learning_rate": 1.534738360469228e-05, "loss": 0.018, "step": 2502 }, { "epoch": 3.4100817438692097, "grad_norm": 2.9548549493089147, "learning_rate": 1.534365404726116e-05, "loss": 0.013, "step": 2503 }, { "epoch": 3.4114441416893735, "grad_norm": 2.7127416588094513, "learning_rate": 1.5339923449191067e-05, "loss": 0.0602, "step": 2504 }, { "epoch": 3.412806539509537, "grad_norm": 4.319500265699035, "learning_rate": 1.5336191811208508e-05, "loss": 0.0287, "step": 2505 }, { "epoch": 3.4141689373297, "grad_norm": 1.166400422731748, "learning_rate": 1.53324591340402e-05, "loss": 0.0341, "step": 2506 }, { "epoch": 3.415531335149864, "grad_norm": 3.305849285444634, "learning_rate": 1.5328725418413045e-05, "loss": 0.027, "step": 2507 }, { "epoch": 3.4168937329700273, "grad_norm": 4.0456075887531675, "learning_rate": 1.5324990665054165e-05, "loss": 0.0292, "step": 2508 }, { "epoch": 3.4182561307901906, "grad_norm": 3.2208784514256354, "learning_rate": 1.5321254874690876e-05, "loss": 0.0621, "step": 2509 }, { "epoch": 3.4196185286103544, "grad_norm": 4.351389967434829, "learning_rate": 1.5317518048050698e-05, "loss": 0.0231, "step": 2510 }, { "epoch": 3.4209809264305178, "grad_norm": 2.445713781282391, "learning_rate": 1.5313780185861344e-05, "loss": 0.0388, "step": 2511 }, { "epoch": 3.422343324250681, "grad_norm": 2.833136768180505, "learning_rate": 1.5310041288850747e-05, "loss": 0.0267, "step": 2512 }, { "epoch": 3.423705722070845, "grad_norm": 6.393807603426435, "learning_rate": 1.530630135774702e-05, "loss": 0.0413, "step": 2513 }, { "epoch": 3.4250681198910082, "grad_norm": 1.9843671539422294, "learning_rate": 1.53025603932785e-05, "loss": 0.029, "step": 2514 }, { "epoch": 3.4264305177111716, "grad_norm": 5.568346634967174, "learning_rate": 1.5298818396173707e-05, "loss": 0.0397, "step": 2515 }, { "epoch": 3.4277929155313354, "grad_norm": 3.9867695441903312, "learning_rate": 1.5295075367161366e-05, "loss": 0.0275, "step": 2516 }, { "epoch": 3.4291553133514987, "grad_norm": 2.772640302313864, "learning_rate": 1.529133130697041e-05, "loss": 0.03, "step": 2517 }, { "epoch": 3.430517711171662, "grad_norm": 6.343056317073301, "learning_rate": 1.5287586216329966e-05, "loss": 0.0223, "step": 2518 }, { "epoch": 3.431880108991826, "grad_norm": 2.9379016016991515, "learning_rate": 1.5283840095969365e-05, "loss": 0.0366, "step": 2519 }, { "epoch": 3.433242506811989, "grad_norm": 2.5508910036606642, "learning_rate": 1.528009294661814e-05, "loss": 0.0227, "step": 2520 }, { "epoch": 3.4346049046321525, "grad_norm": 6.372724361435863, "learning_rate": 1.527634476900602e-05, "loss": 0.0475, "step": 2521 }, { "epoch": 3.4359673024523163, "grad_norm": 2.5062525966694618, "learning_rate": 1.5272595563862935e-05, "loss": 0.022, "step": 2522 }, { "epoch": 3.4373297002724796, "grad_norm": 5.029824473080129, "learning_rate": 1.5268845331919013e-05, "loss": 0.0251, "step": 2523 }, { "epoch": 3.438692098092643, "grad_norm": 3.650126090439522, "learning_rate": 1.526509407390459e-05, "loss": 0.0411, "step": 2524 }, { "epoch": 3.4400544959128068, "grad_norm": 3.353523305338685, "learning_rate": 1.5261341790550196e-05, "loss": 0.0166, "step": 2525 }, { "epoch": 3.44141689373297, "grad_norm": 4.258941768628841, "learning_rate": 1.5257588482586566e-05, "loss": 0.0154, "step": 2526 }, { "epoch": 3.4427792915531334, "grad_norm": 1.9325284646502678, "learning_rate": 1.5253834150744625e-05, "loss": 0.0411, "step": 2527 }, { "epoch": 3.4441416893732972, "grad_norm": 1.627930272348953, "learning_rate": 1.5250078795755498e-05, "loss": 0.0162, "step": 2528 }, { "epoch": 3.4455040871934606, "grad_norm": 3.3948043375191563, "learning_rate": 1.5246322418350525e-05, "loss": 0.0086, "step": 2529 }, { "epoch": 3.446866485013624, "grad_norm": 2.4423170123934366, "learning_rate": 1.5242565019261229e-05, "loss": 0.042, "step": 2530 }, { "epoch": 3.4482288828337877, "grad_norm": 3.922543005432571, "learning_rate": 1.5238806599219337e-05, "loss": 0.0344, "step": 2531 }, { "epoch": 3.449591280653951, "grad_norm": 1.9048959320326084, "learning_rate": 1.5235047158956775e-05, "loss": 0.0137, "step": 2532 }, { "epoch": 3.4509536784741144, "grad_norm": 1.9387407353562258, "learning_rate": 1.5231286699205665e-05, "loss": 0.0465, "step": 2533 }, { "epoch": 3.452316076294278, "grad_norm": 2.916889423877399, "learning_rate": 1.5227525220698332e-05, "loss": 0.0495, "step": 2534 }, { "epoch": 3.4536784741144415, "grad_norm": 3.993507200136313, "learning_rate": 1.5223762724167299e-05, "loss": 0.0325, "step": 2535 }, { "epoch": 3.455040871934605, "grad_norm": 1.260304219556344, "learning_rate": 1.5219999210345285e-05, "loss": 0.0252, "step": 2536 }, { "epoch": 3.4564032697547686, "grad_norm": 3.6956622926285307, "learning_rate": 1.5216234679965205e-05, "loss": 0.034, "step": 2537 }, { "epoch": 3.457765667574932, "grad_norm": 3.905283377706892, "learning_rate": 1.5212469133760178e-05, "loss": 0.0465, "step": 2538 }, { "epoch": 3.4591280653950953, "grad_norm": 2.612065263297633, "learning_rate": 1.5208702572463519e-05, "loss": 0.0261, "step": 2539 }, { "epoch": 3.460490463215259, "grad_norm": 6.303432831663987, "learning_rate": 1.5204934996808729e-05, "loss": 0.0578, "step": 2540 }, { "epoch": 3.4618528610354224, "grad_norm": 1.1655724958530675, "learning_rate": 1.5201166407529533e-05, "loss": 0.0116, "step": 2541 }, { "epoch": 3.463215258855586, "grad_norm": 4.000170852804495, "learning_rate": 1.5197396805359823e-05, "loss": 0.0207, "step": 2542 }, { "epoch": 3.464577656675749, "grad_norm": 4.193644395665097, "learning_rate": 1.5193626191033713e-05, "loss": 0.0487, "step": 2543 }, { "epoch": 3.465940054495913, "grad_norm": 4.3092705920674135, "learning_rate": 1.5189854565285495e-05, "loss": 0.0361, "step": 2544 }, { "epoch": 3.4673024523160763, "grad_norm": 1.757780717801722, "learning_rate": 1.5186081928849672e-05, "loss": 0.0471, "step": 2545 }, { "epoch": 3.4686648501362396, "grad_norm": 5.133848280373157, "learning_rate": 1.5182308282460936e-05, "loss": 0.0304, "step": 2546 }, { "epoch": 3.4700272479564034, "grad_norm": 3.6714093416216156, "learning_rate": 1.5178533626854179e-05, "loss": 0.0158, "step": 2547 }, { "epoch": 3.4713896457765667, "grad_norm": 1.9111301372411988, "learning_rate": 1.5174757962764488e-05, "loss": 0.0095, "step": 2548 }, { "epoch": 3.47275204359673, "grad_norm": 4.603393675550013, "learning_rate": 1.5170981290927148e-05, "loss": 0.0286, "step": 2549 }, { "epoch": 3.474114441416894, "grad_norm": 3.1349201450336954, "learning_rate": 1.5167203612077638e-05, "loss": 0.0178, "step": 2550 }, { "epoch": 3.475476839237057, "grad_norm": 3.2339153390988806, "learning_rate": 1.5163424926951638e-05, "loss": 0.0207, "step": 2551 }, { "epoch": 3.4768392370572205, "grad_norm": 4.612960396212066, "learning_rate": 1.515964523628501e-05, "loss": 0.0437, "step": 2552 }, { "epoch": 3.4782016348773843, "grad_norm": 1.8193608998839421, "learning_rate": 1.5155864540813837e-05, "loss": 0.0574, "step": 2553 }, { "epoch": 3.4795640326975477, "grad_norm": 1.2590588478791882, "learning_rate": 1.5152082841274368e-05, "loss": 0.0321, "step": 2554 }, { "epoch": 3.480926430517711, "grad_norm": 4.4073211865614965, "learning_rate": 1.5148300138403076e-05, "loss": 0.0239, "step": 2555 }, { "epoch": 3.482288828337875, "grad_norm": 1.8079972239030417, "learning_rate": 1.5144516432936604e-05, "loss": 0.0325, "step": 2556 }, { "epoch": 3.483651226158038, "grad_norm": 1.5456812020364126, "learning_rate": 1.5140731725611808e-05, "loss": 0.0328, "step": 2557 }, { "epoch": 3.4850136239782015, "grad_norm": 4.2023911626903905, "learning_rate": 1.5136946017165734e-05, "loss": 0.0236, "step": 2558 }, { "epoch": 3.4863760217983653, "grad_norm": 2.922941506573032, "learning_rate": 1.5133159308335621e-05, "loss": 0.0274, "step": 2559 }, { "epoch": 3.4877384196185286, "grad_norm": 4.460777222074488, "learning_rate": 1.51293715998589e-05, "loss": 0.0457, "step": 2560 }, { "epoch": 3.489100817438692, "grad_norm": 0.6230017644350958, "learning_rate": 1.5125582892473206e-05, "loss": 0.0292, "step": 2561 }, { "epoch": 3.4904632152588557, "grad_norm": 1.1918239722104556, "learning_rate": 1.512179318691636e-05, "loss": 0.0232, "step": 2562 }, { "epoch": 3.491825613079019, "grad_norm": 1.8749607464712161, "learning_rate": 1.5118002483926381e-05, "loss": 0.0287, "step": 2563 }, { "epoch": 3.4931880108991824, "grad_norm": 2.1866881261167443, "learning_rate": 1.5114210784241483e-05, "loss": 0.0142, "step": 2564 }, { "epoch": 3.494550408719346, "grad_norm": 1.5300249907319317, "learning_rate": 1.5110418088600071e-05, "loss": 0.0388, "step": 2565 }, { "epoch": 3.4959128065395095, "grad_norm": 4.017070538908385, "learning_rate": 1.5106624397740745e-05, "loss": 0.0528, "step": 2566 }, { "epoch": 3.497275204359673, "grad_norm": 2.7861542557745738, "learning_rate": 1.5102829712402303e-05, "loss": 0.0307, "step": 2567 }, { "epoch": 3.4986376021798367, "grad_norm": 1.675009481165617, "learning_rate": 1.5099034033323732e-05, "loss": 0.0114, "step": 2568 }, { "epoch": 3.5, "grad_norm": 2.068446588372538, "learning_rate": 1.5095237361244207e-05, "loss": 0.0222, "step": 2569 }, { "epoch": 3.5013623978201633, "grad_norm": 2.0310618164733847, "learning_rate": 1.5091439696903116e-05, "loss": 0.039, "step": 2570 }, { "epoch": 3.502724795640327, "grad_norm": 3.4189639256488404, "learning_rate": 1.5087641041040018e-05, "loss": 0.0228, "step": 2571 }, { "epoch": 3.5040871934604905, "grad_norm": 1.00997961289068, "learning_rate": 1.5083841394394676e-05, "loss": 0.0174, "step": 2572 }, { "epoch": 3.505449591280654, "grad_norm": 3.8426269389823084, "learning_rate": 1.5080040757707045e-05, "loss": 0.0267, "step": 2573 }, { "epoch": 3.5068119891008176, "grad_norm": 1.9917554859659174, "learning_rate": 1.5076239131717279e-05, "loss": 0.0386, "step": 2574 }, { "epoch": 3.508174386920981, "grad_norm": 1.4910142100236552, "learning_rate": 1.5072436517165703e-05, "loss": 0.0285, "step": 2575 }, { "epoch": 3.5095367847411443, "grad_norm": 1.3204380795746467, "learning_rate": 1.5068632914792862e-05, "loss": 0.02, "step": 2576 }, { "epoch": 3.510899182561308, "grad_norm": 2.1035902659472265, "learning_rate": 1.5064828325339475e-05, "loss": 0.0395, "step": 2577 }, { "epoch": 3.5122615803814714, "grad_norm": 1.383566540215782, "learning_rate": 1.506102274954646e-05, "loss": 0.0428, "step": 2578 }, { "epoch": 3.5136239782016347, "grad_norm": 3.4389128047981936, "learning_rate": 1.5057216188154928e-05, "loss": 0.0313, "step": 2579 }, { "epoch": 3.5149863760217985, "grad_norm": 2.5301103478088236, "learning_rate": 1.5053408641906181e-05, "loss": 0.0277, "step": 2580 }, { "epoch": 3.516348773841962, "grad_norm": 2.091955524527521, "learning_rate": 1.5049600111541704e-05, "loss": 0.0335, "step": 2581 }, { "epoch": 3.517711171662125, "grad_norm": 2.4234649113857634, "learning_rate": 1.5045790597803191e-05, "loss": 0.0268, "step": 2582 }, { "epoch": 3.5190735694822886, "grad_norm": 2.588137429114274, "learning_rate": 1.504198010143251e-05, "loss": 0.0189, "step": 2583 }, { "epoch": 3.5204359673024523, "grad_norm": 1.6642161481353421, "learning_rate": 1.5038168623171736e-05, "loss": 0.0236, "step": 2584 }, { "epoch": 3.5217983651226157, "grad_norm": 1.4031745408779421, "learning_rate": 1.5034356163763125e-05, "loss": 0.027, "step": 2585 }, { "epoch": 3.523160762942779, "grad_norm": 4.127877770883526, "learning_rate": 1.5030542723949122e-05, "loss": 0.0046, "step": 2586 }, { "epoch": 3.524523160762943, "grad_norm": 1.5793537103825133, "learning_rate": 1.502672830447237e-05, "loss": 0.0284, "step": 2587 }, { "epoch": 3.525885558583106, "grad_norm": 3.8339833048403498, "learning_rate": 1.5022912906075703e-05, "loss": 0.0322, "step": 2588 }, { "epoch": 3.5272479564032695, "grad_norm": 3.2387220779998924, "learning_rate": 1.501909652950214e-05, "loss": 0.0257, "step": 2589 }, { "epoch": 3.5286103542234333, "grad_norm": 1.378057612541891, "learning_rate": 1.5015279175494897e-05, "loss": 0.0538, "step": 2590 }, { "epoch": 3.5299727520435966, "grad_norm": 5.037619368002515, "learning_rate": 1.5011460844797372e-05, "loss": 0.0288, "step": 2591 }, { "epoch": 3.53133514986376, "grad_norm": 2.8151511917762986, "learning_rate": 1.5007641538153161e-05, "loss": 0.0398, "step": 2592 }, { "epoch": 3.5326975476839237, "grad_norm": 1.9888423749634212, "learning_rate": 1.5003821256306049e-05, "loss": 0.0202, "step": 2593 }, { "epoch": 3.534059945504087, "grad_norm": 2.3397484973109215, "learning_rate": 1.5000000000000002e-05, "loss": 0.0203, "step": 2594 }, { "epoch": 3.5354223433242504, "grad_norm": 4.123537122594973, "learning_rate": 1.4996177769979188e-05, "loss": 0.0291, "step": 2595 }, { "epoch": 3.536784741144414, "grad_norm": 4.8387152364530275, "learning_rate": 1.499235456698796e-05, "loss": 0.0165, "step": 2596 }, { "epoch": 3.5381471389645776, "grad_norm": 2.341743122823418, "learning_rate": 1.4988530391770857e-05, "loss": 0.0222, "step": 2597 }, { "epoch": 3.539509536784741, "grad_norm": 1.7578059878371604, "learning_rate": 1.4984705245072613e-05, "loss": 0.0231, "step": 2598 }, { "epoch": 3.5408719346049047, "grad_norm": 1.9683136508223298, "learning_rate": 1.4980879127638146e-05, "loss": 0.0213, "step": 2599 }, { "epoch": 3.542234332425068, "grad_norm": 1.2635423052909103, "learning_rate": 1.4977052040212567e-05, "loss": 0.048, "step": 2600 }, { "epoch": 3.5435967302452314, "grad_norm": 1.364452045153054, "learning_rate": 1.4973223983541172e-05, "loss": 0.0389, "step": 2601 }, { "epoch": 3.544959128065395, "grad_norm": 1.6305238144346155, "learning_rate": 1.4969394958369453e-05, "loss": 0.0411, "step": 2602 }, { "epoch": 3.5463215258855585, "grad_norm": 2.0147785601557473, "learning_rate": 1.4965564965443077e-05, "loss": 0.0163, "step": 2603 }, { "epoch": 3.547683923705722, "grad_norm": 2.74251659758781, "learning_rate": 1.4961734005507917e-05, "loss": 0.055, "step": 2604 }, { "epoch": 3.5490463215258856, "grad_norm": 2.2307560286626105, "learning_rate": 1.4957902079310023e-05, "loss": 0.0332, "step": 2605 }, { "epoch": 3.550408719346049, "grad_norm": 1.1898489171055713, "learning_rate": 1.4954069187595633e-05, "loss": 0.0306, "step": 2606 }, { "epoch": 3.5517711171662123, "grad_norm": 2.5021818554583346, "learning_rate": 1.4950235331111173e-05, "loss": 0.0163, "step": 2607 }, { "epoch": 3.553133514986376, "grad_norm": 0.6073955845854401, "learning_rate": 1.494640051060327e-05, "loss": 0.0031, "step": 2608 }, { "epoch": 3.5544959128065394, "grad_norm": 3.7464349070408085, "learning_rate": 1.4942564726818721e-05, "loss": 0.0687, "step": 2609 }, { "epoch": 3.5558583106267028, "grad_norm": 2.119567248947708, "learning_rate": 1.4938727980504517e-05, "loss": 0.0678, "step": 2610 }, { "epoch": 3.5572207084468666, "grad_norm": 3.7540698747095536, "learning_rate": 1.493489027240784e-05, "loss": 0.051, "step": 2611 }, { "epoch": 3.55858310626703, "grad_norm": 3.558790361745178, "learning_rate": 1.4931051603276055e-05, "loss": 0.0426, "step": 2612 }, { "epoch": 3.5599455040871932, "grad_norm": 1.6323963256709801, "learning_rate": 1.4927211973856717e-05, "loss": 0.0284, "step": 2613 }, { "epoch": 3.561307901907357, "grad_norm": 4.239470926963566, "learning_rate": 1.4923371384897567e-05, "loss": 0.0394, "step": 2614 }, { "epoch": 3.5626702997275204, "grad_norm": 2.689700060508437, "learning_rate": 1.4919529837146529e-05, "loss": 0.0425, "step": 2615 }, { "epoch": 3.5640326975476837, "grad_norm": 2.4965492399145406, "learning_rate": 1.4915687331351725e-05, "loss": 0.0154, "step": 2616 }, { "epoch": 3.5653950953678475, "grad_norm": 2.320047394158472, "learning_rate": 1.4911843868261449e-05, "loss": 0.0466, "step": 2617 }, { "epoch": 3.566757493188011, "grad_norm": 1.1119924070666536, "learning_rate": 1.4907999448624188e-05, "loss": 0.0077, "step": 2618 }, { "epoch": 3.568119891008174, "grad_norm": 1.2882773516019148, "learning_rate": 1.4904154073188622e-05, "loss": 0.0317, "step": 2619 }, { "epoch": 3.569482288828338, "grad_norm": 3.091933945253336, "learning_rate": 1.4900307742703606e-05, "loss": 0.0219, "step": 2620 }, { "epoch": 3.5708446866485013, "grad_norm": 2.448094126777468, "learning_rate": 1.4896460457918184e-05, "loss": 0.0382, "step": 2621 }, { "epoch": 3.5722070844686646, "grad_norm": 0.7929489233470784, "learning_rate": 1.4892612219581596e-05, "loss": 0.0067, "step": 2622 }, { "epoch": 3.5735694822888284, "grad_norm": 2.6809483279940975, "learning_rate": 1.4888763028443249e-05, "loss": 0.0358, "step": 2623 }, { "epoch": 3.5749318801089918, "grad_norm": 2.0432429428792904, "learning_rate": 1.4884912885252751e-05, "loss": 0.0408, "step": 2624 }, { "epoch": 3.576294277929155, "grad_norm": 2.585410806048396, "learning_rate": 1.488106179075989e-05, "loss": 0.0269, "step": 2625 }, { "epoch": 3.577656675749319, "grad_norm": 1.3828751167488833, "learning_rate": 1.487720974571464e-05, "loss": 0.0086, "step": 2626 }, { "epoch": 3.5790190735694822, "grad_norm": 6.006109466775856, "learning_rate": 1.487335675086716e-05, "loss": 0.0721, "step": 2627 }, { "epoch": 3.5803814713896456, "grad_norm": 2.107719027655656, "learning_rate": 1.486950280696779e-05, "loss": 0.0139, "step": 2628 }, { "epoch": 3.5817438692098094, "grad_norm": 3.066818425555037, "learning_rate": 1.4865647914767066e-05, "loss": 0.0502, "step": 2629 }, { "epoch": 3.5831062670299727, "grad_norm": 4.160755697589673, "learning_rate": 1.4861792075015688e-05, "loss": 0.0208, "step": 2630 }, { "epoch": 3.584468664850136, "grad_norm": 2.7592379155688023, "learning_rate": 1.4857935288464566e-05, "loss": 0.0381, "step": 2631 }, { "epoch": 3.5858310626703, "grad_norm": 5.024216263749785, "learning_rate": 1.4854077555864778e-05, "loss": 0.0437, "step": 2632 }, { "epoch": 3.587193460490463, "grad_norm": 1.751912110683383, "learning_rate": 1.485021887796759e-05, "loss": 0.0296, "step": 2633 }, { "epoch": 3.5885558583106265, "grad_norm": 3.4543922025407556, "learning_rate": 1.4846359255524451e-05, "loss": 0.0239, "step": 2634 }, { "epoch": 3.5899182561307903, "grad_norm": 4.587287363451887, "learning_rate": 1.4842498689287e-05, "loss": 0.0399, "step": 2635 }, { "epoch": 3.5912806539509536, "grad_norm": 2.3154834703290468, "learning_rate": 1.4838637180007048e-05, "loss": 0.0545, "step": 2636 }, { "epoch": 3.592643051771117, "grad_norm": 3.071896811252783, "learning_rate": 1.4834774728436598e-05, "loss": 0.0163, "step": 2637 }, { "epoch": 3.5940054495912808, "grad_norm": 3.1299129294761197, "learning_rate": 1.483091133532784e-05, "loss": 0.0379, "step": 2638 }, { "epoch": 3.595367847411444, "grad_norm": 2.014192366074853, "learning_rate": 1.4827047001433142e-05, "loss": 0.0322, "step": 2639 }, { "epoch": 3.5967302452316074, "grad_norm": 4.1596950227464555, "learning_rate": 1.4823181727505054e-05, "loss": 0.014, "step": 2640 }, { "epoch": 3.5980926430517712, "grad_norm": 2.220492630273376, "learning_rate": 1.4819315514296306e-05, "loss": 0.0262, "step": 2641 }, { "epoch": 3.5994550408719346, "grad_norm": 2.9029831098912275, "learning_rate": 1.4815448362559826e-05, "loss": 0.0189, "step": 2642 }, { "epoch": 3.600817438692098, "grad_norm": 2.2960143187390587, "learning_rate": 1.4811580273048707e-05, "loss": 0.0174, "step": 2643 }, { "epoch": 3.6021798365122617, "grad_norm": 2.4574833225271875, "learning_rate": 1.4807711246516233e-05, "loss": 0.0525, "step": 2644 }, { "epoch": 3.603542234332425, "grad_norm": 3.372667904526388, "learning_rate": 1.4803841283715872e-05, "loss": 0.0262, "step": 2645 }, { "epoch": 3.6049046321525884, "grad_norm": 2.939522615967936, "learning_rate": 1.4799970385401272e-05, "loss": 0.0294, "step": 2646 }, { "epoch": 3.606267029972752, "grad_norm": 1.9175541211974505, "learning_rate": 1.479609855232626e-05, "loss": 0.031, "step": 2647 }, { "epoch": 3.6076294277929155, "grad_norm": 2.6512213725953075, "learning_rate": 1.4792225785244849e-05, "loss": 0.0248, "step": 2648 }, { "epoch": 3.608991825613079, "grad_norm": 1.918231338229193, "learning_rate": 1.4788352084911237e-05, "loss": 0.0233, "step": 2649 }, { "epoch": 3.6103542234332426, "grad_norm": 1.6330633020895173, "learning_rate": 1.4784477452079794e-05, "loss": 0.0454, "step": 2650 }, { "epoch": 3.611716621253406, "grad_norm": 1.828540361094691, "learning_rate": 1.4780601887505088e-05, "loss": 0.0264, "step": 2651 }, { "epoch": 3.6130790190735693, "grad_norm": 2.3291224390465057, "learning_rate": 1.4776725391941847e-05, "loss": 0.031, "step": 2652 }, { "epoch": 3.614441416893733, "grad_norm": 3.3997274379751206, "learning_rate": 1.4772847966144989e-05, "loss": 0.0316, "step": 2653 }, { "epoch": 3.6158038147138964, "grad_norm": 1.8365901991237252, "learning_rate": 1.4768969610869628e-05, "loss": 0.0292, "step": 2654 }, { "epoch": 3.61716621253406, "grad_norm": 1.3594356900452949, "learning_rate": 1.4765090326871037e-05, "loss": 0.0376, "step": 2655 }, { "epoch": 3.6185286103542236, "grad_norm": 2.490681305018136, "learning_rate": 1.476121011490468e-05, "loss": 0.034, "step": 2656 }, { "epoch": 3.619891008174387, "grad_norm": 3.404383150587659, "learning_rate": 1.4757328975726207e-05, "loss": 0.0189, "step": 2657 }, { "epoch": 3.6212534059945503, "grad_norm": 3.769766253705344, "learning_rate": 1.4753446910091436e-05, "loss": 0.0227, "step": 2658 }, { "epoch": 3.622615803814714, "grad_norm": 1.2964041022850052, "learning_rate": 1.474956391875637e-05, "loss": 0.0167, "step": 2659 }, { "epoch": 3.6239782016348774, "grad_norm": 5.4096720525251385, "learning_rate": 1.4745680002477206e-05, "loss": 0.05, "step": 2660 }, { "epoch": 3.6253405994550407, "grad_norm": 2.113974043714016, "learning_rate": 1.4741795162010296e-05, "loss": 0.0413, "step": 2661 }, { "epoch": 3.6267029972752045, "grad_norm": 2.6656592097854555, "learning_rate": 1.4737909398112192e-05, "loss": 0.0529, "step": 2662 }, { "epoch": 3.628065395095368, "grad_norm": 3.291211522259082, "learning_rate": 1.4734022711539619e-05, "loss": 0.0395, "step": 2663 }, { "epoch": 3.629427792915531, "grad_norm": 1.8702031495451716, "learning_rate": 1.4730135103049478e-05, "loss": 0.0361, "step": 2664 }, { "epoch": 3.630790190735695, "grad_norm": 3.7391071237711118, "learning_rate": 1.472624657339886e-05, "loss": 0.0145, "step": 2665 }, { "epoch": 3.6321525885558583, "grad_norm": 1.7177268898309561, "learning_rate": 1.4722357123345023e-05, "loss": 0.0215, "step": 2666 }, { "epoch": 3.6335149863760217, "grad_norm": 3.8671009179336533, "learning_rate": 1.471846675364541e-05, "loss": 0.0253, "step": 2667 }, { "epoch": 3.6348773841961854, "grad_norm": 3.2753558945126797, "learning_rate": 1.471457546505765e-05, "loss": 0.0096, "step": 2668 }, { "epoch": 3.636239782016349, "grad_norm": 1.5444126820302357, "learning_rate": 1.4710683258339536e-05, "loss": 0.03, "step": 2669 }, { "epoch": 3.637602179836512, "grad_norm": 2.8099355120757057, "learning_rate": 1.4706790134249051e-05, "loss": 0.0213, "step": 2670 }, { "epoch": 3.638964577656676, "grad_norm": 2.9329655753572106, "learning_rate": 1.4702896093544357e-05, "loss": 0.0558, "step": 2671 }, { "epoch": 3.6403269754768393, "grad_norm": 3.3709082513601674, "learning_rate": 1.4699001136983783e-05, "loss": 0.0458, "step": 2672 }, { "epoch": 3.6416893732970026, "grad_norm": 2.5889088018100717, "learning_rate": 1.4695105265325849e-05, "loss": 0.019, "step": 2673 }, { "epoch": 3.6430517711171664, "grad_norm": 6.2160148022289965, "learning_rate": 1.469120847932925e-05, "loss": 0.0416, "step": 2674 }, { "epoch": 3.6444141689373297, "grad_norm": 3.6008918091484947, "learning_rate": 1.4687310779752854e-05, "loss": 0.046, "step": 2675 }, { "epoch": 3.645776566757493, "grad_norm": 3.5427426723737376, "learning_rate": 1.4683412167355715e-05, "loss": 0.0379, "step": 2676 }, { "epoch": 3.647138964577657, "grad_norm": 7.329397402807545, "learning_rate": 1.4679512642897055e-05, "loss": 0.0268, "step": 2677 }, { "epoch": 3.64850136239782, "grad_norm": 2.8892373150490815, "learning_rate": 1.4675612207136283e-05, "loss": 0.0274, "step": 2678 }, { "epoch": 3.6498637602179835, "grad_norm": 2.0376665202958075, "learning_rate": 1.4671710860832979e-05, "loss": 0.0163, "step": 2679 }, { "epoch": 3.6512261580381473, "grad_norm": 5.410619130038559, "learning_rate": 1.4667808604746905e-05, "loss": 0.032, "step": 2680 }, { "epoch": 3.6525885558583107, "grad_norm": 1.8132935114067519, "learning_rate": 1.4663905439637995e-05, "loss": 0.0197, "step": 2681 }, { "epoch": 3.653950953678474, "grad_norm": 3.304116525840416, "learning_rate": 1.4660001366266367e-05, "loss": 0.034, "step": 2682 }, { "epoch": 3.655313351498638, "grad_norm": 5.200224844781568, "learning_rate": 1.4656096385392306e-05, "loss": 0.0322, "step": 2683 }, { "epoch": 3.656675749318801, "grad_norm": 2.1889887844940543, "learning_rate": 1.4652190497776287e-05, "loss": 0.033, "step": 2684 }, { "epoch": 3.6580381471389645, "grad_norm": 3.7482857088020265, "learning_rate": 1.4648283704178945e-05, "loss": 0.0201, "step": 2685 }, { "epoch": 3.6594005449591283, "grad_norm": 4.804303560413866, "learning_rate": 1.464437600536111e-05, "loss": 0.0327, "step": 2686 }, { "epoch": 3.6607629427792916, "grad_norm": 4.160511628140845, "learning_rate": 1.4640467402083772e-05, "loss": 0.0518, "step": 2687 }, { "epoch": 3.662125340599455, "grad_norm": 4.608523003219672, "learning_rate": 1.4636557895108105e-05, "loss": 0.0348, "step": 2688 }, { "epoch": 3.6634877384196187, "grad_norm": 6.654475349335736, "learning_rate": 1.4632647485195461e-05, "loss": 0.021, "step": 2689 }, { "epoch": 3.664850136239782, "grad_norm": 1.9588862783045768, "learning_rate": 1.4628736173107362e-05, "loss": 0.0161, "step": 2690 }, { "epoch": 3.6662125340599454, "grad_norm": 4.734688509371477, "learning_rate": 1.4624823959605507e-05, "loss": 0.0152, "step": 2691 }, { "epoch": 3.667574931880109, "grad_norm": 5.597239443617518, "learning_rate": 1.4620910845451779e-05, "loss": 0.0431, "step": 2692 }, { "epoch": 3.6689373297002725, "grad_norm": 3.0534395687993205, "learning_rate": 1.4616996831408222e-05, "loss": 0.0051, "step": 2693 }, { "epoch": 3.670299727520436, "grad_norm": 4.7456389039719395, "learning_rate": 1.4613081918237066e-05, "loss": 0.0332, "step": 2694 }, { "epoch": 3.6716621253405997, "grad_norm": 1.875754816408976, "learning_rate": 1.4609166106700713e-05, "loss": 0.0252, "step": 2695 }, { "epoch": 3.673024523160763, "grad_norm": 5.335460845101655, "learning_rate": 1.4605249397561735e-05, "loss": 0.0612, "step": 2696 }, { "epoch": 3.6743869209809263, "grad_norm": 1.6621758551456374, "learning_rate": 1.4601331791582891e-05, "loss": 0.0501, "step": 2697 }, { "epoch": 3.67574931880109, "grad_norm": 1.6271028057557828, "learning_rate": 1.4597413289527102e-05, "loss": 0.0165, "step": 2698 }, { "epoch": 3.6771117166212535, "grad_norm": 1.6475677877512072, "learning_rate": 1.4593493892157473e-05, "loss": 0.0484, "step": 2699 }, { "epoch": 3.678474114441417, "grad_norm": 1.2327746206672456, "learning_rate": 1.4589573600237275e-05, "loss": 0.015, "step": 2700 }, { "epoch": 3.6798365122615806, "grad_norm": 1.6757278810473601, "learning_rate": 1.4585652414529956e-05, "loss": 0.028, "step": 2701 }, { "epoch": 3.681198910081744, "grad_norm": 2.5331536304858147, "learning_rate": 1.4581730335799143e-05, "loss": 0.0248, "step": 2702 }, { "epoch": 3.6825613079019073, "grad_norm": 3.178283080465595, "learning_rate": 1.4577807364808632e-05, "loss": 0.042, "step": 2703 }, { "epoch": 3.683923705722071, "grad_norm": 0.9595285313763184, "learning_rate": 1.4573883502322394e-05, "loss": 0.0301, "step": 2704 }, { "epoch": 3.6852861035422344, "grad_norm": 3.994717185307859, "learning_rate": 1.4569958749104576e-05, "loss": 0.0209, "step": 2705 }, { "epoch": 3.6866485013623977, "grad_norm": 2.6709802304341315, "learning_rate": 1.456603310591949e-05, "loss": 0.0127, "step": 2706 }, { "epoch": 3.6880108991825615, "grad_norm": 1.3560148841681674, "learning_rate": 1.4562106573531632e-05, "loss": 0.0368, "step": 2707 }, { "epoch": 3.689373297002725, "grad_norm": 3.750585040776051, "learning_rate": 1.4558179152705663e-05, "loss": 0.024, "step": 2708 }, { "epoch": 3.690735694822888, "grad_norm": 3.2540229789388384, "learning_rate": 1.4554250844206426e-05, "loss": 0.0377, "step": 2709 }, { "epoch": 3.692098092643052, "grad_norm": 2.366713233193916, "learning_rate": 1.4550321648798926e-05, "loss": 0.0383, "step": 2710 }, { "epoch": 3.6934604904632153, "grad_norm": 2.6745124023191975, "learning_rate": 1.4546391567248353e-05, "loss": 0.0456, "step": 2711 }, { "epoch": 3.6948228882833787, "grad_norm": 4.725571382259451, "learning_rate": 1.4542460600320053e-05, "loss": 0.0248, "step": 2712 }, { "epoch": 3.6961852861035425, "grad_norm": 1.6011015736575047, "learning_rate": 1.4538528748779561e-05, "loss": 0.0061, "step": 2713 }, { "epoch": 3.697547683923706, "grad_norm": 4.867472514647853, "learning_rate": 1.4534596013392574e-05, "loss": 0.0228, "step": 2714 }, { "epoch": 3.698910081743869, "grad_norm": 2.614039762581825, "learning_rate": 1.4530662394924969e-05, "loss": 0.0395, "step": 2715 }, { "epoch": 3.700272479564033, "grad_norm": 1.610435054681634, "learning_rate": 1.4526727894142787e-05, "loss": 0.0149, "step": 2716 }, { "epoch": 3.7016348773841963, "grad_norm": 4.149238341361272, "learning_rate": 1.4522792511812242e-05, "loss": 0.0266, "step": 2717 }, { "epoch": 3.7029972752043596, "grad_norm": 1.8973057772275321, "learning_rate": 1.451885624869973e-05, "loss": 0.0408, "step": 2718 }, { "epoch": 3.7043596730245234, "grad_norm": 2.2188008451327508, "learning_rate": 1.45149191055718e-05, "loss": 0.0318, "step": 2719 }, { "epoch": 3.7057220708446867, "grad_norm": 3.249004257922648, "learning_rate": 1.4510981083195188e-05, "loss": 0.0194, "step": 2720 }, { "epoch": 3.70708446866485, "grad_norm": 2.615017579990171, "learning_rate": 1.4507042182336802e-05, "loss": 0.0326, "step": 2721 }, { "epoch": 3.708446866485014, "grad_norm": 2.689463114566489, "learning_rate": 1.4503102403763705e-05, "loss": 0.0223, "step": 2722 }, { "epoch": 3.709809264305177, "grad_norm": 4.375511031800275, "learning_rate": 1.4499161748243147e-05, "loss": 0.035, "step": 2723 }, { "epoch": 3.7111716621253406, "grad_norm": 1.1041992327688175, "learning_rate": 1.4495220216542541e-05, "loss": 0.0315, "step": 2724 }, { "epoch": 3.7125340599455043, "grad_norm": 4.300198876324826, "learning_rate": 1.4491277809429472e-05, "loss": 0.0184, "step": 2725 }, { "epoch": 3.7138964577656677, "grad_norm": 3.6285872072131795, "learning_rate": 1.44873345276717e-05, "loss": 0.0152, "step": 2726 }, { "epoch": 3.715258855585831, "grad_norm": 3.9009684721899855, "learning_rate": 1.4483390372037147e-05, "loss": 0.026, "step": 2727 }, { "epoch": 3.716621253405995, "grad_norm": 3.034944220655506, "learning_rate": 1.4479445343293909e-05, "loss": 0.0161, "step": 2728 }, { "epoch": 3.717983651226158, "grad_norm": 1.4456328407388432, "learning_rate": 1.447549944221026e-05, "loss": 0.0184, "step": 2729 }, { "epoch": 3.7193460490463215, "grad_norm": 1.823293768948392, "learning_rate": 1.4471552669554624e-05, "loss": 0.0154, "step": 2730 }, { "epoch": 3.7207084468664853, "grad_norm": 2.0935885194723127, "learning_rate": 1.4467605026095617e-05, "loss": 0.0518, "step": 2731 }, { "epoch": 3.7220708446866486, "grad_norm": 0.7376757884803281, "learning_rate": 1.4463656512602012e-05, "loss": 0.0064, "step": 2732 }, { "epoch": 3.723433242506812, "grad_norm": 1.5833600768651204, "learning_rate": 1.4459707129842755e-05, "loss": 0.0201, "step": 2733 }, { "epoch": 3.7247956403269757, "grad_norm": 1.2625939657858178, "learning_rate": 1.4455756878586955e-05, "loss": 0.0379, "step": 2734 }, { "epoch": 3.726158038147139, "grad_norm": 2.672936152555764, "learning_rate": 1.4451805759603907e-05, "loss": 0.0442, "step": 2735 }, { "epoch": 3.7275204359673024, "grad_norm": 3.6438727225014422, "learning_rate": 1.4447853773663054e-05, "loss": 0.0302, "step": 2736 }, { "epoch": 3.728882833787466, "grad_norm": 0.8365923961238665, "learning_rate": 1.4443900921534017e-05, "loss": 0.0194, "step": 2737 }, { "epoch": 3.7302452316076296, "grad_norm": 2.529522370705674, "learning_rate": 1.443994720398659e-05, "loss": 0.0091, "step": 2738 }, { "epoch": 3.731607629427793, "grad_norm": 2.113043821111555, "learning_rate": 1.4435992621790734e-05, "loss": 0.0314, "step": 2739 }, { "epoch": 3.7329700272479567, "grad_norm": 1.3720818995973518, "learning_rate": 1.443203717571657e-05, "loss": 0.019, "step": 2740 }, { "epoch": 3.73433242506812, "grad_norm": 2.029314499296029, "learning_rate": 1.4428080866534397e-05, "loss": 0.0489, "step": 2741 }, { "epoch": 3.7356948228882834, "grad_norm": 2.3444113330774377, "learning_rate": 1.4424123695014678e-05, "loss": 0.0504, "step": 2742 }, { "epoch": 3.7370572207084467, "grad_norm": 1.5909335128611617, "learning_rate": 1.4420165661928041e-05, "loss": 0.0366, "step": 2743 }, { "epoch": 3.7384196185286105, "grad_norm": 2.2687478072688165, "learning_rate": 1.441620676804529e-05, "loss": 0.0215, "step": 2744 }, { "epoch": 3.739782016348774, "grad_norm": 3.747042872240779, "learning_rate": 1.4412247014137389e-05, "loss": 0.0294, "step": 2745 }, { "epoch": 3.741144414168937, "grad_norm": 1.2871182644892345, "learning_rate": 1.440828640097547e-05, "loss": 0.0169, "step": 2746 }, { "epoch": 3.742506811989101, "grad_norm": 3.063744279619798, "learning_rate": 1.4404324929330842e-05, "loss": 0.0284, "step": 2747 }, { "epoch": 3.7438692098092643, "grad_norm": 3.026536977963757, "learning_rate": 1.4400362599974966e-05, "loss": 0.0447, "step": 2748 }, { "epoch": 3.7452316076294276, "grad_norm": 2.398958503289552, "learning_rate": 1.439639941367948e-05, "loss": 0.0335, "step": 2749 }, { "epoch": 3.7465940054495914, "grad_norm": 3.4403501131594503, "learning_rate": 1.4392435371216188e-05, "loss": 0.0084, "step": 2750 }, { "epoch": 3.7479564032697548, "grad_norm": 1.561001827525386, "learning_rate": 1.4388470473357057e-05, "loss": 0.0236, "step": 2751 }, { "epoch": 3.749318801089918, "grad_norm": 3.0248566587615175, "learning_rate": 1.4384504720874226e-05, "loss": 0.0193, "step": 2752 }, { "epoch": 3.750681198910082, "grad_norm": 3.779061417255947, "learning_rate": 1.4380538114539998e-05, "loss": 0.0367, "step": 2753 }, { "epoch": 3.7520435967302452, "grad_norm": 0.942513095620581, "learning_rate": 1.4376570655126834e-05, "loss": 0.0132, "step": 2754 }, { "epoch": 3.7534059945504086, "grad_norm": 5.049332371565687, "learning_rate": 1.4372602343407378e-05, "loss": 0.052, "step": 2755 }, { "epoch": 3.7547683923705724, "grad_norm": 2.7535389899596265, "learning_rate": 1.4368633180154423e-05, "loss": 0.0339, "step": 2756 }, { "epoch": 3.7561307901907357, "grad_norm": 2.096196657142885, "learning_rate": 1.4364663166140942e-05, "loss": 0.0559, "step": 2757 }, { "epoch": 3.757493188010899, "grad_norm": 6.163361858595093, "learning_rate": 1.4360692302140065e-05, "loss": 0.0398, "step": 2758 }, { "epoch": 3.758855585831063, "grad_norm": 3.3114906245597187, "learning_rate": 1.435672058892509e-05, "loss": 0.0225, "step": 2759 }, { "epoch": 3.760217983651226, "grad_norm": 3.762410470260898, "learning_rate": 1.4352748027269479e-05, "loss": 0.0413, "step": 2760 }, { "epoch": 3.7615803814713895, "grad_norm": 3.695933277408496, "learning_rate": 1.4348774617946859e-05, "loss": 0.0433, "step": 2761 }, { "epoch": 3.7629427792915533, "grad_norm": 2.6836666428861817, "learning_rate": 1.4344800361731028e-05, "loss": 0.0454, "step": 2762 }, { "epoch": 3.7643051771117166, "grad_norm": 1.722257918053327, "learning_rate": 1.4340825259395939e-05, "loss": 0.0308, "step": 2763 }, { "epoch": 3.76566757493188, "grad_norm": 2.7439798533588267, "learning_rate": 1.433684931171572e-05, "loss": 0.0274, "step": 2764 }, { "epoch": 3.7670299727520433, "grad_norm": 3.985136201770553, "learning_rate": 1.433287251946466e-05, "loss": 0.0174, "step": 2765 }, { "epoch": 3.768392370572207, "grad_norm": 3.6170970887439187, "learning_rate": 1.4328894883417201e-05, "loss": 0.0247, "step": 2766 }, { "epoch": 3.7697547683923704, "grad_norm": 3.683049186086168, "learning_rate": 1.432491640434797e-05, "loss": 0.0369, "step": 2767 }, { "epoch": 3.771117166212534, "grad_norm": 1.1316676528939227, "learning_rate": 1.4320937083031748e-05, "loss": 0.0337, "step": 2768 }, { "epoch": 3.7724795640326976, "grad_norm": 2.5437865467073664, "learning_rate": 1.431695692024347e-05, "loss": 0.025, "step": 2769 }, { "epoch": 3.773841961852861, "grad_norm": 3.0049769352355185, "learning_rate": 1.4312975916758257e-05, "loss": 0.0278, "step": 2770 }, { "epoch": 3.7752043596730243, "grad_norm": 2.3390480636032502, "learning_rate": 1.4308994073351373e-05, "loss": 0.0255, "step": 2771 }, { "epoch": 3.776566757493188, "grad_norm": 4.36324723089309, "learning_rate": 1.4305011390798253e-05, "loss": 0.0685, "step": 2772 }, { "epoch": 3.7779291553133514, "grad_norm": 3.0203596666376185, "learning_rate": 1.4301027869874502e-05, "loss": 0.0247, "step": 2773 }, { "epoch": 3.7792915531335147, "grad_norm": 2.849758264856689, "learning_rate": 1.4297043511355882e-05, "loss": 0.0554, "step": 2774 }, { "epoch": 3.7806539509536785, "grad_norm": 3.899212149019227, "learning_rate": 1.4293058316018313e-05, "loss": 0.0424, "step": 2775 }, { "epoch": 3.782016348773842, "grad_norm": 6.166712835875737, "learning_rate": 1.428907228463789e-05, "loss": 0.0521, "step": 2776 }, { "epoch": 3.783378746594005, "grad_norm": 2.4000319933685836, "learning_rate": 1.4285085417990863e-05, "loss": 0.0382, "step": 2777 }, { "epoch": 3.784741144414169, "grad_norm": 5.67922397091601, "learning_rate": 1.4281097716853642e-05, "loss": 0.0378, "step": 2778 }, { "epoch": 3.7861035422343323, "grad_norm": 2.949069492786064, "learning_rate": 1.427710918200281e-05, "loss": 0.0273, "step": 2779 }, { "epoch": 3.7874659400544957, "grad_norm": 2.6718003350183563, "learning_rate": 1.4273119814215102e-05, "loss": 0.0287, "step": 2780 }, { "epoch": 3.7888283378746594, "grad_norm": 6.645567282038477, "learning_rate": 1.426912961426742e-05, "loss": 0.035, "step": 2781 }, { "epoch": 3.790190735694823, "grad_norm": 1.7058439667884535, "learning_rate": 1.426513858293683e-05, "loss": 0.0409, "step": 2782 }, { "epoch": 3.791553133514986, "grad_norm": 4.562456087000707, "learning_rate": 1.4261146721000554e-05, "loss": 0.0499, "step": 2783 }, { "epoch": 3.79291553133515, "grad_norm": 3.0814573048152956, "learning_rate": 1.425715402923598e-05, "loss": 0.0432, "step": 2784 }, { "epoch": 3.7942779291553133, "grad_norm": 2.697235458664321, "learning_rate": 1.4253160508420656e-05, "loss": 0.0279, "step": 2785 }, { "epoch": 3.7956403269754766, "grad_norm": 5.554310386058974, "learning_rate": 1.424916615933229e-05, "loss": 0.0392, "step": 2786 }, { "epoch": 3.7970027247956404, "grad_norm": 1.9951044553000483, "learning_rate": 1.4245170982748762e-05, "loss": 0.0317, "step": 2787 }, { "epoch": 3.7983651226158037, "grad_norm": 3.302232748845826, "learning_rate": 1.4241174979448094e-05, "loss": 0.0269, "step": 2788 }, { "epoch": 3.799727520435967, "grad_norm": 1.2413256000102424, "learning_rate": 1.4237178150208486e-05, "loss": 0.017, "step": 2789 }, { "epoch": 3.801089918256131, "grad_norm": 2.6772797187656296, "learning_rate": 1.4233180495808288e-05, "loss": 0.0295, "step": 2790 }, { "epoch": 3.802452316076294, "grad_norm": 1.958135746773323, "learning_rate": 1.422918201702602e-05, "loss": 0.0266, "step": 2791 }, { "epoch": 3.8038147138964575, "grad_norm": 1.1649122041573094, "learning_rate": 1.422518271464035e-05, "loss": 0.0127, "step": 2792 }, { "epoch": 3.8051771117166213, "grad_norm": 2.3057685220313124, "learning_rate": 1.4221182589430122e-05, "loss": 0.0111, "step": 2793 }, { "epoch": 3.8065395095367847, "grad_norm": 4.440740143023808, "learning_rate": 1.4217181642174329e-05, "loss": 0.0549, "step": 2794 }, { "epoch": 3.807901907356948, "grad_norm": 2.267611881951178, "learning_rate": 1.4213179873652127e-05, "loss": 0.0527, "step": 2795 }, { "epoch": 3.809264305177112, "grad_norm": 3.8077383460093777, "learning_rate": 1.4209177284642832e-05, "loss": 0.0269, "step": 2796 }, { "epoch": 3.810626702997275, "grad_norm": 1.2709361598451403, "learning_rate": 1.4205173875925922e-05, "loss": 0.0205, "step": 2797 }, { "epoch": 3.8119891008174385, "grad_norm": 2.609040616603392, "learning_rate": 1.4201169648281027e-05, "loss": 0.0374, "step": 2798 }, { "epoch": 3.8133514986376023, "grad_norm": 2.7271612681569075, "learning_rate": 1.4197164602487949e-05, "loss": 0.0143, "step": 2799 }, { "epoch": 3.8147138964577656, "grad_norm": 1.413521299223428, "learning_rate": 1.4193158739326642e-05, "loss": 0.0529, "step": 2800 }, { "epoch": 3.816076294277929, "grad_norm": 4.952399566955376, "learning_rate": 1.4189152059577214e-05, "loss": 0.0386, "step": 2801 }, { "epoch": 3.8174386920980927, "grad_norm": 1.6222793412308016, "learning_rate": 1.4185144564019942e-05, "loss": 0.0182, "step": 2802 }, { "epoch": 3.818801089918256, "grad_norm": 1.1512040541026798, "learning_rate": 1.4181136253435257e-05, "loss": 0.0207, "step": 2803 }, { "epoch": 3.8201634877384194, "grad_norm": 0.841895142659805, "learning_rate": 1.4177127128603748e-05, "loss": 0.0182, "step": 2804 }, { "epoch": 3.821525885558583, "grad_norm": 1.2916686500146177, "learning_rate": 1.4173117190306163e-05, "loss": 0.0159, "step": 2805 }, { "epoch": 3.8228882833787465, "grad_norm": 1.498573970828065, "learning_rate": 1.4169106439323414e-05, "loss": 0.0215, "step": 2806 }, { "epoch": 3.82425068119891, "grad_norm": 1.598232380488642, "learning_rate": 1.4165094876436562e-05, "loss": 0.0183, "step": 2807 }, { "epoch": 3.8256130790190737, "grad_norm": 1.2488228226950238, "learning_rate": 1.416108250242683e-05, "loss": 0.0252, "step": 2808 }, { "epoch": 3.826975476839237, "grad_norm": 1.4468684256373614, "learning_rate": 1.4157069318075602e-05, "loss": 0.0208, "step": 2809 }, { "epoch": 3.8283378746594003, "grad_norm": 1.9613353036398047, "learning_rate": 1.4153055324164417e-05, "loss": 0.0351, "step": 2810 }, { "epoch": 3.829700272479564, "grad_norm": 1.9140129351779878, "learning_rate": 1.4149040521474974e-05, "loss": 0.0575, "step": 2811 }, { "epoch": 3.8310626702997275, "grad_norm": 1.6890879188565928, "learning_rate": 1.4145024910789124e-05, "loss": 0.0468, "step": 2812 }, { "epoch": 3.832425068119891, "grad_norm": 2.2378688888486242, "learning_rate": 1.4141008492888881e-05, "loss": 0.0257, "step": 2813 }, { "epoch": 3.8337874659400546, "grad_norm": 0.9026471308886062, "learning_rate": 1.4136991268556413e-05, "loss": 0.0178, "step": 2814 }, { "epoch": 3.835149863760218, "grad_norm": 2.809652542672, "learning_rate": 1.413297323857404e-05, "loss": 0.032, "step": 2815 }, { "epoch": 3.8365122615803813, "grad_norm": 1.043818995373769, "learning_rate": 1.412895440372426e-05, "loss": 0.0292, "step": 2816 }, { "epoch": 3.837874659400545, "grad_norm": 1.6057276587671812, "learning_rate": 1.4124934764789701e-05, "loss": 0.0302, "step": 2817 }, { "epoch": 3.8392370572207084, "grad_norm": 1.8202711974448997, "learning_rate": 1.4120914322553162e-05, "loss": 0.0217, "step": 2818 }, { "epoch": 3.8405994550408717, "grad_norm": 0.8385183947362335, "learning_rate": 1.41168930777976e-05, "loss": 0.0272, "step": 2819 }, { "epoch": 3.8419618528610355, "grad_norm": 1.9006338612745501, "learning_rate": 1.4112871031306118e-05, "loss": 0.0319, "step": 2820 }, { "epoch": 3.843324250681199, "grad_norm": 1.8167847170416471, "learning_rate": 1.4108848183861983e-05, "loss": 0.0353, "step": 2821 }, { "epoch": 3.844686648501362, "grad_norm": 1.0311035708209346, "learning_rate": 1.4104824536248615e-05, "loss": 0.019, "step": 2822 }, { "epoch": 3.846049046321526, "grad_norm": 0.7903351793336061, "learning_rate": 1.4100800089249597e-05, "loss": 0.0031, "step": 2823 }, { "epoch": 3.8474114441416893, "grad_norm": 1.4669920901866986, "learning_rate": 1.4096774843648655e-05, "loss": 0.024, "step": 2824 }, { "epoch": 3.8487738419618527, "grad_norm": 2.013355114438928, "learning_rate": 1.4092748800229684e-05, "loss": 0.0346, "step": 2825 }, { "epoch": 3.8501362397820165, "grad_norm": 1.8529705764839566, "learning_rate": 1.4088721959776719e-05, "loss": 0.0111, "step": 2826 }, { "epoch": 3.85149863760218, "grad_norm": 3.9617283435474118, "learning_rate": 1.4084694323073961e-05, "loss": 0.0562, "step": 2827 }, { "epoch": 3.852861035422343, "grad_norm": 1.5819326715155417, "learning_rate": 1.408066589090577e-05, "loss": 0.0357, "step": 2828 }, { "epoch": 3.854223433242507, "grad_norm": 3.4532502037310335, "learning_rate": 1.4076636664056651e-05, "loss": 0.0387, "step": 2829 }, { "epoch": 3.8555858310626703, "grad_norm": 4.102925432330909, "learning_rate": 1.4072606643311267e-05, "loss": 0.0403, "step": 2830 }, { "epoch": 3.8569482288828336, "grad_norm": 2.3639692405979322, "learning_rate": 1.4068575829454436e-05, "loss": 0.0444, "step": 2831 }, { "epoch": 3.8583106267029974, "grad_norm": 5.4947992834911155, "learning_rate": 1.406454422327113e-05, "loss": 0.0318, "step": 2832 }, { "epoch": 3.8596730245231607, "grad_norm": 4.52895912597725, "learning_rate": 1.406051182554648e-05, "loss": 0.0317, "step": 2833 }, { "epoch": 3.861035422343324, "grad_norm": 3.2401084495785675, "learning_rate": 1.4056478637065763e-05, "loss": 0.0237, "step": 2834 }, { "epoch": 3.862397820163488, "grad_norm": 4.687250120010683, "learning_rate": 1.405244465861441e-05, "loss": 0.0266, "step": 2835 }, { "epoch": 3.863760217983651, "grad_norm": 1.4939013709512579, "learning_rate": 1.4048409890978023e-05, "loss": 0.0257, "step": 2836 }, { "epoch": 3.8651226158038146, "grad_norm": 1.8954690920341257, "learning_rate": 1.4044374334942333e-05, "loss": 0.0292, "step": 2837 }, { "epoch": 3.8664850136239783, "grad_norm": 1.705273421842055, "learning_rate": 1.4040337991293237e-05, "loss": 0.028, "step": 2838 }, { "epoch": 3.8678474114441417, "grad_norm": 2.2563441033429648, "learning_rate": 1.4036300860816789e-05, "loss": 0.0336, "step": 2839 }, { "epoch": 3.869209809264305, "grad_norm": 3.0887232877136666, "learning_rate": 1.4032262944299193e-05, "loss": 0.0614, "step": 2840 }, { "epoch": 3.870572207084469, "grad_norm": 3.8624416780609505, "learning_rate": 1.4028224242526798e-05, "loss": 0.0241, "step": 2841 }, { "epoch": 3.871934604904632, "grad_norm": 3.944913147152699, "learning_rate": 1.4024184756286116e-05, "loss": 0.0267, "step": 2842 }, { "epoch": 3.8732970027247955, "grad_norm": 0.7974566948323724, "learning_rate": 1.4020144486363813e-05, "loss": 0.0336, "step": 2843 }, { "epoch": 3.8746594005449593, "grad_norm": 3.0251225753592057, "learning_rate": 1.4016103433546695e-05, "loss": 0.0392, "step": 2844 }, { "epoch": 3.8760217983651226, "grad_norm": 2.788100767713312, "learning_rate": 1.4012061598621734e-05, "loss": 0.0102, "step": 2845 }, { "epoch": 3.877384196185286, "grad_norm": 1.7732663801715944, "learning_rate": 1.4008018982376045e-05, "loss": 0.027, "step": 2846 }, { "epoch": 3.8787465940054497, "grad_norm": 2.442898076782123, "learning_rate": 1.4003975585596902e-05, "loss": 0.0313, "step": 2847 }, { "epoch": 3.880108991825613, "grad_norm": 4.598695479610511, "learning_rate": 1.399993140907173e-05, "loss": 0.064, "step": 2848 }, { "epoch": 3.8814713896457764, "grad_norm": 0.8680040344125864, "learning_rate": 1.3995886453588103e-05, "loss": 0.0099, "step": 2849 }, { "epoch": 3.88283378746594, "grad_norm": 4.368730427927343, "learning_rate": 1.399184071993374e-05, "loss": 0.0312, "step": 2850 }, { "epoch": 3.8841961852861036, "grad_norm": 2.485370818035199, "learning_rate": 1.398779420889653e-05, "loss": 0.0387, "step": 2851 }, { "epoch": 3.885558583106267, "grad_norm": 1.6627677799114275, "learning_rate": 1.3983746921264496e-05, "loss": 0.0514, "step": 2852 }, { "epoch": 3.8869209809264307, "grad_norm": 4.9257310657693445, "learning_rate": 1.3979698857825816e-05, "loss": 0.043, "step": 2853 }, { "epoch": 3.888283378746594, "grad_norm": 1.992934842298387, "learning_rate": 1.3975650019368832e-05, "loss": 0.0243, "step": 2854 }, { "epoch": 3.8896457765667574, "grad_norm": 4.70252553273378, "learning_rate": 1.397160040668202e-05, "loss": 0.0178, "step": 2855 }, { "epoch": 3.891008174386921, "grad_norm": 2.977048379032932, "learning_rate": 1.396755002055401e-05, "loss": 0.0312, "step": 2856 }, { "epoch": 3.8923705722070845, "grad_norm": 1.62374192010036, "learning_rate": 1.3963498861773595e-05, "loss": 0.0456, "step": 2857 }, { "epoch": 3.893732970027248, "grad_norm": 3.191839610271327, "learning_rate": 1.3959446931129703e-05, "loss": 0.0112, "step": 2858 }, { "epoch": 3.8950953678474116, "grad_norm": 2.4961294034835015, "learning_rate": 1.395539422941142e-05, "loss": 0.0373, "step": 2859 }, { "epoch": 3.896457765667575, "grad_norm": 1.802254828833613, "learning_rate": 1.3951340757407985e-05, "loss": 0.0169, "step": 2860 }, { "epoch": 3.8978201634877383, "grad_norm": 2.6496074605220645, "learning_rate": 1.394728651590878e-05, "loss": 0.0248, "step": 2861 }, { "epoch": 3.899182561307902, "grad_norm": 2.753919318196539, "learning_rate": 1.394323150570334e-05, "loss": 0.0304, "step": 2862 }, { "epoch": 3.9005449591280654, "grad_norm": 2.496796984081209, "learning_rate": 1.393917572758135e-05, "loss": 0.0368, "step": 2863 }, { "epoch": 3.9019073569482288, "grad_norm": 2.5573873723100085, "learning_rate": 1.3935119182332641e-05, "loss": 0.0412, "step": 2864 }, { "epoch": 3.9032697547683926, "grad_norm": 2.014897683774849, "learning_rate": 1.3931061870747203e-05, "loss": 0.0234, "step": 2865 }, { "epoch": 3.904632152588556, "grad_norm": 2.5857640025204214, "learning_rate": 1.3927003793615166e-05, "loss": 0.0164, "step": 2866 }, { "epoch": 3.9059945504087192, "grad_norm": 2.211855429730536, "learning_rate": 1.3922944951726811e-05, "loss": 0.0136, "step": 2867 }, { "epoch": 3.907356948228883, "grad_norm": 2.9142862735703488, "learning_rate": 1.3918885345872569e-05, "loss": 0.0155, "step": 2868 }, { "epoch": 3.9087193460490464, "grad_norm": 1.8803908368128823, "learning_rate": 1.391482497684302e-05, "loss": 0.0414, "step": 2869 }, { "epoch": 3.9100817438692097, "grad_norm": 2.11852529549256, "learning_rate": 1.391076384542889e-05, "loss": 0.0134, "step": 2870 }, { "epoch": 3.9114441416893735, "grad_norm": 4.274001217862862, "learning_rate": 1.3906701952421062e-05, "loss": 0.0451, "step": 2871 }, { "epoch": 3.912806539509537, "grad_norm": 1.761886163108665, "learning_rate": 1.3902639298610554e-05, "loss": 0.0169, "step": 2872 }, { "epoch": 3.9141689373297, "grad_norm": 3.776928980311935, "learning_rate": 1.3898575884788544e-05, "loss": 0.0272, "step": 2873 }, { "epoch": 3.915531335149864, "grad_norm": 3.71875227943263, "learning_rate": 1.3894511711746352e-05, "loss": 0.0372, "step": 2874 }, { "epoch": 3.9168937329700273, "grad_norm": 1.3326795207034965, "learning_rate": 1.3890446780275446e-05, "loss": 0.0202, "step": 2875 }, { "epoch": 3.9182561307901906, "grad_norm": 4.6182865068045045, "learning_rate": 1.388638109116744e-05, "loss": 0.0273, "step": 2876 }, { "epoch": 3.9196185286103544, "grad_norm": 2.3139275825339154, "learning_rate": 1.3882314645214106e-05, "loss": 0.0261, "step": 2877 }, { "epoch": 3.9209809264305178, "grad_norm": 3.139122649456597, "learning_rate": 1.387824744320735e-05, "loss": 0.0174, "step": 2878 }, { "epoch": 3.922343324250681, "grad_norm": 3.2953759175587893, "learning_rate": 1.387417948593923e-05, "loss": 0.0111, "step": 2879 }, { "epoch": 3.923705722070845, "grad_norm": 1.6178665272707262, "learning_rate": 1.3870110774201956e-05, "loss": 0.0301, "step": 2880 }, { "epoch": 3.9250681198910082, "grad_norm": 2.1305445664750837, "learning_rate": 1.3866041308787881e-05, "loss": 0.0437, "step": 2881 }, { "epoch": 3.9264305177111716, "grad_norm": 3.040778253851338, "learning_rate": 1.3861971090489498e-05, "loss": 0.0315, "step": 2882 }, { "epoch": 3.9277929155313354, "grad_norm": 1.9492588518821583, "learning_rate": 1.3857900120099461e-05, "loss": 0.0436, "step": 2883 }, { "epoch": 3.9291553133514987, "grad_norm": 2.561602774976124, "learning_rate": 1.3853828398410562e-05, "loss": 0.0315, "step": 2884 }, { "epoch": 3.930517711171662, "grad_norm": 3.341200376456324, "learning_rate": 1.3849755926215736e-05, "loss": 0.0261, "step": 2885 }, { "epoch": 3.931880108991826, "grad_norm": 1.5660925716083443, "learning_rate": 1.384568270430807e-05, "loss": 0.0253, "step": 2886 }, { "epoch": 3.933242506811989, "grad_norm": 2.6687122901110936, "learning_rate": 1.3841608733480798e-05, "loss": 0.0275, "step": 2887 }, { "epoch": 3.9346049046321525, "grad_norm": 3.4879502608408477, "learning_rate": 1.3837534014527292e-05, "loss": 0.0156, "step": 2888 }, { "epoch": 3.9359673024523163, "grad_norm": 1.113473874844523, "learning_rate": 1.3833458548241078e-05, "loss": 0.022, "step": 2889 }, { "epoch": 3.9373297002724796, "grad_norm": 3.993662596245122, "learning_rate": 1.3829382335415824e-05, "loss": 0.0137, "step": 2890 }, { "epoch": 3.938692098092643, "grad_norm": 4.39404254976206, "learning_rate": 1.3825305376845346e-05, "loss": 0.0222, "step": 2891 }, { "epoch": 3.9400544959128068, "grad_norm": 3.1822945808143213, "learning_rate": 1.3821227673323598e-05, "loss": 0.0207, "step": 2892 }, { "epoch": 3.94141689373297, "grad_norm": 5.184026223231562, "learning_rate": 1.3817149225644686e-05, "loss": 0.0297, "step": 2893 }, { "epoch": 3.9427792915531334, "grad_norm": 2.1045613346886056, "learning_rate": 1.3813070034602862e-05, "loss": 0.0249, "step": 2894 }, { "epoch": 3.9441416893732972, "grad_norm": 4.8378246593117735, "learning_rate": 1.3808990100992516e-05, "loss": 0.0502, "step": 2895 }, { "epoch": 3.9455040871934606, "grad_norm": 5.465493242688493, "learning_rate": 1.3804909425608188e-05, "loss": 0.0183, "step": 2896 }, { "epoch": 3.946866485013624, "grad_norm": 1.7136897063205832, "learning_rate": 1.380082800924456e-05, "loss": 0.0362, "step": 2897 }, { "epoch": 3.9482288828337877, "grad_norm": 6.215874636681533, "learning_rate": 1.379674585269646e-05, "loss": 0.0223, "step": 2898 }, { "epoch": 3.949591280653951, "grad_norm": 3.6445081387574305, "learning_rate": 1.3792662956758858e-05, "loss": 0.0532, "step": 2899 }, { "epoch": 3.9509536784741144, "grad_norm": 2.931889741564502, "learning_rate": 1.3788579322226869e-05, "loss": 0.0307, "step": 2900 }, { "epoch": 3.952316076294278, "grad_norm": 6.112842523637982, "learning_rate": 1.3784494949895754e-05, "loss": 0.0411, "step": 2901 }, { "epoch": 3.9536784741144415, "grad_norm": 2.0483429723797792, "learning_rate": 1.3780409840560915e-05, "loss": 0.0287, "step": 2902 }, { "epoch": 3.955040871934605, "grad_norm": 2.9334189958374406, "learning_rate": 1.3776323995017897e-05, "loss": 0.0323, "step": 2903 }, { "epoch": 3.9564032697547686, "grad_norm": 1.957820933353535, "learning_rate": 1.3772237414062393e-05, "loss": 0.0298, "step": 2904 }, { "epoch": 3.957765667574932, "grad_norm": 2.6417331800791404, "learning_rate": 1.376815009849023e-05, "loss": 0.0207, "step": 2905 }, { "epoch": 3.9591280653950953, "grad_norm": 2.683011403184696, "learning_rate": 1.376406204909739e-05, "loss": 0.0261, "step": 2906 }, { "epoch": 3.960490463215259, "grad_norm": 3.0821241031093964, "learning_rate": 1.3759973266679987e-05, "loss": 0.03, "step": 2907 }, { "epoch": 3.9618528610354224, "grad_norm": 2.1152704875191173, "learning_rate": 1.3755883752034287e-05, "loss": 0.0464, "step": 2908 }, { "epoch": 3.963215258855586, "grad_norm": 2.594104684559974, "learning_rate": 1.375179350595669e-05, "loss": 0.0189, "step": 2909 }, { "epoch": 3.9645776566757496, "grad_norm": 1.6645397818226169, "learning_rate": 1.3747702529243745e-05, "loss": 0.0529, "step": 2910 }, { "epoch": 3.965940054495913, "grad_norm": 1.9110177477162722, "learning_rate": 1.374361082269214e-05, "loss": 0.0398, "step": 2911 }, { "epoch": 3.9673024523160763, "grad_norm": 3.591498284616819, "learning_rate": 1.3739518387098704e-05, "loss": 0.0384, "step": 2912 }, { "epoch": 3.96866485013624, "grad_norm": 1.594512405143418, "learning_rate": 1.3735425223260416e-05, "loss": 0.0112, "step": 2913 }, { "epoch": 3.9700272479564034, "grad_norm": 3.127498874601897, "learning_rate": 1.3731331331974387e-05, "loss": 0.0309, "step": 2914 }, { "epoch": 3.9713896457765667, "grad_norm": 1.2648259752017261, "learning_rate": 1.3727236714037873e-05, "loss": 0.0329, "step": 2915 }, { "epoch": 3.9727520435967305, "grad_norm": 2.0609904783205306, "learning_rate": 1.372314137024827e-05, "loss": 0.031, "step": 2916 }, { "epoch": 3.974114441416894, "grad_norm": 1.0361367602913916, "learning_rate": 1.3719045301403125e-05, "loss": 0.0062, "step": 2917 }, { "epoch": 3.975476839237057, "grad_norm": 1.7981960150267566, "learning_rate": 1.371494850830011e-05, "loss": 0.0302, "step": 2918 }, { "epoch": 3.976839237057221, "grad_norm": 2.5214574217708363, "learning_rate": 1.3710850991737048e-05, "loss": 0.0148, "step": 2919 }, { "epoch": 3.9782016348773843, "grad_norm": 0.9590528054647846, "learning_rate": 1.370675275251191e-05, "loss": 0.0221, "step": 2920 }, { "epoch": 3.9795640326975477, "grad_norm": 1.7680378401727468, "learning_rate": 1.3702653791422788e-05, "loss": 0.0179, "step": 2921 }, { "epoch": 3.9809264305177114, "grad_norm": 3.8466767309488845, "learning_rate": 1.3698554109267932e-05, "loss": 0.0284, "step": 2922 }, { "epoch": 3.982288828337875, "grad_norm": 2.0629251904665837, "learning_rate": 1.3694453706845725e-05, "loss": 0.0489, "step": 2923 }, { "epoch": 3.983651226158038, "grad_norm": 3.0822382988880634, "learning_rate": 1.369035258495469e-05, "loss": 0.0386, "step": 2924 }, { "epoch": 3.9850136239782015, "grad_norm": 4.55774293744247, "learning_rate": 1.3686250744393492e-05, "loss": 0.0482, "step": 2925 }, { "epoch": 3.9863760217983653, "grad_norm": 1.9455372756760156, "learning_rate": 1.3682148185960942e-05, "loss": 0.0237, "step": 2926 }, { "epoch": 3.9877384196185286, "grad_norm": 2.850414839906306, "learning_rate": 1.3678044910455975e-05, "loss": 0.0405, "step": 2927 }, { "epoch": 3.989100817438692, "grad_norm": 3.2966090190871062, "learning_rate": 1.3673940918677678e-05, "loss": 0.0375, "step": 2928 }, { "epoch": 3.9904632152588557, "grad_norm": 2.7304261456102914, "learning_rate": 1.3669836211425276e-05, "loss": 0.0443, "step": 2929 }, { "epoch": 3.991825613079019, "grad_norm": 1.6505180233085812, "learning_rate": 1.366573078949813e-05, "loss": 0.0473, "step": 2930 }, { "epoch": 3.9931880108991824, "grad_norm": 4.315125035804415, "learning_rate": 1.3661624653695741e-05, "loss": 0.0556, "step": 2931 }, { "epoch": 3.994550408719346, "grad_norm": 2.766231550265839, "learning_rate": 1.365751780481776e-05, "loss": 0.0329, "step": 2932 }, { "epoch": 3.9959128065395095, "grad_norm": 2.017131278921111, "learning_rate": 1.3653410243663953e-05, "loss": 0.0272, "step": 2933 }, { "epoch": 3.997275204359673, "grad_norm": 3.1646273323276803, "learning_rate": 1.3649301971034243e-05, "loss": 0.0459, "step": 2934 }, { "epoch": 3.9986376021798367, "grad_norm": 4.215314441780194, "learning_rate": 1.3645192987728693e-05, "loss": 0.0417, "step": 2935 }, { "epoch": 4.0, "grad_norm": 2.38468689275733, "learning_rate": 1.3641083294547492e-05, "loss": 0.0333, "step": 2936 }, { "epoch": 4.0, "eval_accuracy": 0.9410443571027512, "eval_f1": 0.9259864469476304, "eval_loss": 0.10676796734333038, "eval_precision": 0.9195241266601705, "eval_recall": 0.9416976917349218, "eval_runtime": 17.3105, "eval_samples_per_second": 102.886, "eval_steps_per_second": 0.809, "step": 2936 }, { "epoch": 4.001362397820164, "grad_norm": 4.523751962851928, "learning_rate": 1.3636972892290973e-05, "loss": 0.0236, "step": 2937 }, { "epoch": 4.002724795640327, "grad_norm": 3.991625014868774, "learning_rate": 1.3632861781759619e-05, "loss": 0.0117, "step": 2938 }, { "epoch": 4.0040871934604905, "grad_norm": 1.0063047563612024, "learning_rate": 1.3628749963754026e-05, "loss": 0.0186, "step": 2939 }, { "epoch": 4.005449591280654, "grad_norm": 3.6655616881358823, "learning_rate": 1.3624637439074949e-05, "loss": 0.0155, "step": 2940 }, { "epoch": 4.006811989100817, "grad_norm": 4.3178558892110575, "learning_rate": 1.3620524208523269e-05, "loss": 0.0141, "step": 2941 }, { "epoch": 4.008174386920981, "grad_norm": 1.600443191017939, "learning_rate": 1.3616410272900014e-05, "loss": 0.0033, "step": 2942 }, { "epoch": 4.009536784741145, "grad_norm": 4.863459900931282, "learning_rate": 1.3612295633006336e-05, "loss": 0.0155, "step": 2943 }, { "epoch": 4.010899182561308, "grad_norm": 1.7775675077088327, "learning_rate": 1.3608180289643545e-05, "loss": 0.0066, "step": 2944 }, { "epoch": 4.012261580381471, "grad_norm": 1.6501949202563804, "learning_rate": 1.3604064243613063e-05, "loss": 0.0126, "step": 2945 }, { "epoch": 4.013623978201635, "grad_norm": 3.7076646724888307, "learning_rate": 1.3599947495716466e-05, "loss": 0.0082, "step": 2946 }, { "epoch": 4.014986376021798, "grad_norm": 1.899595262765503, "learning_rate": 1.359583004675546e-05, "loss": 0.0235, "step": 2947 }, { "epoch": 4.016348773841962, "grad_norm": 2.060575339060402, "learning_rate": 1.359171189753189e-05, "loss": 0.0168, "step": 2948 }, { "epoch": 4.017711171662126, "grad_norm": 3.4059039180245936, "learning_rate": 1.3587593048847735e-05, "loss": 0.0524, "step": 2949 }, { "epoch": 4.0190735694822886, "grad_norm": 1.4220912403275994, "learning_rate": 1.358347350150512e-05, "loss": 0.0136, "step": 2950 }, { "epoch": 4.020435967302452, "grad_norm": 1.4443386120513524, "learning_rate": 1.3579353256306287e-05, "loss": 0.0228, "step": 2951 }, { "epoch": 4.021798365122616, "grad_norm": 1.3150053272342492, "learning_rate": 1.3575232314053628e-05, "loss": 0.0259, "step": 2952 }, { "epoch": 4.023160762942779, "grad_norm": 0.9894954733000272, "learning_rate": 1.357111067554967e-05, "loss": 0.011, "step": 2953 }, { "epoch": 4.024523160762943, "grad_norm": 2.034427498622662, "learning_rate": 1.3566988341597069e-05, "loss": 0.0079, "step": 2954 }, { "epoch": 4.025885558583107, "grad_norm": 2.3761004982148837, "learning_rate": 1.3562865312998628e-05, "loss": 0.0298, "step": 2955 }, { "epoch": 4.0272479564032695, "grad_norm": 2.4034719589462132, "learning_rate": 1.355874159055727e-05, "loss": 0.0124, "step": 2956 }, { "epoch": 4.028610354223433, "grad_norm": 1.8946961313024622, "learning_rate": 1.3554617175076064e-05, "loss": 0.0111, "step": 2957 }, { "epoch": 4.029972752043597, "grad_norm": 1.4500438948235135, "learning_rate": 1.355049206735821e-05, "loss": 0.0115, "step": 2958 }, { "epoch": 4.03133514986376, "grad_norm": 2.4957451294943245, "learning_rate": 1.3546366268207043e-05, "loss": 0.0095, "step": 2959 }, { "epoch": 4.032697547683924, "grad_norm": 1.4213196093166778, "learning_rate": 1.3542239778426034e-05, "loss": 0.0078, "step": 2960 }, { "epoch": 4.0340599455040875, "grad_norm": 3.095531133078659, "learning_rate": 1.3538112598818791e-05, "loss": 0.0392, "step": 2961 }, { "epoch": 4.03542234332425, "grad_norm": 0.615817305837346, "learning_rate": 1.3533984730189049e-05, "loss": 0.0087, "step": 2962 }, { "epoch": 4.036784741144414, "grad_norm": 1.6630905674028362, "learning_rate": 1.3529856173340683e-05, "loss": 0.0067, "step": 2963 }, { "epoch": 4.038147138964578, "grad_norm": 1.4463470501840632, "learning_rate": 1.35257269290777e-05, "loss": 0.0108, "step": 2964 }, { "epoch": 4.039509536784741, "grad_norm": 0.9956222266473368, "learning_rate": 1.3521596998204242e-05, "loss": 0.0136, "step": 2965 }, { "epoch": 4.040871934604905, "grad_norm": 0.6077679779771097, "learning_rate": 1.3517466381524582e-05, "loss": 0.0148, "step": 2966 }, { "epoch": 4.0422343324250685, "grad_norm": 1.195908449224647, "learning_rate": 1.351333507984313e-05, "loss": 0.0152, "step": 2967 }, { "epoch": 4.043596730245231, "grad_norm": 2.635681231203456, "learning_rate": 1.3509203093964429e-05, "loss": 0.0234, "step": 2968 }, { "epoch": 4.044959128065395, "grad_norm": 1.2868954130996721, "learning_rate": 1.3505070424693152e-05, "loss": 0.011, "step": 2969 }, { "epoch": 4.046321525885559, "grad_norm": 2.6846869801457687, "learning_rate": 1.3500937072834108e-05, "loss": 0.0154, "step": 2970 }, { "epoch": 4.047683923705722, "grad_norm": 1.164212617272656, "learning_rate": 1.3496803039192237e-05, "loss": 0.009, "step": 2971 }, { "epoch": 4.049046321525886, "grad_norm": 2.3916319334636915, "learning_rate": 1.3492668324572615e-05, "loss": 0.0137, "step": 2972 }, { "epoch": 4.050408719346049, "grad_norm": 1.8359292458445327, "learning_rate": 1.3488532929780449e-05, "loss": 0.0196, "step": 2973 }, { "epoch": 4.051771117166212, "grad_norm": 1.9365153354613804, "learning_rate": 1.3484396855621075e-05, "loss": 0.0024, "step": 2974 }, { "epoch": 4.053133514986376, "grad_norm": 2.1886186026707675, "learning_rate": 1.3480260102899967e-05, "loss": 0.0274, "step": 2975 }, { "epoch": 4.05449591280654, "grad_norm": 1.2818949062220977, "learning_rate": 1.3476122672422728e-05, "loss": 0.0455, "step": 2976 }, { "epoch": 4.055858310626703, "grad_norm": 1.5813027064760232, "learning_rate": 1.347198456499509e-05, "loss": 0.0242, "step": 2977 }, { "epoch": 4.0572207084468666, "grad_norm": 3.0994585866718793, "learning_rate": 1.3467845781422924e-05, "loss": 0.0443, "step": 2978 }, { "epoch": 4.05858310626703, "grad_norm": 1.3140158672849038, "learning_rate": 1.3463706322512229e-05, "loss": 0.0093, "step": 2979 }, { "epoch": 4.059945504087193, "grad_norm": 1.862411820161422, "learning_rate": 1.3459566189069134e-05, "loss": 0.011, "step": 2980 }, { "epoch": 4.061307901907357, "grad_norm": 3.417986844196657, "learning_rate": 1.34554253818999e-05, "loss": 0.0254, "step": 2981 }, { "epoch": 4.062670299727521, "grad_norm": 2.352789095370666, "learning_rate": 1.3451283901810923e-05, "loss": 0.0161, "step": 2982 }, { "epoch": 4.064032697547684, "grad_norm": 3.644798729696446, "learning_rate": 1.3447141749608723e-05, "loss": 0.0125, "step": 2983 }, { "epoch": 4.0653950953678475, "grad_norm": 2.289178785815204, "learning_rate": 1.3442998926099959e-05, "loss": 0.0173, "step": 2984 }, { "epoch": 4.066757493188011, "grad_norm": 1.8102957895759133, "learning_rate": 1.3438855432091416e-05, "loss": 0.0275, "step": 2985 }, { "epoch": 4.068119891008174, "grad_norm": 2.6636355735238024, "learning_rate": 1.3434711268390008e-05, "loss": 0.0102, "step": 2986 }, { "epoch": 4.069482288828338, "grad_norm": 2.658424620028336, "learning_rate": 1.3430566435802783e-05, "loss": 0.024, "step": 2987 }, { "epoch": 4.070844686648502, "grad_norm": 1.5172842400439726, "learning_rate": 1.3426420935136917e-05, "loss": 0.0174, "step": 2988 }, { "epoch": 4.072207084468665, "grad_norm": 2.9593297461988213, "learning_rate": 1.3422274767199718e-05, "loss": 0.0052, "step": 2989 }, { "epoch": 4.073569482288828, "grad_norm": 3.251581344661195, "learning_rate": 1.3418127932798623e-05, "loss": 0.0258, "step": 2990 }, { "epoch": 4.074931880108992, "grad_norm": 0.5955170878864294, "learning_rate": 1.34139804327412e-05, "loss": 0.0041, "step": 2991 }, { "epoch": 4.076294277929155, "grad_norm": 3.340963026522464, "learning_rate": 1.3409832267835144e-05, "loss": 0.0185, "step": 2992 }, { "epoch": 4.077656675749319, "grad_norm": 2.2651708864702726, "learning_rate": 1.3405683438888281e-05, "loss": 0.0139, "step": 2993 }, { "epoch": 4.079019073569483, "grad_norm": 2.0783720329169415, "learning_rate": 1.3401533946708565e-05, "loss": 0.0566, "step": 2994 }, { "epoch": 4.080381471389646, "grad_norm": 3.5483698839275104, "learning_rate": 1.3397383792104082e-05, "loss": 0.0243, "step": 2995 }, { "epoch": 4.081743869209809, "grad_norm": 2.1772165552916065, "learning_rate": 1.3393232975883044e-05, "loss": 0.011, "step": 2996 }, { "epoch": 4.083106267029973, "grad_norm": 1.68022259068815, "learning_rate": 1.3389081498853795e-05, "loss": 0.0202, "step": 2997 }, { "epoch": 4.084468664850136, "grad_norm": 2.7570764101499896, "learning_rate": 1.3384929361824805e-05, "loss": 0.0161, "step": 2998 }, { "epoch": 4.0858310626703, "grad_norm": 1.260665310973173, "learning_rate": 1.3380776565604677e-05, "loss": 0.0186, "step": 2999 }, { "epoch": 4.087193460490464, "grad_norm": 2.7611922949339442, "learning_rate": 1.3376623111002127e-05, "loss": 0.0353, "step": 3000 }, { "epoch": 4.0885558583106265, "grad_norm": 3.699577906070448, "learning_rate": 1.3372468998826026e-05, "loss": 0.0252, "step": 3001 }, { "epoch": 4.08991825613079, "grad_norm": 2.6146959207864477, "learning_rate": 1.3368314229885349e-05, "loss": 0.0149, "step": 3002 }, { "epoch": 4.091280653950954, "grad_norm": 3.3099166157141946, "learning_rate": 1.3364158804989212e-05, "loss": 0.0227, "step": 3003 }, { "epoch": 4.092643051771117, "grad_norm": 2.8370714509477946, "learning_rate": 1.336000272494685e-05, "loss": 0.0258, "step": 3004 }, { "epoch": 4.094005449591281, "grad_norm": 1.2541817465599432, "learning_rate": 1.3355845990567635e-05, "loss": 0.0113, "step": 3005 }, { "epoch": 4.0953678474114446, "grad_norm": 2.053695751176839, "learning_rate": 1.3351688602661059e-05, "loss": 0.0338, "step": 3006 }, { "epoch": 4.0967302452316074, "grad_norm": 3.6536562934327566, "learning_rate": 1.3347530562036746e-05, "loss": 0.0222, "step": 3007 }, { "epoch": 4.098092643051771, "grad_norm": 3.206020156498966, "learning_rate": 1.3343371869504444e-05, "loss": 0.0472, "step": 3008 }, { "epoch": 4.099455040871935, "grad_norm": 2.74633132037176, "learning_rate": 1.3339212525874031e-05, "loss": 0.0281, "step": 3009 }, { "epoch": 4.100817438692098, "grad_norm": 1.5458513688990947, "learning_rate": 1.3335052531955506e-05, "loss": 0.0216, "step": 3010 }, { "epoch": 4.102179836512262, "grad_norm": 1.5769658939354063, "learning_rate": 1.3330891888559002e-05, "loss": 0.0095, "step": 3011 }, { "epoch": 4.1035422343324255, "grad_norm": 1.3838956610760806, "learning_rate": 1.332673059649477e-05, "loss": 0.011, "step": 3012 }, { "epoch": 4.104904632152588, "grad_norm": 1.7348115070795773, "learning_rate": 1.3322568656573199e-05, "loss": 0.0058, "step": 3013 }, { "epoch": 4.106267029972752, "grad_norm": 2.634588267653392, "learning_rate": 1.3318406069604794e-05, "loss": 0.0121, "step": 3014 }, { "epoch": 4.107629427792916, "grad_norm": 0.8929956443487588, "learning_rate": 1.3314242836400189e-05, "loss": 0.0127, "step": 3015 }, { "epoch": 4.108991825613079, "grad_norm": 1.5274844996143824, "learning_rate": 1.3310078957770147e-05, "loss": 0.0018, "step": 3016 }, { "epoch": 4.110354223433243, "grad_norm": 1.550514331892911, "learning_rate": 1.3305914434525551e-05, "loss": 0.0035, "step": 3017 }, { "epoch": 4.111716621253406, "grad_norm": 1.4556654746367335, "learning_rate": 1.3301749267477413e-05, "loss": 0.0118, "step": 3018 }, { "epoch": 4.113079019073569, "grad_norm": 2.467452590489836, "learning_rate": 1.3297583457436872e-05, "loss": 0.0118, "step": 3019 }, { "epoch": 4.114441416893733, "grad_norm": 0.6133602994020068, "learning_rate": 1.3293417005215187e-05, "loss": 0.0157, "step": 3020 }, { "epoch": 4.115803814713897, "grad_norm": 1.6097432804332301, "learning_rate": 1.3289249911623747e-05, "loss": 0.0106, "step": 3021 }, { "epoch": 4.11716621253406, "grad_norm": 1.7148301249447253, "learning_rate": 1.3285082177474068e-05, "loss": 0.0084, "step": 3022 }, { "epoch": 4.118528610354224, "grad_norm": 1.9947053252694893, "learning_rate": 1.3280913803577781e-05, "loss": 0.0165, "step": 3023 }, { "epoch": 4.1198910081743865, "grad_norm": 0.95497240290156, "learning_rate": 1.3276744790746647e-05, "loss": 0.0062, "step": 3024 }, { "epoch": 4.12125340599455, "grad_norm": 1.1564570147524194, "learning_rate": 1.3272575139792556e-05, "loss": 0.023, "step": 3025 }, { "epoch": 4.122615803814714, "grad_norm": 2.2409403187418553, "learning_rate": 1.3268404851527518e-05, "loss": 0.0162, "step": 3026 }, { "epoch": 4.123978201634877, "grad_norm": 1.1502170761803403, "learning_rate": 1.3264233926763663e-05, "loss": 0.0229, "step": 3027 }, { "epoch": 4.125340599455041, "grad_norm": 1.575860427259944, "learning_rate": 1.3260062366313255e-05, "loss": 0.0194, "step": 3028 }, { "epoch": 4.1267029972752045, "grad_norm": 1.861406512028515, "learning_rate": 1.325589017098867e-05, "loss": 0.0139, "step": 3029 }, { "epoch": 4.128065395095367, "grad_norm": 1.7426759547429838, "learning_rate": 1.325171734160242e-05, "loss": 0.0172, "step": 3030 }, { "epoch": 4.129427792915531, "grad_norm": 2.4344919185712257, "learning_rate": 1.3247543878967127e-05, "loss": 0.0261, "step": 3031 }, { "epoch": 4.130790190735695, "grad_norm": 1.333015338441437, "learning_rate": 1.3243369783895548e-05, "loss": 0.0215, "step": 3032 }, { "epoch": 4.132152588555858, "grad_norm": 2.454525774874537, "learning_rate": 1.3239195057200558e-05, "loss": 0.0117, "step": 3033 }, { "epoch": 4.133514986376022, "grad_norm": 1.2528517639018357, "learning_rate": 1.3235019699695157e-05, "loss": 0.0263, "step": 3034 }, { "epoch": 4.1348773841961854, "grad_norm": 1.9341931347358143, "learning_rate": 1.3230843712192463e-05, "loss": 0.036, "step": 3035 }, { "epoch": 4.136239782016348, "grad_norm": 2.052120194769729, "learning_rate": 1.3226667095505722e-05, "loss": 0.0096, "step": 3036 }, { "epoch": 4.137602179836512, "grad_norm": 0.5478748333091819, "learning_rate": 1.3222489850448303e-05, "loss": 0.0098, "step": 3037 }, { "epoch": 4.138964577656676, "grad_norm": 1.4052000316537072, "learning_rate": 1.321831197783369e-05, "loss": 0.0117, "step": 3038 }, { "epoch": 4.140326975476839, "grad_norm": 1.4955192927928749, "learning_rate": 1.3214133478475496e-05, "loss": 0.0378, "step": 3039 }, { "epoch": 4.141689373297003, "grad_norm": 0.8121275737419278, "learning_rate": 1.320995435318746e-05, "loss": 0.0029, "step": 3040 }, { "epoch": 4.143051771117166, "grad_norm": 0.9201913365815286, "learning_rate": 1.3205774602783428e-05, "loss": 0.0064, "step": 3041 }, { "epoch": 4.144414168937329, "grad_norm": 1.9908623415490623, "learning_rate": 1.3201594228077383e-05, "loss": 0.0105, "step": 3042 }, { "epoch": 4.145776566757493, "grad_norm": 0.8010989703539293, "learning_rate": 1.3197413229883423e-05, "loss": 0.0076, "step": 3043 }, { "epoch": 4.147138964577657, "grad_norm": 1.640126750400451, "learning_rate": 1.3193231609015762e-05, "loss": 0.0476, "step": 3044 }, { "epoch": 4.14850136239782, "grad_norm": 1.1637509164913713, "learning_rate": 1.318904936628875e-05, "loss": 0.0149, "step": 3045 }, { "epoch": 4.1498637602179835, "grad_norm": 0.6966791390144302, "learning_rate": 1.3184866502516846e-05, "loss": 0.0044, "step": 3046 }, { "epoch": 4.151226158038147, "grad_norm": 1.301220285746218, "learning_rate": 1.318068301851463e-05, "loss": 0.0164, "step": 3047 }, { "epoch": 4.15258855585831, "grad_norm": 3.029623194702931, "learning_rate": 1.3176498915096811e-05, "loss": 0.0467, "step": 3048 }, { "epoch": 4.153950953678474, "grad_norm": 0.9907912405972185, "learning_rate": 1.317231419307821e-05, "loss": 0.0242, "step": 3049 }, { "epoch": 4.155313351498638, "grad_norm": 2.828592786288818, "learning_rate": 1.3168128853273771e-05, "loss": 0.0389, "step": 3050 }, { "epoch": 4.156675749318801, "grad_norm": 2.9349567639216985, "learning_rate": 1.3163942896498564e-05, "loss": 0.0041, "step": 3051 }, { "epoch": 4.1580381471389645, "grad_norm": 0.9766695448028804, "learning_rate": 1.3159756323567775e-05, "loss": 0.014, "step": 3052 }, { "epoch": 4.159400544959128, "grad_norm": 4.111170323857593, "learning_rate": 1.3155569135296705e-05, "loss": 0.0218, "step": 3053 }, { "epoch": 4.160762942779291, "grad_norm": 2.02646899945706, "learning_rate": 1.3151381332500782e-05, "loss": 0.0186, "step": 3054 }, { "epoch": 4.162125340599455, "grad_norm": 2.4885315650067152, "learning_rate": 1.3147192915995546e-05, "loss": 0.0133, "step": 3055 }, { "epoch": 4.163487738419619, "grad_norm": 3.399073276821763, "learning_rate": 1.314300388659667e-05, "loss": 0.0276, "step": 3056 }, { "epoch": 4.164850136239782, "grad_norm": 1.9106858412812264, "learning_rate": 1.3138814245119936e-05, "loss": 0.0305, "step": 3057 }, { "epoch": 4.166212534059945, "grad_norm": 3.391562680439195, "learning_rate": 1.3134623992381242e-05, "loss": 0.031, "step": 3058 }, { "epoch": 4.167574931880109, "grad_norm": 3.740858656516221, "learning_rate": 1.3130433129196613e-05, "loss": 0.0081, "step": 3059 }, { "epoch": 4.168937329700272, "grad_norm": 3.1955006199240352, "learning_rate": 1.3126241656382192e-05, "loss": 0.029, "step": 3060 }, { "epoch": 4.170299727520436, "grad_norm": 2.3971557753394803, "learning_rate": 1.3122049574754235e-05, "loss": 0.0073, "step": 3061 }, { "epoch": 4.1716621253406, "grad_norm": 1.8094147249651178, "learning_rate": 1.3117856885129125e-05, "loss": 0.0244, "step": 3062 }, { "epoch": 4.1730245231607626, "grad_norm": 2.3242456265850486, "learning_rate": 1.3113663588323356e-05, "loss": 0.0252, "step": 3063 }, { "epoch": 4.174386920980926, "grad_norm": 2.320969983571576, "learning_rate": 1.3109469685153544e-05, "loss": 0.0225, "step": 3064 }, { "epoch": 4.17574931880109, "grad_norm": 2.2566398138029697, "learning_rate": 1.310527517643642e-05, "loss": 0.0228, "step": 3065 }, { "epoch": 4.177111716621253, "grad_norm": 2.688417630971107, "learning_rate": 1.3101080062988839e-05, "loss": 0.0082, "step": 3066 }, { "epoch": 4.178474114441417, "grad_norm": 2.0002006093621163, "learning_rate": 1.3096884345627763e-05, "loss": 0.023, "step": 3067 }, { "epoch": 4.179836512261581, "grad_norm": 1.2359618268793948, "learning_rate": 1.3092688025170283e-05, "loss": 0.0198, "step": 3068 }, { "epoch": 4.1811989100817435, "grad_norm": 4.1101576982460575, "learning_rate": 1.3088491102433607e-05, "loss": 0.024, "step": 3069 }, { "epoch": 4.182561307901907, "grad_norm": 1.3910350450893105, "learning_rate": 1.3084293578235048e-05, "loss": 0.0083, "step": 3070 }, { "epoch": 4.183923705722071, "grad_norm": 1.9252196307840126, "learning_rate": 1.308009545339205e-05, "loss": 0.0253, "step": 3071 }, { "epoch": 4.185286103542234, "grad_norm": 0.88550564082499, "learning_rate": 1.3075896728722164e-05, "loss": 0.0142, "step": 3072 }, { "epoch": 4.186648501362398, "grad_norm": 1.26957231420453, "learning_rate": 1.3071697405043065e-05, "loss": 0.0072, "step": 3073 }, { "epoch": 4.1880108991825615, "grad_norm": 1.402398106045797, "learning_rate": 1.306749748317254e-05, "loss": 0.0179, "step": 3074 }, { "epoch": 4.189373297002724, "grad_norm": 1.0901131539070466, "learning_rate": 1.3063296963928496e-05, "loss": 0.0086, "step": 3075 }, { "epoch": 4.190735694822888, "grad_norm": 0.7990348721443177, "learning_rate": 1.3059095848128953e-05, "loss": 0.0236, "step": 3076 }, { "epoch": 4.192098092643052, "grad_norm": 2.417436651603479, "learning_rate": 1.3054894136592052e-05, "loss": 0.0145, "step": 3077 }, { "epoch": 4.193460490463215, "grad_norm": 1.4526279002938307, "learning_rate": 1.3050691830136042e-05, "loss": 0.0236, "step": 3078 }, { "epoch": 4.194822888283379, "grad_norm": 0.4870131245570978, "learning_rate": 1.3046488929579293e-05, "loss": 0.0093, "step": 3079 }, { "epoch": 4.1961852861035425, "grad_norm": 1.8036274962149488, "learning_rate": 1.3042285435740292e-05, "loss": 0.0257, "step": 3080 }, { "epoch": 4.197547683923705, "grad_norm": 0.5474100809636573, "learning_rate": 1.3038081349437641e-05, "loss": 0.0189, "step": 3081 }, { "epoch": 4.198910081743869, "grad_norm": 1.7597616694231097, "learning_rate": 1.3033876671490055e-05, "loss": 0.025, "step": 3082 }, { "epoch": 4.200272479564033, "grad_norm": 1.7088508594325584, "learning_rate": 1.3029671402716366e-05, "loss": 0.0142, "step": 3083 }, { "epoch": 4.201634877384196, "grad_norm": 1.2452336993485242, "learning_rate": 1.3025465543935516e-05, "loss": 0.0108, "step": 3084 }, { "epoch": 4.20299727520436, "grad_norm": 1.7841693287131204, "learning_rate": 1.3021259095966574e-05, "loss": 0.0119, "step": 3085 }, { "epoch": 4.204359673024523, "grad_norm": 1.9490337149298496, "learning_rate": 1.301705205962871e-05, "loss": 0.0168, "step": 3086 }, { "epoch": 4.205722070844686, "grad_norm": 2.169639705813366, "learning_rate": 1.3012844435741218e-05, "loss": 0.04, "step": 3087 }, { "epoch": 4.20708446866485, "grad_norm": 1.2644463212587251, "learning_rate": 1.3008636225123501e-05, "loss": 0.0121, "step": 3088 }, { "epoch": 4.208446866485014, "grad_norm": 2.473005021970001, "learning_rate": 1.300442742859508e-05, "loss": 0.011, "step": 3089 }, { "epoch": 4.209809264305177, "grad_norm": 0.7252581306420103, "learning_rate": 1.3000218046975586e-05, "loss": 0.0269, "step": 3090 }, { "epoch": 4.2111716621253406, "grad_norm": 0.9359804179702238, "learning_rate": 1.299600808108477e-05, "loss": 0.0129, "step": 3091 }, { "epoch": 4.212534059945504, "grad_norm": 1.7788256110317981, "learning_rate": 1.2991797531742492e-05, "loss": 0.0054, "step": 3092 }, { "epoch": 4.213896457765667, "grad_norm": 1.5069673915137813, "learning_rate": 1.2987586399768723e-05, "loss": 0.0188, "step": 3093 }, { "epoch": 4.215258855585831, "grad_norm": 1.6830513998963397, "learning_rate": 1.2983374685983558e-05, "loss": 0.0036, "step": 3094 }, { "epoch": 4.216621253405995, "grad_norm": 2.851660349996638, "learning_rate": 1.2979162391207193e-05, "loss": 0.0281, "step": 3095 }, { "epoch": 4.217983651226158, "grad_norm": 4.186755704392417, "learning_rate": 1.2974949516259945e-05, "loss": 0.0227, "step": 3096 }, { "epoch": 4.2193460490463215, "grad_norm": 1.2905598073276037, "learning_rate": 1.2970736061962242e-05, "loss": 0.0128, "step": 3097 }, { "epoch": 4.220708446866485, "grad_norm": 1.77825588092785, "learning_rate": 1.2966522029134622e-05, "loss": 0.0063, "step": 3098 }, { "epoch": 4.222070844686648, "grad_norm": 2.347698605706879, "learning_rate": 1.2962307418597744e-05, "loss": 0.0215, "step": 3099 }, { "epoch": 4.223433242506812, "grad_norm": 1.870148712707033, "learning_rate": 1.2958092231172368e-05, "loss": 0.0338, "step": 3100 }, { "epoch": 4.224795640326976, "grad_norm": 3.569352438234793, "learning_rate": 1.2953876467679373e-05, "loss": 0.0332, "step": 3101 }, { "epoch": 4.226158038147139, "grad_norm": 3.559616906416121, "learning_rate": 1.294966012893975e-05, "loss": 0.0291, "step": 3102 }, { "epoch": 4.227520435967302, "grad_norm": 2.453195704597571, "learning_rate": 1.2945443215774602e-05, "loss": 0.0391, "step": 3103 }, { "epoch": 4.228882833787466, "grad_norm": 4.911021910446651, "learning_rate": 1.2941225729005144e-05, "loss": 0.0345, "step": 3104 }, { "epoch": 4.230245231607629, "grad_norm": 2.1978035095229473, "learning_rate": 1.2937007669452699e-05, "loss": 0.0147, "step": 3105 }, { "epoch": 4.231607629427793, "grad_norm": 2.4909170018696574, "learning_rate": 1.2932789037938706e-05, "loss": 0.0266, "step": 3106 }, { "epoch": 4.232970027247957, "grad_norm": 4.612949552573333, "learning_rate": 1.2928569835284714e-05, "loss": 0.0281, "step": 3107 }, { "epoch": 4.23433242506812, "grad_norm": 2.80037911922463, "learning_rate": 1.2924350062312381e-05, "loss": 0.0088, "step": 3108 }, { "epoch": 4.235694822888283, "grad_norm": 4.551887577072054, "learning_rate": 1.2920129719843479e-05, "loss": 0.0385, "step": 3109 }, { "epoch": 4.237057220708447, "grad_norm": 4.875058742158277, "learning_rate": 1.2915908808699893e-05, "loss": 0.0404, "step": 3110 }, { "epoch": 4.23841961852861, "grad_norm": 1.2267281098195102, "learning_rate": 1.2911687329703612e-05, "loss": 0.0173, "step": 3111 }, { "epoch": 4.239782016348774, "grad_norm": 3.5386552797559183, "learning_rate": 1.2907465283676744e-05, "loss": 0.0181, "step": 3112 }, { "epoch": 4.241144414168938, "grad_norm": 3.878838974358458, "learning_rate": 1.2903242671441495e-05, "loss": 0.0176, "step": 3113 }, { "epoch": 4.2425068119891005, "grad_norm": 0.8899091917435672, "learning_rate": 1.2899019493820193e-05, "loss": 0.0037, "step": 3114 }, { "epoch": 4.243869209809264, "grad_norm": 5.618220713769656, "learning_rate": 1.2894795751635274e-05, "loss": 0.0316, "step": 3115 }, { "epoch": 4.245231607629428, "grad_norm": 1.9576154920996416, "learning_rate": 1.289057144570928e-05, "loss": 0.0208, "step": 3116 }, { "epoch": 4.246594005449591, "grad_norm": 2.084661212536652, "learning_rate": 1.2886346576864868e-05, "loss": 0.018, "step": 3117 }, { "epoch": 4.247956403269755, "grad_norm": 4.327887582980022, "learning_rate": 1.2882121145924795e-05, "loss": 0.0124, "step": 3118 }, { "epoch": 4.2493188010899186, "grad_norm": 3.1376378286867976, "learning_rate": 1.2877895153711935e-05, "loss": 0.0121, "step": 3119 }, { "epoch": 4.2506811989100814, "grad_norm": 3.3218552403898447, "learning_rate": 1.2873668601049278e-05, "loss": 0.0149, "step": 3120 }, { "epoch": 4.252043596730245, "grad_norm": 5.609482621538234, "learning_rate": 1.2869441488759908e-05, "loss": 0.0121, "step": 3121 }, { "epoch": 4.253405994550409, "grad_norm": 1.2265976044927884, "learning_rate": 1.2865213817667024e-05, "loss": 0.0149, "step": 3122 }, { "epoch": 4.254768392370572, "grad_norm": 3.5047909318078982, "learning_rate": 1.2860985588593944e-05, "loss": 0.0287, "step": 3123 }, { "epoch": 4.256130790190736, "grad_norm": 5.91573692419685, "learning_rate": 1.2856756802364079e-05, "loss": 0.0412, "step": 3124 }, { "epoch": 4.2574931880108995, "grad_norm": 2.776933886641956, "learning_rate": 1.2852527459800954e-05, "loss": 0.0369, "step": 3125 }, { "epoch": 4.258855585831062, "grad_norm": 5.1851066924535765, "learning_rate": 1.2848297561728209e-05, "loss": 0.0565, "step": 3126 }, { "epoch": 4.260217983651226, "grad_norm": 4.729015225159831, "learning_rate": 1.2844067108969585e-05, "loss": 0.0112, "step": 3127 }, { "epoch": 4.26158038147139, "grad_norm": 1.7150005363842435, "learning_rate": 1.2839836102348926e-05, "loss": 0.0185, "step": 3128 }, { "epoch": 4.262942779291553, "grad_norm": 2.9518758876346967, "learning_rate": 1.2835604542690203e-05, "loss": 0.0249, "step": 3129 }, { "epoch": 4.264305177111717, "grad_norm": 3.287354014183834, "learning_rate": 1.2831372430817476e-05, "loss": 0.0303, "step": 3130 }, { "epoch": 4.26566757493188, "grad_norm": 3.0593395955971046, "learning_rate": 1.2827139767554915e-05, "loss": 0.0279, "step": 3131 }, { "epoch": 4.267029972752043, "grad_norm": 4.807591781722683, "learning_rate": 1.2822906553726809e-05, "loss": 0.0164, "step": 3132 }, { "epoch": 4.268392370572207, "grad_norm": 5.476907975237246, "learning_rate": 1.2818672790157543e-05, "loss": 0.021, "step": 3133 }, { "epoch": 4.269754768392371, "grad_norm": 1.96189435415317, "learning_rate": 1.281443847767161e-05, "loss": 0.0376, "step": 3134 }, { "epoch": 4.271117166212534, "grad_norm": 6.085045375862789, "learning_rate": 1.2810203617093615e-05, "loss": 0.0089, "step": 3135 }, { "epoch": 4.272479564032698, "grad_norm": 3.962450839239248, "learning_rate": 1.2805968209248275e-05, "loss": 0.0155, "step": 3136 }, { "epoch": 4.273841961852861, "grad_norm": 2.0635873563412632, "learning_rate": 1.2801732254960389e-05, "loss": 0.0292, "step": 3137 }, { "epoch": 4.275204359673024, "grad_norm": 4.837112428677244, "learning_rate": 1.2797495755054892e-05, "loss": 0.0206, "step": 3138 }, { "epoch": 4.276566757493188, "grad_norm": 4.268384575690902, "learning_rate": 1.2793258710356807e-05, "loss": 0.0296, "step": 3139 }, { "epoch": 4.277929155313352, "grad_norm": 2.50078042699945, "learning_rate": 1.2789021121691273e-05, "loss": 0.0143, "step": 3140 }, { "epoch": 4.279291553133515, "grad_norm": 3.2520656785832958, "learning_rate": 1.2784782989883528e-05, "loss": 0.0043, "step": 3141 }, { "epoch": 4.2806539509536785, "grad_norm": 2.46712423556904, "learning_rate": 1.278054431575892e-05, "loss": 0.0042, "step": 3142 }, { "epoch": 4.282016348773842, "grad_norm": 1.433596088486315, "learning_rate": 1.2776305100142898e-05, "loss": 0.0161, "step": 3143 }, { "epoch": 4.283378746594005, "grad_norm": 5.32124845772538, "learning_rate": 1.277206534386102e-05, "loss": 0.0385, "step": 3144 }, { "epoch": 4.284741144414169, "grad_norm": 1.3985017895558471, "learning_rate": 1.2767825047738948e-05, "loss": 0.0186, "step": 3145 }, { "epoch": 4.286103542234333, "grad_norm": 4.455393861882215, "learning_rate": 1.2763584212602453e-05, "loss": 0.0119, "step": 3146 }, { "epoch": 4.287465940054496, "grad_norm": 2.60674383975954, "learning_rate": 1.2759342839277407e-05, "loss": 0.0131, "step": 3147 }, { "epoch": 4.2888283378746594, "grad_norm": 0.5237238095080826, "learning_rate": 1.2755100928589785e-05, "loss": 0.0098, "step": 3148 }, { "epoch": 4.290190735694823, "grad_norm": 4.552624414761127, "learning_rate": 1.2750858481365673e-05, "loss": 0.0302, "step": 3149 }, { "epoch": 4.291553133514986, "grad_norm": 1.1539232622817128, "learning_rate": 1.2746615498431254e-05, "loss": 0.0208, "step": 3150 }, { "epoch": 4.29291553133515, "grad_norm": 2.384285449023448, "learning_rate": 1.274237198061282e-05, "loss": 0.0102, "step": 3151 }, { "epoch": 4.294277929155314, "grad_norm": 3.309796470195663, "learning_rate": 1.2738127928736766e-05, "loss": 0.02, "step": 3152 }, { "epoch": 4.295640326975477, "grad_norm": 1.0009493231802302, "learning_rate": 1.2733883343629593e-05, "loss": 0.0086, "step": 3153 }, { "epoch": 4.29700272479564, "grad_norm": 2.638747945270971, "learning_rate": 1.2729638226117905e-05, "loss": 0.0204, "step": 3154 }, { "epoch": 4.298365122615804, "grad_norm": 3.335220326687753, "learning_rate": 1.2725392577028403e-05, "loss": 0.041, "step": 3155 }, { "epoch": 4.299727520435967, "grad_norm": 3.137951222588479, "learning_rate": 1.2721146397187905e-05, "loss": 0.0258, "step": 3156 }, { "epoch": 4.301089918256131, "grad_norm": 3.946273912828588, "learning_rate": 1.2716899687423316e-05, "loss": 0.0255, "step": 3157 }, { "epoch": 4.302452316076295, "grad_norm": 1.585286114027447, "learning_rate": 1.2712652448561657e-05, "loss": 0.0165, "step": 3158 }, { "epoch": 4.3038147138964575, "grad_norm": 2.438831470532442, "learning_rate": 1.2708404681430054e-05, "loss": 0.0251, "step": 3159 }, { "epoch": 4.305177111716621, "grad_norm": 2.7517219007464386, "learning_rate": 1.270415638685572e-05, "loss": 0.0131, "step": 3160 }, { "epoch": 4.306539509536785, "grad_norm": 0.8477571921672792, "learning_rate": 1.2699907565665983e-05, "loss": 0.0207, "step": 3161 }, { "epoch": 4.307901907356948, "grad_norm": 1.5175492266820618, "learning_rate": 1.2695658218688274e-05, "loss": 0.0149, "step": 3162 }, { "epoch": 4.309264305177112, "grad_norm": 2.9140433534454564, "learning_rate": 1.269140834675012e-05, "loss": 0.0126, "step": 3163 }, { "epoch": 4.310626702997276, "grad_norm": 1.782250601732668, "learning_rate": 1.2687157950679158e-05, "loss": 0.0257, "step": 3164 }, { "epoch": 4.3119891008174385, "grad_norm": 0.7829758256079178, "learning_rate": 1.2682907031303118e-05, "loss": 0.0127, "step": 3165 }, { "epoch": 4.313351498637602, "grad_norm": 1.7759724185977535, "learning_rate": 1.267865558944984e-05, "loss": 0.0082, "step": 3166 }, { "epoch": 4.314713896457766, "grad_norm": 1.1995979662675447, "learning_rate": 1.267440362594726e-05, "loss": 0.0143, "step": 3167 }, { "epoch": 4.316076294277929, "grad_norm": 0.3253296694225978, "learning_rate": 1.2670151141623416e-05, "loss": 0.0031, "step": 3168 }, { "epoch": 4.317438692098093, "grad_norm": 1.84068524038276, "learning_rate": 1.2665898137306458e-05, "loss": 0.031, "step": 3169 }, { "epoch": 4.3188010899182565, "grad_norm": 1.56765221258631, "learning_rate": 1.266164461382462e-05, "loss": 0.0108, "step": 3170 }, { "epoch": 4.320163487738419, "grad_norm": 0.9374567683477221, "learning_rate": 1.265739057200625e-05, "loss": 0.0107, "step": 3171 }, { "epoch": 4.321525885558583, "grad_norm": 0.8761215469953005, "learning_rate": 1.265313601267979e-05, "loss": 0.0051, "step": 3172 }, { "epoch": 4.322888283378747, "grad_norm": 3.9467757615733605, "learning_rate": 1.2648880936673787e-05, "loss": 0.025, "step": 3173 }, { "epoch": 4.32425068119891, "grad_norm": 1.962054808509072, "learning_rate": 1.2644625344816883e-05, "loss": 0.0478, "step": 3174 }, { "epoch": 4.325613079019074, "grad_norm": 1.3116995297601501, "learning_rate": 1.264036923793783e-05, "loss": 0.0146, "step": 3175 }, { "epoch": 4.3269754768392374, "grad_norm": 1.1330696255468446, "learning_rate": 1.2636112616865476e-05, "loss": 0.0407, "step": 3176 }, { "epoch": 4.3283378746594, "grad_norm": 2.3128939439101637, "learning_rate": 1.2631855482428762e-05, "loss": 0.0271, "step": 3177 }, { "epoch": 4.329700272479564, "grad_norm": 1.9730517047068228, "learning_rate": 1.2627597835456742e-05, "loss": 0.0346, "step": 3178 }, { "epoch": 4.331062670299728, "grad_norm": 2.9917787723890545, "learning_rate": 1.2623339676778557e-05, "loss": 0.0293, "step": 3179 }, { "epoch": 4.332425068119891, "grad_norm": 2.5802553021843435, "learning_rate": 1.2619081007223453e-05, "loss": 0.0193, "step": 3180 }, { "epoch": 4.333787465940055, "grad_norm": 3.230795618442211, "learning_rate": 1.2614821827620777e-05, "loss": 0.0196, "step": 3181 }, { "epoch": 4.335149863760218, "grad_norm": 3.0232220908008935, "learning_rate": 1.2610562138799977e-05, "loss": 0.0147, "step": 3182 }, { "epoch": 4.336512261580381, "grad_norm": 1.457159634755722, "learning_rate": 1.2606301941590597e-05, "loss": 0.0339, "step": 3183 }, { "epoch": 4.337874659400545, "grad_norm": 2.5191061102246577, "learning_rate": 1.260204123682228e-05, "loss": 0.0137, "step": 3184 }, { "epoch": 4.339237057220709, "grad_norm": 1.580539504772457, "learning_rate": 1.2597780025324764e-05, "loss": 0.0029, "step": 3185 }, { "epoch": 4.340599455040872, "grad_norm": 2.8447646319908033, "learning_rate": 1.2593518307927894e-05, "loss": 0.022, "step": 3186 }, { "epoch": 4.3419618528610355, "grad_norm": 1.4258732961156135, "learning_rate": 1.2589256085461607e-05, "loss": 0.026, "step": 3187 }, { "epoch": 4.343324250681199, "grad_norm": 1.7710225359759106, "learning_rate": 1.2584993358755946e-05, "loss": 0.0123, "step": 3188 }, { "epoch": 4.344686648501362, "grad_norm": 3.038965607040499, "learning_rate": 1.2580730128641042e-05, "loss": 0.0151, "step": 3189 }, { "epoch": 4.346049046321526, "grad_norm": 1.9914779047269022, "learning_rate": 1.2576466395947131e-05, "loss": 0.0245, "step": 3190 }, { "epoch": 4.34741144414169, "grad_norm": 3.233252742975547, "learning_rate": 1.2572202161504543e-05, "loss": 0.0106, "step": 3191 }, { "epoch": 4.348773841961853, "grad_norm": 0.9962171355467622, "learning_rate": 1.2567937426143706e-05, "loss": 0.0103, "step": 3192 }, { "epoch": 4.3501362397820165, "grad_norm": 1.9930428820011914, "learning_rate": 1.2563672190695155e-05, "loss": 0.0122, "step": 3193 }, { "epoch": 4.35149863760218, "grad_norm": 2.9988339828914707, "learning_rate": 1.2559406455989506e-05, "loss": 0.0022, "step": 3194 }, { "epoch": 4.352861035422343, "grad_norm": 0.577563993635376, "learning_rate": 1.2555140222857488e-05, "loss": 0.0151, "step": 3195 }, { "epoch": 4.354223433242507, "grad_norm": 1.767948472759637, "learning_rate": 1.2550873492129913e-05, "loss": 0.0097, "step": 3196 }, { "epoch": 4.355585831062671, "grad_norm": 3.7013019828742837, "learning_rate": 1.25466062646377e-05, "loss": 0.0258, "step": 3197 }, { "epoch": 4.356948228882834, "grad_norm": 1.0349910982817079, "learning_rate": 1.2542338541211863e-05, "loss": 0.0271, "step": 3198 }, { "epoch": 4.358310626702997, "grad_norm": 1.7212692383868093, "learning_rate": 1.2538070322683509e-05, "loss": 0.0166, "step": 3199 }, { "epoch": 4.359673024523161, "grad_norm": 3.4668301457047197, "learning_rate": 1.2533801609883842e-05, "loss": 0.0196, "step": 3200 }, { "epoch": 4.361035422343324, "grad_norm": 0.5530794843468994, "learning_rate": 1.2529532403644171e-05, "loss": 0.0093, "step": 3201 }, { "epoch": 4.362397820163488, "grad_norm": 2.8551284210484402, "learning_rate": 1.2525262704795886e-05, "loss": 0.0097, "step": 3202 }, { "epoch": 4.363760217983652, "grad_norm": 2.3822829649087782, "learning_rate": 1.252099251417048e-05, "loss": 0.0069, "step": 3203 }, { "epoch": 4.3651226158038146, "grad_norm": 1.5406899254248934, "learning_rate": 1.251672183259955e-05, "loss": 0.0286, "step": 3204 }, { "epoch": 4.366485013623978, "grad_norm": 1.9472136557089739, "learning_rate": 1.2512450660914775e-05, "loss": 0.0346, "step": 3205 }, { "epoch": 4.367847411444142, "grad_norm": 3.842999577254071, "learning_rate": 1.2508178999947935e-05, "loss": 0.0241, "step": 3206 }, { "epoch": 4.369209809264305, "grad_norm": 1.6205912848290425, "learning_rate": 1.2503906850530916e-05, "loss": 0.0042, "step": 3207 }, { "epoch": 4.370572207084469, "grad_norm": 2.2412795248394923, "learning_rate": 1.2499634213495677e-05, "loss": 0.0115, "step": 3208 }, { "epoch": 4.371934604904633, "grad_norm": 4.2933955903321035, "learning_rate": 1.2495361089674284e-05, "loss": 0.0162, "step": 3209 }, { "epoch": 4.3732970027247955, "grad_norm": 1.4356868360778672, "learning_rate": 1.249108747989891e-05, "loss": 0.0324, "step": 3210 }, { "epoch": 4.374659400544959, "grad_norm": 2.3431360024571593, "learning_rate": 1.2486813385001798e-05, "loss": 0.0268, "step": 3211 }, { "epoch": 4.376021798365123, "grad_norm": 4.44492825640697, "learning_rate": 1.2482538805815301e-05, "loss": 0.0302, "step": 3212 }, { "epoch": 4.377384196185286, "grad_norm": 0.733590793683502, "learning_rate": 1.247826374317187e-05, "loss": 0.007, "step": 3213 }, { "epoch": 4.37874659400545, "grad_norm": 3.1613904362683436, "learning_rate": 1.2473988197904034e-05, "loss": 0.0102, "step": 3214 }, { "epoch": 4.3801089918256135, "grad_norm": 2.5170661964510046, "learning_rate": 1.2469712170844431e-05, "loss": 0.0201, "step": 3215 }, { "epoch": 4.381471389645776, "grad_norm": 1.6621890878782812, "learning_rate": 1.2465435662825782e-05, "loss": 0.0128, "step": 3216 }, { "epoch": 4.38283378746594, "grad_norm": 2.8789769548524444, "learning_rate": 1.2461158674680913e-05, "loss": 0.0151, "step": 3217 }, { "epoch": 4.384196185286104, "grad_norm": 2.780205026991278, "learning_rate": 1.2456881207242732e-05, "loss": 0.0041, "step": 3218 }, { "epoch": 4.385558583106267, "grad_norm": 1.6580216307836435, "learning_rate": 1.2452603261344254e-05, "loss": 0.0208, "step": 3219 }, { "epoch": 4.386920980926431, "grad_norm": 3.5292134331290743, "learning_rate": 1.244832483781857e-05, "loss": 0.0113, "step": 3220 }, { "epoch": 4.3882833787465945, "grad_norm": 2.5750294033960612, "learning_rate": 1.2444045937498874e-05, "loss": 0.0191, "step": 3221 }, { "epoch": 4.389645776566757, "grad_norm": 1.960061924550652, "learning_rate": 1.2439766561218453e-05, "loss": 0.0289, "step": 3222 }, { "epoch": 4.391008174386921, "grad_norm": 2.707296357963857, "learning_rate": 1.2435486709810687e-05, "loss": 0.0139, "step": 3223 }, { "epoch": 4.392370572207085, "grad_norm": 2.4179032582375934, "learning_rate": 1.2431206384109045e-05, "loss": 0.0062, "step": 3224 }, { "epoch": 4.393732970027248, "grad_norm": 0.8976603853798442, "learning_rate": 1.2426925584947096e-05, "loss": 0.0116, "step": 3225 }, { "epoch": 4.395095367847412, "grad_norm": 4.10832548, "learning_rate": 1.2422644313158483e-05, "loss": 0.0133, "step": 3226 }, { "epoch": 4.396457765667575, "grad_norm": 3.4786638008369253, "learning_rate": 1.2418362569576965e-05, "loss": 0.0285, "step": 3227 }, { "epoch": 4.397820163487738, "grad_norm": 2.56837032567125, "learning_rate": 1.2414080355036378e-05, "loss": 0.0231, "step": 3228 }, { "epoch": 4.399182561307902, "grad_norm": 3.4528811444533107, "learning_rate": 1.240979767037065e-05, "loss": 0.0142, "step": 3229 }, { "epoch": 4.400544959128065, "grad_norm": 1.2283792330622396, "learning_rate": 1.2405514516413806e-05, "loss": 0.0218, "step": 3230 }, { "epoch": 4.401907356948229, "grad_norm": 2.499846525934187, "learning_rate": 1.2401230893999965e-05, "loss": 0.0103, "step": 3231 }, { "epoch": 4.4032697547683926, "grad_norm": 2.373556696183748, "learning_rate": 1.2396946803963322e-05, "loss": 0.0468, "step": 3232 }, { "epoch": 4.4046321525885554, "grad_norm": 1.958413798963565, "learning_rate": 1.2392662247138181e-05, "loss": 0.0264, "step": 3233 }, { "epoch": 4.405994550408719, "grad_norm": 0.6069678273668027, "learning_rate": 1.2388377224358928e-05, "loss": 0.0086, "step": 3234 }, { "epoch": 4.407356948228883, "grad_norm": 2.086021972145717, "learning_rate": 1.2384091736460037e-05, "loss": 0.0118, "step": 3235 }, { "epoch": 4.408719346049046, "grad_norm": 2.8542466759356517, "learning_rate": 1.2379805784276081e-05, "loss": 0.0118, "step": 3236 }, { "epoch": 4.41008174386921, "grad_norm": 1.929731312968753, "learning_rate": 1.237551936864172e-05, "loss": 0.0341, "step": 3237 }, { "epoch": 4.4114441416893735, "grad_norm": 0.8578785904721179, "learning_rate": 1.2371232490391699e-05, "loss": 0.0115, "step": 3238 }, { "epoch": 4.412806539509536, "grad_norm": 0.8359507660872341, "learning_rate": 1.236694515036086e-05, "loss": 0.0111, "step": 3239 }, { "epoch": 4.4141689373297, "grad_norm": 0.3241574378686933, "learning_rate": 1.236265734938413e-05, "loss": 0.0177, "step": 3240 }, { "epoch": 4.415531335149864, "grad_norm": 1.8501262884388994, "learning_rate": 1.2358369088296529e-05, "loss": 0.0221, "step": 3241 }, { "epoch": 4.416893732970027, "grad_norm": 2.5808905987469166, "learning_rate": 1.2354080367933168e-05, "loss": 0.0303, "step": 3242 }, { "epoch": 4.418256130790191, "grad_norm": 1.1646267946427176, "learning_rate": 1.2349791189129244e-05, "loss": 0.0103, "step": 3243 }, { "epoch": 4.419618528610354, "grad_norm": 2.701169726049489, "learning_rate": 1.2345501552720044e-05, "loss": 0.0304, "step": 3244 }, { "epoch": 4.420980926430517, "grad_norm": 2.228872061186175, "learning_rate": 1.234121145954094e-05, "loss": 0.0101, "step": 3245 }, { "epoch": 4.422343324250681, "grad_norm": 3.573254282825778, "learning_rate": 1.2336920910427405e-05, "loss": 0.0225, "step": 3246 }, { "epoch": 4.423705722070845, "grad_norm": 1.8758345364800493, "learning_rate": 1.2332629906214986e-05, "loss": 0.0277, "step": 3247 }, { "epoch": 4.425068119891008, "grad_norm": 2.6379075396721916, "learning_rate": 1.2328338447739331e-05, "loss": 0.0145, "step": 3248 }, { "epoch": 4.426430517711172, "grad_norm": 0.6717400023365141, "learning_rate": 1.2324046535836171e-05, "loss": 0.0138, "step": 3249 }, { "epoch": 4.427792915531335, "grad_norm": 3.0951791549498804, "learning_rate": 1.2319754171341326e-05, "loss": 0.0234, "step": 3250 }, { "epoch": 4.429155313351498, "grad_norm": 1.3394219002041912, "learning_rate": 1.2315461355090698e-05, "loss": 0.0098, "step": 3251 }, { "epoch": 4.430517711171662, "grad_norm": 0.8186339341610755, "learning_rate": 1.231116808792029e-05, "loss": 0.0091, "step": 3252 }, { "epoch": 4.431880108991826, "grad_norm": 3.30735504090944, "learning_rate": 1.230687437066618e-05, "loss": 0.0397, "step": 3253 }, { "epoch": 4.433242506811989, "grad_norm": 1.5144392106372802, "learning_rate": 1.2302580204164542e-05, "loss": 0.0137, "step": 3254 }, { "epoch": 4.4346049046321525, "grad_norm": 2.08610952660897, "learning_rate": 1.2298285589251635e-05, "loss": 0.0115, "step": 3255 }, { "epoch": 4.435967302452316, "grad_norm": 2.4052254062306284, "learning_rate": 1.2293990526763805e-05, "loss": 0.0336, "step": 3256 }, { "epoch": 4.437329700272479, "grad_norm": 1.3390399052888617, "learning_rate": 1.2289695017537483e-05, "loss": 0.0225, "step": 3257 }, { "epoch": 4.438692098092643, "grad_norm": 2.8377636715006775, "learning_rate": 1.2285399062409191e-05, "loss": 0.0181, "step": 3258 }, { "epoch": 4.440054495912807, "grad_norm": 2.021267174162182, "learning_rate": 1.2281102662215534e-05, "loss": 0.0243, "step": 3259 }, { "epoch": 4.44141689373297, "grad_norm": 2.2804341358121545, "learning_rate": 1.2276805817793208e-05, "loss": 0.0113, "step": 3260 }, { "epoch": 4.4427792915531334, "grad_norm": 2.6763654671222623, "learning_rate": 1.2272508529978994e-05, "loss": 0.0551, "step": 3261 }, { "epoch": 4.444141689373297, "grad_norm": 2.861795896721063, "learning_rate": 1.2268210799609756e-05, "loss": 0.0169, "step": 3262 }, { "epoch": 4.44550408719346, "grad_norm": 1.977881561668869, "learning_rate": 1.2263912627522449e-05, "loss": 0.0225, "step": 3263 }, { "epoch": 4.446866485013624, "grad_norm": 3.883862026725706, "learning_rate": 1.2259614014554107e-05, "loss": 0.0098, "step": 3264 }, { "epoch": 4.448228882833788, "grad_norm": 1.9294210056773384, "learning_rate": 1.225531496154186e-05, "loss": 0.0298, "step": 3265 }, { "epoch": 4.449591280653951, "grad_norm": 1.459411182776784, "learning_rate": 1.2251015469322915e-05, "loss": 0.0146, "step": 3266 }, { "epoch": 4.450953678474114, "grad_norm": 2.686790468542693, "learning_rate": 1.2246715538734568e-05, "loss": 0.0125, "step": 3267 }, { "epoch": 4.452316076294278, "grad_norm": 1.454759871541517, "learning_rate": 1.2242415170614204e-05, "loss": 0.0196, "step": 3268 }, { "epoch": 4.453678474114441, "grad_norm": 1.2067761011720377, "learning_rate": 1.2238114365799285e-05, "loss": 0.0041, "step": 3269 }, { "epoch": 4.455040871934605, "grad_norm": 2.178068272292019, "learning_rate": 1.2233813125127363e-05, "loss": 0.015, "step": 3270 }, { "epoch": 4.456403269754769, "grad_norm": 2.1194040686541635, "learning_rate": 1.2229511449436075e-05, "loss": 0.0206, "step": 3271 }, { "epoch": 4.4577656675749315, "grad_norm": 2.5975590529716843, "learning_rate": 1.2225209339563144e-05, "loss": 0.0106, "step": 3272 }, { "epoch": 4.459128065395095, "grad_norm": 1.7491872277897358, "learning_rate": 1.2220906796346375e-05, "loss": 0.0296, "step": 3273 }, { "epoch": 4.460490463215259, "grad_norm": 2.625182298992535, "learning_rate": 1.2216603820623657e-05, "loss": 0.0249, "step": 3274 }, { "epoch": 4.461852861035422, "grad_norm": 3.0449494878091126, "learning_rate": 1.2212300413232963e-05, "loss": 0.0034, "step": 3275 }, { "epoch": 4.463215258855586, "grad_norm": 2.4248476882774566, "learning_rate": 1.2207996575012348e-05, "loss": 0.0121, "step": 3276 }, { "epoch": 4.46457765667575, "grad_norm": 0.7718844888772345, "learning_rate": 1.2203692306799963e-05, "loss": 0.0092, "step": 3277 }, { "epoch": 4.4659400544959125, "grad_norm": 3.0514520031916197, "learning_rate": 1.219938760943403e-05, "loss": 0.0166, "step": 3278 }, { "epoch": 4.467302452316076, "grad_norm": 2.5649758519235184, "learning_rate": 1.2195082483752856e-05, "loss": 0.0346, "step": 3279 }, { "epoch": 4.46866485013624, "grad_norm": 3.094251419957597, "learning_rate": 1.2190776930594836e-05, "loss": 0.0272, "step": 3280 }, { "epoch": 4.470027247956403, "grad_norm": 4.134653843684095, "learning_rate": 1.2186470950798446e-05, "loss": 0.0068, "step": 3281 }, { "epoch": 4.471389645776567, "grad_norm": 2.373090369304603, "learning_rate": 1.2182164545202245e-05, "loss": 0.053, "step": 3282 }, { "epoch": 4.4727520435967305, "grad_norm": 3.918713002005885, "learning_rate": 1.2177857714644875e-05, "loss": 0.0215, "step": 3283 }, { "epoch": 4.474114441416893, "grad_norm": 3.9999397640879866, "learning_rate": 1.2173550459965062e-05, "loss": 0.0213, "step": 3284 }, { "epoch": 4.475476839237057, "grad_norm": 1.2178283079513113, "learning_rate": 1.2169242782001613e-05, "loss": 0.0352, "step": 3285 }, { "epoch": 4.476839237057221, "grad_norm": 3.5653140196244206, "learning_rate": 1.2164934681593419e-05, "loss": 0.015, "step": 3286 }, { "epoch": 4.478201634877384, "grad_norm": 3.709251303182723, "learning_rate": 1.2160626159579447e-05, "loss": 0.0248, "step": 3287 }, { "epoch": 4.479564032697548, "grad_norm": 1.8187795597681664, "learning_rate": 1.2156317216798756e-05, "loss": 0.0266, "step": 3288 }, { "epoch": 4.4809264305177114, "grad_norm": 2.318032318761709, "learning_rate": 1.2152007854090484e-05, "loss": 0.0203, "step": 3289 }, { "epoch": 4.482288828337874, "grad_norm": 2.791877205962536, "learning_rate": 1.2147698072293844e-05, "loss": 0.0135, "step": 3290 }, { "epoch": 4.483651226158038, "grad_norm": 3.7347828905316574, "learning_rate": 1.214338787224814e-05, "loss": 0.0442, "step": 3291 }, { "epoch": 4.485013623978202, "grad_norm": 1.4642819468478552, "learning_rate": 1.2139077254792751e-05, "loss": 0.0179, "step": 3292 }, { "epoch": 4.486376021798365, "grad_norm": 3.2241750936084768, "learning_rate": 1.2134766220767135e-05, "loss": 0.0258, "step": 3293 }, { "epoch": 4.487738419618529, "grad_norm": 1.3809995087990237, "learning_rate": 1.2130454771010845e-05, "loss": 0.037, "step": 3294 }, { "epoch": 4.489100817438692, "grad_norm": 1.6771203261422722, "learning_rate": 1.2126142906363498e-05, "loss": 0.0081, "step": 3295 }, { "epoch": 4.490463215258855, "grad_norm": 3.73096897668665, "learning_rate": 1.21218306276648e-05, "loss": 0.0193, "step": 3296 }, { "epoch": 4.491825613079019, "grad_norm": 1.5128558083854422, "learning_rate": 1.2117517935754543e-05, "loss": 0.008, "step": 3297 }, { "epoch": 4.493188010899183, "grad_norm": 2.460974460501882, "learning_rate": 1.2113204831472587e-05, "loss": 0.0095, "step": 3298 }, { "epoch": 4.494550408719346, "grad_norm": 2.50888156145235, "learning_rate": 1.210889131565888e-05, "loss": 0.0071, "step": 3299 }, { "epoch": 4.4959128065395095, "grad_norm": 1.5252493506505942, "learning_rate": 1.210457738915345e-05, "loss": 0.007, "step": 3300 }, { "epoch": 4.497275204359673, "grad_norm": 5.053006120697657, "learning_rate": 1.2100263052796404e-05, "loss": 0.0252, "step": 3301 }, { "epoch": 4.498637602179836, "grad_norm": 2.2049611380796286, "learning_rate": 1.2095948307427925e-05, "loss": 0.0335, "step": 3302 }, { "epoch": 4.5, "grad_norm": 4.380274980617037, "learning_rate": 1.2091633153888287e-05, "loss": 0.0354, "step": 3303 }, { "epoch": 4.501362397820164, "grad_norm": 3.4720036783476775, "learning_rate": 1.2087317593017826e-05, "loss": 0.0069, "step": 3304 }, { "epoch": 4.502724795640327, "grad_norm": 1.2818077981036076, "learning_rate": 1.2083001625656974e-05, "loss": 0.0163, "step": 3305 }, { "epoch": 4.5040871934604905, "grad_norm": 2.7711337741367106, "learning_rate": 1.2078685252646232e-05, "loss": 0.01, "step": 3306 }, { "epoch": 4.505449591280654, "grad_norm": 3.6759355569447316, "learning_rate": 1.2074368474826184e-05, "loss": 0.0157, "step": 3307 }, { "epoch": 4.506811989100817, "grad_norm": 2.6312205272464815, "learning_rate": 1.2070051293037493e-05, "loss": 0.0472, "step": 3308 }, { "epoch": 4.508174386920981, "grad_norm": 4.1943208704301425, "learning_rate": 1.2065733708120901e-05, "loss": 0.0196, "step": 3309 }, { "epoch": 4.509536784741145, "grad_norm": 4.086368231048667, "learning_rate": 1.2061415720917223e-05, "loss": 0.0075, "step": 3310 }, { "epoch": 4.510899182561308, "grad_norm": 1.336377962507577, "learning_rate": 1.205709733226736e-05, "loss": 0.0086, "step": 3311 }, { "epoch": 4.512261580381471, "grad_norm": 3.7633605554139122, "learning_rate": 1.2052778543012286e-05, "loss": 0.007, "step": 3312 }, { "epoch": 4.513623978201635, "grad_norm": 2.5187488405154186, "learning_rate": 1.2048459353993056e-05, "loss": 0.0096, "step": 3313 }, { "epoch": 4.514986376021798, "grad_norm": 1.5967869391375187, "learning_rate": 1.2044139766050803e-05, "loss": 0.0179, "step": 3314 }, { "epoch": 4.516348773841962, "grad_norm": 2.8385215655053844, "learning_rate": 1.2039819780026734e-05, "loss": 0.0286, "step": 3315 }, { "epoch": 4.517711171662126, "grad_norm": 3.5341987653293083, "learning_rate": 1.2035499396762134e-05, "loss": 0.0199, "step": 3316 }, { "epoch": 4.5190735694822886, "grad_norm": 1.6064546668197583, "learning_rate": 1.2031178617098372e-05, "loss": 0.022, "step": 3317 }, { "epoch": 4.520435967302452, "grad_norm": 3.229263199977515, "learning_rate": 1.2026857441876884e-05, "loss": 0.0118, "step": 3318 }, { "epoch": 4.521798365122616, "grad_norm": 3.097814464424498, "learning_rate": 1.2022535871939192e-05, "loss": 0.0148, "step": 3319 }, { "epoch": 4.523160762942779, "grad_norm": 1.3281141067286717, "learning_rate": 1.201821390812689e-05, "loss": 0.0209, "step": 3320 }, { "epoch": 4.524523160762943, "grad_norm": 2.4351790444020383, "learning_rate": 1.2013891551281655e-05, "loss": 0.0363, "step": 3321 }, { "epoch": 4.525885558583107, "grad_norm": 2.0639147130328057, "learning_rate": 1.2009568802245227e-05, "loss": 0.0152, "step": 3322 }, { "epoch": 4.5272479564032695, "grad_norm": 0.9642271840281846, "learning_rate": 1.2005245661859435e-05, "loss": 0.0137, "step": 3323 }, { "epoch": 4.528610354223433, "grad_norm": 2.276455923874458, "learning_rate": 1.2000922130966182e-05, "loss": 0.0062, "step": 3324 }, { "epoch": 4.529972752043597, "grad_norm": 1.8288153549182182, "learning_rate": 1.199659821040744e-05, "loss": 0.0128, "step": 3325 }, { "epoch": 4.53133514986376, "grad_norm": 1.312353401729939, "learning_rate": 1.1992273901025268e-05, "loss": 0.0241, "step": 3326 }, { "epoch": 4.532697547683924, "grad_norm": 2.2489677973987554, "learning_rate": 1.1987949203661794e-05, "loss": 0.0174, "step": 3327 }, { "epoch": 4.5340599455040875, "grad_norm": 2.7262366526747774, "learning_rate": 1.1983624119159218e-05, "loss": 0.0191, "step": 3328 }, { "epoch": 4.53542234332425, "grad_norm": 2.028829251094913, "learning_rate": 1.1979298648359823e-05, "loss": 0.035, "step": 3329 }, { "epoch": 4.536784741144414, "grad_norm": 2.304073544692891, "learning_rate": 1.1974972792105964e-05, "loss": 0.0518, "step": 3330 }, { "epoch": 4.538147138964578, "grad_norm": 3.253681624161772, "learning_rate": 1.1970646551240066e-05, "loss": 0.0228, "step": 3331 }, { "epoch": 4.539509536784741, "grad_norm": 1.5592356539430228, "learning_rate": 1.1966319926604642e-05, "loss": 0.0212, "step": 3332 }, { "epoch": 4.540871934604905, "grad_norm": 2.380076408593, "learning_rate": 1.196199291904227e-05, "loss": 0.0213, "step": 3333 }, { "epoch": 4.5422343324250685, "grad_norm": 2.1885363967858487, "learning_rate": 1.1957665529395597e-05, "loss": 0.0266, "step": 3334 }, { "epoch": 4.543596730245231, "grad_norm": 1.2974305220837308, "learning_rate": 1.195333775850736e-05, "loss": 0.0121, "step": 3335 }, { "epoch": 4.544959128065395, "grad_norm": 2.723538150810753, "learning_rate": 1.1949009607220356e-05, "loss": 0.0202, "step": 3336 }, { "epoch": 4.546321525885559, "grad_norm": 2.8226591834029575, "learning_rate": 1.1944681076377466e-05, "loss": 0.0093, "step": 3337 }, { "epoch": 4.547683923705722, "grad_norm": 1.3671673238678805, "learning_rate": 1.194035216682164e-05, "loss": 0.0118, "step": 3338 }, { "epoch": 4.549046321525886, "grad_norm": 2.401159659480191, "learning_rate": 1.1936022879395902e-05, "loss": 0.013, "step": 3339 }, { "epoch": 4.550408719346049, "grad_norm": 1.2971968978873039, "learning_rate": 1.1931693214943349e-05, "loss": 0.0091, "step": 3340 }, { "epoch": 4.551771117166212, "grad_norm": 1.766833218969461, "learning_rate": 1.1927363174307155e-05, "loss": 0.0227, "step": 3341 }, { "epoch": 4.553133514986376, "grad_norm": 2.484985358864496, "learning_rate": 1.1923032758330564e-05, "loss": 0.022, "step": 3342 }, { "epoch": 4.55449591280654, "grad_norm": 0.2348856991071846, "learning_rate": 1.1918701967856892e-05, "loss": 0.009, "step": 3343 }, { "epoch": 4.555858310626703, "grad_norm": 1.683518540929336, "learning_rate": 1.1914370803729533e-05, "loss": 0.0285, "step": 3344 }, { "epoch": 4.5572207084468666, "grad_norm": 1.6327366301643593, "learning_rate": 1.191003926679195e-05, "loss": 0.0179, "step": 3345 }, { "epoch": 4.55858310626703, "grad_norm": 1.3589117192297457, "learning_rate": 1.190570735788768e-05, "loss": 0.0361, "step": 3346 }, { "epoch": 4.559945504087193, "grad_norm": 2.2210716948327978, "learning_rate": 1.1901375077860328e-05, "loss": 0.0238, "step": 3347 }, { "epoch": 4.561307901907357, "grad_norm": 0.8799537045201822, "learning_rate": 1.1897042427553578e-05, "loss": 0.0302, "step": 3348 }, { "epoch": 4.562670299727521, "grad_norm": 1.6168954744315642, "learning_rate": 1.1892709407811182e-05, "loss": 0.0215, "step": 3349 }, { "epoch": 4.564032697547684, "grad_norm": 0.9612999290543424, "learning_rate": 1.1888376019476966e-05, "loss": 0.0309, "step": 3350 }, { "epoch": 4.5653950953678475, "grad_norm": 0.9897389384347296, "learning_rate": 1.1884042263394829e-05, "loss": 0.0143, "step": 3351 }, { "epoch": 4.566757493188011, "grad_norm": 1.7665583007763732, "learning_rate": 1.1879708140408734e-05, "loss": 0.0132, "step": 3352 }, { "epoch": 4.568119891008174, "grad_norm": 0.9480545109632322, "learning_rate": 1.1875373651362727e-05, "loss": 0.0167, "step": 3353 }, { "epoch": 4.569482288828338, "grad_norm": 1.7978683915461529, "learning_rate": 1.1871038797100912e-05, "loss": 0.0264, "step": 3354 }, { "epoch": 4.570844686648502, "grad_norm": 2.5223100110073333, "learning_rate": 1.1866703578467478e-05, "loss": 0.0294, "step": 3355 }, { "epoch": 4.572207084468665, "grad_norm": 1.903639905650468, "learning_rate": 1.1862367996306675e-05, "loss": 0.0272, "step": 3356 }, { "epoch": 4.573569482288828, "grad_norm": 3.0834368226988893, "learning_rate": 1.1858032051462827e-05, "loss": 0.0348, "step": 3357 }, { "epoch": 4.574931880108992, "grad_norm": 2.518281165341392, "learning_rate": 1.1853695744780332e-05, "loss": 0.0266, "step": 3358 }, { "epoch": 4.576294277929155, "grad_norm": 0.773565040187828, "learning_rate": 1.184935907710365e-05, "loss": 0.0166, "step": 3359 }, { "epoch": 4.577656675749319, "grad_norm": 2.6259808184588573, "learning_rate": 1.1845022049277317e-05, "loss": 0.0233, "step": 3360 }, { "epoch": 4.579019073569482, "grad_norm": 2.3253766604558344, "learning_rate": 1.1840684662145944e-05, "loss": 0.0306, "step": 3361 }, { "epoch": 4.580381471389646, "grad_norm": 2.776086114143856, "learning_rate": 1.1836346916554204e-05, "loss": 0.021, "step": 3362 }, { "epoch": 4.581743869209809, "grad_norm": 1.558024494865056, "learning_rate": 1.183200881334684e-05, "loss": 0.0116, "step": 3363 }, { "epoch": 4.583106267029972, "grad_norm": 1.0899698841985728, "learning_rate": 1.182767035336867e-05, "loss": 0.0558, "step": 3364 }, { "epoch": 4.584468664850136, "grad_norm": 1.7297855303172769, "learning_rate": 1.1823331537464576e-05, "loss": 0.0239, "step": 3365 }, { "epoch": 4.5858310626703, "grad_norm": 2.0801786264295306, "learning_rate": 1.1818992366479514e-05, "loss": 0.015, "step": 3366 }, { "epoch": 4.587193460490463, "grad_norm": 1.1690169193936681, "learning_rate": 1.1814652841258504e-05, "loss": 0.0053, "step": 3367 }, { "epoch": 4.5885558583106265, "grad_norm": 1.217417935937674, "learning_rate": 1.1810312962646643e-05, "loss": 0.0072, "step": 3368 }, { "epoch": 4.58991825613079, "grad_norm": 1.881312615678644, "learning_rate": 1.1805972731489088e-05, "loss": 0.0224, "step": 3369 }, { "epoch": 4.591280653950953, "grad_norm": 1.1901118321359796, "learning_rate": 1.1801632148631068e-05, "loss": 0.0458, "step": 3370 }, { "epoch": 4.592643051771117, "grad_norm": 1.9544318793872142, "learning_rate": 1.1797291214917882e-05, "loss": 0.0222, "step": 3371 }, { "epoch": 4.594005449591281, "grad_norm": 1.8789858080866757, "learning_rate": 1.1792949931194897e-05, "loss": 0.0306, "step": 3372 }, { "epoch": 4.595367847411444, "grad_norm": 1.7527824796162585, "learning_rate": 1.1788608298307546e-05, "loss": 0.0057, "step": 3373 }, { "epoch": 4.5967302452316074, "grad_norm": 1.5746770781487291, "learning_rate": 1.1784266317101334e-05, "loss": 0.0425, "step": 3374 }, { "epoch": 4.598092643051771, "grad_norm": 0.8993890712216198, "learning_rate": 1.1779923988421828e-05, "loss": 0.0038, "step": 3375 }, { "epoch": 4.599455040871934, "grad_norm": 1.3699486668560332, "learning_rate": 1.1775581313114668e-05, "loss": 0.0024, "step": 3376 }, { "epoch": 4.600817438692098, "grad_norm": 1.4170646341502606, "learning_rate": 1.1771238292025558e-05, "loss": 0.0113, "step": 3377 }, { "epoch": 4.602179836512262, "grad_norm": 1.994871374039603, "learning_rate": 1.1766894926000271e-05, "loss": 0.0209, "step": 3378 }, { "epoch": 4.603542234332425, "grad_norm": 1.1077205639227221, "learning_rate": 1.176255121588465e-05, "loss": 0.0372, "step": 3379 }, { "epoch": 4.604904632152588, "grad_norm": 2.8833036524355986, "learning_rate": 1.1758207162524597e-05, "loss": 0.0243, "step": 3380 }, { "epoch": 4.606267029972752, "grad_norm": 2.045284183102769, "learning_rate": 1.1753862766766091e-05, "loss": 0.0126, "step": 3381 }, { "epoch": 4.607629427792915, "grad_norm": 4.264224520531283, "learning_rate": 1.1749518029455167e-05, "loss": 0.0481, "step": 3382 }, { "epoch": 4.608991825613079, "grad_norm": 1.2910593097574605, "learning_rate": 1.1745172951437933e-05, "loss": 0.0096, "step": 3383 }, { "epoch": 4.610354223433243, "grad_norm": 2.6974552762673016, "learning_rate": 1.1740827533560568e-05, "loss": 0.0135, "step": 3384 }, { "epoch": 4.6117166212534055, "grad_norm": 2.052260321998687, "learning_rate": 1.1736481776669307e-05, "loss": 0.0104, "step": 3385 }, { "epoch": 4.613079019073569, "grad_norm": 1.125684646839902, "learning_rate": 1.1732135681610453e-05, "loss": 0.0135, "step": 3386 }, { "epoch": 4.614441416893733, "grad_norm": 2.2000954035237674, "learning_rate": 1.1727789249230383e-05, "loss": 0.0122, "step": 3387 }, { "epoch": 4.615803814713896, "grad_norm": 1.7901096590727965, "learning_rate": 1.172344248037553e-05, "loss": 0.0043, "step": 3388 }, { "epoch": 4.61716621253406, "grad_norm": 1.8901413046813385, "learning_rate": 1.1719095375892398e-05, "loss": 0.0149, "step": 3389 }, { "epoch": 4.618528610354224, "grad_norm": 1.5690484233946893, "learning_rate": 1.1714747936627556e-05, "loss": 0.0386, "step": 3390 }, { "epoch": 4.6198910081743865, "grad_norm": 2.211300281295644, "learning_rate": 1.1710400163427633e-05, "loss": 0.0174, "step": 3391 }, { "epoch": 4.62125340599455, "grad_norm": 2.2651217964723744, "learning_rate": 1.1706052057139335e-05, "loss": 0.0115, "step": 3392 }, { "epoch": 4.622615803814714, "grad_norm": 2.286507036183985, "learning_rate": 1.1701703618609419e-05, "loss": 0.0278, "step": 3393 }, { "epoch": 4.623978201634877, "grad_norm": 2.3182371802159434, "learning_rate": 1.1697354848684712e-05, "loss": 0.0333, "step": 3394 }, { "epoch": 4.625340599455041, "grad_norm": 1.350201864841655, "learning_rate": 1.169300574821211e-05, "loss": 0.0149, "step": 3395 }, { "epoch": 4.6267029972752045, "grad_norm": 0.7620339313424793, "learning_rate": 1.168865631803857e-05, "loss": 0.0088, "step": 3396 }, { "epoch": 4.628065395095367, "grad_norm": 2.0025605845504084, "learning_rate": 1.1684306559011107e-05, "loss": 0.0157, "step": 3397 }, { "epoch": 4.629427792915531, "grad_norm": 0.9055123944840502, "learning_rate": 1.1679956471976814e-05, "loss": 0.0112, "step": 3398 }, { "epoch": 4.630790190735695, "grad_norm": 1.3509383661331589, "learning_rate": 1.1675606057782836e-05, "loss": 0.008, "step": 3399 }, { "epoch": 4.632152588555858, "grad_norm": 3.198633860945474, "learning_rate": 1.167125531727638e-05, "loss": 0.0465, "step": 3400 }, { "epoch": 4.633514986376022, "grad_norm": 0.6365174861486492, "learning_rate": 1.1666904251304732e-05, "loss": 0.017, "step": 3401 }, { "epoch": 4.6348773841961854, "grad_norm": 2.1341870988258163, "learning_rate": 1.1662552860715226e-05, "loss": 0.0432, "step": 3402 }, { "epoch": 4.636239782016348, "grad_norm": 1.8100328051276897, "learning_rate": 1.1658201146355263e-05, "loss": 0.0205, "step": 3403 }, { "epoch": 4.637602179836512, "grad_norm": 0.8922260581634535, "learning_rate": 1.1653849109072315e-05, "loss": 0.0122, "step": 3404 }, { "epoch": 4.638964577656676, "grad_norm": 1.959555129087422, "learning_rate": 1.1649496749713906e-05, "loss": 0.0187, "step": 3405 }, { "epoch": 4.640326975476839, "grad_norm": 2.8050220917162902, "learning_rate": 1.1645144069127624e-05, "loss": 0.0275, "step": 3406 }, { "epoch": 4.641689373297003, "grad_norm": 3.270557943699527, "learning_rate": 1.164079106816113e-05, "loss": 0.0245, "step": 3407 }, { "epoch": 4.643051771117166, "grad_norm": 0.8312822667552212, "learning_rate": 1.1636437747662139e-05, "loss": 0.0167, "step": 3408 }, { "epoch": 4.644414168937329, "grad_norm": 0.6438336489865809, "learning_rate": 1.1632084108478423e-05, "loss": 0.023, "step": 3409 }, { "epoch": 4.645776566757493, "grad_norm": 2.6987690557367947, "learning_rate": 1.162773015145783e-05, "loss": 0.0249, "step": 3410 }, { "epoch": 4.647138964577657, "grad_norm": 1.4319710283987699, "learning_rate": 1.1623375877448262e-05, "loss": 0.016, "step": 3411 }, { "epoch": 4.64850136239782, "grad_norm": 1.677273482252266, "learning_rate": 1.1619021287297676e-05, "loss": 0.0498, "step": 3412 }, { "epoch": 4.6498637602179835, "grad_norm": 2.000534753416805, "learning_rate": 1.1614666381854107e-05, "loss": 0.0507, "step": 3413 }, { "epoch": 4.651226158038147, "grad_norm": 1.41355181703132, "learning_rate": 1.1610311161965635e-05, "loss": 0.013, "step": 3414 }, { "epoch": 4.65258855585831, "grad_norm": 1.2424600302391142, "learning_rate": 1.1605955628480409e-05, "loss": 0.0114, "step": 3415 }, { "epoch": 4.653950953678474, "grad_norm": 1.8183904031155937, "learning_rate": 1.1601599782246646e-05, "loss": 0.0396, "step": 3416 }, { "epoch": 4.655313351498638, "grad_norm": 1.889395705488788, "learning_rate": 1.1597243624112611e-05, "loss": 0.0249, "step": 3417 }, { "epoch": 4.656675749318801, "grad_norm": 1.5639538530646924, "learning_rate": 1.159288715492663e-05, "loss": 0.0201, "step": 3418 }, { "epoch": 4.6580381471389645, "grad_norm": 2.175664833455245, "learning_rate": 1.1588530375537102e-05, "loss": 0.0242, "step": 3419 }, { "epoch": 4.659400544959128, "grad_norm": 2.642164792831777, "learning_rate": 1.1584173286792475e-05, "loss": 0.0142, "step": 3420 }, { "epoch": 4.660762942779291, "grad_norm": 1.278565070405986, "learning_rate": 1.1579815889541267e-05, "loss": 0.0038, "step": 3421 }, { "epoch": 4.662125340599455, "grad_norm": 2.3826199933931256, "learning_rate": 1.1575458184632045e-05, "loss": 0.0594, "step": 3422 }, { "epoch": 4.663487738419619, "grad_norm": 2.5086154403011864, "learning_rate": 1.1571100172913443e-05, "loss": 0.0309, "step": 3423 }, { "epoch": 4.664850136239782, "grad_norm": 2.7487052896639343, "learning_rate": 1.1566741855234152e-05, "loss": 0.0382, "step": 3424 }, { "epoch": 4.666212534059945, "grad_norm": 3.350161680164919, "learning_rate": 1.1562383232442927e-05, "loss": 0.0245, "step": 3425 }, { "epoch": 4.667574931880109, "grad_norm": 2.391652704936073, "learning_rate": 1.1558024305388572e-05, "loss": 0.014, "step": 3426 }, { "epoch": 4.668937329700272, "grad_norm": 0.979530034017477, "learning_rate": 1.1553665074919966e-05, "loss": 0.0194, "step": 3427 }, { "epoch": 4.670299727520436, "grad_norm": 2.822950639150893, "learning_rate": 1.1549305541886033e-05, "loss": 0.0157, "step": 3428 }, { "epoch": 4.6716621253406, "grad_norm": 2.6522113364105637, "learning_rate": 1.1544945707135764e-05, "loss": 0.0295, "step": 3429 }, { "epoch": 4.6730245231607626, "grad_norm": 4.454675396392945, "learning_rate": 1.1540585571518203e-05, "loss": 0.0389, "step": 3430 }, { "epoch": 4.674386920980926, "grad_norm": 5.765869740241431, "learning_rate": 1.153622513588246e-05, "loss": 0.0161, "step": 3431 }, { "epoch": 4.67574931880109, "grad_norm": 3.1091427434093695, "learning_rate": 1.1531864401077692e-05, "loss": 0.0209, "step": 3432 }, { "epoch": 4.677111716621253, "grad_norm": 3.5182212517317413, "learning_rate": 1.1527503367953133e-05, "loss": 0.0156, "step": 3433 }, { "epoch": 4.678474114441417, "grad_norm": 5.071825310616716, "learning_rate": 1.152314203735805e-05, "loss": 0.0199, "step": 3434 }, { "epoch": 4.679836512261581, "grad_norm": 1.8542082135674847, "learning_rate": 1.1518780410141792e-05, "loss": 0.0217, "step": 3435 }, { "epoch": 4.6811989100817435, "grad_norm": 3.913615445291175, "learning_rate": 1.1514418487153751e-05, "loss": 0.0268, "step": 3436 }, { "epoch": 4.682561307901907, "grad_norm": 4.019324309078163, "learning_rate": 1.151005626924338e-05, "loss": 0.0215, "step": 3437 }, { "epoch": 4.683923705722071, "grad_norm": 3.096979404210251, "learning_rate": 1.1505693757260187e-05, "loss": 0.0319, "step": 3438 }, { "epoch": 4.685286103542234, "grad_norm": 2.396802575695067, "learning_rate": 1.150133095205375e-05, "loss": 0.0162, "step": 3439 }, { "epoch": 4.686648501362398, "grad_norm": 2.123988742423684, "learning_rate": 1.1496967854473688e-05, "loss": 0.0161, "step": 3440 }, { "epoch": 4.6880108991825615, "grad_norm": 2.4838698738142595, "learning_rate": 1.1492604465369686e-05, "loss": 0.0677, "step": 3441 }, { "epoch": 4.689373297002724, "grad_norm": 3.2074943040479424, "learning_rate": 1.148824078559148e-05, "loss": 0.007, "step": 3442 }, { "epoch": 4.690735694822888, "grad_norm": 3.7266995824897817, "learning_rate": 1.1483876815988867e-05, "loss": 0.0296, "step": 3443 }, { "epoch": 4.692098092643052, "grad_norm": 2.2161713640344853, "learning_rate": 1.1479512557411698e-05, "loss": 0.0104, "step": 3444 }, { "epoch": 4.693460490463215, "grad_norm": 4.1599903938362655, "learning_rate": 1.1475148010709889e-05, "loss": 0.0309, "step": 3445 }, { "epoch": 4.694822888283379, "grad_norm": 2.431998969681423, "learning_rate": 1.1470783176733396e-05, "loss": 0.0291, "step": 3446 }, { "epoch": 4.6961852861035425, "grad_norm": 2.42950407265473, "learning_rate": 1.1466418056332245e-05, "loss": 0.0327, "step": 3447 }, { "epoch": 4.697547683923705, "grad_norm": 2.0475475482477523, "learning_rate": 1.146205265035651e-05, "loss": 0.0186, "step": 3448 }, { "epoch": 4.698910081743869, "grad_norm": 2.443495019605626, "learning_rate": 1.1457686959656322e-05, "loss": 0.0323, "step": 3449 }, { "epoch": 4.700272479564033, "grad_norm": 2.57534938566505, "learning_rate": 1.1453320985081871e-05, "loss": 0.0203, "step": 3450 }, { "epoch": 4.701634877384196, "grad_norm": 3.532135029698723, "learning_rate": 1.1448954727483399e-05, "loss": 0.0359, "step": 3451 }, { "epoch": 4.70299727520436, "grad_norm": 0.8460088968498952, "learning_rate": 1.1444588187711205e-05, "loss": 0.0089, "step": 3452 }, { "epoch": 4.704359673024523, "grad_norm": 1.7171954964080212, "learning_rate": 1.144022136661564e-05, "loss": 0.0203, "step": 3453 }, { "epoch": 4.705722070844686, "grad_norm": 1.4710486535788494, "learning_rate": 1.143585426504711e-05, "loss": 0.0066, "step": 3454 }, { "epoch": 4.70708446866485, "grad_norm": 0.2539628699895332, "learning_rate": 1.1431486883856082e-05, "loss": 0.01, "step": 3455 }, { "epoch": 4.708446866485014, "grad_norm": 1.621921874565806, "learning_rate": 1.1427119223893068e-05, "loss": 0.0193, "step": 3456 }, { "epoch": 4.709809264305177, "grad_norm": 1.5819753118218685, "learning_rate": 1.1422751286008645e-05, "loss": 0.0189, "step": 3457 }, { "epoch": 4.7111716621253406, "grad_norm": 1.7433476617529198, "learning_rate": 1.141838307105343e-05, "loss": 0.0133, "step": 3458 }, { "epoch": 4.712534059945504, "grad_norm": 2.0317120361174275, "learning_rate": 1.141401457987811e-05, "loss": 0.0112, "step": 3459 }, { "epoch": 4.713896457765667, "grad_norm": 2.5687577923271436, "learning_rate": 1.1409645813333414e-05, "loss": 0.0624, "step": 3460 }, { "epoch": 4.715258855585831, "grad_norm": 3.134815669796895, "learning_rate": 1.1405276772270126e-05, "loss": 0.0178, "step": 3461 }, { "epoch": 4.716621253405995, "grad_norm": 1.502550631880158, "learning_rate": 1.140090745753909e-05, "loss": 0.0304, "step": 3462 }, { "epoch": 4.717983651226158, "grad_norm": 1.9695849477573044, "learning_rate": 1.13965378699912e-05, "loss": 0.0206, "step": 3463 }, { "epoch": 4.7193460490463215, "grad_norm": 0.6853486800469709, "learning_rate": 1.1392168010477398e-05, "loss": 0.0087, "step": 3464 }, { "epoch": 4.720708446866485, "grad_norm": 0.5429485310935555, "learning_rate": 1.1387797879848687e-05, "loss": 0.0194, "step": 3465 }, { "epoch": 4.722070844686648, "grad_norm": 1.1084008078394265, "learning_rate": 1.1383427478956119e-05, "loss": 0.0183, "step": 3466 }, { "epoch": 4.723433242506812, "grad_norm": 1.3031686925514985, "learning_rate": 1.1379056808650795e-05, "loss": 0.0136, "step": 3467 }, { "epoch": 4.724795640326976, "grad_norm": 2.11885004313473, "learning_rate": 1.1374685869783875e-05, "loss": 0.0154, "step": 3468 }, { "epoch": 4.726158038147139, "grad_norm": 1.1368254376320441, "learning_rate": 1.1370314663206569e-05, "loss": 0.0209, "step": 3469 }, { "epoch": 4.727520435967302, "grad_norm": 2.1795105733028253, "learning_rate": 1.136594318977014e-05, "loss": 0.0244, "step": 3470 }, { "epoch": 4.728882833787466, "grad_norm": 1.449799604588852, "learning_rate": 1.1361571450325899e-05, "loss": 0.0181, "step": 3471 }, { "epoch": 4.730245231607629, "grad_norm": 1.4920601359266363, "learning_rate": 1.1357199445725213e-05, "loss": 0.0327, "step": 3472 }, { "epoch": 4.731607629427793, "grad_norm": 1.0915653765811688, "learning_rate": 1.1352827176819496e-05, "loss": 0.0056, "step": 3473 }, { "epoch": 4.732970027247957, "grad_norm": 1.7139037411521842, "learning_rate": 1.1348454644460221e-05, "loss": 0.0207, "step": 3474 }, { "epoch": 4.73433242506812, "grad_norm": 1.2859093644615414, "learning_rate": 1.1344081849498902e-05, "loss": 0.014, "step": 3475 }, { "epoch": 4.735694822888283, "grad_norm": 1.7329856993657855, "learning_rate": 1.1339708792787119e-05, "loss": 0.0203, "step": 3476 }, { "epoch": 4.737057220708447, "grad_norm": 1.478962843266935, "learning_rate": 1.1335335475176488e-05, "loss": 0.0044, "step": 3477 }, { "epoch": 4.73841961852861, "grad_norm": 1.8831399993609643, "learning_rate": 1.1330961897518678e-05, "loss": 0.0227, "step": 3478 }, { "epoch": 4.739782016348774, "grad_norm": 1.7165650544503088, "learning_rate": 1.132658806066542e-05, "loss": 0.0049, "step": 3479 }, { "epoch": 4.741144414168938, "grad_norm": 0.9460440523420199, "learning_rate": 1.1322213965468484e-05, "loss": 0.0253, "step": 3480 }, { "epoch": 4.7425068119891005, "grad_norm": 1.423673016195491, "learning_rate": 1.1317839612779696e-05, "loss": 0.0098, "step": 3481 }, { "epoch": 4.743869209809264, "grad_norm": 1.8305572246221027, "learning_rate": 1.1313465003450931e-05, "loss": 0.0272, "step": 3482 }, { "epoch": 4.745231607629428, "grad_norm": 1.0114062671971165, "learning_rate": 1.1309090138334112e-05, "loss": 0.0115, "step": 3483 }, { "epoch": 4.746594005449591, "grad_norm": 1.4159387622525312, "learning_rate": 1.1304715018281209e-05, "loss": 0.0091, "step": 3484 }, { "epoch": 4.747956403269755, "grad_norm": 1.5794378864298428, "learning_rate": 1.130033964414425e-05, "loss": 0.0062, "step": 3485 }, { "epoch": 4.7493188010899186, "grad_norm": 1.477066777566012, "learning_rate": 1.1295964016775311e-05, "loss": 0.0294, "step": 3486 }, { "epoch": 4.7506811989100814, "grad_norm": 2.203409933230144, "learning_rate": 1.129158813702651e-05, "loss": 0.0396, "step": 3487 }, { "epoch": 4.752043596730245, "grad_norm": 1.8748983594599562, "learning_rate": 1.1287212005750023e-05, "loss": 0.0294, "step": 3488 }, { "epoch": 4.753405994550409, "grad_norm": 2.2185739847214925, "learning_rate": 1.1282835623798067e-05, "loss": 0.0389, "step": 3489 }, { "epoch": 4.754768392370572, "grad_norm": 3.3955487137467895, "learning_rate": 1.1278458992022911e-05, "loss": 0.0294, "step": 3490 }, { "epoch": 4.756130790190736, "grad_norm": 2.346023736730932, "learning_rate": 1.1274082111276877e-05, "loss": 0.0313, "step": 3491 }, { "epoch": 4.7574931880108995, "grad_norm": 3.5797691364597357, "learning_rate": 1.1269704982412327e-05, "loss": 0.0458, "step": 3492 }, { "epoch": 4.758855585831062, "grad_norm": 2.788894541558979, "learning_rate": 1.1265327606281678e-05, "loss": 0.0234, "step": 3493 }, { "epoch": 4.760217983651226, "grad_norm": 1.40478707478218, "learning_rate": 1.1260949983737399e-05, "loss": 0.0088, "step": 3494 }, { "epoch": 4.76158038147139, "grad_norm": 2.5106134836456557, "learning_rate": 1.1256572115631991e-05, "loss": 0.0206, "step": 3495 }, { "epoch": 4.762942779291553, "grad_norm": 4.099069156103984, "learning_rate": 1.1252194002818018e-05, "loss": 0.0331, "step": 3496 }, { "epoch": 4.764305177111717, "grad_norm": 3.157884695397396, "learning_rate": 1.1247815646148088e-05, "loss": 0.0324, "step": 3497 }, { "epoch": 4.76566757493188, "grad_norm": 1.696988021396957, "learning_rate": 1.1243437046474854e-05, "loss": 0.0236, "step": 3498 }, { "epoch": 4.767029972752043, "grad_norm": 1.649664960765033, "learning_rate": 1.1239058204651014e-05, "loss": 0.0347, "step": 3499 }, { "epoch": 4.768392370572207, "grad_norm": 1.1071101593546813, "learning_rate": 1.1234679121529325e-05, "loss": 0.0252, "step": 3500 }, { "epoch": 4.769754768392371, "grad_norm": 1.6980548711569035, "learning_rate": 1.1230299797962574e-05, "loss": 0.008, "step": 3501 }, { "epoch": 4.771117166212534, "grad_norm": 1.005618781985145, "learning_rate": 1.1225920234803605e-05, "loss": 0.0261, "step": 3502 }, { "epoch": 4.772479564032698, "grad_norm": 0.9940366398078495, "learning_rate": 1.1221540432905309e-05, "loss": 0.0138, "step": 3503 }, { "epoch": 4.773841961852861, "grad_norm": 1.6313853558233484, "learning_rate": 1.121716039312062e-05, "loss": 0.0105, "step": 3504 }, { "epoch": 4.775204359673024, "grad_norm": 1.4632497850132518, "learning_rate": 1.1212780116302524e-05, "loss": 0.0353, "step": 3505 }, { "epoch": 4.776566757493188, "grad_norm": 1.5230408744280257, "learning_rate": 1.1208399603304048e-05, "loss": 0.0313, "step": 3506 }, { "epoch": 4.777929155313352, "grad_norm": 1.2165489191707977, "learning_rate": 1.1204018854978261e-05, "loss": 0.0084, "step": 3507 }, { "epoch": 4.779291553133515, "grad_norm": 3.071254526517291, "learning_rate": 1.1199637872178286e-05, "loss": 0.0197, "step": 3508 }, { "epoch": 4.7806539509536785, "grad_norm": 1.945953994677183, "learning_rate": 1.119525665575729e-05, "loss": 0.0074, "step": 3509 }, { "epoch": 4.782016348773842, "grad_norm": 2.5910816166784274, "learning_rate": 1.119087520656848e-05, "loss": 0.0175, "step": 3510 }, { "epoch": 4.783378746594005, "grad_norm": 2.0653372866332065, "learning_rate": 1.1186493525465116e-05, "loss": 0.005, "step": 3511 }, { "epoch": 4.784741144414169, "grad_norm": 1.0274755403200897, "learning_rate": 1.1182111613300502e-05, "loss": 0.0181, "step": 3512 }, { "epoch": 4.786103542234333, "grad_norm": 1.50608782370553, "learning_rate": 1.1177729470927976e-05, "loss": 0.0133, "step": 3513 }, { "epoch": 4.787465940054496, "grad_norm": 1.833575944009871, "learning_rate": 1.1173347099200937e-05, "loss": 0.0105, "step": 3514 }, { "epoch": 4.7888283378746594, "grad_norm": 1.6865666422892265, "learning_rate": 1.1168964498972819e-05, "loss": 0.0226, "step": 3515 }, { "epoch": 4.790190735694823, "grad_norm": 1.1816986602757071, "learning_rate": 1.1164581671097099e-05, "loss": 0.0052, "step": 3516 }, { "epoch": 4.791553133514986, "grad_norm": 0.6918394815090099, "learning_rate": 1.1160198616427307e-05, "loss": 0.0027, "step": 3517 }, { "epoch": 4.79291553133515, "grad_norm": 1.6730709631696385, "learning_rate": 1.115581533581701e-05, "loss": 0.0135, "step": 3518 }, { "epoch": 4.794277929155314, "grad_norm": 1.835637470645484, "learning_rate": 1.1151431830119818e-05, "loss": 0.0278, "step": 3519 }, { "epoch": 4.795640326975477, "grad_norm": 0.9680646414897277, "learning_rate": 1.1147048100189393e-05, "loss": 0.0059, "step": 3520 }, { "epoch": 4.79700272479564, "grad_norm": 1.48284233999681, "learning_rate": 1.1142664146879432e-05, "loss": 0.0118, "step": 3521 }, { "epoch": 4.798365122615804, "grad_norm": 1.8001028998350541, "learning_rate": 1.1138279971043679e-05, "loss": 0.0208, "step": 3522 }, { "epoch": 4.799727520435967, "grad_norm": 0.7638986438122236, "learning_rate": 1.1133895573535924e-05, "loss": 0.0086, "step": 3523 }, { "epoch": 4.801089918256131, "grad_norm": 1.8577708468257423, "learning_rate": 1.1129510955209996e-05, "loss": 0.0151, "step": 3524 }, { "epoch": 4.802452316076295, "grad_norm": 1.0693352432746603, "learning_rate": 1.1125126116919768e-05, "loss": 0.0084, "step": 3525 }, { "epoch": 4.8038147138964575, "grad_norm": 2.759774988893254, "learning_rate": 1.1120741059519158e-05, "loss": 0.0249, "step": 3526 }, { "epoch": 4.805177111716621, "grad_norm": 0.9357157125301245, "learning_rate": 1.1116355783862122e-05, "loss": 0.0239, "step": 3527 }, { "epoch": 4.806539509536785, "grad_norm": 1.1787543371806184, "learning_rate": 1.1111970290802665e-05, "loss": 0.0302, "step": 3528 }, { "epoch": 4.807901907356948, "grad_norm": 0.2660939567158777, "learning_rate": 1.1107584581194828e-05, "loss": 0.0013, "step": 3529 }, { "epoch": 4.809264305177112, "grad_norm": 2.0013230766585837, "learning_rate": 1.11031986558927e-05, "loss": 0.0188, "step": 3530 }, { "epoch": 4.810626702997276, "grad_norm": 2.6341061676140387, "learning_rate": 1.1098812515750405e-05, "loss": 0.0198, "step": 3531 }, { "epoch": 4.8119891008174385, "grad_norm": 3.6443445556724927, "learning_rate": 1.1094426161622119e-05, "loss": 0.0283, "step": 3532 }, { "epoch": 4.813351498637602, "grad_norm": 1.2961335066488993, "learning_rate": 1.1090039594362046e-05, "loss": 0.0072, "step": 3533 }, { "epoch": 4.814713896457766, "grad_norm": 5.620017973661717, "learning_rate": 1.1085652814824443e-05, "loss": 0.0186, "step": 3534 }, { "epoch": 4.816076294277929, "grad_norm": 0.6534267880764, "learning_rate": 1.1081265823863607e-05, "loss": 0.0242, "step": 3535 }, { "epoch": 4.817438692098093, "grad_norm": 0.34686840274193975, "learning_rate": 1.1076878622333869e-05, "loss": 0.0139, "step": 3536 }, { "epoch": 4.8188010899182565, "grad_norm": 0.5826168720834003, "learning_rate": 1.107249121108961e-05, "loss": 0.0291, "step": 3537 }, { "epoch": 4.820163487738419, "grad_norm": 0.9629331710677701, "learning_rate": 1.1068103590985241e-05, "loss": 0.0099, "step": 3538 }, { "epoch": 4.821525885558583, "grad_norm": 1.134028191532294, "learning_rate": 1.1063715762875225e-05, "loss": 0.0191, "step": 3539 }, { "epoch": 4.822888283378747, "grad_norm": 0.5344501450144461, "learning_rate": 1.105932772761406e-05, "loss": 0.0163, "step": 3540 }, { "epoch": 4.82425068119891, "grad_norm": 1.2502295544765667, "learning_rate": 1.1054939486056284e-05, "loss": 0.0206, "step": 3541 }, { "epoch": 4.825613079019074, "grad_norm": 1.0776348885427158, "learning_rate": 1.1050551039056479e-05, "loss": 0.0032, "step": 3542 }, { "epoch": 4.8269754768392374, "grad_norm": 1.210846983840452, "learning_rate": 1.104616238746926e-05, "loss": 0.0073, "step": 3543 }, { "epoch": 4.8283378746594, "grad_norm": 2.7522449110848513, "learning_rate": 1.1041773532149289e-05, "loss": 0.0228, "step": 3544 }, { "epoch": 4.829700272479564, "grad_norm": 1.0218429134077733, "learning_rate": 1.103738447395126e-05, "loss": 0.0229, "step": 3545 }, { "epoch": 4.831062670299728, "grad_norm": 2.3506365597606003, "learning_rate": 1.1032995213729918e-05, "loss": 0.0169, "step": 3546 }, { "epoch": 4.832425068119891, "grad_norm": 2.840512777037081, "learning_rate": 1.1028605752340036e-05, "loss": 0.0223, "step": 3547 }, { "epoch": 4.833787465940055, "grad_norm": 1.8802152037518254, "learning_rate": 1.1024216090636433e-05, "loss": 0.0154, "step": 3548 }, { "epoch": 4.835149863760218, "grad_norm": 2.4190492905391037, "learning_rate": 1.1019826229473962e-05, "loss": 0.0231, "step": 3549 }, { "epoch": 4.836512261580381, "grad_norm": 3.1915901581312633, "learning_rate": 1.1015436169707518e-05, "loss": 0.0318, "step": 3550 }, { "epoch": 4.837874659400545, "grad_norm": 1.0520913211025504, "learning_rate": 1.1011045912192036e-05, "loss": 0.0392, "step": 3551 }, { "epoch": 4.839237057220709, "grad_norm": 1.3510309313130375, "learning_rate": 1.1006655457782486e-05, "loss": 0.0049, "step": 3552 }, { "epoch": 4.840599455040872, "grad_norm": 1.9069879506720226, "learning_rate": 1.100226480733388e-05, "loss": 0.0199, "step": 3553 }, { "epoch": 4.8419618528610355, "grad_norm": 2.7700969223006298, "learning_rate": 1.0997873961701267e-05, "loss": 0.0473, "step": 3554 }, { "epoch": 4.843324250681199, "grad_norm": 0.9428752219952516, "learning_rate": 1.0993482921739728e-05, "loss": 0.0354, "step": 3555 }, { "epoch": 4.844686648501362, "grad_norm": 1.6067511497593479, "learning_rate": 1.0989091688304394e-05, "loss": 0.0271, "step": 3556 }, { "epoch": 4.846049046321526, "grad_norm": 2.258480203612967, "learning_rate": 1.0984700262250418e-05, "loss": 0.022, "step": 3557 }, { "epoch": 4.84741144414169, "grad_norm": 2.0387918292812324, "learning_rate": 1.098030864443301e-05, "loss": 0.0284, "step": 3558 }, { "epoch": 4.848773841961853, "grad_norm": 1.2139052168211362, "learning_rate": 1.0975916835707398e-05, "loss": 0.0148, "step": 3559 }, { "epoch": 4.8501362397820165, "grad_norm": 1.5828343541530983, "learning_rate": 1.097152483692886e-05, "loss": 0.0017, "step": 3560 }, { "epoch": 4.85149863760218, "grad_norm": 1.1294357942091982, "learning_rate": 1.0967132648952708e-05, "loss": 0.0171, "step": 3561 }, { "epoch": 4.852861035422343, "grad_norm": 0.6606150221304805, "learning_rate": 1.0962740272634282e-05, "loss": 0.0146, "step": 3562 }, { "epoch": 4.854223433242507, "grad_norm": 2.6741481427141216, "learning_rate": 1.0958347708828976e-05, "loss": 0.0177, "step": 3563 }, { "epoch": 4.855585831062671, "grad_norm": 0.6733234731412286, "learning_rate": 1.0953954958392206e-05, "loss": 0.0193, "step": 3564 }, { "epoch": 4.856948228882834, "grad_norm": 4.860277449388022, "learning_rate": 1.094956202217943e-05, "loss": 0.0249, "step": 3565 }, { "epoch": 4.858310626702997, "grad_norm": 2.7808824032173125, "learning_rate": 1.094516890104614e-05, "loss": 0.0169, "step": 3566 }, { "epoch": 4.859673024523161, "grad_norm": 1.7122965208549783, "learning_rate": 1.0940775595847868e-05, "loss": 0.0315, "step": 3567 }, { "epoch": 4.861035422343324, "grad_norm": 2.934074669768755, "learning_rate": 1.0936382107440173e-05, "loss": 0.0114, "step": 3568 }, { "epoch": 4.862397820163488, "grad_norm": 2.5061040493714417, "learning_rate": 1.0931988436678666e-05, "loss": 0.0123, "step": 3569 }, { "epoch": 4.863760217983652, "grad_norm": 1.7308374845322727, "learning_rate": 1.0927594584418976e-05, "loss": 0.0056, "step": 3570 }, { "epoch": 4.8651226158038146, "grad_norm": 1.4325129291875953, "learning_rate": 1.092320055151678e-05, "loss": 0.0147, "step": 3571 }, { "epoch": 4.866485013623978, "grad_norm": 1.9282272801242641, "learning_rate": 1.091880633882778e-05, "loss": 0.008, "step": 3572 }, { "epoch": 4.867847411444142, "grad_norm": 2.8784305788999474, "learning_rate": 1.0914411947207719e-05, "loss": 0.0219, "step": 3573 }, { "epoch": 4.869209809264305, "grad_norm": 1.4927873495271236, "learning_rate": 1.0910017377512375e-05, "loss": 0.005, "step": 3574 }, { "epoch": 4.870572207084469, "grad_norm": 1.5931568726542895, "learning_rate": 1.0905622630597558e-05, "loss": 0.0362, "step": 3575 }, { "epoch": 4.871934604904633, "grad_norm": 0.8990476577297469, "learning_rate": 1.0901227707319117e-05, "loss": 0.0313, "step": 3576 }, { "epoch": 4.8732970027247955, "grad_norm": 0.42947109165290176, "learning_rate": 1.089683260853293e-05, "loss": 0.008, "step": 3577 }, { "epoch": 4.874659400544959, "grad_norm": 1.6613982720832858, "learning_rate": 1.0892437335094911e-05, "loss": 0.0123, "step": 3578 }, { "epoch": 4.876021798365123, "grad_norm": 2.0158766694403716, "learning_rate": 1.0888041887861011e-05, "loss": 0.0098, "step": 3579 }, { "epoch": 4.877384196185286, "grad_norm": 2.030512621881084, "learning_rate": 1.0883646267687207e-05, "loss": 0.0354, "step": 3580 }, { "epoch": 4.87874659400545, "grad_norm": 2.3814720825596147, "learning_rate": 1.0879250475429523e-05, "loss": 0.0197, "step": 3581 }, { "epoch": 4.8801089918256135, "grad_norm": 1.9299973456059507, "learning_rate": 1.0874854511944004e-05, "loss": 0.0211, "step": 3582 }, { "epoch": 4.881471389645776, "grad_norm": 3.5942024295208355, "learning_rate": 1.087045837808673e-05, "loss": 0.0092, "step": 3583 }, { "epoch": 4.88283378746594, "grad_norm": 3.5277686819555054, "learning_rate": 1.0866062074713825e-05, "loss": 0.0261, "step": 3584 }, { "epoch": 4.884196185286104, "grad_norm": 2.4043620899702893, "learning_rate": 1.0861665602681432e-05, "loss": 0.0195, "step": 3585 }, { "epoch": 4.885558583106267, "grad_norm": 1.1412239298663138, "learning_rate": 1.0857268962845734e-05, "loss": 0.018, "step": 3586 }, { "epoch": 4.886920980926431, "grad_norm": 1.1521569714219375, "learning_rate": 1.0852872156062947e-05, "loss": 0.0193, "step": 3587 }, { "epoch": 4.8882833787465945, "grad_norm": 2.11825855022647, "learning_rate": 1.0848475183189316e-05, "loss": 0.0192, "step": 3588 }, { "epoch": 4.889645776566757, "grad_norm": 2.2542929270711087, "learning_rate": 1.0844078045081121e-05, "loss": 0.0184, "step": 3589 }, { "epoch": 4.891008174386921, "grad_norm": 1.2642859575254448, "learning_rate": 1.0839680742594679e-05, "loss": 0.0149, "step": 3590 }, { "epoch": 4.892370572207085, "grad_norm": 1.6203698482484992, "learning_rate": 1.0835283276586323e-05, "loss": 0.0148, "step": 3591 }, { "epoch": 4.893732970027248, "grad_norm": 0.7947987212612314, "learning_rate": 1.083088564791244e-05, "loss": 0.0143, "step": 3592 }, { "epoch": 4.895095367847412, "grad_norm": 2.624716854681951, "learning_rate": 1.0826487857429428e-05, "loss": 0.0354, "step": 3593 }, { "epoch": 4.896457765667575, "grad_norm": 0.7007343729665717, "learning_rate": 1.082208990599373e-05, "loss": 0.0239, "step": 3594 }, { "epoch": 4.897820163487738, "grad_norm": 2.5610495185794964, "learning_rate": 1.0817691794461817e-05, "loss": 0.0202, "step": 3595 }, { "epoch": 4.899182561307902, "grad_norm": 0.9634821508587694, "learning_rate": 1.0813293523690191e-05, "loss": 0.0061, "step": 3596 }, { "epoch": 4.900544959128066, "grad_norm": 2.1934108278884006, "learning_rate": 1.080889509453538e-05, "loss": 0.0103, "step": 3597 }, { "epoch": 4.901907356948229, "grad_norm": 1.6125108154692975, "learning_rate": 1.0804496507853947e-05, "loss": 0.0197, "step": 3598 }, { "epoch": 4.9032697547683926, "grad_norm": 1.5975683356890682, "learning_rate": 1.080009776450249e-05, "loss": 0.0219, "step": 3599 }, { "epoch": 4.904632152588556, "grad_norm": 1.120403027761138, "learning_rate": 1.079569886533763e-05, "loss": 0.018, "step": 3600 }, { "epoch": 4.905994550408719, "grad_norm": 1.085571545103211, "learning_rate": 1.0791299811216025e-05, "loss": 0.005, "step": 3601 }, { "epoch": 4.907356948228883, "grad_norm": 2.104074708284211, "learning_rate": 1.078690060299436e-05, "loss": 0.0231, "step": 3602 }, { "epoch": 4.908719346049047, "grad_norm": 1.4848425328218386, "learning_rate": 1.0782501241529339e-05, "loss": 0.0241, "step": 3603 }, { "epoch": 4.91008174386921, "grad_norm": 3.484730289905689, "learning_rate": 1.0778101727677721e-05, "loss": 0.0652, "step": 3604 }, { "epoch": 4.9114441416893735, "grad_norm": 2.4792695872806254, "learning_rate": 1.0773702062296273e-05, "loss": 0.04, "step": 3605 }, { "epoch": 4.912806539509537, "grad_norm": 1.6885403829856214, "learning_rate": 1.07693022462418e-05, "loss": 0.0661, "step": 3606 }, { "epoch": 4.9141689373297, "grad_norm": 2.898909399506897, "learning_rate": 1.0764902280371134e-05, "loss": 0.0351, "step": 3607 }, { "epoch": 4.915531335149864, "grad_norm": 1.3525888539604276, "learning_rate": 1.0760502165541145e-05, "loss": 0.0252, "step": 3608 }, { "epoch": 4.916893732970028, "grad_norm": 2.421812173064506, "learning_rate": 1.0756101902608711e-05, "loss": 0.0077, "step": 3609 }, { "epoch": 4.918256130790191, "grad_norm": 0.8578979920573719, "learning_rate": 1.0751701492430761e-05, "loss": 0.0114, "step": 3610 }, { "epoch": 4.919618528610354, "grad_norm": 2.27872541320297, "learning_rate": 1.0747300935864245e-05, "loss": 0.0204, "step": 3611 }, { "epoch": 4.920980926430518, "grad_norm": 2.3552204517808235, "learning_rate": 1.0742900233766133e-05, "loss": 0.0136, "step": 3612 }, { "epoch": 4.922343324250681, "grad_norm": 2.9014038661544976, "learning_rate": 1.0738499386993439e-05, "loss": 0.0103, "step": 3613 }, { "epoch": 4.923705722070845, "grad_norm": 2.5843247108362046, "learning_rate": 1.0734098396403192e-05, "loss": 0.0347, "step": 3614 }, { "epoch": 4.925068119891008, "grad_norm": 2.6530498806000273, "learning_rate": 1.0729697262852454e-05, "loss": 0.0136, "step": 3615 }, { "epoch": 4.926430517711172, "grad_norm": 1.8207944187175065, "learning_rate": 1.0725295987198318e-05, "loss": 0.014, "step": 3616 }, { "epoch": 4.927792915531335, "grad_norm": 2.4851260823801975, "learning_rate": 1.0720894570297898e-05, "loss": 0.016, "step": 3617 }, { "epoch": 4.929155313351498, "grad_norm": 3.917505551812192, "learning_rate": 1.071649301300834e-05, "loss": 0.0483, "step": 3618 }, { "epoch": 4.930517711171662, "grad_norm": 2.7304149153522954, "learning_rate": 1.0712091316186817e-05, "loss": 0.026, "step": 3619 }, { "epoch": 4.931880108991826, "grad_norm": 2.364963812310441, "learning_rate": 1.0707689480690526e-05, "loss": 0.0176, "step": 3620 }, { "epoch": 4.933242506811989, "grad_norm": 2.626023856704278, "learning_rate": 1.0703287507376699e-05, "loss": 0.0523, "step": 3621 }, { "epoch": 4.9346049046321525, "grad_norm": 1.675682707911374, "learning_rate": 1.0698885397102584e-05, "loss": 0.0381, "step": 3622 }, { "epoch": 4.935967302452316, "grad_norm": 2.3641396398302827, "learning_rate": 1.0694483150725458e-05, "loss": 0.0554, "step": 3623 }, { "epoch": 4.937329700272479, "grad_norm": 2.6902569517215063, "learning_rate": 1.0690080769102638e-05, "loss": 0.0319, "step": 3624 }, { "epoch": 4.938692098092643, "grad_norm": 2.9070675625303064, "learning_rate": 1.0685678253091449e-05, "loss": 0.006, "step": 3625 }, { "epoch": 4.940054495912807, "grad_norm": 2.6791245330830917, "learning_rate": 1.0681275603549253e-05, "loss": 0.0258, "step": 3626 }, { "epoch": 4.94141689373297, "grad_norm": 3.881329708394896, "learning_rate": 1.067687282133343e-05, "loss": 0.0082, "step": 3627 }, { "epoch": 4.9427792915531334, "grad_norm": 4.737145495857374, "learning_rate": 1.06724699073014e-05, "loss": 0.0481, "step": 3628 }, { "epoch": 4.944141689373297, "grad_norm": 2.231766022548187, "learning_rate": 1.0668066862310588e-05, "loss": 0.0446, "step": 3629 }, { "epoch": 4.94550408719346, "grad_norm": 5.223137907788839, "learning_rate": 1.0663663687218467e-05, "loss": 0.0264, "step": 3630 }, { "epoch": 4.946866485013624, "grad_norm": 2.0211972225839347, "learning_rate": 1.0659260382882516e-05, "loss": 0.0418, "step": 3631 }, { "epoch": 4.948228882833788, "grad_norm": 4.50279818547591, "learning_rate": 1.0654856950160252e-05, "loss": 0.0313, "step": 3632 }, { "epoch": 4.949591280653951, "grad_norm": 2.1765253037261316, "learning_rate": 1.0650453389909212e-05, "loss": 0.0176, "step": 3633 }, { "epoch": 4.950953678474114, "grad_norm": 2.305822442989859, "learning_rate": 1.0646049702986958e-05, "loss": 0.027, "step": 3634 }, { "epoch": 4.952316076294278, "grad_norm": 3.0553937764187866, "learning_rate": 1.0641645890251075e-05, "loss": 0.0214, "step": 3635 }, { "epoch": 4.953678474114441, "grad_norm": 2.1421820802895923, "learning_rate": 1.0637241952559176e-05, "loss": 0.0175, "step": 3636 }, { "epoch": 4.955040871934605, "grad_norm": 0.47396508520080655, "learning_rate": 1.06328378907689e-05, "loss": 0.0026, "step": 3637 }, { "epoch": 4.956403269754769, "grad_norm": 2.5369976217664605, "learning_rate": 1.0628433705737902e-05, "loss": 0.0215, "step": 3638 }, { "epoch": 4.9577656675749315, "grad_norm": 2.123300805500405, "learning_rate": 1.062402939832387e-05, "loss": 0.0051, "step": 3639 }, { "epoch": 4.959128065395095, "grad_norm": 2.6395814332547864, "learning_rate": 1.0619624969384508e-05, "loss": 0.0315, "step": 3640 }, { "epoch": 4.960490463215259, "grad_norm": 0.9566528232823971, "learning_rate": 1.0615220419777548e-05, "loss": 0.0261, "step": 3641 }, { "epoch": 4.961852861035422, "grad_norm": 2.0261987372808057, "learning_rate": 1.0610815750360748e-05, "loss": 0.0105, "step": 3642 }, { "epoch": 4.963215258855586, "grad_norm": 1.8084918769792204, "learning_rate": 1.0606410961991884e-05, "loss": 0.0225, "step": 3643 }, { "epoch": 4.96457765667575, "grad_norm": 1.2162608994058264, "learning_rate": 1.060200605552876e-05, "loss": 0.0212, "step": 3644 }, { "epoch": 4.9659400544959125, "grad_norm": 2.100142773817672, "learning_rate": 1.0597601031829197e-05, "loss": 0.0252, "step": 3645 }, { "epoch": 4.967302452316076, "grad_norm": 1.7214186792179285, "learning_rate": 1.0593195891751042e-05, "loss": 0.0185, "step": 3646 }, { "epoch": 4.96866485013624, "grad_norm": 0.6715155003455395, "learning_rate": 1.0588790636152169e-05, "loss": 0.0026, "step": 3647 }, { "epoch": 4.970027247956403, "grad_norm": 0.8505017644055539, "learning_rate": 1.058438526589047e-05, "loss": 0.0422, "step": 3648 }, { "epoch": 4.971389645776567, "grad_norm": 4.154034676030612, "learning_rate": 1.0579979781823856e-05, "loss": 0.0535, "step": 3649 }, { "epoch": 4.9727520435967305, "grad_norm": 1.1055358859559796, "learning_rate": 1.057557418481027e-05, "loss": 0.0077, "step": 3650 }, { "epoch": 4.974114441416893, "grad_norm": 2.5235094136460106, "learning_rate": 1.0571168475707664e-05, "loss": 0.0324, "step": 3651 }, { "epoch": 4.975476839237057, "grad_norm": 3.96728818326299, "learning_rate": 1.0566762655374021e-05, "loss": 0.0205, "step": 3652 }, { "epoch": 4.976839237057221, "grad_norm": 1.1364005439326708, "learning_rate": 1.0562356724667346e-05, "loss": 0.0049, "step": 3653 }, { "epoch": 4.978201634877384, "grad_norm": 2.982658249221755, "learning_rate": 1.0557950684445662e-05, "loss": 0.0241, "step": 3654 }, { "epoch": 4.979564032697548, "grad_norm": 3.0189962077959422, "learning_rate": 1.0553544535567015e-05, "loss": 0.0303, "step": 3655 }, { "epoch": 4.9809264305177114, "grad_norm": 0.7167606333549763, "learning_rate": 1.0549138278889468e-05, "loss": 0.0148, "step": 3656 }, { "epoch": 4.982288828337874, "grad_norm": 2.6008473272540833, "learning_rate": 1.0544731915271111e-05, "loss": 0.0272, "step": 3657 }, { "epoch": 4.983651226158038, "grad_norm": 3.6581953309342703, "learning_rate": 1.0540325445570052e-05, "loss": 0.0455, "step": 3658 }, { "epoch": 4.985013623978202, "grad_norm": 1.704874550846183, "learning_rate": 1.0535918870644419e-05, "loss": 0.0328, "step": 3659 }, { "epoch": 4.986376021798365, "grad_norm": 2.015504089163355, "learning_rate": 1.0531512191352365e-05, "loss": 0.0136, "step": 3660 }, { "epoch": 4.987738419618529, "grad_norm": 1.17180966429713, "learning_rate": 1.0527105408552054e-05, "loss": 0.0238, "step": 3661 }, { "epoch": 4.989100817438692, "grad_norm": 1.5702899725428234, "learning_rate": 1.0522698523101683e-05, "loss": 0.0276, "step": 3662 }, { "epoch": 4.990463215258855, "grad_norm": 1.2048967280322853, "learning_rate": 1.0518291535859452e-05, "loss": 0.0327, "step": 3663 }, { "epoch": 4.991825613079019, "grad_norm": 1.1600058770582238, "learning_rate": 1.05138844476836e-05, "loss": 0.0087, "step": 3664 }, { "epoch": 4.993188010899183, "grad_norm": 4.130940880460577, "learning_rate": 1.0509477259432372e-05, "loss": 0.0398, "step": 3665 }, { "epoch": 4.994550408719346, "grad_norm": 1.348912953195957, "learning_rate": 1.0505069971964036e-05, "loss": 0.0281, "step": 3666 }, { "epoch": 4.9959128065395095, "grad_norm": 1.0652543231118923, "learning_rate": 1.0500662586136888e-05, "loss": 0.0123, "step": 3667 }, { "epoch": 4.997275204359673, "grad_norm": 2.478146016899061, "learning_rate": 1.0496255102809224e-05, "loss": 0.0195, "step": 3668 }, { "epoch": 4.998637602179836, "grad_norm": 1.736326368519367, "learning_rate": 1.0491847522839376e-05, "loss": 0.0225, "step": 3669 }, { "epoch": 5.0, "grad_norm": 1.474384093389617, "learning_rate": 1.0487439847085688e-05, "loss": 0.0132, "step": 3670 }, { "epoch": 5.0, "eval_accuracy": 0.9483436271757439, "eval_f1": 0.9391795138021971, "eval_loss": 0.11607106029987335, "eval_precision": 0.9329894271663146, "eval_recall": 0.9483978635482812, "eval_runtime": 16.9697, "eval_samples_per_second": 104.951, "eval_steps_per_second": 0.825, "step": 3670 }, { "epoch": 5.001362397820164, "grad_norm": 3.6128975501090466, "learning_rate": 1.0483032076406528e-05, "loss": 0.0275, "step": 3671 }, { "epoch": 5.002724795640327, "grad_norm": 1.7968288888914856, "learning_rate": 1.047862421166027e-05, "loss": 0.0174, "step": 3672 }, { "epoch": 5.0040871934604905, "grad_norm": 2.9136585555185444, "learning_rate": 1.0474216253705325e-05, "loss": 0.0075, "step": 3673 }, { "epoch": 5.005449591280654, "grad_norm": 2.8277561008052743, "learning_rate": 1.0469808203400103e-05, "loss": 0.0129, "step": 3674 }, { "epoch": 5.006811989100817, "grad_norm": 1.540919015560752, "learning_rate": 1.046540006160304e-05, "loss": 0.0107, "step": 3675 }, { "epoch": 5.008174386920981, "grad_norm": 3.5695222268744558, "learning_rate": 1.04609918291726e-05, "loss": 0.0309, "step": 3676 }, { "epoch": 5.009536784741145, "grad_norm": 0.9792400204346754, "learning_rate": 1.0456583506967248e-05, "loss": 0.0025, "step": 3677 }, { "epoch": 5.010899182561308, "grad_norm": 1.5395417104847924, "learning_rate": 1.0452175095845474e-05, "loss": 0.0184, "step": 3678 }, { "epoch": 5.012261580381471, "grad_norm": 3.5286779039261384, "learning_rate": 1.0447766596665789e-05, "loss": 0.0039, "step": 3679 }, { "epoch": 5.013623978201635, "grad_norm": 1.3616082826077598, "learning_rate": 1.0443358010286714e-05, "loss": 0.0122, "step": 3680 }, { "epoch": 5.014986376021798, "grad_norm": 2.03341409310747, "learning_rate": 1.0438949337566789e-05, "loss": 0.0046, "step": 3681 }, { "epoch": 5.016348773841962, "grad_norm": 2.082927625129205, "learning_rate": 1.0434540579364574e-05, "loss": 0.0028, "step": 3682 }, { "epoch": 5.017711171662126, "grad_norm": 2.157336903878437, "learning_rate": 1.0430131736538645e-05, "loss": 0.0169, "step": 3683 }, { "epoch": 5.0190735694822886, "grad_norm": 1.307609683341958, "learning_rate": 1.0425722809947588e-05, "loss": 0.012, "step": 3684 }, { "epoch": 5.020435967302452, "grad_norm": 1.1125673087555739, "learning_rate": 1.0421313800450015e-05, "loss": 0.0306, "step": 3685 }, { "epoch": 5.021798365122616, "grad_norm": 1.059254875260584, "learning_rate": 1.041690470890455e-05, "loss": 0.0282, "step": 3686 }, { "epoch": 5.023160762942779, "grad_norm": 1.3568306698952113, "learning_rate": 1.0412495536169826e-05, "loss": 0.0051, "step": 3687 }, { "epoch": 5.024523160762943, "grad_norm": 1.4409765761104496, "learning_rate": 1.0408086283104506e-05, "loss": 0.0112, "step": 3688 }, { "epoch": 5.025885558583107, "grad_norm": 0.6070600746872727, "learning_rate": 1.040367695056726e-05, "loss": 0.0043, "step": 3689 }, { "epoch": 5.0272479564032695, "grad_norm": 1.9200232032791031, "learning_rate": 1.0399267539416766e-05, "loss": 0.0256, "step": 3690 }, { "epoch": 5.028610354223433, "grad_norm": 0.3794633931737352, "learning_rate": 1.039485805051174e-05, "loss": 0.0076, "step": 3691 }, { "epoch": 5.029972752043597, "grad_norm": 0.2559239730743419, "learning_rate": 1.0390448484710887e-05, "loss": 0.002, "step": 3692 }, { "epoch": 5.03133514986376, "grad_norm": 1.6049178347674156, "learning_rate": 1.038603884287294e-05, "loss": 0.0332, "step": 3693 }, { "epoch": 5.032697547683924, "grad_norm": 0.21001693067773491, "learning_rate": 1.0381629125856653e-05, "loss": 0.0012, "step": 3694 }, { "epoch": 5.0340599455040875, "grad_norm": 1.8976778156026437, "learning_rate": 1.0377219334520783e-05, "loss": 0.0188, "step": 3695 }, { "epoch": 5.03542234332425, "grad_norm": 2.384066298422506, "learning_rate": 1.0372809469724101e-05, "loss": 0.0319, "step": 3696 }, { "epoch": 5.036784741144414, "grad_norm": 2.517111274287285, "learning_rate": 1.0368399532325406e-05, "loss": 0.0121, "step": 3697 }, { "epoch": 5.038147138964578, "grad_norm": 1.703426248389894, "learning_rate": 1.0363989523183496e-05, "loss": 0.0107, "step": 3698 }, { "epoch": 5.039509536784741, "grad_norm": 0.36246631356068837, "learning_rate": 1.0359579443157193e-05, "loss": 0.0084, "step": 3699 }, { "epoch": 5.040871934604905, "grad_norm": 1.4536668031744335, "learning_rate": 1.0355169293105325e-05, "loss": 0.01, "step": 3700 }, { "epoch": 5.0422343324250685, "grad_norm": 1.844353686698864, "learning_rate": 1.035075907388674e-05, "loss": 0.0037, "step": 3701 }, { "epoch": 5.043596730245231, "grad_norm": 1.1868641509075386, "learning_rate": 1.0346348786360298e-05, "loss": 0.0068, "step": 3702 }, { "epoch": 5.044959128065395, "grad_norm": 1.6167299614712776, "learning_rate": 1.034193843138487e-05, "loss": 0.0109, "step": 3703 }, { "epoch": 5.046321525885559, "grad_norm": 2.8127146751504033, "learning_rate": 1.0337528009819343e-05, "loss": 0.029, "step": 3704 }, { "epoch": 5.047683923705722, "grad_norm": 1.382400061034093, "learning_rate": 1.0333117522522615e-05, "loss": 0.0155, "step": 3705 }, { "epoch": 5.049046321525886, "grad_norm": 2.3277575396231955, "learning_rate": 1.0328706970353595e-05, "loss": 0.0181, "step": 3706 }, { "epoch": 5.050408719346049, "grad_norm": 2.3596971458534375, "learning_rate": 1.0324296354171209e-05, "loss": 0.0219, "step": 3707 }, { "epoch": 5.051771117166212, "grad_norm": 0.9139172476518665, "learning_rate": 1.0319885674834393e-05, "loss": 0.0239, "step": 3708 }, { "epoch": 5.053133514986376, "grad_norm": 0.7146115805526178, "learning_rate": 1.0315474933202097e-05, "loss": 0.014, "step": 3709 }, { "epoch": 5.05449591280654, "grad_norm": 2.9256622283861566, "learning_rate": 1.0311064130133278e-05, "loss": 0.0256, "step": 3710 }, { "epoch": 5.055858310626703, "grad_norm": 1.5402640889743529, "learning_rate": 1.0306653266486915e-05, "loss": 0.0281, "step": 3711 }, { "epoch": 5.0572207084468666, "grad_norm": 1.1809258263036566, "learning_rate": 1.0302242343121989e-05, "loss": 0.0201, "step": 3712 }, { "epoch": 5.05858310626703, "grad_norm": 3.0738303747585776, "learning_rate": 1.0297831360897493e-05, "loss": 0.0256, "step": 3713 }, { "epoch": 5.059945504087193, "grad_norm": 1.4260348146380082, "learning_rate": 1.029342032067244e-05, "loss": 0.0039, "step": 3714 }, { "epoch": 5.061307901907357, "grad_norm": 2.09804268863721, "learning_rate": 1.0289009223305848e-05, "loss": 0.005, "step": 3715 }, { "epoch": 5.062670299727521, "grad_norm": 1.1892668986941528, "learning_rate": 1.0284598069656746e-05, "loss": 0.0097, "step": 3716 }, { "epoch": 5.064032697547684, "grad_norm": 3.3246288933875414, "learning_rate": 1.0280186860584175e-05, "loss": 0.0181, "step": 3717 }, { "epoch": 5.0653950953678475, "grad_norm": 1.0604234956915604, "learning_rate": 1.0275775596947189e-05, "loss": 0.004, "step": 3718 }, { "epoch": 5.066757493188011, "grad_norm": 2.4848045359375295, "learning_rate": 1.0271364279604844e-05, "loss": 0.005, "step": 3719 }, { "epoch": 5.068119891008174, "grad_norm": 1.335730112171245, "learning_rate": 1.0266952909416221e-05, "loss": 0.0166, "step": 3720 }, { "epoch": 5.069482288828338, "grad_norm": 1.1860507134177307, "learning_rate": 1.0262541487240401e-05, "loss": 0.0036, "step": 3721 }, { "epoch": 5.070844686648502, "grad_norm": 0.8826065886457815, "learning_rate": 1.0258130013936473e-05, "loss": 0.0061, "step": 3722 }, { "epoch": 5.072207084468665, "grad_norm": 0.9457916042487873, "learning_rate": 1.0253718490363547e-05, "loss": 0.0019, "step": 3723 }, { "epoch": 5.073569482288828, "grad_norm": 1.059711857719507, "learning_rate": 1.0249306917380731e-05, "loss": 0.0105, "step": 3724 }, { "epoch": 5.074931880108992, "grad_norm": 0.9660304735720028, "learning_rate": 1.0244895295847148e-05, "loss": 0.0105, "step": 3725 }, { "epoch": 5.076294277929155, "grad_norm": 1.6878714716469074, "learning_rate": 1.0240483626621933e-05, "loss": 0.0278, "step": 3726 }, { "epoch": 5.077656675749319, "grad_norm": 1.4802004937374327, "learning_rate": 1.0236071910564229e-05, "loss": 0.0137, "step": 3727 }, { "epoch": 5.079019073569483, "grad_norm": 1.565148167218117, "learning_rate": 1.0231660148533183e-05, "loss": 0.0227, "step": 3728 }, { "epoch": 5.080381471389646, "grad_norm": 0.1918541615248023, "learning_rate": 1.0227248341387955e-05, "loss": 0.0081, "step": 3729 }, { "epoch": 5.081743869209809, "grad_norm": 0.7188187355634447, "learning_rate": 1.022283648998771e-05, "loss": 0.0097, "step": 3730 }, { "epoch": 5.083106267029973, "grad_norm": 1.1259385558950539, "learning_rate": 1.0218424595191632e-05, "loss": 0.0018, "step": 3731 }, { "epoch": 5.084468664850136, "grad_norm": 1.687654316004312, "learning_rate": 1.0214012657858903e-05, "loss": 0.024, "step": 3732 }, { "epoch": 5.0858310626703, "grad_norm": 1.832449796111267, "learning_rate": 1.0209600678848717e-05, "loss": 0.0075, "step": 3733 }, { "epoch": 5.087193460490464, "grad_norm": 0.6656130913246562, "learning_rate": 1.0205188659020275e-05, "loss": 0.0055, "step": 3734 }, { "epoch": 5.0885558583106265, "grad_norm": 1.5432268997754344, "learning_rate": 1.020077659923279e-05, "loss": 0.009, "step": 3735 }, { "epoch": 5.08991825613079, "grad_norm": 1.6048804921849646, "learning_rate": 1.0196364500345473e-05, "loss": 0.0052, "step": 3736 }, { "epoch": 5.091280653950954, "grad_norm": 0.1832326458522336, "learning_rate": 1.0191952363217557e-05, "loss": 0.008, "step": 3737 }, { "epoch": 5.092643051771117, "grad_norm": 2.393602830085593, "learning_rate": 1.018754018870827e-05, "loss": 0.0172, "step": 3738 }, { "epoch": 5.094005449591281, "grad_norm": 1.4508467509618672, "learning_rate": 1.018312797767685e-05, "loss": 0.0159, "step": 3739 }, { "epoch": 5.0953678474114446, "grad_norm": 0.7016395452431612, "learning_rate": 1.017871573098255e-05, "loss": 0.0028, "step": 3740 }, { "epoch": 5.0967302452316074, "grad_norm": 1.001305073953219, "learning_rate": 1.017430344948462e-05, "loss": 0.0078, "step": 3741 }, { "epoch": 5.098092643051771, "grad_norm": 1.7386471930914293, "learning_rate": 1.0169891134042318e-05, "loss": 0.0229, "step": 3742 }, { "epoch": 5.099455040871935, "grad_norm": 1.3130540456087691, "learning_rate": 1.0165478785514919e-05, "loss": 0.0062, "step": 3743 }, { "epoch": 5.100817438692098, "grad_norm": 1.532484777374295, "learning_rate": 1.016106640476169e-05, "loss": 0.0118, "step": 3744 }, { "epoch": 5.102179836512262, "grad_norm": 3.017389431637663, "learning_rate": 1.0156653992641916e-05, "loss": 0.0297, "step": 3745 }, { "epoch": 5.1035422343324255, "grad_norm": 2.554251381429934, "learning_rate": 1.015224155001488e-05, "loss": 0.0046, "step": 3746 }, { "epoch": 5.104904632152588, "grad_norm": 1.782172563291434, "learning_rate": 1.0147829077739875e-05, "loss": 0.0031, "step": 3747 }, { "epoch": 5.106267029972752, "grad_norm": 1.8051030640098271, "learning_rate": 1.0143416576676198e-05, "loss": 0.0076, "step": 3748 }, { "epoch": 5.107629427792916, "grad_norm": 2.2156978359366724, "learning_rate": 1.0139004047683152e-05, "loss": 0.0144, "step": 3749 }, { "epoch": 5.108991825613079, "grad_norm": 2.9688277255864843, "learning_rate": 1.013459149162005e-05, "loss": 0.0151, "step": 3750 }, { "epoch": 5.110354223433243, "grad_norm": 0.7281501620158236, "learning_rate": 1.0130178909346201e-05, "loss": 0.0044, "step": 3751 }, { "epoch": 5.111716621253406, "grad_norm": 1.7451256214077828, "learning_rate": 1.012576630172093e-05, "loss": 0.0057, "step": 3752 }, { "epoch": 5.113079019073569, "grad_norm": 3.845959723158212, "learning_rate": 1.0121353669603552e-05, "loss": 0.0303, "step": 3753 }, { "epoch": 5.114441416893733, "grad_norm": 2.4655358711894464, "learning_rate": 1.0116941013853407e-05, "loss": 0.028, "step": 3754 }, { "epoch": 5.115803814713897, "grad_norm": 2.484789297598725, "learning_rate": 1.0112528335329824e-05, "loss": 0.0266, "step": 3755 }, { "epoch": 5.11716621253406, "grad_norm": 1.2534655442566613, "learning_rate": 1.0108115634892134e-05, "loss": 0.049, "step": 3756 }, { "epoch": 5.118528610354224, "grad_norm": 3.28548564307562, "learning_rate": 1.0103702913399695e-05, "loss": 0.0309, "step": 3757 }, { "epoch": 5.1198910081743865, "grad_norm": 3.76487046444419, "learning_rate": 1.0099290171711841e-05, "loss": 0.0124, "step": 3758 }, { "epoch": 5.12125340599455, "grad_norm": 0.695183743923656, "learning_rate": 1.0094877410687922e-05, "loss": 0.0109, "step": 3759 }, { "epoch": 5.122615803814714, "grad_norm": 4.069878899608481, "learning_rate": 1.00904646311873e-05, "loss": 0.0129, "step": 3760 }, { "epoch": 5.123978201634877, "grad_norm": 1.8026710765770053, "learning_rate": 1.0086051834069327e-05, "loss": 0.0022, "step": 3761 }, { "epoch": 5.125340599455041, "grad_norm": 0.4706719928909071, "learning_rate": 1.0081639020193366e-05, "loss": 0.0084, "step": 3762 }, { "epoch": 5.1267029972752045, "grad_norm": 2.858942539507461, "learning_rate": 1.0077226190418783e-05, "loss": 0.008, "step": 3763 }, { "epoch": 5.128065395095367, "grad_norm": 2.829806715783769, "learning_rate": 1.0072813345604942e-05, "loss": 0.013, "step": 3764 }, { "epoch": 5.129427792915531, "grad_norm": 1.4053685752168952, "learning_rate": 1.0068400486611213e-05, "loss": 0.013, "step": 3765 }, { "epoch": 5.130790190735695, "grad_norm": 2.956980932484498, "learning_rate": 1.0063987614296972e-05, "loss": 0.0202, "step": 3766 }, { "epoch": 5.132152588555858, "grad_norm": 2.0464634154779495, "learning_rate": 1.0059574729521595e-05, "loss": 0.0026, "step": 3767 }, { "epoch": 5.133514986376022, "grad_norm": 1.8681546384879213, "learning_rate": 1.0055161833144458e-05, "loss": 0.0073, "step": 3768 }, { "epoch": 5.1348773841961854, "grad_norm": 1.030553188734171, "learning_rate": 1.0050748926024944e-05, "loss": 0.0033, "step": 3769 }, { "epoch": 5.136239782016348, "grad_norm": 1.860331204434374, "learning_rate": 1.0046336009022435e-05, "loss": 0.0153, "step": 3770 }, { "epoch": 5.137602179836512, "grad_norm": 2.013247134317706, "learning_rate": 1.0041923082996312e-05, "loss": 0.0294, "step": 3771 }, { "epoch": 5.138964577656676, "grad_norm": 0.41750848076970765, "learning_rate": 1.0037510148805966e-05, "loss": 0.0084, "step": 3772 }, { "epoch": 5.140326975476839, "grad_norm": 2.367139219394875, "learning_rate": 1.0033097207310784e-05, "loss": 0.0133, "step": 3773 }, { "epoch": 5.141689373297003, "grad_norm": 1.5000533405235672, "learning_rate": 1.0028684259370149e-05, "loss": 0.009, "step": 3774 }, { "epoch": 5.143051771117166, "grad_norm": 2.1647118009383433, "learning_rate": 1.0024271305843466e-05, "loss": 0.0296, "step": 3775 }, { "epoch": 5.144414168937329, "grad_norm": 0.5239627148590298, "learning_rate": 1.001985834759011e-05, "loss": 0.0165, "step": 3776 }, { "epoch": 5.145776566757493, "grad_norm": 1.8377009074139168, "learning_rate": 1.001544538546948e-05, "loss": 0.0116, "step": 3777 }, { "epoch": 5.147138964577657, "grad_norm": 0.9570869894363767, "learning_rate": 1.0011032420340973e-05, "loss": 0.0235, "step": 3778 }, { "epoch": 5.14850136239782, "grad_norm": 0.9402880394637168, "learning_rate": 1.000661945306398e-05, "loss": 0.008, "step": 3779 }, { "epoch": 5.1498637602179835, "grad_norm": 3.415031340166857, "learning_rate": 1.0002206484497891e-05, "loss": 0.0287, "step": 3780 }, { "epoch": 5.151226158038147, "grad_norm": 1.9145596738400377, "learning_rate": 9.997793515502109e-06, "loss": 0.024, "step": 3781 }, { "epoch": 5.15258855585831, "grad_norm": 1.9931294152372385, "learning_rate": 9.993380546936023e-06, "loss": 0.0261, "step": 3782 }, { "epoch": 5.153950953678474, "grad_norm": 1.4446346895962558, "learning_rate": 9.98896757965903e-06, "loss": 0.01, "step": 3783 }, { "epoch": 5.155313351498638, "grad_norm": 1.588610712847131, "learning_rate": 9.984554614530523e-06, "loss": 0.0111, "step": 3784 }, { "epoch": 5.156675749318801, "grad_norm": 0.7748507108522603, "learning_rate": 9.980141652409896e-06, "loss": 0.0056, "step": 3785 }, { "epoch": 5.1580381471389645, "grad_norm": 1.6141010277011822, "learning_rate": 9.975728694156541e-06, "loss": 0.0209, "step": 3786 }, { "epoch": 5.159400544959128, "grad_norm": 3.317527558291178, "learning_rate": 9.97131574062985e-06, "loss": 0.0254, "step": 3787 }, { "epoch": 5.160762942779291, "grad_norm": 1.973431068144296, "learning_rate": 9.966902792689219e-06, "loss": 0.0114, "step": 3788 }, { "epoch": 5.162125340599455, "grad_norm": 0.8815719419268645, "learning_rate": 9.962489851194035e-06, "loss": 0.0159, "step": 3789 }, { "epoch": 5.163487738419619, "grad_norm": 2.319507396781177, "learning_rate": 9.958076917003691e-06, "loss": 0.0283, "step": 3790 }, { "epoch": 5.164850136239782, "grad_norm": 1.2956833522260414, "learning_rate": 9.953663990977568e-06, "loss": 0.0074, "step": 3791 }, { "epoch": 5.166212534059945, "grad_norm": 1.2363878580658563, "learning_rate": 9.949251073975059e-06, "loss": 0.0092, "step": 3792 }, { "epoch": 5.167574931880109, "grad_norm": 1.5887596072479089, "learning_rate": 9.944838166855544e-06, "loss": 0.0213, "step": 3793 }, { "epoch": 5.168937329700272, "grad_norm": 2.1947468387779865, "learning_rate": 9.940425270478407e-06, "loss": 0.0162, "step": 3794 }, { "epoch": 5.170299727520436, "grad_norm": 1.8628165061865105, "learning_rate": 9.936012385703031e-06, "loss": 0.0147, "step": 3795 }, { "epoch": 5.1716621253406, "grad_norm": 0.8996078185778584, "learning_rate": 9.93159951338879e-06, "loss": 0.0064, "step": 3796 }, { "epoch": 5.1730245231607626, "grad_norm": 3.1742094875477047, "learning_rate": 9.927186654395063e-06, "loss": 0.0453, "step": 3797 }, { "epoch": 5.174386920980926, "grad_norm": 1.6230455253948974, "learning_rate": 9.922773809581222e-06, "loss": 0.0181, "step": 3798 }, { "epoch": 5.17574931880109, "grad_norm": 2.0860588942003164, "learning_rate": 9.918360979806636e-06, "loss": 0.0086, "step": 3799 }, { "epoch": 5.177111716621253, "grad_norm": 1.7807769728933276, "learning_rate": 9.913948165930676e-06, "loss": 0.0058, "step": 3800 }, { "epoch": 5.178474114441417, "grad_norm": 0.7584022635214972, "learning_rate": 9.909535368812702e-06, "loss": 0.002, "step": 3801 }, { "epoch": 5.179836512261581, "grad_norm": 1.7200548492993761, "learning_rate": 9.90512258931208e-06, "loss": 0.0115, "step": 3802 }, { "epoch": 5.1811989100817435, "grad_norm": 0.8989391370385581, "learning_rate": 9.900709828288164e-06, "loss": 0.0075, "step": 3803 }, { "epoch": 5.182561307901907, "grad_norm": 2.467542871171936, "learning_rate": 9.896297086600307e-06, "loss": 0.0159, "step": 3804 }, { "epoch": 5.183923705722071, "grad_norm": 2.2022695001975143, "learning_rate": 9.891884365107864e-06, "loss": 0.0252, "step": 3805 }, { "epoch": 5.185286103542234, "grad_norm": 0.5106403262976917, "learning_rate": 9.88747166467018e-06, "loss": 0.0117, "step": 3806 }, { "epoch": 5.186648501362398, "grad_norm": 1.0565900168722253, "learning_rate": 9.883058986146595e-06, "loss": 0.0024, "step": 3807 }, { "epoch": 5.1880108991825615, "grad_norm": 0.6559450884707487, "learning_rate": 9.87864633039645e-06, "loss": 0.0279, "step": 3808 }, { "epoch": 5.189373297002724, "grad_norm": 0.8908576153367423, "learning_rate": 9.874233698279076e-06, "loss": 0.0036, "step": 3809 }, { "epoch": 5.190735694822888, "grad_norm": 0.2672486780201981, "learning_rate": 9.8698210906538e-06, "loss": 0.0013, "step": 3810 }, { "epoch": 5.192098092643052, "grad_norm": 0.23517723974215618, "learning_rate": 9.865408508379953e-06, "loss": 0.0014, "step": 3811 }, { "epoch": 5.193460490463215, "grad_norm": 1.067451884481919, "learning_rate": 9.860995952316851e-06, "loss": 0.0148, "step": 3812 }, { "epoch": 5.194822888283379, "grad_norm": 0.8880606401575666, "learning_rate": 9.856583423323807e-06, "loss": 0.0113, "step": 3813 }, { "epoch": 5.1961852861035425, "grad_norm": 0.5859294065153318, "learning_rate": 9.85217092226013e-06, "loss": 0.0089, "step": 3814 }, { "epoch": 5.197547683923705, "grad_norm": 1.3019027330781612, "learning_rate": 9.847758449985124e-06, "loss": 0.004, "step": 3815 }, { "epoch": 5.198910081743869, "grad_norm": 0.9817058243563722, "learning_rate": 9.843346007358087e-06, "loss": 0.0099, "step": 3816 }, { "epoch": 5.200272479564033, "grad_norm": 0.47296507686998956, "learning_rate": 9.838933595238311e-06, "loss": 0.0086, "step": 3817 }, { "epoch": 5.201634877384196, "grad_norm": 3.6103694933496606, "learning_rate": 9.834521214485084e-06, "loss": 0.0179, "step": 3818 }, { "epoch": 5.20299727520436, "grad_norm": 1.671039268904082, "learning_rate": 9.830108865957684e-06, "loss": 0.0046, "step": 3819 }, { "epoch": 5.204359673024523, "grad_norm": 1.0696088240506325, "learning_rate": 9.825696550515385e-06, "loss": 0.0138, "step": 3820 }, { "epoch": 5.205722070844686, "grad_norm": 0.7547919888044744, "learning_rate": 9.821284269017456e-06, "loss": 0.0097, "step": 3821 }, { "epoch": 5.20708446866485, "grad_norm": 2.943830491465148, "learning_rate": 9.816872022323151e-06, "loss": 0.0257, "step": 3822 }, { "epoch": 5.208446866485014, "grad_norm": 1.7173207769633254, "learning_rate": 9.812459811291734e-06, "loss": 0.0071, "step": 3823 }, { "epoch": 5.209809264305177, "grad_norm": 1.222902574198059, "learning_rate": 9.808047636782448e-06, "loss": 0.028, "step": 3824 }, { "epoch": 5.2111716621253406, "grad_norm": 0.8613611570338126, "learning_rate": 9.80363549965453e-06, "loss": 0.0237, "step": 3825 }, { "epoch": 5.212534059945504, "grad_norm": 1.7742638457419642, "learning_rate": 9.799223400767216e-06, "loss": 0.0188, "step": 3826 }, { "epoch": 5.213896457765667, "grad_norm": 4.783072704083541, "learning_rate": 9.794811340979725e-06, "loss": 0.025, "step": 3827 }, { "epoch": 5.215258855585831, "grad_norm": 2.2212026758930086, "learning_rate": 9.790399321151286e-06, "loss": 0.0075, "step": 3828 }, { "epoch": 5.216621253405995, "grad_norm": 2.988918584401963, "learning_rate": 9.7859873421411e-06, "loss": 0.0123, "step": 3829 }, { "epoch": 5.217983651226158, "grad_norm": 6.521873073463549, "learning_rate": 9.781575404808372e-06, "loss": 0.0217, "step": 3830 }, { "epoch": 5.2193460490463215, "grad_norm": 2.8413209701580033, "learning_rate": 9.777163510012292e-06, "loss": 0.008, "step": 3831 }, { "epoch": 5.220708446866485, "grad_norm": 4.651873212355321, "learning_rate": 9.772751658612052e-06, "loss": 0.0218, "step": 3832 }, { "epoch": 5.222070844686648, "grad_norm": 3.9071061594118985, "learning_rate": 9.768339851466818e-06, "loss": 0.0126, "step": 3833 }, { "epoch": 5.223433242506812, "grad_norm": 0.9987189928841448, "learning_rate": 9.763928089435774e-06, "loss": 0.0091, "step": 3834 }, { "epoch": 5.224795640326976, "grad_norm": 3.9998308247279635, "learning_rate": 9.759516373378068e-06, "loss": 0.0082, "step": 3835 }, { "epoch": 5.226158038147139, "grad_norm": 2.1514915154580043, "learning_rate": 9.755104704152855e-06, "loss": 0.0177, "step": 3836 }, { "epoch": 5.227520435967302, "grad_norm": 3.082472709775724, "learning_rate": 9.750693082619274e-06, "loss": 0.0071, "step": 3837 }, { "epoch": 5.228882833787466, "grad_norm": 3.226068700508276, "learning_rate": 9.746281509636458e-06, "loss": 0.026, "step": 3838 }, { "epoch": 5.230245231607629, "grad_norm": 0.8443413265359831, "learning_rate": 9.741869986063527e-06, "loss": 0.0096, "step": 3839 }, { "epoch": 5.231607629427793, "grad_norm": 2.851387534979331, "learning_rate": 9.737458512759604e-06, "loss": 0.0235, "step": 3840 }, { "epoch": 5.232970027247957, "grad_norm": 3.052077601789162, "learning_rate": 9.733047090583782e-06, "loss": 0.0214, "step": 3841 }, { "epoch": 5.23433242506812, "grad_norm": 3.0952476301208036, "learning_rate": 9.728635720395158e-06, "loss": 0.0045, "step": 3842 }, { "epoch": 5.235694822888283, "grad_norm": 3.8734379784728996, "learning_rate": 9.724224403052818e-06, "loss": 0.026, "step": 3843 }, { "epoch": 5.237057220708447, "grad_norm": 1.031138844247765, "learning_rate": 9.71981313941583e-06, "loss": 0.0096, "step": 3844 }, { "epoch": 5.23841961852861, "grad_norm": 0.5377010562835965, "learning_rate": 9.715401930343255e-06, "loss": 0.0181, "step": 3845 }, { "epoch": 5.239782016348774, "grad_norm": 0.6207366747191216, "learning_rate": 9.710990776694154e-06, "loss": 0.0027, "step": 3846 }, { "epoch": 5.241144414168938, "grad_norm": 3.9404505960126714, "learning_rate": 9.706579679327563e-06, "loss": 0.0323, "step": 3847 }, { "epoch": 5.2425068119891005, "grad_norm": 2.0348377965338194, "learning_rate": 9.702168639102509e-06, "loss": 0.0086, "step": 3848 }, { "epoch": 5.243869209809264, "grad_norm": 3.1307772611686238, "learning_rate": 9.697757656878016e-06, "loss": 0.0326, "step": 3849 }, { "epoch": 5.245231607629428, "grad_norm": 1.3112792550660757, "learning_rate": 9.693346733513089e-06, "loss": 0.0134, "step": 3850 }, { "epoch": 5.246594005449591, "grad_norm": 1.5616172011702374, "learning_rate": 9.688935869866721e-06, "loss": 0.0153, "step": 3851 }, { "epoch": 5.247956403269755, "grad_norm": 3.1362649309966524, "learning_rate": 9.684525066797906e-06, "loss": 0.0163, "step": 3852 }, { "epoch": 5.2493188010899186, "grad_norm": 0.34001873979073266, "learning_rate": 9.68011432516561e-06, "loss": 0.0023, "step": 3853 }, { "epoch": 5.2506811989100814, "grad_norm": 2.1921397753618685, "learning_rate": 9.675703645828795e-06, "loss": 0.0055, "step": 3854 }, { "epoch": 5.252043596730245, "grad_norm": 0.8556213858732569, "learning_rate": 9.67129302964641e-06, "loss": 0.0182, "step": 3855 }, { "epoch": 5.253405994550409, "grad_norm": 1.4427854414434282, "learning_rate": 9.666882477477387e-06, "loss": 0.0116, "step": 3856 }, { "epoch": 5.254768392370572, "grad_norm": 1.8401029837009073, "learning_rate": 9.662471990180658e-06, "loss": 0.0021, "step": 3857 }, { "epoch": 5.256130790190736, "grad_norm": 1.461002025459634, "learning_rate": 9.658061568615132e-06, "loss": 0.0243, "step": 3858 }, { "epoch": 5.2574931880108995, "grad_norm": 1.3941309308832923, "learning_rate": 9.653651213639705e-06, "loss": 0.0012, "step": 3859 }, { "epoch": 5.258855585831062, "grad_norm": 0.8984931773772579, "learning_rate": 9.649240926113263e-06, "loss": 0.0209, "step": 3860 }, { "epoch": 5.260217983651226, "grad_norm": 3.9273843427646398, "learning_rate": 9.644830706894678e-06, "loss": 0.0164, "step": 3861 }, { "epoch": 5.26158038147139, "grad_norm": 1.1876421277795217, "learning_rate": 9.640420556842808e-06, "loss": 0.0115, "step": 3862 }, { "epoch": 5.262942779291553, "grad_norm": 3.5144990646158134, "learning_rate": 9.636010476816504e-06, "loss": 0.0104, "step": 3863 }, { "epoch": 5.264305177111717, "grad_norm": 2.1602738680168723, "learning_rate": 9.631600467674598e-06, "loss": 0.0102, "step": 3864 }, { "epoch": 5.26566757493188, "grad_norm": 1.681438529343349, "learning_rate": 9.627190530275902e-06, "loss": 0.0131, "step": 3865 }, { "epoch": 5.267029972752043, "grad_norm": 2.285804011962839, "learning_rate": 9.622780665479223e-06, "loss": 0.0203, "step": 3866 }, { "epoch": 5.268392370572207, "grad_norm": 1.8589330290258423, "learning_rate": 9.618370874143352e-06, "loss": 0.0208, "step": 3867 }, { "epoch": 5.269754768392371, "grad_norm": 1.396334375237414, "learning_rate": 9.61396115712706e-06, "loss": 0.0052, "step": 3868 }, { "epoch": 5.271117166212534, "grad_norm": 1.4389002325102276, "learning_rate": 9.609551515289117e-06, "loss": 0.0237, "step": 3869 }, { "epoch": 5.272479564032698, "grad_norm": 1.045992548273615, "learning_rate": 9.605141949488263e-06, "loss": 0.0158, "step": 3870 }, { "epoch": 5.273841961852861, "grad_norm": 2.449458495724311, "learning_rate": 9.600732460583237e-06, "loss": 0.0103, "step": 3871 }, { "epoch": 5.275204359673024, "grad_norm": 1.6223997232681615, "learning_rate": 9.596323049432746e-06, "loss": 0.0209, "step": 3872 }, { "epoch": 5.276566757493188, "grad_norm": 1.0958721159013434, "learning_rate": 9.591913716895499e-06, "loss": 0.027, "step": 3873 }, { "epoch": 5.277929155313352, "grad_norm": 2.4619484016672044, "learning_rate": 9.587504463830173e-06, "loss": 0.0063, "step": 3874 }, { "epoch": 5.279291553133515, "grad_norm": 0.8451768857236555, "learning_rate": 9.583095291095454e-06, "loss": 0.0084, "step": 3875 }, { "epoch": 5.2806539509536785, "grad_norm": 1.9804824818785272, "learning_rate": 9.578686199549987e-06, "loss": 0.0063, "step": 3876 }, { "epoch": 5.282016348773842, "grad_norm": 2.4711471521756865, "learning_rate": 9.574277190052417e-06, "loss": 0.0114, "step": 3877 }, { "epoch": 5.283378746594005, "grad_norm": 0.9218370205882757, "learning_rate": 9.569868263461362e-06, "loss": 0.0092, "step": 3878 }, { "epoch": 5.284741144414169, "grad_norm": 2.577396450761637, "learning_rate": 9.565459420635426e-06, "loss": 0.0115, "step": 3879 }, { "epoch": 5.286103542234333, "grad_norm": 2.284874090918053, "learning_rate": 9.561050662433211e-06, "loss": 0.0066, "step": 3880 }, { "epoch": 5.287465940054496, "grad_norm": 0.838533377134026, "learning_rate": 9.556641989713287e-06, "loss": 0.0213, "step": 3881 }, { "epoch": 5.2888283378746594, "grad_norm": 1.0205897387397938, "learning_rate": 9.552233403334213e-06, "loss": 0.0293, "step": 3882 }, { "epoch": 5.290190735694823, "grad_norm": 2.1911575332439224, "learning_rate": 9.54782490415453e-06, "loss": 0.0136, "step": 3883 }, { "epoch": 5.291553133514986, "grad_norm": 0.8130192743864296, "learning_rate": 9.543416493032757e-06, "loss": 0.0074, "step": 3884 }, { "epoch": 5.29291553133515, "grad_norm": 3.4882392441258783, "learning_rate": 9.5390081708274e-06, "loss": 0.0108, "step": 3885 }, { "epoch": 5.294277929155314, "grad_norm": 1.4486482661062356, "learning_rate": 9.53459993839696e-06, "loss": 0.0091, "step": 3886 }, { "epoch": 5.295640326975477, "grad_norm": 1.2772560422213748, "learning_rate": 9.5301917965999e-06, "loss": 0.0113, "step": 3887 }, { "epoch": 5.29700272479564, "grad_norm": 2.089801392361107, "learning_rate": 9.525783746294678e-06, "loss": 0.008, "step": 3888 }, { "epoch": 5.298365122615804, "grad_norm": 1.51083151000925, "learning_rate": 9.521375788339734e-06, "loss": 0.0279, "step": 3889 }, { "epoch": 5.299727520435967, "grad_norm": 2.225758329407442, "learning_rate": 9.516967923593479e-06, "loss": 0.02, "step": 3890 }, { "epoch": 5.301089918256131, "grad_norm": 1.0064360322274253, "learning_rate": 9.512560152914312e-06, "loss": 0.0028, "step": 3891 }, { "epoch": 5.302452316076295, "grad_norm": 1.4493018802238955, "learning_rate": 9.508152477160626e-06, "loss": 0.0107, "step": 3892 }, { "epoch": 5.3038147138964575, "grad_norm": 1.2663823473765043, "learning_rate": 9.503744897190778e-06, "loss": 0.0104, "step": 3893 }, { "epoch": 5.305177111716621, "grad_norm": 1.026842606161782, "learning_rate": 9.499337413863117e-06, "loss": 0.021, "step": 3894 }, { "epoch": 5.306539509536785, "grad_norm": 0.7063694939076706, "learning_rate": 9.494930028035967e-06, "loss": 0.0166, "step": 3895 }, { "epoch": 5.307901907356948, "grad_norm": 1.7694672892648524, "learning_rate": 9.490522740567633e-06, "loss": 0.0221, "step": 3896 }, { "epoch": 5.309264305177112, "grad_norm": 2.549574811754476, "learning_rate": 9.486115552316402e-06, "loss": 0.0563, "step": 3897 }, { "epoch": 5.310626702997276, "grad_norm": 0.8575531487118506, "learning_rate": 9.48170846414055e-06, "loss": 0.0199, "step": 3898 }, { "epoch": 5.3119891008174385, "grad_norm": 3.050592104739102, "learning_rate": 9.477301476898322e-06, "loss": 0.0437, "step": 3899 }, { "epoch": 5.313351498637602, "grad_norm": 2.5341899328636934, "learning_rate": 9.472894591447949e-06, "loss": 0.0272, "step": 3900 }, { "epoch": 5.314713896457766, "grad_norm": 2.4476831260794616, "learning_rate": 9.46848780864764e-06, "loss": 0.0138, "step": 3901 }, { "epoch": 5.316076294277929, "grad_norm": 1.67477813024224, "learning_rate": 9.464081129355586e-06, "loss": 0.0043, "step": 3902 }, { "epoch": 5.317438692098093, "grad_norm": 2.734726374254838, "learning_rate": 9.45967455442995e-06, "loss": 0.0222, "step": 3903 }, { "epoch": 5.3188010899182565, "grad_norm": 1.9279568553782709, "learning_rate": 9.455268084728892e-06, "loss": 0.0097, "step": 3904 }, { "epoch": 5.320163487738419, "grad_norm": 0.7697171552628441, "learning_rate": 9.450861721110535e-06, "loss": 0.0085, "step": 3905 }, { "epoch": 5.321525885558583, "grad_norm": 2.5084876768605606, "learning_rate": 9.446455464432988e-06, "loss": 0.0161, "step": 3906 }, { "epoch": 5.322888283378747, "grad_norm": 1.215950137408681, "learning_rate": 9.442049315554343e-06, "loss": 0.0265, "step": 3907 }, { "epoch": 5.32425068119891, "grad_norm": 0.929726804979651, "learning_rate": 9.437643275332654e-06, "loss": 0.0048, "step": 3908 }, { "epoch": 5.325613079019074, "grad_norm": 3.972139225401131, "learning_rate": 9.433237344625979e-06, "loss": 0.0165, "step": 3909 }, { "epoch": 5.3269754768392374, "grad_norm": 1.2688117838798472, "learning_rate": 9.428831524292339e-06, "loss": 0.0055, "step": 3910 }, { "epoch": 5.3283378746594, "grad_norm": 0.2553547766985364, "learning_rate": 9.424425815189734e-06, "loss": 0.0012, "step": 3911 }, { "epoch": 5.329700272479564, "grad_norm": 3.270598508477762, "learning_rate": 9.420020218176145e-06, "loss": 0.0257, "step": 3912 }, { "epoch": 5.331062670299728, "grad_norm": 1.0105712196227634, "learning_rate": 9.415614734109535e-06, "loss": 0.01, "step": 3913 }, { "epoch": 5.332425068119891, "grad_norm": 0.6818324389254193, "learning_rate": 9.41120936384783e-06, "loss": 0.0144, "step": 3914 }, { "epoch": 5.333787465940055, "grad_norm": 1.9861074013409508, "learning_rate": 9.40680410824896e-06, "loss": 0.0262, "step": 3915 }, { "epoch": 5.335149863760218, "grad_norm": 1.0493095327426225, "learning_rate": 9.402398968170806e-06, "loss": 0.0089, "step": 3916 }, { "epoch": 5.336512261580381, "grad_norm": 2.546789979880754, "learning_rate": 9.397993944471245e-06, "loss": 0.0256, "step": 3917 }, { "epoch": 5.337874659400545, "grad_norm": 1.4771258969087764, "learning_rate": 9.39358903800812e-06, "loss": 0.0183, "step": 3918 }, { "epoch": 5.339237057220709, "grad_norm": 0.6599153844620063, "learning_rate": 9.389184249639256e-06, "loss": 0.0079, "step": 3919 }, { "epoch": 5.340599455040872, "grad_norm": 1.15722362395839, "learning_rate": 9.384779580222454e-06, "loss": 0.0117, "step": 3920 }, { "epoch": 5.3419618528610355, "grad_norm": 1.0635026900110305, "learning_rate": 9.380375030615497e-06, "loss": 0.0117, "step": 3921 }, { "epoch": 5.343324250681199, "grad_norm": 1.290525766597621, "learning_rate": 9.375970601676135e-06, "loss": 0.0102, "step": 3922 }, { "epoch": 5.344686648501362, "grad_norm": 1.1196088186160758, "learning_rate": 9.371566294262101e-06, "loss": 0.0186, "step": 3923 }, { "epoch": 5.346049046321526, "grad_norm": 0.9933298167826629, "learning_rate": 9.367162109231102e-06, "loss": 0.0106, "step": 3924 }, { "epoch": 5.34741144414169, "grad_norm": 0.856216776494817, "learning_rate": 9.362758047440826e-06, "loss": 0.0058, "step": 3925 }, { "epoch": 5.348773841961853, "grad_norm": 0.5686418323398967, "learning_rate": 9.358354109748927e-06, "loss": 0.0098, "step": 3926 }, { "epoch": 5.3501362397820165, "grad_norm": 0.9274347245786886, "learning_rate": 9.353950297013045e-06, "loss": 0.0049, "step": 3927 }, { "epoch": 5.35149863760218, "grad_norm": 0.4467658207694875, "learning_rate": 9.349546610090791e-06, "loss": 0.0092, "step": 3928 }, { "epoch": 5.352861035422343, "grad_norm": 1.7551921227558382, "learning_rate": 9.34514304983975e-06, "loss": 0.035, "step": 3929 }, { "epoch": 5.354223433242507, "grad_norm": 2.6333572724010974, "learning_rate": 9.340739617117487e-06, "loss": 0.0176, "step": 3930 }, { "epoch": 5.355585831062671, "grad_norm": 1.1884976070103246, "learning_rate": 9.336336312781538e-06, "loss": 0.0404, "step": 3931 }, { "epoch": 5.356948228882834, "grad_norm": 2.067302399770906, "learning_rate": 9.331933137689413e-06, "loss": 0.0052, "step": 3932 }, { "epoch": 5.358310626702997, "grad_norm": 2.427421498412787, "learning_rate": 9.327530092698605e-06, "loss": 0.0171, "step": 3933 }, { "epoch": 5.359673024523161, "grad_norm": 2.410711067800201, "learning_rate": 9.323127178666572e-06, "loss": 0.0145, "step": 3934 }, { "epoch": 5.361035422343324, "grad_norm": 2.6050799976680237, "learning_rate": 9.31872439645075e-06, "loss": 0.0169, "step": 3935 }, { "epoch": 5.362397820163488, "grad_norm": 0.48721620679307664, "learning_rate": 9.314321746908554e-06, "loss": 0.0026, "step": 3936 }, { "epoch": 5.363760217983652, "grad_norm": 2.4665137622144515, "learning_rate": 9.309919230897364e-06, "loss": 0.0171, "step": 3937 }, { "epoch": 5.3651226158038146, "grad_norm": 2.463542105527571, "learning_rate": 9.305516849274542e-06, "loss": 0.0235, "step": 3938 }, { "epoch": 5.366485013623978, "grad_norm": 1.1422392695224064, "learning_rate": 9.30111460289742e-06, "loss": 0.0136, "step": 3939 }, { "epoch": 5.367847411444142, "grad_norm": 2.0422966460545555, "learning_rate": 9.296712492623305e-06, "loss": 0.0163, "step": 3940 }, { "epoch": 5.369209809264305, "grad_norm": 1.2352064609445066, "learning_rate": 9.292310519309476e-06, "loss": 0.0122, "step": 3941 }, { "epoch": 5.370572207084469, "grad_norm": 1.3939646520478113, "learning_rate": 9.287908683813188e-06, "loss": 0.0024, "step": 3942 }, { "epoch": 5.371934604904633, "grad_norm": 1.7883125700682887, "learning_rate": 9.283506986991662e-06, "loss": 0.0335, "step": 3943 }, { "epoch": 5.3732970027247955, "grad_norm": 1.7015706828919737, "learning_rate": 9.279105429702104e-06, "loss": 0.0129, "step": 3944 }, { "epoch": 5.374659400544959, "grad_norm": 0.8147019860801644, "learning_rate": 9.274704012801684e-06, "loss": 0.0043, "step": 3945 }, { "epoch": 5.376021798365123, "grad_norm": 2.2651008644615027, "learning_rate": 9.270302737147548e-06, "loss": 0.0272, "step": 3946 }, { "epoch": 5.377384196185286, "grad_norm": 1.3339609546936604, "learning_rate": 9.265901603596811e-06, "loss": 0.0088, "step": 3947 }, { "epoch": 5.37874659400545, "grad_norm": 0.8587971892173865, "learning_rate": 9.261500613006564e-06, "loss": 0.0047, "step": 3948 }, { "epoch": 5.3801089918256135, "grad_norm": 1.3796059441083885, "learning_rate": 9.257099766233867e-06, "loss": 0.0037, "step": 3949 }, { "epoch": 5.381471389645776, "grad_norm": 1.5766778964764738, "learning_rate": 9.252699064135759e-06, "loss": 0.0117, "step": 3950 }, { "epoch": 5.38283378746594, "grad_norm": 0.5279454497869496, "learning_rate": 9.24829850756924e-06, "loss": 0.0085, "step": 3951 }, { "epoch": 5.384196185286104, "grad_norm": 1.8358773778904394, "learning_rate": 9.243898097391292e-06, "loss": 0.0257, "step": 3952 }, { "epoch": 5.385558583106267, "grad_norm": 0.8426176845154254, "learning_rate": 9.23949783445886e-06, "loss": 0.0132, "step": 3953 }, { "epoch": 5.386920980926431, "grad_norm": 0.5617198554707583, "learning_rate": 9.235097719628868e-06, "loss": 0.0184, "step": 3954 }, { "epoch": 5.3882833787465945, "grad_norm": 1.6977565205399725, "learning_rate": 9.2306977537582e-06, "loss": 0.0162, "step": 3955 }, { "epoch": 5.389645776566757, "grad_norm": 0.9876539225834204, "learning_rate": 9.226297937703728e-06, "loss": 0.0125, "step": 3956 }, { "epoch": 5.391008174386921, "grad_norm": 2.7915511591359268, "learning_rate": 9.221898272322282e-06, "loss": 0.0242, "step": 3957 }, { "epoch": 5.392370572207085, "grad_norm": 0.5275006076903138, "learning_rate": 9.217498758470663e-06, "loss": 0.0027, "step": 3958 }, { "epoch": 5.393732970027248, "grad_norm": 2.0543982674528536, "learning_rate": 9.213099397005647e-06, "loss": 0.0166, "step": 3959 }, { "epoch": 5.395095367847412, "grad_norm": 1.344262651728421, "learning_rate": 9.20870018878398e-06, "loss": 0.0029, "step": 3960 }, { "epoch": 5.396457765667575, "grad_norm": 2.5298892720568165, "learning_rate": 9.20430113466237e-06, "loss": 0.0343, "step": 3961 }, { "epoch": 5.397820163487738, "grad_norm": 0.8413791903169094, "learning_rate": 9.199902235497513e-06, "loss": 0.003, "step": 3962 }, { "epoch": 5.399182561307902, "grad_norm": 1.399141288804288, "learning_rate": 9.195503492146055e-06, "loss": 0.0087, "step": 3963 }, { "epoch": 5.400544959128065, "grad_norm": 3.0317545596939066, "learning_rate": 9.191104905464624e-06, "loss": 0.0452, "step": 3964 }, { "epoch": 5.401907356948229, "grad_norm": 2.2874654581197955, "learning_rate": 9.186706476309812e-06, "loss": 0.0045, "step": 3965 }, { "epoch": 5.4032697547683926, "grad_norm": 1.987587495255009, "learning_rate": 9.182308205538183e-06, "loss": 0.0026, "step": 3966 }, { "epoch": 5.4046321525885554, "grad_norm": 0.7216683093390343, "learning_rate": 9.17791009400627e-06, "loss": 0.0072, "step": 3967 }, { "epoch": 5.405994550408719, "grad_norm": 1.1513450258824824, "learning_rate": 9.173512142570574e-06, "loss": 0.0252, "step": 3968 }, { "epoch": 5.407356948228883, "grad_norm": 2.7741352373972856, "learning_rate": 9.169114352087564e-06, "loss": 0.0026, "step": 3969 }, { "epoch": 5.408719346049046, "grad_norm": 0.36372493683623086, "learning_rate": 9.164716723413678e-06, "loss": 0.0083, "step": 3970 }, { "epoch": 5.41008174386921, "grad_norm": 1.7224431460854088, "learning_rate": 9.160319257405327e-06, "loss": 0.0105, "step": 3971 }, { "epoch": 5.4114441416893735, "grad_norm": 2.9555772747489506, "learning_rate": 9.155921954918879e-06, "loss": 0.0256, "step": 3972 }, { "epoch": 5.412806539509536, "grad_norm": 0.6039477108522768, "learning_rate": 9.151524816810686e-06, "loss": 0.0152, "step": 3973 }, { "epoch": 5.4141689373297, "grad_norm": 2.9135391065066703, "learning_rate": 9.147127843937055e-06, "loss": 0.0025, "step": 3974 }, { "epoch": 5.415531335149864, "grad_norm": 1.8008568936785194, "learning_rate": 9.142731037154268e-06, "loss": 0.0158, "step": 3975 }, { "epoch": 5.416893732970027, "grad_norm": 1.3189725598685826, "learning_rate": 9.138334397318571e-06, "loss": 0.0091, "step": 3976 }, { "epoch": 5.418256130790191, "grad_norm": 2.710329625836445, "learning_rate": 9.13393792528618e-06, "loss": 0.0111, "step": 3977 }, { "epoch": 5.419618528610354, "grad_norm": 2.168545518355167, "learning_rate": 9.12954162191327e-06, "loss": 0.0244, "step": 3978 }, { "epoch": 5.420980926430517, "grad_norm": 1.3456514495373895, "learning_rate": 9.125145488056e-06, "loss": 0.0093, "step": 3979 }, { "epoch": 5.422343324250681, "grad_norm": 2.218724448988178, "learning_rate": 9.12074952457048e-06, "loss": 0.0155, "step": 3980 }, { "epoch": 5.423705722070845, "grad_norm": 2.25257192000547, "learning_rate": 9.116353732312795e-06, "loss": 0.023, "step": 3981 }, { "epoch": 5.425068119891008, "grad_norm": 1.7396504457842745, "learning_rate": 9.111958112138994e-06, "loss": 0.0198, "step": 3982 }, { "epoch": 5.426430517711172, "grad_norm": 2.0772903074723597, "learning_rate": 9.107562664905094e-06, "loss": 0.0077, "step": 3983 }, { "epoch": 5.427792915531335, "grad_norm": 1.3024748360822371, "learning_rate": 9.103167391467074e-06, "loss": 0.0057, "step": 3984 }, { "epoch": 5.429155313351498, "grad_norm": 3.209495697318018, "learning_rate": 9.098772292680886e-06, "loss": 0.0421, "step": 3985 }, { "epoch": 5.430517711171662, "grad_norm": 3.337296894723217, "learning_rate": 9.094377369402445e-06, "loss": 0.0167, "step": 3986 }, { "epoch": 5.431880108991826, "grad_norm": 1.5896802406447526, "learning_rate": 9.08998262248763e-06, "loss": 0.0082, "step": 3987 }, { "epoch": 5.433242506811989, "grad_norm": 3.181188091827291, "learning_rate": 9.085588052792286e-06, "loss": 0.0114, "step": 3988 }, { "epoch": 5.4346049046321525, "grad_norm": 0.932592383748813, "learning_rate": 9.081193661172226e-06, "loss": 0.0021, "step": 3989 }, { "epoch": 5.435967302452316, "grad_norm": 2.7302057602908616, "learning_rate": 9.076799448483224e-06, "loss": 0.0141, "step": 3990 }, { "epoch": 5.437329700272479, "grad_norm": 2.726103471940194, "learning_rate": 9.072405415581025e-06, "loss": 0.014, "step": 3991 }, { "epoch": 5.438692098092643, "grad_norm": 2.1704513481983674, "learning_rate": 9.068011563321335e-06, "loss": 0.0074, "step": 3992 }, { "epoch": 5.440054495912807, "grad_norm": 1.3486336461732111, "learning_rate": 9.063617892559829e-06, "loss": 0.0078, "step": 3993 }, { "epoch": 5.44141689373297, "grad_norm": 3.9945917748809885, "learning_rate": 9.059224404152139e-06, "loss": 0.036, "step": 3994 }, { "epoch": 5.4427792915531334, "grad_norm": 2.8329344054322507, "learning_rate": 9.054831098953863e-06, "loss": 0.0463, "step": 3995 }, { "epoch": 5.444141689373297, "grad_norm": 1.1537083236113816, "learning_rate": 9.050437977820574e-06, "loss": 0.0179, "step": 3996 }, { "epoch": 5.44550408719346, "grad_norm": 3.037448884352663, "learning_rate": 9.046045041607797e-06, "loss": 0.0023, "step": 3997 }, { "epoch": 5.446866485013624, "grad_norm": 1.926336514637395, "learning_rate": 9.041652291171029e-06, "loss": 0.026, "step": 3998 }, { "epoch": 5.448228882833788, "grad_norm": 1.7661485377654385, "learning_rate": 9.037259727365722e-06, "loss": 0.0073, "step": 3999 }, { "epoch": 5.449591280653951, "grad_norm": 6.291135232999263, "learning_rate": 9.032867351047299e-06, "loss": 0.0077, "step": 4000 }, { "epoch": 5.450953678474114, "grad_norm": 2.323894408687898, "learning_rate": 9.028475163071142e-06, "loss": 0.016, "step": 4001 }, { "epoch": 5.452316076294278, "grad_norm": 1.1056428682925683, "learning_rate": 9.024083164292603e-06, "loss": 0.0127, "step": 4002 }, { "epoch": 5.453678474114441, "grad_norm": 3.055919752825631, "learning_rate": 9.019691355566994e-06, "loss": 0.0156, "step": 4003 }, { "epoch": 5.455040871934605, "grad_norm": 1.7759027651491508, "learning_rate": 9.015299737749584e-06, "loss": 0.003, "step": 4004 }, { "epoch": 5.456403269754769, "grad_norm": 1.276944835669355, "learning_rate": 9.010908311695611e-06, "loss": 0.0033, "step": 4005 }, { "epoch": 5.4577656675749315, "grad_norm": 2.807569575116651, "learning_rate": 9.006517078260277e-06, "loss": 0.0164, "step": 4006 }, { "epoch": 5.459128065395095, "grad_norm": 3.0748204148200258, "learning_rate": 9.002126038298736e-06, "loss": 0.0176, "step": 4007 }, { "epoch": 5.460490463215259, "grad_norm": 1.297790994984173, "learning_rate": 8.997735192666122e-06, "loss": 0.0152, "step": 4008 }, { "epoch": 5.461852861035422, "grad_norm": 4.933676037500646, "learning_rate": 8.993344542217515e-06, "loss": 0.0149, "step": 4009 }, { "epoch": 5.463215258855586, "grad_norm": 2.537534053459679, "learning_rate": 8.988954087807968e-06, "loss": 0.008, "step": 4010 }, { "epoch": 5.46457765667575, "grad_norm": 3.082096935094654, "learning_rate": 8.984563830292487e-06, "loss": 0.0156, "step": 4011 }, { "epoch": 5.4659400544959125, "grad_norm": 3.1424739901066254, "learning_rate": 8.980173770526043e-06, "loss": 0.0199, "step": 4012 }, { "epoch": 5.467302452316076, "grad_norm": 1.560295591769245, "learning_rate": 8.97578390936357e-06, "loss": 0.0091, "step": 4013 }, { "epoch": 5.46866485013624, "grad_norm": 2.101236966962965, "learning_rate": 8.971394247659968e-06, "loss": 0.0028, "step": 4014 }, { "epoch": 5.470027247956403, "grad_norm": 2.5637819502742842, "learning_rate": 8.967004786270085e-06, "loss": 0.0095, "step": 4015 }, { "epoch": 5.471389645776567, "grad_norm": 0.9815295380679557, "learning_rate": 8.962615526048742e-06, "loss": 0.0161, "step": 4016 }, { "epoch": 5.4727520435967305, "grad_norm": 1.9718038124933808, "learning_rate": 8.958226467850716e-06, "loss": 0.0198, "step": 4017 }, { "epoch": 5.474114441416893, "grad_norm": 0.7658205381627083, "learning_rate": 8.953837612530741e-06, "loss": 0.009, "step": 4018 }, { "epoch": 5.475476839237057, "grad_norm": 2.319414336731191, "learning_rate": 8.949448960943524e-06, "loss": 0.0129, "step": 4019 }, { "epoch": 5.476839237057221, "grad_norm": 1.0388966913950863, "learning_rate": 8.945060513943718e-06, "loss": 0.0098, "step": 4020 }, { "epoch": 5.478201634877384, "grad_norm": 2.1378608958581022, "learning_rate": 8.940672272385942e-06, "loss": 0.0034, "step": 4021 }, { "epoch": 5.479564032697548, "grad_norm": 0.6918503856692688, "learning_rate": 8.936284237124779e-06, "loss": 0.0023, "step": 4022 }, { "epoch": 5.4809264305177114, "grad_norm": 0.6912513454759324, "learning_rate": 8.931896409014764e-06, "loss": 0.01, "step": 4023 }, { "epoch": 5.482288828337874, "grad_norm": 2.908072003942685, "learning_rate": 8.927508788910392e-06, "loss": 0.0181, "step": 4024 }, { "epoch": 5.483651226158038, "grad_norm": 2.8552698484239714, "learning_rate": 8.923121377666135e-06, "loss": 0.0195, "step": 4025 }, { "epoch": 5.485013623978202, "grad_norm": 1.6157102353755304, "learning_rate": 8.918734176136396e-06, "loss": 0.0082, "step": 4026 }, { "epoch": 5.486376021798365, "grad_norm": 2.4283203837098983, "learning_rate": 8.914347185175558e-06, "loss": 0.0151, "step": 4027 }, { "epoch": 5.487738419618529, "grad_norm": 1.9569836027788534, "learning_rate": 8.909960405637958e-06, "loss": 0.0198, "step": 4028 }, { "epoch": 5.489100817438692, "grad_norm": 1.8169320097659472, "learning_rate": 8.905573838377886e-06, "loss": 0.0124, "step": 4029 }, { "epoch": 5.490463215258855, "grad_norm": 1.9168019253592505, "learning_rate": 8.901187484249595e-06, "loss": 0.014, "step": 4030 }, { "epoch": 5.491825613079019, "grad_norm": 1.1524730352425967, "learning_rate": 8.896801344107303e-06, "loss": 0.0147, "step": 4031 }, { "epoch": 5.493188010899183, "grad_norm": 1.9011219452815245, "learning_rate": 8.892415418805176e-06, "loss": 0.0158, "step": 4032 }, { "epoch": 5.494550408719346, "grad_norm": 2.851150698776294, "learning_rate": 8.888029709197339e-06, "loss": 0.0271, "step": 4033 }, { "epoch": 5.4959128065395095, "grad_norm": 0.42181194657328536, "learning_rate": 8.883644216137881e-06, "loss": 0.0113, "step": 4034 }, { "epoch": 5.497275204359673, "grad_norm": 1.8811392272094631, "learning_rate": 8.879258940480847e-06, "loss": 0.0061, "step": 4035 }, { "epoch": 5.498637602179836, "grad_norm": 3.823853651274888, "learning_rate": 8.874873883080232e-06, "loss": 0.0185, "step": 4036 }, { "epoch": 5.5, "grad_norm": 0.741949156142006, "learning_rate": 8.870489044790007e-06, "loss": 0.0085, "step": 4037 }, { "epoch": 5.501362397820164, "grad_norm": 2.5180752027998765, "learning_rate": 8.86610442646408e-06, "loss": 0.0093, "step": 4038 }, { "epoch": 5.502724795640327, "grad_norm": 1.3795764206599057, "learning_rate": 8.861720028956324e-06, "loss": 0.0189, "step": 4039 }, { "epoch": 5.5040871934604905, "grad_norm": 1.0766862070105077, "learning_rate": 8.857335853120573e-06, "loss": 0.0109, "step": 4040 }, { "epoch": 5.505449591280654, "grad_norm": 1.667882064964339, "learning_rate": 8.852951899810612e-06, "loss": 0.0119, "step": 4041 }, { "epoch": 5.506811989100817, "grad_norm": 1.0960805203555535, "learning_rate": 8.848568169880184e-06, "loss": 0.0214, "step": 4042 }, { "epoch": 5.508174386920981, "grad_norm": 1.74947409494823, "learning_rate": 8.844184664182993e-06, "loss": 0.0212, "step": 4043 }, { "epoch": 5.509536784741145, "grad_norm": 1.6719684628656408, "learning_rate": 8.839801383572698e-06, "loss": 0.01, "step": 4044 }, { "epoch": 5.510899182561308, "grad_norm": 1.0921113704470533, "learning_rate": 8.835418328902905e-06, "loss": 0.0166, "step": 4045 }, { "epoch": 5.512261580381471, "grad_norm": 2.0200954471000077, "learning_rate": 8.831035501027186e-06, "loss": 0.0156, "step": 4046 }, { "epoch": 5.513623978201635, "grad_norm": 1.1790141969538812, "learning_rate": 8.826652900799063e-06, "loss": 0.0187, "step": 4047 }, { "epoch": 5.514986376021798, "grad_norm": 2.0505496094555187, "learning_rate": 8.822270529072025e-06, "loss": 0.0058, "step": 4048 }, { "epoch": 5.516348773841962, "grad_norm": 1.7813473203417176, "learning_rate": 8.8178883866995e-06, "loss": 0.0207, "step": 4049 }, { "epoch": 5.517711171662126, "grad_norm": 2.5946233755949506, "learning_rate": 8.813506474534887e-06, "loss": 0.0251, "step": 4050 }, { "epoch": 5.5190735694822886, "grad_norm": 2.1533851564960482, "learning_rate": 8.809124793431522e-06, "loss": 0.0267, "step": 4051 }, { "epoch": 5.520435967302452, "grad_norm": 1.8381603957696138, "learning_rate": 8.804743344242716e-06, "loss": 0.0113, "step": 4052 }, { "epoch": 5.521798365122616, "grad_norm": 1.6325614560689066, "learning_rate": 8.800362127821714e-06, "loss": 0.015, "step": 4053 }, { "epoch": 5.523160762942779, "grad_norm": 1.8025713981243263, "learning_rate": 8.79598114502174e-06, "loss": 0.0135, "step": 4054 }, { "epoch": 5.524523160762943, "grad_norm": 2.263007763420889, "learning_rate": 8.791600396695954e-06, "loss": 0.0095, "step": 4055 }, { "epoch": 5.525885558583107, "grad_norm": 1.785608131404248, "learning_rate": 8.787219883697478e-06, "loss": 0.0184, "step": 4056 }, { "epoch": 5.5272479564032695, "grad_norm": 2.0196484998470483, "learning_rate": 8.782839606879383e-06, "loss": 0.0127, "step": 4057 }, { "epoch": 5.528610354223433, "grad_norm": 2.2724820207001772, "learning_rate": 8.778459567094696e-06, "loss": 0.0095, "step": 4058 }, { "epoch": 5.529972752043597, "grad_norm": 0.5250651437831408, "learning_rate": 8.774079765196395e-06, "loss": 0.0038, "step": 4059 }, { "epoch": 5.53133514986376, "grad_norm": 2.3692710037767175, "learning_rate": 8.76970020203743e-06, "loss": 0.0087, "step": 4060 }, { "epoch": 5.532697547683924, "grad_norm": 1.7354752947905736, "learning_rate": 8.765320878470679e-06, "loss": 0.0144, "step": 4061 }, { "epoch": 5.5340599455040875, "grad_norm": 1.9137093785457888, "learning_rate": 8.760941795348989e-06, "loss": 0.0177, "step": 4062 }, { "epoch": 5.53542234332425, "grad_norm": 2.3382999392838295, "learning_rate": 8.756562953525151e-06, "loss": 0.0046, "step": 4063 }, { "epoch": 5.536784741144414, "grad_norm": 1.202271830066984, "learning_rate": 8.752184353851917e-06, "loss": 0.0287, "step": 4064 }, { "epoch": 5.538147138964578, "grad_norm": 2.443455519490262, "learning_rate": 8.747805997181983e-06, "loss": 0.0249, "step": 4065 }, { "epoch": 5.539509536784741, "grad_norm": 1.5622263303193038, "learning_rate": 8.74342788436801e-06, "loss": 0.0311, "step": 4066 }, { "epoch": 5.540871934604905, "grad_norm": 1.7376592006809612, "learning_rate": 8.739050016262605e-06, "loss": 0.0094, "step": 4067 }, { "epoch": 5.5422343324250685, "grad_norm": 0.6775570150219957, "learning_rate": 8.734672393718323e-06, "loss": 0.028, "step": 4068 }, { "epoch": 5.543596730245231, "grad_norm": 0.5760145363722907, "learning_rate": 8.730295017587678e-06, "loss": 0.0168, "step": 4069 }, { "epoch": 5.544959128065395, "grad_norm": 1.3693612601740865, "learning_rate": 8.72591788872313e-06, "loss": 0.0111, "step": 4070 }, { "epoch": 5.546321525885559, "grad_norm": 0.6328548162459661, "learning_rate": 8.72154100797709e-06, "loss": 0.0123, "step": 4071 }, { "epoch": 5.547683923705722, "grad_norm": 0.9808801230108897, "learning_rate": 8.717164376201935e-06, "loss": 0.0214, "step": 4072 }, { "epoch": 5.549046321525886, "grad_norm": 1.12638218516347, "learning_rate": 8.712787994249979e-06, "loss": 0.0086, "step": 4073 }, { "epoch": 5.550408719346049, "grad_norm": 0.4189295437973448, "learning_rate": 8.708411862973492e-06, "loss": 0.0021, "step": 4074 }, { "epoch": 5.551771117166212, "grad_norm": 0.5330981389257067, "learning_rate": 8.704035983224692e-06, "loss": 0.0325, "step": 4075 }, { "epoch": 5.553133514986376, "grad_norm": 1.3005346618628608, "learning_rate": 8.699660355855747e-06, "loss": 0.0125, "step": 4076 }, { "epoch": 5.55449591280654, "grad_norm": 1.8831477432912684, "learning_rate": 8.695284981718793e-06, "loss": 0.0149, "step": 4077 }, { "epoch": 5.555858310626703, "grad_norm": 1.3536161304326306, "learning_rate": 8.690909861665892e-06, "loss": 0.0103, "step": 4078 }, { "epoch": 5.5572207084468666, "grad_norm": 2.4727816987371636, "learning_rate": 8.68653499654907e-06, "loss": 0.0136, "step": 4079 }, { "epoch": 5.55858310626703, "grad_norm": 0.2967482996884188, "learning_rate": 8.682160387220306e-06, "loss": 0.0086, "step": 4080 }, { "epoch": 5.559945504087193, "grad_norm": 1.5894624355907592, "learning_rate": 8.67778603453152e-06, "loss": 0.0027, "step": 4081 }, { "epoch": 5.561307901907357, "grad_norm": 1.5228571101321988, "learning_rate": 8.67341193933458e-06, "loss": 0.0015, "step": 4082 }, { "epoch": 5.562670299727521, "grad_norm": 1.0841542671472844, "learning_rate": 8.669038102481322e-06, "loss": 0.0124, "step": 4083 }, { "epoch": 5.564032697547684, "grad_norm": 2.9246977245877837, "learning_rate": 8.664664524823517e-06, "loss": 0.0129, "step": 4084 }, { "epoch": 5.5653950953678475, "grad_norm": 1.9560636668231226, "learning_rate": 8.660291207212883e-06, "loss": 0.0112, "step": 4085 }, { "epoch": 5.566757493188011, "grad_norm": 1.8653086282290583, "learning_rate": 8.655918150501101e-06, "loss": 0.0101, "step": 4086 }, { "epoch": 5.568119891008174, "grad_norm": 2.3708910599797015, "learning_rate": 8.651545355539786e-06, "loss": 0.0268, "step": 4087 }, { "epoch": 5.569482288828338, "grad_norm": 1.8222573598799638, "learning_rate": 8.647172823180505e-06, "loss": 0.011, "step": 4088 }, { "epoch": 5.570844686648502, "grad_norm": 1.2081839534738181, "learning_rate": 8.64280055427479e-06, "loss": 0.0043, "step": 4089 }, { "epoch": 5.572207084468665, "grad_norm": 1.4895039435689743, "learning_rate": 8.638428549674103e-06, "loss": 0.0021, "step": 4090 }, { "epoch": 5.573569482288828, "grad_norm": 2.6163270865481607, "learning_rate": 8.634056810229863e-06, "loss": 0.0235, "step": 4091 }, { "epoch": 5.574931880108992, "grad_norm": 0.9471282777788009, "learning_rate": 8.629685336793433e-06, "loss": 0.0022, "step": 4092 }, { "epoch": 5.576294277929155, "grad_norm": 1.762531069872099, "learning_rate": 8.625314130216129e-06, "loss": 0.01, "step": 4093 }, { "epoch": 5.577656675749319, "grad_norm": 2.94840486971713, "learning_rate": 8.620943191349207e-06, "loss": 0.0108, "step": 4094 }, { "epoch": 5.579019073569482, "grad_norm": 2.113740914518984, "learning_rate": 8.616572521043884e-06, "loss": 0.0325, "step": 4095 }, { "epoch": 5.580381471389646, "grad_norm": 0.9561187365809265, "learning_rate": 8.612202120151314e-06, "loss": 0.0079, "step": 4096 }, { "epoch": 5.581743869209809, "grad_norm": 1.584828420055624, "learning_rate": 8.607831989522604e-06, "loss": 0.0035, "step": 4097 }, { "epoch": 5.583106267029972, "grad_norm": 2.251359050232399, "learning_rate": 8.603462130008804e-06, "loss": 0.0235, "step": 4098 }, { "epoch": 5.584468664850136, "grad_norm": 2.1197219624809, "learning_rate": 8.599092542460914e-06, "loss": 0.0074, "step": 4099 }, { "epoch": 5.5858310626703, "grad_norm": 1.7733204913966214, "learning_rate": 8.594723227729875e-06, "loss": 0.006, "step": 4100 }, { "epoch": 5.587193460490463, "grad_norm": 0.973586807803693, "learning_rate": 8.590354186666589e-06, "loss": 0.0018, "step": 4101 }, { "epoch": 5.5885558583106265, "grad_norm": 1.7427921798334756, "learning_rate": 8.585985420121894e-06, "loss": 0.009, "step": 4102 }, { "epoch": 5.58991825613079, "grad_norm": 1.307656138993479, "learning_rate": 8.581616928946571e-06, "loss": 0.0069, "step": 4103 }, { "epoch": 5.591280653950953, "grad_norm": 1.0859489038916452, "learning_rate": 8.577248713991359e-06, "loss": 0.0097, "step": 4104 }, { "epoch": 5.592643051771117, "grad_norm": 0.8281173101578804, "learning_rate": 8.572880776106932e-06, "loss": 0.0141, "step": 4105 }, { "epoch": 5.594005449591281, "grad_norm": 2.0141998328521367, "learning_rate": 8.56851311614392e-06, "loss": 0.004, "step": 4106 }, { "epoch": 5.595367847411444, "grad_norm": 1.9814223133375202, "learning_rate": 8.564145734952892e-06, "loss": 0.0308, "step": 4107 }, { "epoch": 5.5967302452316074, "grad_norm": 2.6274189694078585, "learning_rate": 8.559778633384364e-06, "loss": 0.0333, "step": 4108 }, { "epoch": 5.598092643051771, "grad_norm": 1.3184425327317035, "learning_rate": 8.555411812288799e-06, "loss": 0.004, "step": 4109 }, { "epoch": 5.599455040871934, "grad_norm": 0.7195967517860785, "learning_rate": 8.551045272516604e-06, "loss": 0.0014, "step": 4110 }, { "epoch": 5.600817438692098, "grad_norm": 0.9026008339398806, "learning_rate": 8.54667901491813e-06, "loss": 0.0096, "step": 4111 }, { "epoch": 5.602179836512262, "grad_norm": 1.0699323846899538, "learning_rate": 8.54231304034368e-06, "loss": 0.0195, "step": 4112 }, { "epoch": 5.603542234332425, "grad_norm": 1.3566577559812325, "learning_rate": 8.537947349643493e-06, "loss": 0.0041, "step": 4113 }, { "epoch": 5.604904632152588, "grad_norm": 1.9125745195902235, "learning_rate": 8.533581943667759e-06, "loss": 0.0163, "step": 4114 }, { "epoch": 5.606267029972752, "grad_norm": 1.2539612725947717, "learning_rate": 8.529216823266605e-06, "loss": 0.0125, "step": 4115 }, { "epoch": 5.607629427792915, "grad_norm": 0.4398314238593836, "learning_rate": 8.524851989290114e-06, "loss": 0.0013, "step": 4116 }, { "epoch": 5.608991825613079, "grad_norm": 1.2692510978379294, "learning_rate": 8.520487442588302e-06, "loss": 0.0025, "step": 4117 }, { "epoch": 5.610354223433243, "grad_norm": 0.668776256392417, "learning_rate": 8.516123184011136e-06, "loss": 0.0087, "step": 4118 }, { "epoch": 5.6117166212534055, "grad_norm": 0.9859627838588628, "learning_rate": 8.511759214408523e-06, "loss": 0.0165, "step": 4119 }, { "epoch": 5.613079019073569, "grad_norm": 0.3026781475207102, "learning_rate": 8.507395534630319e-06, "loss": 0.0009, "step": 4120 }, { "epoch": 5.614441416893733, "grad_norm": 0.6566740897997384, "learning_rate": 8.503032145526314e-06, "loss": 0.0065, "step": 4121 }, { "epoch": 5.615803814713896, "grad_norm": 0.9995453634606469, "learning_rate": 8.498669047946252e-06, "loss": 0.0025, "step": 4122 }, { "epoch": 5.61716621253406, "grad_norm": 0.6482856960217399, "learning_rate": 8.494306242739811e-06, "loss": 0.0013, "step": 4123 }, { "epoch": 5.618528610354224, "grad_norm": 1.5533147988532294, "learning_rate": 8.489943730756622e-06, "loss": 0.0204, "step": 4124 }, { "epoch": 5.6198910081743865, "grad_norm": 1.0013878543777743, "learning_rate": 8.485581512846252e-06, "loss": 0.014, "step": 4125 }, { "epoch": 5.62125340599455, "grad_norm": 0.9879426844225746, "learning_rate": 8.481219589858211e-06, "loss": 0.0094, "step": 4126 }, { "epoch": 5.622615803814714, "grad_norm": 0.7448893395742925, "learning_rate": 8.476857962641951e-06, "loss": 0.0232, "step": 4127 }, { "epoch": 5.623978201634877, "grad_norm": 2.2585363139845867, "learning_rate": 8.47249663204687e-06, "loss": 0.0069, "step": 4128 }, { "epoch": 5.625340599455041, "grad_norm": 1.8739533015620295, "learning_rate": 8.468135598922306e-06, "loss": 0.0218, "step": 4129 }, { "epoch": 5.6267029972752045, "grad_norm": 0.9138384844715135, "learning_rate": 8.463774864117542e-06, "loss": 0.0017, "step": 4130 }, { "epoch": 5.628065395095367, "grad_norm": 3.159543892998921, "learning_rate": 8.459414428481798e-06, "loss": 0.0141, "step": 4131 }, { "epoch": 5.629427792915531, "grad_norm": 2.682014846104653, "learning_rate": 8.455054292864239e-06, "loss": 0.0523, "step": 4132 }, { "epoch": 5.630790190735695, "grad_norm": 0.28755840442291175, "learning_rate": 8.450694458113969e-06, "loss": 0.0151, "step": 4133 }, { "epoch": 5.632152588555858, "grad_norm": 0.9721318773670438, "learning_rate": 8.446334925080036e-06, "loss": 0.0154, "step": 4134 }, { "epoch": 5.633514986376022, "grad_norm": 0.9495039623891364, "learning_rate": 8.441975694611428e-06, "loss": 0.0164, "step": 4135 }, { "epoch": 5.6348773841961854, "grad_norm": 2.4510108652806406, "learning_rate": 8.437616767557078e-06, "loss": 0.0234, "step": 4136 }, { "epoch": 5.636239782016348, "grad_norm": 2.2451774438992387, "learning_rate": 8.43325814476585e-06, "loss": 0.0133, "step": 4137 }, { "epoch": 5.637602179836512, "grad_norm": 0.49600487689471073, "learning_rate": 8.42889982708656e-06, "loss": 0.0078, "step": 4138 }, { "epoch": 5.638964577656676, "grad_norm": 1.3597830352929534, "learning_rate": 8.424541815367958e-06, "loss": 0.0065, "step": 4139 }, { "epoch": 5.640326975476839, "grad_norm": 1.2047798603239719, "learning_rate": 8.420184110458735e-06, "loss": 0.0166, "step": 4140 }, { "epoch": 5.641689373297003, "grad_norm": 1.0031065030229518, "learning_rate": 8.415826713207525e-06, "loss": 0.0184, "step": 4141 }, { "epoch": 5.643051771117166, "grad_norm": 1.5641468698820171, "learning_rate": 8.4114696244629e-06, "loss": 0.0235, "step": 4142 }, { "epoch": 5.644414168937329, "grad_norm": 2.0261164071503917, "learning_rate": 8.407112845073373e-06, "loss": 0.0117, "step": 4143 }, { "epoch": 5.645776566757493, "grad_norm": 0.9717745601787396, "learning_rate": 8.402756375887396e-06, "loss": 0.0108, "step": 4144 }, { "epoch": 5.647138964577657, "grad_norm": 1.9850620439460023, "learning_rate": 8.398400217753357e-06, "loss": 0.0094, "step": 4145 }, { "epoch": 5.64850136239782, "grad_norm": 1.2995288328383106, "learning_rate": 8.39404437151959e-06, "loss": 0.0012, "step": 4146 }, { "epoch": 5.6498637602179835, "grad_norm": 1.646140561927221, "learning_rate": 8.389688838034368e-06, "loss": 0.028, "step": 4147 }, { "epoch": 5.651226158038147, "grad_norm": 2.3109247316426282, "learning_rate": 8.385333618145896e-06, "loss": 0.017, "step": 4148 }, { "epoch": 5.65258855585831, "grad_norm": 0.785443283119271, "learning_rate": 8.380978712702326e-06, "loss": 0.0192, "step": 4149 }, { "epoch": 5.653950953678474, "grad_norm": 2.5907917553862747, "learning_rate": 8.376624122551743e-06, "loss": 0.0156, "step": 4150 }, { "epoch": 5.655313351498638, "grad_norm": 2.194365672930948, "learning_rate": 8.372269848542173e-06, "loss": 0.0129, "step": 4151 }, { "epoch": 5.656675749318801, "grad_norm": 1.4997949334076528, "learning_rate": 8.367915891521577e-06, "loss": 0.0114, "step": 4152 }, { "epoch": 5.6580381471389645, "grad_norm": 2.197898771108595, "learning_rate": 8.363562252337864e-06, "loss": 0.0516, "step": 4153 }, { "epoch": 5.659400544959128, "grad_norm": 1.2172999773865218, "learning_rate": 8.359208931838872e-06, "loss": 0.0087, "step": 4154 }, { "epoch": 5.660762942779291, "grad_norm": 0.8493561760949578, "learning_rate": 8.354855930872379e-06, "loss": 0.0015, "step": 4155 }, { "epoch": 5.662125340599455, "grad_norm": 1.4414225720500633, "learning_rate": 8.3505032502861e-06, "loss": 0.0033, "step": 4156 }, { "epoch": 5.663487738419619, "grad_norm": 1.601497044340131, "learning_rate": 8.346150890927688e-06, "loss": 0.0131, "step": 4157 }, { "epoch": 5.664850136239782, "grad_norm": 2.15617971948388, "learning_rate": 8.341798853644739e-06, "loss": 0.0231, "step": 4158 }, { "epoch": 5.666212534059945, "grad_norm": 1.2916994917356417, "learning_rate": 8.337447139284776e-06, "loss": 0.0086, "step": 4159 }, { "epoch": 5.667574931880109, "grad_norm": 2.192593164878696, "learning_rate": 8.333095748695271e-06, "loss": 0.0232, "step": 4160 }, { "epoch": 5.668937329700272, "grad_norm": 0.9838179393873808, "learning_rate": 8.328744682723622e-06, "loss": 0.0053, "step": 4161 }, { "epoch": 5.670299727520436, "grad_norm": 0.31557634922434713, "learning_rate": 8.324393942217169e-06, "loss": 0.0081, "step": 4162 }, { "epoch": 5.6716621253406, "grad_norm": 2.2233881422305912, "learning_rate": 8.320043528023188e-06, "loss": 0.0207, "step": 4163 }, { "epoch": 5.6730245231607626, "grad_norm": 1.4023526061354985, "learning_rate": 8.315693440988893e-06, "loss": 0.0118, "step": 4164 }, { "epoch": 5.674386920980926, "grad_norm": 0.5888466323019067, "learning_rate": 8.311343681961434e-06, "loss": 0.0229, "step": 4165 }, { "epoch": 5.67574931880109, "grad_norm": 0.9847786457978804, "learning_rate": 8.306994251787891e-06, "loss": 0.0129, "step": 4166 }, { "epoch": 5.677111716621253, "grad_norm": 1.0241214413705755, "learning_rate": 8.302645151315291e-06, "loss": 0.0079, "step": 4167 }, { "epoch": 5.678474114441417, "grad_norm": 0.9097136524907073, "learning_rate": 8.298296381390587e-06, "loss": 0.0071, "step": 4168 }, { "epoch": 5.679836512261581, "grad_norm": 0.7840512127074337, "learning_rate": 8.293947942860667e-06, "loss": 0.0074, "step": 4169 }, { "epoch": 5.6811989100817435, "grad_norm": 1.8767522978951887, "learning_rate": 8.289599836572368e-06, "loss": 0.01, "step": 4170 }, { "epoch": 5.682561307901907, "grad_norm": 0.588396020910041, "learning_rate": 8.285252063372448e-06, "loss": 0.0021, "step": 4171 }, { "epoch": 5.683923705722071, "grad_norm": 1.0322820787903104, "learning_rate": 8.280904624107606e-06, "loss": 0.0067, "step": 4172 }, { "epoch": 5.685286103542234, "grad_norm": 2.3521546337302235, "learning_rate": 8.276557519624474e-06, "loss": 0.0117, "step": 4173 }, { "epoch": 5.686648501362398, "grad_norm": 1.0511910851457265, "learning_rate": 8.272210750769622e-06, "loss": 0.0144, "step": 4174 }, { "epoch": 5.6880108991825615, "grad_norm": 0.7672076830361018, "learning_rate": 8.267864318389548e-06, "loss": 0.0112, "step": 4175 }, { "epoch": 5.689373297002724, "grad_norm": 4.537082977076293, "learning_rate": 8.263518223330698e-06, "loss": 0.0266, "step": 4176 }, { "epoch": 5.690735694822888, "grad_norm": 1.8629697049637024, "learning_rate": 8.259172466439435e-06, "loss": 0.0165, "step": 4177 }, { "epoch": 5.692098092643052, "grad_norm": 1.664964837400496, "learning_rate": 8.254827048562069e-06, "loss": 0.0094, "step": 4178 }, { "epoch": 5.693460490463215, "grad_norm": 1.1198074711794974, "learning_rate": 8.250481970544838e-06, "loss": 0.0229, "step": 4179 }, { "epoch": 5.694822888283379, "grad_norm": 0.6582434808521979, "learning_rate": 8.246137233233915e-06, "loss": 0.009, "step": 4180 }, { "epoch": 5.6961852861035425, "grad_norm": 0.9815471165498652, "learning_rate": 8.241792837475405e-06, "loss": 0.0075, "step": 4181 }, { "epoch": 5.697547683923705, "grad_norm": 1.385312460521564, "learning_rate": 8.237448784115353e-06, "loss": 0.0029, "step": 4182 }, { "epoch": 5.698910081743869, "grad_norm": 2.4876900866047187, "learning_rate": 8.23310507399973e-06, "loss": 0.0087, "step": 4183 }, { "epoch": 5.700272479564033, "grad_norm": 0.7443553741452062, "learning_rate": 8.228761707974445e-06, "loss": 0.0107, "step": 4184 }, { "epoch": 5.701634877384196, "grad_norm": 3.1909314801039756, "learning_rate": 8.224418686885336e-06, "loss": 0.0173, "step": 4185 }, { "epoch": 5.70299727520436, "grad_norm": 1.1343033231807385, "learning_rate": 8.220076011578173e-06, "loss": 0.006, "step": 4186 }, { "epoch": 5.704359673024523, "grad_norm": 0.6732886481731195, "learning_rate": 8.215733682898669e-06, "loss": 0.0137, "step": 4187 }, { "epoch": 5.705722070844686, "grad_norm": 2.257879996802598, "learning_rate": 8.211391701692455e-06, "loss": 0.0365, "step": 4188 }, { "epoch": 5.70708446866485, "grad_norm": 1.9054712856056595, "learning_rate": 8.207050068805108e-06, "loss": 0.0188, "step": 4189 }, { "epoch": 5.708446866485014, "grad_norm": 1.5402562014125878, "learning_rate": 8.202708785082122e-06, "loss": 0.0058, "step": 4190 }, { "epoch": 5.709809264305177, "grad_norm": 1.6887519305195322, "learning_rate": 8.198367851368937e-06, "loss": 0.0493, "step": 4191 }, { "epoch": 5.7111716621253406, "grad_norm": 0.9035317735241467, "learning_rate": 8.194027268510915e-06, "loss": 0.0128, "step": 4192 }, { "epoch": 5.712534059945504, "grad_norm": 0.9769405088160726, "learning_rate": 8.18968703735336e-06, "loss": 0.0266, "step": 4193 }, { "epoch": 5.713896457765667, "grad_norm": 0.9024992725001544, "learning_rate": 8.185347158741497e-06, "loss": 0.0016, "step": 4194 }, { "epoch": 5.715258855585831, "grad_norm": 1.2788348447806939, "learning_rate": 8.181007633520491e-06, "loss": 0.0065, "step": 4195 }, { "epoch": 5.716621253405995, "grad_norm": 2.6355898372769695, "learning_rate": 8.176668462535427e-06, "loss": 0.0149, "step": 4196 }, { "epoch": 5.717983651226158, "grad_norm": 0.5709794912823141, "learning_rate": 8.172329646631334e-06, "loss": 0.0017, "step": 4197 }, { "epoch": 5.7193460490463215, "grad_norm": 1.858267985383528, "learning_rate": 8.167991186653162e-06, "loss": 0.0095, "step": 4198 }, { "epoch": 5.720708446866485, "grad_norm": 0.8663617840705554, "learning_rate": 8.163653083445799e-06, "loss": 0.0079, "step": 4199 }, { "epoch": 5.722070844686648, "grad_norm": 1.0793960848911415, "learning_rate": 8.159315337854057e-06, "loss": 0.0063, "step": 4200 }, { "epoch": 5.723433242506812, "grad_norm": 1.9981307051156647, "learning_rate": 8.154977950722685e-06, "loss": 0.021, "step": 4201 }, { "epoch": 5.724795640326976, "grad_norm": 2.251092327298442, "learning_rate": 8.150640922896356e-06, "loss": 0.0145, "step": 4202 }, { "epoch": 5.726158038147139, "grad_norm": 2.1758653858212766, "learning_rate": 8.146304255219675e-06, "loss": 0.0219, "step": 4203 }, { "epoch": 5.727520435967302, "grad_norm": 1.2788925557467692, "learning_rate": 8.141967948537177e-06, "loss": 0.0129, "step": 4204 }, { "epoch": 5.728882833787466, "grad_norm": 1.3897780837248417, "learning_rate": 8.137632003693329e-06, "loss": 0.0091, "step": 4205 }, { "epoch": 5.730245231607629, "grad_norm": 1.7365011261586485, "learning_rate": 8.133296421532526e-06, "loss": 0.0405, "step": 4206 }, { "epoch": 5.731607629427793, "grad_norm": 2.0924424670232553, "learning_rate": 8.128961202899092e-06, "loss": 0.0041, "step": 4207 }, { "epoch": 5.732970027247957, "grad_norm": 1.780786589734944, "learning_rate": 8.124626348637278e-06, "loss": 0.0091, "step": 4208 }, { "epoch": 5.73433242506812, "grad_norm": 2.1666550833175067, "learning_rate": 8.12029185959127e-06, "loss": 0.0096, "step": 4209 }, { "epoch": 5.735694822888283, "grad_norm": 1.901239514993063, "learning_rate": 8.115957736605174e-06, "loss": 0.0177, "step": 4210 }, { "epoch": 5.737057220708447, "grad_norm": 1.594136657551228, "learning_rate": 8.111623980523036e-06, "loss": 0.0052, "step": 4211 }, { "epoch": 5.73841961852861, "grad_norm": 2.226909879893301, "learning_rate": 8.107290592188821e-06, "loss": 0.0108, "step": 4212 }, { "epoch": 5.739782016348774, "grad_norm": 1.1971880648923003, "learning_rate": 8.102957572446425e-06, "loss": 0.0017, "step": 4213 }, { "epoch": 5.741144414168938, "grad_norm": 1.1284875006397774, "learning_rate": 8.098624922139677e-06, "loss": 0.0074, "step": 4214 }, { "epoch": 5.7425068119891005, "grad_norm": 1.3807921891066128, "learning_rate": 8.09429264211232e-06, "loss": 0.0153, "step": 4215 }, { "epoch": 5.743869209809264, "grad_norm": 3.4309037305587218, "learning_rate": 8.08996073320805e-06, "loss": 0.0284, "step": 4216 }, { "epoch": 5.745231607629428, "grad_norm": 2.8741961924332564, "learning_rate": 8.085629196270468e-06, "loss": 0.0271, "step": 4217 }, { "epoch": 5.746594005449591, "grad_norm": 2.5656090141329573, "learning_rate": 8.08129803214311e-06, "loss": 0.0061, "step": 4218 }, { "epoch": 5.747956403269755, "grad_norm": 0.7864384813460363, "learning_rate": 8.07696724166944e-06, "loss": 0.0061, "step": 4219 }, { "epoch": 5.7493188010899186, "grad_norm": 0.6310402940838155, "learning_rate": 8.07263682569285e-06, "loss": 0.0025, "step": 4220 }, { "epoch": 5.7506811989100814, "grad_norm": 1.4835920529272317, "learning_rate": 8.068306785056651e-06, "loss": 0.003, "step": 4221 }, { "epoch": 5.752043596730245, "grad_norm": 0.3504032330675849, "learning_rate": 8.063977120604098e-06, "loss": 0.0013, "step": 4222 }, { "epoch": 5.753405994550409, "grad_norm": 2.0738156730599866, "learning_rate": 8.059647833178363e-06, "loss": 0.0188, "step": 4223 }, { "epoch": 5.754768392370572, "grad_norm": 1.2526319229952, "learning_rate": 8.055318923622537e-06, "loss": 0.017, "step": 4224 }, { "epoch": 5.756130790190736, "grad_norm": 0.33812808183965193, "learning_rate": 8.050990392779648e-06, "loss": 0.0162, "step": 4225 }, { "epoch": 5.7574931880108995, "grad_norm": 3.044444275700061, "learning_rate": 8.046662241492645e-06, "loss": 0.0263, "step": 4226 }, { "epoch": 5.758855585831062, "grad_norm": 1.0494756028428447, "learning_rate": 8.042334470604403e-06, "loss": 0.0217, "step": 4227 }, { "epoch": 5.760217983651226, "grad_norm": 2.6990547770468982, "learning_rate": 8.038007080957733e-06, "loss": 0.0231, "step": 4228 }, { "epoch": 5.76158038147139, "grad_norm": 2.704948567038696, "learning_rate": 8.03368007339536e-06, "loss": 0.012, "step": 4229 }, { "epoch": 5.762942779291553, "grad_norm": 1.465716812347237, "learning_rate": 8.029353448759935e-06, "loss": 0.0133, "step": 4230 }, { "epoch": 5.764305177111717, "grad_norm": 2.174247969640355, "learning_rate": 8.025027207894041e-06, "loss": 0.0111, "step": 4231 }, { "epoch": 5.76566757493188, "grad_norm": 1.6513363215113148, "learning_rate": 8.020701351640182e-06, "loss": 0.0159, "step": 4232 }, { "epoch": 5.767029972752043, "grad_norm": 1.3598554433891012, "learning_rate": 8.016375880840783e-06, "loss": 0.0094, "step": 4233 }, { "epoch": 5.768392370572207, "grad_norm": 1.8863560253017786, "learning_rate": 8.012050796338207e-06, "loss": 0.0019, "step": 4234 }, { "epoch": 5.769754768392371, "grad_norm": 1.8289261563713048, "learning_rate": 8.007726098974733e-06, "loss": 0.022, "step": 4235 }, { "epoch": 5.771117166212534, "grad_norm": 1.5707191417191368, "learning_rate": 8.003401789592561e-06, "loss": 0.0073, "step": 4236 }, { "epoch": 5.772479564032698, "grad_norm": 2.646043675635408, "learning_rate": 7.999077869033823e-06, "loss": 0.0037, "step": 4237 }, { "epoch": 5.773841961852861, "grad_norm": 1.1119620752053123, "learning_rate": 7.99475433814057e-06, "loss": 0.0016, "step": 4238 }, { "epoch": 5.775204359673024, "grad_norm": 1.0103783425632593, "learning_rate": 7.990431197754775e-06, "loss": 0.01, "step": 4239 }, { "epoch": 5.776566757493188, "grad_norm": 2.2790218109137363, "learning_rate": 7.986108448718348e-06, "loss": 0.0068, "step": 4240 }, { "epoch": 5.777929155313352, "grad_norm": 0.7544099233970399, "learning_rate": 7.981786091873112e-06, "loss": 0.0104, "step": 4241 }, { "epoch": 5.779291553133515, "grad_norm": 1.8409928772069746, "learning_rate": 7.977464128060812e-06, "loss": 0.0038, "step": 4242 }, { "epoch": 5.7806539509536785, "grad_norm": 2.8264976189543978, "learning_rate": 7.973142558123121e-06, "loss": 0.0152, "step": 4243 }, { "epoch": 5.782016348773842, "grad_norm": 0.9748414407164278, "learning_rate": 7.96882138290163e-06, "loss": 0.0218, "step": 4244 }, { "epoch": 5.783378746594005, "grad_norm": 1.073288223830888, "learning_rate": 7.964500603237868e-06, "loss": 0.0081, "step": 4245 }, { "epoch": 5.784741144414169, "grad_norm": 2.5762559786555874, "learning_rate": 7.960180219973268e-06, "loss": 0.0042, "step": 4246 }, { "epoch": 5.786103542234333, "grad_norm": 0.6335619523662461, "learning_rate": 7.9558602339492e-06, "loss": 0.0023, "step": 4247 }, { "epoch": 5.787465940054496, "grad_norm": 3.228303487767949, "learning_rate": 7.951540646006946e-06, "loss": 0.0069, "step": 4248 }, { "epoch": 5.7888283378746594, "grad_norm": 1.7231468003976782, "learning_rate": 7.947221456987717e-06, "loss": 0.0169, "step": 4249 }, { "epoch": 5.790190735694823, "grad_norm": 1.6525402668134817, "learning_rate": 7.942902667732639e-06, "loss": 0.0062, "step": 4250 }, { "epoch": 5.791553133514986, "grad_norm": 1.672493115684143, "learning_rate": 7.938584279082777e-06, "loss": 0.0097, "step": 4251 }, { "epoch": 5.79291553133515, "grad_norm": 1.2800367475321297, "learning_rate": 7.9342662918791e-06, "loss": 0.0076, "step": 4252 }, { "epoch": 5.794277929155314, "grad_norm": 2.255781903457142, "learning_rate": 7.929948706962508e-06, "loss": 0.0105, "step": 4253 }, { "epoch": 5.795640326975477, "grad_norm": 1.209426589811088, "learning_rate": 7.92563152517382e-06, "loss": 0.0175, "step": 4254 }, { "epoch": 5.79700272479564, "grad_norm": 2.5265705020966673, "learning_rate": 7.921314747353773e-06, "loss": 0.018, "step": 4255 }, { "epoch": 5.798365122615804, "grad_norm": 1.9184302711159855, "learning_rate": 7.916998374343028e-06, "loss": 0.0147, "step": 4256 }, { "epoch": 5.799727520435967, "grad_norm": 2.8447144179544157, "learning_rate": 7.912682406982176e-06, "loss": 0.013, "step": 4257 }, { "epoch": 5.801089918256131, "grad_norm": 2.486579779045109, "learning_rate": 7.908366846111717e-06, "loss": 0.0058, "step": 4258 }, { "epoch": 5.802452316076295, "grad_norm": 1.0895490771419443, "learning_rate": 7.904051692572078e-06, "loss": 0.0243, "step": 4259 }, { "epoch": 5.8038147138964575, "grad_norm": 0.7943016128989622, "learning_rate": 7.899736947203602e-06, "loss": 0.0084, "step": 4260 }, { "epoch": 5.805177111716621, "grad_norm": 3.424661828130783, "learning_rate": 7.895422610846556e-06, "loss": 0.0042, "step": 4261 }, { "epoch": 5.806539509536785, "grad_norm": 2.25773614135434, "learning_rate": 7.891108684341122e-06, "loss": 0.0139, "step": 4262 }, { "epoch": 5.807901907356948, "grad_norm": 1.46780592005338, "learning_rate": 7.886795168527415e-06, "loss": 0.0011, "step": 4263 }, { "epoch": 5.809264305177112, "grad_norm": 2.6404461212120593, "learning_rate": 7.882482064245459e-06, "loss": 0.0054, "step": 4264 }, { "epoch": 5.810626702997276, "grad_norm": 1.8994292467496041, "learning_rate": 7.878169372335201e-06, "loss": 0.0269, "step": 4265 }, { "epoch": 5.8119891008174385, "grad_norm": 1.348900423089847, "learning_rate": 7.873857093636507e-06, "loss": 0.0104, "step": 4266 }, { "epoch": 5.813351498637602, "grad_norm": 1.1877452593890605, "learning_rate": 7.869545228989162e-06, "loss": 0.0009, "step": 4267 }, { "epoch": 5.814713896457766, "grad_norm": 0.9347007735400861, "learning_rate": 7.865233779232867e-06, "loss": 0.0109, "step": 4268 }, { "epoch": 5.816076294277929, "grad_norm": 2.061966200725475, "learning_rate": 7.860922745207254e-06, "loss": 0.0147, "step": 4269 }, { "epoch": 5.817438692098093, "grad_norm": 1.2733382138944456, "learning_rate": 7.856612127751864e-06, "loss": 0.007, "step": 4270 }, { "epoch": 5.8188010899182565, "grad_norm": 0.8327756687653927, "learning_rate": 7.852301927706158e-06, "loss": 0.008, "step": 4271 }, { "epoch": 5.820163487738419, "grad_norm": 1.4757969199059018, "learning_rate": 7.847992145909523e-06, "loss": 0.0101, "step": 4272 }, { "epoch": 5.821525885558583, "grad_norm": 1.9846328671589393, "learning_rate": 7.843682783201244e-06, "loss": 0.0075, "step": 4273 }, { "epoch": 5.822888283378747, "grad_norm": 1.0420462843320475, "learning_rate": 7.839373840420555e-06, "loss": 0.0187, "step": 4274 }, { "epoch": 5.82425068119891, "grad_norm": 0.9948038042398324, "learning_rate": 7.835065318406585e-06, "loss": 0.0022, "step": 4275 }, { "epoch": 5.825613079019074, "grad_norm": 2.0040600600570624, "learning_rate": 7.83075721799839e-06, "loss": 0.0299, "step": 4276 }, { "epoch": 5.8269754768392374, "grad_norm": 0.5657627239101893, "learning_rate": 7.82644954003494e-06, "loss": 0.0098, "step": 4277 }, { "epoch": 5.8283378746594, "grad_norm": 1.1584865512538483, "learning_rate": 7.82214228535513e-06, "loss": 0.0104, "step": 4278 }, { "epoch": 5.829700272479564, "grad_norm": 1.6553953736781517, "learning_rate": 7.817835454797755e-06, "loss": 0.0162, "step": 4279 }, { "epoch": 5.831062670299728, "grad_norm": 1.5183916410425813, "learning_rate": 7.813529049201555e-06, "loss": 0.0136, "step": 4280 }, { "epoch": 5.832425068119891, "grad_norm": 2.562956763180836, "learning_rate": 7.809223069405166e-06, "loss": 0.0147, "step": 4281 }, { "epoch": 5.833787465940055, "grad_norm": 1.4543153829180406, "learning_rate": 7.804917516247147e-06, "loss": 0.0164, "step": 4282 }, { "epoch": 5.835149863760218, "grad_norm": 1.851446025721401, "learning_rate": 7.800612390565974e-06, "loss": 0.0158, "step": 4283 }, { "epoch": 5.836512261580381, "grad_norm": 1.360464504099927, "learning_rate": 7.796307693200042e-06, "loss": 0.0163, "step": 4284 }, { "epoch": 5.837874659400545, "grad_norm": 0.7647448868934339, "learning_rate": 7.792003424987652e-06, "loss": 0.0086, "step": 4285 }, { "epoch": 5.839237057220709, "grad_norm": 0.9891391134588101, "learning_rate": 7.787699586767042e-06, "loss": 0.0069, "step": 4286 }, { "epoch": 5.840599455040872, "grad_norm": 0.24264764375993372, "learning_rate": 7.783396179376347e-06, "loss": 0.0019, "step": 4287 }, { "epoch": 5.8419618528610355, "grad_norm": 1.1989683919691776, "learning_rate": 7.779093203653626e-06, "loss": 0.004, "step": 4288 }, { "epoch": 5.843324250681199, "grad_norm": 1.8470391475139543, "learning_rate": 7.774790660436857e-06, "loss": 0.0092, "step": 4289 }, { "epoch": 5.844686648501362, "grad_norm": 1.366816389182126, "learning_rate": 7.770488550563928e-06, "loss": 0.0102, "step": 4290 }, { "epoch": 5.846049046321526, "grad_norm": 1.2237929821653926, "learning_rate": 7.766186874872637e-06, "loss": 0.0108, "step": 4291 }, { "epoch": 5.84741144414169, "grad_norm": 1.1893751776679162, "learning_rate": 7.761885634200717e-06, "loss": 0.0057, "step": 4292 }, { "epoch": 5.848773841961853, "grad_norm": 0.9276447562801207, "learning_rate": 7.757584829385798e-06, "loss": 0.0016, "step": 4293 }, { "epoch": 5.8501362397820165, "grad_norm": 0.5818729067360338, "learning_rate": 7.753284461265434e-06, "loss": 0.0052, "step": 4294 }, { "epoch": 5.85149863760218, "grad_norm": 0.6513402992909451, "learning_rate": 7.748984530677088e-06, "loss": 0.0234, "step": 4295 }, { "epoch": 5.852861035422343, "grad_norm": 1.4788130845825733, "learning_rate": 7.744685038458146e-06, "loss": 0.0179, "step": 4296 }, { "epoch": 5.854223433242507, "grad_norm": 1.389913212609265, "learning_rate": 7.740385985445895e-06, "loss": 0.0162, "step": 4297 }, { "epoch": 5.855585831062671, "grad_norm": 3.264930823186756, "learning_rate": 7.736087372477554e-06, "loss": 0.0217, "step": 4298 }, { "epoch": 5.856948228882834, "grad_norm": 0.7759999798040592, "learning_rate": 7.731789200390246e-06, "loss": 0.0125, "step": 4299 }, { "epoch": 5.858310626702997, "grad_norm": 1.618773198769896, "learning_rate": 7.727491470021007e-06, "loss": 0.0193, "step": 4300 }, { "epoch": 5.859673024523161, "grad_norm": 2.0446773878656774, "learning_rate": 7.723194182206793e-06, "loss": 0.0192, "step": 4301 }, { "epoch": 5.861035422343324, "grad_norm": 1.8188761367723612, "learning_rate": 7.718897337784466e-06, "loss": 0.0109, "step": 4302 }, { "epoch": 5.862397820163488, "grad_norm": 1.2572416792807168, "learning_rate": 7.71460093759081e-06, "loss": 0.0165, "step": 4303 }, { "epoch": 5.863760217983652, "grad_norm": 0.9942504009536683, "learning_rate": 7.710304982462518e-06, "loss": 0.016, "step": 4304 }, { "epoch": 5.8651226158038146, "grad_norm": 1.9448366351457378, "learning_rate": 7.706009473236198e-06, "loss": 0.0229, "step": 4305 }, { "epoch": 5.866485013623978, "grad_norm": 2.2803743880574774, "learning_rate": 7.701714410748366e-06, "loss": 0.041, "step": 4306 }, { "epoch": 5.867847411444142, "grad_norm": 1.4716859419741175, "learning_rate": 7.697419795835461e-06, "loss": 0.0163, "step": 4307 }, { "epoch": 5.869209809264305, "grad_norm": 1.9565143887548468, "learning_rate": 7.69312562933382e-06, "loss": 0.0204, "step": 4308 }, { "epoch": 5.870572207084469, "grad_norm": 3.296944478896785, "learning_rate": 7.688831912079712e-06, "loss": 0.026, "step": 4309 }, { "epoch": 5.871934604904633, "grad_norm": 2.4503429724887718, "learning_rate": 7.684538644909303e-06, "loss": 0.0182, "step": 4310 }, { "epoch": 5.8732970027247955, "grad_norm": 1.3774965362594223, "learning_rate": 7.680245828658678e-06, "loss": 0.033, "step": 4311 }, { "epoch": 5.874659400544959, "grad_norm": 0.9481477973551613, "learning_rate": 7.67595346416383e-06, "loss": 0.0092, "step": 4312 }, { "epoch": 5.876021798365123, "grad_norm": 0.8963689967917722, "learning_rate": 7.67166155226067e-06, "loss": 0.0058, "step": 4313 }, { "epoch": 5.877384196185286, "grad_norm": 0.2720469843529916, "learning_rate": 7.667370093785014e-06, "loss": 0.0157, "step": 4314 }, { "epoch": 5.87874659400545, "grad_norm": 0.8310230818039993, "learning_rate": 7.663079089572598e-06, "loss": 0.0036, "step": 4315 }, { "epoch": 5.8801089918256135, "grad_norm": 1.1084330039220252, "learning_rate": 7.658788540459063e-06, "loss": 0.0272, "step": 4316 }, { "epoch": 5.881471389645776, "grad_norm": 1.5979974917911641, "learning_rate": 7.654498447279961e-06, "loss": 0.0118, "step": 4317 }, { "epoch": 5.88283378746594, "grad_norm": 1.4392444715935548, "learning_rate": 7.65020881087076e-06, "loss": 0.0189, "step": 4318 }, { "epoch": 5.884196185286104, "grad_norm": 0.6075102546929251, "learning_rate": 7.645919632066833e-06, "loss": 0.0014, "step": 4319 }, { "epoch": 5.885558583106267, "grad_norm": 0.6646243659619554, "learning_rate": 7.641630911703471e-06, "loss": 0.0019, "step": 4320 }, { "epoch": 5.886920980926431, "grad_norm": 2.476484076481586, "learning_rate": 7.637342650615872e-06, "loss": 0.0202, "step": 4321 }, { "epoch": 5.8882833787465945, "grad_norm": 0.714581182754326, "learning_rate": 7.633054849639143e-06, "loss": 0.0042, "step": 4322 }, { "epoch": 5.889645776566757, "grad_norm": 0.8426328847659812, "learning_rate": 7.628767509608304e-06, "loss": 0.0199, "step": 4323 }, { "epoch": 5.891008174386921, "grad_norm": 1.6241711614859955, "learning_rate": 7.624480631358283e-06, "loss": 0.0099, "step": 4324 }, { "epoch": 5.892370572207085, "grad_norm": 1.3958576585647782, "learning_rate": 7.620194215723919e-06, "loss": 0.0075, "step": 4325 }, { "epoch": 5.893732970027248, "grad_norm": 1.0384592887065056, "learning_rate": 7.615908263539964e-06, "loss": 0.0021, "step": 4326 }, { "epoch": 5.895095367847412, "grad_norm": 2.405342860300515, "learning_rate": 7.6116227756410754e-06, "loss": 0.0173, "step": 4327 }, { "epoch": 5.896457765667575, "grad_norm": 1.2376777835769937, "learning_rate": 7.607337752861821e-06, "loss": 0.0059, "step": 4328 }, { "epoch": 5.897820163487738, "grad_norm": 0.7999552427156305, "learning_rate": 7.60305319603668e-06, "loss": 0.016, "step": 4329 }, { "epoch": 5.899182561307902, "grad_norm": 2.1658730736645833, "learning_rate": 7.59876910600004e-06, "loss": 0.0126, "step": 4330 }, { "epoch": 5.900544959128066, "grad_norm": 2.9189433439824315, "learning_rate": 7.594485483586193e-06, "loss": 0.0322, "step": 4331 }, { "epoch": 5.901907356948229, "grad_norm": 0.9077333003963185, "learning_rate": 7.5902023296293515e-06, "loss": 0.0142, "step": 4332 }, { "epoch": 5.9032697547683926, "grad_norm": 1.6489722176738455, "learning_rate": 7.5859196449636255e-06, "loss": 0.025, "step": 4333 }, { "epoch": 5.904632152588556, "grad_norm": 0.6310799276959241, "learning_rate": 7.581637430423038e-06, "loss": 0.0108, "step": 4334 }, { "epoch": 5.905994550408719, "grad_norm": 1.9172834090804445, "learning_rate": 7.577355686841519e-06, "loss": 0.0219, "step": 4335 }, { "epoch": 5.907356948228883, "grad_norm": 1.1976539496003151, "learning_rate": 7.57307441505291e-06, "loss": 0.0015, "step": 4336 }, { "epoch": 5.908719346049047, "grad_norm": 1.533566751483893, "learning_rate": 7.568793615890955e-06, "loss": 0.0261, "step": 4337 }, { "epoch": 5.91008174386921, "grad_norm": 2.949279947744296, "learning_rate": 7.564513290189315e-06, "loss": 0.0107, "step": 4338 }, { "epoch": 5.9114441416893735, "grad_norm": 1.7548705782158236, "learning_rate": 7.560233438781549e-06, "loss": 0.003, "step": 4339 }, { "epoch": 5.912806539509537, "grad_norm": 1.8431488649126462, "learning_rate": 7.55595406250113e-06, "loss": 0.002, "step": 4340 }, { "epoch": 5.9141689373297, "grad_norm": 1.2265521025458768, "learning_rate": 7.551675162181436e-06, "loss": 0.0068, "step": 4341 }, { "epoch": 5.915531335149864, "grad_norm": 1.8020346482979854, "learning_rate": 7.54739673865575e-06, "loss": 0.0036, "step": 4342 }, { "epoch": 5.916893732970028, "grad_norm": 1.5990549312195843, "learning_rate": 7.543118792757267e-06, "loss": 0.0065, "step": 4343 }, { "epoch": 5.918256130790191, "grad_norm": 1.824873025443666, "learning_rate": 7.538841325319089e-06, "loss": 0.0234, "step": 4344 }, { "epoch": 5.919618528610354, "grad_norm": 1.0233540752989398, "learning_rate": 7.53456433717422e-06, "loss": 0.0035, "step": 4345 }, { "epoch": 5.920980926430518, "grad_norm": 0.8324877241973933, "learning_rate": 7.530287829155574e-06, "loss": 0.0104, "step": 4346 }, { "epoch": 5.922343324250681, "grad_norm": 1.4450527926929913, "learning_rate": 7.5260118020959695e-06, "loss": 0.0197, "step": 4347 }, { "epoch": 5.923705722070845, "grad_norm": 2.1660482595243673, "learning_rate": 7.5217362568281345e-06, "loss": 0.0196, "step": 4348 }, { "epoch": 5.925068119891008, "grad_norm": 1.2421908847504752, "learning_rate": 7.517461194184699e-06, "loss": 0.0229, "step": 4349 }, { "epoch": 5.926430517711172, "grad_norm": 1.8309891087724635, "learning_rate": 7.513186614998205e-06, "loss": 0.014, "step": 4350 }, { "epoch": 5.927792915531335, "grad_norm": 1.8478462601290961, "learning_rate": 7.508912520101095e-06, "loss": 0.0136, "step": 4351 }, { "epoch": 5.929155313351498, "grad_norm": 0.5596777369166498, "learning_rate": 7.504638910325716e-06, "loss": 0.0091, "step": 4352 }, { "epoch": 5.930517711171662, "grad_norm": 1.7465964686224489, "learning_rate": 7.500365786504329e-06, "loss": 0.0276, "step": 4353 }, { "epoch": 5.931880108991826, "grad_norm": 1.5436709850792796, "learning_rate": 7.496093149469088e-06, "loss": 0.0085, "step": 4354 }, { "epoch": 5.933242506811989, "grad_norm": 1.8370953759940167, "learning_rate": 7.4918210000520645e-06, "loss": 0.0273, "step": 4355 }, { "epoch": 5.9346049046321525, "grad_norm": 1.7824937566927146, "learning_rate": 7.487549339085227e-06, "loss": 0.0025, "step": 4356 }, { "epoch": 5.935967302452316, "grad_norm": 3.62772549439384, "learning_rate": 7.483278167400454e-06, "loss": 0.0292, "step": 4357 }, { "epoch": 5.937329700272479, "grad_norm": 0.756878706126452, "learning_rate": 7.479007485829523e-06, "loss": 0.018, "step": 4358 }, { "epoch": 5.938692098092643, "grad_norm": 2.145510268853218, "learning_rate": 7.47473729520412e-06, "loss": 0.009, "step": 4359 }, { "epoch": 5.940054495912807, "grad_norm": 1.4352312555704332, "learning_rate": 7.470467596355831e-06, "loss": 0.0188, "step": 4360 }, { "epoch": 5.94141689373297, "grad_norm": 0.7957613798342466, "learning_rate": 7.466198390116157e-06, "loss": 0.0021, "step": 4361 }, { "epoch": 5.9427792915531334, "grad_norm": 1.7623486050369892, "learning_rate": 7.461929677316493e-06, "loss": 0.0113, "step": 4362 }, { "epoch": 5.944141689373297, "grad_norm": 1.6286381763468338, "learning_rate": 7.457661458788139e-06, "loss": 0.0095, "step": 4363 }, { "epoch": 5.94550408719346, "grad_norm": 1.8008405558943246, "learning_rate": 7.453393735362302e-06, "loss": 0.0021, "step": 4364 }, { "epoch": 5.946866485013624, "grad_norm": 2.568388452516356, "learning_rate": 7.44912650787009e-06, "loss": 0.0052, "step": 4365 }, { "epoch": 5.948228882833788, "grad_norm": 1.5261699296049518, "learning_rate": 7.444859777142515e-06, "loss": 0.009, "step": 4366 }, { "epoch": 5.949591280653951, "grad_norm": 1.4736944746646705, "learning_rate": 7.4405935440104946e-06, "loss": 0.0341, "step": 4367 }, { "epoch": 5.950953678474114, "grad_norm": 1.3152495173906664, "learning_rate": 7.436327809304849e-06, "loss": 0.031, "step": 4368 }, { "epoch": 5.952316076294278, "grad_norm": 0.18241292990399743, "learning_rate": 7.432062573856295e-06, "loss": 0.0083, "step": 4369 }, { "epoch": 5.953678474114441, "grad_norm": 2.6773434299579595, "learning_rate": 7.427797838495463e-06, "loss": 0.0253, "step": 4370 }, { "epoch": 5.955040871934605, "grad_norm": 0.6295993940275917, "learning_rate": 7.423533604052875e-06, "loss": 0.0134, "step": 4371 }, { "epoch": 5.956403269754769, "grad_norm": 1.1379656360366783, "learning_rate": 7.419269871358961e-06, "loss": 0.0292, "step": 4372 }, { "epoch": 5.9577656675749315, "grad_norm": 1.8950478347401345, "learning_rate": 7.415006641244057e-06, "loss": 0.0167, "step": 4373 }, { "epoch": 5.959128065395095, "grad_norm": 1.1284805656899881, "learning_rate": 7.410743914538394e-06, "loss": 0.0169, "step": 4374 }, { "epoch": 5.960490463215259, "grad_norm": 1.4155384869216345, "learning_rate": 7.40648169207211e-06, "loss": 0.0146, "step": 4375 }, { "epoch": 5.961852861035422, "grad_norm": 3.797950775062442, "learning_rate": 7.40221997467524e-06, "loss": 0.0176, "step": 4376 }, { "epoch": 5.963215258855586, "grad_norm": 1.2974207730011504, "learning_rate": 7.397958763177726e-06, "loss": 0.0278, "step": 4377 }, { "epoch": 5.96457765667575, "grad_norm": 2.3998436976736395, "learning_rate": 7.3936980584094045e-06, "loss": 0.0078, "step": 4378 }, { "epoch": 5.9659400544959125, "grad_norm": 1.9576427206932372, "learning_rate": 7.389437861200024e-06, "loss": 0.0145, "step": 4379 }, { "epoch": 5.967302452316076, "grad_norm": 1.4236368316393235, "learning_rate": 7.385178172379225e-06, "loss": 0.0244, "step": 4380 }, { "epoch": 5.96866485013624, "grad_norm": 1.1582123592720808, "learning_rate": 7.3809189927765515e-06, "loss": 0.0064, "step": 4381 }, { "epoch": 5.970027247956403, "grad_norm": 5.3365407010135355, "learning_rate": 7.376660323221449e-06, "loss": 0.028, "step": 4382 }, { "epoch": 5.971389645776567, "grad_norm": 2.015996016199245, "learning_rate": 7.372402164543262e-06, "loss": 0.019, "step": 4383 }, { "epoch": 5.9727520435967305, "grad_norm": 2.8912167991777173, "learning_rate": 7.3681445175712384e-06, "loss": 0.0275, "step": 4384 }, { "epoch": 5.974114441416893, "grad_norm": 2.6142851382725016, "learning_rate": 7.363887383134527e-06, "loss": 0.0236, "step": 4385 }, { "epoch": 5.975476839237057, "grad_norm": 1.4920980899567091, "learning_rate": 7.359630762062171e-06, "loss": 0.0128, "step": 4386 }, { "epoch": 5.976839237057221, "grad_norm": 1.8097505600698527, "learning_rate": 7.355374655183121e-06, "loss": 0.0225, "step": 4387 }, { "epoch": 5.978201634877384, "grad_norm": 2.4334131689449974, "learning_rate": 7.3511190633262195e-06, "loss": 0.014, "step": 4388 }, { "epoch": 5.979564032697548, "grad_norm": 0.9809922496900021, "learning_rate": 7.346863987320212e-06, "loss": 0.0241, "step": 4389 }, { "epoch": 5.9809264305177114, "grad_norm": 1.6590665853697497, "learning_rate": 7.3426094279937545e-06, "loss": 0.0047, "step": 4390 }, { "epoch": 5.982288828337874, "grad_norm": 3.6440840841791795, "learning_rate": 7.338355386175382e-06, "loss": 0.0254, "step": 4391 }, { "epoch": 5.983651226158038, "grad_norm": 4.407384856887358, "learning_rate": 7.3341018626935455e-06, "loss": 0.0287, "step": 4392 }, { "epoch": 5.985013623978202, "grad_norm": 2.7545987855526812, "learning_rate": 7.329848858376585e-06, "loss": 0.0096, "step": 4393 }, { "epoch": 5.986376021798365, "grad_norm": 2.7933815397133284, "learning_rate": 7.3255963740527435e-06, "loss": 0.0272, "step": 4394 }, { "epoch": 5.987738419618529, "grad_norm": 2.0383868705022894, "learning_rate": 7.321344410550159e-06, "loss": 0.0081, "step": 4395 }, { "epoch": 5.989100817438692, "grad_norm": 2.0035335137791632, "learning_rate": 7.317092968696884e-06, "loss": 0.0103, "step": 4396 }, { "epoch": 5.990463215258855, "grad_norm": 2.892381934469804, "learning_rate": 7.312842049320845e-06, "loss": 0.0028, "step": 4397 }, { "epoch": 5.991825613079019, "grad_norm": 2.0256072073640223, "learning_rate": 7.308591653249881e-06, "loss": 0.0074, "step": 4398 }, { "epoch": 5.993188010899183, "grad_norm": 1.9461695029676034, "learning_rate": 7.3043417813117305e-06, "loss": 0.0084, "step": 4399 }, { "epoch": 5.994550408719346, "grad_norm": 1.8420655442863985, "learning_rate": 7.300092434334021e-06, "loss": 0.0208, "step": 4400 }, { "epoch": 5.9959128065395095, "grad_norm": 1.1442747620521774, "learning_rate": 7.295843613144282e-06, "loss": 0.0224, "step": 4401 }, { "epoch": 5.997275204359673, "grad_norm": 2.3346129884446642, "learning_rate": 7.291595318569951e-06, "loss": 0.0106, "step": 4402 }, { "epoch": 5.998637602179836, "grad_norm": 2.033904300624697, "learning_rate": 7.287347551438344e-06, "loss": 0.0051, "step": 4403 }, { "epoch": 6.0, "grad_norm": 2.601331231721129, "learning_rate": 7.283100312576687e-06, "loss": 0.0094, "step": 4404 }, { "epoch": 6.0, "eval_accuracy": 0.9500280741156654, "eval_f1": 0.9416423502402118, "eval_loss": 0.1276320219039917, "eval_precision": 0.9345794362857855, "eval_recall": 0.9520895383469844, "eval_runtime": 17.0799, "eval_samples_per_second": 104.275, "eval_steps_per_second": 0.82, "step": 4404 }, { "epoch": 6.001362397820164, "grad_norm": 1.1033069975902527, "learning_rate": 7.278853602812101e-06, "loss": 0.0053, "step": 4405 }, { "epoch": 6.002724795640327, "grad_norm": 2.018159426250486, "learning_rate": 7.2746074229716e-06, "loss": 0.0157, "step": 4406 }, { "epoch": 6.0040871934604905, "grad_norm": 1.814154743767948, "learning_rate": 7.270361773882097e-06, "loss": 0.0106, "step": 4407 }, { "epoch": 6.005449591280654, "grad_norm": 1.7451662048636665, "learning_rate": 7.266116656370409e-06, "loss": 0.0157, "step": 4408 }, { "epoch": 6.006811989100817, "grad_norm": 0.9164314205390753, "learning_rate": 7.2618720712632366e-06, "loss": 0.0013, "step": 4409 }, { "epoch": 6.008174386920981, "grad_norm": 1.5001923475501149, "learning_rate": 7.257628019387184e-06, "loss": 0.0256, "step": 4410 }, { "epoch": 6.009536784741145, "grad_norm": 1.4098243307987819, "learning_rate": 7.25338450156875e-06, "loss": 0.009, "step": 4411 }, { "epoch": 6.010899182561308, "grad_norm": 0.6015781415221483, "learning_rate": 7.2491415186343275e-06, "loss": 0.0076, "step": 4412 }, { "epoch": 6.012261580381471, "grad_norm": 1.9116803401600206, "learning_rate": 7.244899071410214e-06, "loss": 0.0147, "step": 4413 }, { "epoch": 6.013623978201635, "grad_norm": 1.4723693563523421, "learning_rate": 7.240657160722595e-06, "loss": 0.0056, "step": 4414 }, { "epoch": 6.014986376021798, "grad_norm": 1.1336765607401025, "learning_rate": 7.236415787397549e-06, "loss": 0.0132, "step": 4415 }, { "epoch": 6.016348773841962, "grad_norm": 0.9024250881023028, "learning_rate": 7.232174952261055e-06, "loss": 0.0016, "step": 4416 }, { "epoch": 6.017711171662126, "grad_norm": 0.3639612444914675, "learning_rate": 7.227934656138984e-06, "loss": 0.0026, "step": 4417 }, { "epoch": 6.0190735694822886, "grad_norm": 0.5817802314326118, "learning_rate": 7.2236948998571035e-06, "loss": 0.0097, "step": 4418 }, { "epoch": 6.020435967302452, "grad_norm": 2.2905458189851715, "learning_rate": 7.2194556842410815e-06, "loss": 0.0065, "step": 4419 }, { "epoch": 6.021798365122616, "grad_norm": 1.8894100438046002, "learning_rate": 7.215217010116474e-06, "loss": 0.0105, "step": 4420 }, { "epoch": 6.023160762942779, "grad_norm": 1.3885247690041713, "learning_rate": 7.21097887830873e-06, "loss": 0.0291, "step": 4421 }, { "epoch": 6.024523160762943, "grad_norm": 1.264452458753988, "learning_rate": 7.2067412896431954e-06, "loss": 0.001, "step": 4422 }, { "epoch": 6.025885558583107, "grad_norm": 2.0923660057563405, "learning_rate": 7.202504244945113e-06, "loss": 0.0079, "step": 4423 }, { "epoch": 6.0272479564032695, "grad_norm": 0.33209388342904106, "learning_rate": 7.198267745039612e-06, "loss": 0.0172, "step": 4424 }, { "epoch": 6.028610354223433, "grad_norm": 0.8866660446255399, "learning_rate": 7.1940317907517294e-06, "loss": 0.0067, "step": 4425 }, { "epoch": 6.029972752043597, "grad_norm": 1.9989253133165064, "learning_rate": 7.189796382906386e-06, "loss": 0.013, "step": 4426 }, { "epoch": 6.03133514986376, "grad_norm": 0.7171741956642896, "learning_rate": 7.185561522328395e-06, "loss": 0.0072, "step": 4427 }, { "epoch": 6.032697547683924, "grad_norm": 2.1735598441944943, "learning_rate": 7.181327209842463e-06, "loss": 0.0123, "step": 4428 }, { "epoch": 6.0340599455040875, "grad_norm": 1.081189164911408, "learning_rate": 7.177093446273196e-06, "loss": 0.0126, "step": 4429 }, { "epoch": 6.03542234332425, "grad_norm": 0.6961383621167273, "learning_rate": 7.172860232445085e-06, "loss": 0.0228, "step": 4430 }, { "epoch": 6.036784741144414, "grad_norm": 1.3592355812460086, "learning_rate": 7.168627569182527e-06, "loss": 0.0085, "step": 4431 }, { "epoch": 6.038147138964578, "grad_norm": 0.8003028557797303, "learning_rate": 7.164395457309801e-06, "loss": 0.0083, "step": 4432 }, { "epoch": 6.039509536784741, "grad_norm": 1.3828935723197848, "learning_rate": 7.1601638976510756e-06, "loss": 0.0047, "step": 4433 }, { "epoch": 6.040871934604905, "grad_norm": 0.4581300694966471, "learning_rate": 7.1559328910304216e-06, "loss": 0.008, "step": 4434 }, { "epoch": 6.0422343324250685, "grad_norm": 0.16816526883809488, "learning_rate": 7.151702438271796e-06, "loss": 0.0005, "step": 4435 }, { "epoch": 6.043596730245231, "grad_norm": 1.5248905156558281, "learning_rate": 7.1474725401990465e-06, "loss": 0.0101, "step": 4436 }, { "epoch": 6.044959128065395, "grad_norm": 0.7343821092893142, "learning_rate": 7.143243197635923e-06, "loss": 0.0022, "step": 4437 }, { "epoch": 6.046321525885559, "grad_norm": 0.9092274548641709, "learning_rate": 7.139014411406058e-06, "loss": 0.015, "step": 4438 }, { "epoch": 6.047683923705722, "grad_norm": 1.7550912233502869, "learning_rate": 7.134786182332978e-06, "loss": 0.0357, "step": 4439 }, { "epoch": 6.049046321525886, "grad_norm": 1.087195042885653, "learning_rate": 7.130558511240097e-06, "loss": 0.011, "step": 4440 }, { "epoch": 6.050408719346049, "grad_norm": 0.49824466416116836, "learning_rate": 7.126331398950723e-06, "loss": 0.002, "step": 4441 }, { "epoch": 6.051771117166212, "grad_norm": 0.5667862883385695, "learning_rate": 7.122104846288065e-06, "loss": 0.0011, "step": 4442 }, { "epoch": 6.053133514986376, "grad_norm": 1.3895487840766516, "learning_rate": 7.117878854075209e-06, "loss": 0.0091, "step": 4443 }, { "epoch": 6.05449591280654, "grad_norm": 0.803383951186871, "learning_rate": 7.113653423135136e-06, "loss": 0.0083, "step": 4444 }, { "epoch": 6.055858310626703, "grad_norm": 0.3798510289416586, "learning_rate": 7.109428554290725e-06, "loss": 0.0108, "step": 4445 }, { "epoch": 6.0572207084468666, "grad_norm": 1.4958532912077578, "learning_rate": 7.10520424836473e-06, "loss": 0.0134, "step": 4446 }, { "epoch": 6.05858310626703, "grad_norm": 1.0528187489385534, "learning_rate": 7.100980506179806e-06, "loss": 0.0009, "step": 4447 }, { "epoch": 6.059945504087193, "grad_norm": 2.6250512522562994, "learning_rate": 7.096757328558507e-06, "loss": 0.0165, "step": 4448 }, { "epoch": 6.061307901907357, "grad_norm": 2.679537177440685, "learning_rate": 7.0925347163232595e-06, "loss": 0.0191, "step": 4449 }, { "epoch": 6.062670299727521, "grad_norm": 1.3275159283564177, "learning_rate": 7.088312670296389e-06, "loss": 0.0078, "step": 4450 }, { "epoch": 6.064032697547684, "grad_norm": 1.6276618109627188, "learning_rate": 7.084091191300111e-06, "loss": 0.0072, "step": 4451 }, { "epoch": 6.0653950953678475, "grad_norm": 1.7437110400743032, "learning_rate": 7.0798702801565244e-06, "loss": 0.0163, "step": 4452 }, { "epoch": 6.066757493188011, "grad_norm": 0.28516606471350237, "learning_rate": 7.0756499376876195e-06, "loss": 0.0076, "step": 4453 }, { "epoch": 6.068119891008174, "grad_norm": 1.2808265269521288, "learning_rate": 7.071430164715288e-06, "loss": 0.0097, "step": 4454 }, { "epoch": 6.069482288828338, "grad_norm": 1.5308378836375027, "learning_rate": 7.067210962061296e-06, "loss": 0.0083, "step": 4455 }, { "epoch": 6.070844686648502, "grad_norm": 0.738243740104186, "learning_rate": 7.0629923305473034e-06, "loss": 0.0312, "step": 4456 }, { "epoch": 6.072207084468665, "grad_norm": 1.3028240871149674, "learning_rate": 7.0587742709948615e-06, "loss": 0.0056, "step": 4457 }, { "epoch": 6.073569482288828, "grad_norm": 1.9964700847036143, "learning_rate": 7.054556784225402e-06, "loss": 0.0084, "step": 4458 }, { "epoch": 6.074931880108992, "grad_norm": 1.1899957177035139, "learning_rate": 7.050339871060251e-06, "loss": 0.0113, "step": 4459 }, { "epoch": 6.076294277929155, "grad_norm": 0.8453471870905988, "learning_rate": 7.04612353232063e-06, "loss": 0.0069, "step": 4460 }, { "epoch": 6.077656675749319, "grad_norm": 1.804220527025062, "learning_rate": 7.041907768827636e-06, "loss": 0.0261, "step": 4461 }, { "epoch": 6.079019073569483, "grad_norm": 1.1192267181350515, "learning_rate": 7.03769258140226e-06, "loss": 0.0303, "step": 4462 }, { "epoch": 6.080381471389646, "grad_norm": 1.0170603146776183, "learning_rate": 7.033477970865381e-06, "loss": 0.003, "step": 4463 }, { "epoch": 6.081743869209809, "grad_norm": 2.3722677789998667, "learning_rate": 7.029263938037763e-06, "loss": 0.0126, "step": 4464 }, { "epoch": 6.083106267029973, "grad_norm": 1.3056535574505523, "learning_rate": 7.025050483740057e-06, "loss": 0.0255, "step": 4465 }, { "epoch": 6.084468664850136, "grad_norm": 0.42294533312859695, "learning_rate": 7.02083760879281e-06, "loss": 0.0018, "step": 4466 }, { "epoch": 6.0858310626703, "grad_norm": 1.5134504927661174, "learning_rate": 7.016625314016445e-06, "loss": 0.0037, "step": 4467 }, { "epoch": 6.087193460490464, "grad_norm": 1.0332945029137799, "learning_rate": 7.012413600231278e-06, "loss": 0.0168, "step": 4468 }, { "epoch": 6.0885558583106265, "grad_norm": 0.37630544348664163, "learning_rate": 7.008202468257514e-06, "loss": 0.0017, "step": 4469 }, { "epoch": 6.08991825613079, "grad_norm": 1.5888710926601934, "learning_rate": 7.003991918915232e-06, "loss": 0.0302, "step": 4470 }, { "epoch": 6.091280653950954, "grad_norm": 1.6405261059916858, "learning_rate": 6.999781953024415e-06, "loss": 0.0081, "step": 4471 }, { "epoch": 6.092643051771117, "grad_norm": 2.6756319577585086, "learning_rate": 6.9955725714049225e-06, "loss": 0.0091, "step": 4472 }, { "epoch": 6.094005449591281, "grad_norm": 0.41213501976566586, "learning_rate": 6.991363774876501e-06, "loss": 0.0021, "step": 4473 }, { "epoch": 6.0953678474114446, "grad_norm": 1.7902705807820254, "learning_rate": 6.9871555642587855e-06, "loss": 0.0198, "step": 4474 }, { "epoch": 6.0967302452316074, "grad_norm": 1.1739520720259728, "learning_rate": 6.982947940371294e-06, "loss": 0.0077, "step": 4475 }, { "epoch": 6.098092643051771, "grad_norm": 0.9847409860453935, "learning_rate": 6.978740904033427e-06, "loss": 0.0142, "step": 4476 }, { "epoch": 6.099455040871935, "grad_norm": 1.233697098072117, "learning_rate": 6.974534456064484e-06, "loss": 0.0077, "step": 4477 }, { "epoch": 6.100817438692098, "grad_norm": 1.1744818412210074, "learning_rate": 6.970328597283638e-06, "loss": 0.0101, "step": 4478 }, { "epoch": 6.102179836512262, "grad_norm": 0.40747377699330956, "learning_rate": 6.966123328509947e-06, "loss": 0.0025, "step": 4479 }, { "epoch": 6.1035422343324255, "grad_norm": 1.930877092117, "learning_rate": 6.96191865056236e-06, "loss": 0.0245, "step": 4480 }, { "epoch": 6.104904632152588, "grad_norm": 0.34621368071397857, "learning_rate": 6.957714564259712e-06, "loss": 0.001, "step": 4481 }, { "epoch": 6.106267029972752, "grad_norm": 0.9286042746428876, "learning_rate": 6.953511070420709e-06, "loss": 0.0023, "step": 4482 }, { "epoch": 6.107629427792916, "grad_norm": 0.5307029554314885, "learning_rate": 6.949308169863962e-06, "loss": 0.0017, "step": 4483 }, { "epoch": 6.108991825613079, "grad_norm": 0.418177043034553, "learning_rate": 6.94510586340795e-06, "loss": 0.0006, "step": 4484 }, { "epoch": 6.110354223433243, "grad_norm": 1.721915880099325, "learning_rate": 6.940904151871048e-06, "loss": 0.0146, "step": 4485 }, { "epoch": 6.111716621253406, "grad_norm": 1.1602167229665294, "learning_rate": 6.936703036071506e-06, "loss": 0.0084, "step": 4486 }, { "epoch": 6.113079019073569, "grad_norm": 2.371178376590364, "learning_rate": 6.932502516827462e-06, "loss": 0.0161, "step": 4487 }, { "epoch": 6.114441416893733, "grad_norm": 0.5341449622168055, "learning_rate": 6.928302594956937e-06, "loss": 0.0091, "step": 4488 }, { "epoch": 6.115803814713897, "grad_norm": 0.553838306509679, "learning_rate": 6.924103271277838e-06, "loss": 0.0012, "step": 4489 }, { "epoch": 6.11716621253406, "grad_norm": 1.4256301018909947, "learning_rate": 6.919904546607954e-06, "loss": 0.0082, "step": 4490 }, { "epoch": 6.118528610354224, "grad_norm": 0.4867298942502663, "learning_rate": 6.915706421764954e-06, "loss": 0.0054, "step": 4491 }, { "epoch": 6.1198910081743865, "grad_norm": 0.41346446173273776, "learning_rate": 6.911508897566397e-06, "loss": 0.0082, "step": 4492 }, { "epoch": 6.12125340599455, "grad_norm": 0.8702413945818679, "learning_rate": 6.907311974829716e-06, "loss": 0.0215, "step": 4493 }, { "epoch": 6.122615803814714, "grad_norm": 2.295978674497289, "learning_rate": 6.903115654372239e-06, "loss": 0.0245, "step": 4494 }, { "epoch": 6.123978201634877, "grad_norm": 1.128806424843795, "learning_rate": 6.8989199370111655e-06, "loss": 0.0247, "step": 4495 }, { "epoch": 6.125340599455041, "grad_norm": 1.7200539481830293, "learning_rate": 6.894724823563584e-06, "loss": 0.0029, "step": 4496 }, { "epoch": 6.1267029972752045, "grad_norm": 2.933300287560963, "learning_rate": 6.8905303148464595e-06, "loss": 0.022, "step": 4497 }, { "epoch": 6.128065395095367, "grad_norm": 0.4539109812611848, "learning_rate": 6.886336411676646e-06, "loss": 0.0075, "step": 4498 }, { "epoch": 6.129427792915531, "grad_norm": 2.1769987362485246, "learning_rate": 6.882143114870876e-06, "loss": 0.0013, "step": 4499 }, { "epoch": 6.130790190735695, "grad_norm": 1.6647309495013165, "learning_rate": 6.877950425245765e-06, "loss": 0.0113, "step": 4500 }, { "epoch": 6.132152588555858, "grad_norm": 0.4274631082039932, "learning_rate": 6.87375834361781e-06, "loss": 0.0191, "step": 4501 }, { "epoch": 6.133514986376022, "grad_norm": 0.7140295745247717, "learning_rate": 6.869566870803389e-06, "loss": 0.0018, "step": 4502 }, { "epoch": 6.1348773841961854, "grad_norm": 1.4593529060282906, "learning_rate": 6.865376007618761e-06, "loss": 0.0029, "step": 4503 }, { "epoch": 6.136239782016348, "grad_norm": 1.3927558023891549, "learning_rate": 6.861185754880068e-06, "loss": 0.018, "step": 4504 }, { "epoch": 6.137602179836512, "grad_norm": 0.7078796995152383, "learning_rate": 6.85699611340333e-06, "loss": 0.0195, "step": 4505 }, { "epoch": 6.138964577656676, "grad_norm": 1.9868001892838518, "learning_rate": 6.852807084004453e-06, "loss": 0.0166, "step": 4506 }, { "epoch": 6.140326975476839, "grad_norm": 1.1549936597305082, "learning_rate": 6.848618667499222e-06, "loss": 0.0093, "step": 4507 }, { "epoch": 6.141689373297003, "grad_norm": 0.4330711143845998, "learning_rate": 6.844430864703298e-06, "loss": 0.0087, "step": 4508 }, { "epoch": 6.143051771117166, "grad_norm": 1.0526853258790347, "learning_rate": 6.840243676432228e-06, "loss": 0.0044, "step": 4509 }, { "epoch": 6.144414168937329, "grad_norm": 1.4144413648430363, "learning_rate": 6.836057103501436e-06, "loss": 0.0102, "step": 4510 }, { "epoch": 6.145776566757493, "grad_norm": 1.1351039776701268, "learning_rate": 6.831871146726228e-06, "loss": 0.0116, "step": 4511 }, { "epoch": 6.147138964577657, "grad_norm": 2.000322865077643, "learning_rate": 6.827685806921792e-06, "loss": 0.0127, "step": 4512 }, { "epoch": 6.14850136239782, "grad_norm": 0.5518628265533143, "learning_rate": 6.823501084903192e-06, "loss": 0.0011, "step": 4513 }, { "epoch": 6.1498637602179835, "grad_norm": 0.7201185139529808, "learning_rate": 6.8193169814853725e-06, "loss": 0.0095, "step": 4514 }, { "epoch": 6.151226158038147, "grad_norm": 1.508802753065667, "learning_rate": 6.815133497483157e-06, "loss": 0.013, "step": 4515 }, { "epoch": 6.15258855585831, "grad_norm": 3.215904579607178, "learning_rate": 6.810950633711253e-06, "loss": 0.0133, "step": 4516 }, { "epoch": 6.153950953678474, "grad_norm": 0.33176670690394444, "learning_rate": 6.8067683909842376e-06, "loss": 0.0078, "step": 4517 }, { "epoch": 6.155313351498638, "grad_norm": 0.6447529590317951, "learning_rate": 6.802586770116581e-06, "loss": 0.0083, "step": 4518 }, { "epoch": 6.156675749318801, "grad_norm": 1.9128262785297896, "learning_rate": 6.79840577192262e-06, "loss": 0.0027, "step": 4519 }, { "epoch": 6.1580381471389645, "grad_norm": 0.6627615678037372, "learning_rate": 6.794225397216575e-06, "loss": 0.0083, "step": 4520 }, { "epoch": 6.159400544959128, "grad_norm": 1.289538969800861, "learning_rate": 6.790045646812544e-06, "loss": 0.0143, "step": 4521 }, { "epoch": 6.160762942779291, "grad_norm": 1.3138914802318964, "learning_rate": 6.7858665215245045e-06, "loss": 0.0081, "step": 4522 }, { "epoch": 6.162125340599455, "grad_norm": 0.8305813469930011, "learning_rate": 6.781688022166312e-06, "loss": 0.0009, "step": 4523 }, { "epoch": 6.163487738419619, "grad_norm": 1.8798183095496184, "learning_rate": 6.777510149551701e-06, "loss": 0.0219, "step": 4524 }, { "epoch": 6.164850136239782, "grad_norm": 0.6704222884771553, "learning_rate": 6.77333290449428e-06, "loss": 0.0319, "step": 4525 }, { "epoch": 6.166212534059945, "grad_norm": 3.274548126522056, "learning_rate": 6.769156287807539e-06, "loss": 0.0036, "step": 4526 }, { "epoch": 6.167574931880109, "grad_norm": 1.977501644595325, "learning_rate": 6.764980300304846e-06, "loss": 0.0078, "step": 4527 }, { "epoch": 6.168937329700272, "grad_norm": 2.8295782441599147, "learning_rate": 6.7608049427994425e-06, "loss": 0.0062, "step": 4528 }, { "epoch": 6.170299727520436, "grad_norm": 2.5911662789638474, "learning_rate": 6.756630216104454e-06, "loss": 0.002, "step": 4529 }, { "epoch": 6.1716621253406, "grad_norm": 1.8128099715456296, "learning_rate": 6.752456121032876e-06, "loss": 0.0049, "step": 4530 }, { "epoch": 6.1730245231607626, "grad_norm": 1.3368579703973495, "learning_rate": 6.748282658397584e-06, "loss": 0.0015, "step": 4531 }, { "epoch": 6.174386920980926, "grad_norm": 2.7334577373836186, "learning_rate": 6.744109829011333e-06, "loss": 0.0024, "step": 4532 }, { "epoch": 6.17574931880109, "grad_norm": 0.9201679606914559, "learning_rate": 6.73993763368675e-06, "loss": 0.0082, "step": 4533 }, { "epoch": 6.177111716621253, "grad_norm": 1.1644701952676062, "learning_rate": 6.735766073236338e-06, "loss": 0.0071, "step": 4534 }, { "epoch": 6.178474114441417, "grad_norm": 2.7183617167256204, "learning_rate": 6.731595148472485e-06, "loss": 0.0336, "step": 4535 }, { "epoch": 6.179836512261581, "grad_norm": 1.3824785187556983, "learning_rate": 6.727424860207445e-06, "loss": 0.0019, "step": 4536 }, { "epoch": 6.1811989100817435, "grad_norm": 0.4319673556277465, "learning_rate": 6.723255209253355e-06, "loss": 0.0077, "step": 4537 }, { "epoch": 6.182561307901907, "grad_norm": 1.6488062332239137, "learning_rate": 6.719086196422224e-06, "loss": 0.0011, "step": 4538 }, { "epoch": 6.183923705722071, "grad_norm": 1.7766784783043954, "learning_rate": 6.714917822525937e-06, "loss": 0.0153, "step": 4539 }, { "epoch": 6.185286103542234, "grad_norm": 0.7102884730319912, "learning_rate": 6.710750088376253e-06, "loss": 0.0011, "step": 4540 }, { "epoch": 6.186648501362398, "grad_norm": 1.472303260587953, "learning_rate": 6.706582994784815e-06, "loss": 0.0016, "step": 4541 }, { "epoch": 6.1880108991825615, "grad_norm": 1.394998500120704, "learning_rate": 6.702416542563132e-06, "loss": 0.0081, "step": 4542 }, { "epoch": 6.189373297002724, "grad_norm": 2.2502189136707065, "learning_rate": 6.698250732522591e-06, "loss": 0.0103, "step": 4543 }, { "epoch": 6.190735694822888, "grad_norm": 1.3195588102005804, "learning_rate": 6.694085565474453e-06, "loss": 0.0011, "step": 4544 }, { "epoch": 6.192098092643052, "grad_norm": 1.6708880550036267, "learning_rate": 6.689921042229858e-06, "loss": 0.0156, "step": 4545 }, { "epoch": 6.193460490463215, "grad_norm": 1.7184864638524109, "learning_rate": 6.685757163599813e-06, "loss": 0.0028, "step": 4546 }, { "epoch": 6.194822888283379, "grad_norm": 1.4046285203690894, "learning_rate": 6.681593930395209e-06, "loss": 0.003, "step": 4547 }, { "epoch": 6.1961852861035425, "grad_norm": 2.625338604960775, "learning_rate": 6.6774313434268035e-06, "loss": 0.0102, "step": 4548 }, { "epoch": 6.197547683923705, "grad_norm": 1.0685020086783446, "learning_rate": 6.6732694035052325e-06, "loss": 0.0014, "step": 4549 }, { "epoch": 6.198910081743869, "grad_norm": 0.5235821989481195, "learning_rate": 6.669108111441004e-06, "loss": 0.0039, "step": 4550 }, { "epoch": 6.200272479564033, "grad_norm": 1.0660968994978353, "learning_rate": 6.664947468044497e-06, "loss": 0.0121, "step": 4551 }, { "epoch": 6.201634877384196, "grad_norm": 2.070756417003134, "learning_rate": 6.660787474125972e-06, "loss": 0.0148, "step": 4552 }, { "epoch": 6.20299727520436, "grad_norm": 0.9895416504688539, "learning_rate": 6.6566281304955584e-06, "loss": 0.0165, "step": 4553 }, { "epoch": 6.204359673024523, "grad_norm": 1.8990752233150532, "learning_rate": 6.652469437963256e-06, "loss": 0.014, "step": 4554 }, { "epoch": 6.205722070844686, "grad_norm": 2.552053260076985, "learning_rate": 6.6483113973389446e-06, "loss": 0.0039, "step": 4555 }, { "epoch": 6.20708446866485, "grad_norm": 1.4087617562878691, "learning_rate": 6.644154009432369e-06, "loss": 0.0015, "step": 4556 }, { "epoch": 6.208446866485014, "grad_norm": 3.267296170569019, "learning_rate": 6.639997275053152e-06, "loss": 0.0403, "step": 4557 }, { "epoch": 6.209809264305177, "grad_norm": 2.0758162431769462, "learning_rate": 6.6358411950107926e-06, "loss": 0.0232, "step": 4558 }, { "epoch": 6.2111716621253406, "grad_norm": 1.154824354163917, "learning_rate": 6.631685770114655e-06, "loss": 0.0022, "step": 4559 }, { "epoch": 6.212534059945504, "grad_norm": 0.608097848409334, "learning_rate": 6.627531001173977e-06, "loss": 0.0034, "step": 4560 }, { "epoch": 6.213896457765667, "grad_norm": 2.723496240684422, "learning_rate": 6.623376888997874e-06, "loss": 0.0234, "step": 4561 }, { "epoch": 6.215258855585831, "grad_norm": 0.561244698894135, "learning_rate": 6.61922343439533e-06, "loss": 0.002, "step": 4562 }, { "epoch": 6.216621253405995, "grad_norm": 1.189354949918982, "learning_rate": 6.615070638175196e-06, "loss": 0.0139, "step": 4563 }, { "epoch": 6.217983651226158, "grad_norm": 2.138868167243667, "learning_rate": 6.610918501146206e-06, "loss": 0.0185, "step": 4564 }, { "epoch": 6.2193460490463215, "grad_norm": 0.6643666525789853, "learning_rate": 6.606767024116958e-06, "loss": 0.0007, "step": 4565 }, { "epoch": 6.220708446866485, "grad_norm": 1.9869273377261107, "learning_rate": 6.602616207895922e-06, "loss": 0.0038, "step": 4566 }, { "epoch": 6.222070844686648, "grad_norm": 1.8929515642897825, "learning_rate": 6.5984660532914395e-06, "loss": 0.0166, "step": 4567 }, { "epoch": 6.223433242506812, "grad_norm": 1.9827230302732408, "learning_rate": 6.5943165611117244e-06, "loss": 0.0163, "step": 4568 }, { "epoch": 6.224795640326976, "grad_norm": 2.701628828324632, "learning_rate": 6.590167732164858e-06, "loss": 0.0097, "step": 4569 }, { "epoch": 6.226158038147139, "grad_norm": 2.2151433723930873, "learning_rate": 6.586019567258801e-06, "loss": 0.007, "step": 4570 }, { "epoch": 6.227520435967302, "grad_norm": 0.9337511300201403, "learning_rate": 6.581872067201378e-06, "loss": 0.0015, "step": 4571 }, { "epoch": 6.228882833787466, "grad_norm": 1.2116684198288656, "learning_rate": 6.577725232800285e-06, "loss": 0.0013, "step": 4572 }, { "epoch": 6.230245231607629, "grad_norm": 1.6161137220472508, "learning_rate": 6.573579064863087e-06, "loss": 0.0054, "step": 4573 }, { "epoch": 6.231607629427793, "grad_norm": 1.396765865604056, "learning_rate": 6.569433564197222e-06, "loss": 0.0009, "step": 4574 }, { "epoch": 6.232970027247957, "grad_norm": 1.0456746630386233, "learning_rate": 6.565288731609995e-06, "loss": 0.0062, "step": 4575 }, { "epoch": 6.23433242506812, "grad_norm": 1.6271370452732519, "learning_rate": 6.561144567908589e-06, "loss": 0.0036, "step": 4576 }, { "epoch": 6.235694822888283, "grad_norm": 1.3617258422337042, "learning_rate": 6.5570010739000445e-06, "loss": 0.0164, "step": 4577 }, { "epoch": 6.237057220708447, "grad_norm": 1.4323395931829, "learning_rate": 6.5528582503912805e-06, "loss": 0.0322, "step": 4578 }, { "epoch": 6.23841961852861, "grad_norm": 1.2953869252975523, "learning_rate": 6.548716098189082e-06, "loss": 0.0148, "step": 4579 }, { "epoch": 6.239782016348774, "grad_norm": 1.2171678029559287, "learning_rate": 6.5445746181001015e-06, "loss": 0.0088, "step": 4580 }, { "epoch": 6.241144414168938, "grad_norm": 1.7664831414696769, "learning_rate": 6.540433810930871e-06, "loss": 0.0041, "step": 4581 }, { "epoch": 6.2425068119891005, "grad_norm": 1.0153773177169472, "learning_rate": 6.536293677487775e-06, "loss": 0.0072, "step": 4582 }, { "epoch": 6.243869209809264, "grad_norm": 1.6034863917837707, "learning_rate": 6.532154218577081e-06, "loss": 0.0043, "step": 4583 }, { "epoch": 6.245231607629428, "grad_norm": 0.7590283510140068, "learning_rate": 6.5280154350049155e-06, "loss": 0.0109, "step": 4584 }, { "epoch": 6.246594005449591, "grad_norm": 0.6482168768794825, "learning_rate": 6.523877327577278e-06, "loss": 0.0088, "step": 4585 }, { "epoch": 6.247956403269755, "grad_norm": 0.21351432862843459, "learning_rate": 6.519739897100034e-06, "loss": 0.0077, "step": 4586 }, { "epoch": 6.2493188010899186, "grad_norm": 1.1417772521215594, "learning_rate": 6.515603144378928e-06, "loss": 0.002, "step": 4587 }, { "epoch": 6.2506811989100814, "grad_norm": 0.36244756882819085, "learning_rate": 6.511467070219555e-06, "loss": 0.0008, "step": 4588 }, { "epoch": 6.252043596730245, "grad_norm": 1.7285633060398637, "learning_rate": 6.507331675427388e-06, "loss": 0.0075, "step": 4589 }, { "epoch": 6.253405994550409, "grad_norm": 0.7366039032290885, "learning_rate": 6.503196960807766e-06, "loss": 0.0105, "step": 4590 }, { "epoch": 6.254768392370572, "grad_norm": 0.7516898841114431, "learning_rate": 6.499062927165896e-06, "loss": 0.0252, "step": 4591 }, { "epoch": 6.256130790190736, "grad_norm": 1.0475417270073417, "learning_rate": 6.494929575306848e-06, "loss": 0.0324, "step": 4592 }, { "epoch": 6.2574931880108995, "grad_norm": 1.3687479910820155, "learning_rate": 6.490796906035574e-06, "loss": 0.0049, "step": 4593 }, { "epoch": 6.258855585831062, "grad_norm": 1.3122179923024735, "learning_rate": 6.486664920156872e-06, "loss": 0.0048, "step": 4594 }, { "epoch": 6.260217983651226, "grad_norm": 1.0482767228857572, "learning_rate": 6.482533618475422e-06, "loss": 0.001, "step": 4595 }, { "epoch": 6.26158038147139, "grad_norm": 0.565995204273737, "learning_rate": 6.478403001795762e-06, "loss": 0.0016, "step": 4596 }, { "epoch": 6.262942779291553, "grad_norm": 0.378705832278089, "learning_rate": 6.474273070922304e-06, "loss": 0.0011, "step": 4597 }, { "epoch": 6.264305177111717, "grad_norm": 2.3600220439651, "learning_rate": 6.470143826659317e-06, "loss": 0.0209, "step": 4598 }, { "epoch": 6.26566757493188, "grad_norm": 0.7857790841666421, "learning_rate": 6.466015269810953e-06, "loss": 0.0054, "step": 4599 }, { "epoch": 6.267029972752043, "grad_norm": 1.4866053919974804, "learning_rate": 6.461887401181213e-06, "loss": 0.0183, "step": 4600 }, { "epoch": 6.268392370572207, "grad_norm": 2.4170754408618658, "learning_rate": 6.457760221573969e-06, "loss": 0.0044, "step": 4601 }, { "epoch": 6.269754768392371, "grad_norm": 0.5461016154502721, "learning_rate": 6.453633731792961e-06, "loss": 0.0084, "step": 4602 }, { "epoch": 6.271117166212534, "grad_norm": 0.7757481992514641, "learning_rate": 6.449507932641796e-06, "loss": 0.0024, "step": 4603 }, { "epoch": 6.272479564032698, "grad_norm": 1.0674809136868055, "learning_rate": 6.445382824923938e-06, "loss": 0.0034, "step": 4604 }, { "epoch": 6.273841961852861, "grad_norm": 1.1171970587885263, "learning_rate": 6.441258409442735e-06, "loss": 0.0078, "step": 4605 }, { "epoch": 6.275204359673024, "grad_norm": 0.17662102881163197, "learning_rate": 6.437134687001376e-06, "loss": 0.0005, "step": 4606 }, { "epoch": 6.276566757493188, "grad_norm": 0.758340551071797, "learning_rate": 6.4330116584029325e-06, "loss": 0.0044, "step": 4607 }, { "epoch": 6.277929155313352, "grad_norm": 0.8215735400599399, "learning_rate": 6.428889324450334e-06, "loss": 0.0007, "step": 4608 }, { "epoch": 6.279291553133515, "grad_norm": 1.5088133526207044, "learning_rate": 6.4247676859463716e-06, "loss": 0.0067, "step": 4609 }, { "epoch": 6.2806539509536785, "grad_norm": 1.131870136681447, "learning_rate": 6.420646743693715e-06, "loss": 0.0048, "step": 4610 }, { "epoch": 6.282016348773842, "grad_norm": 1.6077891043199104, "learning_rate": 6.416526498494882e-06, "loss": 0.0024, "step": 4611 }, { "epoch": 6.283378746594005, "grad_norm": 1.1683316751590398, "learning_rate": 6.412406951152266e-06, "loss": 0.0058, "step": 4612 }, { "epoch": 6.284741144414169, "grad_norm": 0.13664115544713298, "learning_rate": 6.408288102468114e-06, "loss": 0.0025, "step": 4613 }, { "epoch": 6.286103542234333, "grad_norm": 2.657295603114857, "learning_rate": 6.404169953244545e-06, "loss": 0.0272, "step": 4614 }, { "epoch": 6.287465940054496, "grad_norm": 0.35632959384283464, "learning_rate": 6.400052504283536e-06, "loss": 0.0014, "step": 4615 }, { "epoch": 6.2888283378746594, "grad_norm": 0.7753424526453045, "learning_rate": 6.39593575638694e-06, "loss": 0.0102, "step": 4616 }, { "epoch": 6.290190735694823, "grad_norm": 3.027823602086262, "learning_rate": 6.391819710356457e-06, "loss": 0.0163, "step": 4617 }, { "epoch": 6.291553133514986, "grad_norm": 1.520121517291573, "learning_rate": 6.387704366993665e-06, "loss": 0.0162, "step": 4618 }, { "epoch": 6.29291553133515, "grad_norm": 0.5134878306919338, "learning_rate": 6.383589727099992e-06, "loss": 0.0037, "step": 4619 }, { "epoch": 6.294277929155314, "grad_norm": 0.9686831609290286, "learning_rate": 6.379475791476736e-06, "loss": 0.0013, "step": 4620 }, { "epoch": 6.295640326975477, "grad_norm": 1.340940739059839, "learning_rate": 6.375362560925054e-06, "loss": 0.0064, "step": 4621 }, { "epoch": 6.29700272479564, "grad_norm": 2.1965063410778014, "learning_rate": 6.371250036245977e-06, "loss": 0.0263, "step": 4622 }, { "epoch": 6.298365122615804, "grad_norm": 1.8498150279228267, "learning_rate": 6.367138218240385e-06, "loss": 0.0096, "step": 4623 }, { "epoch": 6.299727520435967, "grad_norm": 0.3527259234126053, "learning_rate": 6.363027107709028e-06, "loss": 0.0006, "step": 4624 }, { "epoch": 6.301089918256131, "grad_norm": 1.4724398389945352, "learning_rate": 6.358916705452514e-06, "loss": 0.0094, "step": 4625 }, { "epoch": 6.302452316076295, "grad_norm": 1.147641788453032, "learning_rate": 6.3548070122713124e-06, "loss": 0.0094, "step": 4626 }, { "epoch": 6.3038147138964575, "grad_norm": 0.3836427506411667, "learning_rate": 6.350698028965756e-06, "loss": 0.0149, "step": 4627 }, { "epoch": 6.305177111716621, "grad_norm": 0.6780981841282194, "learning_rate": 6.34658975633605e-06, "loss": 0.001, "step": 4628 }, { "epoch": 6.306539509536785, "grad_norm": 0.21501934515150498, "learning_rate": 6.342482195182245e-06, "loss": 0.0006, "step": 4629 }, { "epoch": 6.307901907356948, "grad_norm": 1.2382893257942795, "learning_rate": 6.33837534630426e-06, "loss": 0.0161, "step": 4630 }, { "epoch": 6.309264305177112, "grad_norm": 1.681624841399813, "learning_rate": 6.334269210501876e-06, "loss": 0.0076, "step": 4631 }, { "epoch": 6.310626702997276, "grad_norm": 0.8451234318518309, "learning_rate": 6.330163788574726e-06, "loss": 0.0015, "step": 4632 }, { "epoch": 6.3119891008174385, "grad_norm": 2.0504493480463495, "learning_rate": 6.326059081322324e-06, "loss": 0.0068, "step": 4633 }, { "epoch": 6.313351498637602, "grad_norm": 0.31058016724553905, "learning_rate": 6.321955089544029e-06, "loss": 0.0079, "step": 4634 }, { "epoch": 6.314713896457766, "grad_norm": 0.20719072410247985, "learning_rate": 6.317851814039062e-06, "loss": 0.0006, "step": 4635 }, { "epoch": 6.316076294277929, "grad_norm": 2.705888888691183, "learning_rate": 6.313749255606509e-06, "loss": 0.0034, "step": 4636 }, { "epoch": 6.317438692098093, "grad_norm": 1.4955446795542426, "learning_rate": 6.309647415045315e-06, "loss": 0.0034, "step": 4637 }, { "epoch": 6.3188010899182565, "grad_norm": 2.070531056091895, "learning_rate": 6.305546293154276e-06, "loss": 0.0142, "step": 4638 }, { "epoch": 6.320163487738419, "grad_norm": 0.8172086595325014, "learning_rate": 6.30144589073207e-06, "loss": 0.0074, "step": 4639 }, { "epoch": 6.321525885558583, "grad_norm": 0.4855386959214757, "learning_rate": 6.297346208577213e-06, "loss": 0.0008, "step": 4640 }, { "epoch": 6.322888283378747, "grad_norm": 0.7628079735852713, "learning_rate": 6.2932472474880935e-06, "loss": 0.0132, "step": 4641 }, { "epoch": 6.32425068119891, "grad_norm": 0.7756506668286105, "learning_rate": 6.289149008262953e-06, "loss": 0.0074, "step": 4642 }, { "epoch": 6.325613079019074, "grad_norm": 0.6509686788777602, "learning_rate": 6.285051491699896e-06, "loss": 0.001, "step": 4643 }, { "epoch": 6.3269754768392374, "grad_norm": 0.5149044113162713, "learning_rate": 6.280954698596877e-06, "loss": 0.0085, "step": 4644 }, { "epoch": 6.3283378746594, "grad_norm": 0.1855289776955832, "learning_rate": 6.27685862975173e-06, "loss": 0.0008, "step": 4645 }, { "epoch": 6.329700272479564, "grad_norm": 1.2800272501833119, "learning_rate": 6.272763285962129e-06, "loss": 0.004, "step": 4646 }, { "epoch": 6.331062670299728, "grad_norm": 0.3953691623392929, "learning_rate": 6.268668668025615e-06, "loss": 0.0091, "step": 4647 }, { "epoch": 6.332425068119891, "grad_norm": 0.8320050400114456, "learning_rate": 6.264574776739588e-06, "loss": 0.0338, "step": 4648 }, { "epoch": 6.333787465940055, "grad_norm": 2.590248847157165, "learning_rate": 6.260481612901299e-06, "loss": 0.0299, "step": 4649 }, { "epoch": 6.335149863760218, "grad_norm": 1.7938014590793068, "learning_rate": 6.2563891773078625e-06, "loss": 0.01, "step": 4650 }, { "epoch": 6.336512261580381, "grad_norm": 2.5403880527178977, "learning_rate": 6.252297470756258e-06, "loss": 0.0165, "step": 4651 }, { "epoch": 6.337874659400545, "grad_norm": 0.5309285022393476, "learning_rate": 6.248206494043313e-06, "loss": 0.0006, "step": 4652 }, { "epoch": 6.339237057220709, "grad_norm": 0.4902151891905532, "learning_rate": 6.244116247965717e-06, "loss": 0.001, "step": 4653 }, { "epoch": 6.340599455040872, "grad_norm": 5.947869143086109, "learning_rate": 6.240026733320017e-06, "loss": 0.0079, "step": 4654 }, { "epoch": 6.3419618528610355, "grad_norm": 1.5191826205074215, "learning_rate": 6.235937950902615e-06, "loss": 0.0269, "step": 4655 }, { "epoch": 6.343324250681199, "grad_norm": 0.5714473552629813, "learning_rate": 6.231849901509771e-06, "loss": 0.008, "step": 4656 }, { "epoch": 6.344686648501362, "grad_norm": 0.5451251730270883, "learning_rate": 6.227762585937609e-06, "loss": 0.0017, "step": 4657 }, { "epoch": 6.346049046321526, "grad_norm": 1.5086824108339203, "learning_rate": 6.223676004982104e-06, "loss": 0.0031, "step": 4658 }, { "epoch": 6.34741144414169, "grad_norm": 1.4731052584075697, "learning_rate": 6.219590159439088e-06, "loss": 0.0101, "step": 4659 }, { "epoch": 6.348773841961853, "grad_norm": 0.48429996170796746, "learning_rate": 6.215505050104248e-06, "loss": 0.0152, "step": 4660 }, { "epoch": 6.3501362397820165, "grad_norm": 1.7472238381381993, "learning_rate": 6.211420677773131e-06, "loss": 0.0315, "step": 4661 }, { "epoch": 6.35149863760218, "grad_norm": 0.9734787262532772, "learning_rate": 6.207337043241144e-06, "loss": 0.025, "step": 4662 }, { "epoch": 6.352861035422343, "grad_norm": 2.547978223293763, "learning_rate": 6.203254147303542e-06, "loss": 0.0205, "step": 4663 }, { "epoch": 6.354223433242507, "grad_norm": 2.2106360742865916, "learning_rate": 6.199171990755442e-06, "loss": 0.0228, "step": 4664 }, { "epoch": 6.355585831062671, "grad_norm": 0.4806306733593375, "learning_rate": 6.1950905743918155e-06, "loss": 0.001, "step": 4665 }, { "epoch": 6.356948228882834, "grad_norm": 1.1648517952461863, "learning_rate": 6.191009899007487e-06, "loss": 0.0009, "step": 4666 }, { "epoch": 6.358310626702997, "grad_norm": 1.1407147234769393, "learning_rate": 6.186929965397139e-06, "loss": 0.0016, "step": 4667 }, { "epoch": 6.359673024523161, "grad_norm": 0.35645177526780053, "learning_rate": 6.182850774355314e-06, "loss": 0.0008, "step": 4668 }, { "epoch": 6.361035422343324, "grad_norm": 3.6328184057198833, "learning_rate": 6.178772326676403e-06, "loss": 0.0173, "step": 4669 }, { "epoch": 6.362397820163488, "grad_norm": 0.5633625746597677, "learning_rate": 6.174694623154658e-06, "loss": 0.0017, "step": 4670 }, { "epoch": 6.363760217983652, "grad_norm": 1.1812383904641484, "learning_rate": 6.170617664584178e-06, "loss": 0.0102, "step": 4671 }, { "epoch": 6.3651226158038146, "grad_norm": 0.8747419817651354, "learning_rate": 6.166541451758925e-06, "loss": 0.0148, "step": 4672 }, { "epoch": 6.366485013623978, "grad_norm": 0.6138692765117861, "learning_rate": 6.16246598547271e-06, "loss": 0.002, "step": 4673 }, { "epoch": 6.367847411444142, "grad_norm": 1.0461234146027119, "learning_rate": 6.158391266519205e-06, "loss": 0.0221, "step": 4674 }, { "epoch": 6.369209809264305, "grad_norm": 0.9920828087300518, "learning_rate": 6.154317295691931e-06, "loss": 0.0011, "step": 4675 }, { "epoch": 6.370572207084469, "grad_norm": 2.4580163625454006, "learning_rate": 6.150244073784266e-06, "loss": 0.0285, "step": 4676 }, { "epoch": 6.371934604904633, "grad_norm": 0.8450232372643737, "learning_rate": 6.146171601589441e-06, "loss": 0.0028, "step": 4677 }, { "epoch": 6.3732970027247955, "grad_norm": 0.5478896431062041, "learning_rate": 6.14209987990054e-06, "loss": 0.0019, "step": 4678 }, { "epoch": 6.374659400544959, "grad_norm": 1.5210709380341494, "learning_rate": 6.138028909510503e-06, "loss": 0.0208, "step": 4679 }, { "epoch": 6.376021798365123, "grad_norm": 1.0733029373428913, "learning_rate": 6.133958691212123e-06, "loss": 0.0187, "step": 4680 }, { "epoch": 6.377384196185286, "grad_norm": 0.85465279087949, "learning_rate": 6.129889225798045e-06, "loss": 0.0026, "step": 4681 }, { "epoch": 6.37874659400545, "grad_norm": 1.0519894640244953, "learning_rate": 6.125820514060772e-06, "loss": 0.0093, "step": 4682 }, { "epoch": 6.3801089918256135, "grad_norm": 0.5673330048227525, "learning_rate": 6.121752556792655e-06, "loss": 0.001, "step": 4683 }, { "epoch": 6.381471389645776, "grad_norm": 1.653881277250104, "learning_rate": 6.117685354785898e-06, "loss": 0.0236, "step": 4684 }, { "epoch": 6.38283378746594, "grad_norm": 1.4065411031466797, "learning_rate": 6.113618908832561e-06, "loss": 0.0053, "step": 4685 }, { "epoch": 6.384196185286104, "grad_norm": 0.510984769950932, "learning_rate": 6.109553219724558e-06, "loss": 0.0074, "step": 4686 }, { "epoch": 6.385558583106267, "grad_norm": 0.2648291562894614, "learning_rate": 6.105488288253652e-06, "loss": 0.0082, "step": 4687 }, { "epoch": 6.386920980926431, "grad_norm": 2.038342656195592, "learning_rate": 6.1014241152114584e-06, "loss": 0.0029, "step": 4688 }, { "epoch": 6.3882833787465945, "grad_norm": 1.4932032155419936, "learning_rate": 6.097360701389448e-06, "loss": 0.0109, "step": 4689 }, { "epoch": 6.389645776566757, "grad_norm": 0.8682160929651035, "learning_rate": 6.093298047578939e-06, "loss": 0.0008, "step": 4690 }, { "epoch": 6.391008174386921, "grad_norm": 1.268354138088723, "learning_rate": 6.08923615457111e-06, "loss": 0.0077, "step": 4691 }, { "epoch": 6.392370572207085, "grad_norm": 1.0009929161497055, "learning_rate": 6.085175023156983e-06, "loss": 0.019, "step": 4692 }, { "epoch": 6.393732970027248, "grad_norm": 1.7388691105303216, "learning_rate": 6.081114654127434e-06, "loss": 0.0124, "step": 4693 }, { "epoch": 6.395095367847412, "grad_norm": 1.7369479830585186, "learning_rate": 6.077055048273193e-06, "loss": 0.0156, "step": 4694 }, { "epoch": 6.396457765667575, "grad_norm": 0.2932345780018971, "learning_rate": 6.072996206384837e-06, "loss": 0.0011, "step": 4695 }, { "epoch": 6.397820163487738, "grad_norm": 1.2687038805452244, "learning_rate": 6.068938129252797e-06, "loss": 0.0031, "step": 4696 }, { "epoch": 6.399182561307902, "grad_norm": 4.773372550189394, "learning_rate": 6.064880817667359e-06, "loss": 0.0228, "step": 4697 }, { "epoch": 6.400544959128065, "grad_norm": 1.080544761541547, "learning_rate": 6.060824272418652e-06, "loss": 0.0085, "step": 4698 }, { "epoch": 6.401907356948229, "grad_norm": 1.7625347472961646, "learning_rate": 6.056768494296663e-06, "loss": 0.0027, "step": 4699 }, { "epoch": 6.4032697547683926, "grad_norm": 1.0558730214964789, "learning_rate": 6.052713484091223e-06, "loss": 0.0085, "step": 4700 }, { "epoch": 6.4046321525885554, "grad_norm": 1.8530531567665138, "learning_rate": 6.048659242592016e-06, "loss": 0.0343, "step": 4701 }, { "epoch": 6.405994550408719, "grad_norm": 2.2078690680160697, "learning_rate": 6.0446057705885796e-06, "loss": 0.016, "step": 4702 }, { "epoch": 6.407356948228883, "grad_norm": 1.181855100014907, "learning_rate": 6.040553068870298e-06, "loss": 0.0012, "step": 4703 }, { "epoch": 6.408719346049046, "grad_norm": 1.0992327313070462, "learning_rate": 6.036501138226407e-06, "loss": 0.0169, "step": 4704 }, { "epoch": 6.41008174386921, "grad_norm": 3.3424525192312355, "learning_rate": 6.032449979445991e-06, "loss": 0.0088, "step": 4705 }, { "epoch": 6.4114441416893735, "grad_norm": 1.014578179891552, "learning_rate": 6.028399593317984e-06, "loss": 0.017, "step": 4706 }, { "epoch": 6.412806539509536, "grad_norm": 1.0804509403531513, "learning_rate": 6.024349980631172e-06, "loss": 0.0149, "step": 4707 }, { "epoch": 6.4141689373297, "grad_norm": 1.5127246877544236, "learning_rate": 6.020301142174183e-06, "loss": 0.0273, "step": 4708 }, { "epoch": 6.415531335149864, "grad_norm": 0.7137956802076316, "learning_rate": 6.016253078735508e-06, "loss": 0.0098, "step": 4709 }, { "epoch": 6.416893732970027, "grad_norm": 2.0646393516309303, "learning_rate": 6.012205791103474e-06, "loss": 0.0119, "step": 4710 }, { "epoch": 6.418256130790191, "grad_norm": 0.298869949954401, "learning_rate": 6.008159280066262e-06, "loss": 0.0087, "step": 4711 }, { "epoch": 6.419618528610354, "grad_norm": 0.49109946807485155, "learning_rate": 6.0041135464119024e-06, "loss": 0.0012, "step": 4712 }, { "epoch": 6.420980926430517, "grad_norm": 1.1454136681648053, "learning_rate": 6.000068590928272e-06, "loss": 0.0175, "step": 4713 }, { "epoch": 6.422343324250681, "grad_norm": 1.8932824191521127, "learning_rate": 5.996024414403097e-06, "loss": 0.0113, "step": 4714 }, { "epoch": 6.423705722070845, "grad_norm": 1.1621716453023327, "learning_rate": 5.9919810176239554e-06, "loss": 0.0025, "step": 4715 }, { "epoch": 6.425068119891008, "grad_norm": 0.7150447654475929, "learning_rate": 5.987938401378269e-06, "loss": 0.0079, "step": 4716 }, { "epoch": 6.426430517711172, "grad_norm": 1.650183036720799, "learning_rate": 5.983896566453309e-06, "loss": 0.019, "step": 4717 }, { "epoch": 6.427792915531335, "grad_norm": 1.104953076465568, "learning_rate": 5.979855513636192e-06, "loss": 0.0108, "step": 4718 }, { "epoch": 6.429155313351498, "grad_norm": 1.1846217719018894, "learning_rate": 5.975815243713885e-06, "loss": 0.0052, "step": 4719 }, { "epoch": 6.430517711171662, "grad_norm": 0.4998155628022551, "learning_rate": 5.971775757473204e-06, "loss": 0.0023, "step": 4720 }, { "epoch": 6.431880108991826, "grad_norm": 0.3129435873736103, "learning_rate": 5.96773705570081e-06, "loss": 0.0083, "step": 4721 }, { "epoch": 6.433242506811989, "grad_norm": 3.130029753488068, "learning_rate": 5.963699139183212e-06, "loss": 0.0104, "step": 4722 }, { "epoch": 6.4346049046321525, "grad_norm": 1.5484059380741226, "learning_rate": 5.959662008706766e-06, "loss": 0.0077, "step": 4723 }, { "epoch": 6.435967302452316, "grad_norm": 1.464042798962241, "learning_rate": 5.9556256650576715e-06, "loss": 0.0029, "step": 4724 }, { "epoch": 6.437329700272479, "grad_norm": 1.048404842473022, "learning_rate": 5.951590109021981e-06, "loss": 0.0211, "step": 4725 }, { "epoch": 6.438692098092643, "grad_norm": 0.22982796031824418, "learning_rate": 5.947555341385589e-06, "loss": 0.0008, "step": 4726 }, { "epoch": 6.440054495912807, "grad_norm": 0.8539772894225238, "learning_rate": 5.943521362934241e-06, "loss": 0.009, "step": 4727 }, { "epoch": 6.44141689373297, "grad_norm": 2.1170696070440442, "learning_rate": 5.939488174453525e-06, "loss": 0.0239, "step": 4728 }, { "epoch": 6.4427792915531334, "grad_norm": 2.469676733499775, "learning_rate": 5.935455776728873e-06, "loss": 0.0117, "step": 4729 }, { "epoch": 6.444141689373297, "grad_norm": 3.388634644000298, "learning_rate": 5.931424170545568e-06, "loss": 0.0263, "step": 4730 }, { "epoch": 6.44550408719346, "grad_norm": 1.8780176545947236, "learning_rate": 5.9273933566887354e-06, "loss": 0.0055, "step": 4731 }, { "epoch": 6.446866485013624, "grad_norm": 1.2466466541059216, "learning_rate": 5.92336333594335e-06, "loss": 0.0157, "step": 4732 }, { "epoch": 6.448228882833788, "grad_norm": 1.2206579230274004, "learning_rate": 5.919334109094233e-06, "loss": 0.0016, "step": 4733 }, { "epoch": 6.449591280653951, "grad_norm": 2.2015735116247774, "learning_rate": 5.91530567692604e-06, "loss": 0.0183, "step": 4734 }, { "epoch": 6.450953678474114, "grad_norm": 0.6178350772904004, "learning_rate": 5.911278040223286e-06, "loss": 0.0093, "step": 4735 }, { "epoch": 6.452316076294278, "grad_norm": 1.6129595281971731, "learning_rate": 5.907251199770323e-06, "loss": 0.0103, "step": 4736 }, { "epoch": 6.453678474114441, "grad_norm": 1.2939436775857214, "learning_rate": 5.903225156351347e-06, "loss": 0.006, "step": 4737 }, { "epoch": 6.455040871934605, "grad_norm": 1.041721374082887, "learning_rate": 5.899199910750406e-06, "loss": 0.0064, "step": 4738 }, { "epoch": 6.456403269754769, "grad_norm": 1.54784651091752, "learning_rate": 5.8951754637513855e-06, "loss": 0.0036, "step": 4739 }, { "epoch": 6.4577656675749315, "grad_norm": 1.2644854005476698, "learning_rate": 5.891151816138021e-06, "loss": 0.002, "step": 4740 }, { "epoch": 6.459128065395095, "grad_norm": 0.8882955879171074, "learning_rate": 5.887128968693887e-06, "loss": 0.0066, "step": 4741 }, { "epoch": 6.460490463215259, "grad_norm": 0.9754214264675606, "learning_rate": 5.883106922202405e-06, "loss": 0.0021, "step": 4742 }, { "epoch": 6.461852861035422, "grad_norm": 2.0188138275866825, "learning_rate": 5.8790856774468385e-06, "loss": 0.0083, "step": 4743 }, { "epoch": 6.463215258855586, "grad_norm": 1.8740083569624923, "learning_rate": 5.875065235210301e-06, "loss": 0.0095, "step": 4744 }, { "epoch": 6.46457765667575, "grad_norm": 1.6955381997163095, "learning_rate": 5.871045596275741e-06, "loss": 0.0145, "step": 4745 }, { "epoch": 6.4659400544959125, "grad_norm": 1.1359667002502145, "learning_rate": 5.86702676142596e-06, "loss": 0.0011, "step": 4746 }, { "epoch": 6.467302452316076, "grad_norm": 0.954276902357899, "learning_rate": 5.863008731443594e-06, "loss": 0.0143, "step": 4747 }, { "epoch": 6.46866485013624, "grad_norm": 1.2620254891828784, "learning_rate": 5.858991507111122e-06, "loss": 0.0072, "step": 4748 }, { "epoch": 6.470027247956403, "grad_norm": 1.266048157302215, "learning_rate": 5.8549750892108794e-06, "loss": 0.0147, "step": 4749 }, { "epoch": 6.471389645776567, "grad_norm": 1.0297047302424334, "learning_rate": 5.850959478525029e-06, "loss": 0.001, "step": 4750 }, { "epoch": 6.4727520435967305, "grad_norm": 0.36475417765259216, "learning_rate": 5.846944675835584e-06, "loss": 0.0014, "step": 4751 }, { "epoch": 6.474114441416893, "grad_norm": 0.9642202233682898, "learning_rate": 5.8429306819244e-06, "loss": 0.0417, "step": 4752 }, { "epoch": 6.475476839237057, "grad_norm": 1.6033913410871228, "learning_rate": 5.838917497573173e-06, "loss": 0.0068, "step": 4753 }, { "epoch": 6.476839237057221, "grad_norm": 0.5929355724704946, "learning_rate": 5.834905123563441e-06, "loss": 0.0012, "step": 4754 }, { "epoch": 6.478201634877384, "grad_norm": 1.0088036846016588, "learning_rate": 5.830893560676587e-06, "loss": 0.0028, "step": 4755 }, { "epoch": 6.479564032697548, "grad_norm": 2.014835431786389, "learning_rate": 5.826882809693839e-06, "loss": 0.015, "step": 4756 }, { "epoch": 6.4809264305177114, "grad_norm": 0.5861286656564886, "learning_rate": 5.822872871396255e-06, "loss": 0.0095, "step": 4757 }, { "epoch": 6.482288828337874, "grad_norm": 0.29992159100972365, "learning_rate": 5.818863746564747e-06, "loss": 0.0262, "step": 4758 }, { "epoch": 6.483651226158038, "grad_norm": 1.0757304037064916, "learning_rate": 5.814855435980065e-06, "loss": 0.0099, "step": 4759 }, { "epoch": 6.485013623978202, "grad_norm": 0.5942621048862539, "learning_rate": 5.810847940422786e-06, "loss": 0.0007, "step": 4760 }, { "epoch": 6.486376021798365, "grad_norm": 2.1770036846681036, "learning_rate": 5.806841260673362e-06, "loss": 0.0247, "step": 4761 }, { "epoch": 6.487738419618529, "grad_norm": 0.6567936700479525, "learning_rate": 5.80283539751205e-06, "loss": 0.008, "step": 4762 }, { "epoch": 6.489100817438692, "grad_norm": 1.6507349212057942, "learning_rate": 5.798830351718975e-06, "loss": 0.0115, "step": 4763 }, { "epoch": 6.490463215258855, "grad_norm": 0.47187285769302245, "learning_rate": 5.794826124074081e-06, "loss": 0.0011, "step": 4764 }, { "epoch": 6.491825613079019, "grad_norm": 1.3199726642471579, "learning_rate": 5.790822715357172e-06, "loss": 0.0082, "step": 4765 }, { "epoch": 6.493188010899183, "grad_norm": 0.7933827031047636, "learning_rate": 5.786820126347876e-06, "loss": 0.006, "step": 4766 }, { "epoch": 6.494550408719346, "grad_norm": 0.18587028411843343, "learning_rate": 5.782818357825673e-06, "loss": 0.0087, "step": 4767 }, { "epoch": 6.4959128065395095, "grad_norm": 0.3437289869170641, "learning_rate": 5.778817410569881e-06, "loss": 0.0155, "step": 4768 }, { "epoch": 6.497275204359673, "grad_norm": 1.7588680538532355, "learning_rate": 5.77481728535965e-06, "loss": 0.0024, "step": 4769 }, { "epoch": 6.498637602179836, "grad_norm": 1.259453247324949, "learning_rate": 5.770817982973986e-06, "loss": 0.0062, "step": 4770 }, { "epoch": 6.5, "grad_norm": 0.26047301865590683, "learning_rate": 5.766819504191714e-06, "loss": 0.001, "step": 4771 }, { "epoch": 6.501362397820164, "grad_norm": 1.0536470778335385, "learning_rate": 5.762821849791516e-06, "loss": 0.0131, "step": 4772 }, { "epoch": 6.502724795640327, "grad_norm": 2.88273977375208, "learning_rate": 5.758825020551909e-06, "loss": 0.0117, "step": 4773 }, { "epoch": 6.5040871934604905, "grad_norm": 0.9072375923524832, "learning_rate": 5.75482901725124e-06, "loss": 0.016, "step": 4774 }, { "epoch": 6.505449591280654, "grad_norm": 3.282744546644544, "learning_rate": 5.750833840667711e-06, "loss": 0.0182, "step": 4775 }, { "epoch": 6.506811989100817, "grad_norm": 2.6656651769755686, "learning_rate": 5.746839491579346e-06, "loss": 0.0074, "step": 4776 }, { "epoch": 6.508174386920981, "grad_norm": 1.7448039951823415, "learning_rate": 5.74284597076402e-06, "loss": 0.0195, "step": 4777 }, { "epoch": 6.509536784741145, "grad_norm": 1.6003345915140315, "learning_rate": 5.7388532789994476e-06, "loss": 0.006, "step": 4778 }, { "epoch": 6.510899182561308, "grad_norm": 1.5434659882262545, "learning_rate": 5.73486141706317e-06, "loss": 0.0015, "step": 4779 }, { "epoch": 6.512261580381471, "grad_norm": 1.4116495643613212, "learning_rate": 5.730870385732581e-06, "loss": 0.0044, "step": 4780 }, { "epoch": 6.513623978201635, "grad_norm": 0.945411008395646, "learning_rate": 5.726880185784898e-06, "loss": 0.0092, "step": 4781 }, { "epoch": 6.514986376021798, "grad_norm": 1.7266565607413273, "learning_rate": 5.722890817997193e-06, "loss": 0.0104, "step": 4782 }, { "epoch": 6.516348773841962, "grad_norm": 0.8369660445186297, "learning_rate": 5.718902283146359e-06, "loss": 0.0209, "step": 4783 }, { "epoch": 6.517711171662126, "grad_norm": 0.3242145433153025, "learning_rate": 5.714914582009139e-06, "loss": 0.0061, "step": 4784 }, { "epoch": 6.5190735694822886, "grad_norm": 0.6897485432685639, "learning_rate": 5.710927715362113e-06, "loss": 0.0015, "step": 4785 }, { "epoch": 6.520435967302452, "grad_norm": 1.6751269967676479, "learning_rate": 5.706941683981688e-06, "loss": 0.0103, "step": 4786 }, { "epoch": 6.521798365122616, "grad_norm": 1.2495620974230421, "learning_rate": 5.7029564886441245e-06, "loss": 0.0124, "step": 4787 }, { "epoch": 6.523160762942779, "grad_norm": 1.0066371285076374, "learning_rate": 5.698972130125499e-06, "loss": 0.0017, "step": 4788 }, { "epoch": 6.524523160762943, "grad_norm": 1.1958788104595648, "learning_rate": 5.694988609201748e-06, "loss": 0.0029, "step": 4789 }, { "epoch": 6.525885558583107, "grad_norm": 0.9437444647846469, "learning_rate": 5.6910059266486315e-06, "loss": 0.0029, "step": 4790 }, { "epoch": 6.5272479564032695, "grad_norm": 1.3936162845215343, "learning_rate": 5.687024083241746e-06, "loss": 0.0023, "step": 4791 }, { "epoch": 6.528610354223433, "grad_norm": 2.413946386333879, "learning_rate": 5.6830430797565314e-06, "loss": 0.0071, "step": 4792 }, { "epoch": 6.529972752043597, "grad_norm": 1.5811455543277009, "learning_rate": 5.679062916968256e-06, "loss": 0.0095, "step": 4793 }, { "epoch": 6.53133514986376, "grad_norm": 1.0797211138130995, "learning_rate": 5.675083595652033e-06, "loss": 0.0154, "step": 4794 }, { "epoch": 6.532697547683924, "grad_norm": 1.4367708256538323, "learning_rate": 5.6711051165828e-06, "loss": 0.0144, "step": 4795 }, { "epoch": 6.5340599455040875, "grad_norm": 1.3971330733822387, "learning_rate": 5.6671274805353435e-06, "loss": 0.0012, "step": 4796 }, { "epoch": 6.53542234332425, "grad_norm": 1.4308278953723323, "learning_rate": 5.663150688284282e-06, "loss": 0.0154, "step": 4797 }, { "epoch": 6.536784741144414, "grad_norm": 3.2979861625463704, "learning_rate": 5.659174740604062e-06, "loss": 0.0127, "step": 4798 }, { "epoch": 6.538147138964578, "grad_norm": 1.195360153116798, "learning_rate": 5.655199638268978e-06, "loss": 0.0052, "step": 4799 }, { "epoch": 6.539509536784741, "grad_norm": 0.9846515510468064, "learning_rate": 5.651225382053143e-06, "loss": 0.004, "step": 4800 }, { "epoch": 6.540871934604905, "grad_norm": 2.8097925926212706, "learning_rate": 5.647251972730523e-06, "loss": 0.02, "step": 4801 }, { "epoch": 6.5422343324250685, "grad_norm": 1.5653223037899051, "learning_rate": 5.643279411074913e-06, "loss": 0.0113, "step": 4802 }, { "epoch": 6.543596730245231, "grad_norm": 1.196073998237165, "learning_rate": 5.639307697859937e-06, "loss": 0.029, "step": 4803 }, { "epoch": 6.544959128065395, "grad_norm": 1.2880103225208945, "learning_rate": 5.635336833859061e-06, "loss": 0.0086, "step": 4804 }, { "epoch": 6.546321525885559, "grad_norm": 0.4245915816711645, "learning_rate": 5.631366819845579e-06, "loss": 0.0015, "step": 4805 }, { "epoch": 6.547683923705722, "grad_norm": 1.4713816355572364, "learning_rate": 5.627397656592624e-06, "loss": 0.0022, "step": 4806 }, { "epoch": 6.549046321525886, "grad_norm": 0.7315277819957453, "learning_rate": 5.623429344873169e-06, "loss": 0.0228, "step": 4807 }, { "epoch": 6.550408719346049, "grad_norm": 1.33929504595552, "learning_rate": 5.619461885460006e-06, "loss": 0.0459, "step": 4808 }, { "epoch": 6.551771117166212, "grad_norm": 0.8743688836510592, "learning_rate": 5.615495279125778e-06, "loss": 0.0103, "step": 4809 }, { "epoch": 6.553133514986376, "grad_norm": 0.8331064923821372, "learning_rate": 5.611529526642945e-06, "loss": 0.0044, "step": 4810 }, { "epoch": 6.55449591280654, "grad_norm": 1.0620088854420555, "learning_rate": 5.607564628783817e-06, "loss": 0.0094, "step": 4811 }, { "epoch": 6.555858310626703, "grad_norm": 1.4497079824048928, "learning_rate": 5.6036005863205234e-06, "loss": 0.0176, "step": 4812 }, { "epoch": 6.5572207084468666, "grad_norm": 0.5500520431503455, "learning_rate": 5.599637400025036e-06, "loss": 0.0008, "step": 4813 }, { "epoch": 6.55858310626703, "grad_norm": 2.317135440070761, "learning_rate": 5.595675070669162e-06, "loss": 0.0076, "step": 4814 }, { "epoch": 6.559945504087193, "grad_norm": 0.8317213695496943, "learning_rate": 5.591713599024531e-06, "loss": 0.0124, "step": 4815 }, { "epoch": 6.561307901907357, "grad_norm": 0.8851833459137655, "learning_rate": 5.587752985862616e-06, "loss": 0.0136, "step": 4816 }, { "epoch": 6.562670299727521, "grad_norm": 1.4242025449903268, "learning_rate": 5.583793231954713e-06, "loss": 0.0233, "step": 4817 }, { "epoch": 6.564032697547684, "grad_norm": 1.9269874003594962, "learning_rate": 5.57983433807196e-06, "loss": 0.0171, "step": 4818 }, { "epoch": 6.5653950953678475, "grad_norm": 1.2320463602542997, "learning_rate": 5.575876304985327e-06, "loss": 0.0182, "step": 4819 }, { "epoch": 6.566757493188011, "grad_norm": 0.9689928123067417, "learning_rate": 5.571919133465605e-06, "loss": 0.0122, "step": 4820 }, { "epoch": 6.568119891008174, "grad_norm": 0.5732656068772559, "learning_rate": 5.567962824283435e-06, "loss": 0.0081, "step": 4821 }, { "epoch": 6.569482288828338, "grad_norm": 0.9561825177832887, "learning_rate": 5.5640073782092695e-06, "loss": 0.0078, "step": 4822 }, { "epoch": 6.570844686648502, "grad_norm": 1.6540718018577718, "learning_rate": 5.560052796013413e-06, "loss": 0.0219, "step": 4823 }, { "epoch": 6.572207084468665, "grad_norm": 2.4770762941813915, "learning_rate": 5.556099078465986e-06, "loss": 0.0034, "step": 4824 }, { "epoch": 6.573569482288828, "grad_norm": 0.6718599353064261, "learning_rate": 5.55214622633695e-06, "loss": 0.0052, "step": 4825 }, { "epoch": 6.574931880108992, "grad_norm": 2.415311875992691, "learning_rate": 5.548194240396099e-06, "loss": 0.0135, "step": 4826 }, { "epoch": 6.576294277929155, "grad_norm": 0.5001557205226856, "learning_rate": 5.544243121413044e-06, "loss": 0.0014, "step": 4827 }, { "epoch": 6.577656675749319, "grad_norm": 0.8980319067681596, "learning_rate": 5.5402928701572516e-06, "loss": 0.0034, "step": 4828 }, { "epoch": 6.579019073569482, "grad_norm": 0.6343100747238811, "learning_rate": 5.536343487397991e-06, "loss": 0.0009, "step": 4829 }, { "epoch": 6.580381471389646, "grad_norm": 1.1670582616019853, "learning_rate": 5.532394973904385e-06, "loss": 0.0153, "step": 4830 }, { "epoch": 6.581743869209809, "grad_norm": 1.6577546462107937, "learning_rate": 5.528447330445379e-06, "loss": 0.012, "step": 4831 }, { "epoch": 6.583106267029972, "grad_norm": 0.9893832547648541, "learning_rate": 5.524500557789745e-06, "loss": 0.0251, "step": 4832 }, { "epoch": 6.584468664850136, "grad_norm": 1.103217199603075, "learning_rate": 5.520554656706094e-06, "loss": 0.0107, "step": 4833 }, { "epoch": 6.5858310626703, "grad_norm": 1.2418292965290165, "learning_rate": 5.516609627962856e-06, "loss": 0.023, "step": 4834 }, { "epoch": 6.587193460490463, "grad_norm": 1.7815177378931621, "learning_rate": 5.512665472328302e-06, "loss": 0.0169, "step": 4835 }, { "epoch": 6.5885558583106265, "grad_norm": 0.5929624373768043, "learning_rate": 5.50872219057053e-06, "loss": 0.0024, "step": 4836 }, { "epoch": 6.58991825613079, "grad_norm": 0.48937538493643223, "learning_rate": 5.5047797834574616e-06, "loss": 0.0089, "step": 4837 }, { "epoch": 6.591280653950953, "grad_norm": 1.067962758147391, "learning_rate": 5.500838251756857e-06, "loss": 0.004, "step": 4838 }, { "epoch": 6.592643051771117, "grad_norm": 0.4454779402056827, "learning_rate": 5.496897596236298e-06, "loss": 0.0051, "step": 4839 }, { "epoch": 6.594005449591281, "grad_norm": 1.092654550858619, "learning_rate": 5.492957817663205e-06, "loss": 0.0176, "step": 4840 }, { "epoch": 6.595367847411444, "grad_norm": 1.7304447878393472, "learning_rate": 5.489018916804813e-06, "loss": 0.0097, "step": 4841 }, { "epoch": 6.5967302452316074, "grad_norm": 1.0562149304706188, "learning_rate": 5.485080894428202e-06, "loss": 0.0109, "step": 4842 }, { "epoch": 6.598092643051771, "grad_norm": 2.360135095124546, "learning_rate": 5.4811437513002765e-06, "loss": 0.0212, "step": 4843 }, { "epoch": 6.599455040871934, "grad_norm": 0.3869257735587314, "learning_rate": 5.477207488187759e-06, "loss": 0.0084, "step": 4844 }, { "epoch": 6.600817438692098, "grad_norm": 2.4999743580543394, "learning_rate": 5.473272105857219e-06, "loss": 0.0057, "step": 4845 }, { "epoch": 6.602179836512262, "grad_norm": 1.9300632176671824, "learning_rate": 5.469337605075034e-06, "loss": 0.0046, "step": 4846 }, { "epoch": 6.603542234332425, "grad_norm": 0.605049630423343, "learning_rate": 5.465403986607426e-06, "loss": 0.0021, "step": 4847 }, { "epoch": 6.604904632152588, "grad_norm": 1.1907153477736283, "learning_rate": 5.461471251220443e-06, "loss": 0.0172, "step": 4848 }, { "epoch": 6.606267029972752, "grad_norm": 0.8871119392042544, "learning_rate": 5.45753939967995e-06, "loss": 0.0081, "step": 4849 }, { "epoch": 6.607629427792915, "grad_norm": 1.0852382937531044, "learning_rate": 5.4536084327516535e-06, "loss": 0.0017, "step": 4850 }, { "epoch": 6.608991825613079, "grad_norm": 0.46388236471542316, "learning_rate": 5.449678351201075e-06, "loss": 0.0089, "step": 4851 }, { "epoch": 6.610354223433243, "grad_norm": 1.5878674105702126, "learning_rate": 5.445749155793578e-06, "loss": 0.0218, "step": 4852 }, { "epoch": 6.6117166212534055, "grad_norm": 1.0434938789229549, "learning_rate": 5.441820847294339e-06, "loss": 0.012, "step": 4853 }, { "epoch": 6.613079019073569, "grad_norm": 1.3915652322095622, "learning_rate": 5.43789342646837e-06, "loss": 0.0012, "step": 4854 }, { "epoch": 6.614441416893733, "grad_norm": 0.5345491861756672, "learning_rate": 5.433966894080514e-06, "loss": 0.0027, "step": 4855 }, { "epoch": 6.615803814713896, "grad_norm": 0.8611081020837928, "learning_rate": 5.430041250895428e-06, "loss": 0.0121, "step": 4856 }, { "epoch": 6.61716621253406, "grad_norm": 0.6353044515266756, "learning_rate": 5.426116497677609e-06, "loss": 0.015, "step": 4857 }, { "epoch": 6.618528610354224, "grad_norm": 0.812455593340174, "learning_rate": 5.422192635191366e-06, "loss": 0.0093, "step": 4858 }, { "epoch": 6.6198910081743865, "grad_norm": 1.0577564561661106, "learning_rate": 5.418269664200857e-06, "loss": 0.005, "step": 4859 }, { "epoch": 6.62125340599455, "grad_norm": 0.2257093952733529, "learning_rate": 5.4143475854700475e-06, "loss": 0.0082, "step": 4860 }, { "epoch": 6.622615803814714, "grad_norm": 1.8969887512521562, "learning_rate": 5.410426399762728e-06, "loss": 0.0304, "step": 4861 }, { "epoch": 6.623978201634877, "grad_norm": 0.8335410293774789, "learning_rate": 5.406506107842532e-06, "loss": 0.0048, "step": 4862 }, { "epoch": 6.625340599455041, "grad_norm": 1.559751522323153, "learning_rate": 5.402586710472899e-06, "loss": 0.0066, "step": 4863 }, { "epoch": 6.6267029972752045, "grad_norm": 0.23454713405010505, "learning_rate": 5.398668208417108e-06, "loss": 0.001, "step": 4864 }, { "epoch": 6.628065395095367, "grad_norm": 1.1782479738617497, "learning_rate": 5.3947506024382665e-06, "loss": 0.0151, "step": 4865 }, { "epoch": 6.629427792915531, "grad_norm": 0.9483940250118089, "learning_rate": 5.39083389329929e-06, "loss": 0.0149, "step": 4866 }, { "epoch": 6.630790190735695, "grad_norm": 1.1941806468380411, "learning_rate": 5.386918081762938e-06, "loss": 0.0085, "step": 4867 }, { "epoch": 6.632152588555858, "grad_norm": 0.3336955315421735, "learning_rate": 5.383003168591781e-06, "loss": 0.0011, "step": 4868 }, { "epoch": 6.633514986376022, "grad_norm": 0.8972856140598083, "learning_rate": 5.379089154548226e-06, "loss": 0.0019, "step": 4869 }, { "epoch": 6.6348773841961854, "grad_norm": 0.742622916027509, "learning_rate": 5.37517604039449e-06, "loss": 0.002, "step": 4870 }, { "epoch": 6.636239782016348, "grad_norm": 1.3370222100576445, "learning_rate": 5.37126382689264e-06, "loss": 0.0066, "step": 4871 }, { "epoch": 6.637602179836512, "grad_norm": 1.2402892050895382, "learning_rate": 5.367352514804543e-06, "loss": 0.0164, "step": 4872 }, { "epoch": 6.638964577656676, "grad_norm": 1.4748442875953003, "learning_rate": 5.363442104891897e-06, "loss": 0.0147, "step": 4873 }, { "epoch": 6.640326975476839, "grad_norm": 2.7131188835141957, "learning_rate": 5.359532597916233e-06, "loss": 0.0189, "step": 4874 }, { "epoch": 6.641689373297003, "grad_norm": 1.113878592677274, "learning_rate": 5.3556239946388945e-06, "loss": 0.0079, "step": 4875 }, { "epoch": 6.643051771117166, "grad_norm": 1.165470167306517, "learning_rate": 5.351716295821055e-06, "loss": 0.0024, "step": 4876 }, { "epoch": 6.644414168937329, "grad_norm": 0.3589272109724705, "learning_rate": 5.347809502223717e-06, "loss": 0.0149, "step": 4877 }, { "epoch": 6.645776566757493, "grad_norm": 0.554724766322155, "learning_rate": 5.343903614607695e-06, "loss": 0.0015, "step": 4878 }, { "epoch": 6.647138964577657, "grad_norm": 0.5774390299306231, "learning_rate": 5.339998633733638e-06, "loss": 0.0018, "step": 4879 }, { "epoch": 6.64850136239782, "grad_norm": 0.8156637996510094, "learning_rate": 5.336094560362006e-06, "loss": 0.0089, "step": 4880 }, { "epoch": 6.6498637602179835, "grad_norm": 1.2584907619021477, "learning_rate": 5.3321913952531e-06, "loss": 0.024, "step": 4881 }, { "epoch": 6.651226158038147, "grad_norm": 0.5743736459627657, "learning_rate": 5.32828913916702e-06, "loss": 0.0079, "step": 4882 }, { "epoch": 6.65258855585831, "grad_norm": 0.9726162998882905, "learning_rate": 5.324387792863719e-06, "loss": 0.016, "step": 4883 }, { "epoch": 6.653950953678474, "grad_norm": 1.2744741141221925, "learning_rate": 5.320487357102949e-06, "loss": 0.0016, "step": 4884 }, { "epoch": 6.655313351498638, "grad_norm": 0.14103221767788857, "learning_rate": 5.316587832644289e-06, "loss": 0.0081, "step": 4885 }, { "epoch": 6.656675749318801, "grad_norm": 1.279328433374178, "learning_rate": 5.312689220247152e-06, "loss": 0.0051, "step": 4886 }, { "epoch": 6.6580381471389645, "grad_norm": 0.801917345838893, "learning_rate": 5.308791520670751e-06, "loss": 0.0031, "step": 4887 }, { "epoch": 6.659400544959128, "grad_norm": 0.9709936286213683, "learning_rate": 5.304894734674153e-06, "loss": 0.0055, "step": 4888 }, { "epoch": 6.660762942779291, "grad_norm": 0.4518826455847437, "learning_rate": 5.300998863016223e-06, "loss": 0.0045, "step": 4889 }, { "epoch": 6.662125340599455, "grad_norm": 1.0924278628999131, "learning_rate": 5.297103906455647e-06, "loss": 0.0007, "step": 4890 }, { "epoch": 6.663487738419619, "grad_norm": 2.2359293545312897, "learning_rate": 5.293209865750952e-06, "loss": 0.0049, "step": 4891 }, { "epoch": 6.664850136239782, "grad_norm": 0.3899817671430184, "learning_rate": 5.289316741660466e-06, "loss": 0.0008, "step": 4892 }, { "epoch": 6.666212534059945, "grad_norm": 0.9808055515437821, "learning_rate": 5.285424534942352e-06, "loss": 0.0056, "step": 4893 }, { "epoch": 6.667574931880109, "grad_norm": 1.185588233693306, "learning_rate": 5.2815332463545865e-06, "loss": 0.001, "step": 4894 }, { "epoch": 6.668937329700272, "grad_norm": 1.5911944690970834, "learning_rate": 5.277642876654978e-06, "loss": 0.0103, "step": 4895 }, { "epoch": 6.670299727520436, "grad_norm": 1.292539115873158, "learning_rate": 5.273753426601145e-06, "loss": 0.002, "step": 4896 }, { "epoch": 6.6716621253406, "grad_norm": 1.306110612388661, "learning_rate": 5.2698648969505224e-06, "loss": 0.0104, "step": 4897 }, { "epoch": 6.6730245231607626, "grad_norm": 1.1697792293693157, "learning_rate": 5.265977288460387e-06, "loss": 0.0312, "step": 4898 }, { "epoch": 6.674386920980926, "grad_norm": 2.2163298626038666, "learning_rate": 5.262090601887808e-06, "loss": 0.0068, "step": 4899 }, { "epoch": 6.67574931880109, "grad_norm": 1.6783231088460444, "learning_rate": 5.258204837989707e-06, "loss": 0.0154, "step": 4900 }, { "epoch": 6.677111716621253, "grad_norm": 1.6065221644461662, "learning_rate": 5.254319997522797e-06, "loss": 0.0089, "step": 4901 }, { "epoch": 6.678474114441417, "grad_norm": 1.7403044378526977, "learning_rate": 5.250436081243631e-06, "loss": 0.0077, "step": 4902 }, { "epoch": 6.679836512261581, "grad_norm": 1.8084741976854481, "learning_rate": 5.24655308990857e-06, "loss": 0.0083, "step": 4903 }, { "epoch": 6.6811989100817435, "grad_norm": 4.191731230299069, "learning_rate": 5.242671024273798e-06, "loss": 0.0143, "step": 4904 }, { "epoch": 6.682561307901907, "grad_norm": 0.590069965104285, "learning_rate": 5.23878988509532e-06, "loss": 0.0017, "step": 4905 }, { "epoch": 6.683923705722071, "grad_norm": 0.684952248488349, "learning_rate": 5.234909673128963e-06, "loss": 0.0021, "step": 4906 }, { "epoch": 6.685286103542234, "grad_norm": 0.6647667939799718, "learning_rate": 5.231030389130375e-06, "loss": 0.0009, "step": 4907 }, { "epoch": 6.686648501362398, "grad_norm": 2.5497656807983105, "learning_rate": 5.227152033855014e-06, "loss": 0.0153, "step": 4908 }, { "epoch": 6.6880108991825615, "grad_norm": 1.4148168962181311, "learning_rate": 5.22327460805816e-06, "loss": 0.0082, "step": 4909 }, { "epoch": 6.689373297002724, "grad_norm": 2.774451377122142, "learning_rate": 5.21939811249492e-06, "loss": 0.0174, "step": 4910 }, { "epoch": 6.690735694822888, "grad_norm": 1.179706375815574, "learning_rate": 5.215522547920203e-06, "loss": 0.0013, "step": 4911 }, { "epoch": 6.692098092643052, "grad_norm": 1.4702479480992179, "learning_rate": 5.2116479150887645e-06, "loss": 0.0042, "step": 4912 }, { "epoch": 6.693460490463215, "grad_norm": 0.48448012727540385, "learning_rate": 5.20777421475515e-06, "loss": 0.0233, "step": 4913 }, { "epoch": 6.694822888283379, "grad_norm": 1.0462107519897337, "learning_rate": 5.2039014476737445e-06, "loss": 0.0044, "step": 4914 }, { "epoch": 6.6961852861035425, "grad_norm": 0.8105038241496467, "learning_rate": 5.200029614598735e-06, "loss": 0.0008, "step": 4915 }, { "epoch": 6.697547683923705, "grad_norm": 0.30272460419359565, "learning_rate": 5.196158716284129e-06, "loss": 0.0013, "step": 4916 }, { "epoch": 6.698910081743869, "grad_norm": 1.0264679671421453, "learning_rate": 5.19228875348377e-06, "loss": 0.0091, "step": 4917 }, { "epoch": 6.700272479564033, "grad_norm": 0.5763727686579537, "learning_rate": 5.188419726951295e-06, "loss": 0.0155, "step": 4918 }, { "epoch": 6.701634877384196, "grad_norm": 0.7405594175773749, "learning_rate": 5.1845516374401786e-06, "loss": 0.0086, "step": 4919 }, { "epoch": 6.70299727520436, "grad_norm": 1.3795981714566905, "learning_rate": 5.180684485703698e-06, "loss": 0.0085, "step": 4920 }, { "epoch": 6.704359673024523, "grad_norm": 1.4933763099173787, "learning_rate": 5.176818272494951e-06, "loss": 0.0069, "step": 4921 }, { "epoch": 6.705722070844686, "grad_norm": 1.3990205934076978, "learning_rate": 5.17295299856686e-06, "loss": 0.0009, "step": 4922 }, { "epoch": 6.70708446866485, "grad_norm": 1.953315539709361, "learning_rate": 5.169088664672159e-06, "loss": 0.0108, "step": 4923 }, { "epoch": 6.708446866485014, "grad_norm": 0.46525616964481464, "learning_rate": 5.165225271563403e-06, "loss": 0.0248, "step": 4924 }, { "epoch": 6.709809264305177, "grad_norm": 2.0217593766469477, "learning_rate": 5.161362819992954e-06, "loss": 0.0234, "step": 4925 }, { "epoch": 6.7111716621253406, "grad_norm": 1.6586297216939516, "learning_rate": 5.157501310713005e-06, "loss": 0.0012, "step": 4926 }, { "epoch": 6.712534059945504, "grad_norm": 0.6863863474124108, "learning_rate": 5.1536407444755544e-06, "loss": 0.0337, "step": 4927 }, { "epoch": 6.713896457765667, "grad_norm": 2.572415525679391, "learning_rate": 5.14978112203241e-06, "loss": 0.0101, "step": 4928 }, { "epoch": 6.715258855585831, "grad_norm": 1.7400886903976762, "learning_rate": 5.145922444135224e-06, "loss": 0.0047, "step": 4929 }, { "epoch": 6.716621253405995, "grad_norm": 0.682132710683588, "learning_rate": 5.142064711535434e-06, "loss": 0.0086, "step": 4930 }, { "epoch": 6.717983651226158, "grad_norm": 1.739634523874169, "learning_rate": 5.138207924984313e-06, "loss": 0.0036, "step": 4931 }, { "epoch": 6.7193460490463215, "grad_norm": 3.2461737780600872, "learning_rate": 5.134352085232942e-06, "loss": 0.0053, "step": 4932 }, { "epoch": 6.720708446866485, "grad_norm": 0.6128040025628357, "learning_rate": 5.130497193032213e-06, "loss": 0.0054, "step": 4933 }, { "epoch": 6.722070844686648, "grad_norm": 0.5405939910633749, "learning_rate": 5.126643249132843e-06, "loss": 0.0081, "step": 4934 }, { "epoch": 6.723433242506812, "grad_norm": 1.4315667547928945, "learning_rate": 5.122790254285359e-06, "loss": 0.0024, "step": 4935 }, { "epoch": 6.724795640326976, "grad_norm": 2.2282838301277677, "learning_rate": 5.118938209240112e-06, "loss": 0.0246, "step": 4936 }, { "epoch": 6.726158038147139, "grad_norm": 0.28459125862264034, "learning_rate": 5.11508711474725e-06, "loss": 0.0083, "step": 4937 }, { "epoch": 6.727520435967302, "grad_norm": 2.0794042397301506, "learning_rate": 5.111236971556755e-06, "loss": 0.0154, "step": 4938 }, { "epoch": 6.728882833787466, "grad_norm": 3.1395227545648767, "learning_rate": 5.107387780418411e-06, "loss": 0.0064, "step": 4939 }, { "epoch": 6.730245231607629, "grad_norm": 1.044562048340091, "learning_rate": 5.103539542081815e-06, "loss": 0.0087, "step": 4940 }, { "epoch": 6.731607629427793, "grad_norm": 1.9868042490956717, "learning_rate": 5.099692257296398e-06, "loss": 0.0125, "step": 4941 }, { "epoch": 6.732970027247957, "grad_norm": 1.6643900164565335, "learning_rate": 5.09584592681138e-06, "loss": 0.0098, "step": 4942 }, { "epoch": 6.73433242506812, "grad_norm": 0.3409560623245381, "learning_rate": 5.092000551375814e-06, "loss": 0.0008, "step": 4943 }, { "epoch": 6.735694822888283, "grad_norm": 0.3966082348600136, "learning_rate": 5.088156131738554e-06, "loss": 0.0071, "step": 4944 }, { "epoch": 6.737057220708447, "grad_norm": 0.36392885893784666, "learning_rate": 5.084312668648277e-06, "loss": 0.0077, "step": 4945 }, { "epoch": 6.73841961852861, "grad_norm": 0.5600022216368595, "learning_rate": 5.080470162853473e-06, "loss": 0.0137, "step": 4946 }, { "epoch": 6.739782016348774, "grad_norm": 1.5079563345875195, "learning_rate": 5.076628615102435e-06, "loss": 0.0015, "step": 4947 }, { "epoch": 6.741144414168938, "grad_norm": 0.797085406262801, "learning_rate": 5.072788026143287e-06, "loss": 0.0088, "step": 4948 }, { "epoch": 6.7425068119891005, "grad_norm": 0.8605648098717793, "learning_rate": 5.068948396723947e-06, "loss": 0.0156, "step": 4949 }, { "epoch": 6.743869209809264, "grad_norm": 1.3005400201777604, "learning_rate": 5.065109727592164e-06, "loss": 0.0042, "step": 4950 }, { "epoch": 6.745231607629428, "grad_norm": 2.0107576478491738, "learning_rate": 5.061272019495486e-06, "loss": 0.007, "step": 4951 }, { "epoch": 6.746594005449591, "grad_norm": 0.9580153044034275, "learning_rate": 5.0574352731812816e-06, "loss": 0.001, "step": 4952 }, { "epoch": 6.747956403269755, "grad_norm": 1.4150131988459964, "learning_rate": 5.053599489396732e-06, "loss": 0.0052, "step": 4953 }, { "epoch": 6.7493188010899186, "grad_norm": 0.6544858535397474, "learning_rate": 5.049764668888826e-06, "loss": 0.0008, "step": 4954 }, { "epoch": 6.7506811989100814, "grad_norm": 1.5925194752170024, "learning_rate": 5.045930812404372e-06, "loss": 0.0138, "step": 4955 }, { "epoch": 6.752043596730245, "grad_norm": 2.026934464303342, "learning_rate": 5.042097920689981e-06, "loss": 0.0095, "step": 4956 }, { "epoch": 6.753405994550409, "grad_norm": 1.3030782355971475, "learning_rate": 5.038265994492083e-06, "loss": 0.0084, "step": 4957 }, { "epoch": 6.754768392370572, "grad_norm": 0.5927862593821485, "learning_rate": 5.0344350345569244e-06, "loss": 0.0014, "step": 4958 }, { "epoch": 6.756130790190736, "grad_norm": 1.4669892117367112, "learning_rate": 5.03060504163055e-06, "loss": 0.014, "step": 4959 }, { "epoch": 6.7574931880108995, "grad_norm": 1.49151020498711, "learning_rate": 5.026776016458831e-06, "loss": 0.001, "step": 4960 }, { "epoch": 6.758855585831062, "grad_norm": 0.7562992590475374, "learning_rate": 5.0229479597874355e-06, "loss": 0.0007, "step": 4961 }, { "epoch": 6.760217983651226, "grad_norm": 1.772924474117609, "learning_rate": 5.019120872361858e-06, "loss": 0.0189, "step": 4962 }, { "epoch": 6.76158038147139, "grad_norm": 0.8154757585004843, "learning_rate": 5.015294754927389e-06, "loss": 0.0317, "step": 4963 }, { "epoch": 6.762942779291553, "grad_norm": 0.6767930733548249, "learning_rate": 5.011469608229143e-06, "loss": 0.0065, "step": 4964 }, { "epoch": 6.764305177111717, "grad_norm": 1.039305685579258, "learning_rate": 5.007645433012042e-06, "loss": 0.0225, "step": 4965 }, { "epoch": 6.76566757493188, "grad_norm": 0.8893647020700622, "learning_rate": 5.003822230020814e-06, "loss": 0.0082, "step": 4966 }, { "epoch": 6.767029972752043, "grad_norm": 1.288134823281538, "learning_rate": 5.000000000000003e-06, "loss": 0.0011, "step": 4967 }, { "epoch": 6.768392370572207, "grad_norm": 0.5005564829008718, "learning_rate": 4.996178743693956e-06, "loss": 0.0144, "step": 4968 }, { "epoch": 6.769754768392371, "grad_norm": 5.146394285535339, "learning_rate": 4.992358461846839e-06, "loss": 0.0261, "step": 4969 }, { "epoch": 6.771117166212534, "grad_norm": 1.191694936842193, "learning_rate": 4.988539155202631e-06, "loss": 0.0027, "step": 4970 }, { "epoch": 6.772479564032698, "grad_norm": 1.7277274739052932, "learning_rate": 4.984720824505105e-06, "loss": 0.0034, "step": 4971 }, { "epoch": 6.773841961852861, "grad_norm": 1.1822519379583143, "learning_rate": 4.980903470497863e-06, "loss": 0.003, "step": 4972 }, { "epoch": 6.775204359673024, "grad_norm": 2.7868531153205605, "learning_rate": 4.977087093924299e-06, "loss": 0.0206, "step": 4973 }, { "epoch": 6.776566757493188, "grad_norm": 2.0725976292161796, "learning_rate": 4.97327169552763e-06, "loss": 0.014, "step": 4974 }, { "epoch": 6.777929155313352, "grad_norm": 2.4904395076030177, "learning_rate": 4.969457276050882e-06, "loss": 0.0145, "step": 4975 }, { "epoch": 6.779291553133515, "grad_norm": 2.666428298791147, "learning_rate": 4.965643836236878e-06, "loss": 0.0067, "step": 4976 }, { "epoch": 6.7806539509536785, "grad_norm": 0.7040621041712615, "learning_rate": 4.961831376828267e-06, "loss": 0.011, "step": 4977 }, { "epoch": 6.782016348773842, "grad_norm": 1.9243398326677568, "learning_rate": 4.95801989856749e-06, "loss": 0.0019, "step": 4978 }, { "epoch": 6.783378746594005, "grad_norm": 0.9585718227018574, "learning_rate": 4.954209402196813e-06, "loss": 0.0107, "step": 4979 }, { "epoch": 6.784741144414169, "grad_norm": 1.8104006403152793, "learning_rate": 4.950399888458298e-06, "loss": 0.0182, "step": 4980 }, { "epoch": 6.786103542234333, "grad_norm": 0.755298830684112, "learning_rate": 4.946591358093822e-06, "loss": 0.001, "step": 4981 }, { "epoch": 6.787465940054496, "grad_norm": 2.9310082837754887, "learning_rate": 4.942783811845075e-06, "loss": 0.0129, "step": 4982 }, { "epoch": 6.7888283378746594, "grad_norm": 1.2089911484843108, "learning_rate": 4.93897725045354e-06, "loss": 0.0109, "step": 4983 }, { "epoch": 6.790190735694823, "grad_norm": 4.851038732329618, "learning_rate": 4.935171674660529e-06, "loss": 0.004, "step": 4984 }, { "epoch": 6.791553133514986, "grad_norm": 3.4747463384356485, "learning_rate": 4.931367085207142e-06, "loss": 0.0148, "step": 4985 }, { "epoch": 6.79291553133515, "grad_norm": 1.8588676235445247, "learning_rate": 4.927563482834298e-06, "loss": 0.0026, "step": 4986 }, { "epoch": 6.794277929155314, "grad_norm": 0.5436508873193387, "learning_rate": 4.923760868282726e-06, "loss": 0.0009, "step": 4987 }, { "epoch": 6.795640326975477, "grad_norm": 1.2343965968745751, "learning_rate": 4.9199592422929545e-06, "loss": 0.0068, "step": 4988 }, { "epoch": 6.79700272479564, "grad_norm": 1.2950100859613027, "learning_rate": 4.916158605605327e-06, "loss": 0.0085, "step": 4989 }, { "epoch": 6.798365122615804, "grad_norm": 1.5521512332929428, "learning_rate": 4.912358958959984e-06, "loss": 0.0116, "step": 4990 }, { "epoch": 6.799727520435967, "grad_norm": 1.573240473250025, "learning_rate": 4.9085603030968874e-06, "loss": 0.0164, "step": 4991 }, { "epoch": 6.801089918256131, "grad_norm": 1.5993107111142817, "learning_rate": 4.904762638755793e-06, "loss": 0.0012, "step": 4992 }, { "epoch": 6.802452316076295, "grad_norm": 1.3372052770899643, "learning_rate": 4.900965966676271e-06, "loss": 0.0019, "step": 4993 }, { "epoch": 6.8038147138964575, "grad_norm": 1.0907245743157215, "learning_rate": 4.8971702875977e-06, "loss": 0.0091, "step": 4994 }, { "epoch": 6.805177111716621, "grad_norm": 2.1826613793048515, "learning_rate": 4.893375602259256e-06, "loss": 0.0127, "step": 4995 }, { "epoch": 6.806539509536785, "grad_norm": 1.8616689061931508, "learning_rate": 4.889581911399933e-06, "loss": 0.0019, "step": 4996 }, { "epoch": 6.807901907356948, "grad_norm": 0.5290881198551354, "learning_rate": 4.88578921575852e-06, "loss": 0.0008, "step": 4997 }, { "epoch": 6.809264305177112, "grad_norm": 2.7436090019364814, "learning_rate": 4.881997516073619e-06, "loss": 0.0228, "step": 4998 }, { "epoch": 6.810626702997276, "grad_norm": 1.9267001127441223, "learning_rate": 4.878206813083642e-06, "loss": 0.002, "step": 4999 }, { "epoch": 6.8119891008174385, "grad_norm": 1.4704248357983332, "learning_rate": 4.874417107526796e-06, "loss": 0.0216, "step": 5000 }, { "epoch": 6.813351498637602, "grad_norm": 2.605125364467944, "learning_rate": 4.870628400141103e-06, "loss": 0.0136, "step": 5001 }, { "epoch": 6.814713896457766, "grad_norm": 1.2773584976477323, "learning_rate": 4.866840691664382e-06, "loss": 0.0177, "step": 5002 }, { "epoch": 6.816076294277929, "grad_norm": 1.5989800963808307, "learning_rate": 4.863053982834266e-06, "loss": 0.0089, "step": 5003 }, { "epoch": 6.817438692098093, "grad_norm": 1.137323079560383, "learning_rate": 4.859268274388194e-06, "loss": 0.0111, "step": 5004 }, { "epoch": 6.8188010899182565, "grad_norm": 3.0859777781065736, "learning_rate": 4.855483567063397e-06, "loss": 0.0162, "step": 5005 }, { "epoch": 6.820163487738419, "grad_norm": 2.3395756199378908, "learning_rate": 4.85169986159693e-06, "loss": 0.0113, "step": 5006 }, { "epoch": 6.821525885558583, "grad_norm": 2.8566416111139565, "learning_rate": 4.847917158725633e-06, "loss": 0.0162, "step": 5007 }, { "epoch": 6.822888283378747, "grad_norm": 2.272273191665905, "learning_rate": 4.84413545918617e-06, "loss": 0.0263, "step": 5008 }, { "epoch": 6.82425068119891, "grad_norm": 0.5949538571896212, "learning_rate": 4.840354763714991e-06, "loss": 0.0139, "step": 5009 }, { "epoch": 6.825613079019074, "grad_norm": 2.409786717638896, "learning_rate": 4.8365750730483665e-06, "loss": 0.0145, "step": 5010 }, { "epoch": 6.8269754768392374, "grad_norm": 0.7740933705663487, "learning_rate": 4.832796387922366e-06, "loss": 0.0051, "step": 5011 }, { "epoch": 6.8283378746594, "grad_norm": 0.7090045165505031, "learning_rate": 4.829018709072855e-06, "loss": 0.001, "step": 5012 }, { "epoch": 6.829700272479564, "grad_norm": 1.3278778381468206, "learning_rate": 4.825242037235516e-06, "loss": 0.0019, "step": 5013 }, { "epoch": 6.831062670299728, "grad_norm": 3.2050901205505475, "learning_rate": 4.821466373145824e-06, "loss": 0.0078, "step": 5014 }, { "epoch": 6.832425068119891, "grad_norm": 0.8603677526237486, "learning_rate": 4.817691717539066e-06, "loss": 0.0082, "step": 5015 }, { "epoch": 6.833787465940055, "grad_norm": 2.009195092964973, "learning_rate": 4.813918071150332e-06, "loss": 0.0304, "step": 5016 }, { "epoch": 6.835149863760218, "grad_norm": 2.889710561952771, "learning_rate": 4.810145434714507e-06, "loss": 0.0173, "step": 5017 }, { "epoch": 6.836512261580381, "grad_norm": 1.7840546101242623, "learning_rate": 4.806373808966293e-06, "loss": 0.0192, "step": 5018 }, { "epoch": 6.837874659400545, "grad_norm": 2.465545771914786, "learning_rate": 4.802603194640179e-06, "loss": 0.026, "step": 5019 }, { "epoch": 6.839237057220709, "grad_norm": 3.3483759784614353, "learning_rate": 4.798833592470473e-06, "loss": 0.004, "step": 5020 }, { "epoch": 6.840599455040872, "grad_norm": 2.7194284004041296, "learning_rate": 4.795065003191273e-06, "loss": 0.0071, "step": 5021 }, { "epoch": 6.8419618528610355, "grad_norm": 1.0446333037014748, "learning_rate": 4.7912974275364855e-06, "loss": 0.0156, "step": 5022 }, { "epoch": 6.843324250681199, "grad_norm": 0.766673203690291, "learning_rate": 4.787530866239826e-06, "loss": 0.0099, "step": 5023 }, { "epoch": 6.844686648501362, "grad_norm": 0.3800078292544662, "learning_rate": 4.783765320034798e-06, "loss": 0.0074, "step": 5024 }, { "epoch": 6.846049046321526, "grad_norm": 0.6705160203491384, "learning_rate": 4.780000789654721e-06, "loss": 0.0014, "step": 5025 }, { "epoch": 6.84741144414169, "grad_norm": 1.367994865533106, "learning_rate": 4.776237275832704e-06, "loss": 0.0188, "step": 5026 }, { "epoch": 6.848773841961853, "grad_norm": 1.3633110511769042, "learning_rate": 4.772474779301669e-06, "loss": 0.0243, "step": 5027 }, { "epoch": 6.8501362397820165, "grad_norm": 2.9248986063455455, "learning_rate": 4.768713300794338e-06, "loss": 0.0109, "step": 5028 }, { "epoch": 6.85149863760218, "grad_norm": 0.3833663959707023, "learning_rate": 4.764952841043229e-06, "loss": 0.0005, "step": 5029 }, { "epoch": 6.852861035422343, "grad_norm": 0.5739097953354444, "learning_rate": 4.761193400780667e-06, "loss": 0.0085, "step": 5030 }, { "epoch": 6.854223433242507, "grad_norm": 1.3054403002178037, "learning_rate": 4.7574349807387735e-06, "loss": 0.008, "step": 5031 }, { "epoch": 6.855585831062671, "grad_norm": 2.2295065000930956, "learning_rate": 4.753677581649474e-06, "loss": 0.0161, "step": 5032 }, { "epoch": 6.856948228882834, "grad_norm": 2.7647636940203504, "learning_rate": 4.749921204244503e-06, "loss": 0.0154, "step": 5033 }, { "epoch": 6.858310626702997, "grad_norm": 2.1651816399265016, "learning_rate": 4.746165849255378e-06, "loss": 0.0044, "step": 5034 }, { "epoch": 6.859673024523161, "grad_norm": 1.2199762972264645, "learning_rate": 4.742411517413439e-06, "loss": 0.0025, "step": 5035 }, { "epoch": 6.861035422343324, "grad_norm": 0.9142965322266204, "learning_rate": 4.7386582094498045e-06, "loss": 0.0154, "step": 5036 }, { "epoch": 6.862397820163488, "grad_norm": 1.23585468180275, "learning_rate": 4.734905926095415e-06, "loss": 0.0071, "step": 5037 }, { "epoch": 6.863760217983652, "grad_norm": 2.15454802059066, "learning_rate": 4.731154668080991e-06, "loss": 0.0124, "step": 5038 }, { "epoch": 6.8651226158038146, "grad_norm": 1.3672855702101587, "learning_rate": 4.72740443613707e-06, "loss": 0.0094, "step": 5039 }, { "epoch": 6.866485013623978, "grad_norm": 0.6780317250508351, "learning_rate": 4.723655230993985e-06, "loss": 0.0014, "step": 5040 }, { "epoch": 6.867847411444142, "grad_norm": 1.4761810341146955, "learning_rate": 4.7199070533818615e-06, "loss": 0.0033, "step": 5041 }, { "epoch": 6.869209809264305, "grad_norm": 1.872494027970414, "learning_rate": 4.716159904030637e-06, "loss": 0.0027, "step": 5042 }, { "epoch": 6.870572207084469, "grad_norm": 1.1329331216390635, "learning_rate": 4.712413783670036e-06, "loss": 0.0154, "step": 5043 }, { "epoch": 6.871934604904633, "grad_norm": 2.4025079786185435, "learning_rate": 4.708668693029591e-06, "loss": 0.0221, "step": 5044 }, { "epoch": 6.8732970027247955, "grad_norm": 0.346076863585189, "learning_rate": 4.704924632838636e-06, "loss": 0.0014, "step": 5045 }, { "epoch": 6.874659400544959, "grad_norm": 0.7529888966623512, "learning_rate": 4.701181603826296e-06, "loss": 0.0009, "step": 5046 }, { "epoch": 6.876021798365123, "grad_norm": 0.9649646332625907, "learning_rate": 4.6974396067215035e-06, "loss": 0.0006, "step": 5047 }, { "epoch": 6.877384196185286, "grad_norm": 0.9486268762713832, "learning_rate": 4.6936986422529795e-06, "loss": 0.01, "step": 5048 }, { "epoch": 6.87874659400545, "grad_norm": 2.458782446738247, "learning_rate": 4.689958711149258e-06, "loss": 0.0196, "step": 5049 }, { "epoch": 6.8801089918256135, "grad_norm": 0.7084192191208687, "learning_rate": 4.686219814138659e-06, "loss": 0.006, "step": 5050 }, { "epoch": 6.881471389645776, "grad_norm": 1.4824047563639071, "learning_rate": 4.682481951949306e-06, "loss": 0.0097, "step": 5051 }, { "epoch": 6.88283378746594, "grad_norm": 1.3978680399826655, "learning_rate": 4.678745125309128e-06, "loss": 0.0265, "step": 5052 }, { "epoch": 6.884196185286104, "grad_norm": 1.5014180325435857, "learning_rate": 4.675009334945836e-06, "loss": 0.0017, "step": 5053 }, { "epoch": 6.885558583106267, "grad_norm": 0.5284075736189215, "learning_rate": 4.671274581586959e-06, "loss": 0.0011, "step": 5054 }, { "epoch": 6.886920980926431, "grad_norm": 1.0055973773018354, "learning_rate": 4.667540865959801e-06, "loss": 0.0104, "step": 5055 }, { "epoch": 6.8882833787465945, "grad_norm": 0.519370350155319, "learning_rate": 4.663808188791492e-06, "loss": 0.0068, "step": 5056 }, { "epoch": 6.889645776566757, "grad_norm": 0.39185405961979497, "learning_rate": 4.660076550808936e-06, "loss": 0.0011, "step": 5057 }, { "epoch": 6.891008174386921, "grad_norm": 2.341214506717162, "learning_rate": 4.656345952738842e-06, "loss": 0.0075, "step": 5058 }, { "epoch": 6.892370572207085, "grad_norm": 1.0696437464846904, "learning_rate": 4.652616395307724e-06, "loss": 0.0012, "step": 5059 }, { "epoch": 6.893732970027248, "grad_norm": 0.6535516945673678, "learning_rate": 4.648887879241879e-06, "loss": 0.0091, "step": 5060 }, { "epoch": 6.895095367847412, "grad_norm": 0.2092978415317374, "learning_rate": 4.645160405267414e-06, "loss": 0.0084, "step": 5061 }, { "epoch": 6.896457765667575, "grad_norm": 1.2684612781310953, "learning_rate": 4.64143397411023e-06, "loss": 0.0114, "step": 5062 }, { "epoch": 6.897820163487738, "grad_norm": 1.8409098676646258, "learning_rate": 4.637708586496018e-06, "loss": 0.0069, "step": 5063 }, { "epoch": 6.899182561307902, "grad_norm": 1.332154662449708, "learning_rate": 4.633984243150277e-06, "loss": 0.0043, "step": 5064 }, { "epoch": 6.900544959128066, "grad_norm": 1.9025501660242972, "learning_rate": 4.63026094479829e-06, "loss": 0.0139, "step": 5065 }, { "epoch": 6.901907356948229, "grad_norm": 0.9620176985801924, "learning_rate": 4.62653869216515e-06, "loss": 0.0006, "step": 5066 }, { "epoch": 6.9032697547683926, "grad_norm": 0.6722287151228725, "learning_rate": 4.622817485975729e-06, "loss": 0.0073, "step": 5067 }, { "epoch": 6.904632152588556, "grad_norm": 0.23145395696666324, "learning_rate": 4.619097326954719e-06, "loss": 0.0007, "step": 5068 }, { "epoch": 6.905994550408719, "grad_norm": 1.4462562523229654, "learning_rate": 4.61537821582659e-06, "loss": 0.0084, "step": 5069 }, { "epoch": 6.907356948228883, "grad_norm": 0.45843351855782566, "learning_rate": 4.611660153315607e-06, "loss": 0.0005, "step": 5070 }, { "epoch": 6.908719346049047, "grad_norm": 1.405150222853843, "learning_rate": 4.6079431401458444e-06, "loss": 0.0115, "step": 5071 }, { "epoch": 6.91008174386921, "grad_norm": 0.35960122684063733, "learning_rate": 4.604227177041156e-06, "loss": 0.0005, "step": 5072 }, { "epoch": 6.9114441416893735, "grad_norm": 1.6830386973180191, "learning_rate": 4.600512264725204e-06, "loss": 0.0102, "step": 5073 }, { "epoch": 6.912806539509537, "grad_norm": 0.8429022591101153, "learning_rate": 4.596798403921443e-06, "loss": 0.0273, "step": 5074 }, { "epoch": 6.9141689373297, "grad_norm": 1.0334505530790978, "learning_rate": 4.593085595353122e-06, "loss": 0.0086, "step": 5075 }, { "epoch": 6.915531335149864, "grad_norm": 0.5281521660885385, "learning_rate": 4.589373839743282e-06, "loss": 0.0112, "step": 5076 }, { "epoch": 6.916893732970028, "grad_norm": 0.26466247643649604, "learning_rate": 4.585663137814757e-06, "loss": 0.0006, "step": 5077 }, { "epoch": 6.918256130790191, "grad_norm": 0.5089225550977424, "learning_rate": 4.581953490290188e-06, "loss": 0.0096, "step": 5078 }, { "epoch": 6.919618528610354, "grad_norm": 0.8195528682508122, "learning_rate": 4.578244897891994e-06, "loss": 0.0016, "step": 5079 }, { "epoch": 6.920980926430518, "grad_norm": 0.7487057476252982, "learning_rate": 4.5745373613424075e-06, "loss": 0.0076, "step": 5080 }, { "epoch": 6.922343324250681, "grad_norm": 1.5928514093157817, "learning_rate": 4.57083088136344e-06, "loss": 0.0164, "step": 5081 }, { "epoch": 6.923705722070845, "grad_norm": 1.7611832333636417, "learning_rate": 4.567125458676898e-06, "loss": 0.005, "step": 5082 }, { "epoch": 6.925068119891008, "grad_norm": 1.5810235732373261, "learning_rate": 4.563421094004394e-06, "loss": 0.0141, "step": 5083 }, { "epoch": 6.926430517711172, "grad_norm": 0.8299164089687089, "learning_rate": 4.559717788067316e-06, "loss": 0.0219, "step": 5084 }, { "epoch": 6.927792915531335, "grad_norm": 0.8531383077440312, "learning_rate": 4.556015541586872e-06, "loss": 0.0013, "step": 5085 }, { "epoch": 6.929155313351498, "grad_norm": 0.9011601912149076, "learning_rate": 4.552314355284037e-06, "loss": 0.0237, "step": 5086 }, { "epoch": 6.930517711171662, "grad_norm": 0.2691186942870097, "learning_rate": 4.5486142298795995e-06, "loss": 0.0006, "step": 5087 }, { "epoch": 6.931880108991826, "grad_norm": 0.12645264988496718, "learning_rate": 4.544915166094127e-06, "loss": 0.0004, "step": 5088 }, { "epoch": 6.933242506811989, "grad_norm": 0.420096781948223, "learning_rate": 4.541217164647984e-06, "loss": 0.0007, "step": 5089 }, { "epoch": 6.9346049046321525, "grad_norm": 0.7452771016387709, "learning_rate": 4.537520226261333e-06, "loss": 0.0011, "step": 5090 }, { "epoch": 6.935967302452316, "grad_norm": 2.328890416675744, "learning_rate": 4.533824351654126e-06, "loss": 0.0039, "step": 5091 }, { "epoch": 6.937329700272479, "grad_norm": 0.4251072636775671, "learning_rate": 4.530129541546115e-06, "loss": 0.0182, "step": 5092 }, { "epoch": 6.938692098092643, "grad_norm": 1.0582475419629633, "learning_rate": 4.52643579665683e-06, "loss": 0.001, "step": 5093 }, { "epoch": 6.940054495912807, "grad_norm": 0.9297629588899312, "learning_rate": 4.522743117705603e-06, "loss": 0.0116, "step": 5094 }, { "epoch": 6.94141689373297, "grad_norm": 0.7169806466586841, "learning_rate": 4.5190515054115625e-06, "loss": 0.0082, "step": 5095 }, { "epoch": 6.9427792915531334, "grad_norm": 1.0629949595594868, "learning_rate": 4.5153609604936125e-06, "loss": 0.0022, "step": 5096 }, { "epoch": 6.944141689373297, "grad_norm": 0.33399394875936667, "learning_rate": 4.511671483670475e-06, "loss": 0.0031, "step": 5097 }, { "epoch": 6.94550408719346, "grad_norm": 3.6785386026049065, "learning_rate": 4.50798307566064e-06, "loss": 0.0174, "step": 5098 }, { "epoch": 6.946866485013624, "grad_norm": 2.1168039990073155, "learning_rate": 4.5042957371824055e-06, "loss": 0.0246, "step": 5099 }, { "epoch": 6.948228882833788, "grad_norm": 1.83125156845177, "learning_rate": 4.500609468953851e-06, "loss": 0.0109, "step": 5100 }, { "epoch": 6.949591280653951, "grad_norm": 0.37250846531124937, "learning_rate": 4.496924271692848e-06, "loss": 0.0084, "step": 5101 }, { "epoch": 6.950953678474114, "grad_norm": 1.3691762562130105, "learning_rate": 4.493240146117066e-06, "loss": 0.0157, "step": 5102 }, { "epoch": 6.952316076294278, "grad_norm": 1.6571143820161074, "learning_rate": 4.489557092943962e-06, "loss": 0.0162, "step": 5103 }, { "epoch": 6.953678474114441, "grad_norm": 0.6744058966777876, "learning_rate": 4.485875112890791e-06, "loss": 0.0253, "step": 5104 }, { "epoch": 6.955040871934605, "grad_norm": 0.9814571292526042, "learning_rate": 4.482194206674585e-06, "loss": 0.0019, "step": 5105 }, { "epoch": 6.956403269754769, "grad_norm": 1.1530462326438853, "learning_rate": 4.478514375012173e-06, "loss": 0.0157, "step": 5106 }, { "epoch": 6.9577656675749315, "grad_norm": 1.1822618428950478, "learning_rate": 4.47483561862018e-06, "loss": 0.0076, "step": 5107 }, { "epoch": 6.959128065395095, "grad_norm": 1.2108852397935927, "learning_rate": 4.471157938215017e-06, "loss": 0.0081, "step": 5108 }, { "epoch": 6.960490463215259, "grad_norm": 0.9279276186678365, "learning_rate": 4.467481334512892e-06, "loss": 0.0165, "step": 5109 }, { "epoch": 6.961852861035422, "grad_norm": 1.3010798386377849, "learning_rate": 4.463805808229788e-06, "loss": 0.0104, "step": 5110 }, { "epoch": 6.963215258855586, "grad_norm": 2.0285037805679833, "learning_rate": 4.4601313600814966e-06, "loss": 0.0152, "step": 5111 }, { "epoch": 6.96457765667575, "grad_norm": 1.019563919941536, "learning_rate": 4.456457990783586e-06, "loss": 0.0008, "step": 5112 }, { "epoch": 6.9659400544959125, "grad_norm": 0.5729973153142633, "learning_rate": 4.452785701051412e-06, "loss": 0.0089, "step": 5113 }, { "epoch": 6.967302452316076, "grad_norm": 1.1346542427627946, "learning_rate": 4.449114491600143e-06, "loss": 0.0253, "step": 5114 }, { "epoch": 6.96866485013624, "grad_norm": 0.7139336614003822, "learning_rate": 4.445444363144709e-06, "loss": 0.0029, "step": 5115 }, { "epoch": 6.970027247956403, "grad_norm": 1.7674984976749835, "learning_rate": 4.441775316399849e-06, "loss": 0.0099, "step": 5116 }, { "epoch": 6.971389645776567, "grad_norm": 3.2533350767807008, "learning_rate": 4.438107352080076e-06, "loss": 0.005, "step": 5117 }, { "epoch": 6.9727520435967305, "grad_norm": 0.6139998035359011, "learning_rate": 4.43444047089971e-06, "loss": 0.0086, "step": 5118 }, { "epoch": 6.974114441416893, "grad_norm": 1.2150766501404604, "learning_rate": 4.430774673572841e-06, "loss": 0.0032, "step": 5119 }, { "epoch": 6.975476839237057, "grad_norm": 0.3036541658128356, "learning_rate": 4.427109960813361e-06, "loss": 0.0073, "step": 5120 }, { "epoch": 6.976839237057221, "grad_norm": 0.29184459560984366, "learning_rate": 4.42344633333495e-06, "loss": 0.0007, "step": 5121 }, { "epoch": 6.978201634877384, "grad_norm": 3.6879210509837015, "learning_rate": 4.419783791851068e-06, "loss": 0.0185, "step": 5122 }, { "epoch": 6.979564032697548, "grad_norm": 0.5508438864000387, "learning_rate": 4.416122337074975e-06, "loss": 0.0152, "step": 5123 }, { "epoch": 6.9809264305177114, "grad_norm": 0.6691998426293302, "learning_rate": 4.412461969719711e-06, "loss": 0.0083, "step": 5124 }, { "epoch": 6.982288828337874, "grad_norm": 0.525223173841921, "learning_rate": 4.408802690498099e-06, "loss": 0.0151, "step": 5125 }, { "epoch": 6.983651226158038, "grad_norm": 1.2946600622034095, "learning_rate": 4.405144500122772e-06, "loss": 0.0031, "step": 5126 }, { "epoch": 6.985013623978202, "grad_norm": 0.9382077421361652, "learning_rate": 4.401487399306127e-06, "loss": 0.0134, "step": 5127 }, { "epoch": 6.986376021798365, "grad_norm": 1.0063586743169628, "learning_rate": 4.397831388760364e-06, "loss": 0.0037, "step": 5128 }, { "epoch": 6.987738419618529, "grad_norm": 0.5692557144126743, "learning_rate": 4.39417646919746e-06, "loss": 0.0085, "step": 5129 }, { "epoch": 6.989100817438692, "grad_norm": 2.110833846137637, "learning_rate": 4.390522641329191e-06, "loss": 0.0071, "step": 5130 }, { "epoch": 6.990463215258855, "grad_norm": 1.1870513234913533, "learning_rate": 4.3868699058671075e-06, "loss": 0.0069, "step": 5131 }, { "epoch": 6.991825613079019, "grad_norm": 0.3966024475089747, "learning_rate": 4.383218263522556e-06, "loss": 0.0143, "step": 5132 }, { "epoch": 6.993188010899183, "grad_norm": 2.052609318894662, "learning_rate": 4.379567715006675e-06, "loss": 0.0037, "step": 5133 }, { "epoch": 6.994550408719346, "grad_norm": 1.1826095360400175, "learning_rate": 4.375918261030373e-06, "loss": 0.0018, "step": 5134 }, { "epoch": 6.9959128065395095, "grad_norm": 0.21492173618398364, "learning_rate": 4.3722699023043634e-06, "loss": 0.0072, "step": 5135 }, { "epoch": 6.997275204359673, "grad_norm": 0.9799888740232585, "learning_rate": 4.368622639539132e-06, "loss": 0.0017, "step": 5136 }, { "epoch": 6.998637602179836, "grad_norm": 0.709918333638005, "learning_rate": 4.364976473444961e-06, "loss": 0.0188, "step": 5137 }, { "epoch": 7.0, "grad_norm": 1.4133226310823979, "learning_rate": 4.361331404731917e-06, "loss": 0.0049, "step": 5138 }, { "epoch": 7.0, "eval_accuracy": 0.9517125210555868, "eval_f1": 0.943057180150225, "eval_loss": 0.12044619768857956, "eval_precision": 0.9361802025261319, "eval_recall": 0.9528613977236464, "eval_runtime": 18.5905, "eval_samples_per_second": 95.802, "eval_steps_per_second": 0.753, "step": 5138 }, { "epoch": 7.001362397820164, "grad_norm": 0.09997723452041758, "learning_rate": 4.357687434109846e-06, "loss": 0.0005, "step": 5139 }, { "epoch": 7.002724795640327, "grad_norm": 1.3342060224645313, "learning_rate": 4.354044562288394e-06, "loss": 0.0069, "step": 5140 }, { "epoch": 7.0040871934604905, "grad_norm": 0.6570208039976408, "learning_rate": 4.350402789976975e-06, "loss": 0.0006, "step": 5141 }, { "epoch": 7.005449591280654, "grad_norm": 0.2904024112925065, "learning_rate": 4.346762117884804e-06, "loss": 0.014, "step": 5142 }, { "epoch": 7.006811989100817, "grad_norm": 0.7895221659495193, "learning_rate": 4.3431225467208795e-06, "loss": 0.0078, "step": 5143 }, { "epoch": 7.008174386920981, "grad_norm": 0.8447107351919747, "learning_rate": 4.339484077193974e-06, "loss": 0.0024, "step": 5144 }, { "epoch": 7.009536784741145, "grad_norm": 0.7301927128374857, "learning_rate": 4.335846710012661e-06, "loss": 0.0133, "step": 5145 }, { "epoch": 7.010899182561308, "grad_norm": 0.7952697968186994, "learning_rate": 4.332210445885286e-06, "loss": 0.0023, "step": 5146 }, { "epoch": 7.012261580381471, "grad_norm": 1.7054288331101926, "learning_rate": 4.328575285519994e-06, "loss": 0.0141, "step": 5147 }, { "epoch": 7.013623978201635, "grad_norm": 1.3325817854974145, "learning_rate": 4.324941229624696e-06, "loss": 0.0135, "step": 5148 }, { "epoch": 7.014986376021798, "grad_norm": 0.6095094998510311, "learning_rate": 4.3213082789071056e-06, "loss": 0.0005, "step": 5149 }, { "epoch": 7.016348773841962, "grad_norm": 1.330941563886923, "learning_rate": 4.317676434074718e-06, "loss": 0.005, "step": 5150 }, { "epoch": 7.017711171662126, "grad_norm": 1.0081454918656296, "learning_rate": 4.3140456958348e-06, "loss": 0.0205, "step": 5151 }, { "epoch": 7.0190735694822886, "grad_norm": 1.0374076227247897, "learning_rate": 4.310416064894421e-06, "loss": 0.0107, "step": 5152 }, { "epoch": 7.020435967302452, "grad_norm": 0.5181901712358156, "learning_rate": 4.306787541960419e-06, "loss": 0.0103, "step": 5153 }, { "epoch": 7.021798365122616, "grad_norm": 0.8408420515186045, "learning_rate": 4.303160127739426e-06, "loss": 0.0077, "step": 5154 }, { "epoch": 7.023160762942779, "grad_norm": 1.4976267885363694, "learning_rate": 4.299533822937859e-06, "loss": 0.0188, "step": 5155 }, { "epoch": 7.024523160762943, "grad_norm": 1.2802991232441987, "learning_rate": 4.29590862826191e-06, "loss": 0.0103, "step": 5156 }, { "epoch": 7.025885558583107, "grad_norm": 1.4774491576377424, "learning_rate": 4.292284544417565e-06, "loss": 0.0086, "step": 5157 }, { "epoch": 7.0272479564032695, "grad_norm": 1.6055551532018626, "learning_rate": 4.288661572110582e-06, "loss": 0.0018, "step": 5158 }, { "epoch": 7.028610354223433, "grad_norm": 1.1248403687673703, "learning_rate": 4.285039712046517e-06, "loss": 0.0029, "step": 5159 }, { "epoch": 7.029972752043597, "grad_norm": 0.5501945238871555, "learning_rate": 4.281418964930694e-06, "loss": 0.0018, "step": 5160 }, { "epoch": 7.03133514986376, "grad_norm": 1.1272189792620388, "learning_rate": 4.277799331468233e-06, "loss": 0.0073, "step": 5161 }, { "epoch": 7.032697547683924, "grad_norm": 2.835020196013098, "learning_rate": 4.2741808123640335e-06, "loss": 0.0113, "step": 5162 }, { "epoch": 7.0340599455040875, "grad_norm": 1.3284261074381052, "learning_rate": 4.270563408322772e-06, "loss": 0.0077, "step": 5163 }, { "epoch": 7.03542234332425, "grad_norm": 0.892847836544168, "learning_rate": 4.2669471200489174e-06, "loss": 0.0083, "step": 5164 }, { "epoch": 7.036784741144414, "grad_norm": 3.389670404097234, "learning_rate": 4.263331948246711e-06, "loss": 0.0137, "step": 5165 }, { "epoch": 7.038147138964578, "grad_norm": 1.1995888688060876, "learning_rate": 4.2597178936201845e-06, "loss": 0.0048, "step": 5166 }, { "epoch": 7.039509536784741, "grad_norm": 1.1012898876623707, "learning_rate": 4.2561049568731525e-06, "loss": 0.0009, "step": 5167 }, { "epoch": 7.040871934604905, "grad_norm": 0.29515367181155144, "learning_rate": 4.2524931387092036e-06, "loss": 0.0148, "step": 5168 }, { "epoch": 7.0422343324250685, "grad_norm": 0.5322243274682711, "learning_rate": 4.24888243983172e-06, "loss": 0.0081, "step": 5169 }, { "epoch": 7.043596730245231, "grad_norm": 0.7252002747253962, "learning_rate": 4.245272860943852e-06, "loss": 0.0009, "step": 5170 }, { "epoch": 7.044959128065395, "grad_norm": 0.5345097784645565, "learning_rate": 4.241664402748544e-06, "loss": 0.001, "step": 5171 }, { "epoch": 7.046321525885559, "grad_norm": 1.2032520225014067, "learning_rate": 4.238057065948523e-06, "loss": 0.0188, "step": 5172 }, { "epoch": 7.047683923705722, "grad_norm": 0.9175169256275587, "learning_rate": 4.234450851246284e-06, "loss": 0.0018, "step": 5173 }, { "epoch": 7.049046321525886, "grad_norm": 1.6562765811802271, "learning_rate": 4.230845759344116e-06, "loss": 0.0132, "step": 5174 }, { "epoch": 7.050408719346049, "grad_norm": 0.9943612786718506, "learning_rate": 4.2272417909440836e-06, "loss": 0.0044, "step": 5175 }, { "epoch": 7.051771117166212, "grad_norm": 1.145142006778297, "learning_rate": 4.223638946748038e-06, "loss": 0.0017, "step": 5176 }, { "epoch": 7.053133514986376, "grad_norm": 0.49440134465807684, "learning_rate": 4.220037227457601e-06, "loss": 0.0089, "step": 5177 }, { "epoch": 7.05449591280654, "grad_norm": 1.5082541466383967, "learning_rate": 4.216436633774185e-06, "loss": 0.0211, "step": 5178 }, { "epoch": 7.055858310626703, "grad_norm": 0.17851273098831943, "learning_rate": 4.212837166398986e-06, "loss": 0.0006, "step": 5179 }, { "epoch": 7.0572207084468666, "grad_norm": 0.6781213115501131, "learning_rate": 4.209238826032965e-06, "loss": 0.0083, "step": 5180 }, { "epoch": 7.05858310626703, "grad_norm": 0.4436754155650317, "learning_rate": 4.205641613376884e-06, "loss": 0.0008, "step": 5181 }, { "epoch": 7.059945504087193, "grad_norm": 0.4873475063502109, "learning_rate": 4.202045529131265e-06, "loss": 0.0094, "step": 5182 }, { "epoch": 7.061307901907357, "grad_norm": 1.202913851421326, "learning_rate": 4.198450573996423e-06, "loss": 0.0186, "step": 5183 }, { "epoch": 7.062670299727521, "grad_norm": 0.39855936298980593, "learning_rate": 4.1948567486724566e-06, "loss": 0.0079, "step": 5184 }, { "epoch": 7.064032697547684, "grad_norm": 1.0910159184750248, "learning_rate": 4.19126405385923e-06, "loss": 0.0078, "step": 5185 }, { "epoch": 7.0653950953678475, "grad_norm": 0.918911722346597, "learning_rate": 4.1876724902564e-06, "loss": 0.0046, "step": 5186 }, { "epoch": 7.066757493188011, "grad_norm": 1.4267955068323763, "learning_rate": 4.184082058563393e-06, "loss": 0.0068, "step": 5187 }, { "epoch": 7.068119891008174, "grad_norm": 0.8663628732765796, "learning_rate": 4.180492759479429e-06, "loss": 0.0012, "step": 5188 }, { "epoch": 7.069482288828338, "grad_norm": 0.7985937917317488, "learning_rate": 4.176904593703488e-06, "loss": 0.002, "step": 5189 }, { "epoch": 7.070844686648502, "grad_norm": 0.90164063437217, "learning_rate": 4.173317561934346e-06, "loss": 0.0007, "step": 5190 }, { "epoch": 7.072207084468665, "grad_norm": 0.9755519636385593, "learning_rate": 4.169731664870556e-06, "loss": 0.0055, "step": 5191 }, { "epoch": 7.073569482288828, "grad_norm": 0.4591256735157931, "learning_rate": 4.166146903210436e-06, "loss": 0.0031, "step": 5192 }, { "epoch": 7.074931880108992, "grad_norm": 0.23627052302604104, "learning_rate": 4.162563277652104e-06, "loss": 0.0007, "step": 5193 }, { "epoch": 7.076294277929155, "grad_norm": 0.7474801931747591, "learning_rate": 4.1589807888934365e-06, "loss": 0.0007, "step": 5194 }, { "epoch": 7.077656675749319, "grad_norm": 0.961559347494936, "learning_rate": 4.155399437632103e-06, "loss": 0.007, "step": 5195 }, { "epoch": 7.079019073569483, "grad_norm": 0.1483037820604079, "learning_rate": 4.151819224565548e-06, "loss": 0.0008, "step": 5196 }, { "epoch": 7.080381471389646, "grad_norm": 1.842413419727902, "learning_rate": 4.148240150390987e-06, "loss": 0.004, "step": 5197 }, { "epoch": 7.081743869209809, "grad_norm": 0.9023236750977783, "learning_rate": 4.144662215805426e-06, "loss": 0.0011, "step": 5198 }, { "epoch": 7.083106267029973, "grad_norm": 0.6374603216095288, "learning_rate": 4.141085421505636e-06, "loss": 0.0015, "step": 5199 }, { "epoch": 7.084468664850136, "grad_norm": 0.8393856220407685, "learning_rate": 4.137509768188176e-06, "loss": 0.0095, "step": 5200 }, { "epoch": 7.0858310626703, "grad_norm": 0.9935137178827534, "learning_rate": 4.1339352565493825e-06, "loss": 0.0145, "step": 5201 }, { "epoch": 7.087193460490464, "grad_norm": 2.338180047837276, "learning_rate": 4.130361887285359e-06, "loss": 0.0055, "step": 5202 }, { "epoch": 7.0885558583106265, "grad_norm": 0.3947752225628683, "learning_rate": 4.126789661092002e-06, "loss": 0.0006, "step": 5203 }, { "epoch": 7.08991825613079, "grad_norm": 0.2369303863283591, "learning_rate": 4.12321857866497e-06, "loss": 0.0022, "step": 5204 }, { "epoch": 7.091280653950954, "grad_norm": 0.8401054721600928, "learning_rate": 4.1196486406997125e-06, "loss": 0.009, "step": 5205 }, { "epoch": 7.092643051771117, "grad_norm": 0.45426203482001815, "learning_rate": 4.116079847891444e-06, "loss": 0.0012, "step": 5206 }, { "epoch": 7.094005449591281, "grad_norm": 0.33935034273956033, "learning_rate": 4.1125122009351636e-06, "loss": 0.0133, "step": 5207 }, { "epoch": 7.0953678474114446, "grad_norm": 0.5667193646331297, "learning_rate": 4.108945700525651e-06, "loss": 0.0038, "step": 5208 }, { "epoch": 7.0967302452316074, "grad_norm": 0.34722099589684147, "learning_rate": 4.105380347357448e-06, "loss": 0.0016, "step": 5209 }, { "epoch": 7.098092643051771, "grad_norm": 0.3980459672894612, "learning_rate": 4.1018161421248905e-06, "loss": 0.0013, "step": 5210 }, { "epoch": 7.099455040871935, "grad_norm": 0.7780538288667909, "learning_rate": 4.098253085522074e-06, "loss": 0.0174, "step": 5211 }, { "epoch": 7.100817438692098, "grad_norm": 0.44106650721341534, "learning_rate": 4.094691178242883e-06, "loss": 0.0007, "step": 5212 }, { "epoch": 7.102179836512262, "grad_norm": 0.2893829517789899, "learning_rate": 4.091130420980978e-06, "loss": 0.008, "step": 5213 }, { "epoch": 7.1035422343324255, "grad_norm": 1.4099171005503062, "learning_rate": 4.0875708144297835e-06, "loss": 0.005, "step": 5214 }, { "epoch": 7.104904632152588, "grad_norm": 0.41753778394962426, "learning_rate": 4.084012359282514e-06, "loss": 0.0004, "step": 5215 }, { "epoch": 7.106267029972752, "grad_norm": 0.3726979524218341, "learning_rate": 4.080455056232148e-06, "loss": 0.0075, "step": 5216 }, { "epoch": 7.107629427792916, "grad_norm": 0.29520070476564453, "learning_rate": 4.0768989059714515e-06, "loss": 0.001, "step": 5217 }, { "epoch": 7.108991825613079, "grad_norm": 2.1982928965705066, "learning_rate": 4.073343909192954e-06, "loss": 0.0064, "step": 5218 }, { "epoch": 7.110354223433243, "grad_norm": 2.944997974593434, "learning_rate": 4.069790066588966e-06, "loss": 0.0095, "step": 5219 }, { "epoch": 7.111716621253406, "grad_norm": 0.3804970650992207, "learning_rate": 4.0662373788515815e-06, "loss": 0.0008, "step": 5220 }, { "epoch": 7.113079019073569, "grad_norm": 0.6352771336126112, "learning_rate": 4.0626858466726515e-06, "loss": 0.0008, "step": 5221 }, { "epoch": 7.114441416893733, "grad_norm": 1.550611359508187, "learning_rate": 4.05913547074382e-06, "loss": 0.005, "step": 5222 }, { "epoch": 7.115803814713897, "grad_norm": 0.9020731102494878, "learning_rate": 4.055586251756489e-06, "loss": 0.0114, "step": 5223 }, { "epoch": 7.11716621253406, "grad_norm": 0.1593639592166208, "learning_rate": 4.05203819040185e-06, "loss": 0.0077, "step": 5224 }, { "epoch": 7.118528610354224, "grad_norm": 0.43707710764046453, "learning_rate": 4.048491287370864e-06, "loss": 0.002, "step": 5225 }, { "epoch": 7.1198910081743865, "grad_norm": 1.281648949356482, "learning_rate": 4.044945543354259e-06, "loss": 0.0115, "step": 5226 }, { "epoch": 7.12125340599455, "grad_norm": 0.5558480119980029, "learning_rate": 4.041400959042553e-06, "loss": 0.0011, "step": 5227 }, { "epoch": 7.122615803814714, "grad_norm": 0.7681641505632961, "learning_rate": 4.037857535126019e-06, "loss": 0.0015, "step": 5228 }, { "epoch": 7.123978201634877, "grad_norm": 2.206695438134788, "learning_rate": 4.0343152722947175e-06, "loss": 0.0086, "step": 5229 }, { "epoch": 7.125340599455041, "grad_norm": 0.9194653974157808, "learning_rate": 4.0307741712384855e-06, "loss": 0.0015, "step": 5230 }, { "epoch": 7.1267029972752045, "grad_norm": 1.5190759720860367, "learning_rate": 4.027234232646916e-06, "loss": 0.0099, "step": 5231 }, { "epoch": 7.128065395095367, "grad_norm": 0.5518207737532791, "learning_rate": 4.023695457209398e-06, "loss": 0.0008, "step": 5232 }, { "epoch": 7.129427792915531, "grad_norm": 0.5892073542678967, "learning_rate": 4.020157845615075e-06, "loss": 0.0005, "step": 5233 }, { "epoch": 7.130790190735695, "grad_norm": 0.28956021891370326, "learning_rate": 4.016621398552877e-06, "loss": 0.0008, "step": 5234 }, { "epoch": 7.132152588555858, "grad_norm": 0.3085976016352024, "learning_rate": 4.013086116711497e-06, "loss": 0.0079, "step": 5235 }, { "epoch": 7.133514986376022, "grad_norm": 1.3298731245780555, "learning_rate": 4.009552000779411e-06, "loss": 0.0236, "step": 5236 }, { "epoch": 7.1348773841961854, "grad_norm": 1.008299370061912, "learning_rate": 4.0060190514448634e-06, "loss": 0.0008, "step": 5237 }, { "epoch": 7.136239782016348, "grad_norm": 0.4888353307486064, "learning_rate": 4.002487269395867e-06, "loss": 0.0009, "step": 5238 }, { "epoch": 7.137602179836512, "grad_norm": 0.3655364940071582, "learning_rate": 3.998956655320218e-06, "loss": 0.0074, "step": 5239 }, { "epoch": 7.138964577656676, "grad_norm": 0.8167715527212651, "learning_rate": 3.9954272099054696e-06, "loss": 0.0141, "step": 5240 }, { "epoch": 7.140326975476839, "grad_norm": 1.2784707780339772, "learning_rate": 3.991898933838962e-06, "loss": 0.0009, "step": 5241 }, { "epoch": 7.141689373297003, "grad_norm": 1.037431592185904, "learning_rate": 3.9883718278078045e-06, "loss": 0.0014, "step": 5242 }, { "epoch": 7.143051771117166, "grad_norm": 0.5186146282447573, "learning_rate": 3.984845892498869e-06, "loss": 0.0022, "step": 5243 }, { "epoch": 7.144414168937329, "grad_norm": 2.307999125642953, "learning_rate": 3.981321128598815e-06, "loss": 0.0078, "step": 5244 }, { "epoch": 7.145776566757493, "grad_norm": 0.743870963635081, "learning_rate": 3.977797536794057e-06, "loss": 0.0009, "step": 5245 }, { "epoch": 7.147138964577657, "grad_norm": 1.2423586000240463, "learning_rate": 3.974275117770798e-06, "loss": 0.0048, "step": 5246 }, { "epoch": 7.14850136239782, "grad_norm": 1.2984123542755084, "learning_rate": 3.970753872214993e-06, "loss": 0.0008, "step": 5247 }, { "epoch": 7.1498637602179835, "grad_norm": 0.9598274267799316, "learning_rate": 3.967233800812393e-06, "loss": 0.0035, "step": 5248 }, { "epoch": 7.151226158038147, "grad_norm": 0.3785254627282493, "learning_rate": 3.963714904248501e-06, "loss": 0.0068, "step": 5249 }, { "epoch": 7.15258855585831, "grad_norm": 0.36526747389013514, "learning_rate": 3.960197183208594e-06, "loss": 0.0025, "step": 5250 }, { "epoch": 7.153950953678474, "grad_norm": 0.9839875556527322, "learning_rate": 3.95668063837773e-06, "loss": 0.0071, "step": 5251 }, { "epoch": 7.155313351498638, "grad_norm": 0.7688932042389459, "learning_rate": 3.953165270440721e-06, "loss": 0.0022, "step": 5252 }, { "epoch": 7.156675749318801, "grad_norm": 0.557922698433873, "learning_rate": 3.949651080082174e-06, "loss": 0.0023, "step": 5253 }, { "epoch": 7.1580381471389645, "grad_norm": 0.4207595060066831, "learning_rate": 3.946138067986446e-06, "loss": 0.0023, "step": 5254 }, { "epoch": 7.159400544959128, "grad_norm": 1.106182052747032, "learning_rate": 3.9426262348376676e-06, "loss": 0.0025, "step": 5255 }, { "epoch": 7.160762942779291, "grad_norm": 0.5566050513089004, "learning_rate": 3.9391155813197505e-06, "loss": 0.0089, "step": 5256 }, { "epoch": 7.162125340599455, "grad_norm": 2.2237913159544984, "learning_rate": 3.935606108116362e-06, "loss": 0.0156, "step": 5257 }, { "epoch": 7.163487738419619, "grad_norm": 0.8513906152938088, "learning_rate": 3.932097815910954e-06, "loss": 0.0087, "step": 5258 }, { "epoch": 7.164850136239782, "grad_norm": 1.6585111080005772, "learning_rate": 3.928590705386737e-06, "loss": 0.0066, "step": 5259 }, { "epoch": 7.166212534059945, "grad_norm": 1.4195864064319743, "learning_rate": 3.925084777226703e-06, "loss": 0.0378, "step": 5260 }, { "epoch": 7.167574931880109, "grad_norm": 1.538615200517548, "learning_rate": 3.921580032113602e-06, "loss": 0.0154, "step": 5261 }, { "epoch": 7.168937329700272, "grad_norm": 0.981237649745901, "learning_rate": 3.9180764707299546e-06, "loss": 0.0016, "step": 5262 }, { "epoch": 7.170299727520436, "grad_norm": 2.1904339550573604, "learning_rate": 3.914574093758064e-06, "loss": 0.0181, "step": 5263 }, { "epoch": 7.1716621253406, "grad_norm": 1.4324923921547388, "learning_rate": 3.911072901879979e-06, "loss": 0.0141, "step": 5264 }, { "epoch": 7.1730245231607626, "grad_norm": 1.3364270929462934, "learning_rate": 3.907572895777549e-06, "loss": 0.0012, "step": 5265 }, { "epoch": 7.174386920980926, "grad_norm": 0.6672163840170272, "learning_rate": 3.904074076132368e-06, "loss": 0.0237, "step": 5266 }, { "epoch": 7.17574931880109, "grad_norm": 0.3377682828942112, "learning_rate": 3.900576443625803e-06, "loss": 0.0017, "step": 5267 }, { "epoch": 7.177111716621253, "grad_norm": 0.7005731431836223, "learning_rate": 3.8970799989389995e-06, "loss": 0.0032, "step": 5268 }, { "epoch": 7.178474114441417, "grad_norm": 4.698054395815493, "learning_rate": 3.893584742752859e-06, "loss": 0.0189, "step": 5269 }, { "epoch": 7.179836512261581, "grad_norm": 0.45495900294355185, "learning_rate": 3.890090675748062e-06, "loss": 0.0199, "step": 5270 }, { "epoch": 7.1811989100817435, "grad_norm": 0.2934952271588441, "learning_rate": 3.886597798605052e-06, "loss": 0.0009, "step": 5271 }, { "epoch": 7.182561307901907, "grad_norm": 2.4434082794708503, "learning_rate": 3.8831061120040466e-06, "loss": 0.0024, "step": 5272 }, { "epoch": 7.183923705722071, "grad_norm": 0.8866154228433998, "learning_rate": 3.879615616625024e-06, "loss": 0.001, "step": 5273 }, { "epoch": 7.185286103542234, "grad_norm": 0.6753913164321275, "learning_rate": 3.87612631314773e-06, "loss": 0.0062, "step": 5274 }, { "epoch": 7.186648501362398, "grad_norm": 0.8496270359860822, "learning_rate": 3.872638202251685e-06, "loss": 0.0013, "step": 5275 }, { "epoch": 7.1880108991825615, "grad_norm": 1.1563550021338982, "learning_rate": 3.869151284616174e-06, "loss": 0.0018, "step": 5276 }, { "epoch": 7.189373297002724, "grad_norm": 0.49435314624556825, "learning_rate": 3.8656655609202536e-06, "loss": 0.0005, "step": 5277 }, { "epoch": 7.190735694822888, "grad_norm": 1.82913254547153, "learning_rate": 3.86218103184274e-06, "loss": 0.0033, "step": 5278 }, { "epoch": 7.192098092643052, "grad_norm": 0.5396935121699326, "learning_rate": 3.858697698062217e-06, "loss": 0.0075, "step": 5279 }, { "epoch": 7.193460490463215, "grad_norm": 2.414355786043634, "learning_rate": 3.855215560257046e-06, "loss": 0.0022, "step": 5280 }, { "epoch": 7.194822888283379, "grad_norm": 0.1665600781849381, "learning_rate": 3.85173461910534e-06, "loss": 0.0006, "step": 5281 }, { "epoch": 7.1961852861035425, "grad_norm": 0.7067187877048257, "learning_rate": 3.848254875285e-06, "loss": 0.0007, "step": 5282 }, { "epoch": 7.197547683923705, "grad_norm": 0.7764498882459034, "learning_rate": 3.844776329473672e-06, "loss": 0.0023, "step": 5283 }, { "epoch": 7.198910081743869, "grad_norm": 1.818387404362438, "learning_rate": 3.841298982348786e-06, "loss": 0.0137, "step": 5284 }, { "epoch": 7.200272479564033, "grad_norm": 1.331912059650445, "learning_rate": 3.8378228345875244e-06, "loss": 0.016, "step": 5285 }, { "epoch": 7.201634877384196, "grad_norm": 1.5682206562577352, "learning_rate": 3.834347886866843e-06, "loss": 0.0118, "step": 5286 }, { "epoch": 7.20299727520436, "grad_norm": 3.8324566591504112, "learning_rate": 3.830874139863463e-06, "loss": 0.013, "step": 5287 }, { "epoch": 7.204359673024523, "grad_norm": 0.4268612493388812, "learning_rate": 3.827401594253875e-06, "loss": 0.0071, "step": 5288 }, { "epoch": 7.205722070844686, "grad_norm": 2.4528238804743996, "learning_rate": 3.823930250714335e-06, "loss": 0.0274, "step": 5289 }, { "epoch": 7.20708446866485, "grad_norm": 1.4366243351655978, "learning_rate": 3.820460109920856e-06, "loss": 0.009, "step": 5290 }, { "epoch": 7.208446866485014, "grad_norm": 0.5923686517112812, "learning_rate": 3.81699117254923e-06, "loss": 0.0061, "step": 5291 }, { "epoch": 7.209809264305177, "grad_norm": 1.3499517048292908, "learning_rate": 3.8135234392750053e-06, "loss": 0.0112, "step": 5292 }, { "epoch": 7.2111716621253406, "grad_norm": 0.2970532975512986, "learning_rate": 3.8100569107734918e-06, "loss": 0.0084, "step": 5293 }, { "epoch": 7.212534059945504, "grad_norm": 2.078549152878026, "learning_rate": 3.8065915877197847e-06, "loss": 0.0028, "step": 5294 }, { "epoch": 7.213896457765667, "grad_norm": 0.5456931484017759, "learning_rate": 3.803127470788721e-06, "loss": 0.0268, "step": 5295 }, { "epoch": 7.215258855585831, "grad_norm": 2.5197248203903215, "learning_rate": 3.79966456065492e-06, "loss": 0.0236, "step": 5296 }, { "epoch": 7.216621253405995, "grad_norm": 1.3036652980959094, "learning_rate": 3.7962028579927555e-06, "loss": 0.012, "step": 5297 }, { "epoch": 7.217983651226158, "grad_norm": 1.1673176199088384, "learning_rate": 3.7927423634763673e-06, "loss": 0.001, "step": 5298 }, { "epoch": 7.2193460490463215, "grad_norm": 1.3199656410742175, "learning_rate": 3.789283077779664e-06, "loss": 0.0027, "step": 5299 }, { "epoch": 7.220708446866485, "grad_norm": 0.14799897995437533, "learning_rate": 3.7858250015763176e-06, "loss": 0.0005, "step": 5300 }, { "epoch": 7.222070844686648, "grad_norm": 0.876341701815359, "learning_rate": 3.782368135539769e-06, "loss": 0.0007, "step": 5301 }, { "epoch": 7.223433242506812, "grad_norm": 1.3403731900746674, "learning_rate": 3.7789124803432096e-06, "loss": 0.008, "step": 5302 }, { "epoch": 7.224795640326976, "grad_norm": 0.49693673746505584, "learning_rate": 3.7754580366596116e-06, "loss": 0.0008, "step": 5303 }, { "epoch": 7.226158038147139, "grad_norm": 0.4680825185410753, "learning_rate": 3.7720048051616973e-06, "loss": 0.0044, "step": 5304 }, { "epoch": 7.227520435967302, "grad_norm": 1.3453312361880398, "learning_rate": 3.768552786521962e-06, "loss": 0.0015, "step": 5305 }, { "epoch": 7.228882833787466, "grad_norm": 0.6198191640999844, "learning_rate": 3.7651019814126656e-06, "loss": 0.0149, "step": 5306 }, { "epoch": 7.230245231607629, "grad_norm": 1.0010893173486484, "learning_rate": 3.761652390505821e-06, "loss": 0.0011, "step": 5307 }, { "epoch": 7.231607629427793, "grad_norm": 1.343462529546585, "learning_rate": 3.758204014473219e-06, "loss": 0.0031, "step": 5308 }, { "epoch": 7.232970027247957, "grad_norm": 0.9643640409609598, "learning_rate": 3.754756853986402e-06, "loss": 0.0022, "step": 5309 }, { "epoch": 7.23433242506812, "grad_norm": 0.7599436471646468, "learning_rate": 3.7513109097166755e-06, "loss": 0.0174, "step": 5310 }, { "epoch": 7.235694822888283, "grad_norm": 1.9558200197926654, "learning_rate": 3.747866182335125e-06, "loss": 0.0137, "step": 5311 }, { "epoch": 7.237057220708447, "grad_norm": 1.8675450867085597, "learning_rate": 3.7444226725125767e-06, "loss": 0.0182, "step": 5312 }, { "epoch": 7.23841961852861, "grad_norm": 1.081340130267187, "learning_rate": 3.7409803809196366e-06, "loss": 0.0026, "step": 5313 }, { "epoch": 7.239782016348774, "grad_norm": 1.0646493041614506, "learning_rate": 3.7375393082266597e-06, "loss": 0.0025, "step": 5314 }, { "epoch": 7.241144414168938, "grad_norm": 1.1275744183140703, "learning_rate": 3.734099455103779e-06, "loss": 0.0129, "step": 5315 }, { "epoch": 7.2425068119891005, "grad_norm": 1.8784599134035562, "learning_rate": 3.730660822220873e-06, "loss": 0.0201, "step": 5316 }, { "epoch": 7.243869209809264, "grad_norm": 1.2283494328839595, "learning_rate": 3.727223410247596e-06, "loss": 0.033, "step": 5317 }, { "epoch": 7.245231607629428, "grad_norm": 1.59586498805587, "learning_rate": 3.723787219853363e-06, "loss": 0.001, "step": 5318 }, { "epoch": 7.246594005449591, "grad_norm": 1.5227487107912236, "learning_rate": 3.72035225170734e-06, "loss": 0.002, "step": 5319 }, { "epoch": 7.247956403269755, "grad_norm": 7.087908350935087, "learning_rate": 3.7169185064784706e-06, "loss": 0.0112, "step": 5320 }, { "epoch": 7.2493188010899186, "grad_norm": 1.8859108663362014, "learning_rate": 3.7134859848354486e-06, "loss": 0.0041, "step": 5321 }, { "epoch": 7.2506811989100814, "grad_norm": 2.286533384799582, "learning_rate": 3.7100546874467268e-06, "loss": 0.011, "step": 5322 }, { "epoch": 7.252043596730245, "grad_norm": 1.0067375778189267, "learning_rate": 3.7066246149805397e-06, "loss": 0.0018, "step": 5323 }, { "epoch": 7.253405994550409, "grad_norm": 2.482585375523634, "learning_rate": 3.7031957681048604e-06, "loss": 0.0173, "step": 5324 }, { "epoch": 7.254768392370572, "grad_norm": 0.8360867383686427, "learning_rate": 3.699768147487438e-06, "loss": 0.0006, "step": 5325 }, { "epoch": 7.256130790190736, "grad_norm": 1.4313656428295343, "learning_rate": 3.696341753795771e-06, "loss": 0.0036, "step": 5326 }, { "epoch": 7.2574931880108995, "grad_norm": 1.292313887656688, "learning_rate": 3.692916587697134e-06, "loss": 0.0054, "step": 5327 }, { "epoch": 7.258855585831062, "grad_norm": 1.4392846096881653, "learning_rate": 3.689492649858545e-06, "loss": 0.009, "step": 5328 }, { "epoch": 7.260217983651226, "grad_norm": 1.2921911260611119, "learning_rate": 3.686069940946796e-06, "loss": 0.0007, "step": 5329 }, { "epoch": 7.26158038147139, "grad_norm": 0.8315671387127649, "learning_rate": 3.6826484616284388e-06, "loss": 0.0066, "step": 5330 }, { "epoch": 7.262942779291553, "grad_norm": 0.6477750171533382, "learning_rate": 3.6792282125697755e-06, "loss": 0.0179, "step": 5331 }, { "epoch": 7.264305177111717, "grad_norm": 1.9819392350067906, "learning_rate": 3.6758091944368824e-06, "loss": 0.0345, "step": 5332 }, { "epoch": 7.26566757493188, "grad_norm": 0.3534219855113762, "learning_rate": 3.6723914078955824e-06, "loss": 0.0024, "step": 5333 }, { "epoch": 7.267029972752043, "grad_norm": 0.4554034336303901, "learning_rate": 3.668974853611469e-06, "loss": 0.0008, "step": 5334 }, { "epoch": 7.268392370572207, "grad_norm": 0.3579009185637327, "learning_rate": 3.6655595322498936e-06, "loss": 0.0009, "step": 5335 }, { "epoch": 7.269754768392371, "grad_norm": 0.48182887511038697, "learning_rate": 3.662145444475963e-06, "loss": 0.0146, "step": 5336 }, { "epoch": 7.271117166212534, "grad_norm": 0.93932615280902, "learning_rate": 3.6587325909545503e-06, "loss": 0.0104, "step": 5337 }, { "epoch": 7.272479564032698, "grad_norm": 0.31464443887679555, "learning_rate": 3.655320972350278e-06, "loss": 0.0016, "step": 5338 }, { "epoch": 7.273841961852861, "grad_norm": 0.8766404677119666, "learning_rate": 3.65191058932754e-06, "loss": 0.0348, "step": 5339 }, { "epoch": 7.275204359673024, "grad_norm": 0.8464749519614563, "learning_rate": 3.6485014425504874e-06, "loss": 0.0011, "step": 5340 }, { "epoch": 7.276566757493188, "grad_norm": 1.2844542061997817, "learning_rate": 3.6450935326830193e-06, "loss": 0.0084, "step": 5341 }, { "epoch": 7.277929155313352, "grad_norm": 1.0757837883845025, "learning_rate": 3.64168686038881e-06, "loss": 0.0218, "step": 5342 }, { "epoch": 7.279291553133515, "grad_norm": 1.5484652980562266, "learning_rate": 3.6382814263312773e-06, "loss": 0.0098, "step": 5343 }, { "epoch": 7.2806539509536785, "grad_norm": 1.6069956802930085, "learning_rate": 3.6348772311736136e-06, "loss": 0.0208, "step": 5344 }, { "epoch": 7.282016348773842, "grad_norm": 0.6421383938566105, "learning_rate": 3.6314742755787537e-06, "loss": 0.0113, "step": 5345 }, { "epoch": 7.283378746594005, "grad_norm": 1.172497108188579, "learning_rate": 3.6280725602094035e-06, "loss": 0.0025, "step": 5346 }, { "epoch": 7.284741144414169, "grad_norm": 0.9253194485101532, "learning_rate": 3.624672085728027e-06, "loss": 0.0078, "step": 5347 }, { "epoch": 7.286103542234333, "grad_norm": 0.2761816589049386, "learning_rate": 3.6212728527968345e-06, "loss": 0.0007, "step": 5348 }, { "epoch": 7.287465940054496, "grad_norm": 0.9154136697591371, "learning_rate": 3.6178748620778103e-06, "loss": 0.0017, "step": 5349 }, { "epoch": 7.2888283378746594, "grad_norm": 0.6828524774367621, "learning_rate": 3.6144781142326824e-06, "loss": 0.0016, "step": 5350 }, { "epoch": 7.290190735694823, "grad_norm": 0.5442492423440554, "learning_rate": 3.6110826099229457e-06, "loss": 0.0085, "step": 5351 }, { "epoch": 7.291553133514986, "grad_norm": 1.4340308770899661, "learning_rate": 3.607688349809856e-06, "loss": 0.0241, "step": 5352 }, { "epoch": 7.29291553133515, "grad_norm": 1.3213107943375928, "learning_rate": 3.604295334554414e-06, "loss": 0.0171, "step": 5353 }, { "epoch": 7.294277929155314, "grad_norm": 1.426560620002176, "learning_rate": 3.600903564817392e-06, "loss": 0.0028, "step": 5354 }, { "epoch": 7.295640326975477, "grad_norm": 0.904914479538903, "learning_rate": 3.597513041259306e-06, "loss": 0.0077, "step": 5355 }, { "epoch": 7.29700272479564, "grad_norm": 0.8319362066435912, "learning_rate": 3.5941237645404437e-06, "loss": 0.0008, "step": 5356 }, { "epoch": 7.298365122615804, "grad_norm": 1.6185162017791477, "learning_rate": 3.5907357353208374e-06, "loss": 0.0078, "step": 5357 }, { "epoch": 7.299727520435967, "grad_norm": 0.9262979139111781, "learning_rate": 3.5873489542602833e-06, "loss": 0.0032, "step": 5358 }, { "epoch": 7.301089918256131, "grad_norm": 1.39497282947084, "learning_rate": 3.5839634220183383e-06, "loss": 0.018, "step": 5359 }, { "epoch": 7.302452316076295, "grad_norm": 0.7969988290673189, "learning_rate": 3.5805791392543033e-06, "loss": 0.0006, "step": 5360 }, { "epoch": 7.3038147138964575, "grad_norm": 1.7682910934007963, "learning_rate": 3.577196106627251e-06, "loss": 0.0057, "step": 5361 }, { "epoch": 7.305177111716621, "grad_norm": 3.345341964874396, "learning_rate": 3.5738143247959944e-06, "loss": 0.0148, "step": 5362 }, { "epoch": 7.306539509536785, "grad_norm": 2.7046671972311715, "learning_rate": 3.570433794419117e-06, "loss": 0.0099, "step": 5363 }, { "epoch": 7.307901907356948, "grad_norm": 1.4035657473662773, "learning_rate": 3.5670545161549565e-06, "loss": 0.0114, "step": 5364 }, { "epoch": 7.309264305177112, "grad_norm": 0.5599770147690322, "learning_rate": 3.563676490661596e-06, "loss": 0.0014, "step": 5365 }, { "epoch": 7.310626702997276, "grad_norm": 1.1211715989860556, "learning_rate": 3.560299718596889e-06, "loss": 0.003, "step": 5366 }, { "epoch": 7.3119891008174385, "grad_norm": 0.5820066665624573, "learning_rate": 3.5569242006184314e-06, "loss": 0.0006, "step": 5367 }, { "epoch": 7.313351498637602, "grad_norm": 0.6762255683812395, "learning_rate": 3.553549937383586e-06, "loss": 0.0085, "step": 5368 }, { "epoch": 7.314713896457766, "grad_norm": 0.6761720207940639, "learning_rate": 3.550176929549468e-06, "loss": 0.0028, "step": 5369 }, { "epoch": 7.316076294277929, "grad_norm": 1.7406216089414752, "learning_rate": 3.5468051777729427e-06, "loss": 0.0038, "step": 5370 }, { "epoch": 7.317438692098093, "grad_norm": 0.4629247592442975, "learning_rate": 3.5434346827106404e-06, "loss": 0.0081, "step": 5371 }, { "epoch": 7.3188010899182565, "grad_norm": 0.40123635777127137, "learning_rate": 3.540065445018933e-06, "loss": 0.008, "step": 5372 }, { "epoch": 7.320163487738419, "grad_norm": 5.5928684946039064, "learning_rate": 3.5366974653539653e-06, "loss": 0.0043, "step": 5373 }, { "epoch": 7.321525885558583, "grad_norm": 0.9967678734362556, "learning_rate": 3.5333307443716203e-06, "loss": 0.0088, "step": 5374 }, { "epoch": 7.322888283378747, "grad_norm": 0.49871791133073695, "learning_rate": 3.5299652827275455e-06, "loss": 0.0005, "step": 5375 }, { "epoch": 7.32425068119891, "grad_norm": 0.16708760300246184, "learning_rate": 3.5266010810771446e-06, "loss": 0.0005, "step": 5376 }, { "epoch": 7.325613079019074, "grad_norm": 0.0827754859174579, "learning_rate": 3.5232381400755645e-06, "loss": 0.0079, "step": 5377 }, { "epoch": 7.3269754768392374, "grad_norm": 0.6006271906865591, "learning_rate": 3.5198764603777235e-06, "loss": 0.0005, "step": 5378 }, { "epoch": 7.3283378746594, "grad_norm": 0.6194107105160146, "learning_rate": 3.5165160426382763e-06, "loss": 0.0006, "step": 5379 }, { "epoch": 7.329700272479564, "grad_norm": 0.9492415323493208, "learning_rate": 3.513156887511644e-06, "loss": 0.0019, "step": 5380 }, { "epoch": 7.331062670299728, "grad_norm": 0.9750480815749515, "learning_rate": 3.5097989956520016e-06, "loss": 0.0062, "step": 5381 }, { "epoch": 7.332425068119891, "grad_norm": 0.8547278928540646, "learning_rate": 3.5064423677132696e-06, "loss": 0.0087, "step": 5382 }, { "epoch": 7.333787465940055, "grad_norm": 0.39036503625480257, "learning_rate": 3.5030870043491337e-06, "loss": 0.0005, "step": 5383 }, { "epoch": 7.335149863760218, "grad_norm": 1.0553047826406476, "learning_rate": 3.49973290621302e-06, "loss": 0.0093, "step": 5384 }, { "epoch": 7.336512261580381, "grad_norm": 3.3553161609346036, "learning_rate": 3.496380073958123e-06, "loss": 0.0178, "step": 5385 }, { "epoch": 7.337874659400545, "grad_norm": 0.950960366034231, "learning_rate": 3.493028508237375e-06, "loss": 0.0007, "step": 5386 }, { "epoch": 7.339237057220709, "grad_norm": 0.35954970685743387, "learning_rate": 3.4896782097034755e-06, "loss": 0.002, "step": 5387 }, { "epoch": 7.340599455040872, "grad_norm": 0.8854223750231717, "learning_rate": 3.4863291790088727e-06, "loss": 0.0019, "step": 5388 }, { "epoch": 7.3419618528610355, "grad_norm": 1.0786510546469146, "learning_rate": 3.4829814168057607e-06, "loss": 0.0076, "step": 5389 }, { "epoch": 7.343324250681199, "grad_norm": 2.3180427987510175, "learning_rate": 3.4796349237461003e-06, "loss": 0.0036, "step": 5390 }, { "epoch": 7.344686648501362, "grad_norm": 0.2896620630427038, "learning_rate": 3.4762897004815886e-06, "loss": 0.0076, "step": 5391 }, { "epoch": 7.346049046321526, "grad_norm": 1.4396387669903454, "learning_rate": 3.4729457476636896e-06, "loss": 0.0142, "step": 5392 }, { "epoch": 7.34741144414169, "grad_norm": 0.303203514793494, "learning_rate": 3.4696030659436174e-06, "loss": 0.0071, "step": 5393 }, { "epoch": 7.348773841961853, "grad_norm": 0.686519203939781, "learning_rate": 3.4662616559723283e-06, "loss": 0.0008, "step": 5394 }, { "epoch": 7.3501362397820165, "grad_norm": 0.8923461509450847, "learning_rate": 3.462921518400546e-06, "loss": 0.0019, "step": 5395 }, { "epoch": 7.35149863760218, "grad_norm": 1.2658422395337128, "learning_rate": 3.4595826538787314e-06, "loss": 0.0043, "step": 5396 }, { "epoch": 7.352861035422343, "grad_norm": 2.2336306307558154, "learning_rate": 3.456245063057109e-06, "loss": 0.0021, "step": 5397 }, { "epoch": 7.354223433242507, "grad_norm": 0.5998364977521001, "learning_rate": 3.4529087465856536e-06, "loss": 0.0083, "step": 5398 }, { "epoch": 7.355585831062671, "grad_norm": 0.6951212153783084, "learning_rate": 3.449573705114082e-06, "loss": 0.0081, "step": 5399 }, { "epoch": 7.356948228882834, "grad_norm": 1.909001555541436, "learning_rate": 3.446239939291879e-06, "loss": 0.0182, "step": 5400 }, { "epoch": 7.358310626702997, "grad_norm": 0.23186441871802393, "learning_rate": 3.4429074497682635e-06, "loss": 0.007, "step": 5401 }, { "epoch": 7.359673024523161, "grad_norm": 0.826550077097191, "learning_rate": 3.4395762371922203e-06, "loss": 0.0149, "step": 5402 }, { "epoch": 7.361035422343324, "grad_norm": 0.755025821722117, "learning_rate": 3.436246302212476e-06, "loss": 0.0019, "step": 5403 }, { "epoch": 7.362397820163488, "grad_norm": 0.9187151854672224, "learning_rate": 3.4329176454775116e-06, "loss": 0.0072, "step": 5404 }, { "epoch": 7.363760217983652, "grad_norm": 1.1074376631650067, "learning_rate": 3.4295902676355654e-06, "loss": 0.0017, "step": 5405 }, { "epoch": 7.3651226158038146, "grad_norm": 1.910414802371789, "learning_rate": 3.4262641693346132e-06, "loss": 0.0055, "step": 5406 }, { "epoch": 7.366485013623978, "grad_norm": 0.3588035059715621, "learning_rate": 3.422939351222396e-06, "loss": 0.0009, "step": 5407 }, { "epoch": 7.367847411444142, "grad_norm": 0.2744210200069188, "learning_rate": 3.419615813946392e-06, "loss": 0.0078, "step": 5408 }, { "epoch": 7.369209809264305, "grad_norm": 0.4017291452088479, "learning_rate": 3.41629355815384e-06, "loss": 0.0012, "step": 5409 }, { "epoch": 7.370572207084469, "grad_norm": 0.04299738438920392, "learning_rate": 3.4129725844917304e-06, "loss": 0.0003, "step": 5410 }, { "epoch": 7.371934604904633, "grad_norm": 0.34195407249547655, "learning_rate": 3.409652893606791e-06, "loss": 0.0077, "step": 5411 }, { "epoch": 7.3732970027247955, "grad_norm": 0.9625716825410932, "learning_rate": 3.406334486145515e-06, "loss": 0.0032, "step": 5412 }, { "epoch": 7.374659400544959, "grad_norm": 1.9967700446277123, "learning_rate": 3.403017362754133e-06, "loss": 0.033, "step": 5413 }, { "epoch": 7.376021798365123, "grad_norm": 0.7492924534307228, "learning_rate": 3.3997015240786346e-06, "loss": 0.0098, "step": 5414 }, { "epoch": 7.377384196185286, "grad_norm": 0.6119609864577862, "learning_rate": 3.396386970764759e-06, "loss": 0.0027, "step": 5415 }, { "epoch": 7.37874659400545, "grad_norm": 0.967318880593442, "learning_rate": 3.3930737034579862e-06, "loss": 0.0039, "step": 5416 }, { "epoch": 7.3801089918256135, "grad_norm": 3.1353523949213393, "learning_rate": 3.3897617228035573e-06, "loss": 0.0133, "step": 5417 }, { "epoch": 7.381471389645776, "grad_norm": 0.36511607572485777, "learning_rate": 3.3864510294464514e-06, "loss": 0.0006, "step": 5418 }, { "epoch": 7.38283378746594, "grad_norm": 0.5255543486098958, "learning_rate": 3.3831416240314085e-06, "loss": 0.0018, "step": 5419 }, { "epoch": 7.384196185286104, "grad_norm": 1.781782005933912, "learning_rate": 3.3798335072029033e-06, "loss": 0.0024, "step": 5420 }, { "epoch": 7.385558583106267, "grad_norm": 1.2275065720173448, "learning_rate": 3.3765266796051797e-06, "loss": 0.0154, "step": 5421 }, { "epoch": 7.386920980926431, "grad_norm": 1.844829452771544, "learning_rate": 3.3732211418822137e-06, "loss": 0.0065, "step": 5422 }, { "epoch": 7.3882833787465945, "grad_norm": 0.11081639130343314, "learning_rate": 3.369916894677733e-06, "loss": 0.0006, "step": 5423 }, { "epoch": 7.389645776566757, "grad_norm": 1.6544245843686647, "learning_rate": 3.3666139386352205e-06, "loss": 0.0109, "step": 5424 }, { "epoch": 7.391008174386921, "grad_norm": 0.2099100485502123, "learning_rate": 3.3633122743978996e-06, "loss": 0.0028, "step": 5425 }, { "epoch": 7.392370572207085, "grad_norm": 0.3841004140463261, "learning_rate": 3.3600119026087475e-06, "loss": 0.0005, "step": 5426 }, { "epoch": 7.393732970027248, "grad_norm": 0.42708779618550424, "learning_rate": 3.356712823910493e-06, "loss": 0.0007, "step": 5427 }, { "epoch": 7.395095367847412, "grad_norm": 0.716018580851642, "learning_rate": 3.353415038945601e-06, "loss": 0.0059, "step": 5428 }, { "epoch": 7.396457765667575, "grad_norm": 0.19649095212299694, "learning_rate": 3.350118548356299e-06, "loss": 0.0007, "step": 5429 }, { "epoch": 7.397820163487738, "grad_norm": 0.22400108730189314, "learning_rate": 3.3468233527845485e-06, "loss": 0.0009, "step": 5430 }, { "epoch": 7.399182561307902, "grad_norm": 0.45318382659525247, "learning_rate": 3.343529452872073e-06, "loss": 0.0008, "step": 5431 }, { "epoch": 7.400544959128065, "grad_norm": 0.3418193487015061, "learning_rate": 3.3402368492603245e-06, "loss": 0.0023, "step": 5432 }, { "epoch": 7.401907356948229, "grad_norm": 0.8409397746527243, "learning_rate": 3.3369455425905294e-06, "loss": 0.0024, "step": 5433 }, { "epoch": 7.4032697547683926, "grad_norm": 0.28855414736298934, "learning_rate": 3.3336555335036393e-06, "loss": 0.0008, "step": 5434 }, { "epoch": 7.4046321525885554, "grad_norm": 0.19646575613654205, "learning_rate": 3.330366822640356e-06, "loss": 0.0004, "step": 5435 }, { "epoch": 7.405994550408719, "grad_norm": 0.19110860795518836, "learning_rate": 3.3270794106411408e-06, "loss": 0.0004, "step": 5436 }, { "epoch": 7.407356948228883, "grad_norm": 0.2006147805839284, "learning_rate": 3.323793298146187e-06, "loss": 0.0007, "step": 5437 }, { "epoch": 7.408719346049046, "grad_norm": 0.9260523456441347, "learning_rate": 3.3205084857954452e-06, "loss": 0.0126, "step": 5438 }, { "epoch": 7.41008174386921, "grad_norm": 0.4165098850030267, "learning_rate": 3.3172249742286124e-06, "loss": 0.0082, "step": 5439 }, { "epoch": 7.4114441416893735, "grad_norm": 0.33291696445775165, "learning_rate": 3.313942764085122e-06, "loss": 0.0173, "step": 5440 }, { "epoch": 7.412806539509536, "grad_norm": 0.754583518871325, "learning_rate": 3.3106618560041704e-06, "loss": 0.001, "step": 5441 }, { "epoch": 7.4141689373297, "grad_norm": 1.1047228481364424, "learning_rate": 3.3073822506246822e-06, "loss": 0.0014, "step": 5442 }, { "epoch": 7.415531335149864, "grad_norm": 0.3039195614889438, "learning_rate": 3.304103948585341e-06, "loss": 0.0003, "step": 5443 }, { "epoch": 7.416893732970027, "grad_norm": 1.361704203488064, "learning_rate": 3.300826950524575e-06, "loss": 0.0074, "step": 5444 }, { "epoch": 7.418256130790191, "grad_norm": 0.7191878851647583, "learning_rate": 3.297551257080558e-06, "loss": 0.0023, "step": 5445 }, { "epoch": 7.419618528610354, "grad_norm": 0.25447738905228096, "learning_rate": 3.294276868891205e-06, "loss": 0.0066, "step": 5446 }, { "epoch": 7.420980926430517, "grad_norm": 0.39529951266208707, "learning_rate": 3.291003786594178e-06, "loss": 0.0079, "step": 5447 }, { "epoch": 7.422343324250681, "grad_norm": 1.2308473397054063, "learning_rate": 3.287732010826892e-06, "loss": 0.0099, "step": 5448 }, { "epoch": 7.423705722070845, "grad_norm": 2.7531883535280044, "learning_rate": 3.2844615422264925e-06, "loss": 0.0124, "step": 5449 }, { "epoch": 7.425068119891008, "grad_norm": 0.18932918898127785, "learning_rate": 3.281192381429894e-06, "loss": 0.0079, "step": 5450 }, { "epoch": 7.426430517711172, "grad_norm": 0.3169746163245884, "learning_rate": 3.277924529073735e-06, "loss": 0.0171, "step": 5451 }, { "epoch": 7.427792915531335, "grad_norm": 0.5777824994865913, "learning_rate": 3.274657985794405e-06, "loss": 0.0007, "step": 5452 }, { "epoch": 7.429155313351498, "grad_norm": 0.5106400285546141, "learning_rate": 3.2713927522280455e-06, "loss": 0.0007, "step": 5453 }, { "epoch": 7.430517711171662, "grad_norm": 0.6080553730931002, "learning_rate": 3.2681288290105315e-06, "loss": 0.008, "step": 5454 }, { "epoch": 7.431880108991826, "grad_norm": 0.6743315967976149, "learning_rate": 3.264866216777494e-06, "loss": 0.0076, "step": 5455 }, { "epoch": 7.433242506811989, "grad_norm": 1.5513935888722883, "learning_rate": 3.2616049161643005e-06, "loss": 0.0041, "step": 5456 }, { "epoch": 7.4346049046321525, "grad_norm": 0.6449362835451301, "learning_rate": 3.2583449278060707e-06, "loss": 0.0009, "step": 5457 }, { "epoch": 7.435967302452316, "grad_norm": 0.6551786532223234, "learning_rate": 3.255086252337664e-06, "loss": 0.0077, "step": 5458 }, { "epoch": 7.437329700272479, "grad_norm": 0.29576479432771097, "learning_rate": 3.2518288903936767e-06, "loss": 0.0005, "step": 5459 }, { "epoch": 7.438692098092643, "grad_norm": 3.5120901061615175, "learning_rate": 3.248572842608466e-06, "loss": 0.0055, "step": 5460 }, { "epoch": 7.440054495912807, "grad_norm": 0.6449363872012661, "learning_rate": 3.2453181096161157e-06, "loss": 0.0065, "step": 5461 }, { "epoch": 7.44141689373297, "grad_norm": 2.7268777589843523, "learning_rate": 3.2420646920504727e-06, "loss": 0.0046, "step": 5462 }, { "epoch": 7.4427792915531334, "grad_norm": 0.39811658614418877, "learning_rate": 3.2388125905451084e-06, "loss": 0.0077, "step": 5463 }, { "epoch": 7.444141689373297, "grad_norm": 0.878995451182881, "learning_rate": 3.2355618057333537e-06, "loss": 0.0025, "step": 5464 }, { "epoch": 7.44550408719346, "grad_norm": 2.4453999242191893, "learning_rate": 3.2323123382482713e-06, "loss": 0.0181, "step": 5465 }, { "epoch": 7.446866485013624, "grad_norm": 0.6805110075281885, "learning_rate": 3.2290641887226705e-06, "loss": 0.0007, "step": 5466 }, { "epoch": 7.448228882833788, "grad_norm": 1.7483479671901303, "learning_rate": 3.2258173577891085e-06, "loss": 0.0228, "step": 5467 }, { "epoch": 7.449591280653951, "grad_norm": 1.7304663547126362, "learning_rate": 3.2225718460798816e-06, "loss": 0.0138, "step": 5468 }, { "epoch": 7.450953678474114, "grad_norm": 1.2309664704933585, "learning_rate": 3.2193276542270345e-06, "loss": 0.0207, "step": 5469 }, { "epoch": 7.452316076294278, "grad_norm": 2.573936527945757, "learning_rate": 3.2160847828623465e-06, "loss": 0.0097, "step": 5470 }, { "epoch": 7.453678474114441, "grad_norm": 0.7972707812409947, "learning_rate": 3.212843232617343e-06, "loss": 0.0087, "step": 5471 }, { "epoch": 7.455040871934605, "grad_norm": 0.8219219924249875, "learning_rate": 3.209603004123294e-06, "loss": 0.0045, "step": 5472 }, { "epoch": 7.456403269754769, "grad_norm": 0.7892073375811979, "learning_rate": 3.206364098011212e-06, "loss": 0.0147, "step": 5473 }, { "epoch": 7.4577656675749315, "grad_norm": 0.2871436830278234, "learning_rate": 3.203126514911854e-06, "loss": 0.0007, "step": 5474 }, { "epoch": 7.459128065395095, "grad_norm": 2.1406827876837955, "learning_rate": 3.19989025545571e-06, "loss": 0.0053, "step": 5475 }, { "epoch": 7.460490463215259, "grad_norm": 1.2947709391650046, "learning_rate": 3.1966553202730266e-06, "loss": 0.025, "step": 5476 }, { "epoch": 7.461852861035422, "grad_norm": 0.6547618283706986, "learning_rate": 3.1934217099937794e-06, "loss": 0.0077, "step": 5477 }, { "epoch": 7.463215258855586, "grad_norm": 1.9984410046539838, "learning_rate": 3.190189425247686e-06, "loss": 0.0094, "step": 5478 }, { "epoch": 7.46457765667575, "grad_norm": 0.7269135376481152, "learning_rate": 3.186958466664224e-06, "loss": 0.0072, "step": 5479 }, { "epoch": 7.4659400544959125, "grad_norm": 0.48835575324652813, "learning_rate": 3.1837288348725904e-06, "loss": 0.0085, "step": 5480 }, { "epoch": 7.467302452316076, "grad_norm": 0.46650284631092476, "learning_rate": 3.1805005305017377e-06, "loss": 0.007, "step": 5481 }, { "epoch": 7.46866485013624, "grad_norm": 0.6091660034565037, "learning_rate": 3.1772735541803545e-06, "loss": 0.0012, "step": 5482 }, { "epoch": 7.470027247956403, "grad_norm": 0.6688707128508187, "learning_rate": 3.174047906536867e-06, "loss": 0.0007, "step": 5483 }, { "epoch": 7.471389645776567, "grad_norm": 0.3483052428350522, "learning_rate": 3.170823588199452e-06, "loss": 0.0064, "step": 5484 }, { "epoch": 7.4727520435967305, "grad_norm": 1.820360600874361, "learning_rate": 3.1676005997960214e-06, "loss": 0.0051, "step": 5485 }, { "epoch": 7.474114441416893, "grad_norm": 1.9970564144692968, "learning_rate": 3.1643789419542325e-06, "loss": 0.007, "step": 5486 }, { "epoch": 7.475476839237057, "grad_norm": 0.4335052842286486, "learning_rate": 3.161158615301476e-06, "loss": 0.0059, "step": 5487 }, { "epoch": 7.476839237057221, "grad_norm": 2.4523499075219015, "learning_rate": 3.1579396204648928e-06, "loss": 0.0118, "step": 5488 }, { "epoch": 7.478201634877384, "grad_norm": 1.107985503158266, "learning_rate": 3.154721958071356e-06, "loss": 0.0014, "step": 5489 }, { "epoch": 7.479564032697548, "grad_norm": 1.5516180568563145, "learning_rate": 3.1515056287474764e-06, "loss": 0.0029, "step": 5490 }, { "epoch": 7.4809264305177114, "grad_norm": 0.32419445595589885, "learning_rate": 3.1482906331196263e-06, "loss": 0.0005, "step": 5491 }, { "epoch": 7.482288828337874, "grad_norm": 1.3864052265720246, "learning_rate": 3.145076971813891e-06, "loss": 0.0025, "step": 5492 }, { "epoch": 7.483651226158038, "grad_norm": 1.8820328375892605, "learning_rate": 3.1418646454561153e-06, "loss": 0.0285, "step": 5493 }, { "epoch": 7.485013623978202, "grad_norm": 2.5893395357399234, "learning_rate": 3.138653654671875e-06, "loss": 0.0085, "step": 5494 }, { "epoch": 7.486376021798365, "grad_norm": 0.5537482261526989, "learning_rate": 3.135444000086485e-06, "loss": 0.0003, "step": 5495 }, { "epoch": 7.487738419618529, "grad_norm": 0.6347867330202326, "learning_rate": 3.1322356823250043e-06, "loss": 0.0247, "step": 5496 }, { "epoch": 7.489100817438692, "grad_norm": 2.8902482396196194, "learning_rate": 3.129028702012232e-06, "loss": 0.0097, "step": 5497 }, { "epoch": 7.490463215258855, "grad_norm": 0.6166208806497786, "learning_rate": 3.125823059772708e-06, "loss": 0.0006, "step": 5498 }, { "epoch": 7.491825613079019, "grad_norm": 0.8995698021944295, "learning_rate": 3.1226187562307007e-06, "loss": 0.0008, "step": 5499 }, { "epoch": 7.493188010899183, "grad_norm": 1.166195742363286, "learning_rate": 3.119415792010233e-06, "loss": 0.0148, "step": 5500 }, { "epoch": 7.494550408719346, "grad_norm": 1.5979736717883783, "learning_rate": 3.1162141677350532e-06, "loss": 0.0131, "step": 5501 }, { "epoch": 7.4959128065395095, "grad_norm": 1.3779904298764802, "learning_rate": 3.113013884028658e-06, "loss": 0.01, "step": 5502 }, { "epoch": 7.497275204359673, "grad_norm": 0.8632020424517498, "learning_rate": 3.1098149415142843e-06, "loss": 0.0005, "step": 5503 }, { "epoch": 7.498637602179836, "grad_norm": 1.1289160977627353, "learning_rate": 3.106617340814896e-06, "loss": 0.0019, "step": 5504 }, { "epoch": 7.5, "grad_norm": 0.7727333502011853, "learning_rate": 3.1034210825532098e-06, "loss": 0.0047, "step": 5505 }, { "epoch": 7.501362397820164, "grad_norm": 1.6368424458780435, "learning_rate": 3.100226167351672e-06, "loss": 0.0045, "step": 5506 }, { "epoch": 7.502724795640327, "grad_norm": 0.8268821302240295, "learning_rate": 3.097032595832462e-06, "loss": 0.0061, "step": 5507 }, { "epoch": 7.5040871934604905, "grad_norm": 0.8349872476907623, "learning_rate": 3.09384036861752e-06, "loss": 0.0091, "step": 5508 }, { "epoch": 7.505449591280654, "grad_norm": 1.3260568080980675, "learning_rate": 3.0906494863284987e-06, "loss": 0.0039, "step": 5509 }, { "epoch": 7.506811989100817, "grad_norm": 0.6900062775398421, "learning_rate": 3.0874599495868074e-06, "loss": 0.0189, "step": 5510 }, { "epoch": 7.508174386920981, "grad_norm": 1.1131012804964777, "learning_rate": 3.0842717590135796e-06, "loss": 0.0009, "step": 5511 }, { "epoch": 7.509536784741145, "grad_norm": 1.335381133923222, "learning_rate": 3.081084915229697e-06, "loss": 0.0038, "step": 5512 }, { "epoch": 7.510899182561308, "grad_norm": 1.1923465304671828, "learning_rate": 3.0778994188557722e-06, "loss": 0.0121, "step": 5513 }, { "epoch": 7.512261580381471, "grad_norm": 1.1382248797359162, "learning_rate": 3.0747152705121586e-06, "loss": 0.0139, "step": 5514 }, { "epoch": 7.513623978201635, "grad_norm": 1.5289800227320076, "learning_rate": 3.071532470818951e-06, "loss": 0.0072, "step": 5515 }, { "epoch": 7.514986376021798, "grad_norm": 1.4206469868402098, "learning_rate": 3.068351020395971e-06, "loss": 0.001, "step": 5516 }, { "epoch": 7.516348773841962, "grad_norm": 1.1779961495072062, "learning_rate": 3.065170919862789e-06, "loss": 0.0012, "step": 5517 }, { "epoch": 7.517711171662126, "grad_norm": 0.42596021106833915, "learning_rate": 3.061992169838701e-06, "loss": 0.0009, "step": 5518 }, { "epoch": 7.5190735694822886, "grad_norm": 1.377014025147287, "learning_rate": 3.058814770942751e-06, "loss": 0.0021, "step": 5519 }, { "epoch": 7.520435967302452, "grad_norm": 1.0941687369676516, "learning_rate": 3.055638723793717e-06, "loss": 0.0005, "step": 5520 }, { "epoch": 7.521798365122616, "grad_norm": 1.1590058005590023, "learning_rate": 3.0524640290101037e-06, "loss": 0.0119, "step": 5521 }, { "epoch": 7.523160762942779, "grad_norm": 0.6518500204534886, "learning_rate": 3.04929068721017e-06, "loss": 0.0008, "step": 5522 }, { "epoch": 7.524523160762943, "grad_norm": 1.2600435820880325, "learning_rate": 3.046118699011894e-06, "loss": 0.0048, "step": 5523 }, { "epoch": 7.525885558583107, "grad_norm": 1.144476230410641, "learning_rate": 3.0429480650330045e-06, "loss": 0.0007, "step": 5524 }, { "epoch": 7.5272479564032695, "grad_norm": 0.6008219263134378, "learning_rate": 3.0397787858909543e-06, "loss": 0.0009, "step": 5525 }, { "epoch": 7.528610354223433, "grad_norm": 0.8161365899133474, "learning_rate": 3.036610862202941e-06, "loss": 0.0058, "step": 5526 }, { "epoch": 7.529972752043597, "grad_norm": 2.615062544870619, "learning_rate": 3.0334442945858978e-06, "loss": 0.0046, "step": 5527 }, { "epoch": 7.53133514986376, "grad_norm": 2.0696454772223944, "learning_rate": 3.0302790836564853e-06, "loss": 0.0036, "step": 5528 }, { "epoch": 7.532697547683924, "grad_norm": 0.5717337075705394, "learning_rate": 3.0271152300311134e-06, "loss": 0.0017, "step": 5529 }, { "epoch": 7.5340599455040875, "grad_norm": 0.6507681341612988, "learning_rate": 3.0239527343259146e-06, "loss": 0.0156, "step": 5530 }, { "epoch": 7.53542234332425, "grad_norm": 1.7948588400945704, "learning_rate": 3.0207915971567624e-06, "loss": 0.012, "step": 5531 }, { "epoch": 7.536784741144414, "grad_norm": 2.7715804163339137, "learning_rate": 3.017631819139273e-06, "loss": 0.0062, "step": 5532 }, { "epoch": 7.538147138964578, "grad_norm": 1.7822893374912443, "learning_rate": 3.014473400888782e-06, "loss": 0.0251, "step": 5533 }, { "epoch": 7.539509536784741, "grad_norm": 2.0382188878073144, "learning_rate": 3.0113163430203775e-06, "loss": 0.0237, "step": 5534 }, { "epoch": 7.540871934604905, "grad_norm": 1.0617853901377372, "learning_rate": 3.0081606461488656e-06, "loss": 0.0006, "step": 5535 }, { "epoch": 7.5422343324250685, "grad_norm": 1.3745202876558404, "learning_rate": 3.0050063108888004e-06, "loss": 0.0009, "step": 5536 }, { "epoch": 7.543596730245231, "grad_norm": 0.7003438593641758, "learning_rate": 3.001853337854471e-06, "loss": 0.0008, "step": 5537 }, { "epoch": 7.544959128065395, "grad_norm": 0.3067456771055492, "learning_rate": 2.998701727659887e-06, "loss": 0.0006, "step": 5538 }, { "epoch": 7.546321525885559, "grad_norm": 1.2377438343523288, "learning_rate": 2.9955514809188126e-06, "loss": 0.0032, "step": 5539 }, { "epoch": 7.547683923705722, "grad_norm": 1.1080599529681925, "learning_rate": 2.9924025982447267e-06, "loss": 0.001, "step": 5540 }, { "epoch": 7.549046321525886, "grad_norm": 1.0267498865289963, "learning_rate": 2.9892550802508603e-06, "loss": 0.0024, "step": 5541 }, { "epoch": 7.550408719346049, "grad_norm": 0.272560176321803, "learning_rate": 2.986108927550162e-06, "loss": 0.0079, "step": 5542 }, { "epoch": 7.551771117166212, "grad_norm": 0.9993424533076128, "learning_rate": 2.9829641407553277e-06, "loss": 0.0089, "step": 5543 }, { "epoch": 7.553133514986376, "grad_norm": 0.24015038491175178, "learning_rate": 2.9798207204787854e-06, "loss": 0.0007, "step": 5544 }, { "epoch": 7.55449591280654, "grad_norm": 0.44542063667586484, "learning_rate": 2.9766786673326875e-06, "loss": 0.0043, "step": 5545 }, { "epoch": 7.555858310626703, "grad_norm": 0.3078303765712081, "learning_rate": 2.973537981928932e-06, "loss": 0.0009, "step": 5546 }, { "epoch": 7.5572207084468666, "grad_norm": 1.8263985976345267, "learning_rate": 2.9703986648791416e-06, "loss": 0.0154, "step": 5547 }, { "epoch": 7.55858310626703, "grad_norm": 0.8254398713321451, "learning_rate": 2.9672607167946767e-06, "loss": 0.0149, "step": 5548 }, { "epoch": 7.559945504087193, "grad_norm": 0.3550199680379405, "learning_rate": 2.964124138286635e-06, "loss": 0.008, "step": 5549 }, { "epoch": 7.561307901907357, "grad_norm": 0.34190126146792343, "learning_rate": 2.9609889299658357e-06, "loss": 0.007, "step": 5550 }, { "epoch": 7.562670299727521, "grad_norm": 1.1144102110075906, "learning_rate": 2.957855092442846e-06, "loss": 0.0025, "step": 5551 }, { "epoch": 7.564032697547684, "grad_norm": 0.44096389779892087, "learning_rate": 2.9547226263279525e-06, "loss": 0.008, "step": 5552 }, { "epoch": 7.5653950953678475, "grad_norm": 0.8640142972553685, "learning_rate": 2.9515915322311862e-06, "loss": 0.0266, "step": 5553 }, { "epoch": 7.566757493188011, "grad_norm": 1.1194966856360309, "learning_rate": 2.9484618107622996e-06, "loss": 0.003, "step": 5554 }, { "epoch": 7.568119891008174, "grad_norm": 0.47267145695795026, "learning_rate": 2.945333462530788e-06, "loss": 0.0006, "step": 5555 }, { "epoch": 7.569482288828338, "grad_norm": 0.7049666715996984, "learning_rate": 2.942206488145878e-06, "loss": 0.0015, "step": 5556 }, { "epoch": 7.570844686648502, "grad_norm": 1.5333155205322282, "learning_rate": 2.939080888216518e-06, "loss": 0.0131, "step": 5557 }, { "epoch": 7.572207084468665, "grad_norm": 1.848607970359653, "learning_rate": 2.9359566633514034e-06, "loss": 0.0106, "step": 5558 }, { "epoch": 7.573569482288828, "grad_norm": 0.4965421703733641, "learning_rate": 2.9328338141589506e-06, "loss": 0.0006, "step": 5559 }, { "epoch": 7.574931880108992, "grad_norm": 0.8672033832838614, "learning_rate": 2.929712341247314e-06, "loss": 0.0094, "step": 5560 }, { "epoch": 7.576294277929155, "grad_norm": 1.200506509338465, "learning_rate": 2.926592245224381e-06, "loss": 0.0012, "step": 5561 }, { "epoch": 7.577656675749319, "grad_norm": 1.0282574139249672, "learning_rate": 2.9234735266977632e-06, "loss": 0.0017, "step": 5562 }, { "epoch": 7.579019073569482, "grad_norm": 0.31623274714261357, "learning_rate": 2.9203561862748153e-06, "loss": 0.0005, "step": 5563 }, { "epoch": 7.580381471389646, "grad_norm": 1.2196818808631273, "learning_rate": 2.9172402245626107e-06, "loss": 0.0032, "step": 5564 }, { "epoch": 7.581743869209809, "grad_norm": 0.5299784932886384, "learning_rate": 2.9141256421679643e-06, "loss": 0.0074, "step": 5565 }, { "epoch": 7.583106267029972, "grad_norm": 0.78896841540279, "learning_rate": 2.9110124396974216e-06, "loss": 0.0068, "step": 5566 }, { "epoch": 7.584468664850136, "grad_norm": 0.5824348142211966, "learning_rate": 2.9079006177572523e-06, "loss": 0.0076, "step": 5567 }, { "epoch": 7.5858310626703, "grad_norm": 2.2841808535370154, "learning_rate": 2.904790176953466e-06, "loss": 0.0274, "step": 5568 }, { "epoch": 7.587193460490463, "grad_norm": 1.8977050878326371, "learning_rate": 2.9016811178917937e-06, "loss": 0.0042, "step": 5569 }, { "epoch": 7.5885558583106265, "grad_norm": 1.344833881928879, "learning_rate": 2.89857344117771e-06, "loss": 0.001, "step": 5570 }, { "epoch": 7.58991825613079, "grad_norm": 3.9579289131309547, "learning_rate": 2.895467147416405e-06, "loss": 0.0164, "step": 5571 }, { "epoch": 7.591280653950953, "grad_norm": 0.6619106930256948, "learning_rate": 2.8923622372128113e-06, "loss": 0.0009, "step": 5572 }, { "epoch": 7.592643051771117, "grad_norm": 0.9210073490275086, "learning_rate": 2.8892587111715918e-06, "loss": 0.0007, "step": 5573 }, { "epoch": 7.594005449591281, "grad_norm": 0.6635475040241995, "learning_rate": 2.8861565698971306e-06, "loss": 0.0025, "step": 5574 }, { "epoch": 7.595367847411444, "grad_norm": 1.793258233126103, "learning_rate": 2.883055813993553e-06, "loss": 0.0265, "step": 5575 }, { "epoch": 7.5967302452316074, "grad_norm": 0.5615248777570794, "learning_rate": 2.879956444064703e-06, "loss": 0.0024, "step": 5576 }, { "epoch": 7.598092643051771, "grad_norm": 0.9132358233531103, "learning_rate": 2.8768584607141657e-06, "loss": 0.008, "step": 5577 }, { "epoch": 7.599455040871934, "grad_norm": 0.7816090251245281, "learning_rate": 2.8737618645452525e-06, "loss": 0.0007, "step": 5578 }, { "epoch": 7.600817438692098, "grad_norm": 1.0477533158811951, "learning_rate": 2.870666656161e-06, "loss": 0.0071, "step": 5579 }, { "epoch": 7.602179836512262, "grad_norm": 0.44648910939001124, "learning_rate": 2.867572836164182e-06, "loss": 0.0028, "step": 5580 }, { "epoch": 7.603542234332425, "grad_norm": 3.3952488097663243, "learning_rate": 2.8644804051572926e-06, "loss": 0.0146, "step": 5581 }, { "epoch": 7.604904632152588, "grad_norm": 0.24664911349945765, "learning_rate": 2.861389363742565e-06, "loss": 0.0009, "step": 5582 }, { "epoch": 7.606267029972752, "grad_norm": 0.294434122832005, "learning_rate": 2.8582997125219604e-06, "loss": 0.0005, "step": 5583 }, { "epoch": 7.607629427792915, "grad_norm": 0.5892958089087054, "learning_rate": 2.8552114520971596e-06, "loss": 0.0004, "step": 5584 }, { "epoch": 7.608991825613079, "grad_norm": 0.4300303395291871, "learning_rate": 2.8521245830695864e-06, "loss": 0.0082, "step": 5585 }, { "epoch": 7.610354223433243, "grad_norm": 0.6233348691215771, "learning_rate": 2.849039106040381e-06, "loss": 0.0005, "step": 5586 }, { "epoch": 7.6117166212534055, "grad_norm": 0.7130025945001158, "learning_rate": 2.8459550216104246e-06, "loss": 0.0045, "step": 5587 }, { "epoch": 7.613079019073569, "grad_norm": 0.3200461913730619, "learning_rate": 2.842872330380314e-06, "loss": 0.0006, "step": 5588 }, { "epoch": 7.614441416893733, "grad_norm": 0.7346429170330365, "learning_rate": 2.839791032950384e-06, "loss": 0.0009, "step": 5589 }, { "epoch": 7.615803814713896, "grad_norm": 0.36536810405474834, "learning_rate": 2.8367111299207006e-06, "loss": 0.006, "step": 5590 }, { "epoch": 7.61716621253406, "grad_norm": 1.2544281681290192, "learning_rate": 2.833632621891046e-06, "loss": 0.0133, "step": 5591 }, { "epoch": 7.618528610354224, "grad_norm": 0.7775458231210168, "learning_rate": 2.830555509460944e-06, "loss": 0.0062, "step": 5592 }, { "epoch": 7.6198910081743865, "grad_norm": 0.8634925743966498, "learning_rate": 2.827479793229634e-06, "loss": 0.0138, "step": 5593 }, { "epoch": 7.62125340599455, "grad_norm": 1.0895336412053727, "learning_rate": 2.8244054737960935e-06, "loss": 0.0007, "step": 5594 }, { "epoch": 7.622615803814714, "grad_norm": 1.6230567801433764, "learning_rate": 2.8213325517590284e-06, "loss": 0.0123, "step": 5595 }, { "epoch": 7.623978201634877, "grad_norm": 1.5622371130044836, "learning_rate": 2.818261027716861e-06, "loss": 0.0147, "step": 5596 }, { "epoch": 7.625340599455041, "grad_norm": 0.3115509785936286, "learning_rate": 2.815190902267757e-06, "loss": 0.0006, "step": 5597 }, { "epoch": 7.6267029972752045, "grad_norm": 1.6785382016400558, "learning_rate": 2.812122176009594e-06, "loss": 0.0154, "step": 5598 }, { "epoch": 7.628065395095367, "grad_norm": 1.1059873832611815, "learning_rate": 2.8090548495399915e-06, "loss": 0.0012, "step": 5599 }, { "epoch": 7.629427792915531, "grad_norm": 1.011997388960627, "learning_rate": 2.805988923456283e-06, "loss": 0.0081, "step": 5600 }, { "epoch": 7.630790190735695, "grad_norm": 1.7452927874693842, "learning_rate": 2.802924398355541e-06, "loss": 0.0125, "step": 5601 }, { "epoch": 7.632152588555858, "grad_norm": 1.859623309602037, "learning_rate": 2.7998612748345612e-06, "loss": 0.0108, "step": 5602 }, { "epoch": 7.633514986376022, "grad_norm": 2.4495155566125133, "learning_rate": 2.7967995534898595e-06, "loss": 0.0025, "step": 5603 }, { "epoch": 7.6348773841961854, "grad_norm": 2.55719919438142, "learning_rate": 2.793739234917692e-06, "loss": 0.0099, "step": 5604 }, { "epoch": 7.636239782016348, "grad_norm": 1.2408298885280484, "learning_rate": 2.790680319714025e-06, "loss": 0.0119, "step": 5605 }, { "epoch": 7.637602179836512, "grad_norm": 0.4350140486107405, "learning_rate": 2.787622808474567e-06, "loss": 0.0022, "step": 5606 }, { "epoch": 7.638964577656676, "grad_norm": 0.3260791682697723, "learning_rate": 2.7845667017947496e-06, "loss": 0.0007, "step": 5607 }, { "epoch": 7.640326975476839, "grad_norm": 3.2448663378645906, "learning_rate": 2.7815120002697206e-06, "loss": 0.0112, "step": 5608 }, { "epoch": 7.641689373297003, "grad_norm": 0.3154284371664449, "learning_rate": 2.778458704494367e-06, "loss": 0.0013, "step": 5609 }, { "epoch": 7.643051771117166, "grad_norm": 0.7017479781864219, "learning_rate": 2.775406815063292e-06, "loss": 0.0023, "step": 5610 }, { "epoch": 7.644414168937329, "grad_norm": 0.5525284902123566, "learning_rate": 2.7723563325708324e-06, "loss": 0.0018, "step": 5611 }, { "epoch": 7.645776566757493, "grad_norm": 0.2934543270924048, "learning_rate": 2.7693072576110515e-06, "loss": 0.0005, "step": 5612 }, { "epoch": 7.647138964577657, "grad_norm": 0.9639732539809857, "learning_rate": 2.7662595907777277e-06, "loss": 0.0099, "step": 5613 }, { "epoch": 7.64850136239782, "grad_norm": 0.7125061805110834, "learning_rate": 2.7632133326643806e-06, "loss": 0.0045, "step": 5614 }, { "epoch": 7.6498637602179835, "grad_norm": 2.071357145267301, "learning_rate": 2.760168483864241e-06, "loss": 0.0251, "step": 5615 }, { "epoch": 7.651226158038147, "grad_norm": 0.8550461352133025, "learning_rate": 2.757125044970276e-06, "loss": 0.0093, "step": 5616 }, { "epoch": 7.65258855585831, "grad_norm": 1.3495975247339105, "learning_rate": 2.7540830165751665e-06, "loss": 0.0099, "step": 5617 }, { "epoch": 7.653950953678474, "grad_norm": 1.314917937679053, "learning_rate": 2.7510423992713374e-06, "loss": 0.0009, "step": 5618 }, { "epoch": 7.655313351498638, "grad_norm": 0.5017567567830251, "learning_rate": 2.748003193650922e-06, "loss": 0.0006, "step": 5619 }, { "epoch": 7.656675749318801, "grad_norm": 0.4718757138807863, "learning_rate": 2.7449654003057813e-06, "loss": 0.001, "step": 5620 }, { "epoch": 7.6580381471389645, "grad_norm": 1.5318687331675254, "learning_rate": 2.7419290198275096e-06, "loss": 0.0059, "step": 5621 }, { "epoch": 7.659400544959128, "grad_norm": 1.7102696461451325, "learning_rate": 2.738894052807415e-06, "loss": 0.0027, "step": 5622 }, { "epoch": 7.660762942779291, "grad_norm": 0.8761674629567987, "learning_rate": 2.735860499836539e-06, "loss": 0.0067, "step": 5623 }, { "epoch": 7.662125340599455, "grad_norm": 0.6993472184269285, "learning_rate": 2.732828361505647e-06, "loss": 0.0009, "step": 5624 }, { "epoch": 7.663487738419619, "grad_norm": 1.6396777380222705, "learning_rate": 2.7297976384052216e-06, "loss": 0.0039, "step": 5625 }, { "epoch": 7.664850136239782, "grad_norm": 0.9594660645503924, "learning_rate": 2.726768331125479e-06, "loss": 0.0095, "step": 5626 }, { "epoch": 7.666212534059945, "grad_norm": 0.2482660353052259, "learning_rate": 2.7237404402563516e-06, "loss": 0.0032, "step": 5627 }, { "epoch": 7.667574931880109, "grad_norm": 0.5862359694910438, "learning_rate": 2.720713966387505e-06, "loss": 0.0059, "step": 5628 }, { "epoch": 7.668937329700272, "grad_norm": 1.0191772774138639, "learning_rate": 2.7176889101083135e-06, "loss": 0.0017, "step": 5629 }, { "epoch": 7.670299727520436, "grad_norm": 0.986555587862826, "learning_rate": 2.7146652720079e-06, "loss": 0.0146, "step": 5630 }, { "epoch": 7.6716621253406, "grad_norm": 1.14212267094088, "learning_rate": 2.711643052675088e-06, "loss": 0.0224, "step": 5631 }, { "epoch": 7.6730245231607626, "grad_norm": 0.39837851325288914, "learning_rate": 2.708622252698433e-06, "loss": 0.0153, "step": 5632 }, { "epoch": 7.674386920980926, "grad_norm": 1.5414120263863422, "learning_rate": 2.7056028726662175e-06, "loss": 0.0055, "step": 5633 }, { "epoch": 7.67574931880109, "grad_norm": 2.1820977126022107, "learning_rate": 2.7025849131664415e-06, "loss": 0.0111, "step": 5634 }, { "epoch": 7.677111716621253, "grad_norm": 1.0882107080558148, "learning_rate": 2.699568374786833e-06, "loss": 0.0018, "step": 5635 }, { "epoch": 7.678474114441417, "grad_norm": 0.47648491254042946, "learning_rate": 2.69655325811484e-06, "loss": 0.001, "step": 5636 }, { "epoch": 7.679836512261581, "grad_norm": 1.1559427812275163, "learning_rate": 2.6935395637376418e-06, "loss": 0.0007, "step": 5637 }, { "epoch": 7.6811989100817435, "grad_norm": 2.037545429812871, "learning_rate": 2.6905272922421276e-06, "loss": 0.0131, "step": 5638 }, { "epoch": 7.682561307901907, "grad_norm": 1.133504244931823, "learning_rate": 2.6875164442149147e-06, "loss": 0.0013, "step": 5639 }, { "epoch": 7.683923705722071, "grad_norm": 2.739589134411757, "learning_rate": 2.6845070202423483e-06, "loss": 0.0026, "step": 5640 }, { "epoch": 7.685286103542234, "grad_norm": 0.8855253160271196, "learning_rate": 2.6814990209104917e-06, "loss": 0.0006, "step": 5641 }, { "epoch": 7.686648501362398, "grad_norm": 0.8778443437084312, "learning_rate": 2.6784924468051342e-06, "loss": 0.001, "step": 5642 }, { "epoch": 7.6880108991825615, "grad_norm": 0.26941558067941745, "learning_rate": 2.675487298511782e-06, "loss": 0.0078, "step": 5643 }, { "epoch": 7.689373297002724, "grad_norm": 1.4323345235835772, "learning_rate": 2.672483576615664e-06, "loss": 0.007, "step": 5644 }, { "epoch": 7.690735694822888, "grad_norm": 0.6538834095706451, "learning_rate": 2.669481281701739e-06, "loss": 0.0095, "step": 5645 }, { "epoch": 7.692098092643052, "grad_norm": 0.19258603601481591, "learning_rate": 2.6664804143546753e-06, "loss": 0.0004, "step": 5646 }, { "epoch": 7.693460490463215, "grad_norm": 1.3728178840350023, "learning_rate": 2.663480975158882e-06, "loss": 0.0041, "step": 5647 }, { "epoch": 7.694822888283379, "grad_norm": 0.13533294277966965, "learning_rate": 2.6604829646984687e-06, "loss": 0.0079, "step": 5648 }, { "epoch": 7.6961852861035425, "grad_norm": 0.4942348742205398, "learning_rate": 2.6574863835572852e-06, "loss": 0.0008, "step": 5649 }, { "epoch": 7.697547683923705, "grad_norm": 0.8444995586259407, "learning_rate": 2.6544912323188885e-06, "loss": 0.0132, "step": 5650 }, { "epoch": 7.698910081743869, "grad_norm": 0.19847937825451767, "learning_rate": 2.651497511566562e-06, "loss": 0.0004, "step": 5651 }, { "epoch": 7.700272479564033, "grad_norm": 1.9693826562901295, "learning_rate": 2.648505221883315e-06, "loss": 0.0113, "step": 5652 }, { "epoch": 7.701634877384196, "grad_norm": 0.4301305495028917, "learning_rate": 2.645514363851874e-06, "loss": 0.0008, "step": 5653 }, { "epoch": 7.70299727520436, "grad_norm": 0.6893176089598501, "learning_rate": 2.6425249380546914e-06, "loss": 0.0014, "step": 5654 }, { "epoch": 7.704359673024523, "grad_norm": 1.3446597032102345, "learning_rate": 2.639536945073933e-06, "loss": 0.011, "step": 5655 }, { "epoch": 7.705722070844686, "grad_norm": 1.0130187493226173, "learning_rate": 2.636550385491485e-06, "loss": 0.008, "step": 5656 }, { "epoch": 7.70708446866485, "grad_norm": 0.7197037596524861, "learning_rate": 2.6335652598889682e-06, "loss": 0.004, "step": 5657 }, { "epoch": 7.708446866485014, "grad_norm": 0.48651823624175483, "learning_rate": 2.630581568847704e-06, "loss": 0.001, "step": 5658 }, { "epoch": 7.709809264305177, "grad_norm": 0.19987933603064592, "learning_rate": 2.6275993129487566e-06, "loss": 0.0005, "step": 5659 }, { "epoch": 7.7111716621253406, "grad_norm": 0.5761879314437263, "learning_rate": 2.6246184927728913e-06, "loss": 0.0077, "step": 5660 }, { "epoch": 7.712534059945504, "grad_norm": 0.23579365614098433, "learning_rate": 2.621639108900608e-06, "loss": 0.0076, "step": 5661 }, { "epoch": 7.713896457765667, "grad_norm": 1.7717642781746332, "learning_rate": 2.618661161912116e-06, "loss": 0.0103, "step": 5662 }, { "epoch": 7.715258855585831, "grad_norm": 0.34951334949099905, "learning_rate": 2.615684652387348e-06, "loss": 0.0011, "step": 5663 }, { "epoch": 7.716621253405995, "grad_norm": 1.391647672716631, "learning_rate": 2.612709580905961e-06, "loss": 0.0224, "step": 5664 }, { "epoch": 7.717983651226158, "grad_norm": 1.3205535237619033, "learning_rate": 2.6097359480473294e-06, "loss": 0.0031, "step": 5665 }, { "epoch": 7.7193460490463215, "grad_norm": 1.1970523672299282, "learning_rate": 2.60676375439055e-06, "loss": 0.001, "step": 5666 }, { "epoch": 7.720708446866485, "grad_norm": 1.7012766926263903, "learning_rate": 2.603793000514434e-06, "loss": 0.0043, "step": 5667 }, { "epoch": 7.722070844686648, "grad_norm": 1.7287507832249887, "learning_rate": 2.60082368699751e-06, "loss": 0.0175, "step": 5668 }, { "epoch": 7.723433242506812, "grad_norm": 1.605361534623734, "learning_rate": 2.5978558144180368e-06, "loss": 0.0077, "step": 5669 }, { "epoch": 7.724795640326976, "grad_norm": 1.8453236354743405, "learning_rate": 2.5948893833539844e-06, "loss": 0.0037, "step": 5670 }, { "epoch": 7.726158038147139, "grad_norm": 1.0846113217386677, "learning_rate": 2.5919243943830472e-06, "loss": 0.0063, "step": 5671 }, { "epoch": 7.727520435967302, "grad_norm": 1.6974866926274472, "learning_rate": 2.5889608480826323e-06, "loss": 0.0118, "step": 5672 }, { "epoch": 7.728882833787466, "grad_norm": 2.3560898742472642, "learning_rate": 2.585998745029873e-06, "loss": 0.004, "step": 5673 }, { "epoch": 7.730245231607629, "grad_norm": 3.714601550184873, "learning_rate": 2.583038085801617e-06, "loss": 0.0178, "step": 5674 }, { "epoch": 7.731607629427793, "grad_norm": 1.6238905142470839, "learning_rate": 2.5800788709744228e-06, "loss": 0.0059, "step": 5675 }, { "epoch": 7.732970027247957, "grad_norm": 1.0323177117757778, "learning_rate": 2.5771211011245923e-06, "loss": 0.0134, "step": 5676 }, { "epoch": 7.73433242506812, "grad_norm": 1.483074136887167, "learning_rate": 2.5741647768281187e-06, "loss": 0.0114, "step": 5677 }, { "epoch": 7.735694822888283, "grad_norm": 1.3960956072245316, "learning_rate": 2.5712098986607325e-06, "loss": 0.0076, "step": 5678 }, { "epoch": 7.737057220708447, "grad_norm": 0.5159540074469247, "learning_rate": 2.5682564671978726e-06, "loss": 0.0015, "step": 5679 }, { "epoch": 7.73841961852861, "grad_norm": 0.6089649606067723, "learning_rate": 2.5653044830146944e-06, "loss": 0.0008, "step": 5680 }, { "epoch": 7.739782016348774, "grad_norm": 0.9181808697158338, "learning_rate": 2.5623539466860813e-06, "loss": 0.0013, "step": 5681 }, { "epoch": 7.741144414168938, "grad_norm": 1.0301162613897565, "learning_rate": 2.5594048587866273e-06, "loss": 0.0005, "step": 5682 }, { "epoch": 7.7425068119891005, "grad_norm": 0.42670472421692646, "learning_rate": 2.5564572198906513e-06, "loss": 0.0004, "step": 5683 }, { "epoch": 7.743869209809264, "grad_norm": 1.871893036286953, "learning_rate": 2.5535110305721777e-06, "loss": 0.01, "step": 5684 }, { "epoch": 7.745231607629428, "grad_norm": 0.7218772709817596, "learning_rate": 2.5505662914049635e-06, "loss": 0.0005, "step": 5685 }, { "epoch": 7.746594005449591, "grad_norm": 2.393965045669533, "learning_rate": 2.5476230029624714e-06, "loss": 0.0046, "step": 5686 }, { "epoch": 7.747956403269755, "grad_norm": 0.3429427822752358, "learning_rate": 2.5446811658178816e-06, "loss": 0.0147, "step": 5687 }, { "epoch": 7.7493188010899186, "grad_norm": 0.8963706065539491, "learning_rate": 2.5417407805441076e-06, "loss": 0.0008, "step": 5688 }, { "epoch": 7.7506811989100814, "grad_norm": 0.514541382219728, "learning_rate": 2.538801847713759e-06, "loss": 0.0086, "step": 5689 }, { "epoch": 7.752043596730245, "grad_norm": 0.6870507904242411, "learning_rate": 2.535864367899179e-06, "loss": 0.0006, "step": 5690 }, { "epoch": 7.753405994550409, "grad_norm": 0.8932958759190988, "learning_rate": 2.532928341672414e-06, "loss": 0.0031, "step": 5691 }, { "epoch": 7.754768392370572, "grad_norm": 1.9812586337865075, "learning_rate": 2.5299937696052424e-06, "loss": 0.0062, "step": 5692 }, { "epoch": 7.756130790190736, "grad_norm": 2.6967073456721753, "learning_rate": 2.527060652269144e-06, "loss": 0.0016, "step": 5693 }, { "epoch": 7.7574931880108995, "grad_norm": 0.8321450467368507, "learning_rate": 2.524128990235326e-06, "loss": 0.002, "step": 5694 }, { "epoch": 7.758855585831062, "grad_norm": 0.18305168171607233, "learning_rate": 2.521198784074712e-06, "loss": 0.0004, "step": 5695 }, { "epoch": 7.760217983651226, "grad_norm": 0.47472105061181413, "learning_rate": 2.5182700343579337e-06, "loss": 0.0029, "step": 5696 }, { "epoch": 7.76158038147139, "grad_norm": 0.753918412761487, "learning_rate": 2.5153427416553488e-06, "loss": 0.0012, "step": 5697 }, { "epoch": 7.762942779291553, "grad_norm": 0.4713086453055967, "learning_rate": 2.512416906537022e-06, "loss": 0.0089, "step": 5698 }, { "epoch": 7.764305177111717, "grad_norm": 0.7073198713273148, "learning_rate": 2.5094925295727423e-06, "loss": 0.0005, "step": 5699 }, { "epoch": 7.76566757493188, "grad_norm": 0.6419388825239487, "learning_rate": 2.5065696113320147e-06, "loss": 0.0012, "step": 5700 }, { "epoch": 7.767029972752043, "grad_norm": 1.3368033843693747, "learning_rate": 2.5036481523840496e-06, "loss": 0.0209, "step": 5701 }, { "epoch": 7.768392370572207, "grad_norm": 1.21229821181527, "learning_rate": 2.500728153297788e-06, "loss": 0.0007, "step": 5702 }, { "epoch": 7.769754768392371, "grad_norm": 1.0243317204578601, "learning_rate": 2.497809614641872e-06, "loss": 0.0024, "step": 5703 }, { "epoch": 7.771117166212534, "grad_norm": 0.7711487899493259, "learning_rate": 2.49489253698467e-06, "loss": 0.0004, "step": 5704 }, { "epoch": 7.772479564032698, "grad_norm": 0.7567126687878758, "learning_rate": 2.4919769208942658e-06, "loss": 0.0198, "step": 5705 }, { "epoch": 7.773841961852861, "grad_norm": 1.006115749193325, "learning_rate": 2.489062766938448e-06, "loss": 0.0008, "step": 5706 }, { "epoch": 7.775204359673024, "grad_norm": 1.2517039551720734, "learning_rate": 2.4861500756847332e-06, "loss": 0.0053, "step": 5707 }, { "epoch": 7.776566757493188, "grad_norm": 1.041488723120965, "learning_rate": 2.4832388477003444e-06, "loss": 0.0133, "step": 5708 }, { "epoch": 7.777929155313352, "grad_norm": 2.576260588447377, "learning_rate": 2.480329083552225e-06, "loss": 0.01, "step": 5709 }, { "epoch": 7.779291553133515, "grad_norm": 1.0083945521883182, "learning_rate": 2.4774207838070275e-06, "loss": 0.0082, "step": 5710 }, { "epoch": 7.7806539509536785, "grad_norm": 1.074614923684055, "learning_rate": 2.4745139490311254e-06, "loss": 0.0026, "step": 5711 }, { "epoch": 7.782016348773842, "grad_norm": 0.7727849393407759, "learning_rate": 2.4716085797906062e-06, "loss": 0.0005, "step": 5712 }, { "epoch": 7.783378746594005, "grad_norm": 0.6969309062732068, "learning_rate": 2.4687046766512656e-06, "loss": 0.0018, "step": 5713 }, { "epoch": 7.784741144414169, "grad_norm": 1.3378372059319574, "learning_rate": 2.4658022401786243e-06, "loss": 0.0018, "step": 5714 }, { "epoch": 7.786103542234333, "grad_norm": 0.5206525395581661, "learning_rate": 2.462901270937903e-06, "loss": 0.0081, "step": 5715 }, { "epoch": 7.787465940054496, "grad_norm": 0.5497923155501306, "learning_rate": 2.4600017694940503e-06, "loss": 0.0079, "step": 5716 }, { "epoch": 7.7888283378746594, "grad_norm": 1.088666450595012, "learning_rate": 2.4571037364117255e-06, "loss": 0.002, "step": 5717 }, { "epoch": 7.790190735694823, "grad_norm": 0.416216206925259, "learning_rate": 2.4542071722552952e-06, "loss": 0.0154, "step": 5718 }, { "epoch": 7.791553133514986, "grad_norm": 0.7191717221879484, "learning_rate": 2.45131207758885e-06, "loss": 0.0079, "step": 5719 }, { "epoch": 7.79291553133515, "grad_norm": 3.307568541594346, "learning_rate": 2.4484184529761836e-06, "loss": 0.0215, "step": 5720 }, { "epoch": 7.794277929155314, "grad_norm": 0.3444713092471131, "learning_rate": 2.4455262989808117e-06, "loss": 0.0005, "step": 5721 }, { "epoch": 7.795640326975477, "grad_norm": 0.35558537172591104, "learning_rate": 2.4426356161659636e-06, "loss": 0.0008, "step": 5722 }, { "epoch": 7.79700272479564, "grad_norm": 0.2908972964556408, "learning_rate": 2.4397464050945753e-06, "loss": 0.0008, "step": 5723 }, { "epoch": 7.798365122615804, "grad_norm": 1.0161087800237223, "learning_rate": 2.4368586663293036e-06, "loss": 0.0169, "step": 5724 }, { "epoch": 7.799727520435967, "grad_norm": 0.10137932004635208, "learning_rate": 2.4339724004325104e-06, "loss": 0.0005, "step": 5725 }, { "epoch": 7.801089918256131, "grad_norm": 0.7165280550132641, "learning_rate": 2.4310876079662824e-06, "loss": 0.0014, "step": 5726 }, { "epoch": 7.802452316076295, "grad_norm": 1.1113077550885992, "learning_rate": 2.428204289492406e-06, "loss": 0.0034, "step": 5727 }, { "epoch": 7.8038147138964575, "grad_norm": 2.334638560639589, "learning_rate": 2.42532244557239e-06, "loss": 0.0157, "step": 5728 }, { "epoch": 7.805177111716621, "grad_norm": 1.240760207315243, "learning_rate": 2.422442076767456e-06, "loss": 0.0227, "step": 5729 }, { "epoch": 7.806539509536785, "grad_norm": 1.6005692354391838, "learning_rate": 2.4195631836385303e-06, "loss": 0.0043, "step": 5730 }, { "epoch": 7.807901907356948, "grad_norm": 0.8929857681325241, "learning_rate": 2.416685766746263e-06, "loss": 0.004, "step": 5731 }, { "epoch": 7.809264305177112, "grad_norm": 0.2793325082078326, "learning_rate": 2.4138098266510036e-06, "loss": 0.0006, "step": 5732 }, { "epoch": 7.810626702997276, "grad_norm": 0.390266586270456, "learning_rate": 2.410935363912825e-06, "loss": 0.0006, "step": 5733 }, { "epoch": 7.8119891008174385, "grad_norm": 1.3120887156453676, "learning_rate": 2.4080623790915116e-06, "loss": 0.0191, "step": 5734 }, { "epoch": 7.813351498637602, "grad_norm": 1.813324157800951, "learning_rate": 2.4051908727465513e-06, "loss": 0.0082, "step": 5735 }, { "epoch": 7.814713896457766, "grad_norm": 0.34780458375250645, "learning_rate": 2.402320845437155e-06, "loss": 0.0079, "step": 5736 }, { "epoch": 7.816076294277929, "grad_norm": 0.6690935779362632, "learning_rate": 2.3994522977222346e-06, "loss": 0.0024, "step": 5737 }, { "epoch": 7.817438692098093, "grad_norm": 0.15120613826423368, "learning_rate": 2.396585230160425e-06, "loss": 0.0079, "step": 5738 }, { "epoch": 7.8188010899182565, "grad_norm": 0.4766735957109569, "learning_rate": 2.393719643310063e-06, "loss": 0.0005, "step": 5739 }, { "epoch": 7.820163487738419, "grad_norm": 0.692795047161507, "learning_rate": 2.3908555377292033e-06, "loss": 0.009, "step": 5740 }, { "epoch": 7.821525885558583, "grad_norm": 0.5173218117846368, "learning_rate": 2.3879929139756133e-06, "loss": 0.0023, "step": 5741 }, { "epoch": 7.822888283378747, "grad_norm": 1.109571091781906, "learning_rate": 2.3851317726067626e-06, "loss": 0.0102, "step": 5742 }, { "epoch": 7.82425068119891, "grad_norm": 0.27992643163212927, "learning_rate": 2.3822721141798445e-06, "loss": 0.0007, "step": 5743 }, { "epoch": 7.825613079019074, "grad_norm": 0.8103117896978754, "learning_rate": 2.379413939251751e-06, "loss": 0.0046, "step": 5744 }, { "epoch": 7.8269754768392374, "grad_norm": 0.6553781296543227, "learning_rate": 2.3765572483790957e-06, "loss": 0.0011, "step": 5745 }, { "epoch": 7.8283378746594, "grad_norm": 0.49057400889955777, "learning_rate": 2.373702042118201e-06, "loss": 0.0006, "step": 5746 }, { "epoch": 7.829700272479564, "grad_norm": 0.1903815577061928, "learning_rate": 2.370848321025093e-06, "loss": 0.0005, "step": 5747 }, { "epoch": 7.831062670299728, "grad_norm": 1.231739066667946, "learning_rate": 2.3679960856555185e-06, "loss": 0.0062, "step": 5748 }, { "epoch": 7.832425068119891, "grad_norm": 0.2941434449905125, "learning_rate": 2.365145336564926e-06, "loss": 0.0005, "step": 5749 }, { "epoch": 7.833787465940055, "grad_norm": 0.7131783737343377, "learning_rate": 2.36229607430848e-06, "loss": 0.0005, "step": 5750 }, { "epoch": 7.835149863760218, "grad_norm": 1.4731925290048913, "learning_rate": 2.359448299441057e-06, "loss": 0.0228, "step": 5751 }, { "epoch": 7.836512261580381, "grad_norm": 0.19055823861099802, "learning_rate": 2.3566020125172382e-06, "loss": 0.0005, "step": 5752 }, { "epoch": 7.837874659400545, "grad_norm": 1.840147759292583, "learning_rate": 2.353757214091321e-06, "loss": 0.0162, "step": 5753 }, { "epoch": 7.839237057220709, "grad_norm": 0.12362409387094016, "learning_rate": 2.3509139047173046e-06, "loss": 0.0005, "step": 5754 }, { "epoch": 7.840599455040872, "grad_norm": 0.22285270891412212, "learning_rate": 2.348072084948909e-06, "loss": 0.0071, "step": 5755 }, { "epoch": 7.8419618528610355, "grad_norm": 1.887665665235507, "learning_rate": 2.3452317553395544e-06, "loss": 0.0128, "step": 5756 }, { "epoch": 7.843324250681199, "grad_norm": 1.4713809743191548, "learning_rate": 2.342392916442375e-06, "loss": 0.0064, "step": 5757 }, { "epoch": 7.844686648501362, "grad_norm": 0.20360462653567377, "learning_rate": 2.339555568810221e-06, "loss": 0.0076, "step": 5758 }, { "epoch": 7.846049046321526, "grad_norm": 0.19289587401927658, "learning_rate": 2.3367197129956376e-06, "loss": 0.0008, "step": 5759 }, { "epoch": 7.84741144414169, "grad_norm": 1.3699590745101917, "learning_rate": 2.333885349550895e-06, "loss": 0.0103, "step": 5760 }, { "epoch": 7.848773841961853, "grad_norm": 2.4341446542832594, "learning_rate": 2.3310524790279577e-06, "loss": 0.0207, "step": 5761 }, { "epoch": 7.8501362397820165, "grad_norm": 1.0207500182852882, "learning_rate": 2.328221101978513e-06, "loss": 0.0044, "step": 5762 }, { "epoch": 7.85149863760218, "grad_norm": 2.1047253905857635, "learning_rate": 2.3253912189539517e-06, "loss": 0.0268, "step": 5763 }, { "epoch": 7.852861035422343, "grad_norm": 1.8718096719451383, "learning_rate": 2.322562830505369e-06, "loss": 0.0036, "step": 5764 }, { "epoch": 7.854223433242507, "grad_norm": 2.276306573867422, "learning_rate": 2.3197359371835802e-06, "loss": 0.0038, "step": 5765 }, { "epoch": 7.855585831062671, "grad_norm": 1.4373862003863986, "learning_rate": 2.3169105395390967e-06, "loss": 0.0187, "step": 5766 }, { "epoch": 7.856948228882834, "grad_norm": 0.5571892289091755, "learning_rate": 2.3140866381221494e-06, "loss": 0.0082, "step": 5767 }, { "epoch": 7.858310626702997, "grad_norm": 3.439963961579769, "learning_rate": 2.3112642334826686e-06, "loss": 0.0028, "step": 5768 }, { "epoch": 7.859673024523161, "grad_norm": 0.8868602227229526, "learning_rate": 2.3084433261703e-06, "loss": 0.0079, "step": 5769 }, { "epoch": 7.861035422343324, "grad_norm": 1.4379095611576713, "learning_rate": 2.3056239167343987e-06, "loss": 0.0091, "step": 5770 }, { "epoch": 7.862397820163488, "grad_norm": 0.27881255312748904, "learning_rate": 2.3028060057240186e-06, "loss": 0.0006, "step": 5771 }, { "epoch": 7.863760217983652, "grad_norm": 1.1837182006499944, "learning_rate": 2.2999895936879346e-06, "loss": 0.0017, "step": 5772 }, { "epoch": 7.8651226158038146, "grad_norm": 0.9981574269067529, "learning_rate": 2.2971746811746164e-06, "loss": 0.0013, "step": 5773 }, { "epoch": 7.866485013623978, "grad_norm": 0.9504862628861808, "learning_rate": 2.2943612687322525e-06, "loss": 0.0007, "step": 5774 }, { "epoch": 7.867847411444142, "grad_norm": 0.7708781115741216, "learning_rate": 2.2915493569087366e-06, "loss": 0.0012, "step": 5775 }, { "epoch": 7.869209809264305, "grad_norm": 0.6844564003507851, "learning_rate": 2.288738946251664e-06, "loss": 0.0005, "step": 5776 }, { "epoch": 7.870572207084469, "grad_norm": 0.8246401449688778, "learning_rate": 2.285930037308347e-06, "loss": 0.0005, "step": 5777 }, { "epoch": 7.871934604904633, "grad_norm": 0.9510131321120097, "learning_rate": 2.283122630625796e-06, "loss": 0.0098, "step": 5778 }, { "epoch": 7.8732970027247955, "grad_norm": 0.32508935254877086, "learning_rate": 2.2803167267507366e-06, "loss": 0.001, "step": 5779 }, { "epoch": 7.874659400544959, "grad_norm": 0.8972509484138338, "learning_rate": 2.277512326229601e-06, "loss": 0.0006, "step": 5780 }, { "epoch": 7.876021798365123, "grad_norm": 0.7965593066822719, "learning_rate": 2.2747094296085214e-06, "loss": 0.0007, "step": 5781 }, { "epoch": 7.877384196185286, "grad_norm": 0.2692828360847994, "learning_rate": 2.2719080374333468e-06, "loss": 0.0092, "step": 5782 }, { "epoch": 7.87874659400545, "grad_norm": 0.07227003756094354, "learning_rate": 2.269108150249625e-06, "loss": 0.0004, "step": 5783 }, { "epoch": 7.8801089918256135, "grad_norm": 2.511892667457382, "learning_rate": 2.2663097686026183e-06, "loss": 0.0141, "step": 5784 }, { "epoch": 7.881471389645776, "grad_norm": 0.2893069677446308, "learning_rate": 2.263512893037285e-06, "loss": 0.0006, "step": 5785 }, { "epoch": 7.88283378746594, "grad_norm": 0.6401372883032629, "learning_rate": 2.2607175240983027e-06, "loss": 0.0033, "step": 5786 }, { "epoch": 7.884196185286104, "grad_norm": 0.6153020958475288, "learning_rate": 2.2579236623300503e-06, "loss": 0.0011, "step": 5787 }, { "epoch": 7.885558583106267, "grad_norm": 0.18688200633765134, "learning_rate": 2.255131308276609e-06, "loss": 0.0004, "step": 5788 }, { "epoch": 7.886920980926431, "grad_norm": 0.6543077896592917, "learning_rate": 2.2523404624817733e-06, "loss": 0.002, "step": 5789 }, { "epoch": 7.8882833787465945, "grad_norm": 0.6575377879214335, "learning_rate": 2.249551125489038e-06, "loss": 0.0024, "step": 5790 }, { "epoch": 7.889645776566757, "grad_norm": 0.24853082271496818, "learning_rate": 2.2467632978416064e-06, "loss": 0.0003, "step": 5791 }, { "epoch": 7.891008174386921, "grad_norm": 2.332770266098299, "learning_rate": 2.2439769800823942e-06, "loss": 0.003, "step": 5792 }, { "epoch": 7.892370572207085, "grad_norm": 0.1730181440638547, "learning_rate": 2.24119217275401e-06, "loss": 0.0081, "step": 5793 }, { "epoch": 7.893732970027248, "grad_norm": 1.9190606229243004, "learning_rate": 2.2384088763987798e-06, "loss": 0.0111, "step": 5794 }, { "epoch": 7.895095367847412, "grad_norm": 1.3295517143443059, "learning_rate": 2.2356270915587274e-06, "loss": 0.0057, "step": 5795 }, { "epoch": 7.896457765667575, "grad_norm": 0.09115147354298418, "learning_rate": 2.2328468187755915e-06, "loss": 0.0004, "step": 5796 }, { "epoch": 7.897820163487738, "grad_norm": 0.7942307926223258, "learning_rate": 2.230068058590803e-06, "loss": 0.0082, "step": 5797 }, { "epoch": 7.899182561307902, "grad_norm": 0.412197401435321, "learning_rate": 2.2272908115455105e-06, "loss": 0.0034, "step": 5798 }, { "epoch": 7.900544959128066, "grad_norm": 0.8152215820089248, "learning_rate": 2.224515078180565e-06, "loss": 0.0097, "step": 5799 }, { "epoch": 7.901907356948229, "grad_norm": 0.2515795973121825, "learning_rate": 2.2217408590365164e-06, "loss": 0.0005, "step": 5800 }, { "epoch": 7.9032697547683926, "grad_norm": 0.6825727649343877, "learning_rate": 2.2189681546536292e-06, "loss": 0.0004, "step": 5801 }, { "epoch": 7.904632152588556, "grad_norm": 0.14146355698638233, "learning_rate": 2.216196965571862e-06, "loss": 0.0079, "step": 5802 }, { "epoch": 7.905994550408719, "grad_norm": 0.6222878903067107, "learning_rate": 2.2134272923308885e-06, "loss": 0.0006, "step": 5803 }, { "epoch": 7.907356948228883, "grad_norm": 0.6838472984235999, "learning_rate": 2.2106591354700846e-06, "loss": 0.0085, "step": 5804 }, { "epoch": 7.908719346049047, "grad_norm": 0.26943746347121134, "learning_rate": 2.2078924955285253e-06, "loss": 0.0015, "step": 5805 }, { "epoch": 7.91008174386921, "grad_norm": 0.8661372233826274, "learning_rate": 2.2051273730449986e-06, "loss": 0.0076, "step": 5806 }, { "epoch": 7.9114441416893735, "grad_norm": 0.9062197009726738, "learning_rate": 2.2023637685579856e-06, "loss": 0.0023, "step": 5807 }, { "epoch": 7.912806539509537, "grad_norm": 0.19239912105199383, "learning_rate": 2.1996016826056854e-06, "loss": 0.0006, "step": 5808 }, { "epoch": 7.9141689373297, "grad_norm": 0.2137638623917626, "learning_rate": 2.1968411157259916e-06, "loss": 0.0081, "step": 5809 }, { "epoch": 7.915531335149864, "grad_norm": 0.37092291279479145, "learning_rate": 2.194082068456509e-06, "loss": 0.0125, "step": 5810 }, { "epoch": 7.916893732970028, "grad_norm": 0.5079553782997661, "learning_rate": 2.1913245413345395e-06, "loss": 0.0006, "step": 5811 }, { "epoch": 7.918256130790191, "grad_norm": 0.4577555025799726, "learning_rate": 2.1885685348970897e-06, "loss": 0.0065, "step": 5812 }, { "epoch": 7.919618528610354, "grad_norm": 0.6425935826484432, "learning_rate": 2.1858140496808776e-06, "loss": 0.0082, "step": 5813 }, { "epoch": 7.920980926430518, "grad_norm": 0.18706691884583548, "learning_rate": 2.1830610862223123e-06, "loss": 0.0004, "step": 5814 }, { "epoch": 7.922343324250681, "grad_norm": 1.0594998193800522, "learning_rate": 2.1803096450575244e-06, "loss": 0.0045, "step": 5815 }, { "epoch": 7.923705722070845, "grad_norm": 0.7679302788937504, "learning_rate": 2.1775597267223327e-06, "loss": 0.0088, "step": 5816 }, { "epoch": 7.925068119891008, "grad_norm": 1.362449338775294, "learning_rate": 2.174811331752261e-06, "loss": 0.0059, "step": 5817 }, { "epoch": 7.926430517711172, "grad_norm": 0.26020798979008036, "learning_rate": 2.172064460682545e-06, "loss": 0.0067, "step": 5818 }, { "epoch": 7.927792915531335, "grad_norm": 0.17606954238781983, "learning_rate": 2.169319114048114e-06, "loss": 0.0006, "step": 5819 }, { "epoch": 7.929155313351498, "grad_norm": 0.1887753464124293, "learning_rate": 2.166575292383607e-06, "loss": 0.0004, "step": 5820 }, { "epoch": 7.930517711171662, "grad_norm": 0.16994808006056847, "learning_rate": 2.163832996223365e-06, "loss": 0.0004, "step": 5821 }, { "epoch": 7.931880108991826, "grad_norm": 0.933117577692471, "learning_rate": 2.161092226101432e-06, "loss": 0.0068, "step": 5822 }, { "epoch": 7.933242506811989, "grad_norm": 1.5248840418214864, "learning_rate": 2.1583529825515526e-06, "loss": 0.0094, "step": 5823 }, { "epoch": 7.9346049046321525, "grad_norm": 0.6155578647964586, "learning_rate": 2.1556152661071707e-06, "loss": 0.0008, "step": 5824 }, { "epoch": 7.935967302452316, "grad_norm": 0.4722518514638843, "learning_rate": 2.152879077301443e-06, "loss": 0.0007, "step": 5825 }, { "epoch": 7.937329700272479, "grad_norm": 0.48865962289159004, "learning_rate": 2.150144416667217e-06, "loss": 0.0004, "step": 5826 }, { "epoch": 7.938692098092643, "grad_norm": 2.3052456970193336, "learning_rate": 2.1474112847370566e-06, "loss": 0.0056, "step": 5827 }, { "epoch": 7.940054495912807, "grad_norm": 0.8734610830010099, "learning_rate": 2.144679682043217e-06, "loss": 0.0102, "step": 5828 }, { "epoch": 7.94141689373297, "grad_norm": 0.3546259646566513, "learning_rate": 2.1419496091176538e-06, "loss": 0.004, "step": 5829 }, { "epoch": 7.9427792915531334, "grad_norm": 0.12290974799050132, "learning_rate": 2.1392210664920353e-06, "loss": 0.0004, "step": 5830 }, { "epoch": 7.944141689373297, "grad_norm": 1.7926659227957318, "learning_rate": 2.136494054697722e-06, "loss": 0.0191, "step": 5831 }, { "epoch": 7.94550408719346, "grad_norm": 0.5501911763146341, "learning_rate": 2.133768574265782e-06, "loss": 0.0075, "step": 5832 }, { "epoch": 7.946866485013624, "grad_norm": 0.46134398444196484, "learning_rate": 2.1310446257269834e-06, "loss": 0.0009, "step": 5833 }, { "epoch": 7.948228882833788, "grad_norm": 1.7541564824298377, "learning_rate": 2.1283222096117984e-06, "loss": 0.0077, "step": 5834 }, { "epoch": 7.949591280653951, "grad_norm": 0.2772387056110345, "learning_rate": 2.125601326450395e-06, "loss": 0.0084, "step": 5835 }, { "epoch": 7.950953678474114, "grad_norm": 1.0629336408126548, "learning_rate": 2.122881976772646e-06, "loss": 0.0097, "step": 5836 }, { "epoch": 7.952316076294278, "grad_norm": 1.08551391869013, "learning_rate": 2.1201641611081246e-06, "loss": 0.004, "step": 5837 }, { "epoch": 7.953678474114441, "grad_norm": 0.4094098836691078, "learning_rate": 2.11744787998611e-06, "loss": 0.001, "step": 5838 }, { "epoch": 7.955040871934605, "grad_norm": 1.0033110546830402, "learning_rate": 2.114733133935578e-06, "loss": 0.0095, "step": 5839 }, { "epoch": 7.956403269754769, "grad_norm": 0.18299466828462738, "learning_rate": 2.112019923485207e-06, "loss": 0.0033, "step": 5840 }, { "epoch": 7.9577656675749315, "grad_norm": 0.7667837885591455, "learning_rate": 2.10930824916337e-06, "loss": 0.0005, "step": 5841 }, { "epoch": 7.959128065395095, "grad_norm": 1.0162249165147905, "learning_rate": 2.1065981114981535e-06, "loss": 0.0117, "step": 5842 }, { "epoch": 7.960490463215259, "grad_norm": 0.656053019841841, "learning_rate": 2.1038895110173287e-06, "loss": 0.0025, "step": 5843 }, { "epoch": 7.961852861035422, "grad_norm": 0.11464378210468777, "learning_rate": 2.101182448248388e-06, "loss": 0.0005, "step": 5844 }, { "epoch": 7.963215258855586, "grad_norm": 1.3508485681576805, "learning_rate": 2.0984769237185043e-06, "loss": 0.0158, "step": 5845 }, { "epoch": 7.96457765667575, "grad_norm": 0.5731800624436233, "learning_rate": 2.0957729379545654e-06, "loss": 0.0004, "step": 5846 }, { "epoch": 7.9659400544959125, "grad_norm": 0.3613150764317762, "learning_rate": 2.093070491483149e-06, "loss": 0.0007, "step": 5847 }, { "epoch": 7.967302452316076, "grad_norm": 1.5432839042341449, "learning_rate": 2.090369584830537e-06, "loss": 0.0048, "step": 5848 }, { "epoch": 7.96866485013624, "grad_norm": 2.354061965243665, "learning_rate": 2.087670218522714e-06, "loss": 0.0179, "step": 5849 }, { "epoch": 7.970027247956403, "grad_norm": 1.95099129359819, "learning_rate": 2.0849723930853628e-06, "loss": 0.0115, "step": 5850 }, { "epoch": 7.971389645776567, "grad_norm": 1.701000605558548, "learning_rate": 2.082276109043867e-06, "loss": 0.0296, "step": 5851 }, { "epoch": 7.9727520435967305, "grad_norm": 0.7394932252053203, "learning_rate": 2.0795813669233078e-06, "loss": 0.0008, "step": 5852 }, { "epoch": 7.974114441416893, "grad_norm": 0.5344426115276387, "learning_rate": 2.0768881672484643e-06, "loss": 0.0072, "step": 5853 }, { "epoch": 7.975476839237057, "grad_norm": 0.6529038452998126, "learning_rate": 2.074196510543822e-06, "loss": 0.0141, "step": 5854 }, { "epoch": 7.976839237057221, "grad_norm": 1.015755101659313, "learning_rate": 2.071506397333557e-06, "loss": 0.0073, "step": 5855 }, { "epoch": 7.978201634877384, "grad_norm": 1.3929222988860523, "learning_rate": 2.068817828141558e-06, "loss": 0.0152, "step": 5856 }, { "epoch": 7.979564032697548, "grad_norm": 1.5206826579977517, "learning_rate": 2.0661308034913986e-06, "loss": 0.005, "step": 5857 }, { "epoch": 7.9809264305177114, "grad_norm": 1.4711597378919523, "learning_rate": 2.0634453239063623e-06, "loss": 0.0107, "step": 5858 }, { "epoch": 7.982288828337874, "grad_norm": 1.1901692802403758, "learning_rate": 2.0607613899094235e-06, "loss": 0.0097, "step": 5859 }, { "epoch": 7.983651226158038, "grad_norm": 0.3962154034167815, "learning_rate": 2.05807900202326e-06, "loss": 0.001, "step": 5860 }, { "epoch": 7.985013623978202, "grad_norm": 0.5558299363651726, "learning_rate": 2.0553981607702477e-06, "loss": 0.0047, "step": 5861 }, { "epoch": 7.986376021798365, "grad_norm": 0.7479854343896363, "learning_rate": 2.0527188666724627e-06, "loss": 0.007, "step": 5862 }, { "epoch": 7.987738419618529, "grad_norm": 1.1626963598343862, "learning_rate": 2.0500411202516814e-06, "loss": 0.0054, "step": 5863 }, { "epoch": 7.989100817438692, "grad_norm": 0.8722395117570212, "learning_rate": 2.04736492202937e-06, "loss": 0.0031, "step": 5864 }, { "epoch": 7.990463215258855, "grad_norm": 0.2677739186411213, "learning_rate": 2.044690272526706e-06, "loss": 0.0006, "step": 5865 }, { "epoch": 7.991825613079019, "grad_norm": 0.3296150111736111, "learning_rate": 2.0420171722645522e-06, "loss": 0.0085, "step": 5866 }, { "epoch": 7.993188010899183, "grad_norm": 1.0369993338850252, "learning_rate": 2.0393456217634777e-06, "loss": 0.0085, "step": 5867 }, { "epoch": 7.994550408719346, "grad_norm": 0.6305021052641328, "learning_rate": 2.036675621543753e-06, "loss": 0.0091, "step": 5868 }, { "epoch": 7.9959128065395095, "grad_norm": 0.6682736775696436, "learning_rate": 2.034007172125335e-06, "loss": 0.0052, "step": 5869 }, { "epoch": 7.997275204359673, "grad_norm": 0.23511702321587244, "learning_rate": 2.031340274027891e-06, "loss": 0.0079, "step": 5870 }, { "epoch": 7.998637602179836, "grad_norm": 0.3014261103652605, "learning_rate": 2.0286749277707783e-06, "loss": 0.0007, "step": 5871 }, { "epoch": 8.0, "grad_norm": 0.9446049050973007, "learning_rate": 2.026011133873048e-06, "loss": 0.0141, "step": 5872 }, { "epoch": 8.0, "eval_accuracy": 0.9455362156092083, "eval_f1": 0.9370106732535761, "eval_loss": 0.14214062690734863, "eval_precision": 0.9295128060137188, "eval_recall": 0.9480454206999256, "eval_runtime": 19.3777, "eval_samples_per_second": 91.91, "eval_steps_per_second": 0.722, "step": 5872 }, { "epoch": 8.001362397820163, "grad_norm": 0.46684527532017134, "learning_rate": 2.023348892853467e-06, "loss": 0.0008, "step": 5873 }, { "epoch": 8.002724795640328, "grad_norm": 0.6674869289257558, "learning_rate": 2.02068820523048e-06, "loss": 0.0078, "step": 5874 }, { "epoch": 8.00408719346049, "grad_norm": 0.1652688267713772, "learning_rate": 2.018029071522243e-06, "loss": 0.0004, "step": 5875 }, { "epoch": 8.005449591280653, "grad_norm": 0.07949063673370528, "learning_rate": 2.015371492246596e-06, "loss": 0.0003, "step": 5876 }, { "epoch": 8.006811989100818, "grad_norm": 0.36789034385475233, "learning_rate": 2.0127154679210904e-06, "loss": 0.0005, "step": 5877 }, { "epoch": 8.008174386920981, "grad_norm": 0.7903525951070565, "learning_rate": 2.010060999062964e-06, "loss": 0.0092, "step": 5878 }, { "epoch": 8.009536784741144, "grad_norm": 0.6519063495337054, "learning_rate": 2.0074080861891564e-06, "loss": 0.0079, "step": 5879 }, { "epoch": 8.010899182561309, "grad_norm": 0.5358180516610231, "learning_rate": 2.004756729816307e-06, "loss": 0.0068, "step": 5880 }, { "epoch": 8.012261580381471, "grad_norm": 0.9346106958648531, "learning_rate": 2.0021069304607455e-06, "loss": 0.0093, "step": 5881 }, { "epoch": 8.013623978201634, "grad_norm": 0.2786996629086419, "learning_rate": 1.9994586886385046e-06, "loss": 0.0023, "step": 5882 }, { "epoch": 8.014986376021799, "grad_norm": 0.1480301799293539, "learning_rate": 1.9968120048653094e-06, "loss": 0.0004, "step": 5883 }, { "epoch": 8.016348773841962, "grad_norm": 0.38062297588013144, "learning_rate": 1.9941668796565773e-06, "loss": 0.0014, "step": 5884 }, { "epoch": 8.017711171662125, "grad_norm": 0.38005796642206335, "learning_rate": 1.991523313527437e-06, "loss": 0.0008, "step": 5885 }, { "epoch": 8.01907356948229, "grad_norm": 1.7292150189023352, "learning_rate": 1.9888813069926983e-06, "loss": 0.0116, "step": 5886 }, { "epoch": 8.020435967302452, "grad_norm": 0.3226355274308322, "learning_rate": 1.986240860566878e-06, "loss": 0.0077, "step": 5887 }, { "epoch": 8.021798365122615, "grad_norm": 0.43110482756618507, "learning_rate": 1.98360197476418e-06, "loss": 0.0007, "step": 5888 }, { "epoch": 8.02316076294278, "grad_norm": 0.5811575984336803, "learning_rate": 1.980964650098509e-06, "loss": 0.0006, "step": 5889 }, { "epoch": 8.024523160762943, "grad_norm": 2.955643258448458, "learning_rate": 1.9783288870834705e-06, "loss": 0.0205, "step": 5890 }, { "epoch": 8.025885558583106, "grad_norm": 0.870130410869259, "learning_rate": 1.9756946862323534e-06, "loss": 0.0059, "step": 5891 }, { "epoch": 8.02724795640327, "grad_norm": 0.4979945352532563, "learning_rate": 1.9730620480581575e-06, "loss": 0.0084, "step": 5892 }, { "epoch": 8.028610354223433, "grad_norm": 0.15929509157370647, "learning_rate": 1.9704309730735638e-06, "loss": 0.0003, "step": 5893 }, { "epoch": 8.029972752043596, "grad_norm": 0.5098315316722671, "learning_rate": 1.9678014617909604e-06, "loss": 0.0077, "step": 5894 }, { "epoch": 8.03133514986376, "grad_norm": 0.19157371214714575, "learning_rate": 1.9651735147224214e-06, "loss": 0.0004, "step": 5895 }, { "epoch": 8.032697547683924, "grad_norm": 0.3685740378676392, "learning_rate": 1.962547132379724e-06, "loss": 0.0005, "step": 5896 }, { "epoch": 8.034059945504087, "grad_norm": 0.482277661412048, "learning_rate": 1.95992231527434e-06, "loss": 0.0003, "step": 5897 }, { "epoch": 8.035422343324251, "grad_norm": 0.4479366210024694, "learning_rate": 1.9572990639174273e-06, "loss": 0.0087, "step": 5898 }, { "epoch": 8.036784741144414, "grad_norm": 0.13973015876549735, "learning_rate": 1.9546773788198538e-06, "loss": 0.0005, "step": 5899 }, { "epoch": 8.038147138964577, "grad_norm": 0.26042840294032815, "learning_rate": 1.952057260492167e-06, "loss": 0.0008, "step": 5900 }, { "epoch": 8.039509536784742, "grad_norm": 0.13824513779184316, "learning_rate": 1.9494387094446187e-06, "loss": 0.0009, "step": 5901 }, { "epoch": 8.040871934604905, "grad_norm": 1.5565467812604987, "learning_rate": 1.946821726187158e-06, "loss": 0.0051, "step": 5902 }, { "epoch": 8.042234332425068, "grad_norm": 0.9367382548439195, "learning_rate": 1.9442063112294164e-06, "loss": 0.0197, "step": 5903 }, { "epoch": 8.043596730245232, "grad_norm": 0.3169299216921019, "learning_rate": 1.941592465080735e-06, "loss": 0.0004, "step": 5904 }, { "epoch": 8.044959128065395, "grad_norm": 0.14477589774487412, "learning_rate": 1.938980188250135e-06, "loss": 0.0017, "step": 5905 }, { "epoch": 8.046321525885558, "grad_norm": 0.3584228119810487, "learning_rate": 1.9363694812463442e-06, "loss": 0.0007, "step": 5906 }, { "epoch": 8.047683923705723, "grad_norm": 0.39322302831274236, "learning_rate": 1.933760344577774e-06, "loss": 0.0007, "step": 5907 }, { "epoch": 8.049046321525886, "grad_norm": 0.1864973375996815, "learning_rate": 1.93115277875254e-06, "loss": 0.0005, "step": 5908 }, { "epoch": 8.050408719346049, "grad_norm": 1.9052022000648319, "learning_rate": 1.9285467842784465e-06, "loss": 0.0077, "step": 5909 }, { "epoch": 8.051771117166213, "grad_norm": 0.4192974047007752, "learning_rate": 1.925942361662991e-06, "loss": 0.0006, "step": 5910 }, { "epoch": 8.053133514986376, "grad_norm": 0.5429737463272052, "learning_rate": 1.923339511413369e-06, "loss": 0.0012, "step": 5911 }, { "epoch": 8.054495912806539, "grad_norm": 0.16950716498327528, "learning_rate": 1.920738234036463e-06, "loss": 0.0004, "step": 5912 }, { "epoch": 8.055858310626704, "grad_norm": 0.33263016226161474, "learning_rate": 1.9181385300388565e-06, "loss": 0.0009, "step": 5913 }, { "epoch": 8.057220708446867, "grad_norm": 0.5284064576802953, "learning_rate": 1.9155403999268262e-06, "loss": 0.002, "step": 5914 }, { "epoch": 8.05858310626703, "grad_norm": 1.5000310294924333, "learning_rate": 1.912943844206333e-06, "loss": 0.0111, "step": 5915 }, { "epoch": 8.059945504087194, "grad_norm": 0.31253777770348856, "learning_rate": 1.910348863383045e-06, "loss": 0.0005, "step": 5916 }, { "epoch": 8.061307901907357, "grad_norm": 2.9526575016230328, "learning_rate": 1.9077554579623093e-06, "loss": 0.02, "step": 5917 }, { "epoch": 8.06267029972752, "grad_norm": 1.9795247449117224, "learning_rate": 1.9051636284491758e-06, "loss": 0.0159, "step": 5918 }, { "epoch": 8.064032697547685, "grad_norm": 0.792329831741927, "learning_rate": 1.9025733753483898e-06, "loss": 0.0006, "step": 5919 }, { "epoch": 8.065395095367847, "grad_norm": 0.6535803777956237, "learning_rate": 1.899984699164379e-06, "loss": 0.0006, "step": 5920 }, { "epoch": 8.06675749318801, "grad_norm": 0.989176011837308, "learning_rate": 1.897397600401274e-06, "loss": 0.0014, "step": 5921 }, { "epoch": 8.068119891008175, "grad_norm": 1.8745011231063813, "learning_rate": 1.894812079562889e-06, "loss": 0.0052, "step": 5922 }, { "epoch": 8.069482288828338, "grad_norm": 0.570914902465075, "learning_rate": 1.8922281371527418e-06, "loss": 0.0005, "step": 5923 }, { "epoch": 8.0708446866485, "grad_norm": 0.714252324920696, "learning_rate": 1.8896457736740316e-06, "loss": 0.0005, "step": 5924 }, { "epoch": 8.072207084468666, "grad_norm": 0.3020711556388079, "learning_rate": 1.8870649896296579e-06, "loss": 0.0008, "step": 5925 }, { "epoch": 8.073569482288828, "grad_norm": 0.05778743311550594, "learning_rate": 1.884485785522212e-06, "loss": 0.0003, "step": 5926 }, { "epoch": 8.074931880108991, "grad_norm": 1.444044092302218, "learning_rate": 1.8819081618539725e-06, "loss": 0.0065, "step": 5927 }, { "epoch": 8.076294277929156, "grad_norm": 0.6330448407603747, "learning_rate": 1.8793321191269176e-06, "loss": 0.0006, "step": 5928 }, { "epoch": 8.077656675749319, "grad_norm": 0.9239255767089317, "learning_rate": 1.8767576578427082e-06, "loss": 0.0066, "step": 5929 }, { "epoch": 8.079019073569482, "grad_norm": 1.2151902817550637, "learning_rate": 1.8741847785027046e-06, "loss": 0.0131, "step": 5930 }, { "epoch": 8.080381471389646, "grad_norm": 0.41473990739651095, "learning_rate": 1.8716134816079601e-06, "loss": 0.0008, "step": 5931 }, { "epoch": 8.08174386920981, "grad_norm": 0.8439101607827099, "learning_rate": 1.8690437676592121e-06, "loss": 0.0101, "step": 5932 }, { "epoch": 8.083106267029972, "grad_norm": 0.8563265695408085, "learning_rate": 1.8664756371568981e-06, "loss": 0.0074, "step": 5933 }, { "epoch": 8.084468664850137, "grad_norm": 0.7963270118752397, "learning_rate": 1.8639090906011393e-06, "loss": 0.0045, "step": 5934 }, { "epoch": 8.0858310626703, "grad_norm": 0.30487514493723544, "learning_rate": 1.8613441284917567e-06, "loss": 0.0004, "step": 5935 }, { "epoch": 8.087193460490463, "grad_norm": 0.9397538323167282, "learning_rate": 1.8587807513282552e-06, "loss": 0.0216, "step": 5936 }, { "epoch": 8.088555858310627, "grad_norm": 0.41263110197132663, "learning_rate": 1.8562189596098357e-06, "loss": 0.0025, "step": 5937 }, { "epoch": 8.08991825613079, "grad_norm": 0.2041035099429033, "learning_rate": 1.8536587538353912e-06, "loss": 0.0008, "step": 5938 }, { "epoch": 8.091280653950953, "grad_norm": 1.376971457263273, "learning_rate": 1.8511001345034995e-06, "loss": 0.005, "step": 5939 }, { "epoch": 8.092643051771118, "grad_norm": 0.37658431708988277, "learning_rate": 1.848543102112439e-06, "loss": 0.0005, "step": 5940 }, { "epoch": 8.09400544959128, "grad_norm": 0.5344037837728886, "learning_rate": 1.8459876571601676e-06, "loss": 0.0012, "step": 5941 }, { "epoch": 8.095367847411444, "grad_norm": 0.7670340115498988, "learning_rate": 1.843433800144343e-06, "loss": 0.0081, "step": 5942 }, { "epoch": 8.096730245231608, "grad_norm": 0.7713687480489168, "learning_rate": 1.8408815315623152e-06, "loss": 0.0087, "step": 5943 }, { "epoch": 8.098092643051771, "grad_norm": 0.569356882239356, "learning_rate": 1.838330851911112e-06, "loss": 0.0057, "step": 5944 }, { "epoch": 8.099455040871934, "grad_norm": 0.48743945791293397, "learning_rate": 1.8357817616874696e-06, "loss": 0.0021, "step": 5945 }, { "epoch": 8.100817438692099, "grad_norm": 1.3625598373419676, "learning_rate": 1.8332342613877962e-06, "loss": 0.0109, "step": 5946 }, { "epoch": 8.102179836512262, "grad_norm": 0.11697487247066685, "learning_rate": 1.8306883515082053e-06, "loss": 0.0004, "step": 5947 }, { "epoch": 8.103542234332425, "grad_norm": 0.5947633604626129, "learning_rate": 1.8281440325444954e-06, "loss": 0.0022, "step": 5948 }, { "epoch": 8.10490463215259, "grad_norm": 0.35929630040619576, "learning_rate": 1.8256013049921494e-06, "loss": 0.0005, "step": 5949 }, { "epoch": 8.106267029972752, "grad_norm": 0.2562938142697616, "learning_rate": 1.8230601693463524e-06, "loss": 0.0004, "step": 5950 }, { "epoch": 8.107629427792915, "grad_norm": 0.41310518284192277, "learning_rate": 1.820520626101967e-06, "loss": 0.0008, "step": 5951 }, { "epoch": 8.10899182561308, "grad_norm": 0.4369016448429145, "learning_rate": 1.8179826757535556e-06, "loss": 0.0008, "step": 5952 }, { "epoch": 8.110354223433243, "grad_norm": 0.8119408081628204, "learning_rate": 1.815446318795362e-06, "loss": 0.0212, "step": 5953 }, { "epoch": 8.111716621253406, "grad_norm": 0.3621340001768774, "learning_rate": 1.8129115557213262e-06, "loss": 0.0102, "step": 5954 }, { "epoch": 8.11307901907357, "grad_norm": 0.9890357241953751, "learning_rate": 1.8103783870250767e-06, "loss": 0.007, "step": 5955 }, { "epoch": 8.114441416893733, "grad_norm": 0.6484210391367418, "learning_rate": 1.8078468131999273e-06, "loss": 0.0008, "step": 5956 }, { "epoch": 8.115803814713896, "grad_norm": 0.24363554060364317, "learning_rate": 1.8053168347388882e-06, "loss": 0.0004, "step": 5957 }, { "epoch": 8.11716621253406, "grad_norm": 2.0961850879442228, "learning_rate": 1.8027884521346494e-06, "loss": 0.0118, "step": 5958 }, { "epoch": 8.118528610354224, "grad_norm": 0.9394234304721183, "learning_rate": 1.8002616658795991e-06, "loss": 0.0012, "step": 5959 }, { "epoch": 8.119891008174386, "grad_norm": 0.8883434619560533, "learning_rate": 1.7977364764658124e-06, "loss": 0.0005, "step": 5960 }, { "epoch": 8.121253405994551, "grad_norm": 0.44029170940047946, "learning_rate": 1.795212884385048e-06, "loss": 0.0076, "step": 5961 }, { "epoch": 8.122615803814714, "grad_norm": 0.24320613901065488, "learning_rate": 1.7926908901287621e-06, "loss": 0.0079, "step": 5962 }, { "epoch": 8.123978201634877, "grad_norm": 1.3453862032450354, "learning_rate": 1.7901704941880914e-06, "loss": 0.017, "step": 5963 }, { "epoch": 8.125340599455042, "grad_norm": 1.517296680526602, "learning_rate": 1.7876516970538682e-06, "loss": 0.0061, "step": 5964 }, { "epoch": 8.126702997275205, "grad_norm": 0.8280564062507821, "learning_rate": 1.7851344992166087e-06, "loss": 0.0068, "step": 5965 }, { "epoch": 8.128065395095367, "grad_norm": 0.25997485307612656, "learning_rate": 1.7826189011665186e-06, "loss": 0.0007, "step": 5966 }, { "epoch": 8.129427792915532, "grad_norm": 1.759755059203539, "learning_rate": 1.780104903393497e-06, "loss": 0.0099, "step": 5967 }, { "epoch": 8.130790190735695, "grad_norm": 0.6160497123195865, "learning_rate": 1.7775925063871225e-06, "loss": 0.0016, "step": 5968 }, { "epoch": 8.132152588555858, "grad_norm": 0.22423011219863287, "learning_rate": 1.7750817106366714e-06, "loss": 0.0003, "step": 5969 }, { "epoch": 8.133514986376023, "grad_norm": 1.53495167784778, "learning_rate": 1.7725725166310993e-06, "loss": 0.0155, "step": 5970 }, { "epoch": 8.134877384196185, "grad_norm": 2.3969452656346117, "learning_rate": 1.770064924859054e-06, "loss": 0.0137, "step": 5971 }, { "epoch": 8.136239782016348, "grad_norm": 0.15571734828589986, "learning_rate": 1.7675589358088763e-06, "loss": 0.0008, "step": 5972 }, { "epoch": 8.137602179836513, "grad_norm": 0.1325517042630117, "learning_rate": 1.7650545499685835e-06, "loss": 0.0003, "step": 5973 }, { "epoch": 8.138964577656676, "grad_norm": 0.19522393511489564, "learning_rate": 1.762551767825893e-06, "loss": 0.0005, "step": 5974 }, { "epoch": 8.140326975476839, "grad_norm": 0.9635062357003331, "learning_rate": 1.7600505898681996e-06, "loss": 0.0032, "step": 5975 }, { "epoch": 8.141689373297003, "grad_norm": 0.22949707150422438, "learning_rate": 1.7575510165825904e-06, "loss": 0.0005, "step": 5976 }, { "epoch": 8.143051771117166, "grad_norm": 1.9705691230800966, "learning_rate": 1.7550530484558438e-06, "loss": 0.0024, "step": 5977 }, { "epoch": 8.14441416893733, "grad_norm": 0.14254494357432754, "learning_rate": 1.752556685974417e-06, "loss": 0.0081, "step": 5978 }, { "epoch": 8.145776566757494, "grad_norm": 0.41680023517918136, "learning_rate": 1.7500619296244613e-06, "loss": 0.0003, "step": 5979 }, { "epoch": 8.147138964577657, "grad_norm": 0.9857487190211158, "learning_rate": 1.7475687798918117e-06, "loss": 0.0171, "step": 5980 }, { "epoch": 8.14850136239782, "grad_norm": 0.06800060559190421, "learning_rate": 1.745077237261994e-06, "loss": 0.0003, "step": 5981 }, { "epoch": 8.149863760217984, "grad_norm": 0.8341024081078205, "learning_rate": 1.7425873022202111e-06, "loss": 0.0091, "step": 5982 }, { "epoch": 8.151226158038147, "grad_norm": 0.5324022743695214, "learning_rate": 1.7400989752513709e-06, "loss": 0.0008, "step": 5983 }, { "epoch": 8.15258855585831, "grad_norm": 0.5096447477330551, "learning_rate": 1.7376122568400533e-06, "loss": 0.0013, "step": 5984 }, { "epoch": 8.153950953678475, "grad_norm": 0.20807042638081247, "learning_rate": 1.7351271474705245e-06, "loss": 0.0004, "step": 5985 }, { "epoch": 8.155313351498638, "grad_norm": 1.0122326483414334, "learning_rate": 1.7326436476267495e-06, "loss": 0.0066, "step": 5986 }, { "epoch": 8.1566757493188, "grad_norm": 0.18955335088610983, "learning_rate": 1.7301617577923668e-06, "loss": 0.0022, "step": 5987 }, { "epoch": 8.158038147138965, "grad_norm": 0.651532601353066, "learning_rate": 1.7276814784507079e-06, "loss": 0.0054, "step": 5988 }, { "epoch": 8.159400544959128, "grad_norm": 0.5941493452953961, "learning_rate": 1.725202810084794e-06, "loss": 0.0007, "step": 5989 }, { "epoch": 8.160762942779291, "grad_norm": 0.2262966460042652, "learning_rate": 1.7227257531773223e-06, "loss": 0.0003, "step": 5990 }, { "epoch": 8.162125340599456, "grad_norm": 0.1703492415417705, "learning_rate": 1.720250308210688e-06, "loss": 0.0004, "step": 5991 }, { "epoch": 8.163487738419619, "grad_norm": 0.14177231349497882, "learning_rate": 1.71777647566696e-06, "loss": 0.0005, "step": 5992 }, { "epoch": 8.164850136239782, "grad_norm": 0.6270869355746478, "learning_rate": 1.7153042560279064e-06, "loss": 0.0081, "step": 5993 }, { "epoch": 8.166212534059946, "grad_norm": 0.32296845486429293, "learning_rate": 1.7128336497749665e-06, "loss": 0.0003, "step": 5994 }, { "epoch": 8.16757493188011, "grad_norm": 0.7555975331150261, "learning_rate": 1.710364657389284e-06, "loss": 0.0082, "step": 5995 }, { "epoch": 8.168937329700272, "grad_norm": 0.4914853849355637, "learning_rate": 1.7078972793516713e-06, "loss": 0.0081, "step": 5996 }, { "epoch": 8.170299727520437, "grad_norm": 0.25415918954610767, "learning_rate": 1.7054315161426317e-06, "loss": 0.0005, "step": 5997 }, { "epoch": 8.1716621253406, "grad_norm": 0.18507095436285148, "learning_rate": 1.7029673682423597e-06, "loss": 0.0003, "step": 5998 }, { "epoch": 8.173024523160763, "grad_norm": 0.6414925364955436, "learning_rate": 1.7005048361307264e-06, "loss": 0.004, "step": 5999 }, { "epoch": 8.174386920980927, "grad_norm": 0.13082532864244686, "learning_rate": 1.6980439202872934e-06, "loss": 0.0004, "step": 6000 }, { "epoch": 8.17574931880109, "grad_norm": 0.23437351634303724, "learning_rate": 1.6955846211913118e-06, "loss": 0.0004, "step": 6001 }, { "epoch": 8.177111716621253, "grad_norm": 0.9923948159397439, "learning_rate": 1.6931269393217054e-06, "loss": 0.0048, "step": 6002 }, { "epoch": 8.178474114441418, "grad_norm": 0.30776982336958764, "learning_rate": 1.6906708751570955e-06, "loss": 0.0005, "step": 6003 }, { "epoch": 8.17983651226158, "grad_norm": 1.3738522184405204, "learning_rate": 1.6882164291757807e-06, "loss": 0.0108, "step": 6004 }, { "epoch": 8.181198910081743, "grad_norm": 0.34299526546761405, "learning_rate": 1.6857636018557466e-06, "loss": 0.0081, "step": 6005 }, { "epoch": 8.182561307901908, "grad_norm": 3.4132722943302167, "learning_rate": 1.6833123936746664e-06, "loss": 0.0066, "step": 6006 }, { "epoch": 8.183923705722071, "grad_norm": 0.45473549883437175, "learning_rate": 1.6808628051098974e-06, "loss": 0.0022, "step": 6007 }, { "epoch": 8.185286103542234, "grad_norm": 0.6673652661722742, "learning_rate": 1.6784148366384756e-06, "loss": 0.0006, "step": 6008 }, { "epoch": 8.186648501362399, "grad_norm": 0.265975131868501, "learning_rate": 1.6759684887371253e-06, "loss": 0.0007, "step": 6009 }, { "epoch": 8.188010899182562, "grad_norm": 0.22211082548400912, "learning_rate": 1.6735237618822598e-06, "loss": 0.0003, "step": 6010 }, { "epoch": 8.189373297002724, "grad_norm": 0.6937105459555674, "learning_rate": 1.6710806565499648e-06, "loss": 0.0026, "step": 6011 }, { "epoch": 8.190735694822889, "grad_norm": 0.40365592308944664, "learning_rate": 1.6686391732160278e-06, "loss": 0.0004, "step": 6012 }, { "epoch": 8.192098092643052, "grad_norm": 0.353199245107328, "learning_rate": 1.6661993123559051e-06, "loss": 0.0019, "step": 6013 }, { "epoch": 8.193460490463215, "grad_norm": 0.21215071967254054, "learning_rate": 1.6637610744447407e-06, "loss": 0.0003, "step": 6014 }, { "epoch": 8.19482288828338, "grad_norm": 0.49280730807962314, "learning_rate": 1.6613244599573685e-06, "loss": 0.0024, "step": 6015 }, { "epoch": 8.196185286103542, "grad_norm": 2.58696714354473, "learning_rate": 1.6588894693682977e-06, "loss": 0.0017, "step": 6016 }, { "epoch": 8.197547683923705, "grad_norm": 0.29096449849470246, "learning_rate": 1.6564561031517278e-06, "loss": 0.0093, "step": 6017 }, { "epoch": 8.19891008174387, "grad_norm": 1.2619619502796975, "learning_rate": 1.6540243617815399e-06, "loss": 0.0076, "step": 6018 }, { "epoch": 8.200272479564033, "grad_norm": 0.4266382978596022, "learning_rate": 1.6515942457313005e-06, "loss": 0.0007, "step": 6019 }, { "epoch": 8.201634877384196, "grad_norm": 0.2746457224078363, "learning_rate": 1.6491657554742556e-06, "loss": 0.0003, "step": 6020 }, { "epoch": 8.20299727520436, "grad_norm": 0.8379728016564578, "learning_rate": 1.6467388914833337e-06, "loss": 0.0153, "step": 6021 }, { "epoch": 8.204359673024523, "grad_norm": 0.4194855618424143, "learning_rate": 1.6443136542311545e-06, "loss": 0.0007, "step": 6022 }, { "epoch": 8.205722070844686, "grad_norm": 0.0337468275651359, "learning_rate": 1.6418900441900088e-06, "loss": 0.0003, "step": 6023 }, { "epoch": 8.207084468664851, "grad_norm": 0.15787296571512097, "learning_rate": 1.6394680618318859e-06, "loss": 0.0075, "step": 6024 }, { "epoch": 8.208446866485014, "grad_norm": 1.033675323559178, "learning_rate": 1.6370477076284475e-06, "loss": 0.0201, "step": 6025 }, { "epoch": 8.209809264305177, "grad_norm": 0.9520988103398326, "learning_rate": 1.6346289820510364e-06, "loss": 0.0145, "step": 6026 }, { "epoch": 8.211171662125341, "grad_norm": 0.3460752197512635, "learning_rate": 1.632211885570687e-06, "loss": 0.0071, "step": 6027 }, { "epoch": 8.212534059945504, "grad_norm": 0.3727724389049566, "learning_rate": 1.629796418658105e-06, "loss": 0.0003, "step": 6028 }, { "epoch": 8.213896457765667, "grad_norm": 0.6763249124748925, "learning_rate": 1.6273825817836964e-06, "loss": 0.0021, "step": 6029 }, { "epoch": 8.215258855585832, "grad_norm": 0.2029993570942265, "learning_rate": 1.6249703754175295e-06, "loss": 0.0005, "step": 6030 }, { "epoch": 8.216621253405995, "grad_norm": 0.17962716751842825, "learning_rate": 1.6225598000293707e-06, "loss": 0.0004, "step": 6031 }, { "epoch": 8.217983651226158, "grad_norm": 0.22797379037556667, "learning_rate": 1.6201508560886602e-06, "loss": 0.0011, "step": 6032 }, { "epoch": 8.219346049046322, "grad_norm": 0.6081817853235483, "learning_rate": 1.6177435440645195e-06, "loss": 0.0026, "step": 6033 }, { "epoch": 8.220708446866485, "grad_norm": 0.4039438102387838, "learning_rate": 1.615337864425759e-06, "loss": 0.0004, "step": 6034 }, { "epoch": 8.222070844686648, "grad_norm": 0.24750267300396978, "learning_rate": 1.612933817640868e-06, "loss": 0.0006, "step": 6035 }, { "epoch": 8.223433242506813, "grad_norm": 0.3598968897384686, "learning_rate": 1.6105314041780195e-06, "loss": 0.0079, "step": 6036 }, { "epoch": 8.224795640326976, "grad_norm": 1.0446283336151494, "learning_rate": 1.608130624505061e-06, "loss": 0.0125, "step": 6037 }, { "epoch": 8.226158038147139, "grad_norm": 0.7283256930658788, "learning_rate": 1.605731479089534e-06, "loss": 0.0217, "step": 6038 }, { "epoch": 8.227520435967303, "grad_norm": 1.6538319998110615, "learning_rate": 1.6033339683986515e-06, "loss": 0.0045, "step": 6039 }, { "epoch": 8.228882833787466, "grad_norm": 2.007538622030839, "learning_rate": 1.600938092899308e-06, "loss": 0.0118, "step": 6040 }, { "epoch": 8.230245231607629, "grad_norm": 0.49721149420355143, "learning_rate": 1.598543853058091e-06, "loss": 0.0004, "step": 6041 }, { "epoch": 8.231607629427794, "grad_norm": 0.13678706765072088, "learning_rate": 1.5961512493412568e-06, "loss": 0.0004, "step": 6042 }, { "epoch": 8.232970027247957, "grad_norm": 0.4077234770082347, "learning_rate": 1.5937602822147525e-06, "loss": 0.0006, "step": 6043 }, { "epoch": 8.23433242506812, "grad_norm": 0.7778246464003773, "learning_rate": 1.5913709521441989e-06, "loss": 0.0083, "step": 6044 }, { "epoch": 8.235694822888284, "grad_norm": 1.9358085145772548, "learning_rate": 1.5889832595948995e-06, "loss": 0.0268, "step": 6045 }, { "epoch": 8.237057220708447, "grad_norm": 0.5779344243560941, "learning_rate": 1.5865972050318424e-06, "loss": 0.0004, "step": 6046 }, { "epoch": 8.23841961852861, "grad_norm": 0.4209139485094982, "learning_rate": 1.5842127889196957e-06, "loss": 0.0049, "step": 6047 }, { "epoch": 8.239782016348773, "grad_norm": 0.09989428702570548, "learning_rate": 1.5818300117228092e-06, "loss": 0.0004, "step": 6048 }, { "epoch": 8.241144414168938, "grad_norm": 0.07676496071849268, "learning_rate": 1.5794488739052072e-06, "loss": 0.0004, "step": 6049 }, { "epoch": 8.2425068119891, "grad_norm": 0.32224833296747835, "learning_rate": 1.5770693759306055e-06, "loss": 0.0004, "step": 6050 }, { "epoch": 8.243869209809265, "grad_norm": 1.6259501736970987, "learning_rate": 1.5746915182623901e-06, "loss": 0.015, "step": 6051 }, { "epoch": 8.245231607629428, "grad_norm": 1.5958038403200165, "learning_rate": 1.5723153013636294e-06, "loss": 0.0037, "step": 6052 }, { "epoch": 8.246594005449591, "grad_norm": 0.1387265158972999, "learning_rate": 1.5699407256970834e-06, "loss": 0.0021, "step": 6053 }, { "epoch": 8.247956403269754, "grad_norm": 0.40412137039523494, "learning_rate": 1.5675677917251764e-06, "loss": 0.0008, "step": 6054 }, { "epoch": 8.249318801089919, "grad_norm": 0.1799507049414183, "learning_rate": 1.5651964999100265e-06, "loss": 0.0004, "step": 6055 }, { "epoch": 8.250681198910081, "grad_norm": 0.387022326240324, "learning_rate": 1.5628268507134225e-06, "loss": 0.0006, "step": 6056 }, { "epoch": 8.252043596730246, "grad_norm": 0.5323603917269473, "learning_rate": 1.5604588445968337e-06, "loss": 0.0007, "step": 6057 }, { "epoch": 8.253405994550409, "grad_norm": 0.23688697058655184, "learning_rate": 1.5580924820214204e-06, "loss": 0.0075, "step": 6058 }, { "epoch": 8.254768392370572, "grad_norm": 0.5840655632784114, "learning_rate": 1.5557277634480084e-06, "loss": 0.0006, "step": 6059 }, { "epoch": 8.256130790190735, "grad_norm": 0.8002638167255911, "learning_rate": 1.5533646893371157e-06, "loss": 0.0026, "step": 6060 }, { "epoch": 8.2574931880109, "grad_norm": 4.268602039577602, "learning_rate": 1.5510032601489277e-06, "loss": 0.0376, "step": 6061 }, { "epoch": 8.258855585831062, "grad_norm": 0.5607172314638726, "learning_rate": 1.5486434763433222e-06, "loss": 0.0032, "step": 6062 }, { "epoch": 8.260217983651227, "grad_norm": 2.422046407176436, "learning_rate": 1.5462853383798459e-06, "loss": 0.0048, "step": 6063 }, { "epoch": 8.26158038147139, "grad_norm": 0.4582463995808889, "learning_rate": 1.5439288467177317e-06, "loss": 0.0023, "step": 6064 }, { "epoch": 8.262942779291553, "grad_norm": 0.2122680701566296, "learning_rate": 1.541574001815892e-06, "loss": 0.0003, "step": 6065 }, { "epoch": 8.264305177111716, "grad_norm": 0.8671675446792739, "learning_rate": 1.5392208041329104e-06, "loss": 0.0018, "step": 6066 }, { "epoch": 8.26566757493188, "grad_norm": 0.10924267192558618, "learning_rate": 1.5368692541270625e-06, "loss": 0.0004, "step": 6067 }, { "epoch": 8.267029972752043, "grad_norm": 0.17727939962144884, "learning_rate": 1.5345193522562918e-06, "loss": 0.0004, "step": 6068 }, { "epoch": 8.268392370572208, "grad_norm": 0.3431912105730702, "learning_rate": 1.5321710989782213e-06, "loss": 0.0007, "step": 6069 }, { "epoch": 8.269754768392371, "grad_norm": 0.21955834598290758, "learning_rate": 1.5298244947501661e-06, "loss": 0.0005, "step": 6070 }, { "epoch": 8.271117166212534, "grad_norm": 0.3616660948665255, "learning_rate": 1.5274795400291044e-06, "loss": 0.0019, "step": 6071 }, { "epoch": 8.272479564032697, "grad_norm": 0.12002653561272647, "learning_rate": 1.5251362352717026e-06, "loss": 0.0004, "step": 6072 }, { "epoch": 8.273841961852861, "grad_norm": 0.43455575250166184, "learning_rate": 1.5227945809342992e-06, "loss": 0.0073, "step": 6073 }, { "epoch": 8.275204359673024, "grad_norm": 1.9295795133523848, "learning_rate": 1.5204545774729207e-06, "loss": 0.0055, "step": 6074 }, { "epoch": 8.276566757493187, "grad_norm": 0.18669443105499808, "learning_rate": 1.518116225343259e-06, "loss": 0.002, "step": 6075 }, { "epoch": 8.277929155313352, "grad_norm": 0.5539274997866289, "learning_rate": 1.5157795250006968e-06, "loss": 0.0143, "step": 6076 }, { "epoch": 8.279291553133515, "grad_norm": 0.6612858528216121, "learning_rate": 1.513444476900291e-06, "loss": 0.0005, "step": 6077 }, { "epoch": 8.280653950953678, "grad_norm": 0.7065139500208312, "learning_rate": 1.51111108149677e-06, "loss": 0.0015, "step": 6078 }, { "epoch": 8.282016348773842, "grad_norm": 0.1399888286474569, "learning_rate": 1.5087793392445526e-06, "loss": 0.0016, "step": 6079 }, { "epoch": 8.283378746594005, "grad_norm": 0.14095022098745474, "learning_rate": 1.5064492505977234e-06, "loss": 0.0003, "step": 6080 }, { "epoch": 8.284741144414168, "grad_norm": 0.3928006387881287, "learning_rate": 1.504120816010054e-06, "loss": 0.0008, "step": 6081 }, { "epoch": 8.286103542234333, "grad_norm": 0.6066569070278505, "learning_rate": 1.5017940359349925e-06, "loss": 0.0033, "step": 6082 }, { "epoch": 8.287465940054496, "grad_norm": 0.2913364859564993, "learning_rate": 1.4994689108256577e-06, "loss": 0.0004, "step": 6083 }, { "epoch": 8.288828337874659, "grad_norm": 1.5236709684746985, "learning_rate": 1.4971454411348574e-06, "loss": 0.0016, "step": 6084 }, { "epoch": 8.290190735694823, "grad_norm": 0.14128772714936685, "learning_rate": 1.494823627315065e-06, "loss": 0.0003, "step": 6085 }, { "epoch": 8.291553133514986, "grad_norm": 1.3268105980743903, "learning_rate": 1.4925034698184393e-06, "loss": 0.0043, "step": 6086 }, { "epoch": 8.292915531335149, "grad_norm": 0.32672441739779057, "learning_rate": 1.4901849690968183e-06, "loss": 0.0003, "step": 6087 }, { "epoch": 8.294277929155314, "grad_norm": 2.603269469920641, "learning_rate": 1.4878681256017092e-06, "loss": 0.0399, "step": 6088 }, { "epoch": 8.295640326975477, "grad_norm": 0.3535583761649987, "learning_rate": 1.4855529397843039e-06, "loss": 0.001, "step": 6089 }, { "epoch": 8.29700272479564, "grad_norm": 0.824412557109143, "learning_rate": 1.4832394120954651e-06, "loss": 0.0014, "step": 6090 }, { "epoch": 8.298365122615804, "grad_norm": 0.7614456741815576, "learning_rate": 1.4809275429857406e-06, "loss": 0.0005, "step": 6091 }, { "epoch": 8.299727520435967, "grad_norm": 1.3442375795746038, "learning_rate": 1.4786173329053466e-06, "loss": 0.0166, "step": 6092 }, { "epoch": 8.30108991825613, "grad_norm": 0.3013226874966819, "learning_rate": 1.47630878230418e-06, "loss": 0.0007, "step": 6093 }, { "epoch": 8.302452316076295, "grad_norm": 0.17827554297806703, "learning_rate": 1.4740018916318198e-06, "loss": 0.0004, "step": 6094 }, { "epoch": 8.303814713896458, "grad_norm": 0.3542118874922867, "learning_rate": 1.4716966613375116e-06, "loss": 0.0006, "step": 6095 }, { "epoch": 8.30517711171662, "grad_norm": 0.5628098509737872, "learning_rate": 1.469393091870186e-06, "loss": 0.0066, "step": 6096 }, { "epoch": 8.306539509536785, "grad_norm": 0.8998742862053618, "learning_rate": 1.467091183678444e-06, "loss": 0.0005, "step": 6097 }, { "epoch": 8.307901907356948, "grad_norm": 0.47235344555401865, "learning_rate": 1.4647909372105673e-06, "loss": 0.0006, "step": 6098 }, { "epoch": 8.309264305177111, "grad_norm": 0.6018256575304711, "learning_rate": 1.4624923529145141e-06, "loss": 0.0059, "step": 6099 }, { "epoch": 8.310626702997276, "grad_norm": 0.9471289990625121, "learning_rate": 1.460195431237915e-06, "loss": 0.0013, "step": 6100 }, { "epoch": 8.311989100817438, "grad_norm": 0.40188761483685703, "learning_rate": 1.4579001726280828e-06, "loss": 0.0004, "step": 6101 }, { "epoch": 8.313351498637601, "grad_norm": 2.488472885178994, "learning_rate": 1.4556065775319982e-06, "loss": 0.024, "step": 6102 }, { "epoch": 8.314713896457766, "grad_norm": 0.1961124763938814, "learning_rate": 1.4533146463963277e-06, "loss": 0.0077, "step": 6103 }, { "epoch": 8.316076294277929, "grad_norm": 2.5863595485216853, "learning_rate": 1.451024379667404e-06, "loss": 0.008, "step": 6104 }, { "epoch": 8.317438692098092, "grad_norm": 0.7650184479080604, "learning_rate": 1.4487357777912426e-06, "loss": 0.0011, "step": 6105 }, { "epoch": 8.318801089918257, "grad_norm": 1.1812278914622187, "learning_rate": 1.4464488412135347e-06, "loss": 0.0056, "step": 6106 }, { "epoch": 8.32016348773842, "grad_norm": 1.2966950496811054, "learning_rate": 1.4441635703796409e-06, "loss": 0.0035, "step": 6107 }, { "epoch": 8.321525885558582, "grad_norm": 0.1355518514408584, "learning_rate": 1.441879965734606e-06, "loss": 0.0005, "step": 6108 }, { "epoch": 8.322888283378747, "grad_norm": 0.998222688716856, "learning_rate": 1.4395980277231424e-06, "loss": 0.0077, "step": 6109 }, { "epoch": 8.32425068119891, "grad_norm": 0.9527420749486566, "learning_rate": 1.4373177567896412e-06, "loss": 0.0052, "step": 6110 }, { "epoch": 8.325613079019073, "grad_norm": 0.18998908036787573, "learning_rate": 1.4350391533781738e-06, "loss": 0.0003, "step": 6111 }, { "epoch": 8.326975476839237, "grad_norm": 0.8569300123928933, "learning_rate": 1.4327622179324763e-06, "loss": 0.0013, "step": 6112 }, { "epoch": 8.3283378746594, "grad_norm": 0.5562883209359277, "learning_rate": 1.4304869508959707e-06, "loss": 0.0004, "step": 6113 }, { "epoch": 8.329700272479563, "grad_norm": 0.7780070096588056, "learning_rate": 1.4282133527117447e-06, "loss": 0.0004, "step": 6114 }, { "epoch": 8.331062670299728, "grad_norm": 0.45596396025393293, "learning_rate": 1.4259414238225677e-06, "loss": 0.0004, "step": 6115 }, { "epoch": 8.33242506811989, "grad_norm": 0.34281474225188097, "learning_rate": 1.4236711646708844e-06, "loss": 0.0003, "step": 6116 }, { "epoch": 8.333787465940054, "grad_norm": 0.9256107418408607, "learning_rate": 1.421402575698807e-06, "loss": 0.0073, "step": 6117 }, { "epoch": 8.335149863760218, "grad_norm": 0.7993816683617435, "learning_rate": 1.4191356573481318e-06, "loss": 0.0231, "step": 6118 }, { "epoch": 8.336512261580381, "grad_norm": 0.4328322277354571, "learning_rate": 1.4168704100603214e-06, "loss": 0.0004, "step": 6119 }, { "epoch": 8.337874659400544, "grad_norm": 1.1556561261175835, "learning_rate": 1.4146068342765196e-06, "loss": 0.0079, "step": 6120 }, { "epoch": 8.339237057220709, "grad_norm": 0.3927713519756877, "learning_rate": 1.4123449304375393e-06, "loss": 0.0005, "step": 6121 }, { "epoch": 8.340599455040872, "grad_norm": 0.14508481462200673, "learning_rate": 1.41008469898387e-06, "loss": 0.0008, "step": 6122 }, { "epoch": 8.341961852861035, "grad_norm": 0.4695287244509662, "learning_rate": 1.4078261403556804e-06, "loss": 0.0004, "step": 6123 }, { "epoch": 8.3433242506812, "grad_norm": 0.6586169489656795, "learning_rate": 1.4055692549928046e-06, "loss": 0.0004, "step": 6124 }, { "epoch": 8.344686648501362, "grad_norm": 0.8283805706081283, "learning_rate": 1.403314043334757e-06, "loss": 0.0022, "step": 6125 }, { "epoch": 8.346049046321525, "grad_norm": 2.4780250162063324, "learning_rate": 1.4010605058207226e-06, "loss": 0.003, "step": 6126 }, { "epoch": 8.34741144414169, "grad_norm": 0.1997621822424246, "learning_rate": 1.398808642889562e-06, "loss": 0.0003, "step": 6127 }, { "epoch": 8.348773841961853, "grad_norm": 1.042762277277558, "learning_rate": 1.396558454979814e-06, "loss": 0.0042, "step": 6128 }, { "epoch": 8.350136239782016, "grad_norm": 1.9094526506008394, "learning_rate": 1.3943099425296802e-06, "loss": 0.0095, "step": 6129 }, { "epoch": 8.35149863760218, "grad_norm": 0.805802048165429, "learning_rate": 1.3920631059770483e-06, "loss": 0.0073, "step": 6130 }, { "epoch": 8.352861035422343, "grad_norm": 1.046827616149841, "learning_rate": 1.3898179457594686e-06, "loss": 0.0032, "step": 6131 }, { "epoch": 8.354223433242506, "grad_norm": 0.05017733977326381, "learning_rate": 1.3875744623141763e-06, "loss": 0.0003, "step": 6132 }, { "epoch": 8.35558583106267, "grad_norm": 1.8717002399901361, "learning_rate": 1.3853326560780678e-06, "loss": 0.0041, "step": 6133 }, { "epoch": 8.356948228882834, "grad_norm": 0.20971933151243918, "learning_rate": 1.3830925274877216e-06, "loss": 0.0004, "step": 6134 }, { "epoch": 8.358310626702997, "grad_norm": 0.3295169164028419, "learning_rate": 1.38085407697939e-06, "loss": 0.0011, "step": 6135 }, { "epoch": 8.359673024523161, "grad_norm": 0.14569130959805016, "learning_rate": 1.3786173049889907e-06, "loss": 0.0004, "step": 6136 }, { "epoch": 8.361035422343324, "grad_norm": 2.1166044112834648, "learning_rate": 1.376382211952123e-06, "loss": 0.0405, "step": 6137 }, { "epoch": 8.362397820163487, "grad_norm": 0.24057232830090497, "learning_rate": 1.3741487983040513e-06, "loss": 0.0101, "step": 6138 }, { "epoch": 8.363760217983652, "grad_norm": 0.5771551238748369, "learning_rate": 1.371917064479721e-06, "loss": 0.015, "step": 6139 }, { "epoch": 8.365122615803815, "grad_norm": 0.41726550037596033, "learning_rate": 1.3696870109137462e-06, "loss": 0.0143, "step": 6140 }, { "epoch": 8.366485013623977, "grad_norm": 2.1720510198283685, "learning_rate": 1.367458638040412e-06, "loss": 0.0184, "step": 6141 }, { "epoch": 8.367847411444142, "grad_norm": 0.38429060544472504, "learning_rate": 1.3652319462936815e-06, "loss": 0.01, "step": 6142 }, { "epoch": 8.369209809264305, "grad_norm": 0.5830077538027746, "learning_rate": 1.363006936107183e-06, "loss": 0.0003, "step": 6143 }, { "epoch": 8.370572207084468, "grad_norm": 0.7742080385326853, "learning_rate": 1.3607836079142233e-06, "loss": 0.0039, "step": 6144 }, { "epoch": 8.371934604904633, "grad_norm": 0.42537102476405203, "learning_rate": 1.358561962147783e-06, "loss": 0.0005, "step": 6145 }, { "epoch": 8.373297002724795, "grad_norm": 0.5170994555152767, "learning_rate": 1.356341999240507e-06, "loss": 0.0004, "step": 6146 }, { "epoch": 8.374659400544958, "grad_norm": 1.3035076910421786, "learning_rate": 1.3541237196247226e-06, "loss": 0.0086, "step": 6147 }, { "epoch": 8.376021798365123, "grad_norm": 0.20322415962198104, "learning_rate": 1.3519071237324188e-06, "loss": 0.0004, "step": 6148 }, { "epoch": 8.377384196185286, "grad_norm": 0.41944202612038234, "learning_rate": 1.349692211995266e-06, "loss": 0.0004, "step": 6149 }, { "epoch": 8.378746594005449, "grad_norm": 0.5182705140027238, "learning_rate": 1.3474789848446e-06, "loss": 0.001, "step": 6150 }, { "epoch": 8.380108991825614, "grad_norm": 0.5419415522161541, "learning_rate": 1.345267442711432e-06, "loss": 0.0046, "step": 6151 }, { "epoch": 8.381471389645776, "grad_norm": 1.5310784435676437, "learning_rate": 1.343057586026446e-06, "loss": 0.0058, "step": 6152 }, { "epoch": 8.38283378746594, "grad_norm": 0.3544954961819237, "learning_rate": 1.3408494152199937e-06, "loss": 0.0005, "step": 6153 }, { "epoch": 8.384196185286104, "grad_norm": 0.6874559594775049, "learning_rate": 1.3386429307221026e-06, "loss": 0.0024, "step": 6154 }, { "epoch": 8.385558583106267, "grad_norm": 0.10843422768091067, "learning_rate": 1.3364381329624688e-06, "loss": 0.0006, "step": 6155 }, { "epoch": 8.38692098092643, "grad_norm": 0.4499873658984408, "learning_rate": 1.3342350223704603e-06, "loss": 0.0004, "step": 6156 }, { "epoch": 8.388283378746594, "grad_norm": 0.43808141268445777, "learning_rate": 1.332033599375121e-06, "loss": 0.0003, "step": 6157 }, { "epoch": 8.389645776566757, "grad_norm": 0.8565002219734296, "learning_rate": 1.3298338644051579e-06, "loss": 0.0055, "step": 6158 }, { "epoch": 8.39100817438692, "grad_norm": 0.3224451811319387, "learning_rate": 1.3276358178889581e-06, "loss": 0.0006, "step": 6159 }, { "epoch": 8.392370572207085, "grad_norm": 0.9356002107887704, "learning_rate": 1.3254394602545718e-06, "loss": 0.0071, "step": 6160 }, { "epoch": 8.393732970027248, "grad_norm": 0.32800774832563534, "learning_rate": 1.3232447919297276e-06, "loss": 0.001, "step": 6161 }, { "epoch": 8.39509536784741, "grad_norm": 0.45076320151165133, "learning_rate": 1.3210518133418182e-06, "loss": 0.0009, "step": 6162 }, { "epoch": 8.396457765667575, "grad_norm": 0.8041642228967909, "learning_rate": 1.318860524917912e-06, "loss": 0.0015, "step": 6163 }, { "epoch": 8.397820163487738, "grad_norm": 0.4409520458984406, "learning_rate": 1.316670927084751e-06, "loss": 0.0097, "step": 6164 }, { "epoch": 8.399182561307901, "grad_norm": 0.43529904496608507, "learning_rate": 1.3144830202687376e-06, "loss": 0.0121, "step": 6165 }, { "epoch": 8.400544959128066, "grad_norm": 0.5391913331102506, "learning_rate": 1.312296804895956e-06, "loss": 0.0007, "step": 6166 }, { "epoch": 8.401907356948229, "grad_norm": 2.151479528534197, "learning_rate": 1.310112281392153e-06, "loss": 0.0047, "step": 6167 }, { "epoch": 8.403269754768392, "grad_norm": 0.7249056942345696, "learning_rate": 1.30792945018275e-06, "loss": 0.0081, "step": 6168 }, { "epoch": 8.404632152588556, "grad_norm": 1.095284258408931, "learning_rate": 1.3057483116928415e-06, "loss": 0.002, "step": 6169 }, { "epoch": 8.40599455040872, "grad_norm": 0.44974271682786093, "learning_rate": 1.3035688663471836e-06, "loss": 0.0019, "step": 6170 }, { "epoch": 8.407356948228882, "grad_norm": 0.5702560614366103, "learning_rate": 1.3013911145702119e-06, "loss": 0.0072, "step": 6171 }, { "epoch": 8.408719346049047, "grad_norm": 0.5562473514821328, "learning_rate": 1.299215056786024e-06, "loss": 0.0005, "step": 6172 }, { "epoch": 8.41008174386921, "grad_norm": 2.046905848375195, "learning_rate": 1.2970406934183955e-06, "loss": 0.0045, "step": 6173 }, { "epoch": 8.411444141689373, "grad_norm": 0.24067465396979587, "learning_rate": 1.2948680248907686e-06, "loss": 0.0031, "step": 6174 }, { "epoch": 8.412806539509537, "grad_norm": 0.4031340855566453, "learning_rate": 1.2926970516262516e-06, "loss": 0.0011, "step": 6175 }, { "epoch": 8.4141689373297, "grad_norm": 0.46136152516891293, "learning_rate": 1.2905277740476318e-06, "loss": 0.0005, "step": 6176 }, { "epoch": 8.415531335149863, "grad_norm": 0.3801791437561589, "learning_rate": 1.2883601925773536e-06, "loss": 0.0005, "step": 6177 }, { "epoch": 8.416893732970028, "grad_norm": 1.234554142599227, "learning_rate": 1.286194307637545e-06, "loss": 0.0068, "step": 6178 }, { "epoch": 8.41825613079019, "grad_norm": 0.3631847252361131, "learning_rate": 1.2840301196499894e-06, "loss": 0.0007, "step": 6179 }, { "epoch": 8.419618528610354, "grad_norm": 0.8914292005952189, "learning_rate": 1.2818676290361564e-06, "loss": 0.0231, "step": 6180 }, { "epoch": 8.420980926430518, "grad_norm": 0.5694475483670259, "learning_rate": 1.2797068362171706e-06, "loss": 0.0004, "step": 6181 }, { "epoch": 8.422343324250681, "grad_norm": 0.918096695873557, "learning_rate": 1.2775477416138292e-06, "loss": 0.0004, "step": 6182 }, { "epoch": 8.423705722070844, "grad_norm": 1.4361708172164451, "learning_rate": 1.2753903456466054e-06, "loss": 0.0179, "step": 6183 }, { "epoch": 8.425068119891009, "grad_norm": 1.6599168859430518, "learning_rate": 1.273234648735633e-06, "loss": 0.0165, "step": 6184 }, { "epoch": 8.426430517711172, "grad_norm": 0.3803585721000302, "learning_rate": 1.271080651300719e-06, "loss": 0.0071, "step": 6185 }, { "epoch": 8.427792915531334, "grad_norm": 0.29228082077134915, "learning_rate": 1.268928353761344e-06, "loss": 0.0005, "step": 6186 }, { "epoch": 8.4291553133515, "grad_norm": 3.362626235546874, "learning_rate": 1.2667777565366456e-06, "loss": 0.0061, "step": 6187 }, { "epoch": 8.430517711171662, "grad_norm": 1.1692657796451622, "learning_rate": 1.2646288600454448e-06, "loss": 0.0073, "step": 6188 }, { "epoch": 8.431880108991825, "grad_norm": 0.6886773493873679, "learning_rate": 1.2624816647062166e-06, "loss": 0.0025, "step": 6189 }, { "epoch": 8.43324250681199, "grad_norm": 0.745624701837511, "learning_rate": 1.2603361709371197e-06, "loss": 0.0043, "step": 6190 }, { "epoch": 8.434604904632153, "grad_norm": 0.5600044381258222, "learning_rate": 1.2581923791559647e-06, "loss": 0.0004, "step": 6191 }, { "epoch": 8.435967302452315, "grad_norm": 0.6502518219570835, "learning_rate": 1.2560502897802507e-06, "loss": 0.0005, "step": 6192 }, { "epoch": 8.43732970027248, "grad_norm": 0.8681087234645408, "learning_rate": 1.2539099032271285e-06, "loss": 0.0013, "step": 6193 }, { "epoch": 8.438692098092643, "grad_norm": 0.2377723755254919, "learning_rate": 1.2517712199134225e-06, "loss": 0.008, "step": 6194 }, { "epoch": 8.440054495912806, "grad_norm": 0.1933766184414849, "learning_rate": 1.2496342402556284e-06, "loss": 0.0005, "step": 6195 }, { "epoch": 8.44141689373297, "grad_norm": 1.670947580504078, "learning_rate": 1.2474989646699043e-06, "loss": 0.01, "step": 6196 }, { "epoch": 8.442779291553133, "grad_norm": 0.6037922887074156, "learning_rate": 1.2453653935720866e-06, "loss": 0.0014, "step": 6197 }, { "epoch": 8.444141689373296, "grad_norm": 1.8614788493706411, "learning_rate": 1.2432335273776708e-06, "loss": 0.0124, "step": 6198 }, { "epoch": 8.445504087193461, "grad_norm": 0.7353431628419784, "learning_rate": 1.2411033665018179e-06, "loss": 0.0006, "step": 6199 }, { "epoch": 8.446866485013624, "grad_norm": 0.6655011001719532, "learning_rate": 1.2389749113593686e-06, "loss": 0.0004, "step": 6200 }, { "epoch": 8.448228882833787, "grad_norm": 0.923147797075931, "learning_rate": 1.2368481623648187e-06, "loss": 0.0091, "step": 6201 }, { "epoch": 8.449591280653951, "grad_norm": 1.1306735576632867, "learning_rate": 1.2347231199323418e-06, "loss": 0.0075, "step": 6202 }, { "epoch": 8.450953678474114, "grad_norm": 0.603383198027715, "learning_rate": 1.232599784475772e-06, "loss": 0.0004, "step": 6203 }, { "epoch": 8.452316076294277, "grad_norm": 1.252508764300714, "learning_rate": 1.2304781564086177e-06, "loss": 0.001, "step": 6204 }, { "epoch": 8.453678474114442, "grad_norm": 1.15548338736037, "learning_rate": 1.2283582361440495e-06, "loss": 0.0006, "step": 6205 }, { "epoch": 8.455040871934605, "grad_norm": 1.9322773995589897, "learning_rate": 1.2262400240949023e-06, "loss": 0.0125, "step": 6206 }, { "epoch": 8.456403269754768, "grad_norm": 1.6689899697516806, "learning_rate": 1.22412352067369e-06, "loss": 0.0113, "step": 6207 }, { "epoch": 8.457765667574932, "grad_norm": 0.5062235797247399, "learning_rate": 1.2220087262925796e-06, "loss": 0.0077, "step": 6208 }, { "epoch": 8.459128065395095, "grad_norm": 0.5180586878807777, "learning_rate": 1.21989564136342e-06, "loss": 0.0003, "step": 6209 }, { "epoch": 8.460490463215258, "grad_norm": 1.6881868057787113, "learning_rate": 1.2177842662977136e-06, "loss": 0.0111, "step": 6210 }, { "epoch": 8.461852861035423, "grad_norm": 1.3009865995398537, "learning_rate": 1.2156746015066413e-06, "loss": 0.003, "step": 6211 }, { "epoch": 8.463215258855586, "grad_norm": 0.6835841908083308, "learning_rate": 1.213566647401041e-06, "loss": 0.0087, "step": 6212 }, { "epoch": 8.464577656675749, "grad_norm": 0.362053489062875, "learning_rate": 1.2114604043914225e-06, "loss": 0.0003, "step": 6213 }, { "epoch": 8.465940054495913, "grad_norm": 0.5838722153031564, "learning_rate": 1.2093558728879618e-06, "loss": 0.0003, "step": 6214 }, { "epoch": 8.467302452316076, "grad_norm": 0.6042243560202228, "learning_rate": 1.2072530533005012e-06, "loss": 0.0073, "step": 6215 }, { "epoch": 8.46866485013624, "grad_norm": 0.9678812492154656, "learning_rate": 1.205151946038554e-06, "loss": 0.0004, "step": 6216 }, { "epoch": 8.470027247956404, "grad_norm": 0.7180423677165126, "learning_rate": 1.2030525515112933e-06, "loss": 0.0011, "step": 6217 }, { "epoch": 8.471389645776567, "grad_norm": 0.24301817055932826, "learning_rate": 1.2009548701275598e-06, "loss": 0.0084, "step": 6218 }, { "epoch": 8.47275204359673, "grad_norm": 0.9096243707217792, "learning_rate": 1.198858902295864e-06, "loss": 0.0175, "step": 6219 }, { "epoch": 8.474114441416894, "grad_norm": 0.7375932197153026, "learning_rate": 1.1967646484243777e-06, "loss": 0.0031, "step": 6220 }, { "epoch": 8.475476839237057, "grad_norm": 1.2062027555212456, "learning_rate": 1.1946721089209478e-06, "loss": 0.0074, "step": 6221 }, { "epoch": 8.47683923705722, "grad_norm": 0.6820669736569396, "learning_rate": 1.1925812841930772e-06, "loss": 0.0009, "step": 6222 }, { "epoch": 8.478201634877385, "grad_norm": 1.0238531567208125, "learning_rate": 1.1904921746479426e-06, "loss": 0.0035, "step": 6223 }, { "epoch": 8.479564032697548, "grad_norm": 0.08431027256846614, "learning_rate": 1.1884047806923816e-06, "loss": 0.0004, "step": 6224 }, { "epoch": 8.48092643051771, "grad_norm": 1.117105936511954, "learning_rate": 1.1863191027328957e-06, "loss": 0.0024, "step": 6225 }, { "epoch": 8.482288828337875, "grad_norm": 0.47678880224379677, "learning_rate": 1.1842351411756625e-06, "loss": 0.01, "step": 6226 }, { "epoch": 8.483651226158038, "grad_norm": 0.3089247388081137, "learning_rate": 1.182152896426515e-06, "loss": 0.0077, "step": 6227 }, { "epoch": 8.485013623978201, "grad_norm": 0.7280696722070661, "learning_rate": 1.1800723688909589e-06, "loss": 0.0004, "step": 6228 }, { "epoch": 8.486376021798366, "grad_norm": 0.38859580019884987, "learning_rate": 1.1779935589741587e-06, "loss": 0.0004, "step": 6229 }, { "epoch": 8.487738419618529, "grad_norm": 0.7560684293479267, "learning_rate": 1.1759164670809486e-06, "loss": 0.0074, "step": 6230 }, { "epoch": 8.489100817438691, "grad_norm": 1.4517090111576554, "learning_rate": 1.1738410936158272e-06, "loss": 0.0051, "step": 6231 }, { "epoch": 8.490463215258856, "grad_norm": 0.2902686579541488, "learning_rate": 1.1717674389829603e-06, "loss": 0.0004, "step": 6232 }, { "epoch": 8.491825613079019, "grad_norm": 0.5864512550161688, "learning_rate": 1.169695503586179e-06, "loss": 0.0075, "step": 6233 }, { "epoch": 8.493188010899182, "grad_norm": 0.32085876828358906, "learning_rate": 1.167625287828975e-06, "loss": 0.0006, "step": 6234 }, { "epoch": 8.494550408719347, "grad_norm": 0.08095683988273673, "learning_rate": 1.1655567921145117e-06, "loss": 0.0004, "step": 6235 }, { "epoch": 8.49591280653951, "grad_norm": 0.5834796269736024, "learning_rate": 1.163490016845611e-06, "loss": 0.0068, "step": 6236 }, { "epoch": 8.497275204359672, "grad_norm": 0.6166282661205261, "learning_rate": 1.161424962424761e-06, "loss": 0.018, "step": 6237 }, { "epoch": 8.498637602179837, "grad_norm": 0.36219977208966414, "learning_rate": 1.159361629254122e-06, "loss": 0.0062, "step": 6238 }, { "epoch": 8.5, "grad_norm": 1.398082633358376, "learning_rate": 1.1573000177355086e-06, "loss": 0.0029, "step": 6239 }, { "epoch": 8.501362397820163, "grad_norm": 0.282114865899868, "learning_rate": 1.1552401282704107e-06, "loss": 0.0004, "step": 6240 }, { "epoch": 8.502724795640328, "grad_norm": 0.5307960298751013, "learning_rate": 1.1531819612599725e-06, "loss": 0.0064, "step": 6241 }, { "epoch": 8.50408719346049, "grad_norm": 0.9526186801873012, "learning_rate": 1.1511255171050084e-06, "loss": 0.0118, "step": 6242 }, { "epoch": 8.505449591280653, "grad_norm": 0.3191722109820663, "learning_rate": 1.1490707962059955e-06, "loss": 0.0085, "step": 6243 }, { "epoch": 8.506811989100818, "grad_norm": 0.17009101789390774, "learning_rate": 1.1470177989630781e-06, "loss": 0.0078, "step": 6244 }, { "epoch": 8.508174386920981, "grad_norm": 0.47837494676090164, "learning_rate": 1.1449665257760656e-06, "loss": 0.0145, "step": 6245 }, { "epoch": 8.509536784741144, "grad_norm": 0.46250183899065267, "learning_rate": 1.1429169770444226e-06, "loss": 0.0072, "step": 6246 }, { "epoch": 8.510899182561309, "grad_norm": 0.9363939472503289, "learning_rate": 1.14086915316729e-06, "loss": 0.002, "step": 6247 }, { "epoch": 8.512261580381471, "grad_norm": 1.0638031722824008, "learning_rate": 1.1388230545434652e-06, "loss": 0.0045, "step": 6248 }, { "epoch": 8.513623978201634, "grad_norm": 0.6249994065772339, "learning_rate": 1.1367786815714054e-06, "loss": 0.007, "step": 6249 }, { "epoch": 8.514986376021799, "grad_norm": 0.7456653039102836, "learning_rate": 1.1347360346492486e-06, "loss": 0.015, "step": 6250 }, { "epoch": 8.516348773841962, "grad_norm": 0.36411493756646807, "learning_rate": 1.132695114174779e-06, "loss": 0.0008, "step": 6251 }, { "epoch": 8.517711171662125, "grad_norm": 2.3276685566594444, "learning_rate": 1.1306559205454537e-06, "loss": 0.008, "step": 6252 }, { "epoch": 8.51907356948229, "grad_norm": 0.241702294847463, "learning_rate": 1.12861845415839e-06, "loss": 0.0163, "step": 6253 }, { "epoch": 8.520435967302452, "grad_norm": 0.16382007162542325, "learning_rate": 1.1265827154103703e-06, "loss": 0.0025, "step": 6254 }, { "epoch": 8.521798365122615, "grad_norm": 1.3250084400033102, "learning_rate": 1.1245487046978432e-06, "loss": 0.0054, "step": 6255 }, { "epoch": 8.52316076294278, "grad_norm": 0.4322525269456099, "learning_rate": 1.1225164224169138e-06, "loss": 0.0157, "step": 6256 }, { "epoch": 8.524523160762943, "grad_norm": 0.6317586114634143, "learning_rate": 1.120485868963358e-06, "loss": 0.0029, "step": 6257 }, { "epoch": 8.525885558583106, "grad_norm": 1.079423700014286, "learning_rate": 1.1184570447326081e-06, "loss": 0.0088, "step": 6258 }, { "epoch": 8.52724795640327, "grad_norm": 0.3580401770059214, "learning_rate": 1.1164299501197684e-06, "loss": 0.0009, "step": 6259 }, { "epoch": 8.528610354223433, "grad_norm": 0.46403863968766795, "learning_rate": 1.1144045855195972e-06, "loss": 0.0022, "step": 6260 }, { "epoch": 8.529972752043596, "grad_norm": 0.335139761842561, "learning_rate": 1.1123809513265204e-06, "loss": 0.0005, "step": 6261 }, { "epoch": 8.53133514986376, "grad_norm": 0.6196468947514561, "learning_rate": 1.1103590479346293e-06, "loss": 0.0005, "step": 6262 }, { "epoch": 8.532697547683924, "grad_norm": 0.39404321695732664, "learning_rate": 1.1083388757376712e-06, "loss": 0.0053, "step": 6263 }, { "epoch": 8.534059945504087, "grad_norm": 0.7045292918502364, "learning_rate": 1.1063204351290658e-06, "loss": 0.008, "step": 6264 }, { "epoch": 8.535422343324251, "grad_norm": 2.6193953670301533, "learning_rate": 1.104303726501884e-06, "loss": 0.0048, "step": 6265 }, { "epoch": 8.536784741144414, "grad_norm": 2.1312711127739967, "learning_rate": 1.1022887502488687e-06, "loss": 0.0117, "step": 6266 }, { "epoch": 8.538147138964577, "grad_norm": 0.3130996718169054, "learning_rate": 1.100275506762425e-06, "loss": 0.0004, "step": 6267 }, { "epoch": 8.539509536784742, "grad_norm": 0.389816910511454, "learning_rate": 1.0982639964346132e-06, "loss": 0.0069, "step": 6268 }, { "epoch": 8.540871934604905, "grad_norm": 0.5892240292979404, "learning_rate": 1.0962542196571636e-06, "loss": 0.0009, "step": 6269 }, { "epoch": 8.542234332425068, "grad_norm": 0.47752671660732915, "learning_rate": 1.094246176821464e-06, "loss": 0.0005, "step": 6270 }, { "epoch": 8.543596730245232, "grad_norm": 0.46259266454295583, "learning_rate": 1.0922398683185698e-06, "loss": 0.0006, "step": 6271 }, { "epoch": 8.544959128065395, "grad_norm": 0.36523447419696864, "learning_rate": 1.0902352945391903e-06, "loss": 0.0072, "step": 6272 }, { "epoch": 8.546321525885558, "grad_norm": 0.5156698083507567, "learning_rate": 1.0882324558737068e-06, "loss": 0.0092, "step": 6273 }, { "epoch": 8.547683923705723, "grad_norm": 0.20763790815478214, "learning_rate": 1.086231352712157e-06, "loss": 0.0008, "step": 6274 }, { "epoch": 8.549046321525886, "grad_norm": 0.23343177698225034, "learning_rate": 1.0842319854442396e-06, "loss": 0.0006, "step": 6275 }, { "epoch": 8.550408719346049, "grad_norm": 0.6210550497065758, "learning_rate": 1.0822343544593217e-06, "loss": 0.0085, "step": 6276 }, { "epoch": 8.551771117166213, "grad_norm": 1.2797637257418586, "learning_rate": 1.080238460146421e-06, "loss": 0.0043, "step": 6277 }, { "epoch": 8.553133514986376, "grad_norm": 1.1766148217463195, "learning_rate": 1.078244302894229e-06, "loss": 0.0038, "step": 6278 }, { "epoch": 8.554495912806539, "grad_norm": 0.4794934828068677, "learning_rate": 1.076251883091095e-06, "loss": 0.0005, "step": 6279 }, { "epoch": 8.555858310626704, "grad_norm": 0.20562206056820384, "learning_rate": 1.0742612011250242e-06, "loss": 0.0004, "step": 6280 }, { "epoch": 8.557220708446867, "grad_norm": 0.06828192007786726, "learning_rate": 1.0722722573836908e-06, "loss": 0.0003, "step": 6281 }, { "epoch": 8.55858310626703, "grad_norm": 0.4997832228782124, "learning_rate": 1.0702850522544262e-06, "loss": 0.0072, "step": 6282 }, { "epoch": 8.559945504087194, "grad_norm": 0.3200672234303323, "learning_rate": 1.068299586124224e-06, "loss": 0.0003, "step": 6283 }, { "epoch": 8.561307901907357, "grad_norm": 0.23852560427383443, "learning_rate": 1.066315859379743e-06, "loss": 0.0071, "step": 6284 }, { "epoch": 8.56267029972752, "grad_norm": 0.7996377983555223, "learning_rate": 1.064333872407296e-06, "loss": 0.0037, "step": 6285 }, { "epoch": 8.564032697547685, "grad_norm": 0.43149315957415946, "learning_rate": 1.0623536255928646e-06, "loss": 0.0007, "step": 6286 }, { "epoch": 8.565395095367847, "grad_norm": 0.0718761695792368, "learning_rate": 1.0603751193220846e-06, "loss": 0.0003, "step": 6287 }, { "epoch": 8.56675749318801, "grad_norm": 0.30763417824149575, "learning_rate": 1.05839835398026e-06, "loss": 0.0021, "step": 6288 }, { "epoch": 8.568119891008175, "grad_norm": 0.5198735345318701, "learning_rate": 1.0564233299523475e-06, "loss": 0.0084, "step": 6289 }, { "epoch": 8.569482288828338, "grad_norm": 0.962684499365378, "learning_rate": 1.0544500476229713e-06, "loss": 0.0092, "step": 6290 }, { "epoch": 8.5708446866485, "grad_norm": 0.11320815550757168, "learning_rate": 1.0524785073764155e-06, "loss": 0.0003, "step": 6291 }, { "epoch": 8.572207084468666, "grad_norm": 0.20978776702870108, "learning_rate": 1.0505087095966204e-06, "loss": 0.0069, "step": 6292 }, { "epoch": 8.573569482288828, "grad_norm": 0.4261376155879086, "learning_rate": 1.048540654667195e-06, "loss": 0.0102, "step": 6293 }, { "epoch": 8.574931880108991, "grad_norm": 0.6709243141289819, "learning_rate": 1.0465743429713994e-06, "loss": 0.0008, "step": 6294 }, { "epoch": 8.576294277929156, "grad_norm": 0.3569163445744285, "learning_rate": 1.0446097748921601e-06, "loss": 0.0004, "step": 6295 }, { "epoch": 8.577656675749319, "grad_norm": 0.10827843971145391, "learning_rate": 1.0426469508120662e-06, "loss": 0.0005, "step": 6296 }, { "epoch": 8.579019073569482, "grad_norm": 0.627362782520104, "learning_rate": 1.0406858711133594e-06, "loss": 0.0015, "step": 6297 }, { "epoch": 8.580381471389646, "grad_norm": 0.3483025755764607, "learning_rate": 1.0387265361779498e-06, "loss": 0.0005, "step": 6298 }, { "epoch": 8.58174386920981, "grad_norm": 1.5828148840380636, "learning_rate": 1.0367689463874009e-06, "loss": 0.003, "step": 6299 }, { "epoch": 8.583106267029972, "grad_norm": 0.21345352420496003, "learning_rate": 1.0348131021229423e-06, "loss": 0.0004, "step": 6300 }, { "epoch": 8.584468664850137, "grad_norm": 0.8306108101851256, "learning_rate": 1.0328590037654573e-06, "loss": 0.0155, "step": 6301 }, { "epoch": 8.5858310626703, "grad_norm": 0.3074718398278177, "learning_rate": 1.0309066516954958e-06, "loss": 0.0058, "step": 6302 }, { "epoch": 8.587193460490463, "grad_norm": 0.791799767888269, "learning_rate": 1.0289560462932657e-06, "loss": 0.0068, "step": 6303 }, { "epoch": 8.588555858310627, "grad_norm": 0.5023647006244751, "learning_rate": 1.02700718793863e-06, "loss": 0.0105, "step": 6304 }, { "epoch": 8.58991825613079, "grad_norm": 0.4146092974121763, "learning_rate": 1.0250600770111186e-06, "loss": 0.0083, "step": 6305 }, { "epoch": 8.591280653950953, "grad_norm": 0.2754561741197284, "learning_rate": 1.0231147138899145e-06, "loss": 0.0009, "step": 6306 }, { "epoch": 8.592643051771118, "grad_norm": 0.19867053621212288, "learning_rate": 1.0211710989538638e-06, "loss": 0.0003, "step": 6307 }, { "epoch": 8.59400544959128, "grad_norm": 0.3069561868703218, "learning_rate": 1.0192292325814757e-06, "loss": 0.0004, "step": 6308 }, { "epoch": 8.595367847411444, "grad_norm": 0.2891843500444419, "learning_rate": 1.0172891151509101e-06, "loss": 0.0072, "step": 6309 }, { "epoch": 8.596730245231608, "grad_norm": 5.042248749891386, "learning_rate": 1.0153507470399958e-06, "loss": 0.0268, "step": 6310 }, { "epoch": 8.598092643051771, "grad_norm": 1.3515020384892706, "learning_rate": 1.0134141286262112e-06, "loss": 0.02, "step": 6311 }, { "epoch": 8.599455040871934, "grad_norm": 1.1246094861669815, "learning_rate": 1.011479260286703e-06, "loss": 0.0057, "step": 6312 }, { "epoch": 8.600817438692099, "grad_norm": 1.272945123395369, "learning_rate": 1.0095461423982723e-06, "loss": 0.0034, "step": 6313 }, { "epoch": 8.602179836512262, "grad_norm": 0.6549327636274909, "learning_rate": 1.0076147753373787e-06, "loss": 0.0072, "step": 6314 }, { "epoch": 8.603542234332425, "grad_norm": 0.47492337193064144, "learning_rate": 1.0056851594801453e-06, "loss": 0.008, "step": 6315 }, { "epoch": 8.60490463215259, "grad_norm": 0.12760235249027774, "learning_rate": 1.0037572952023477e-06, "loss": 0.0004, "step": 6316 }, { "epoch": 8.606267029972752, "grad_norm": 0.5461464653799744, "learning_rate": 1.001831182879427e-06, "loss": 0.0067, "step": 6317 }, { "epoch": 8.607629427792915, "grad_norm": 0.12575073215022275, "learning_rate": 9.999068228864773e-07, "loss": 0.0003, "step": 6318 }, { "epoch": 8.60899182561308, "grad_norm": 0.26220565421075065, "learning_rate": 9.979842155982543e-07, "loss": 0.0005, "step": 6319 }, { "epoch": 8.610354223433243, "grad_norm": 1.020661331277262, "learning_rate": 9.960633613891757e-07, "loss": 0.0225, "step": 6320 }, { "epoch": 8.611716621253406, "grad_norm": 0.391951222515219, "learning_rate": 9.941442606333106e-07, "loss": 0.0004, "step": 6321 }, { "epoch": 8.61307901907357, "grad_norm": 0.9781944000145, "learning_rate": 9.922269137043927e-07, "loss": 0.0075, "step": 6322 }, { "epoch": 8.614441416893733, "grad_norm": 0.9017595206291023, "learning_rate": 9.903113209758098e-07, "loss": 0.0035, "step": 6323 }, { "epoch": 8.615803814713896, "grad_norm": 0.4636164224712412, "learning_rate": 9.8839748282061e-07, "loss": 0.0055, "step": 6324 }, { "epoch": 8.61716621253406, "grad_norm": 0.8514940894508863, "learning_rate": 9.864853996115032e-07, "loss": 0.0025, "step": 6325 }, { "epoch": 8.618528610354224, "grad_norm": 0.0906639879458289, "learning_rate": 9.845750717208502e-07, "loss": 0.0005, "step": 6326 }, { "epoch": 8.619891008174386, "grad_norm": 0.9739625470499222, "learning_rate": 9.82666499520677e-07, "loss": 0.0083, "step": 6327 }, { "epoch": 8.621253405994551, "grad_norm": 0.4731492569740348, "learning_rate": 9.807596833826615e-07, "loss": 0.0007, "step": 6328 }, { "epoch": 8.622615803814714, "grad_norm": 0.1616226099961114, "learning_rate": 9.78854623678146e-07, "loss": 0.0005, "step": 6329 }, { "epoch": 8.623978201634877, "grad_norm": 2.009040378194925, "learning_rate": 9.769513207781234e-07, "loss": 0.0059, "step": 6330 }, { "epoch": 8.625340599455042, "grad_norm": 0.3053774640336809, "learning_rate": 9.750497750532517e-07, "loss": 0.0076, "step": 6331 }, { "epoch": 8.626702997275205, "grad_norm": 0.36262423639514413, "learning_rate": 9.731499868738448e-07, "loss": 0.0003, "step": 6332 }, { "epoch": 8.628065395095367, "grad_norm": 0.4895878875953532, "learning_rate": 9.712519566098677e-07, "loss": 0.0005, "step": 6333 }, { "epoch": 8.629427792915532, "grad_norm": 0.31522529962140056, "learning_rate": 9.693556846309548e-07, "loss": 0.0009, "step": 6334 }, { "epoch": 8.630790190735695, "grad_norm": 0.5186940668522309, "learning_rate": 9.674611713063863e-07, "loss": 0.0073, "step": 6335 }, { "epoch": 8.632152588555858, "grad_norm": 0.8130402914384147, "learning_rate": 9.655684170051072e-07, "loss": 0.006, "step": 6336 }, { "epoch": 8.633514986376023, "grad_norm": 0.07144042552138435, "learning_rate": 9.636774220957213e-07, "loss": 0.0004, "step": 6337 }, { "epoch": 8.634877384196185, "grad_norm": 0.7972624229528823, "learning_rate": 9.617881869464807e-07, "loss": 0.002, "step": 6338 }, { "epoch": 8.636239782016348, "grad_norm": 3.2791900540546495, "learning_rate": 9.599007119253056e-07, "loss": 0.0076, "step": 6339 }, { "epoch": 8.637602179836513, "grad_norm": 0.6048403740097565, "learning_rate": 9.580149973997655e-07, "loss": 0.0012, "step": 6340 }, { "epoch": 8.638964577656676, "grad_norm": 0.8821811376956313, "learning_rate": 9.561310437370907e-07, "loss": 0.0053, "step": 6341 }, { "epoch": 8.640326975476839, "grad_norm": 0.16546458612503995, "learning_rate": 9.542488513041704e-07, "loss": 0.0005, "step": 6342 }, { "epoch": 8.641689373297003, "grad_norm": 0.3975443977843822, "learning_rate": 9.523684204675432e-07, "loss": 0.0006, "step": 6343 }, { "epoch": 8.643051771117166, "grad_norm": 1.2418859321889042, "learning_rate": 9.504897515934153e-07, "loss": 0.005, "step": 6344 }, { "epoch": 8.64441416893733, "grad_norm": 0.2664050765770733, "learning_rate": 9.486128450476395e-07, "loss": 0.0003, "step": 6345 }, { "epoch": 8.645776566757494, "grad_norm": 0.6641544576105441, "learning_rate": 9.467377011957346e-07, "loss": 0.0201, "step": 6346 }, { "epoch": 8.647138964577657, "grad_norm": 0.2885649404971378, "learning_rate": 9.448643204028662e-07, "loss": 0.0005, "step": 6347 }, { "epoch": 8.64850136239782, "grad_norm": 1.177800173201611, "learning_rate": 9.429927030338659e-07, "loss": 0.0042, "step": 6348 }, { "epoch": 8.649863760217984, "grad_norm": 0.08272995217790513, "learning_rate": 9.411228494532199e-07, "loss": 0.0018, "step": 6349 }, { "epoch": 8.651226158038147, "grad_norm": 0.08222639857108041, "learning_rate": 9.392547600250634e-07, "loss": 0.0022, "step": 6350 }, { "epoch": 8.65258855585831, "grad_norm": 0.11160759351570251, "learning_rate": 9.373884351131979e-07, "loss": 0.0004, "step": 6351 }, { "epoch": 8.653950953678475, "grad_norm": 2.348196536194174, "learning_rate": 9.355238750810747e-07, "loss": 0.0037, "step": 6352 }, { "epoch": 8.655313351498638, "grad_norm": 0.41018169081188727, "learning_rate": 9.336610802918044e-07, "loss": 0.0079, "step": 6353 }, { "epoch": 8.6566757493188, "grad_norm": 0.8111333302297774, "learning_rate": 9.318000511081559e-07, "loss": 0.0014, "step": 6354 }, { "epoch": 8.658038147138965, "grad_norm": 0.8209407712569895, "learning_rate": 9.29940787892546e-07, "loss": 0.0147, "step": 6355 }, { "epoch": 8.659400544959128, "grad_norm": 0.751123005197978, "learning_rate": 9.280832910070592e-07, "loss": 0.0083, "step": 6356 }, { "epoch": 8.660762942779291, "grad_norm": 0.5599720354592713, "learning_rate": 9.262275608134241e-07, "loss": 0.004, "step": 6357 }, { "epoch": 8.662125340599456, "grad_norm": 0.6470015618009662, "learning_rate": 9.243735976730373e-07, "loss": 0.0053, "step": 6358 }, { "epoch": 8.663487738419619, "grad_norm": 0.4654952433459946, "learning_rate": 9.225214019469387e-07, "loss": 0.0015, "step": 6359 }, { "epoch": 8.664850136239782, "grad_norm": 0.1889623421425288, "learning_rate": 9.20670973995833e-07, "loss": 0.0005, "step": 6360 }, { "epoch": 8.666212534059946, "grad_norm": 0.11807333833911803, "learning_rate": 9.188223141800801e-07, "loss": 0.0004, "step": 6361 }, { "epoch": 8.66757493188011, "grad_norm": 0.10003996530320053, "learning_rate": 9.169754228596905e-07, "loss": 0.0005, "step": 6362 }, { "epoch": 8.668937329700272, "grad_norm": 0.388037315688091, "learning_rate": 9.151303003943368e-07, "loss": 0.0004, "step": 6363 }, { "epoch": 8.670299727520437, "grad_norm": 0.49033910329866953, "learning_rate": 9.132869471433359e-07, "loss": 0.0004, "step": 6364 }, { "epoch": 8.6716621253406, "grad_norm": 0.22151426758561457, "learning_rate": 9.114453634656784e-07, "loss": 0.0004, "step": 6365 }, { "epoch": 8.673024523160763, "grad_norm": 3.076257261641617, "learning_rate": 9.09605549719994e-07, "loss": 0.0075, "step": 6366 }, { "epoch": 8.674386920980927, "grad_norm": 1.1418855506882122, "learning_rate": 9.077675062645719e-07, "loss": 0.0036, "step": 6367 }, { "epoch": 8.67574931880109, "grad_norm": 0.4635846153582339, "learning_rate": 9.059312334573634e-07, "loss": 0.0011, "step": 6368 }, { "epoch": 8.677111716621253, "grad_norm": 0.1288262506399579, "learning_rate": 9.040967316559634e-07, "loss": 0.0003, "step": 6369 }, { "epoch": 8.678474114441418, "grad_norm": 0.4322174877853561, "learning_rate": 9.02264001217632e-07, "loss": 0.0034, "step": 6370 }, { "epoch": 8.67983651226158, "grad_norm": 0.31311198474720164, "learning_rate": 9.004330424992813e-07, "loss": 0.0004, "step": 6371 }, { "epoch": 8.681198910081743, "grad_norm": 1.2895571091552815, "learning_rate": 8.986038558574739e-07, "loss": 0.0044, "step": 6372 }, { "epoch": 8.682561307901908, "grad_norm": 0.7414305169937404, "learning_rate": 8.967764416484359e-07, "loss": 0.0072, "step": 6373 }, { "epoch": 8.683923705722071, "grad_norm": 1.8738121299605062, "learning_rate": 8.949508002280383e-07, "loss": 0.0027, "step": 6374 }, { "epoch": 8.685286103542234, "grad_norm": 0.07735243333941809, "learning_rate": 8.931269319518166e-07, "loss": 0.0003, "step": 6375 }, { "epoch": 8.686648501362399, "grad_norm": 1.4063385169308338, "learning_rate": 8.913048371749511e-07, "loss": 0.0065, "step": 6376 }, { "epoch": 8.688010899182562, "grad_norm": 0.42514206313372005, "learning_rate": 8.894845162522869e-07, "loss": 0.0005, "step": 6377 }, { "epoch": 8.689373297002724, "grad_norm": 0.27302004117519013, "learning_rate": 8.876659695383172e-07, "loss": 0.0019, "step": 6378 }, { "epoch": 8.690735694822889, "grad_norm": 0.21138109172940392, "learning_rate": 8.858491973871897e-07, "loss": 0.0022, "step": 6379 }, { "epoch": 8.692098092643052, "grad_norm": 0.31697475858243407, "learning_rate": 8.840342001527091e-07, "loss": 0.0005, "step": 6380 }, { "epoch": 8.693460490463215, "grad_norm": 0.7629115903835024, "learning_rate": 8.822209781883329e-07, "loss": 0.0059, "step": 6381 }, { "epoch": 8.69482288828338, "grad_norm": 0.5884288891957735, "learning_rate": 8.804095318471728e-07, "loss": 0.0036, "step": 6382 }, { "epoch": 8.696185286103542, "grad_norm": 0.6095842635004283, "learning_rate": 8.785998614819957e-07, "loss": 0.0067, "step": 6383 }, { "epoch": 8.697547683923705, "grad_norm": 0.19939808415344257, "learning_rate": 8.767919674452241e-07, "loss": 0.0084, "step": 6384 }, { "epoch": 8.69891008174387, "grad_norm": 0.25686143142508255, "learning_rate": 8.749858500889307e-07, "loss": 0.0023, "step": 6385 }, { "epoch": 8.700272479564033, "grad_norm": 2.1742397513173217, "learning_rate": 8.731815097648432e-07, "loss": 0.0038, "step": 6386 }, { "epoch": 8.701634877384196, "grad_norm": 0.9453139832075068, "learning_rate": 8.713789468243461e-07, "loss": 0.0079, "step": 6387 }, { "epoch": 8.70299727520436, "grad_norm": 2.0615505837812527, "learning_rate": 8.695781616184718e-07, "loss": 0.0204, "step": 6388 }, { "epoch": 8.704359673024523, "grad_norm": 0.8505928719297465, "learning_rate": 8.677791544979175e-07, "loss": 0.0099, "step": 6389 }, { "epoch": 8.705722070844686, "grad_norm": 0.5293906081477396, "learning_rate": 8.659819258130231e-07, "loss": 0.0004, "step": 6390 }, { "epoch": 8.70708446866485, "grad_norm": 0.8188828863626993, "learning_rate": 8.641864759137853e-07, "loss": 0.0006, "step": 6391 }, { "epoch": 8.708446866485014, "grad_norm": 0.5693916872870017, "learning_rate": 8.623928051498575e-07, "loss": 0.0006, "step": 6392 }, { "epoch": 8.709809264305177, "grad_norm": 0.388285800923001, "learning_rate": 8.606009138705406e-07, "loss": 0.0004, "step": 6393 }, { "epoch": 8.711171662125341, "grad_norm": 0.16059668699181176, "learning_rate": 8.588108024247987e-07, "loss": 0.0007, "step": 6394 }, { "epoch": 8.712534059945504, "grad_norm": 0.16430212191303947, "learning_rate": 8.570224711612385e-07, "loss": 0.0007, "step": 6395 }, { "epoch": 8.713896457765667, "grad_norm": 0.3381977015064503, "learning_rate": 8.55235920428129e-07, "loss": 0.0007, "step": 6396 }, { "epoch": 8.71525885558583, "grad_norm": 0.7920306389673544, "learning_rate": 8.534511505733856e-07, "loss": 0.0076, "step": 6397 }, { "epoch": 8.716621253405995, "grad_norm": 1.6575970705472742, "learning_rate": 8.516681619445788e-07, "loss": 0.0046, "step": 6398 }, { "epoch": 8.717983651226158, "grad_norm": 0.28213947247969917, "learning_rate": 8.498869548889344e-07, "loss": 0.0167, "step": 6399 }, { "epoch": 8.719346049046322, "grad_norm": 0.2691510598134109, "learning_rate": 8.481075297533303e-07, "loss": 0.0005, "step": 6400 }, { "epoch": 8.720708446866485, "grad_norm": 1.9665090281032795, "learning_rate": 8.463298868842973e-07, "loss": 0.0035, "step": 6401 }, { "epoch": 8.722070844686648, "grad_norm": 0.5466325520530174, "learning_rate": 8.445540266280195e-07, "loss": 0.009, "step": 6402 }, { "epoch": 8.723433242506811, "grad_norm": 0.9102239752216188, "learning_rate": 8.42779949330329e-07, "loss": 0.0263, "step": 6403 }, { "epoch": 8.724795640326976, "grad_norm": 0.2797489990133177, "learning_rate": 8.410076553367208e-07, "loss": 0.0073, "step": 6404 }, { "epoch": 8.726158038147139, "grad_norm": 1.04718350864835, "learning_rate": 8.392371449923286e-07, "loss": 0.0169, "step": 6405 }, { "epoch": 8.727520435967303, "grad_norm": 0.48384755807600566, "learning_rate": 8.374684186419569e-07, "loss": 0.0073, "step": 6406 }, { "epoch": 8.728882833787466, "grad_norm": 0.8819481538467009, "learning_rate": 8.357014766300442e-07, "loss": 0.0023, "step": 6407 }, { "epoch": 8.730245231607629, "grad_norm": 1.4213051898644582, "learning_rate": 8.339363193006966e-07, "loss": 0.0151, "step": 6408 }, { "epoch": 8.731607629427792, "grad_norm": 0.3842100717641704, "learning_rate": 8.321729469976625e-07, "loss": 0.0015, "step": 6409 }, { "epoch": 8.732970027247957, "grad_norm": 0.40409065216831047, "learning_rate": 8.304113600643437e-07, "loss": 0.0006, "step": 6410 }, { "epoch": 8.73433242506812, "grad_norm": 0.7828986690706716, "learning_rate": 8.286515588438016e-07, "loss": 0.0012, "step": 6411 }, { "epoch": 8.735694822888284, "grad_norm": 0.0981613071369202, "learning_rate": 8.268935436787417e-07, "loss": 0.0004, "step": 6412 }, { "epoch": 8.737057220708447, "grad_norm": 0.6624174927668318, "learning_rate": 8.251373149115294e-07, "loss": 0.0071, "step": 6413 }, { "epoch": 8.73841961852861, "grad_norm": 0.5735529383274067, "learning_rate": 8.233828728841742e-07, "loss": 0.0005, "step": 6414 }, { "epoch": 8.739782016348773, "grad_norm": 0.26543584207677134, "learning_rate": 8.216302179383406e-07, "loss": 0.0174, "step": 6415 }, { "epoch": 8.741144414168938, "grad_norm": 0.5976152721116211, "learning_rate": 8.198793504153491e-07, "loss": 0.0006, "step": 6416 }, { "epoch": 8.7425068119891, "grad_norm": 0.8943419291644955, "learning_rate": 8.181302706561633e-07, "loss": 0.0204, "step": 6417 }, { "epoch": 8.743869209809265, "grad_norm": 0.24409890206620874, "learning_rate": 8.163829790014121e-07, "loss": 0.0006, "step": 6418 }, { "epoch": 8.745231607629428, "grad_norm": 1.0201599612962229, "learning_rate": 8.146374757913622e-07, "loss": 0.0048, "step": 6419 }, { "epoch": 8.746594005449591, "grad_norm": 0.39845319026457576, "learning_rate": 8.128937613659415e-07, "loss": 0.0023, "step": 6420 }, { "epoch": 8.747956403269754, "grad_norm": 0.6721002325105825, "learning_rate": 8.111518360647252e-07, "loss": 0.0169, "step": 6421 }, { "epoch": 8.749318801089919, "grad_norm": 0.5488093429826277, "learning_rate": 8.094117002269363e-07, "loss": 0.004, "step": 6422 }, { "epoch": 8.750681198910081, "grad_norm": 0.792856463394759, "learning_rate": 8.076733541914617e-07, "loss": 0.0013, "step": 6423 }, { "epoch": 8.752043596730246, "grad_norm": 0.38970769369336805, "learning_rate": 8.059367982968258e-07, "loss": 0.0022, "step": 6424 }, { "epoch": 8.753405994550409, "grad_norm": 0.9493454334440241, "learning_rate": 8.042020328812162e-07, "loss": 0.0156, "step": 6425 }, { "epoch": 8.754768392370572, "grad_norm": 0.712808218759975, "learning_rate": 8.024690582824613e-07, "loss": 0.0079, "step": 6426 }, { "epoch": 8.756130790190735, "grad_norm": 2.2150942044484934, "learning_rate": 8.007378748380479e-07, "loss": 0.0019, "step": 6427 }, { "epoch": 8.7574931880109, "grad_norm": 0.1486941862245997, "learning_rate": 7.990084828851108e-07, "loss": 0.0004, "step": 6428 }, { "epoch": 8.758855585831062, "grad_norm": 2.8498378881521353, "learning_rate": 7.972808827604383e-07, "loss": 0.0108, "step": 6429 }, { "epoch": 8.760217983651227, "grad_norm": 0.8179938259696217, "learning_rate": 7.955550748004681e-07, "loss": 0.0094, "step": 6430 }, { "epoch": 8.76158038147139, "grad_norm": 0.18297462361872777, "learning_rate": 7.938310593412879e-07, "loss": 0.0003, "step": 6431 }, { "epoch": 8.762942779291553, "grad_norm": 0.5222907103621414, "learning_rate": 7.921088367186391e-07, "loss": 0.0031, "step": 6432 }, { "epoch": 8.764305177111716, "grad_norm": 0.3766884100628806, "learning_rate": 7.903884072679125e-07, "loss": 0.0009, "step": 6433 }, { "epoch": 8.76566757493188, "grad_norm": 0.09637323838640788, "learning_rate": 7.886697713241453e-07, "loss": 0.0004, "step": 6434 }, { "epoch": 8.767029972752043, "grad_norm": 1.7965853619776493, "learning_rate": 7.869529292220357e-07, "loss": 0.0067, "step": 6435 }, { "epoch": 8.768392370572208, "grad_norm": 0.08945758105823146, "learning_rate": 7.852378812959227e-07, "loss": 0.0004, "step": 6436 }, { "epoch": 8.769754768392371, "grad_norm": 0.08278782330468017, "learning_rate": 7.835246278798037e-07, "loss": 0.0003, "step": 6437 }, { "epoch": 8.771117166212534, "grad_norm": 1.6744952024170274, "learning_rate": 7.818131693073184e-07, "loss": 0.0067, "step": 6438 }, { "epoch": 8.772479564032697, "grad_norm": 0.7077474358476904, "learning_rate": 7.801035059117645e-07, "loss": 0.0092, "step": 6439 }, { "epoch": 8.773841961852861, "grad_norm": 1.055805395407434, "learning_rate": 7.783956380260837e-07, "loss": 0.0077, "step": 6440 }, { "epoch": 8.775204359673024, "grad_norm": 0.24282300289294312, "learning_rate": 7.766895659828733e-07, "loss": 0.0008, "step": 6441 }, { "epoch": 8.776566757493189, "grad_norm": 2.545488535541811, "learning_rate": 7.749852901143795e-07, "loss": 0.0108, "step": 6442 }, { "epoch": 8.777929155313352, "grad_norm": 2.2007198246776616, "learning_rate": 7.73282810752497e-07, "loss": 0.0277, "step": 6443 }, { "epoch": 8.779291553133515, "grad_norm": 0.575043014702571, "learning_rate": 7.715821282287727e-07, "loss": 0.0037, "step": 6444 }, { "epoch": 8.780653950953678, "grad_norm": 0.5071182077992529, "learning_rate": 7.698832428744007e-07, "loss": 0.0078, "step": 6445 }, { "epoch": 8.782016348773842, "grad_norm": 1.1006853459515076, "learning_rate": 7.681861550202252e-07, "loss": 0.0042, "step": 6446 }, { "epoch": 8.783378746594005, "grad_norm": 0.751590307030243, "learning_rate": 7.664908649967484e-07, "loss": 0.0006, "step": 6447 }, { "epoch": 8.78474114441417, "grad_norm": 1.53816266387001, "learning_rate": 7.647973731341107e-07, "loss": 0.004, "step": 6448 }, { "epoch": 8.786103542234333, "grad_norm": 0.8206227223265555, "learning_rate": 7.631056797621106e-07, "loss": 0.002, "step": 6449 }, { "epoch": 8.787465940054496, "grad_norm": 0.5351422171555089, "learning_rate": 7.614157852101911e-07, "loss": 0.0012, "step": 6450 }, { "epoch": 8.788828337874659, "grad_norm": 0.25620700513651173, "learning_rate": 7.597276898074479e-07, "loss": 0.0003, "step": 6451 }, { "epoch": 8.790190735694823, "grad_norm": 0.6051677442066559, "learning_rate": 7.580413938826292e-07, "loss": 0.0005, "step": 6452 }, { "epoch": 8.791553133514986, "grad_norm": 0.3148696760869787, "learning_rate": 7.563568977641233e-07, "loss": 0.0181, "step": 6453 }, { "epoch": 8.79291553133515, "grad_norm": 0.612261838996191, "learning_rate": 7.54674201779979e-07, "loss": 0.0008, "step": 6454 }, { "epoch": 8.794277929155314, "grad_norm": 1.9316838111957404, "learning_rate": 7.529933062578865e-07, "loss": 0.0076, "step": 6455 }, { "epoch": 8.795640326975477, "grad_norm": 1.6408995307763161, "learning_rate": 7.513142115251892e-07, "loss": 0.0038, "step": 6456 }, { "epoch": 8.79700272479564, "grad_norm": 0.14528707785481804, "learning_rate": 7.496369179088791e-07, "loss": 0.0007, "step": 6457 }, { "epoch": 8.798365122615804, "grad_norm": 1.07334567998372, "learning_rate": 7.479614257355972e-07, "loss": 0.0064, "step": 6458 }, { "epoch": 8.799727520435967, "grad_norm": 0.7030806837067135, "learning_rate": 7.462877353316345e-07, "loss": 0.0016, "step": 6459 }, { "epoch": 8.80108991825613, "grad_norm": 0.2475025432168155, "learning_rate": 7.446158470229282e-07, "loss": 0.0004, "step": 6460 }, { "epoch": 8.802452316076295, "grad_norm": 0.4508737708670653, "learning_rate": 7.42945761135071e-07, "loss": 0.0004, "step": 6461 }, { "epoch": 8.803814713896458, "grad_norm": 1.132136447497927, "learning_rate": 7.412774779932951e-07, "loss": 0.0105, "step": 6462 }, { "epoch": 8.80517711171662, "grad_norm": 0.40782817435889335, "learning_rate": 7.396109979224897e-07, "loss": 0.0008, "step": 6463 }, { "epoch": 8.806539509536785, "grad_norm": 0.5990716510602387, "learning_rate": 7.379463212471916e-07, "loss": 0.0063, "step": 6464 }, { "epoch": 8.807901907356948, "grad_norm": 1.28832212907284, "learning_rate": 7.362834482915815e-07, "loss": 0.0014, "step": 6465 }, { "epoch": 8.809264305177111, "grad_norm": 1.645336385006277, "learning_rate": 7.34622379379496e-07, "loss": 0.0048, "step": 6466 }, { "epoch": 8.810626702997276, "grad_norm": 0.775296116489793, "learning_rate": 7.329631148344119e-07, "loss": 0.0231, "step": 6467 }, { "epoch": 8.811989100817438, "grad_norm": 0.33312434824631343, "learning_rate": 7.31305654979464e-07, "loss": 0.0067, "step": 6468 }, { "epoch": 8.813351498637601, "grad_norm": 0.2742808494935475, "learning_rate": 7.296500001374263e-07, "loss": 0.0003, "step": 6469 }, { "epoch": 8.814713896457766, "grad_norm": 0.2547509512484448, "learning_rate": 7.279961506307288e-07, "loss": 0.0003, "step": 6470 }, { "epoch": 8.816076294277929, "grad_norm": 2.6870476668138843, "learning_rate": 7.263441067814469e-07, "loss": 0.005, "step": 6471 }, { "epoch": 8.817438692098092, "grad_norm": 0.05722646570502036, "learning_rate": 7.246938689113037e-07, "loss": 0.0003, "step": 6472 }, { "epoch": 8.818801089918257, "grad_norm": 0.11050453893139314, "learning_rate": 7.23045437341674e-07, "loss": 0.0003, "step": 6473 }, { "epoch": 8.82016348773842, "grad_norm": 0.14590532959450375, "learning_rate": 7.213988123935733e-07, "loss": 0.0011, "step": 6474 }, { "epoch": 8.821525885558582, "grad_norm": 0.22127221085483645, "learning_rate": 7.197539943876731e-07, "loss": 0.0003, "step": 6475 }, { "epoch": 8.822888283378747, "grad_norm": 0.5959326102034909, "learning_rate": 7.181109836442913e-07, "loss": 0.0033, "step": 6476 }, { "epoch": 8.82425068119891, "grad_norm": 0.3797774824501199, "learning_rate": 7.164697804833909e-07, "loss": 0.0014, "step": 6477 }, { "epoch": 8.825613079019073, "grad_norm": 0.3406514446821388, "learning_rate": 7.148303852245853e-07, "loss": 0.0006, "step": 6478 }, { "epoch": 8.826975476839237, "grad_norm": 0.08110399215483478, "learning_rate": 7.131927981871345e-07, "loss": 0.0003, "step": 6479 }, { "epoch": 8.8283378746594, "grad_norm": 1.4044672006258867, "learning_rate": 7.115570196899468e-07, "loss": 0.0144, "step": 6480 }, { "epoch": 8.829700272479563, "grad_norm": 0.09340960210425311, "learning_rate": 7.099230500515808e-07, "loss": 0.0003, "step": 6481 }, { "epoch": 8.831062670299728, "grad_norm": 1.5921468586442475, "learning_rate": 7.082908895902374e-07, "loss": 0.0111, "step": 6482 }, { "epoch": 8.83242506811989, "grad_norm": 0.22046748188850612, "learning_rate": 7.066605386237724e-07, "loss": 0.0016, "step": 6483 }, { "epoch": 8.833787465940054, "grad_norm": 0.6462893944628456, "learning_rate": 7.050319974696795e-07, "loss": 0.0075, "step": 6484 }, { "epoch": 8.835149863760218, "grad_norm": 0.26024140244737665, "learning_rate": 7.034052664451118e-07, "loss": 0.0025, "step": 6485 }, { "epoch": 8.836512261580381, "grad_norm": 0.10758844437703853, "learning_rate": 7.017803458668593e-07, "loss": 0.0005, "step": 6486 }, { "epoch": 8.837874659400544, "grad_norm": 0.17549250950226786, "learning_rate": 7.001572360513643e-07, "loss": 0.0004, "step": 6487 }, { "epoch": 8.839237057220709, "grad_norm": 1.1065844651672618, "learning_rate": 6.985359373147183e-07, "loss": 0.008, "step": 6488 }, { "epoch": 8.840599455040872, "grad_norm": 0.9381066683968562, "learning_rate": 6.969164499726555e-07, "loss": 0.0061, "step": 6489 }, { "epoch": 8.841961852861035, "grad_norm": 1.6096202321041664, "learning_rate": 6.952987743405626e-07, "loss": 0.0111, "step": 6490 }, { "epoch": 8.8433242506812, "grad_norm": 0.5786070511171069, "learning_rate": 6.936829107334663e-07, "loss": 0.0059, "step": 6491 }, { "epoch": 8.844686648501362, "grad_norm": 0.3911240624677943, "learning_rate": 6.920688594660474e-07, "loss": 0.0079, "step": 6492 }, { "epoch": 8.846049046321525, "grad_norm": 0.22145697572425674, "learning_rate": 6.90456620852632e-07, "loss": 0.0003, "step": 6493 }, { "epoch": 8.84741144414169, "grad_norm": 0.7088154974195494, "learning_rate": 6.88846195207189e-07, "loss": 0.0224, "step": 6494 }, { "epoch": 8.848773841961853, "grad_norm": 1.0685601904679356, "learning_rate": 6.872375828433397e-07, "loss": 0.0089, "step": 6495 }, { "epoch": 8.850136239782016, "grad_norm": 1.0618117217851193, "learning_rate": 6.856307840743493e-07, "loss": 0.0017, "step": 6496 }, { "epoch": 8.85149863760218, "grad_norm": 0.5409134584156864, "learning_rate": 6.840257992131316e-07, "loss": 0.0022, "step": 6497 }, { "epoch": 8.852861035422343, "grad_norm": 0.4356353949063891, "learning_rate": 6.824226285722446e-07, "loss": 0.0026, "step": 6498 }, { "epoch": 8.854223433242506, "grad_norm": 0.13573204294794924, "learning_rate": 6.808212724638941e-07, "loss": 0.0022, "step": 6499 }, { "epoch": 8.85558583106267, "grad_norm": 0.3557251551880056, "learning_rate": 6.79221731199936e-07, "loss": 0.0081, "step": 6500 }, { "epoch": 8.856948228882834, "grad_norm": 0.8434730174528825, "learning_rate": 6.776240050918658e-07, "loss": 0.0035, "step": 6501 }, { "epoch": 8.858310626702997, "grad_norm": 0.16513251469362442, "learning_rate": 6.760280944508324e-07, "loss": 0.0003, "step": 6502 }, { "epoch": 8.859673024523161, "grad_norm": 0.7578184026054411, "learning_rate": 6.744339995876259e-07, "loss": 0.0067, "step": 6503 }, { "epoch": 8.861035422343324, "grad_norm": 1.5386366856035603, "learning_rate": 6.728417208126858e-07, "loss": 0.0287, "step": 6504 }, { "epoch": 8.862397820163487, "grad_norm": 0.4231956012286817, "learning_rate": 6.712512584360997e-07, "loss": 0.0073, "step": 6505 }, { "epoch": 8.863760217983652, "grad_norm": 0.29053437572547675, "learning_rate": 6.696626127675943e-07, "loss": 0.002, "step": 6506 }, { "epoch": 8.865122615803815, "grad_norm": 0.11921034562748987, "learning_rate": 6.68075784116552e-07, "loss": 0.0002, "step": 6507 }, { "epoch": 8.866485013623977, "grad_norm": 0.982648394386247, "learning_rate": 6.664907727919911e-07, "loss": 0.0021, "step": 6508 }, { "epoch": 8.867847411444142, "grad_norm": 1.1520092306995415, "learning_rate": 6.649075791025861e-07, "loss": 0.0094, "step": 6509 }, { "epoch": 8.869209809264305, "grad_norm": 0.39043630885025876, "learning_rate": 6.633262033566512e-07, "loss": 0.0033, "step": 6510 }, { "epoch": 8.870572207084468, "grad_norm": 0.11516703788220789, "learning_rate": 6.617466458621468e-07, "loss": 0.0004, "step": 6511 }, { "epoch": 8.871934604904633, "grad_norm": 0.08851458686044854, "learning_rate": 6.601689069266837e-07, "loss": 0.0005, "step": 6512 }, { "epoch": 8.873297002724795, "grad_norm": 1.9034968038831024, "learning_rate": 6.585929868575125e-07, "loss": 0.0063, "step": 6513 }, { "epoch": 8.874659400544958, "grad_norm": 0.40019835352413535, "learning_rate": 6.570188859615346e-07, "loss": 0.0015, "step": 6514 }, { "epoch": 8.876021798365123, "grad_norm": 0.20907386858616384, "learning_rate": 6.554466045452923e-07, "loss": 0.0006, "step": 6515 }, { "epoch": 8.877384196185286, "grad_norm": 0.41372004514450067, "learning_rate": 6.538761429149787e-07, "loss": 0.0007, "step": 6516 }, { "epoch": 8.878746594005449, "grad_norm": 0.9292114821281436, "learning_rate": 6.523075013764302e-07, "loss": 0.0047, "step": 6517 }, { "epoch": 8.880108991825614, "grad_norm": 0.1501006243221565, "learning_rate": 6.507406802351269e-07, "loss": 0.0005, "step": 6518 }, { "epoch": 8.881471389645776, "grad_norm": 0.5637748724652277, "learning_rate": 6.49175679796199e-07, "loss": 0.0059, "step": 6519 }, { "epoch": 8.88283378746594, "grad_norm": 0.2379342180599206, "learning_rate": 6.476125003644162e-07, "loss": 0.0077, "step": 6520 }, { "epoch": 8.884196185286104, "grad_norm": 0.24736766598284157, "learning_rate": 6.460511422441984e-07, "loss": 0.0074, "step": 6521 }, { "epoch": 8.885558583106267, "grad_norm": 2.0104545751104346, "learning_rate": 6.444916057396089e-07, "loss": 0.003, "step": 6522 }, { "epoch": 8.88692098092643, "grad_norm": 0.3894260401326304, "learning_rate": 6.429338911543559e-07, "loss": 0.0007, "step": 6523 }, { "epoch": 8.888283378746594, "grad_norm": 0.9351719622054818, "learning_rate": 6.413779987917956e-07, "loss": 0.0066, "step": 6524 }, { "epoch": 8.889645776566757, "grad_norm": 0.5815846203254388, "learning_rate": 6.398239289549224e-07, "loss": 0.0012, "step": 6525 }, { "epoch": 8.89100817438692, "grad_norm": 0.4306556490636459, "learning_rate": 6.382716819463864e-07, "loss": 0.0006, "step": 6526 }, { "epoch": 8.892370572207085, "grad_norm": 0.2647628535248823, "learning_rate": 6.367212580684712e-07, "loss": 0.0025, "step": 6527 }, { "epoch": 8.893732970027248, "grad_norm": 0.19116497235102803, "learning_rate": 6.351726576231132e-07, "loss": 0.0004, "step": 6528 }, { "epoch": 8.89509536784741, "grad_norm": 1.4482851180841971, "learning_rate": 6.336258809118934e-07, "loss": 0.0049, "step": 6529 }, { "epoch": 8.896457765667575, "grad_norm": 0.34314732691977634, "learning_rate": 6.320809282360319e-07, "loss": 0.0009, "step": 6530 }, { "epoch": 8.897820163487738, "grad_norm": 0.9483139213501601, "learning_rate": 6.305377998964012e-07, "loss": 0.0037, "step": 6531 }, { "epoch": 8.899182561307901, "grad_norm": 0.5758709761200009, "learning_rate": 6.289964961935114e-07, "loss": 0.0011, "step": 6532 }, { "epoch": 8.900544959128066, "grad_norm": 1.0846919215240534, "learning_rate": 6.274570174275218e-07, "loss": 0.0158, "step": 6533 }, { "epoch": 8.901907356948229, "grad_norm": 2.186216423147653, "learning_rate": 6.259193638982375e-07, "loss": 0.0092, "step": 6534 }, { "epoch": 8.903269754768392, "grad_norm": 0.5323639883616367, "learning_rate": 6.24383535905101e-07, "loss": 0.0162, "step": 6535 }, { "epoch": 8.904632152588556, "grad_norm": 1.968380332071551, "learning_rate": 6.228495337472074e-07, "loss": 0.0068, "step": 6536 }, { "epoch": 8.90599455040872, "grad_norm": 0.671565372256836, "learning_rate": 6.213173577232911e-07, "loss": 0.0213, "step": 6537 }, { "epoch": 8.907356948228882, "grad_norm": 1.4141004684687244, "learning_rate": 6.197870081317325e-07, "loss": 0.0102, "step": 6538 }, { "epoch": 8.908719346049047, "grad_norm": 1.3343787656919959, "learning_rate": 6.182584852705587e-07, "loss": 0.0115, "step": 6539 }, { "epoch": 8.91008174386921, "grad_norm": 0.9401729576277765, "learning_rate": 6.167317894374358e-07, "loss": 0.0006, "step": 6540 }, { "epoch": 8.911444141689373, "grad_norm": 0.6344279680180062, "learning_rate": 6.152069209296807e-07, "loss": 0.0041, "step": 6541 }, { "epoch": 8.912806539509537, "grad_norm": 0.27732218701397215, "learning_rate": 6.136838800442457e-07, "loss": 0.0083, "step": 6542 }, { "epoch": 8.9141689373297, "grad_norm": 0.37167686688201235, "learning_rate": 6.121626670777381e-07, "loss": 0.0004, "step": 6543 }, { "epoch": 8.915531335149863, "grad_norm": 0.21349561862970934, "learning_rate": 6.106432823263975e-07, "loss": 0.0005, "step": 6544 }, { "epoch": 8.916893732970028, "grad_norm": 0.38895732410945744, "learning_rate": 6.091257260861172e-07, "loss": 0.0045, "step": 6545 }, { "epoch": 8.91825613079019, "grad_norm": 0.3544672780055366, "learning_rate": 6.076099986524309e-07, "loss": 0.001, "step": 6546 }, { "epoch": 8.919618528610354, "grad_norm": 1.179309450958194, "learning_rate": 6.060961003205135e-07, "loss": 0.015, "step": 6547 }, { "epoch": 8.920980926430518, "grad_norm": 0.6626584354021673, "learning_rate": 6.045840313851881e-07, "loss": 0.0112, "step": 6548 }, { "epoch": 8.922343324250681, "grad_norm": 1.4152779744207444, "learning_rate": 6.030737921409169e-07, "loss": 0.0064, "step": 6549 }, { "epoch": 8.923705722070844, "grad_norm": 0.9092996212387231, "learning_rate": 6.015653828818102e-07, "loss": 0.0164, "step": 6550 }, { "epoch": 8.925068119891009, "grad_norm": 0.22403020370994636, "learning_rate": 6.000588039016209e-07, "loss": 0.0005, "step": 6551 }, { "epoch": 8.926430517711172, "grad_norm": 0.27557743994293726, "learning_rate": 5.98554055493743e-07, "loss": 0.0023, "step": 6552 }, { "epoch": 8.927792915531334, "grad_norm": 0.6488316754543447, "learning_rate": 5.970511379512167e-07, "loss": 0.0151, "step": 6553 }, { "epoch": 8.9291553133515, "grad_norm": 0.3524933910628653, "learning_rate": 5.955500515667234e-07, "loss": 0.0003, "step": 6554 }, { "epoch": 8.930517711171662, "grad_norm": 0.5068564908530505, "learning_rate": 5.940507966325915e-07, "loss": 0.0038, "step": 6555 }, { "epoch": 8.931880108991825, "grad_norm": 0.4393994249151207, "learning_rate": 5.925533734407851e-07, "loss": 0.0093, "step": 6556 }, { "epoch": 8.93324250681199, "grad_norm": 0.17586983529692912, "learning_rate": 5.910577822829233e-07, "loss": 0.0004, "step": 6557 }, { "epoch": 8.934604904632153, "grad_norm": 3.730512615505931, "learning_rate": 5.895640234502597e-07, "loss": 0.0137, "step": 6558 }, { "epoch": 8.935967302452315, "grad_norm": 0.7775366646260962, "learning_rate": 5.880720972336906e-07, "loss": 0.0042, "step": 6559 }, { "epoch": 8.93732970027248, "grad_norm": 0.13032943313610487, "learning_rate": 5.865820039237624e-07, "loss": 0.0005, "step": 6560 }, { "epoch": 8.938692098092643, "grad_norm": 0.17782396176762688, "learning_rate": 5.850937438106541e-07, "loss": 0.0007, "step": 6561 }, { "epoch": 8.940054495912806, "grad_norm": 0.3577846579009717, "learning_rate": 5.836073171842005e-07, "loss": 0.0006, "step": 6562 }, { "epoch": 8.94141689373297, "grad_norm": 0.3381099658381442, "learning_rate": 5.821227243338712e-07, "loss": 0.0003, "step": 6563 }, { "epoch": 8.942779291553133, "grad_norm": 0.32780543040090415, "learning_rate": 5.806399655487771e-07, "loss": 0.0008, "step": 6564 }, { "epoch": 8.944141689373296, "grad_norm": 0.5123736596032461, "learning_rate": 5.791590411176773e-07, "loss": 0.0097, "step": 6565 }, { "epoch": 8.945504087193461, "grad_norm": 0.16391843737280676, "learning_rate": 5.776799513289711e-07, "loss": 0.0004, "step": 6566 }, { "epoch": 8.946866485013624, "grad_norm": 0.12028447937410326, "learning_rate": 5.762026964707001e-07, "loss": 0.0004, "step": 6567 }, { "epoch": 8.948228882833787, "grad_norm": 1.0400245008346423, "learning_rate": 5.74727276830549e-07, "loss": 0.0022, "step": 6568 }, { "epoch": 8.949591280653951, "grad_norm": 0.6081601260095095, "learning_rate": 5.732536926958487e-07, "loss": 0.0049, "step": 6569 }, { "epoch": 8.950953678474114, "grad_norm": 0.16054556494001518, "learning_rate": 5.717819443535677e-07, "loss": 0.0004, "step": 6570 }, { "epoch": 8.952316076294277, "grad_norm": 0.5900339415795063, "learning_rate": 5.703120320903143e-07, "loss": 0.0172, "step": 6571 }, { "epoch": 8.953678474114442, "grad_norm": 0.2686551940220622, "learning_rate": 5.688439561923509e-07, "loss": 0.0009, "step": 6572 }, { "epoch": 8.955040871934605, "grad_norm": 0.05955710099051971, "learning_rate": 5.673777169455663e-07, "loss": 0.0002, "step": 6573 }, { "epoch": 8.956403269754768, "grad_norm": 0.10542269675764694, "learning_rate": 5.659133146355089e-07, "loss": 0.0003, "step": 6574 }, { "epoch": 8.957765667574932, "grad_norm": 0.3737754817805213, "learning_rate": 5.644507495473573e-07, "loss": 0.0159, "step": 6575 }, { "epoch": 8.959128065395095, "grad_norm": 0.2695067124610899, "learning_rate": 5.629900219659323e-07, "loss": 0.0016, "step": 6576 }, { "epoch": 8.960490463215258, "grad_norm": 0.20815558672146728, "learning_rate": 5.615311321757066e-07, "loss": 0.0003, "step": 6577 }, { "epoch": 8.961852861035423, "grad_norm": 0.9296402744045879, "learning_rate": 5.600740804607829e-07, "loss": 0.0061, "step": 6578 }, { "epoch": 8.963215258855586, "grad_norm": 0.11545835933523682, "learning_rate": 5.586188671049142e-07, "loss": 0.0023, "step": 6579 }, { "epoch": 8.964577656675749, "grad_norm": 1.3483437794142945, "learning_rate": 5.571654923914938e-07, "loss": 0.0022, "step": 6580 }, { "epoch": 8.965940054495913, "grad_norm": 0.2051635055463327, "learning_rate": 5.557139566035574e-07, "loss": 0.0006, "step": 6581 }, { "epoch": 8.967302452316076, "grad_norm": 0.13568557054574895, "learning_rate": 5.542642600237791e-07, "loss": 0.0003, "step": 6582 }, { "epoch": 8.96866485013624, "grad_norm": 0.20984904744982458, "learning_rate": 5.528164029344751e-07, "loss": 0.0023, "step": 6583 }, { "epoch": 8.970027247956404, "grad_norm": 0.10351282497521626, "learning_rate": 5.513703856176112e-07, "loss": 0.0004, "step": 6584 }, { "epoch": 8.971389645776567, "grad_norm": 0.28109443342048773, "learning_rate": 5.499262083547807e-07, "loss": 0.0009, "step": 6585 }, { "epoch": 8.97275204359673, "grad_norm": 0.22777855609226247, "learning_rate": 5.484838714272355e-07, "loss": 0.0008, "step": 6586 }, { "epoch": 8.974114441416894, "grad_norm": 0.15535797393663844, "learning_rate": 5.470433751158577e-07, "loss": 0.0004, "step": 6587 }, { "epoch": 8.975476839237057, "grad_norm": 0.15538370375825136, "learning_rate": 5.456047197011715e-07, "loss": 0.0004, "step": 6588 }, { "epoch": 8.97683923705722, "grad_norm": 0.2243596626088338, "learning_rate": 5.441679054633475e-07, "loss": 0.0005, "step": 6589 }, { "epoch": 8.978201634877385, "grad_norm": 1.9147664401408537, "learning_rate": 5.427329326821906e-07, "loss": 0.0051, "step": 6590 }, { "epoch": 8.979564032697548, "grad_norm": 0.26844653256166523, "learning_rate": 5.412998016371596e-07, "loss": 0.0005, "step": 6591 }, { "epoch": 8.98092643051771, "grad_norm": 0.8930676382844132, "learning_rate": 5.3986851260734e-07, "loss": 0.0071, "step": 6592 }, { "epoch": 8.982288828337875, "grad_norm": 0.42579858296509826, "learning_rate": 5.384390658714688e-07, "loss": 0.0077, "step": 6593 }, { "epoch": 8.983651226158038, "grad_norm": 0.20857463027414963, "learning_rate": 5.370114617079192e-07, "loss": 0.0017, "step": 6594 }, { "epoch": 8.985013623978201, "grad_norm": 0.24616009792603313, "learning_rate": 5.355857003947062e-07, "loss": 0.0005, "step": 6595 }, { "epoch": 8.986376021798366, "grad_norm": 0.29576549067580743, "learning_rate": 5.341617822094869e-07, "loss": 0.0005, "step": 6596 }, { "epoch": 8.987738419618529, "grad_norm": 0.20772576982465976, "learning_rate": 5.327397074295615e-07, "loss": 0.0003, "step": 6597 }, { "epoch": 8.989100817438691, "grad_norm": 0.9058681474498619, "learning_rate": 5.313194763318685e-07, "loss": 0.0081, "step": 6598 }, { "epoch": 8.990463215258856, "grad_norm": 0.24403611279604226, "learning_rate": 5.299010891929857e-07, "loss": 0.0074, "step": 6599 }, { "epoch": 8.991825613079019, "grad_norm": 0.2327696453209477, "learning_rate": 5.284845462891363e-07, "loss": 0.0004, "step": 6600 }, { "epoch": 8.993188010899182, "grad_norm": 0.3113130082708755, "learning_rate": 5.270698478961822e-07, "loss": 0.0075, "step": 6601 }, { "epoch": 8.994550408719347, "grad_norm": 0.20677939399523143, "learning_rate": 5.256569942896217e-07, "loss": 0.0004, "step": 6602 }, { "epoch": 8.99591280653951, "grad_norm": 0.36567183184389646, "learning_rate": 5.242459857446047e-07, "loss": 0.0079, "step": 6603 }, { "epoch": 8.997275204359672, "grad_norm": 0.12686368052872266, "learning_rate": 5.228368225359104e-07, "loss": 0.0005, "step": 6604 }, { "epoch": 8.998637602179837, "grad_norm": 0.5255698898893854, "learning_rate": 5.214295049379658e-07, "loss": 0.0152, "step": 6605 }, { "epoch": 9.0, "grad_norm": 0.31815958747879314, "learning_rate": 5.200240332248352e-07, "loss": 0.0004, "step": 6606 }, { "epoch": 9.0, "eval_accuracy": 0.9427288040426727, "eval_f1": 0.9328668840996096, "eval_loss": 0.14483381807804108, "eval_precision": 0.9252303322120119, "eval_recall": 0.9445078980959473, "eval_runtime": 17.4191, "eval_samples_per_second": 102.244, "eval_steps_per_second": 0.804, "step": 6606 }, { "epoch": 9.001362397820163, "grad_norm": 0.3075476908360871, "learning_rate": 5.186204076702228e-07, "loss": 0.0004, "step": 6607 }, { "epoch": 9.002724795640328, "grad_norm": 0.487776469887706, "learning_rate": 5.172186285474756e-07, "loss": 0.0006, "step": 6608 }, { "epoch": 9.00408719346049, "grad_norm": 1.859605533108128, "learning_rate": 5.158186961295819e-07, "loss": 0.0028, "step": 6609 }, { "epoch": 9.005449591280653, "grad_norm": 0.38160501574919214, "learning_rate": 5.14420610689168e-07, "loss": 0.015, "step": 6610 }, { "epoch": 9.006811989100818, "grad_norm": 0.12221090973742578, "learning_rate": 5.130243724984995e-07, "loss": 0.0004, "step": 6611 }, { "epoch": 9.008174386920981, "grad_norm": 1.4040394814669932, "learning_rate": 5.116299818294868e-07, "loss": 0.0031, "step": 6612 }, { "epoch": 9.009536784741144, "grad_norm": 0.5519098608864599, "learning_rate": 5.102374389536768e-07, "loss": 0.0005, "step": 6613 }, { "epoch": 9.010899182561309, "grad_norm": 1.2056978197600852, "learning_rate": 5.088467441422529e-07, "loss": 0.0142, "step": 6614 }, { "epoch": 9.012261580381471, "grad_norm": 1.1421308484134927, "learning_rate": 5.074578976660504e-07, "loss": 0.0095, "step": 6615 }, { "epoch": 9.013623978201634, "grad_norm": 0.17463715343062783, "learning_rate": 5.060708997955322e-07, "loss": 0.0007, "step": 6616 }, { "epoch": 9.014986376021799, "grad_norm": 0.12903158080915678, "learning_rate": 5.046857508008085e-07, "loss": 0.0004, "step": 6617 }, { "epoch": 9.016348773841962, "grad_norm": 1.6490125968443323, "learning_rate": 5.033024509516282e-07, "loss": 0.0025, "step": 6618 }, { "epoch": 9.017711171662125, "grad_norm": 0.40888742987940035, "learning_rate": 5.019210005173747e-07, "loss": 0.0006, "step": 6619 }, { "epoch": 9.01907356948229, "grad_norm": 0.1953936019796425, "learning_rate": 5.005413997670816e-07, "loss": 0.0077, "step": 6620 }, { "epoch": 9.020435967302452, "grad_norm": 0.6794302454111951, "learning_rate": 4.991636489694118e-07, "loss": 0.0128, "step": 6621 }, { "epoch": 9.021798365122615, "grad_norm": 0.7077656881691828, "learning_rate": 4.977877483926763e-07, "loss": 0.0027, "step": 6622 }, { "epoch": 9.02316076294278, "grad_norm": 0.7544800036721783, "learning_rate": 4.964136983048184e-07, "loss": 0.0052, "step": 6623 }, { "epoch": 9.024523160762943, "grad_norm": 0.41514954246464475, "learning_rate": 4.950414989734287e-07, "loss": 0.0005, "step": 6624 }, { "epoch": 9.025885558583106, "grad_norm": 0.44923670447707703, "learning_rate": 4.936711506657288e-07, "loss": 0.0012, "step": 6625 }, { "epoch": 9.02724795640327, "grad_norm": 0.35644197607945866, "learning_rate": 4.923026536485876e-07, "loss": 0.0005, "step": 6626 }, { "epoch": 9.028610354223433, "grad_norm": 0.3899451784218227, "learning_rate": 4.909360081885106e-07, "loss": 0.0013, "step": 6627 }, { "epoch": 9.029972752043596, "grad_norm": 0.146689608275842, "learning_rate": 4.895712145516385e-07, "loss": 0.0005, "step": 6628 }, { "epoch": 9.03133514986376, "grad_norm": 0.3191294424001176, "learning_rate": 4.882082730037607e-07, "loss": 0.0003, "step": 6629 }, { "epoch": 9.032697547683924, "grad_norm": 0.3069613793497811, "learning_rate": 4.868471838102972e-07, "loss": 0.0009, "step": 6630 }, { "epoch": 9.034059945504087, "grad_norm": 0.36780261586851476, "learning_rate": 4.854879472363083e-07, "loss": 0.0007, "step": 6631 }, { "epoch": 9.035422343324251, "grad_norm": 0.4602492056683926, "learning_rate": 4.84130563546501e-07, "loss": 0.0081, "step": 6632 }, { "epoch": 9.036784741144414, "grad_norm": 0.29048558010999576, "learning_rate": 4.827750330052117e-07, "loss": 0.0003, "step": 6633 }, { "epoch": 9.038147138964577, "grad_norm": 1.3185522403522518, "learning_rate": 4.814213558764247e-07, "loss": 0.0028, "step": 6634 }, { "epoch": 9.039509536784742, "grad_norm": 0.09894365420858048, "learning_rate": 4.800695324237548e-07, "loss": 0.0004, "step": 6635 }, { "epoch": 9.040871934604905, "grad_norm": 0.3367930753891973, "learning_rate": 4.787195629104635e-07, "loss": 0.0059, "step": 6636 }, { "epoch": 9.042234332425068, "grad_norm": 0.20683110791139503, "learning_rate": 4.77371447599444e-07, "loss": 0.0004, "step": 6637 }, { "epoch": 9.043596730245232, "grad_norm": 0.12984298684501502, "learning_rate": 4.760251867532362e-07, "loss": 0.0004, "step": 6638 }, { "epoch": 9.044959128065395, "grad_norm": 0.08976011206593458, "learning_rate": 4.746807806340137e-07, "loss": 0.0003, "step": 6639 }, { "epoch": 9.046321525885558, "grad_norm": 0.13763585571997392, "learning_rate": 4.733382295035882e-07, "loss": 0.0022, "step": 6640 }, { "epoch": 9.047683923705723, "grad_norm": 0.18399576044319318, "learning_rate": 4.719975336234162e-07, "loss": 0.0079, "step": 6641 }, { "epoch": 9.049046321525886, "grad_norm": 0.24508823810365737, "learning_rate": 4.7065869325458536e-07, "loss": 0.0006, "step": 6642 }, { "epoch": 9.050408719346049, "grad_norm": 0.2578320667271478, "learning_rate": 4.693217086578239e-07, "loss": 0.0003, "step": 6643 }, { "epoch": 9.051771117166213, "grad_norm": 0.2574603742392622, "learning_rate": 4.6798658009350463e-07, "loss": 0.0003, "step": 6644 }, { "epoch": 9.053133514986376, "grad_norm": 0.5630227361390987, "learning_rate": 4.6665330782163174e-07, "loss": 0.0008, "step": 6645 }, { "epoch": 9.054495912806539, "grad_norm": 0.065309064911949, "learning_rate": 4.653218921018521e-07, "loss": 0.0004, "step": 6646 }, { "epoch": 9.055858310626704, "grad_norm": 0.16333184735171052, "learning_rate": 4.6399233319344703e-07, "loss": 0.0003, "step": 6647 }, { "epoch": 9.057220708446867, "grad_norm": 0.29964145997124975, "learning_rate": 4.626646313553396e-07, "loss": 0.0084, "step": 6648 }, { "epoch": 9.05858310626703, "grad_norm": 0.06569014441504042, "learning_rate": 4.613387868460939e-07, "loss": 0.0003, "step": 6649 }, { "epoch": 9.059945504087194, "grad_norm": 1.6531973190028553, "learning_rate": 4.600147999239035e-07, "loss": 0.0049, "step": 6650 }, { "epoch": 9.061307901907357, "grad_norm": 0.785347965201821, "learning_rate": 4.5869267084661106e-07, "loss": 0.001, "step": 6651 }, { "epoch": 9.06267029972752, "grad_norm": 1.9857900467106004, "learning_rate": 4.5737239987168615e-07, "loss": 0.0219, "step": 6652 }, { "epoch": 9.064032697547685, "grad_norm": 0.5738311539613224, "learning_rate": 4.560539872562464e-07, "loss": 0.0029, "step": 6653 }, { "epoch": 9.065395095367847, "grad_norm": 0.2811045504535072, "learning_rate": 4.547374332570409e-07, "loss": 0.0009, "step": 6654 }, { "epoch": 9.06675749318801, "grad_norm": 0.06680258907560863, "learning_rate": 4.534227381304601e-07, "loss": 0.0003, "step": 6655 }, { "epoch": 9.068119891008175, "grad_norm": 0.10374407369213454, "learning_rate": 4.521099021325337e-07, "loss": 0.0003, "step": 6656 }, { "epoch": 9.069482288828338, "grad_norm": 0.13221415097074793, "learning_rate": 4.507989255189238e-07, "loss": 0.0007, "step": 6657 }, { "epoch": 9.0708446866485, "grad_norm": 1.109837084510873, "learning_rate": 4.494898085449362e-07, "loss": 0.0117, "step": 6658 }, { "epoch": 9.072207084468666, "grad_norm": 0.8479256174305679, "learning_rate": 4.481825514655114e-07, "loss": 0.0096, "step": 6659 }, { "epoch": 9.073569482288828, "grad_norm": 0.43357114151846005, "learning_rate": 4.4687715453522793e-07, "loss": 0.0014, "step": 6660 }, { "epoch": 9.074931880108991, "grad_norm": 0.3423024153631204, "learning_rate": 4.4557361800830366e-07, "loss": 0.0084, "step": 6661 }, { "epoch": 9.076294277929156, "grad_norm": 0.5265828866221698, "learning_rate": 4.4427194213859216e-07, "loss": 0.0009, "step": 6662 }, { "epoch": 9.077656675749319, "grad_norm": 0.34290899928183527, "learning_rate": 4.429721271795884e-07, "loss": 0.0009, "step": 6663 }, { "epoch": 9.079019073569482, "grad_norm": 0.3124552229293279, "learning_rate": 4.4167417338441765e-07, "loss": 0.0004, "step": 6664 }, { "epoch": 9.080381471389646, "grad_norm": 0.3007690937407284, "learning_rate": 4.403780810058511e-07, "loss": 0.0021, "step": 6665 }, { "epoch": 9.08174386920981, "grad_norm": 0.060759470711726556, "learning_rate": 4.390838502962913e-07, "loss": 0.0003, "step": 6666 }, { "epoch": 9.083106267029972, "grad_norm": 0.9089052275080428, "learning_rate": 4.3779148150778104e-07, "loss": 0.0074, "step": 6667 }, { "epoch": 9.084468664850137, "grad_norm": 0.4101003530806512, "learning_rate": 4.3650097489200125e-07, "loss": 0.0079, "step": 6668 }, { "epoch": 9.0858310626703, "grad_norm": 0.39535654464367975, "learning_rate": 4.352123307002676e-07, "loss": 0.0021, "step": 6669 }, { "epoch": 9.087193460490463, "grad_norm": 0.28128260983145814, "learning_rate": 4.339255491835359e-07, "loss": 0.0004, "step": 6670 }, { "epoch": 9.088555858310627, "grad_norm": 0.6420104099345724, "learning_rate": 4.326406305923958e-07, "loss": 0.0086, "step": 6671 }, { "epoch": 9.08991825613079, "grad_norm": 0.3950572332094485, "learning_rate": 4.313575751770771e-07, "loss": 0.0004, "step": 6672 }, { "epoch": 9.091280653950953, "grad_norm": 1.4309445532669633, "learning_rate": 4.300763831874477e-07, "loss": 0.014, "step": 6673 }, { "epoch": 9.092643051771118, "grad_norm": 0.20592212155946002, "learning_rate": 4.2879705487300693e-07, "loss": 0.001, "step": 6674 }, { "epoch": 9.09400544959128, "grad_norm": 0.0723082204069369, "learning_rate": 4.2751959048289883e-07, "loss": 0.0003, "step": 6675 }, { "epoch": 9.095367847411444, "grad_norm": 0.21721580206114102, "learning_rate": 4.2624399026589656e-07, "loss": 0.0006, "step": 6676 }, { "epoch": 9.096730245231608, "grad_norm": 0.08224862839808453, "learning_rate": 4.2497025447041705e-07, "loss": 0.0003, "step": 6677 }, { "epoch": 9.098092643051771, "grad_norm": 0.36117656165011397, "learning_rate": 4.2369838334451183e-07, "loss": 0.0006, "step": 6678 }, { "epoch": 9.099455040871934, "grad_norm": 0.69776891670473, "learning_rate": 4.2242837713586725e-07, "loss": 0.0062, "step": 6679 }, { "epoch": 9.100817438692099, "grad_norm": 0.41166328849095557, "learning_rate": 4.211602360918099e-07, "loss": 0.0005, "step": 6680 }, { "epoch": 9.102179836512262, "grad_norm": 0.23208523704627457, "learning_rate": 4.1989396045929774e-07, "loss": 0.0073, "step": 6681 }, { "epoch": 9.103542234332425, "grad_norm": 0.37075804006951757, "learning_rate": 4.186295504849336e-07, "loss": 0.0004, "step": 6682 }, { "epoch": 9.10490463215259, "grad_norm": 0.35222554714309506, "learning_rate": 4.173670064149482e-07, "loss": 0.0004, "step": 6683 }, { "epoch": 9.106267029972752, "grad_norm": 0.2531723736017878, "learning_rate": 4.161063284952149e-07, "loss": 0.0004, "step": 6684 }, { "epoch": 9.107629427792915, "grad_norm": 0.4110762480900821, "learning_rate": 4.1484751697124406e-07, "loss": 0.0075, "step": 6685 }, { "epoch": 9.10899182561308, "grad_norm": 1.473274211804405, "learning_rate": 4.1359057208817613e-07, "loss": 0.0041, "step": 6686 }, { "epoch": 9.110354223433243, "grad_norm": 0.3434554498169647, "learning_rate": 4.123354940907953e-07, "loss": 0.0075, "step": 6687 }, { "epoch": 9.111716621253406, "grad_norm": 0.21450746321933167, "learning_rate": 4.110822832235184e-07, "loss": 0.0005, "step": 6688 }, { "epoch": 9.11307901907357, "grad_norm": 0.2637624769709021, "learning_rate": 4.098309397303979e-07, "loss": 0.0006, "step": 6689 }, { "epoch": 9.114441416893733, "grad_norm": 0.23928995333998826, "learning_rate": 4.0858146385512776e-07, "loss": 0.0003, "step": 6690 }, { "epoch": 9.115803814713896, "grad_norm": 0.13507232476926676, "learning_rate": 4.0733385584103e-07, "loss": 0.0003, "step": 6691 }, { "epoch": 9.11716621253406, "grad_norm": 0.18639363489909844, "learning_rate": 4.0608811593107254e-07, "loss": 0.0005, "step": 6692 }, { "epoch": 9.118528610354224, "grad_norm": 0.6543894723957103, "learning_rate": 4.048442443678491e-07, "loss": 0.0111, "step": 6693 }, { "epoch": 9.119891008174386, "grad_norm": 0.2907629185204874, "learning_rate": 4.036022413936003e-07, "loss": 0.0077, "step": 6694 }, { "epoch": 9.121253405994551, "grad_norm": 0.5051000278107888, "learning_rate": 4.0236210725019265e-07, "loss": 0.0014, "step": 6695 }, { "epoch": 9.122615803814714, "grad_norm": 0.0941711181476899, "learning_rate": 4.0112384217913523e-07, "loss": 0.0004, "step": 6696 }, { "epoch": 9.123978201634877, "grad_norm": 0.24672452698710917, "learning_rate": 3.9988744642157297e-07, "loss": 0.0005, "step": 6697 }, { "epoch": 9.125340599455042, "grad_norm": 0.31225576424706786, "learning_rate": 3.986529202182832e-07, "loss": 0.0006, "step": 6698 }, { "epoch": 9.126702997275205, "grad_norm": 0.2731761784034183, "learning_rate": 3.974202638096836e-07, "loss": 0.0004, "step": 6699 }, { "epoch": 9.128065395095367, "grad_norm": 0.07858178586827871, "learning_rate": 3.9618947743582213e-07, "loss": 0.0002, "step": 6700 }, { "epoch": 9.129427792915532, "grad_norm": 1.1990206827641294, "learning_rate": 3.949605613363883e-07, "loss": 0.0097, "step": 6701 }, { "epoch": 9.130790190735695, "grad_norm": 1.0266978693260658, "learning_rate": 3.937335157507038e-07, "loss": 0.0015, "step": 6702 }, { "epoch": 9.132152588555858, "grad_norm": 0.40111785010635304, "learning_rate": 3.925083409177266e-07, "loss": 0.0022, "step": 6703 }, { "epoch": 9.133514986376023, "grad_norm": 0.5689747640397399, "learning_rate": 3.912850370760535e-07, "loss": 0.0084, "step": 6704 }, { "epoch": 9.134877384196185, "grad_norm": 0.224909695583874, "learning_rate": 3.9006360446391056e-07, "loss": 0.0003, "step": 6705 }, { "epoch": 9.136239782016348, "grad_norm": 0.4606419638452805, "learning_rate": 3.888440433191654e-07, "loss": 0.0022, "step": 6706 }, { "epoch": 9.137602179836513, "grad_norm": 0.16432499154947275, "learning_rate": 3.8762635387931903e-07, "loss": 0.0004, "step": 6707 }, { "epoch": 9.138964577656676, "grad_norm": 0.24881599915216035, "learning_rate": 3.8641053638150625e-07, "loss": 0.0029, "step": 6708 }, { "epoch": 9.140326975476839, "grad_norm": 2.399234297528613, "learning_rate": 3.851965910625022e-07, "loss": 0.0023, "step": 6709 }, { "epoch": 9.141689373297003, "grad_norm": 1.4367753290048386, "learning_rate": 3.8398451815870984e-07, "loss": 0.0061, "step": 6710 }, { "epoch": 9.143051771117166, "grad_norm": 0.09716923156032409, "learning_rate": 3.8277431790617604e-07, "loss": 0.0004, "step": 6711 }, { "epoch": 9.14441416893733, "grad_norm": 0.09026625172686983, "learning_rate": 3.8156599054057553e-07, "loss": 0.0003, "step": 6712 }, { "epoch": 9.145776566757494, "grad_norm": 0.2707943404979169, "learning_rate": 3.8035953629722234e-07, "loss": 0.0009, "step": 6713 }, { "epoch": 9.147138964577657, "grad_norm": 0.43577822568228963, "learning_rate": 3.791549554110663e-07, "loss": 0.0025, "step": 6714 }, { "epoch": 9.14850136239782, "grad_norm": 0.5004675434579247, "learning_rate": 3.779522481166897e-07, "loss": 0.0078, "step": 6715 }, { "epoch": 9.149863760217984, "grad_norm": 0.8291583384930106, "learning_rate": 3.76751414648312e-07, "loss": 0.0013, "step": 6716 }, { "epoch": 9.151226158038147, "grad_norm": 0.32073628967674467, "learning_rate": 3.755524552397871e-07, "loss": 0.0033, "step": 6717 }, { "epoch": 9.15258855585831, "grad_norm": 0.10438552342716527, "learning_rate": 3.743553701246028e-07, "loss": 0.0003, "step": 6718 }, { "epoch": 9.153950953678475, "grad_norm": 0.08184213526373398, "learning_rate": 3.731601595358847e-07, "loss": 0.0003, "step": 6719 }, { "epoch": 9.155313351498638, "grad_norm": 0.5025141181217234, "learning_rate": 3.7196682370639004e-07, "loss": 0.0174, "step": 6720 }, { "epoch": 9.1566757493188, "grad_norm": 0.31984153344939703, "learning_rate": 3.70775362868514e-07, "loss": 0.0008, "step": 6721 }, { "epoch": 9.158038147138965, "grad_norm": 0.21316809266188014, "learning_rate": 3.6958577725428437e-07, "loss": 0.0021, "step": 6722 }, { "epoch": 9.159400544959128, "grad_norm": 0.21099931547144085, "learning_rate": 3.6839806709536465e-07, "loss": 0.0006, "step": 6723 }, { "epoch": 9.160762942779291, "grad_norm": 0.5643198374049989, "learning_rate": 3.6721223262305204e-07, "loss": 0.0084, "step": 6724 }, { "epoch": 9.162125340599456, "grad_norm": 0.24213829313509808, "learning_rate": 3.660282740682808e-07, "loss": 0.0007, "step": 6725 }, { "epoch": 9.163487738419619, "grad_norm": 0.4373874475133388, "learning_rate": 3.648461916616186e-07, "loss": 0.0078, "step": 6726 }, { "epoch": 9.164850136239782, "grad_norm": 0.15941632845201104, "learning_rate": 3.6366598563326695e-07, "loss": 0.0004, "step": 6727 }, { "epoch": 9.166212534059946, "grad_norm": 0.1785678703421276, "learning_rate": 3.6248765621306413e-07, "loss": 0.0006, "step": 6728 }, { "epoch": 9.16757493188011, "grad_norm": 0.29547802394074496, "learning_rate": 3.6131120363047777e-07, "loss": 0.0005, "step": 6729 }, { "epoch": 9.168937329700272, "grad_norm": 0.19220877643980008, "learning_rate": 3.601366281146179e-07, "loss": 0.0004, "step": 6730 }, { "epoch": 9.170299727520437, "grad_norm": 0.7728430067004421, "learning_rate": 3.589639298942238e-07, "loss": 0.0148, "step": 6731 }, { "epoch": 9.1716621253406, "grad_norm": 0.15337662493889462, "learning_rate": 3.577931091976683e-07, "loss": 0.0003, "step": 6732 }, { "epoch": 9.173024523160763, "grad_norm": 0.08852008759279134, "learning_rate": 3.5662416625296236e-07, "loss": 0.0003, "step": 6733 }, { "epoch": 9.174386920980927, "grad_norm": 0.4662670873497476, "learning_rate": 3.5545710128774837e-07, "loss": 0.0167, "step": 6734 }, { "epoch": 9.17574931880109, "grad_norm": 0.3067006060773778, "learning_rate": 3.5429191452930336e-07, "loss": 0.0003, "step": 6735 }, { "epoch": 9.177111716621253, "grad_norm": 0.17520967363165107, "learning_rate": 3.531286062045425e-07, "loss": 0.0004, "step": 6736 }, { "epoch": 9.178474114441418, "grad_norm": 2.1418188317562383, "learning_rate": 3.519671765400079e-07, "loss": 0.0024, "step": 6737 }, { "epoch": 9.17983651226158, "grad_norm": 0.27612514519698317, "learning_rate": 3.5080762576188307e-07, "loss": 0.0003, "step": 6738 }, { "epoch": 9.181198910081743, "grad_norm": 0.2719483904033552, "learning_rate": 3.496499540959786e-07, "loss": 0.0003, "step": 6739 }, { "epoch": 9.182561307901908, "grad_norm": 0.858839882004747, "learning_rate": 3.484941617677473e-07, "loss": 0.01, "step": 6740 }, { "epoch": 9.183923705722071, "grad_norm": 0.9484081774456877, "learning_rate": 3.47340249002267e-07, "loss": 0.0072, "step": 6741 }, { "epoch": 9.185286103542234, "grad_norm": 0.14670288667574294, "learning_rate": 3.461882160242591e-07, "loss": 0.0003, "step": 6742 }, { "epoch": 9.186648501362399, "grad_norm": 1.01678194664916, "learning_rate": 3.4503806305807076e-07, "loss": 0.0246, "step": 6743 }, { "epoch": 9.188010899182562, "grad_norm": 0.719432844827751, "learning_rate": 3.438897903276861e-07, "loss": 0.0069, "step": 6744 }, { "epoch": 9.189373297002724, "grad_norm": 1.0931109160907755, "learning_rate": 3.4274339805672405e-07, "loss": 0.0091, "step": 6745 }, { "epoch": 9.190735694822889, "grad_norm": 0.18808938632384353, "learning_rate": 3.4159888646843497e-07, "loss": 0.0005, "step": 6746 }, { "epoch": 9.192098092643052, "grad_norm": 0.18473721515419894, "learning_rate": 3.4045625578570605e-07, "loss": 0.0078, "step": 6747 }, { "epoch": 9.193460490463215, "grad_norm": 0.37941326803891917, "learning_rate": 3.3931550623105714e-07, "loss": 0.0018, "step": 6748 }, { "epoch": 9.19482288828338, "grad_norm": 1.4075520948565963, "learning_rate": 3.3817663802663935e-07, "loss": 0.0297, "step": 6749 }, { "epoch": 9.196185286103542, "grad_norm": 0.13627774939998982, "learning_rate": 3.370396513942409e-07, "loss": 0.0004, "step": 6750 }, { "epoch": 9.197547683923705, "grad_norm": 0.14229513569591795, "learning_rate": 3.3590454655528013e-07, "loss": 0.0004, "step": 6751 }, { "epoch": 9.19891008174387, "grad_norm": 0.1787615392186156, "learning_rate": 3.3477132373081257e-07, "loss": 0.0017, "step": 6752 }, { "epoch": 9.200272479564033, "grad_norm": 0.11698362342231994, "learning_rate": 3.336399831415216e-07, "loss": 0.0003, "step": 6753 }, { "epoch": 9.201634877384196, "grad_norm": 1.6893591246866277, "learning_rate": 3.325105250077332e-07, "loss": 0.0072, "step": 6754 }, { "epoch": 9.20299727520436, "grad_norm": 0.23640847294398706, "learning_rate": 3.313829495493992e-07, "loss": 0.0005, "step": 6755 }, { "epoch": 9.204359673024523, "grad_norm": 0.24943476080854243, "learning_rate": 3.302572569861051e-07, "loss": 0.004, "step": 6756 }, { "epoch": 9.205722070844686, "grad_norm": 0.4520517691649368, "learning_rate": 3.291334475370744e-07, "loss": 0.0026, "step": 6757 }, { "epoch": 9.207084468664851, "grad_norm": 0.36049193209277997, "learning_rate": 3.2801152142115764e-07, "loss": 0.0008, "step": 6758 }, { "epoch": 9.208446866485014, "grad_norm": 0.29568141619090466, "learning_rate": 3.2689147885684667e-07, "loss": 0.0005, "step": 6759 }, { "epoch": 9.209809264305177, "grad_norm": 0.1053484702433027, "learning_rate": 3.2577332006225926e-07, "loss": 0.0004, "step": 6760 }, { "epoch": 9.211171662125341, "grad_norm": 0.27958351534514503, "learning_rate": 3.2465704525514784e-07, "loss": 0.0005, "step": 6761 }, { "epoch": 9.212534059945504, "grad_norm": 0.4551164454768499, "learning_rate": 3.235426546529008e-07, "loss": 0.0174, "step": 6762 }, { "epoch": 9.213896457765667, "grad_norm": 0.7939024358149764, "learning_rate": 3.224301484725367e-07, "loss": 0.0154, "step": 6763 }, { "epoch": 9.215258855585832, "grad_norm": 0.10598897059392234, "learning_rate": 3.21319526930709e-07, "loss": 0.0004, "step": 6764 }, { "epoch": 9.216621253405995, "grad_norm": 0.17181388352911497, "learning_rate": 3.202107902437035e-07, "loss": 0.0004, "step": 6765 }, { "epoch": 9.217983651226158, "grad_norm": 0.8343251860829225, "learning_rate": 3.191039386274397e-07, "loss": 0.0063, "step": 6766 }, { "epoch": 9.219346049046322, "grad_norm": 0.6000365322683133, "learning_rate": 3.179989722974686e-07, "loss": 0.02, "step": 6767 }, { "epoch": 9.220708446866485, "grad_norm": 0.45453698161648587, "learning_rate": 3.1689589146897235e-07, "loss": 0.0062, "step": 6768 }, { "epoch": 9.222070844686648, "grad_norm": 1.3793863882148607, "learning_rate": 3.1579469635677153e-07, "loss": 0.0069, "step": 6769 }, { "epoch": 9.223433242506813, "grad_norm": 1.1318584280630124, "learning_rate": 3.1469538717531224e-07, "loss": 0.0044, "step": 6770 }, { "epoch": 9.224795640326976, "grad_norm": 0.7834570822937821, "learning_rate": 3.1359796413868214e-07, "loss": 0.0011, "step": 6771 }, { "epoch": 9.226158038147139, "grad_norm": 0.2502911786282327, "learning_rate": 3.125024274605926e-07, "loss": 0.0078, "step": 6772 }, { "epoch": 9.227520435967303, "grad_norm": 0.20370444571658275, "learning_rate": 3.114087773543939e-07, "loss": 0.0082, "step": 6773 }, { "epoch": 9.228882833787466, "grad_norm": 0.19722905270424113, "learning_rate": 3.103170140330658e-07, "loss": 0.0004, "step": 6774 }, { "epoch": 9.230245231607629, "grad_norm": 0.08227620312582397, "learning_rate": 3.0922713770922155e-07, "loss": 0.0003, "step": 6775 }, { "epoch": 9.231607629427794, "grad_norm": 0.8944372232388268, "learning_rate": 3.0813914859510575e-07, "loss": 0.0106, "step": 6776 }, { "epoch": 9.232970027247957, "grad_norm": 0.10790502608139457, "learning_rate": 3.0705304690259786e-07, "loss": 0.0005, "step": 6777 }, { "epoch": 9.23433242506812, "grad_norm": 0.20638452493154566, "learning_rate": 3.059688328432109e-07, "loss": 0.0003, "step": 6778 }, { "epoch": 9.235694822888284, "grad_norm": 0.8660406091732887, "learning_rate": 3.0488650662808483e-07, "loss": 0.0024, "step": 6779 }, { "epoch": 9.237057220708447, "grad_norm": 1.7140970121559793, "learning_rate": 3.0380606846799443e-07, "loss": 0.0042, "step": 6780 }, { "epoch": 9.23841961852861, "grad_norm": 0.11913783692722944, "learning_rate": 3.0272751857334915e-07, "loss": 0.0004, "step": 6781 }, { "epoch": 9.239782016348773, "grad_norm": 0.09224438272954769, "learning_rate": 3.016508571541876e-07, "loss": 0.0019, "step": 6782 }, { "epoch": 9.241144414168938, "grad_norm": 0.4004230664799455, "learning_rate": 3.005760844201844e-07, "loss": 0.0022, "step": 6783 }, { "epoch": 9.2425068119891, "grad_norm": 0.7205088066272514, "learning_rate": 2.9950320058064195e-07, "loss": 0.0037, "step": 6784 }, { "epoch": 9.243869209809265, "grad_norm": 0.39891749232256696, "learning_rate": 2.9843220584449774e-07, "loss": 0.0076, "step": 6785 }, { "epoch": 9.245231607629428, "grad_norm": 0.08611212428334244, "learning_rate": 2.9736310042032033e-07, "loss": 0.0003, "step": 6786 }, { "epoch": 9.246594005449591, "grad_norm": 0.19460404367438447, "learning_rate": 2.9629588451630885e-07, "loss": 0.0005, "step": 6787 }, { "epoch": 9.247956403269754, "grad_norm": 0.2655815617563681, "learning_rate": 2.952305583402981e-07, "loss": 0.0011, "step": 6788 }, { "epoch": 9.249318801089919, "grad_norm": 0.25002373885812, "learning_rate": 2.941671220997522e-07, "loss": 0.0011, "step": 6789 }, { "epoch": 9.250681198910081, "grad_norm": 0.1142078417907873, "learning_rate": 2.931055760017687e-07, "loss": 0.0003, "step": 6790 }, { "epoch": 9.252043596730246, "grad_norm": 0.11099935281898718, "learning_rate": 2.9204592025307565e-07, "loss": 0.0004, "step": 6791 }, { "epoch": 9.253405994550409, "grad_norm": 0.4965255570696321, "learning_rate": 2.909881550600324e-07, "loss": 0.0062, "step": 6792 }, { "epoch": 9.254768392370572, "grad_norm": 0.27008836916975404, "learning_rate": 2.89932280628632e-07, "loss": 0.0009, "step": 6793 }, { "epoch": 9.256130790190735, "grad_norm": 0.1461648678439068, "learning_rate": 2.8887829716449877e-07, "loss": 0.0005, "step": 6794 }, { "epoch": 9.2574931880109, "grad_norm": 0.0883859966660126, "learning_rate": 2.8782620487288857e-07, "loss": 0.002, "step": 6795 }, { "epoch": 9.258855585831062, "grad_norm": 1.2609023328274989, "learning_rate": 2.8677600395868866e-07, "loss": 0.0054, "step": 6796 }, { "epoch": 9.260217983651227, "grad_norm": 0.7317630094297679, "learning_rate": 2.857276946264198e-07, "loss": 0.0057, "step": 6797 }, { "epoch": 9.26158038147139, "grad_norm": 0.10747918222122485, "learning_rate": 2.8468127708023097e-07, "loss": 0.0003, "step": 6798 }, { "epoch": 9.262942779291553, "grad_norm": 0.30674735256011904, "learning_rate": 2.8363675152390357e-07, "loss": 0.0004, "step": 6799 }, { "epoch": 9.264305177111716, "grad_norm": 0.3254782601939161, "learning_rate": 2.825941181608549e-07, "loss": 0.0004, "step": 6800 }, { "epoch": 9.26566757493188, "grad_norm": 1.0262826368870102, "learning_rate": 2.8155337719412814e-07, "loss": 0.0009, "step": 6801 }, { "epoch": 9.267029972752043, "grad_norm": 0.5892305290320792, "learning_rate": 2.8051452882640105e-07, "loss": 0.0156, "step": 6802 }, { "epoch": 9.268392370572208, "grad_norm": 0.09746741430802813, "learning_rate": 2.7947757325998305e-07, "loss": 0.0003, "step": 6803 }, { "epoch": 9.269754768392371, "grad_norm": 0.3609087414887776, "learning_rate": 2.784425106968114e-07, "loss": 0.0081, "step": 6804 }, { "epoch": 9.271117166212534, "grad_norm": 1.8145317142779964, "learning_rate": 2.774093413384582e-07, "loss": 0.0161, "step": 6805 }, { "epoch": 9.272479564032697, "grad_norm": 1.0742013205814835, "learning_rate": 2.7637806538612586e-07, "loss": 0.0053, "step": 6806 }, { "epoch": 9.273841961852861, "grad_norm": 0.8871835569185532, "learning_rate": 2.753486830406504e-07, "loss": 0.0192, "step": 6807 }, { "epoch": 9.275204359673024, "grad_norm": 0.3045034476293669, "learning_rate": 2.7432119450249375e-07, "loss": 0.0006, "step": 6808 }, { "epoch": 9.276566757493187, "grad_norm": 1.2492316371115089, "learning_rate": 2.732955999717546e-07, "loss": 0.0046, "step": 6809 }, { "epoch": 9.277929155313352, "grad_norm": 0.1146465086861243, "learning_rate": 2.722718996481566e-07, "loss": 0.0007, "step": 6810 }, { "epoch": 9.279291553133515, "grad_norm": 0.5650228783637046, "learning_rate": 2.712500937310614e-07, "loss": 0.0064, "step": 6811 }, { "epoch": 9.280653950953678, "grad_norm": 0.5008934172022121, "learning_rate": 2.7023018241945863e-07, "loss": 0.0019, "step": 6812 }, { "epoch": 9.282016348773842, "grad_norm": 0.144020375776407, "learning_rate": 2.692121659119662e-07, "loss": 0.0006, "step": 6813 }, { "epoch": 9.283378746594005, "grad_norm": 0.12283151754335414, "learning_rate": 2.681960444068388e-07, "loss": 0.0003, "step": 6814 }, { "epoch": 9.284741144414168, "grad_norm": 0.0750409524643407, "learning_rate": 2.67181818101957e-07, "loss": 0.0004, "step": 6815 }, { "epoch": 9.286103542234333, "grad_norm": 0.06235003175594728, "learning_rate": 2.661694871948328e-07, "loss": 0.0003, "step": 6816 }, { "epoch": 9.287465940054496, "grad_norm": 0.46354561851897064, "learning_rate": 2.6515905188261413e-07, "loss": 0.0066, "step": 6817 }, { "epoch": 9.288828337874659, "grad_norm": 0.05321925139587287, "learning_rate": 2.6415051236207355e-07, "loss": 0.0003, "step": 6818 }, { "epoch": 9.290190735694823, "grad_norm": 0.38493676662363097, "learning_rate": 2.631438688296184e-07, "loss": 0.0008, "step": 6819 }, { "epoch": 9.291553133514986, "grad_norm": 0.1747533520561485, "learning_rate": 2.6213912148128405e-07, "loss": 0.0004, "step": 6820 }, { "epoch": 9.292915531335149, "grad_norm": 0.208638342575164, "learning_rate": 2.611362705127396e-07, "loss": 0.0005, "step": 6821 }, { "epoch": 9.294277929155314, "grad_norm": 0.1730897331458154, "learning_rate": 2.60135316119281e-07, "loss": 0.0009, "step": 6822 }, { "epoch": 9.295640326975477, "grad_norm": 0.4506550799775763, "learning_rate": 2.59136258495839e-07, "loss": 0.0017, "step": 6823 }, { "epoch": 9.29700272479564, "grad_norm": 1.315901600205271, "learning_rate": 2.5813909783697354e-07, "loss": 0.0042, "step": 6824 }, { "epoch": 9.298365122615804, "grad_norm": 4.4989998530074855, "learning_rate": 2.5714383433687154e-07, "loss": 0.0064, "step": 6825 }, { "epoch": 9.299727520435967, "grad_norm": 0.4608448365613649, "learning_rate": 2.5615046818935785e-07, "loss": 0.008, "step": 6826 }, { "epoch": 9.30108991825613, "grad_norm": 0.8074798263069513, "learning_rate": 2.551589995878789e-07, "loss": 0.0036, "step": 6827 }, { "epoch": 9.302452316076295, "grad_norm": 0.08184487858717185, "learning_rate": 2.541694287255192e-07, "loss": 0.0003, "step": 6828 }, { "epoch": 9.303814713896458, "grad_norm": 0.1358339938923283, "learning_rate": 2.5318175579499115e-07, "loss": 0.0003, "step": 6829 }, { "epoch": 9.30517711171662, "grad_norm": 0.23497328693606268, "learning_rate": 2.521959809886343e-07, "loss": 0.002, "step": 6830 }, { "epoch": 9.306539509536785, "grad_norm": 0.47202818754783804, "learning_rate": 2.5121210449842504e-07, "loss": 0.0038, "step": 6831 }, { "epoch": 9.307901907356948, "grad_norm": 0.3921917470564471, "learning_rate": 2.5023012651596236e-07, "loss": 0.0013, "step": 6832 }, { "epoch": 9.309264305177111, "grad_norm": 0.4806251274811358, "learning_rate": 2.492500472324832e-07, "loss": 0.0074, "step": 6833 }, { "epoch": 9.310626702997276, "grad_norm": 0.28043540750820395, "learning_rate": 2.482718668388473e-07, "loss": 0.0003, "step": 6834 }, { "epoch": 9.311989100817438, "grad_norm": 0.6027373712554897, "learning_rate": 2.472955855255521e-07, "loss": 0.015, "step": 6835 }, { "epoch": 9.313351498637601, "grad_norm": 0.5632308217231858, "learning_rate": 2.4632120348272e-07, "loss": 0.0078, "step": 6836 }, { "epoch": 9.314713896457766, "grad_norm": 0.9371136015970367, "learning_rate": 2.4534872090010377e-07, "loss": 0.0065, "step": 6837 }, { "epoch": 9.316076294277929, "grad_norm": 0.28061299952296853, "learning_rate": 2.4437813796709064e-07, "loss": 0.0004, "step": 6838 }, { "epoch": 9.317438692098092, "grad_norm": 0.18515952868387478, "learning_rate": 2.4340945487269176e-07, "loss": 0.0003, "step": 6839 }, { "epoch": 9.318801089918257, "grad_norm": 2.115284068416403, "learning_rate": 2.424426718055517e-07, "loss": 0.0108, "step": 6840 }, { "epoch": 9.32016348773842, "grad_norm": 0.08606939623564146, "learning_rate": 2.414777889539466e-07, "loss": 0.0003, "step": 6841 }, { "epoch": 9.321525885558582, "grad_norm": 0.13302463972551037, "learning_rate": 2.405148065057794e-07, "loss": 0.0005, "step": 6842 }, { "epoch": 9.322888283378747, "grad_norm": 0.12133166209506682, "learning_rate": 2.395537246485846e-07, "loss": 0.0003, "step": 6843 }, { "epoch": 9.32425068119891, "grad_norm": 0.5312811553505244, "learning_rate": 2.385945435695247e-07, "loss": 0.0076, "step": 6844 }, { "epoch": 9.325613079019073, "grad_norm": 0.41238206815814943, "learning_rate": 2.376372634553936e-07, "loss": 0.0013, "step": 6845 }, { "epoch": 9.326975476839237, "grad_norm": 0.2241477333210335, "learning_rate": 2.3668188449261774e-07, "loss": 0.0004, "step": 6846 }, { "epoch": 9.3283378746594, "grad_norm": 0.17667721447721776, "learning_rate": 2.3572840686724717e-07, "loss": 0.0003, "step": 6847 }, { "epoch": 9.329700272479563, "grad_norm": 0.20530359281017946, "learning_rate": 2.347768307649667e-07, "loss": 0.0055, "step": 6848 }, { "epoch": 9.331062670299728, "grad_norm": 0.08361071665733047, "learning_rate": 2.338271563710881e-07, "loss": 0.0004, "step": 6849 }, { "epoch": 9.33242506811989, "grad_norm": 0.4223822654150846, "learning_rate": 2.3287938387055565e-07, "loss": 0.0065, "step": 6850 }, { "epoch": 9.333787465940054, "grad_norm": 0.5136564872054643, "learning_rate": 2.3193351344793836e-07, "loss": 0.0058, "step": 6851 }, { "epoch": 9.335149863760218, "grad_norm": 0.8728048971852839, "learning_rate": 2.3098954528743888e-07, "loss": 0.0097, "step": 6852 }, { "epoch": 9.336512261580381, "grad_norm": 4.055474990400373, "learning_rate": 2.300474795728902e-07, "loss": 0.0132, "step": 6853 }, { "epoch": 9.337874659400544, "grad_norm": 0.26592229286233765, "learning_rate": 2.2910731648775108e-07, "loss": 0.0004, "step": 6854 }, { "epoch": 9.339237057220709, "grad_norm": 0.7833678886660911, "learning_rate": 2.2816905621511286e-07, "loss": 0.0178, "step": 6855 }, { "epoch": 9.340599455040872, "grad_norm": 0.09960685010120368, "learning_rate": 2.2723269893769385e-07, "loss": 0.0004, "step": 6856 }, { "epoch": 9.341961852861035, "grad_norm": 0.1368153144568124, "learning_rate": 2.262982448378437e-07, "loss": 0.0004, "step": 6857 }, { "epoch": 9.3433242506812, "grad_norm": 0.45008401001331416, "learning_rate": 2.2536569409754128e-07, "loss": 0.0008, "step": 6858 }, { "epoch": 9.344686648501362, "grad_norm": 1.997605677104415, "learning_rate": 2.2443504689839358e-07, "loss": 0.002, "step": 6859 }, { "epoch": 9.346049046321525, "grad_norm": 0.11778508526944964, "learning_rate": 2.235063034216378e-07, "loss": 0.0004, "step": 6860 }, { "epoch": 9.34741144414169, "grad_norm": 0.9664678996378472, "learning_rate": 2.2257946384813934e-07, "loss": 0.0066, "step": 6861 }, { "epoch": 9.348773841961853, "grad_norm": 0.9883333013861921, "learning_rate": 2.2165452835839596e-07, "loss": 0.0133, "step": 6862 }, { "epoch": 9.350136239782016, "grad_norm": 0.43134775766215144, "learning_rate": 2.2073149713252918e-07, "loss": 0.0078, "step": 6863 }, { "epoch": 9.35149863760218, "grad_norm": 0.5900210393439008, "learning_rate": 2.1981037035029519e-07, "loss": 0.0071, "step": 6864 }, { "epoch": 9.352861035422343, "grad_norm": 0.4668399362390715, "learning_rate": 2.1889114819107605e-07, "loss": 0.0008, "step": 6865 }, { "epoch": 9.354223433242506, "grad_norm": 0.5632766313492884, "learning_rate": 2.1797383083388412e-07, "loss": 0.0009, "step": 6866 }, { "epoch": 9.35558583106267, "grad_norm": 0.9892205837520188, "learning_rate": 2.1705841845736096e-07, "loss": 0.0101, "step": 6867 }, { "epoch": 9.356948228882834, "grad_norm": 1.5423331301327938, "learning_rate": 2.161449112397751e-07, "loss": 0.009, "step": 6868 }, { "epoch": 9.358310626702997, "grad_norm": 0.6434385897393466, "learning_rate": 2.1523330935902642e-07, "loss": 0.0031, "step": 6869 }, { "epoch": 9.359673024523161, "grad_norm": 2.554736556938312, "learning_rate": 2.1432361299264403e-07, "loss": 0.0141, "step": 6870 }, { "epoch": 9.361035422343324, "grad_norm": 0.1392778667282991, "learning_rate": 2.134158223177829e-07, "loss": 0.0022, "step": 6871 }, { "epoch": 9.362397820163487, "grad_norm": 0.7344503113358243, "learning_rate": 2.125099375112316e-07, "loss": 0.0017, "step": 6872 }, { "epoch": 9.363760217983652, "grad_norm": 0.15441787391255674, "learning_rate": 2.1160595874940237e-07, "loss": 0.0076, "step": 6873 }, { "epoch": 9.365122615803815, "grad_norm": 0.5515761644781537, "learning_rate": 2.107038862083388e-07, "loss": 0.0091, "step": 6874 }, { "epoch": 9.366485013623977, "grad_norm": 0.23079757901064407, "learning_rate": 2.0980372006371486e-07, "loss": 0.0024, "step": 6875 }, { "epoch": 9.367847411444142, "grad_norm": 0.3981365653057853, "learning_rate": 2.0890546049083027e-07, "loss": 0.0005, "step": 6876 }, { "epoch": 9.369209809264305, "grad_norm": 0.3776675226367274, "learning_rate": 2.0800910766461512e-07, "loss": 0.0008, "step": 6877 }, { "epoch": 9.370572207084468, "grad_norm": 0.4082873551052731, "learning_rate": 2.071146617596276e-07, "loss": 0.0006, "step": 6878 }, { "epoch": 9.371934604904633, "grad_norm": 0.4188617016979457, "learning_rate": 2.0622212295005494e-07, "loss": 0.0153, "step": 6879 }, { "epoch": 9.373297002724795, "grad_norm": 0.20223099182476342, "learning_rate": 2.0533149140971265e-07, "loss": 0.0006, "step": 6880 }, { "epoch": 9.374659400544958, "grad_norm": 0.21317002850444394, "learning_rate": 2.0444276731204416e-07, "loss": 0.0078, "step": 6881 }, { "epoch": 9.376021798365123, "grad_norm": 0.6139580963985919, "learning_rate": 2.0355595083012546e-07, "loss": 0.0117, "step": 6882 }, { "epoch": 9.377384196185286, "grad_norm": 0.11220692095326568, "learning_rate": 2.026710421366529e-07, "loss": 0.0003, "step": 6883 }, { "epoch": 9.378746594005449, "grad_norm": 0.16945169933732787, "learning_rate": 2.017880414039608e-07, "loss": 0.0006, "step": 6884 }, { "epoch": 9.380108991825614, "grad_norm": 0.4762419205599654, "learning_rate": 2.0090694880400384e-07, "loss": 0.0093, "step": 6885 }, { "epoch": 9.381471389645776, "grad_norm": 0.12961365546766399, "learning_rate": 2.0002776450837037e-07, "loss": 0.0005, "step": 6886 }, { "epoch": 9.38283378746594, "grad_norm": 0.9591509902569156, "learning_rate": 1.9915048868827558e-07, "loss": 0.0098, "step": 6887 }, { "epoch": 9.384196185286104, "grad_norm": 0.09514972450967012, "learning_rate": 1.9827512151456175e-07, "loss": 0.0004, "step": 6888 }, { "epoch": 9.385558583106267, "grad_norm": 0.07468999623305637, "learning_rate": 1.9740166315770137e-07, "loss": 0.0005, "step": 6889 }, { "epoch": 9.38692098092643, "grad_norm": 0.1476947601730391, "learning_rate": 1.9653011378779285e-07, "loss": 0.0004, "step": 6890 }, { "epoch": 9.388283378746594, "grad_norm": 0.5416301961496385, "learning_rate": 1.9566047357456708e-07, "loss": 0.0087, "step": 6891 }, { "epoch": 9.389645776566757, "grad_norm": 0.2705718584073687, "learning_rate": 1.9479274268737635e-07, "loss": 0.0029, "step": 6892 }, { "epoch": 9.39100817438692, "grad_norm": 0.2880571836811121, "learning_rate": 1.9392692129520884e-07, "loss": 0.0005, "step": 6893 }, { "epoch": 9.392370572207085, "grad_norm": 0.3978427060473644, "learning_rate": 1.9306300956667523e-07, "loss": 0.0085, "step": 6894 }, { "epoch": 9.393732970027248, "grad_norm": 0.3171292979466099, "learning_rate": 1.9220100767001647e-07, "loss": 0.001, "step": 6895 }, { "epoch": 9.39509536784741, "grad_norm": 0.12781805549550113, "learning_rate": 1.9134091577310278e-07, "loss": 0.0003, "step": 6896 }, { "epoch": 9.396457765667575, "grad_norm": 0.1694297809688104, "learning_rate": 1.9048273404342787e-07, "loss": 0.0004, "step": 6897 }, { "epoch": 9.397820163487738, "grad_norm": 0.1202288959229026, "learning_rate": 1.8962646264811924e-07, "loss": 0.0003, "step": 6898 }, { "epoch": 9.399182561307901, "grad_norm": 0.04325028059340069, "learning_rate": 1.88772101753929e-07, "loss": 0.0004, "step": 6899 }, { "epoch": 9.400544959128066, "grad_norm": 0.8430292153716012, "learning_rate": 1.8791965152723858e-07, "loss": 0.0073, "step": 6900 }, { "epoch": 9.401907356948229, "grad_norm": 0.18961895751070487, "learning_rate": 1.870691121340551e-07, "loss": 0.0022, "step": 6901 }, { "epoch": 9.403269754768392, "grad_norm": 0.1706436228291518, "learning_rate": 1.8622048374001612e-07, "loss": 0.0003, "step": 6902 }, { "epoch": 9.404632152588556, "grad_norm": 0.4313778273862446, "learning_rate": 1.8537376651038608e-07, "loss": 0.0006, "step": 6903 }, { "epoch": 9.40599455040872, "grad_norm": 1.0055540389854993, "learning_rate": 1.8452896061005755e-07, "loss": 0.0122, "step": 6904 }, { "epoch": 9.407356948228882, "grad_norm": 0.08164517826803719, "learning_rate": 1.836860662035489e-07, "loss": 0.0003, "step": 6905 }, { "epoch": 9.408719346049047, "grad_norm": 1.6734721204878382, "learning_rate": 1.8284508345500996e-07, "loss": 0.0113, "step": 6906 }, { "epoch": 9.41008174386921, "grad_norm": 0.2175520175167384, "learning_rate": 1.8200601252821415e-07, "loss": 0.0005, "step": 6907 }, { "epoch": 9.411444141689373, "grad_norm": 0.9637936441541546, "learning_rate": 1.8116885358656743e-07, "loss": 0.0014, "step": 6908 }, { "epoch": 9.412806539509537, "grad_norm": 1.9015732568167791, "learning_rate": 1.803336067930972e-07, "loss": 0.0027, "step": 6909 }, { "epoch": 9.4141689373297, "grad_norm": 1.413338107630945, "learning_rate": 1.795002723104633e-07, "loss": 0.0056, "step": 6910 }, { "epoch": 9.415531335149863, "grad_norm": 0.17465666799083432, "learning_rate": 1.786688503009537e-07, "loss": 0.0004, "step": 6911 }, { "epoch": 9.416893732970028, "grad_norm": 0.07191049592629849, "learning_rate": 1.778393409264778e-07, "loss": 0.0004, "step": 6912 }, { "epoch": 9.41825613079019, "grad_norm": 0.2223206143434011, "learning_rate": 1.7701174434858193e-07, "loss": 0.0007, "step": 6913 }, { "epoch": 9.419618528610354, "grad_norm": 0.7959288823427104, "learning_rate": 1.7618606072842938e-07, "loss": 0.002, "step": 6914 }, { "epoch": 9.420980926430518, "grad_norm": 0.04887585723384903, "learning_rate": 1.753622902268204e-07, "loss": 0.0003, "step": 6915 }, { "epoch": 9.422343324250681, "grad_norm": 0.2602227472008202, "learning_rate": 1.7454043300417668e-07, "loss": 0.0016, "step": 6916 }, { "epoch": 9.423705722070844, "grad_norm": 0.16330138993312251, "learning_rate": 1.7372048922054906e-07, "loss": 0.0023, "step": 6917 }, { "epoch": 9.425068119891009, "grad_norm": 1.987396766839491, "learning_rate": 1.7290245903561654e-07, "loss": 0.0099, "step": 6918 }, { "epoch": 9.426430517711172, "grad_norm": 0.8220202382350165, "learning_rate": 1.7208634260868274e-07, "loss": 0.0035, "step": 6919 }, { "epoch": 9.427792915531334, "grad_norm": 0.23371996059719494, "learning_rate": 1.7127214009868387e-07, "loss": 0.0076, "step": 6920 }, { "epoch": 9.4291553133515, "grad_norm": 0.3633729446765805, "learning_rate": 1.7045985166417645e-07, "loss": 0.0006, "step": 6921 }, { "epoch": 9.430517711171662, "grad_norm": 0.33622791075947195, "learning_rate": 1.6964947746335058e-07, "loss": 0.0017, "step": 6922 }, { "epoch": 9.431880108991825, "grad_norm": 0.1540525887771506, "learning_rate": 1.6884101765402117e-07, "loss": 0.0006, "step": 6923 }, { "epoch": 9.43324250681199, "grad_norm": 0.1857896955734588, "learning_rate": 1.680344723936278e-07, "loss": 0.0003, "step": 6924 }, { "epoch": 9.434604904632153, "grad_norm": 0.11662961548717746, "learning_rate": 1.6722984183924263e-07, "loss": 0.0003, "step": 6925 }, { "epoch": 9.435967302452315, "grad_norm": 0.7291797341874365, "learning_rate": 1.6642712614755697e-07, "loss": 0.002, "step": 6926 }, { "epoch": 9.43732970027248, "grad_norm": 0.39305642924895634, "learning_rate": 1.656263254748991e-07, "loss": 0.0151, "step": 6927 }, { "epoch": 9.438692098092643, "grad_norm": 0.05660845699967635, "learning_rate": 1.6482743997721762e-07, "loss": 0.0003, "step": 6928 }, { "epoch": 9.440054495912806, "grad_norm": 1.3440180765901322, "learning_rate": 1.6403046981008807e-07, "loss": 0.0018, "step": 6929 }, { "epoch": 9.44141689373297, "grad_norm": 0.39095456662022465, "learning_rate": 1.6323541512871633e-07, "loss": 0.0075, "step": 6930 }, { "epoch": 9.442779291553133, "grad_norm": 0.2535601024732749, "learning_rate": 1.6244227608793405e-07, "loss": 0.0005, "step": 6931 }, { "epoch": 9.444141689373296, "grad_norm": 0.5177525595644632, "learning_rate": 1.6165105284219774e-07, "loss": 0.016, "step": 6932 }, { "epoch": 9.445504087193461, "grad_norm": 1.1588168816304645, "learning_rate": 1.608617455455952e-07, "loss": 0.0032, "step": 6933 }, { "epoch": 9.446866485013624, "grad_norm": 0.17559899876190352, "learning_rate": 1.6007435435183572e-07, "loss": 0.0023, "step": 6934 }, { "epoch": 9.448228882833787, "grad_norm": 0.20046838861243174, "learning_rate": 1.5928887941426108e-07, "loss": 0.0004, "step": 6935 }, { "epoch": 9.449591280653951, "grad_norm": 1.200219501225846, "learning_rate": 1.585053208858345e-07, "loss": 0.0036, "step": 6936 }, { "epoch": 9.450953678474114, "grad_norm": 0.3934746390293598, "learning_rate": 1.5772367891914942e-07, "loss": 0.0079, "step": 6937 }, { "epoch": 9.452316076294277, "grad_norm": 0.08557675182603929, "learning_rate": 1.5694395366642413e-07, "loss": 0.0006, "step": 6938 }, { "epoch": 9.453678474114442, "grad_norm": 0.07303658244874965, "learning_rate": 1.5616614527950602e-07, "loss": 0.0003, "step": 6939 }, { "epoch": 9.455040871934605, "grad_norm": 0.09701646028279912, "learning_rate": 1.553902539098684e-07, "loss": 0.0004, "step": 6940 }, { "epoch": 9.456403269754768, "grad_norm": 0.15144864249723985, "learning_rate": 1.5461627970860814e-07, "loss": 0.0022, "step": 6941 }, { "epoch": 9.457765667574932, "grad_norm": 0.7133617219743527, "learning_rate": 1.5384422282645362e-07, "loss": 0.0008, "step": 6942 }, { "epoch": 9.459128065395095, "grad_norm": 0.26499081171776206, "learning_rate": 1.5307408341375563e-07, "loss": 0.0003, "step": 6943 }, { "epoch": 9.460490463215258, "grad_norm": 1.648383158834378, "learning_rate": 1.523058616204942e-07, "loss": 0.0096, "step": 6944 }, { "epoch": 9.461852861035423, "grad_norm": 0.6539071002536417, "learning_rate": 1.5153955759627636e-07, "loss": 0.0085, "step": 6945 }, { "epoch": 9.463215258855586, "grad_norm": 0.8347252178907995, "learning_rate": 1.5077517149033272e-07, "loss": 0.0041, "step": 6946 }, { "epoch": 9.464577656675749, "grad_norm": 0.3603639550341602, "learning_rate": 1.5001270345152308e-07, "loss": 0.0081, "step": 6947 }, { "epoch": 9.465940054495913, "grad_norm": 0.7939358692703742, "learning_rate": 1.49252153628332e-07, "loss": 0.0087, "step": 6948 }, { "epoch": 9.467302452316076, "grad_norm": 0.9496210943732404, "learning_rate": 1.4849352216887102e-07, "loss": 0.0085, "step": 6949 }, { "epoch": 9.46866485013624, "grad_norm": 0.27580755516033223, "learning_rate": 1.4773680922087863e-07, "loss": 0.0011, "step": 6950 }, { "epoch": 9.470027247956404, "grad_norm": 0.3467957923812793, "learning_rate": 1.4698201493172026e-07, "loss": 0.0037, "step": 6951 }, { "epoch": 9.471389645776567, "grad_norm": 0.6996169582503249, "learning_rate": 1.4622913944838502e-07, "loss": 0.0076, "step": 6952 }, { "epoch": 9.47275204359673, "grad_norm": 1.1728291476880708, "learning_rate": 1.4547818291749116e-07, "loss": 0.0061, "step": 6953 }, { "epoch": 9.474114441416894, "grad_norm": 0.5551207509205466, "learning_rate": 1.4472914548528283e-07, "loss": 0.0023, "step": 6954 }, { "epoch": 9.475476839237057, "grad_norm": 0.28493133921156183, "learning_rate": 1.4398202729762778e-07, "loss": 0.0077, "step": 6955 }, { "epoch": 9.47683923705722, "grad_norm": 0.09149457392323947, "learning_rate": 1.4323682850002407e-07, "loss": 0.0003, "step": 6956 }, { "epoch": 9.478201634877385, "grad_norm": 0.3848953644205811, "learning_rate": 1.424935492375934e-07, "loss": 0.0081, "step": 6957 }, { "epoch": 9.479564032697548, "grad_norm": 0.059913268529609896, "learning_rate": 1.417521896550833e-07, "loss": 0.0003, "step": 6958 }, { "epoch": 9.48092643051771, "grad_norm": 0.10317620831754509, "learning_rate": 1.4101274989687052e-07, "loss": 0.0004, "step": 6959 }, { "epoch": 9.482288828337875, "grad_norm": 0.7948317818894296, "learning_rate": 1.402752301069521e-07, "loss": 0.0012, "step": 6960 }, { "epoch": 9.483651226158038, "grad_norm": 0.215587885213648, "learning_rate": 1.3953963042895646e-07, "loss": 0.0006, "step": 6961 }, { "epoch": 9.485013623978201, "grad_norm": 0.21599554353108877, "learning_rate": 1.388059510061379e-07, "loss": 0.0014, "step": 6962 }, { "epoch": 9.486376021798366, "grad_norm": 0.09428071993141866, "learning_rate": 1.3807419198137439e-07, "loss": 0.0003, "step": 6963 }, { "epoch": 9.487738419618529, "grad_norm": 0.06595445547555114, "learning_rate": 1.3734435349717078e-07, "loss": 0.0004, "step": 6964 }, { "epoch": 9.489100817438691, "grad_norm": 0.992817889612726, "learning_rate": 1.3661643569565786e-07, "loss": 0.0018, "step": 6965 }, { "epoch": 9.490463215258856, "grad_norm": 4.8794169358578, "learning_rate": 1.358904387185922e-07, "loss": 0.0089, "step": 6966 }, { "epoch": 9.491825613079019, "grad_norm": 0.056388727975893276, "learning_rate": 1.3516636270735628e-07, "loss": 0.0003, "step": 6967 }, { "epoch": 9.493188010899182, "grad_norm": 0.7647355251709428, "learning_rate": 1.3444420780296062e-07, "loss": 0.0017, "step": 6968 }, { "epoch": 9.494550408719347, "grad_norm": 2.228961503566007, "learning_rate": 1.3372397414603721e-07, "loss": 0.0035, "step": 6969 }, { "epoch": 9.49591280653951, "grad_norm": 0.24545061392121664, "learning_rate": 1.330056618768494e-07, "loss": 0.0073, "step": 6970 }, { "epoch": 9.497275204359672, "grad_norm": 1.076344170110108, "learning_rate": 1.3228927113528189e-07, "loss": 0.0099, "step": 6971 }, { "epoch": 9.498637602179837, "grad_norm": 0.6778922398312381, "learning_rate": 1.3157480206084538e-07, "loss": 0.0017, "step": 6972 }, { "epoch": 9.5, "grad_norm": 0.38107706707844524, "learning_rate": 1.3086225479267966e-07, "loss": 0.0079, "step": 6973 }, { "epoch": 9.501362397820163, "grad_norm": 0.03030172690138771, "learning_rate": 1.3015162946954707e-07, "loss": 0.0003, "step": 6974 }, { "epoch": 9.502724795640328, "grad_norm": 0.47918121237166883, "learning_rate": 1.294429262298391e-07, "loss": 0.0012, "step": 6975 }, { "epoch": 9.50408719346049, "grad_norm": 0.5129180352510587, "learning_rate": 1.2873614521156873e-07, "loss": 0.0134, "step": 6976 }, { "epoch": 9.505449591280653, "grad_norm": 0.086210107354391, "learning_rate": 1.2803128655237694e-07, "loss": 0.0003, "step": 6977 }, { "epoch": 9.506811989100818, "grad_norm": 0.24995670516699772, "learning_rate": 1.273283503895295e-07, "loss": 0.0005, "step": 6978 }, { "epoch": 9.508174386920981, "grad_norm": 0.5428429285832941, "learning_rate": 1.2662733685991803e-07, "loss": 0.0183, "step": 6979 }, { "epoch": 9.509536784741144, "grad_norm": 1.0463745125891746, "learning_rate": 1.2592824610006215e-07, "loss": 0.0146, "step": 6980 }, { "epoch": 9.510899182561309, "grad_norm": 0.14987944076691287, "learning_rate": 1.2523107824610302e-07, "loss": 0.0003, "step": 6981 }, { "epoch": 9.512261580381471, "grad_norm": 0.3620101284707033, "learning_rate": 1.245358334338098e-07, "loss": 0.0008, "step": 6982 }, { "epoch": 9.513623978201634, "grad_norm": 0.045082576092905655, "learning_rate": 1.2384251179857642e-07, "loss": 0.0005, "step": 6983 }, { "epoch": 9.514986376021799, "grad_norm": 0.47022344682457196, "learning_rate": 1.2315111347542152e-07, "loss": 0.0009, "step": 6984 }, { "epoch": 9.516348773841962, "grad_norm": 0.11069125522452693, "learning_rate": 1.2246163859899185e-07, "loss": 0.0005, "step": 6985 }, { "epoch": 9.517711171662125, "grad_norm": 0.2933558477986853, "learning_rate": 1.2177408730355555e-07, "loss": 0.0015, "step": 6986 }, { "epoch": 9.51907356948229, "grad_norm": 0.23504543127910993, "learning_rate": 1.2108845972301107e-07, "loss": 0.0006, "step": 6987 }, { "epoch": 9.520435967302452, "grad_norm": 0.08072974647291535, "learning_rate": 1.204047559908772e-07, "loss": 0.0003, "step": 6988 }, { "epoch": 9.521798365122615, "grad_norm": 0.1494845779642542, "learning_rate": 1.1972297624030072e-07, "loss": 0.0004, "step": 6989 }, { "epoch": 9.52316076294278, "grad_norm": 0.2078971948324182, "learning_rate": 1.1904312060405432e-07, "loss": 0.0024, "step": 6990 }, { "epoch": 9.524523160762943, "grad_norm": 0.4414991905568732, "learning_rate": 1.1836518921453543e-07, "loss": 0.0023, "step": 6991 }, { "epoch": 9.525885558583106, "grad_norm": 0.049964565167010894, "learning_rate": 1.1768918220376624e-07, "loss": 0.0003, "step": 6992 }, { "epoch": 9.52724795640327, "grad_norm": 0.5263182152979528, "learning_rate": 1.1701509970339253e-07, "loss": 0.0042, "step": 6993 }, { "epoch": 9.528610354223433, "grad_norm": 0.6292632962734334, "learning_rate": 1.1634294184468931e-07, "loss": 0.0009, "step": 6994 }, { "epoch": 9.529972752043596, "grad_norm": 2.0211540978957205, "learning_rate": 1.1567270875855408e-07, "loss": 0.0081, "step": 6995 }, { "epoch": 9.53133514986376, "grad_norm": 0.12656531530467757, "learning_rate": 1.15004400575508e-07, "loss": 0.0003, "step": 6996 }, { "epoch": 9.532697547683924, "grad_norm": 0.07222640166741791, "learning_rate": 1.1433801742570249e-07, "loss": 0.0003, "step": 6997 }, { "epoch": 9.534059945504087, "grad_norm": 0.07859943226088857, "learning_rate": 1.1367355943890823e-07, "loss": 0.0003, "step": 6998 }, { "epoch": 9.535422343324251, "grad_norm": 0.45256546095822453, "learning_rate": 1.1301102674452724e-07, "loss": 0.0173, "step": 6999 }, { "epoch": 9.536784741144414, "grad_norm": 0.12569319615337585, "learning_rate": 1.1235041947157855e-07, "loss": 0.0004, "step": 7000 }, { "epoch": 9.538147138964577, "grad_norm": 0.3851230186767731, "learning_rate": 1.1169173774871478e-07, "loss": 0.0017, "step": 7001 }, { "epoch": 9.539509536784742, "grad_norm": 0.43515246406456437, "learning_rate": 1.1103498170420667e-07, "loss": 0.0015, "step": 7002 }, { "epoch": 9.540871934604905, "grad_norm": 0.3276575019732197, "learning_rate": 1.1038015146595526e-07, "loss": 0.0006, "step": 7003 }, { "epoch": 9.542234332425068, "grad_norm": 0.6044544321473817, "learning_rate": 1.0972724716148186e-07, "loss": 0.0058, "step": 7004 }, { "epoch": 9.543596730245232, "grad_norm": 0.06158998750864431, "learning_rate": 1.0907626891793699e-07, "loss": 0.0004, "step": 7005 }, { "epoch": 9.544959128065395, "grad_norm": 0.21343126806748006, "learning_rate": 1.0842721686209257e-07, "loss": 0.0005, "step": 7006 }, { "epoch": 9.546321525885558, "grad_norm": 1.3207385806132135, "learning_rate": 1.0778009112034749e-07, "loss": 0.0109, "step": 7007 }, { "epoch": 9.547683923705723, "grad_norm": 0.5211059312798205, "learning_rate": 1.0713489181872427e-07, "loss": 0.0079, "step": 7008 }, { "epoch": 9.549046321525886, "grad_norm": 0.32577942264608506, "learning_rate": 1.0649161908287243e-07, "loss": 0.001, "step": 7009 }, { "epoch": 9.550408719346049, "grad_norm": 0.4949459872151873, "learning_rate": 1.0585027303806395e-07, "loss": 0.0175, "step": 7010 }, { "epoch": 9.551771117166213, "grad_norm": 0.20719329500068334, "learning_rate": 1.0521085380919671e-07, "loss": 0.0004, "step": 7011 }, { "epoch": 9.553133514986376, "grad_norm": 0.08492262480513077, "learning_rate": 1.0457336152079223e-07, "loss": 0.0004, "step": 7012 }, { "epoch": 9.554495912806539, "grad_norm": 1.033559552141697, "learning_rate": 1.0393779629699785e-07, "loss": 0.0044, "step": 7013 }, { "epoch": 9.555858310626704, "grad_norm": 0.045905686015659794, "learning_rate": 1.0330415826158679e-07, "loss": 0.0003, "step": 7014 }, { "epoch": 9.557220708446867, "grad_norm": 0.24563282340851572, "learning_rate": 1.0267244753795368e-07, "loss": 0.0044, "step": 7015 }, { "epoch": 9.55858310626703, "grad_norm": 0.6480668525255034, "learning_rate": 1.0204266424912124e-07, "loss": 0.0011, "step": 7016 }, { "epoch": 9.559945504087194, "grad_norm": 1.1232908955678584, "learning_rate": 1.0141480851773467e-07, "loss": 0.0112, "step": 7017 }, { "epoch": 9.561307901907357, "grad_norm": 0.16998478821511834, "learning_rate": 1.0078888046606395e-07, "loss": 0.0008, "step": 7018 }, { "epoch": 9.56267029972752, "grad_norm": 0.33409428008808667, "learning_rate": 1.001648802160049e-07, "loss": 0.0007, "step": 7019 }, { "epoch": 9.564032697547685, "grad_norm": 0.5365273538069275, "learning_rate": 9.9542807889077e-08, "loss": 0.0063, "step": 7020 }, { "epoch": 9.565395095367847, "grad_norm": 0.3505633055286194, "learning_rate": 9.892266360642555e-08, "loss": 0.0072, "step": 7021 }, { "epoch": 9.56675749318801, "grad_norm": 0.1950666433744295, "learning_rate": 9.830444748881729e-08, "loss": 0.0002, "step": 7022 }, { "epoch": 9.568119891008175, "grad_norm": 0.4668141121534667, "learning_rate": 9.768815965664702e-08, "loss": 0.0006, "step": 7023 }, { "epoch": 9.569482288828338, "grad_norm": 0.15254949716661476, "learning_rate": 9.707380022993096e-08, "loss": 0.0013, "step": 7024 }, { "epoch": 9.5708446866485, "grad_norm": 0.10800397720010027, "learning_rate": 9.64613693283123e-08, "loss": 0.0004, "step": 7025 }, { "epoch": 9.572207084468666, "grad_norm": 0.25792633076676147, "learning_rate": 9.585086707105784e-08, "loss": 0.0004, "step": 7026 }, { "epoch": 9.573569482288828, "grad_norm": 0.2759566600546029, "learning_rate": 9.524229357705695e-08, "loss": 0.0004, "step": 7027 }, { "epoch": 9.574931880108991, "grad_norm": 0.17625335752463184, "learning_rate": 9.463564896482813e-08, "loss": 0.0073, "step": 7028 }, { "epoch": 9.576294277929156, "grad_norm": 0.19865001628554327, "learning_rate": 9.403093335250802e-08, "loss": 0.0003, "step": 7029 }, { "epoch": 9.577656675749319, "grad_norm": 0.4725355737917123, "learning_rate": 9.342814685786239e-08, "loss": 0.0092, "step": 7030 }, { "epoch": 9.579019073569482, "grad_norm": 0.1738900977842304, "learning_rate": 9.282728959827958e-08, "loss": 0.0003, "step": 7031 }, { "epoch": 9.580381471389646, "grad_norm": 0.14907916503083687, "learning_rate": 9.222836169077154e-08, "loss": 0.0004, "step": 7032 }, { "epoch": 9.58174386920981, "grad_norm": 1.2718193294651488, "learning_rate": 9.163136325197608e-08, "loss": 0.0029, "step": 7033 }, { "epoch": 9.583106267029972, "grad_norm": 0.7117637236431947, "learning_rate": 9.103629439815353e-08, "loss": 0.0125, "step": 7034 }, { "epoch": 9.584468664850137, "grad_norm": 0.5165298515430387, "learning_rate": 9.044315524519009e-08, "loss": 0.0065, "step": 7035 }, { "epoch": 9.5858310626703, "grad_norm": 0.2761253400936681, "learning_rate": 8.985194590859558e-08, "loss": 0.0008, "step": 7036 }, { "epoch": 9.587193460490463, "grad_norm": 0.10297648674489049, "learning_rate": 8.926266650350235e-08, "loss": 0.0005, "step": 7037 }, { "epoch": 9.588555858310627, "grad_norm": 1.0323782887066628, "learning_rate": 8.867531714467081e-08, "loss": 0.0017, "step": 7038 }, { "epoch": 9.58991825613079, "grad_norm": 0.46974512799753737, "learning_rate": 8.808989794648171e-08, "loss": 0.0063, "step": 7039 }, { "epoch": 9.591280653950953, "grad_norm": 0.12646646067688316, "learning_rate": 8.750640902294161e-08, "loss": 0.0021, "step": 7040 }, { "epoch": 9.592643051771118, "grad_norm": 0.3916797806037695, "learning_rate": 8.692485048767962e-08, "loss": 0.004, "step": 7041 }, { "epoch": 9.59400544959128, "grad_norm": 0.9913315020148743, "learning_rate": 8.634522245395183e-08, "loss": 0.01, "step": 7042 }, { "epoch": 9.595367847411444, "grad_norm": 0.7018335666992462, "learning_rate": 8.576752503463681e-08, "loss": 0.0064, "step": 7043 }, { "epoch": 9.596730245231608, "grad_norm": 0.11122188820529777, "learning_rate": 8.51917583422368e-08, "loss": 0.0004, "step": 7044 }, { "epoch": 9.598092643051771, "grad_norm": 1.3296752492305093, "learning_rate": 8.461792248887657e-08, "loss": 0.0049, "step": 7045 }, { "epoch": 9.599455040871934, "grad_norm": 0.16146432168542119, "learning_rate": 8.404601758630892e-08, "loss": 0.0006, "step": 7046 }, { "epoch": 9.600817438692099, "grad_norm": 1.1411907996929442, "learning_rate": 8.3476043745907e-08, "loss": 0.0181, "step": 7047 }, { "epoch": 9.602179836512262, "grad_norm": 0.20693567591203488, "learning_rate": 8.290800107866981e-08, "loss": 0.0003, "step": 7048 }, { "epoch": 9.603542234332425, "grad_norm": 0.1315840160619576, "learning_rate": 8.234188969521884e-08, "loss": 0.0005, "step": 7049 }, { "epoch": 9.60490463215259, "grad_norm": 1.4217860555586073, "learning_rate": 8.177770970580146e-08, "loss": 0.0224, "step": 7050 }, { "epoch": 9.606267029972752, "grad_norm": 0.4827505834769467, "learning_rate": 8.121546122028645e-08, "loss": 0.0013, "step": 7051 }, { "epoch": 9.607629427792915, "grad_norm": 1.0602013035573534, "learning_rate": 8.065514434816846e-08, "loss": 0.0035, "step": 7052 }, { "epoch": 9.60899182561308, "grad_norm": 0.8083452788998597, "learning_rate": 8.009675919856574e-08, "loss": 0.0146, "step": 7053 }, { "epoch": 9.610354223433243, "grad_norm": 0.4553152646627451, "learning_rate": 7.954030588021911e-08, "loss": 0.0016, "step": 7054 }, { "epoch": 9.611716621253406, "grad_norm": 0.415735519338262, "learning_rate": 7.898578450149408e-08, "loss": 0.002, "step": 7055 }, { "epoch": 9.61307901907357, "grad_norm": 0.18937244690172159, "learning_rate": 7.843319517037983e-08, "loss": 0.0008, "step": 7056 }, { "epoch": 9.614441416893733, "grad_norm": 0.21873856927042307, "learning_rate": 7.788253799448919e-08, "loss": 0.0013, "step": 7057 }, { "epoch": 9.615803814713896, "grad_norm": 0.26762604349400615, "learning_rate": 7.73338130810597e-08, "loss": 0.0007, "step": 7058 }, { "epoch": 9.61716621253406, "grad_norm": 0.23495527924242413, "learning_rate": 7.678702053695031e-08, "loss": 0.0003, "step": 7059 }, { "epoch": 9.618528610354224, "grad_norm": 0.6240432791248133, "learning_rate": 7.624216046864475e-08, "loss": 0.0016, "step": 7060 }, { "epoch": 9.619891008174386, "grad_norm": 0.21294097482844962, "learning_rate": 7.569923298225146e-08, "loss": 0.0005, "step": 7061 }, { "epoch": 9.621253405994551, "grad_norm": 0.14318119258490897, "learning_rate": 7.515823818350365e-08, "loss": 0.0004, "step": 7062 }, { "epoch": 9.622615803814714, "grad_norm": 0.20220775222265153, "learning_rate": 7.461917617775261e-08, "loss": 0.0005, "step": 7063 }, { "epoch": 9.623978201634877, "grad_norm": 0.22041976241459976, "learning_rate": 7.408204706997879e-08, "loss": 0.0078, "step": 7064 }, { "epoch": 9.625340599455042, "grad_norm": 0.2816521705630984, "learning_rate": 7.35468509647841e-08, "loss": 0.0007, "step": 7065 }, { "epoch": 9.626702997275205, "grad_norm": 0.09087187325335876, "learning_rate": 7.301358796639402e-08, "loss": 0.0021, "step": 7066 }, { "epoch": 9.628065395095367, "grad_norm": 0.0699181124121139, "learning_rate": 7.248225817865883e-08, "loss": 0.0004, "step": 7067 }, { "epoch": 9.629427792915532, "grad_norm": 0.31892499750329834, "learning_rate": 7.195286170504911e-08, "loss": 0.0012, "step": 7068 }, { "epoch": 9.630790190735695, "grad_norm": 0.4276089857385696, "learning_rate": 7.142539864866349e-08, "loss": 0.0079, "step": 7069 }, { "epoch": 9.632152588555858, "grad_norm": 1.8199868822126302, "learning_rate": 7.08998691122198e-08, "loss": 0.0105, "step": 7070 }, { "epoch": 9.633514986376023, "grad_norm": 0.09057446350960521, "learning_rate": 7.037627319806284e-08, "loss": 0.0004, "step": 7071 }, { "epoch": 9.634877384196185, "grad_norm": 0.36580594866695715, "learning_rate": 6.985461100815771e-08, "loss": 0.0005, "step": 7072 }, { "epoch": 9.636239782016348, "grad_norm": 0.10125873361068477, "learning_rate": 6.933488264409538e-08, "loss": 0.0004, "step": 7073 }, { "epoch": 9.637602179836513, "grad_norm": 0.1253640618410236, "learning_rate": 6.881708820708933e-08, "loss": 0.0003, "step": 7074 }, { "epoch": 9.638964577656676, "grad_norm": 0.05259387757344033, "learning_rate": 6.830122779797444e-08, "loss": 0.0004, "step": 7075 }, { "epoch": 9.640326975476839, "grad_norm": 0.3795404478725827, "learning_rate": 6.77873015172148e-08, "loss": 0.0076, "step": 7076 }, { "epoch": 9.641689373297003, "grad_norm": 0.189976887845205, "learning_rate": 6.727530946488925e-08, "loss": 0.002, "step": 7077 }, { "epoch": 9.643051771117166, "grad_norm": 0.2774440193205256, "learning_rate": 6.676525174070802e-08, "loss": 0.0004, "step": 7078 }, { "epoch": 9.64441416893733, "grad_norm": 0.7986000098393853, "learning_rate": 6.625712844400056e-08, "loss": 0.0059, "step": 7079 }, { "epoch": 9.645776566757494, "grad_norm": 0.47819783949152106, "learning_rate": 6.575093967371993e-08, "loss": 0.0021, "step": 7080 }, { "epoch": 9.647138964577657, "grad_norm": 1.4647302139582734, "learning_rate": 6.524668552844282e-08, "loss": 0.0228, "step": 7081 }, { "epoch": 9.64850136239782, "grad_norm": 0.04595969652944508, "learning_rate": 6.474436610636958e-08, "loss": 0.0003, "step": 7082 }, { "epoch": 9.649863760217984, "grad_norm": 0.13035074242610772, "learning_rate": 6.424398150532196e-08, "loss": 0.0024, "step": 7083 }, { "epoch": 9.651226158038147, "grad_norm": 0.29122048154661334, "learning_rate": 6.374553182274867e-08, "loss": 0.0004, "step": 7084 }, { "epoch": 9.65258855585831, "grad_norm": 0.2385434204795936, "learning_rate": 6.324901715571652e-08, "loss": 0.0004, "step": 7085 }, { "epoch": 9.653950953678475, "grad_norm": 0.1006482645820429, "learning_rate": 6.275443760092148e-08, "loss": 0.0003, "step": 7086 }, { "epoch": 9.655313351498638, "grad_norm": 1.1020728092402843, "learning_rate": 6.226179325467652e-08, "loss": 0.007, "step": 7087 }, { "epoch": 9.6566757493188, "grad_norm": 0.3242298485036959, "learning_rate": 6.177108421292266e-08, "loss": 0.0003, "step": 7088 }, { "epoch": 9.658038147138965, "grad_norm": 1.0456419596068383, "learning_rate": 6.128231057122014e-08, "loss": 0.0031, "step": 7089 }, { "epoch": 9.659400544959128, "grad_norm": 0.4012994474010167, "learning_rate": 6.079547242475504e-08, "loss": 0.008, "step": 7090 }, { "epoch": 9.660762942779291, "grad_norm": 0.6629113823683987, "learning_rate": 6.031056986833706e-08, "loss": 0.0051, "step": 7091 }, { "epoch": 9.662125340599456, "grad_norm": 0.8201218649112172, "learning_rate": 5.982760299639623e-08, "loss": 0.002, "step": 7092 }, { "epoch": 9.663487738419619, "grad_norm": 1.1215919659710036, "learning_rate": 5.9346571902986204e-08, "loss": 0.0022, "step": 7093 }, { "epoch": 9.664850136239782, "grad_norm": 0.5507114768875031, "learning_rate": 5.886747668178538e-08, "loss": 0.0065, "step": 7094 }, { "epoch": 9.666212534059946, "grad_norm": 0.31660255973489576, "learning_rate": 5.839031742609469e-08, "loss": 0.0084, "step": 7095 }, { "epoch": 9.66757493188011, "grad_norm": 1.1967383243856742, "learning_rate": 5.7915094228836456e-08, "loss": 0.0068, "step": 7096 }, { "epoch": 9.668937329700272, "grad_norm": 0.6394158914139202, "learning_rate": 5.744180718255776e-08, "loss": 0.001, "step": 7097 }, { "epoch": 9.670299727520437, "grad_norm": 0.6132002292683496, "learning_rate": 5.6970456379428217e-08, "loss": 0.0042, "step": 7098 }, { "epoch": 9.6716621253406, "grad_norm": 0.27951244153133786, "learning_rate": 5.650104191123773e-08, "loss": 0.0024, "step": 7099 }, { "epoch": 9.673024523160763, "grad_norm": 0.29749003717709505, "learning_rate": 5.6033563869404285e-08, "loss": 0.0006, "step": 7100 }, { "epoch": 9.674386920980927, "grad_norm": 0.3779156001657079, "learning_rate": 5.556802234496506e-08, "loss": 0.008, "step": 7101 }, { "epoch": 9.67574931880109, "grad_norm": 1.5478406136223102, "learning_rate": 5.5104417428580856e-08, "loss": 0.0054, "step": 7102 }, { "epoch": 9.677111716621253, "grad_norm": 0.3512479953218174, "learning_rate": 5.4642749210535026e-08, "loss": 0.0006, "step": 7103 }, { "epoch": 9.678474114441418, "grad_norm": 0.3775005768393968, "learning_rate": 5.418301778073565e-08, "loss": 0.0039, "step": 7104 }, { "epoch": 9.67983651226158, "grad_norm": 0.3525677735466011, "learning_rate": 5.372522322870999e-08, "loss": 0.0066, "step": 7105 }, { "epoch": 9.681198910081743, "grad_norm": 0.08465936089132425, "learning_rate": 5.326936564361118e-08, "loss": 0.0004, "step": 7106 }, { "epoch": 9.682561307901908, "grad_norm": 0.08905608224332337, "learning_rate": 5.281544511421488e-08, "loss": 0.0003, "step": 7107 }, { "epoch": 9.683923705722071, "grad_norm": 0.05524072181790551, "learning_rate": 5.236346172891926e-08, "loss": 0.0003, "step": 7108 }, { "epoch": 9.685286103542234, "grad_norm": 0.37377276869040665, "learning_rate": 5.191341557574392e-08, "loss": 0.0007, "step": 7109 }, { "epoch": 9.686648501362399, "grad_norm": 0.13327975000563252, "learning_rate": 5.1465306742332074e-08, "loss": 0.0004, "step": 7110 }, { "epoch": 9.688010899182562, "grad_norm": 0.12264068171771352, "learning_rate": 5.10191353159506e-08, "loss": 0.0005, "step": 7111 }, { "epoch": 9.689373297002724, "grad_norm": 1.2006803235867158, "learning_rate": 5.0574901383487754e-08, "loss": 0.0035, "step": 7112 }, { "epoch": 9.690735694822889, "grad_norm": 0.1548113892509629, "learning_rate": 5.013260503145434e-08, "loss": 0.0008, "step": 7113 }, { "epoch": 9.692098092643052, "grad_norm": 0.12783025583811625, "learning_rate": 4.9692246345985905e-08, "loss": 0.0003, "step": 7114 }, { "epoch": 9.693460490463215, "grad_norm": 0.5840586425921076, "learning_rate": 4.92538254128383e-08, "loss": 0.0078, "step": 7115 }, { "epoch": 9.69482288828338, "grad_norm": 1.0166100375930363, "learning_rate": 4.881734231739099e-08, "loss": 0.0038, "step": 7116 }, { "epoch": 9.696185286103542, "grad_norm": 0.8138029707710235, "learning_rate": 4.8382797144646e-08, "loss": 0.0079, "step": 7117 }, { "epoch": 9.697547683923705, "grad_norm": 1.5812214640535562, "learning_rate": 4.7950189979227844e-08, "loss": 0.0028, "step": 7118 }, { "epoch": 9.69891008174387, "grad_norm": 0.1422050569861404, "learning_rate": 4.751952090538359e-08, "loss": 0.0003, "step": 7119 }, { "epoch": 9.700272479564033, "grad_norm": 0.05449779876173887, "learning_rate": 4.709079000698391e-08, "loss": 0.0003, "step": 7120 }, { "epoch": 9.701634877384196, "grad_norm": 0.8629664174056962, "learning_rate": 4.666399736751981e-08, "loss": 0.01, "step": 7121 }, { "epoch": 9.70299727520436, "grad_norm": 0.4755297103444295, "learning_rate": 4.6239143070107015e-08, "loss": 0.0024, "step": 7122 }, { "epoch": 9.704359673024523, "grad_norm": 0.3276994989420877, "learning_rate": 4.581622719748269e-08, "loss": 0.0171, "step": 7123 }, { "epoch": 9.705722070844686, "grad_norm": 0.5904173323122405, "learning_rate": 4.53952498320076e-08, "loss": 0.0079, "step": 7124 }, { "epoch": 9.70708446866485, "grad_norm": 0.5807825100855797, "learning_rate": 4.497621105566175e-08, "loss": 0.0154, "step": 7125 }, { "epoch": 9.708446866485014, "grad_norm": 0.1679882290098806, "learning_rate": 4.4559110950052056e-08, "loss": 0.0015, "step": 7126 }, { "epoch": 9.709809264305177, "grad_norm": 0.1532047611881322, "learning_rate": 4.414394959640578e-08, "loss": 0.0003, "step": 7127 }, { "epoch": 9.711171662125341, "grad_norm": 0.19007390057474416, "learning_rate": 4.373072707557158e-08, "loss": 0.0003, "step": 7128 }, { "epoch": 9.712534059945504, "grad_norm": 1.3675084170874645, "learning_rate": 4.331944346802286e-08, "loss": 0.0069, "step": 7129 }, { "epoch": 9.713896457765667, "grad_norm": 0.36615082252086584, "learning_rate": 4.291009885385333e-08, "loss": 0.0004, "step": 7130 }, { "epoch": 9.71525885558583, "grad_norm": 0.3764066719074156, "learning_rate": 4.250269331278034e-08, "loss": 0.0013, "step": 7131 }, { "epoch": 9.716621253405995, "grad_norm": 0.2888868935225082, "learning_rate": 4.209722692414264e-08, "loss": 0.001, "step": 7132 }, { "epoch": 9.717983651226158, "grad_norm": 0.41837054489993636, "learning_rate": 4.169369976690263e-08, "loss": 0.0081, "step": 7133 }, { "epoch": 9.719346049046322, "grad_norm": 0.13678874679067685, "learning_rate": 4.129211191964411e-08, "loss": 0.0019, "step": 7134 }, { "epoch": 9.720708446866485, "grad_norm": 0.1299227251719153, "learning_rate": 4.08924634605723e-08, "loss": 0.0003, "step": 7135 }, { "epoch": 9.722070844686648, "grad_norm": 0.08099710819845964, "learning_rate": 4.0494754467518274e-08, "loss": 0.0003, "step": 7136 }, { "epoch": 9.723433242506811, "grad_norm": 0.27276960594117455, "learning_rate": 4.00989850179323e-08, "loss": 0.0003, "step": 7137 }, { "epoch": 9.724795640326976, "grad_norm": 0.16478620597838617, "learning_rate": 3.970515518888718e-08, "loss": 0.0003, "step": 7138 }, { "epoch": 9.726158038147139, "grad_norm": 0.17599106077675802, "learning_rate": 3.931326505707822e-08, "loss": 0.0004, "step": 7139 }, { "epoch": 9.727520435967303, "grad_norm": 0.07620446170899546, "learning_rate": 3.8923314698824374e-08, "loss": 0.0003, "step": 7140 }, { "epoch": 9.728882833787466, "grad_norm": 0.45574855959338284, "learning_rate": 3.8535304190063796e-08, "loss": 0.0042, "step": 7141 }, { "epoch": 9.730245231607629, "grad_norm": 0.22123671866706449, "learning_rate": 3.814923360636158e-08, "loss": 0.0006, "step": 7142 }, { "epoch": 9.731607629427792, "grad_norm": 0.37780473691896055, "learning_rate": 3.776510302289982e-08, "loss": 0.0048, "step": 7143 }, { "epoch": 9.732970027247957, "grad_norm": 0.7865663986419034, "learning_rate": 3.7382912514486445e-08, "loss": 0.0027, "step": 7144 }, { "epoch": 9.73433242506812, "grad_norm": 1.0470985642442312, "learning_rate": 3.700266215554971e-08, "loss": 0.0066, "step": 7145 }, { "epoch": 9.735694822888284, "grad_norm": 1.8700521744769223, "learning_rate": 3.6624352020141475e-08, "loss": 0.0106, "step": 7146 }, { "epoch": 9.737057220708447, "grad_norm": 0.34283931037204446, "learning_rate": 3.6247982181933924e-08, "loss": 0.0007, "step": 7147 }, { "epoch": 9.73841961852861, "grad_norm": 0.5230976873233846, "learning_rate": 3.587355271422288e-08, "loss": 0.0071, "step": 7148 }, { "epoch": 9.739782016348773, "grad_norm": 0.25948886270674876, "learning_rate": 3.550106368992667e-08, "loss": 0.0007, "step": 7149 }, { "epoch": 9.741144414168938, "grad_norm": 0.5409516591525211, "learning_rate": 3.5130515181583945e-08, "loss": 0.0068, "step": 7150 }, { "epoch": 9.7425068119891, "grad_norm": 0.5602418356575557, "learning_rate": 3.4761907261356976e-08, "loss": 0.0081, "step": 7151 }, { "epoch": 9.743869209809265, "grad_norm": 1.226329350876802, "learning_rate": 3.439524000102834e-08, "loss": 0.0087, "step": 7152 }, { "epoch": 9.745231607629428, "grad_norm": 0.5152047927176822, "learning_rate": 3.403051347200648e-08, "loss": 0.0078, "step": 7153 }, { "epoch": 9.746594005449591, "grad_norm": 1.0529965026077268, "learning_rate": 3.3667727745316794e-08, "loss": 0.0063, "step": 7154 }, { "epoch": 9.747956403269754, "grad_norm": 1.9253852608763775, "learning_rate": 3.330688289161055e-08, "loss": 0.0042, "step": 7155 }, { "epoch": 9.749318801089919, "grad_norm": 0.12660509533065134, "learning_rate": 3.2947978981158204e-08, "loss": 0.0004, "step": 7156 }, { "epoch": 9.750681198910081, "grad_norm": 0.7232606901918622, "learning_rate": 3.2591016083856066e-08, "loss": 0.0025, "step": 7157 }, { "epoch": 9.752043596730246, "grad_norm": 0.2779981423837457, "learning_rate": 3.2235994269218526e-08, "loss": 0.0021, "step": 7158 }, { "epoch": 9.753405994550409, "grad_norm": 1.0698108954186978, "learning_rate": 3.1882913606383625e-08, "loss": 0.0023, "step": 7159 }, { "epoch": 9.754768392370572, "grad_norm": 0.1499340575207644, "learning_rate": 3.153177416411191e-08, "loss": 0.0005, "step": 7160 }, { "epoch": 9.756130790190735, "grad_norm": 0.07153169749412276, "learning_rate": 3.118257601078645e-08, "loss": 0.0003, "step": 7161 }, { "epoch": 9.7574931880109, "grad_norm": 0.2271564854799525, "learning_rate": 3.083531921440841e-08, "loss": 0.0021, "step": 7162 }, { "epoch": 9.758855585831062, "grad_norm": 0.6686670978634689, "learning_rate": 3.049000384260592e-08, "loss": 0.0036, "step": 7163 }, { "epoch": 9.760217983651227, "grad_norm": 0.029348261869743063, "learning_rate": 3.0146629962624075e-08, "loss": 0.0003, "step": 7164 }, { "epoch": 9.76158038147139, "grad_norm": 0.11944922016983875, "learning_rate": 2.9805197641336046e-08, "loss": 0.0003, "step": 7165 }, { "epoch": 9.762942779291553, "grad_norm": 0.8057930389781243, "learning_rate": 2.9465706945230876e-08, "loss": 0.0074, "step": 7166 }, { "epoch": 9.764305177111716, "grad_norm": 0.10965559472243452, "learning_rate": 2.912815794042234e-08, "loss": 0.0003, "step": 7167 }, { "epoch": 9.76566757493188, "grad_norm": 0.06792959380591618, "learning_rate": 2.8792550692646747e-08, "loss": 0.0003, "step": 7168 }, { "epoch": 9.767029972752043, "grad_norm": 1.286838856338753, "learning_rate": 2.8458885267260706e-08, "loss": 0.0131, "step": 7169 }, { "epoch": 9.768392370572208, "grad_norm": 2.737033476773979, "learning_rate": 2.8127161729242235e-08, "loss": 0.0138, "step": 7170 }, { "epoch": 9.769754768392371, "grad_norm": 0.48808397586567065, "learning_rate": 2.7797380143192997e-08, "loss": 0.004, "step": 7171 }, { "epoch": 9.771117166212534, "grad_norm": 1.2206572761900667, "learning_rate": 2.746954057333606e-08, "loss": 0.0098, "step": 7172 }, { "epoch": 9.772479564032697, "grad_norm": 0.12077221679304814, "learning_rate": 2.7143643083514805e-08, "loss": 0.0003, "step": 7173 }, { "epoch": 9.773841961852861, "grad_norm": 0.5344507880633408, "learning_rate": 2.6819687737197342e-08, "loss": 0.0069, "step": 7174 }, { "epoch": 9.775204359673024, "grad_norm": 0.11161507529937713, "learning_rate": 2.6497674597469882e-08, "loss": 0.0003, "step": 7175 }, { "epoch": 9.776566757493189, "grad_norm": 0.16855135836606658, "learning_rate": 2.6177603727042257e-08, "loss": 0.0003, "step": 7176 }, { "epoch": 9.777929155313352, "grad_norm": 1.1835448152932388, "learning_rate": 2.5859475188246828e-08, "loss": 0.0038, "step": 7177 }, { "epoch": 9.779291553133515, "grad_norm": 0.20055152432451143, "learning_rate": 2.554328904303738e-08, "loss": 0.0005, "step": 7178 }, { "epoch": 9.780653950953678, "grad_norm": 0.8871739784356649, "learning_rate": 2.522904535298687e-08, "loss": 0.0071, "step": 7179 }, { "epoch": 9.782016348773842, "grad_norm": 1.453295767073732, "learning_rate": 2.491674417929413e-08, "loss": 0.0042, "step": 7180 }, { "epoch": 9.783378746594005, "grad_norm": 1.2087591954430938, "learning_rate": 2.460638558277606e-08, "loss": 0.0035, "step": 7181 }, { "epoch": 9.78474114441417, "grad_norm": 0.29599575692754787, "learning_rate": 2.429796962387432e-08, "loss": 0.001, "step": 7182 }, { "epoch": 9.786103542234333, "grad_norm": 0.24467426130159162, "learning_rate": 2.3991496362649747e-08, "loss": 0.0003, "step": 7183 }, { "epoch": 9.787465940054496, "grad_norm": 0.05698936833996192, "learning_rate": 2.3686965858786824e-08, "loss": 0.0004, "step": 7184 }, { "epoch": 9.788828337874659, "grad_norm": 0.07744255357968159, "learning_rate": 2.338437817158923e-08, "loss": 0.0003, "step": 7185 }, { "epoch": 9.790190735694823, "grad_norm": 0.043248505148441835, "learning_rate": 2.3083733359984262e-08, "loss": 0.0003, "step": 7186 }, { "epoch": 9.791553133514986, "grad_norm": 0.33589429691851175, "learning_rate": 2.2785031482521757e-08, "loss": 0.0072, "step": 7187 }, { "epoch": 9.79291553133515, "grad_norm": 1.269646060166043, "learning_rate": 2.2488272597369632e-08, "loss": 0.0057, "step": 7188 }, { "epoch": 9.794277929155314, "grad_norm": 0.24990789102297672, "learning_rate": 2.2193456762320542e-08, "loss": 0.0005, "step": 7189 }, { "epoch": 9.795640326975477, "grad_norm": 0.09292641116085405, "learning_rate": 2.1900584034788565e-08, "loss": 0.0003, "step": 7190 }, { "epoch": 9.79700272479564, "grad_norm": 0.23068913333242885, "learning_rate": 2.1609654471808074e-08, "loss": 0.0004, "step": 7191 }, { "epoch": 9.798365122615804, "grad_norm": 0.8186411298424018, "learning_rate": 2.132066813003486e-08, "loss": 0.0013, "step": 7192 }, { "epoch": 9.799727520435967, "grad_norm": 0.041422665218575766, "learning_rate": 2.1033625065747244e-08, "loss": 0.0004, "step": 7193 }, { "epoch": 9.80108991825613, "grad_norm": 0.3332404682215341, "learning_rate": 2.074852533484606e-08, "loss": 0.0079, "step": 7194 }, { "epoch": 9.802452316076295, "grad_norm": 0.5055453075097339, "learning_rate": 2.0465368992851343e-08, "loss": 0.0085, "step": 7195 }, { "epoch": 9.803814713896458, "grad_norm": 0.12193126147407142, "learning_rate": 2.018415609490565e-08, "loss": 0.0004, "step": 7196 }, { "epoch": 9.80517711171662, "grad_norm": 0.15629793522142832, "learning_rate": 1.9904886695772952e-08, "loss": 0.0003, "step": 7197 }, { "epoch": 9.806539509536785, "grad_norm": 0.09869538548879392, "learning_rate": 1.9627560849840856e-08, "loss": 0.0003, "step": 7198 }, { "epoch": 9.807901907356948, "grad_norm": 0.06434264812870129, "learning_rate": 1.9352178611115045e-08, "loss": 0.0005, "step": 7199 }, { "epoch": 9.809264305177111, "grad_norm": 1.1987763899213604, "learning_rate": 1.907874003322374e-08, "loss": 0.0062, "step": 7200 }, { "epoch": 9.810626702997276, "grad_norm": 1.075386771934332, "learning_rate": 1.8807245169418785e-08, "loss": 0.0152, "step": 7201 }, { "epoch": 9.811989100817438, "grad_norm": 0.8954109067179172, "learning_rate": 1.853769407257122e-08, "loss": 0.018, "step": 7202 }, { "epoch": 9.813351498637601, "grad_norm": 1.626256759598111, "learning_rate": 1.8270086795173502e-08, "loss": 0.0029, "step": 7203 }, { "epoch": 9.814713896457766, "grad_norm": 0.045867238807015785, "learning_rate": 1.8004423389341718e-08, "loss": 0.0003, "step": 7204 }, { "epoch": 9.816076294277929, "grad_norm": 0.36625693329711223, "learning_rate": 1.7740703906810042e-08, "loss": 0.0014, "step": 7205 }, { "epoch": 9.817438692098092, "grad_norm": 0.06376029551517107, "learning_rate": 1.74789283989385e-08, "loss": 0.0004, "step": 7206 }, { "epoch": 9.818801089918257, "grad_norm": 1.6539126296410278, "learning_rate": 1.7219096916702983e-08, "loss": 0.0149, "step": 7207 }, { "epoch": 9.82016348773842, "grad_norm": 0.7654400018555794, "learning_rate": 1.6961209510707454e-08, "loss": 0.0047, "step": 7208 }, { "epoch": 9.821525885558582, "grad_norm": 0.21788586476082755, "learning_rate": 1.6705266231169527e-08, "loss": 0.0022, "step": 7209 }, { "epoch": 9.822888283378747, "grad_norm": 0.1334452037279572, "learning_rate": 1.6451267127935987e-08, "loss": 0.0003, "step": 7210 }, { "epoch": 9.82425068119891, "grad_norm": 0.047194891784588716, "learning_rate": 1.61992122504695e-08, "loss": 0.0002, "step": 7211 }, { "epoch": 9.825613079019073, "grad_norm": 0.04384652479395842, "learning_rate": 1.5949101647855236e-08, "loss": 0.0003, "step": 7212 }, { "epoch": 9.826975476839237, "grad_norm": 0.10875756825009765, "learning_rate": 1.5700935368803127e-08, "loss": 0.0004, "step": 7213 }, { "epoch": 9.8283378746594, "grad_norm": 0.2531861837580526, "learning_rate": 1.545471346164007e-08, "loss": 0.0079, "step": 7214 }, { "epoch": 9.829700272479563, "grad_norm": 0.14937730289100562, "learning_rate": 1.5210435974315485e-08, "loss": 0.0005, "step": 7215 }, { "epoch": 9.831062670299728, "grad_norm": 0.15922293225564776, "learning_rate": 1.496810295440132e-08, "loss": 0.0003, "step": 7216 }, { "epoch": 9.83242506811989, "grad_norm": 0.23284805075237905, "learning_rate": 1.4727714449090936e-08, "loss": 0.002, "step": 7217 }, { "epoch": 9.833787465940054, "grad_norm": 0.1617604130314777, "learning_rate": 1.4489270505197995e-08, "loss": 0.0005, "step": 7218 }, { "epoch": 9.835149863760218, "grad_norm": 0.21131723361399976, "learning_rate": 1.4252771169156465e-08, "loss": 0.0077, "step": 7219 }, { "epoch": 9.836512261580381, "grad_norm": 0.17840551964482002, "learning_rate": 1.401821648702506e-08, "loss": 0.0004, "step": 7220 }, { "epoch": 9.837874659400544, "grad_norm": 0.866866022493182, "learning_rate": 1.3785606504480575e-08, "loss": 0.0015, "step": 7221 }, { "epoch": 9.839237057220709, "grad_norm": 0.3313539710581272, "learning_rate": 1.355494126682122e-08, "loss": 0.0007, "step": 7222 }, { "epoch": 9.840599455040872, "grad_norm": 0.09991188670718644, "learning_rate": 1.3326220818968839e-08, "loss": 0.0006, "step": 7223 }, { "epoch": 9.841961852861035, "grad_norm": 0.460198588890395, "learning_rate": 1.3099445205464468e-08, "loss": 0.0024, "step": 7224 }, { "epoch": 9.8433242506812, "grad_norm": 0.07543763783754534, "learning_rate": 1.287461447047167e-08, "loss": 0.0003, "step": 7225 }, { "epoch": 9.844686648501362, "grad_norm": 0.3764109823626893, "learning_rate": 1.2651728657773198e-08, "loss": 0.0248, "step": 7226 }, { "epoch": 9.846049046321525, "grad_norm": 1.0780101484472284, "learning_rate": 1.2430787810776556e-08, "loss": 0.0024, "step": 7227 }, { "epoch": 9.84741144414169, "grad_norm": 0.0555399892482004, "learning_rate": 1.2211791972506216e-08, "loss": 0.0002, "step": 7228 }, { "epoch": 9.848773841961853, "grad_norm": 0.7622879135555855, "learning_rate": 1.1994741185612501e-08, "loss": 0.0134, "step": 7229 }, { "epoch": 9.850136239782016, "grad_norm": 0.8190014870616428, "learning_rate": 1.1779635492362717e-08, "loss": 0.0021, "step": 7230 }, { "epoch": 9.85149863760218, "grad_norm": 0.9892591983109644, "learning_rate": 1.1566474934647798e-08, "loss": 0.0108, "step": 7231 }, { "epoch": 9.852861035422343, "grad_norm": 0.21846888346768736, "learning_rate": 1.1355259553978981e-08, "loss": 0.0082, "step": 7232 }, { "epoch": 9.854223433242506, "grad_norm": 0.08559393741237639, "learning_rate": 1.1145989391488921e-08, "loss": 0.0005, "step": 7233 }, { "epoch": 9.85558583106267, "grad_norm": 0.8296805960138534, "learning_rate": 1.0938664487931683e-08, "loss": 0.0098, "step": 7234 }, { "epoch": 9.856948228882834, "grad_norm": 0.08062582934987002, "learning_rate": 1.0733284883682748e-08, "loss": 0.0003, "step": 7235 }, { "epoch": 9.858310626702997, "grad_norm": 0.39637726889794045, "learning_rate": 1.0529850618737902e-08, "loss": 0.0078, "step": 7236 }, { "epoch": 9.859673024523161, "grad_norm": 0.029526103164378092, "learning_rate": 1.0328361732715453e-08, "loss": 0.0003, "step": 7237 }, { "epoch": 9.861035422343324, "grad_norm": 0.2642101239863846, "learning_rate": 1.0128818264851791e-08, "loss": 0.0075, "step": 7238 }, { "epoch": 9.862397820163487, "grad_norm": 0.21893085037353788, "learning_rate": 9.931220254008055e-09, "loss": 0.0006, "step": 7239 }, { "epoch": 9.863760217983652, "grad_norm": 0.6066431233661697, "learning_rate": 9.735567738665686e-09, "loss": 0.0185, "step": 7240 }, { "epoch": 9.865122615803815, "grad_norm": 0.030470830313877432, "learning_rate": 9.541860756925315e-09, "loss": 0.0003, "step": 7241 }, { "epoch": 9.866485013623977, "grad_norm": 0.7400727888628885, "learning_rate": 9.35009934651121e-09, "loss": 0.0161, "step": 7242 }, { "epoch": 9.867847411444142, "grad_norm": 0.21618170028660433, "learning_rate": 9.160283544766834e-09, "loss": 0.0018, "step": 7243 }, { "epoch": 9.869209809264305, "grad_norm": 0.14265807265489514, "learning_rate": 8.972413388657064e-09, "loss": 0.001, "step": 7244 }, { "epoch": 9.870572207084468, "grad_norm": 0.5386608492720981, "learning_rate": 8.786488914768187e-09, "loss": 0.0045, "step": 7245 }, { "epoch": 9.871934604904633, "grad_norm": 0.24040489338837043, "learning_rate": 8.602510159309019e-09, "loss": 0.0007, "step": 7246 }, { "epoch": 9.873297002724795, "grad_norm": 0.14246813818455517, "learning_rate": 8.420477158107565e-09, "loss": 0.0003, "step": 7247 }, { "epoch": 9.874659400544958, "grad_norm": 0.48168032073349965, "learning_rate": 8.240389946613247e-09, "loss": 0.0012, "step": 7248 }, { "epoch": 9.876021798365123, "grad_norm": 1.5390777903406239, "learning_rate": 8.062248559896902e-09, "loss": 0.0046, "step": 7249 }, { "epoch": 9.877384196185286, "grad_norm": 1.2014102202483643, "learning_rate": 7.886053032649665e-09, "loss": 0.0029, "step": 7250 }, { "epoch": 9.878746594005449, "grad_norm": 0.45582960106704473, "learning_rate": 7.711803399185202e-09, "loss": 0.0067, "step": 7251 }, { "epoch": 9.880108991825614, "grad_norm": 0.09509545350200989, "learning_rate": 7.53949969343748e-09, "loss": 0.0003, "step": 7252 }, { "epoch": 9.881471389645776, "grad_norm": 0.0976390199186634, "learning_rate": 7.369141948960768e-09, "loss": 0.002, "step": 7253 }, { "epoch": 9.88283378746594, "grad_norm": 0.16584769942077282, "learning_rate": 7.20073019893186e-09, "loss": 0.0003, "step": 7254 }, { "epoch": 9.884196185286104, "grad_norm": 1.5496540470854543, "learning_rate": 7.034264476146746e-09, "loss": 0.0161, "step": 7255 }, { "epoch": 9.885558583106267, "grad_norm": 0.14946878869008906, "learning_rate": 6.8697448130239374e-09, "loss": 0.0004, "step": 7256 }, { "epoch": 9.88692098092643, "grad_norm": 0.5463350500530049, "learning_rate": 6.707171241602251e-09, "loss": 0.0171, "step": 7257 }, { "epoch": 9.888283378746594, "grad_norm": 0.6495778937791681, "learning_rate": 6.546543793543026e-09, "loss": 0.0023, "step": 7258 }, { "epoch": 9.889645776566757, "grad_norm": 0.29149862759109085, "learning_rate": 6.387862500125686e-09, "loss": 0.0066, "step": 7259 }, { "epoch": 9.89100817438692, "grad_norm": 0.861515481884567, "learning_rate": 6.231127392252179e-09, "loss": 0.0137, "step": 7260 }, { "epoch": 9.892370572207085, "grad_norm": 1.9142035251583962, "learning_rate": 6.076338500446977e-09, "loss": 0.0088, "step": 7261 }, { "epoch": 9.893732970027248, "grad_norm": 0.49365267017889897, "learning_rate": 5.923495854853745e-09, "loss": 0.0017, "step": 7262 }, { "epoch": 9.89509536784741, "grad_norm": 0.9525510241562234, "learning_rate": 5.772599485236452e-09, "loss": 0.0078, "step": 7263 }, { "epoch": 9.896457765667575, "grad_norm": 1.442033467453314, "learning_rate": 5.623649420982702e-09, "loss": 0.0036, "step": 7264 }, { "epoch": 9.897820163487738, "grad_norm": 0.6497875466921866, "learning_rate": 5.476645691098181e-09, "loss": 0.0016, "step": 7265 }, { "epoch": 9.899182561307901, "grad_norm": 0.11730155901022318, "learning_rate": 5.3315883242122115e-09, "loss": 0.0003, "step": 7266 }, { "epoch": 9.900544959128066, "grad_norm": 0.04401571939590655, "learning_rate": 5.188477348571086e-09, "loss": 0.0002, "step": 7267 }, { "epoch": 9.901907356948229, "grad_norm": 0.380794872278984, "learning_rate": 5.047312792046955e-09, "loss": 0.0041, "step": 7268 }, { "epoch": 9.903269754768392, "grad_norm": 0.8894596026711606, "learning_rate": 4.90809468213116e-09, "loss": 0.0088, "step": 7269 }, { "epoch": 9.904632152588556, "grad_norm": 0.3914897446350289, "learning_rate": 4.770823045933126e-09, "loss": 0.0076, "step": 7270 }, { "epoch": 9.90599455040872, "grad_norm": 0.23935134303278613, "learning_rate": 4.635497910187026e-09, "loss": 0.001, "step": 7271 }, { "epoch": 9.907356948228882, "grad_norm": 0.9465092000377209, "learning_rate": 4.5021193012451115e-09, "loss": 0.0032, "step": 7272 }, { "epoch": 9.908719346049047, "grad_norm": 2.216720738660795, "learning_rate": 4.370687245084382e-09, "loss": 0.0038, "step": 7273 }, { "epoch": 9.91008174386921, "grad_norm": 0.18960547133570177, "learning_rate": 4.241201767298808e-09, "loss": 0.0008, "step": 7274 }, { "epoch": 9.911444141689373, "grad_norm": 0.462914533443058, "learning_rate": 4.113662893103776e-09, "loss": 0.0015, "step": 7275 }, { "epoch": 9.912806539509537, "grad_norm": 0.15291416941512256, "learning_rate": 3.988070647338305e-09, "loss": 0.0002, "step": 7276 }, { "epoch": 9.9141689373297, "grad_norm": 0.14051250045335598, "learning_rate": 3.8644250544594975e-09, "loss": 0.0003, "step": 7277 }, { "epoch": 9.915531335149863, "grad_norm": 0.13764459810507346, "learning_rate": 3.742726138548092e-09, "loss": 0.0003, "step": 7278 }, { "epoch": 9.916893732970028, "grad_norm": 0.18409418837712468, "learning_rate": 3.622973923301798e-09, "loss": 0.0003, "step": 7279 }, { "epoch": 9.91825613079019, "grad_norm": 0.05404820305738945, "learning_rate": 3.5051684320430712e-09, "loss": 0.0003, "step": 7280 }, { "epoch": 9.919618528610354, "grad_norm": 0.6081073989543819, "learning_rate": 3.38930968771356e-09, "loss": 0.0153, "step": 7281 }, { "epoch": 9.920980926430518, "grad_norm": 0.19013999089774256, "learning_rate": 3.275397712875217e-09, "loss": 0.0022, "step": 7282 }, { "epoch": 9.922343324250681, "grad_norm": 0.9662236334156084, "learning_rate": 3.163432529711408e-09, "loss": 0.0034, "step": 7283 }, { "epoch": 9.923705722070844, "grad_norm": 0.19504168170102357, "learning_rate": 3.0534141600280233e-09, "loss": 0.0006, "step": 7284 }, { "epoch": 9.925068119891009, "grad_norm": 0.5979853301210722, "learning_rate": 2.945342625250147e-09, "loss": 0.0021, "step": 7285 }, { "epoch": 9.926430517711172, "grad_norm": 0.8144479787885137, "learning_rate": 2.839217946422057e-09, "loss": 0.0036, "step": 7286 }, { "epoch": 9.927792915531334, "grad_norm": 1.7719859575647627, "learning_rate": 2.735040144212775e-09, "loss": 0.005, "step": 7287 }, { "epoch": 9.9291553133515, "grad_norm": 0.1343492386896603, "learning_rate": 2.6328092389094063e-09, "loss": 0.0003, "step": 7288 }, { "epoch": 9.930517711171662, "grad_norm": 0.09285887936668816, "learning_rate": 2.53252525042047e-09, "loss": 0.0004, "step": 7289 }, { "epoch": 9.931880108991825, "grad_norm": 1.439656162393729, "learning_rate": 2.4341881982758997e-09, "loss": 0.0123, "step": 7290 }, { "epoch": 9.93324250681199, "grad_norm": 0.062246181685467916, "learning_rate": 2.337798101625932e-09, "loss": 0.0004, "step": 7291 }, { "epoch": 9.934604904632153, "grad_norm": 0.10374441123530975, "learning_rate": 2.243354979242218e-09, "loss": 0.0004, "step": 7292 }, { "epoch": 9.935967302452315, "grad_norm": 0.6470083612797825, "learning_rate": 2.1508588495167125e-09, "loss": 0.0131, "step": 7293 }, { "epoch": 9.93732970027248, "grad_norm": 1.4268401607289742, "learning_rate": 2.060309730462784e-09, "loss": 0.015, "step": 7294 }, { "epoch": 9.938692098092643, "grad_norm": 0.21834145864849006, "learning_rate": 1.9717076397129942e-09, "loss": 0.0017, "step": 7295 }, { "epoch": 9.940054495912806, "grad_norm": 0.07701998504062366, "learning_rate": 1.8850525945235397e-09, "loss": 0.0005, "step": 7296 }, { "epoch": 9.94141689373297, "grad_norm": 0.11887283226941864, "learning_rate": 1.8003446117687006e-09, "loss": 0.0004, "step": 7297 }, { "epoch": 9.942779291553133, "grad_norm": 0.4323504379962989, "learning_rate": 1.7175837079452806e-09, "loss": 0.0077, "step": 7298 }, { "epoch": 9.944141689373296, "grad_norm": 0.10286475722733542, "learning_rate": 1.6367698991703874e-09, "loss": 0.0003, "step": 7299 }, { "epoch": 9.945504087193461, "grad_norm": 0.7380894298659838, "learning_rate": 1.5579032011825423e-09, "loss": 0.0016, "step": 7300 }, { "epoch": 9.946866485013624, "grad_norm": 0.9638045581322474, "learning_rate": 1.4809836293394607e-09, "loss": 0.0032, "step": 7301 }, { "epoch": 9.948228882833787, "grad_norm": 0.05982124454185277, "learning_rate": 1.4060111986202718e-09, "loss": 0.0003, "step": 7302 }, { "epoch": 9.949591280653951, "grad_norm": 1.7227230539142093, "learning_rate": 1.3329859236266284e-09, "loss": 0.0075, "step": 7303 }, { "epoch": 9.950953678474114, "grad_norm": 0.5565119003346584, "learning_rate": 1.2619078185793775e-09, "loss": 0.0025, "step": 7304 }, { "epoch": 9.952316076294277, "grad_norm": 0.24354299648815722, "learning_rate": 1.1927768973196696e-09, "loss": 0.008, "step": 7305 }, { "epoch": 9.953678474114442, "grad_norm": 0.13189855888988875, "learning_rate": 1.125593173311179e-09, "loss": 0.0004, "step": 7306 }, { "epoch": 9.955040871934605, "grad_norm": 0.6735776185043204, "learning_rate": 1.0603566596367743e-09, "loss": 0.0012, "step": 7307 }, { "epoch": 9.956403269754768, "grad_norm": 0.19828239869863096, "learning_rate": 9.970673690018474e-10, "loss": 0.0004, "step": 7308 }, { "epoch": 9.957765667574932, "grad_norm": 0.28826494717056456, "learning_rate": 9.357253137298738e-10, "loss": 0.0015, "step": 7309 }, { "epoch": 9.959128065395095, "grad_norm": 1.3542871094079423, "learning_rate": 8.763305057690741e-10, "loss": 0.0124, "step": 7310 }, { "epoch": 9.960490463215258, "grad_norm": 0.0653993300898456, "learning_rate": 8.18882956683531e-10, "loss": 0.0005, "step": 7311 }, { "epoch": 9.961852861035423, "grad_norm": 0.1127985992034623, "learning_rate": 7.633826776631826e-10, "loss": 0.0003, "step": 7312 }, { "epoch": 9.963215258855586, "grad_norm": 0.6008882181396401, "learning_rate": 7.098296795138293e-10, "loss": 0.0094, "step": 7313 }, { "epoch": 9.964577656675749, "grad_norm": 0.3503297471082439, "learning_rate": 6.582239726671269e-10, "loss": 0.0079, "step": 7314 }, { "epoch": 9.965940054495913, "grad_norm": 0.13482481163229199, "learning_rate": 6.085655671717039e-10, "loss": 0.0006, "step": 7315 }, { "epoch": 9.967302452316076, "grad_norm": 0.12922126623498958, "learning_rate": 5.608544726976028e-10, "loss": 0.0005, "step": 7316 }, { "epoch": 9.96866485013624, "grad_norm": 0.250598949316528, "learning_rate": 5.150906985373905e-10, "loss": 0.0009, "step": 7317 }, { "epoch": 9.970027247956404, "grad_norm": 0.11895562023172204, "learning_rate": 4.712742536017167e-10, "loss": 0.0004, "step": 7318 }, { "epoch": 9.971389645776567, "grad_norm": 1.5395975770695844, "learning_rate": 4.294051464259763e-10, "loss": 0.0093, "step": 7319 }, { "epoch": 9.97275204359673, "grad_norm": 0.2906201064348078, "learning_rate": 3.894833851614266e-10, "loss": 0.0012, "step": 7320 }, { "epoch": 9.974114441416894, "grad_norm": 0.14960388380016695, "learning_rate": 3.5150897758295945e-10, "loss": 0.0019, "step": 7321 }, { "epoch": 9.975476839237057, "grad_norm": 0.0685539352107702, "learning_rate": 3.1548193108688063e-10, "loss": 0.0004, "step": 7322 }, { "epoch": 9.97683923705722, "grad_norm": 0.09984583895073693, "learning_rate": 2.814022526886895e-10, "loss": 0.0003, "step": 7323 }, { "epoch": 9.978201634877385, "grad_norm": 0.6328992414736964, "learning_rate": 2.492699490252992e-10, "loss": 0.0016, "step": 7324 }, { "epoch": 9.979564032697548, "grad_norm": 1.0019235006341092, "learning_rate": 2.1908502635503703e-10, "loss": 0.0024, "step": 7325 }, { "epoch": 9.98092643051771, "grad_norm": 0.06061822541838903, "learning_rate": 1.908474905543134e-10, "loss": 0.0004, "step": 7326 }, { "epoch": 9.982288828337875, "grad_norm": 0.19709634713846272, "learning_rate": 1.6455734712317316e-10, "loss": 0.0004, "step": 7327 }, { "epoch": 9.983651226158038, "grad_norm": 0.23730127621265734, "learning_rate": 1.4021460118085473e-10, "loss": 0.0077, "step": 7328 }, { "epoch": 9.985013623978201, "grad_norm": 1.9084186467027215, "learning_rate": 1.1781925746912059e-10, "loss": 0.0141, "step": 7329 }, { "epoch": 9.986376021798366, "grad_norm": 0.3169797436544912, "learning_rate": 9.737132034892683e-11, "loss": 0.0049, "step": 7330 }, { "epoch": 9.987738419618529, "grad_norm": 0.05690622898083561, "learning_rate": 7.887079380153317e-11, "loss": 0.0003, "step": 7331 }, { "epoch": 9.989100817438691, "grad_norm": 0.1520560730231824, "learning_rate": 6.231768143072359e-11, "loss": 0.0004, "step": 7332 }, { "epoch": 9.990463215258856, "grad_norm": 0.07376588959392406, "learning_rate": 4.7711986460585725e-11, "loss": 0.0005, "step": 7333 }, { "epoch": 9.991825613079019, "grad_norm": 0.09234253484873045, "learning_rate": 3.505371173329053e-11, "loss": 0.0004, "step": 7334 }, { "epoch": 9.993188010899182, "grad_norm": 0.2737511598623607, "learning_rate": 2.4342859715753564e-11, "loss": 0.0075, "step": 7335 }, { "epoch": 9.994550408719347, "grad_norm": 1.259532586085102, "learning_rate": 1.557943249408389e-11, "loss": 0.0041, "step": 7336 }, { "epoch": 9.99591280653951, "grad_norm": 0.13376195708809732, "learning_rate": 8.763431773584075e-12, "loss": 0.0005, "step": 7337 }, { "epoch": 9.997275204359672, "grad_norm": 0.2399859857536912, "learning_rate": 3.894858882080854e-12, "loss": 0.0007, "step": 7338 }, { "epoch": 9.998637602179837, "grad_norm": 1.221339997844727, "learning_rate": 9.737147677046921e-13, "loss": 0.0077, "step": 7339 }, { "epoch": 10.0, "grad_norm": 0.1683627829441456, "learning_rate": 0.0, "loss": 0.0004, "step": 7340 }, { "epoch": 10.0, "eval_accuracy": 0.9416058394160584, "eval_f1": 0.9318246609752653, "eval_loss": 0.14797872304916382, "eval_precision": 0.924236915820259, "eval_recall": 0.9435651792362515, "eval_runtime": 16.7653, "eval_samples_per_second": 106.231, "eval_steps_per_second": 0.835, "step": 7340 }, { "epoch": 10.0, "step": 7340, "total_flos": 2.8481463415996416e+16, "train_loss": 0.06697300495097683, "train_runtime": 139639.1212, "train_samples_per_second": 26.898, "train_steps_per_second": 0.053 } ], "logging_steps": 1.0, "max_steps": 7340, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.8481463415996416e+16, "train_batch_size": 32, "trial_name": null, "trial_params": null }