diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,80521 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.733371596199222, + "eval_steps": 500, + "global_step": 11500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.25009894478923006, + "learning_rate": 2.1231422505307853e-09, + "loss": 0.1048, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 0.19922636436321503, + "learning_rate": 4.246284501061571e-09, + "loss": 0.1915, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 0.3269235595332411, + "learning_rate": 6.369426751592357e-09, + "loss": 0.2746, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 0.358077214427226, + "learning_rate": 8.492569002123141e-09, + "loss": 0.3747, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 0.2540606765888093, + "learning_rate": 1.0615711252653927e-08, + "loss": 0.2772, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 0.28658288356106465, + "learning_rate": 1.2738853503184714e-08, + "loss": 0.349, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 0.685969966490638, + "learning_rate": 1.4861995753715499e-08, + "loss": 0.294, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 0.7418569063982221, + "learning_rate": 1.6985138004246283e-08, + "loss": 0.2128, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 0.13356416195804294, + "learning_rate": 1.910828025477707e-08, + "loss": 0.1117, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 0.10029129219851991, + "learning_rate": 2.1231422505307853e-08, + "loss": 0.0753, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 0.19137471392482466, + "learning_rate": 2.3354564755838637e-08, + "loss": 0.2082, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 0.22620629379442936, + "learning_rate": 2.5477707006369427e-08, + "loss": 0.1041, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 0.24389827304886408, + "learning_rate": 2.760084925690021e-08, + "loss": 0.181, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 0.3382479000349999, + "learning_rate": 2.9723991507430998e-08, + "loss": 0.33, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 0.18771980115214004, + "learning_rate": 3.184713375796178e-08, + "loss": 0.0976, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 0.42516237338923024, + "learning_rate": 3.3970276008492565e-08, + "loss": 0.319, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 0.24522305917300966, + "learning_rate": 3.609341825902335e-08, + "loss": 0.4258, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 0.24229655895652444, + "learning_rate": 3.821656050955414e-08, + "loss": 0.2826, + "step": 18 + }, + { + "epoch": 0.0, + "grad_norm": 0.3229545378825092, + "learning_rate": 4.033970276008492e-08, + "loss": 0.286, + "step": 19 + }, + { + "epoch": 0.0, + "grad_norm": 0.16404456968742298, + "learning_rate": 4.2462845010615706e-08, + "loss": 0.028, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 0.16530036487321828, + "learning_rate": 4.458598726114649e-08, + "loss": 0.1198, + "step": 21 + }, + { + "epoch": 0.0, + "grad_norm": 0.2385576349885164, + "learning_rate": 4.6709129511677274e-08, + "loss": 0.1685, + "step": 22 + }, + { + "epoch": 0.0, + "grad_norm": 0.19659189540970257, + "learning_rate": 4.883227176220807e-08, + "loss": 0.1903, + "step": 23 + }, + { + "epoch": 0.0, + "grad_norm": 0.2947538234856561, + "learning_rate": 5.0955414012738854e-08, + "loss": 0.1341, + "step": 24 + }, + { + "epoch": 0.0, + "grad_norm": 0.23116107483481232, + "learning_rate": 5.307855626326964e-08, + "loss": 0.209, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 2.1033019559393957, + "learning_rate": 5.520169851380042e-08, + "loss": 0.1789, + "step": 26 + }, + { + "epoch": 0.0, + "grad_norm": 0.08804539463365178, + "learning_rate": 5.732484076433121e-08, + "loss": 0.0189, + "step": 27 + }, + { + "epoch": 0.0, + "grad_norm": 0.2914851513918007, + "learning_rate": 5.9447983014861996e-08, + "loss": 0.1725, + "step": 28 + }, + { + "epoch": 0.0, + "grad_norm": 0.14836108588407937, + "learning_rate": 6.157112526539278e-08, + "loss": 0.2839, + "step": 29 + }, + { + "epoch": 0.0, + "grad_norm": 0.1919117139250952, + "learning_rate": 6.369426751592356e-08, + "loss": 0.1446, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 0.2761565442014638, + "learning_rate": 6.581740976645435e-08, + "loss": 0.2991, + "step": 31 + }, + { + "epoch": 0.0, + "grad_norm": 0.22305278681009405, + "learning_rate": 6.794055201698513e-08, + "loss": 0.0708, + "step": 32 + }, + { + "epoch": 0.0, + "grad_norm": 0.1610106563978104, + "learning_rate": 7.006369426751591e-08, + "loss": 0.261, + "step": 33 + }, + { + "epoch": 0.0, + "grad_norm": 0.14077057547983793, + "learning_rate": 7.21868365180467e-08, + "loss": 0.1801, + "step": 34 + }, + { + "epoch": 0.0, + "grad_norm": 0.20065851726476622, + "learning_rate": 7.43099787685775e-08, + "loss": 0.2476, + "step": 35 + }, + { + "epoch": 0.0, + "grad_norm": 0.3081035531257673, + "learning_rate": 7.643312101910828e-08, + "loss": 0.216, + "step": 36 + }, + { + "epoch": 0.0, + "grad_norm": 0.09010003471735234, + "learning_rate": 7.855626326963906e-08, + "loss": 0.0802, + "step": 37 + }, + { + "epoch": 0.0, + "grad_norm": 0.19947396878358303, + "learning_rate": 8.067940552016985e-08, + "loss": 0.305, + "step": 38 + }, + { + "epoch": 0.0, + "grad_norm": 0.262252267634813, + "learning_rate": 8.280254777070063e-08, + "loss": 0.1195, + "step": 39 + }, + { + "epoch": 0.0, + "grad_norm": 0.17317934071438829, + "learning_rate": 8.492569002123141e-08, + "loss": 0.0463, + "step": 40 + }, + { + "epoch": 0.0, + "grad_norm": 0.22859501169296872, + "learning_rate": 8.70488322717622e-08, + "loss": 0.1598, + "step": 41 + }, + { + "epoch": 0.0, + "grad_norm": 0.26001327021273624, + "learning_rate": 8.917197452229298e-08, + "loss": 0.066, + "step": 42 + }, + { + "epoch": 0.0, + "grad_norm": 0.1198363946982399, + "learning_rate": 9.129511677282376e-08, + "loss": 0.0997, + "step": 43 + }, + { + "epoch": 0.0, + "grad_norm": 0.2593161588612782, + "learning_rate": 9.341825902335455e-08, + "loss": 0.0994, + "step": 44 + }, + { + "epoch": 0.0, + "grad_norm": 0.2306799823690304, + "learning_rate": 9.554140127388536e-08, + "loss": 0.2179, + "step": 45 + }, + { + "epoch": 0.0, + "grad_norm": 0.17571689838456886, + "learning_rate": 9.766454352441614e-08, + "loss": 0.1061, + "step": 46 + }, + { + "epoch": 0.0, + "grad_norm": 0.17914073942300426, + "learning_rate": 9.978768577494693e-08, + "loss": 0.223, + "step": 47 + }, + { + "epoch": 0.0, + "grad_norm": 0.24889993931575963, + "learning_rate": 1.0191082802547771e-07, + "loss": 0.2474, + "step": 48 + }, + { + "epoch": 0.0, + "grad_norm": 0.22781020002071803, + "learning_rate": 1.0403397027600849e-07, + "loss": 0.11, + "step": 49 + }, + { + "epoch": 0.0, + "grad_norm": 0.2501869999590839, + "learning_rate": 1.0615711252653928e-07, + "loss": 0.176, + "step": 50 + }, + { + "epoch": 0.0, + "grad_norm": 0.36315613525619006, + "learning_rate": 1.0828025477707006e-07, + "loss": 0.1577, + "step": 51 + }, + { + "epoch": 0.0, + "grad_norm": 0.2658342666766177, + "learning_rate": 1.1040339702760084e-07, + "loss": 0.1983, + "step": 52 + }, + { + "epoch": 0.0, + "grad_norm": 0.18959948719710448, + "learning_rate": 1.1252653927813163e-07, + "loss": 0.1979, + "step": 53 + }, + { + "epoch": 0.0, + "grad_norm": 0.2601087356669149, + "learning_rate": 1.1464968152866242e-07, + "loss": 0.133, + "step": 54 + }, + { + "epoch": 0.0, + "grad_norm": 0.31557862443430856, + "learning_rate": 1.1677282377919321e-07, + "loss": 0.3968, + "step": 55 + }, + { + "epoch": 0.0, + "grad_norm": 0.19390524170160062, + "learning_rate": 1.1889596602972399e-07, + "loss": 0.2072, + "step": 56 + }, + { + "epoch": 0.0, + "grad_norm": 0.1998692909914467, + "learning_rate": 1.2101910828025477e-07, + "loss": 0.1156, + "step": 57 + }, + { + "epoch": 0.0, + "grad_norm": 0.1221613667000933, + "learning_rate": 1.2314225053078556e-07, + "loss": 0.2259, + "step": 58 + }, + { + "epoch": 0.0, + "grad_norm": 0.23782027171460882, + "learning_rate": 1.2526539278131634e-07, + "loss": 0.1484, + "step": 59 + }, + { + "epoch": 0.0, + "grad_norm": 0.2328260333787332, + "learning_rate": 1.2738853503184713e-07, + "loss": 0.288, + "step": 60 + }, + { + "epoch": 0.0, + "grad_norm": 0.14448131959238655, + "learning_rate": 1.295116772823779e-07, + "loss": 0.0096, + "step": 61 + }, + { + "epoch": 0.0, + "grad_norm": 0.332046819939343, + "learning_rate": 1.316348195329087e-07, + "loss": 0.0806, + "step": 62 + }, + { + "epoch": 0.0, + "grad_norm": 0.2544066675974567, + "learning_rate": 1.3375796178343948e-07, + "loss": 0.2087, + "step": 63 + }, + { + "epoch": 0.0, + "grad_norm": 0.10688903983575725, + "learning_rate": 1.3588110403397026e-07, + "loss": 0.0316, + "step": 64 + }, + { + "epoch": 0.0, + "grad_norm": 0.15536549771069488, + "learning_rate": 1.3800424628450104e-07, + "loss": 0.1157, + "step": 65 + }, + { + "epoch": 0.0, + "grad_norm": 0.18711607110158102, + "learning_rate": 1.4012738853503183e-07, + "loss": 0.3105, + "step": 66 + }, + { + "epoch": 0.0, + "grad_norm": 0.2615454076682301, + "learning_rate": 1.422505307855626e-07, + "loss": 0.1635, + "step": 67 + }, + { + "epoch": 0.0, + "grad_norm": 0.13564631530215016, + "learning_rate": 1.443736730360934e-07, + "loss": 0.1035, + "step": 68 + }, + { + "epoch": 0.0, + "grad_norm": 0.21407966082271274, + "learning_rate": 1.464968152866242e-07, + "loss": 0.119, + "step": 69 + }, + { + "epoch": 0.0, + "grad_norm": 0.30779767305709155, + "learning_rate": 1.48619957537155e-07, + "loss": 0.2524, + "step": 70 + }, + { + "epoch": 0.0, + "grad_norm": 0.33050380029323323, + "learning_rate": 1.5074309978768577e-07, + "loss": 0.2835, + "step": 71 + }, + { + "epoch": 0.0, + "grad_norm": 0.2502548847332565, + "learning_rate": 1.5286624203821656e-07, + "loss": 0.1124, + "step": 72 + }, + { + "epoch": 0.0, + "grad_norm": 0.2553321164753365, + "learning_rate": 1.5498938428874734e-07, + "loss": 0.2557, + "step": 73 + }, + { + "epoch": 0.0, + "grad_norm": 0.18679106407193102, + "learning_rate": 1.5711252653927812e-07, + "loss": 0.1674, + "step": 74 + }, + { + "epoch": 0.0, + "grad_norm": 0.1283137929571681, + "learning_rate": 1.592356687898089e-07, + "loss": 0.3204, + "step": 75 + }, + { + "epoch": 0.0, + "grad_norm": 0.24057748518138783, + "learning_rate": 1.613588110403397e-07, + "loss": 0.1441, + "step": 76 + }, + { + "epoch": 0.0, + "grad_norm": 0.21625351951617686, + "learning_rate": 1.6348195329087047e-07, + "loss": 0.1564, + "step": 77 + }, + { + "epoch": 0.0, + "grad_norm": 0.2807641314251535, + "learning_rate": 1.6560509554140126e-07, + "loss": 0.2258, + "step": 78 + }, + { + "epoch": 0.01, + "grad_norm": 0.21426607779258156, + "learning_rate": 1.6772823779193204e-07, + "loss": 0.0787, + "step": 79 + }, + { + "epoch": 0.01, + "grad_norm": 0.21735166406748668, + "learning_rate": 1.6985138004246283e-07, + "loss": 0.1448, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 0.09655181133610861, + "learning_rate": 1.719745222929936e-07, + "loss": 0.1662, + "step": 81 + }, + { + "epoch": 0.01, + "grad_norm": 0.1774750298108412, + "learning_rate": 1.740976645435244e-07, + "loss": 0.1068, + "step": 82 + }, + { + "epoch": 0.01, + "grad_norm": 0.9165863046549484, + "learning_rate": 1.7622080679405518e-07, + "loss": 0.2897, + "step": 83 + }, + { + "epoch": 0.01, + "grad_norm": 0.09591992055241581, + "learning_rate": 1.7834394904458596e-07, + "loss": 0.1542, + "step": 84 + }, + { + "epoch": 0.01, + "grad_norm": 0.1871035981427734, + "learning_rate": 1.8046709129511674e-07, + "loss": 0.0815, + "step": 85 + }, + { + "epoch": 0.01, + "grad_norm": 0.3013920904314661, + "learning_rate": 1.8259023354564753e-07, + "loss": 0.3896, + "step": 86 + }, + { + "epoch": 0.01, + "grad_norm": 0.08166990499696496, + "learning_rate": 1.847133757961783e-07, + "loss": 0.1132, + "step": 87 + }, + { + "epoch": 0.01, + "grad_norm": 0.24959187128555374, + "learning_rate": 1.868365180467091e-07, + "loss": 0.1128, + "step": 88 + }, + { + "epoch": 0.01, + "grad_norm": 0.11530906694869626, + "learning_rate": 1.8895966029723993e-07, + "loss": 0.1156, + "step": 89 + }, + { + "epoch": 0.01, + "grad_norm": 0.284104155324781, + "learning_rate": 1.9108280254777072e-07, + "loss": 0.2533, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 0.2161792934914752, + "learning_rate": 1.932059447983015e-07, + "loss": 0.0977, + "step": 91 + }, + { + "epoch": 0.01, + "grad_norm": 0.16412721895417143, + "learning_rate": 1.9532908704883228e-07, + "loss": 0.2882, + "step": 92 + }, + { + "epoch": 0.01, + "grad_norm": 0.21110753539717453, + "learning_rate": 1.9745222929936307e-07, + "loss": 0.0277, + "step": 93 + }, + { + "epoch": 0.01, + "grad_norm": 0.26361369211501084, + "learning_rate": 1.9957537154989385e-07, + "loss": 0.2768, + "step": 94 + }, + { + "epoch": 0.01, + "grad_norm": 0.2529278381367476, + "learning_rate": 2.0169851380042463e-07, + "loss": 0.1897, + "step": 95 + }, + { + "epoch": 0.01, + "grad_norm": 0.16391164203616654, + "learning_rate": 2.0382165605095542e-07, + "loss": 0.1368, + "step": 96 + }, + { + "epoch": 0.01, + "grad_norm": 0.2820574987511089, + "learning_rate": 2.059447983014862e-07, + "loss": 0.1703, + "step": 97 + }, + { + "epoch": 0.01, + "grad_norm": 0.11268193516563275, + "learning_rate": 2.0806794055201698e-07, + "loss": 0.2199, + "step": 98 + }, + { + "epoch": 0.01, + "grad_norm": 0.287806588908634, + "learning_rate": 2.1019108280254777e-07, + "loss": 0.0926, + "step": 99 + }, + { + "epoch": 0.01, + "grad_norm": 0.2822299996909002, + "learning_rate": 2.1231422505307855e-07, + "loss": 0.1967, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 0.2393906685301478, + "learning_rate": 2.1443736730360934e-07, + "loss": 0.398, + "step": 101 + }, + { + "epoch": 0.01, + "grad_norm": 0.21571853982926306, + "learning_rate": 2.1656050955414012e-07, + "loss": 0.2037, + "step": 102 + }, + { + "epoch": 0.01, + "grad_norm": 0.19160969775378794, + "learning_rate": 2.186836518046709e-07, + "loss": 0.3974, + "step": 103 + }, + { + "epoch": 0.01, + "grad_norm": 0.2942322069415299, + "learning_rate": 2.208067940552017e-07, + "loss": 0.3434, + "step": 104 + }, + { + "epoch": 0.01, + "grad_norm": 0.25922201553758256, + "learning_rate": 2.2292993630573247e-07, + "loss": 0.1251, + "step": 105 + }, + { + "epoch": 0.01, + "grad_norm": 0.2071198246502389, + "learning_rate": 2.2505307855626325e-07, + "loss": 0.1807, + "step": 106 + }, + { + "epoch": 0.01, + "grad_norm": 0.1794504223080536, + "learning_rate": 2.2717622080679404e-07, + "loss": 0.1192, + "step": 107 + }, + { + "epoch": 0.01, + "grad_norm": 0.3541472935508919, + "learning_rate": 2.2929936305732485e-07, + "loss": 0.1795, + "step": 108 + }, + { + "epoch": 0.01, + "grad_norm": 0.42212667279454036, + "learning_rate": 2.3142250530785563e-07, + "loss": 0.2508, + "step": 109 + }, + { + "epoch": 0.01, + "grad_norm": 0.15008163868349053, + "learning_rate": 2.3354564755838642e-07, + "loss": 0.1445, + "step": 110 + }, + { + "epoch": 0.01, + "grad_norm": 0.2883339348097026, + "learning_rate": 2.356687898089172e-07, + "loss": 0.1372, + "step": 111 + }, + { + "epoch": 0.01, + "grad_norm": 0.2689503988887403, + "learning_rate": 2.3779193205944798e-07, + "loss": 0.2331, + "step": 112 + }, + { + "epoch": 0.01, + "grad_norm": 0.3632782308638637, + "learning_rate": 2.3991507430997877e-07, + "loss": 0.3856, + "step": 113 + }, + { + "epoch": 0.01, + "grad_norm": 0.386197873074015, + "learning_rate": 2.4203821656050955e-07, + "loss": 0.0899, + "step": 114 + }, + { + "epoch": 0.01, + "grad_norm": 0.09151484128845049, + "learning_rate": 2.4416135881104033e-07, + "loss": 0.0109, + "step": 115 + }, + { + "epoch": 0.01, + "grad_norm": 0.24683497049830289, + "learning_rate": 2.462845010615711e-07, + "loss": 0.1895, + "step": 116 + }, + { + "epoch": 0.01, + "grad_norm": 0.2510248585389427, + "learning_rate": 2.484076433121019e-07, + "loss": 0.1953, + "step": 117 + }, + { + "epoch": 0.01, + "grad_norm": 0.31437307952859195, + "learning_rate": 2.505307855626327e-07, + "loss": 0.2637, + "step": 118 + }, + { + "epoch": 0.01, + "grad_norm": 0.5792771847840741, + "learning_rate": 2.5265392781316347e-07, + "loss": 0.2178, + "step": 119 + }, + { + "epoch": 0.01, + "grad_norm": 0.3569547700025303, + "learning_rate": 2.5477707006369425e-07, + "loss": 0.2827, + "step": 120 + }, + { + "epoch": 0.01, + "grad_norm": 0.3541865319331499, + "learning_rate": 2.5690021231422504e-07, + "loss": 0.2158, + "step": 121 + }, + { + "epoch": 0.01, + "grad_norm": 0.2803812751973915, + "learning_rate": 2.590233545647558e-07, + "loss": 0.1308, + "step": 122 + }, + { + "epoch": 0.01, + "grad_norm": 0.3733045506699708, + "learning_rate": 2.611464968152866e-07, + "loss": 0.1748, + "step": 123 + }, + { + "epoch": 0.01, + "grad_norm": 0.2627044050122649, + "learning_rate": 2.632696390658174e-07, + "loss": 0.1862, + "step": 124 + }, + { + "epoch": 0.01, + "grad_norm": 0.1801284370762871, + "learning_rate": 2.6539278131634817e-07, + "loss": 0.2093, + "step": 125 + }, + { + "epoch": 0.01, + "grad_norm": 0.14356965529788235, + "learning_rate": 2.6751592356687895e-07, + "loss": 0.1077, + "step": 126 + }, + { + "epoch": 0.01, + "grad_norm": 0.1448176733758346, + "learning_rate": 2.6963906581740974e-07, + "loss": 0.2931, + "step": 127 + }, + { + "epoch": 0.01, + "grad_norm": 0.4005460509049744, + "learning_rate": 2.717622080679405e-07, + "loss": 0.1749, + "step": 128 + }, + { + "epoch": 0.01, + "grad_norm": 0.22251676974314188, + "learning_rate": 2.738853503184713e-07, + "loss": 0.1389, + "step": 129 + }, + { + "epoch": 0.01, + "grad_norm": 0.3027281145545714, + "learning_rate": 2.760084925690021e-07, + "loss": 0.1792, + "step": 130 + }, + { + "epoch": 0.01, + "grad_norm": 0.23846282537302702, + "learning_rate": 2.7813163481953287e-07, + "loss": 0.3108, + "step": 131 + }, + { + "epoch": 0.01, + "grad_norm": 0.2564118214473458, + "learning_rate": 2.8025477707006366e-07, + "loss": 0.2239, + "step": 132 + }, + { + "epoch": 0.01, + "grad_norm": 0.17751117550418236, + "learning_rate": 2.8237791932059444e-07, + "loss": 0.1463, + "step": 133 + }, + { + "epoch": 0.01, + "grad_norm": 0.25702667079807706, + "learning_rate": 2.845010615711252e-07, + "loss": 0.3392, + "step": 134 + }, + { + "epoch": 0.01, + "grad_norm": 0.18390417299723882, + "learning_rate": 2.86624203821656e-07, + "loss": 0.101, + "step": 135 + }, + { + "epoch": 0.01, + "grad_norm": 0.2256740628934894, + "learning_rate": 2.887473460721868e-07, + "loss": 0.2245, + "step": 136 + }, + { + "epoch": 0.01, + "grad_norm": 0.22750489451154088, + "learning_rate": 2.908704883227176e-07, + "loss": 0.3181, + "step": 137 + }, + { + "epoch": 0.01, + "grad_norm": 0.2252501766471252, + "learning_rate": 2.929936305732484e-07, + "loss": 0.1086, + "step": 138 + }, + { + "epoch": 0.01, + "grad_norm": 0.26754928057573385, + "learning_rate": 2.951167728237792e-07, + "loss": 0.3815, + "step": 139 + }, + { + "epoch": 0.01, + "grad_norm": 0.3578695711287303, + "learning_rate": 2.9723991507431e-07, + "loss": 0.3195, + "step": 140 + }, + { + "epoch": 0.01, + "grad_norm": 0.166913223467953, + "learning_rate": 2.9936305732484076e-07, + "loss": 0.1541, + "step": 141 + }, + { + "epoch": 0.01, + "grad_norm": 0.22575856821984214, + "learning_rate": 3.0148619957537155e-07, + "loss": 0.1101, + "step": 142 + }, + { + "epoch": 0.01, + "grad_norm": 0.1816892201464045, + "learning_rate": 3.0360934182590233e-07, + "loss": 0.243, + "step": 143 + }, + { + "epoch": 0.01, + "grad_norm": 0.24199343241011556, + "learning_rate": 3.057324840764331e-07, + "loss": 0.0549, + "step": 144 + }, + { + "epoch": 0.01, + "grad_norm": 0.3745084458743019, + "learning_rate": 3.078556263269639e-07, + "loss": 0.0462, + "step": 145 + }, + { + "epoch": 0.01, + "grad_norm": 0.1060462353898553, + "learning_rate": 3.099787685774947e-07, + "loss": 0.1055, + "step": 146 + }, + { + "epoch": 0.01, + "grad_norm": 0.33017772424792025, + "learning_rate": 3.1210191082802546e-07, + "loss": 0.3886, + "step": 147 + }, + { + "epoch": 0.01, + "grad_norm": 0.13086145221413323, + "learning_rate": 3.1422505307855625e-07, + "loss": 0.1973, + "step": 148 + }, + { + "epoch": 0.01, + "grad_norm": 0.26957199286967665, + "learning_rate": 3.1634819532908703e-07, + "loss": 0.3495, + "step": 149 + }, + { + "epoch": 0.01, + "grad_norm": 0.25739770577352045, + "learning_rate": 3.184713375796178e-07, + "loss": 0.1319, + "step": 150 + }, + { + "epoch": 0.01, + "grad_norm": 0.26789222142663155, + "learning_rate": 3.205944798301486e-07, + "loss": 0.3428, + "step": 151 + }, + { + "epoch": 0.01, + "grad_norm": 0.09704698154957962, + "learning_rate": 3.227176220806794e-07, + "loss": 0.1104, + "step": 152 + }, + { + "epoch": 0.01, + "grad_norm": 0.29446869545909066, + "learning_rate": 3.2484076433121017e-07, + "loss": 0.2044, + "step": 153 + }, + { + "epoch": 0.01, + "grad_norm": 0.24583974076593526, + "learning_rate": 3.2696390658174095e-07, + "loss": 0.143, + "step": 154 + }, + { + "epoch": 0.01, + "grad_norm": 0.21672372184958014, + "learning_rate": 3.2908704883227173e-07, + "loss": 0.2422, + "step": 155 + }, + { + "epoch": 0.01, + "grad_norm": 0.2316459136996034, + "learning_rate": 3.312101910828025e-07, + "loss": 0.0438, + "step": 156 + }, + { + "epoch": 0.01, + "grad_norm": 0.20623875642113082, + "learning_rate": 3.333333333333333e-07, + "loss": 0.1578, + "step": 157 + }, + { + "epoch": 0.01, + "grad_norm": 0.4061097538574797, + "learning_rate": 3.354564755838641e-07, + "loss": 0.3662, + "step": 158 + }, + { + "epoch": 0.01, + "grad_norm": 0.14448612182415435, + "learning_rate": 3.3757961783439487e-07, + "loss": 0.0628, + "step": 159 + }, + { + "epoch": 0.01, + "grad_norm": 0.39492608570239246, + "learning_rate": 3.3970276008492565e-07, + "loss": 0.2601, + "step": 160 + }, + { + "epoch": 0.01, + "grad_norm": 0.25801533070450783, + "learning_rate": 3.4182590233545644e-07, + "loss": 0.1741, + "step": 161 + }, + { + "epoch": 0.01, + "grad_norm": 0.14966054415149585, + "learning_rate": 3.439490445859872e-07, + "loss": 0.1854, + "step": 162 + }, + { + "epoch": 0.01, + "grad_norm": 0.253115839014238, + "learning_rate": 3.46072186836518e-07, + "loss": 0.2261, + "step": 163 + }, + { + "epoch": 0.01, + "grad_norm": 0.34562150869603286, + "learning_rate": 3.481953290870488e-07, + "loss": 0.09, + "step": 164 + }, + { + "epoch": 0.01, + "grad_norm": 0.23803559936593066, + "learning_rate": 3.5031847133757957e-07, + "loss": 0.1875, + "step": 165 + }, + { + "epoch": 0.01, + "grad_norm": 0.3221505541878738, + "learning_rate": 3.5244161358811035e-07, + "loss": 0.1857, + "step": 166 + }, + { + "epoch": 0.01, + "grad_norm": 0.12490966501456342, + "learning_rate": 3.5456475583864114e-07, + "loss": 0.0353, + "step": 167 + }, + { + "epoch": 0.01, + "grad_norm": 0.18338891854904169, + "learning_rate": 3.566878980891719e-07, + "loss": 0.0618, + "step": 168 + }, + { + "epoch": 0.01, + "grad_norm": 0.17791003089439517, + "learning_rate": 3.588110403397027e-07, + "loss": 0.3212, + "step": 169 + }, + { + "epoch": 0.01, + "grad_norm": 0.21835973917262472, + "learning_rate": 3.609341825902335e-07, + "loss": 0.2773, + "step": 170 + }, + { + "epoch": 0.01, + "grad_norm": 0.21765605514455302, + "learning_rate": 3.6305732484076427e-07, + "loss": 0.1221, + "step": 171 + }, + { + "epoch": 0.01, + "grad_norm": 0.21150022657865714, + "learning_rate": 3.6518046709129506e-07, + "loss": 0.3392, + "step": 172 + }, + { + "epoch": 0.01, + "grad_norm": 0.17460086087779267, + "learning_rate": 3.6730360934182584e-07, + "loss": 0.1001, + "step": 173 + }, + { + "epoch": 0.01, + "grad_norm": 0.2999670269163138, + "learning_rate": 3.694267515923566e-07, + "loss": 0.0441, + "step": 174 + }, + { + "epoch": 0.01, + "grad_norm": 0.25135495215228293, + "learning_rate": 3.715498938428874e-07, + "loss": 0.2128, + "step": 175 + }, + { + "epoch": 0.01, + "grad_norm": 0.6471046329195711, + "learning_rate": 3.736730360934182e-07, + "loss": 0.1948, + "step": 176 + }, + { + "epoch": 0.01, + "grad_norm": 0.2278373196163679, + "learning_rate": 3.757961783439491e-07, + "loss": 0.2304, + "step": 177 + }, + { + "epoch": 0.01, + "grad_norm": 0.2758925917903719, + "learning_rate": 3.7791932059447986e-07, + "loss": 0.4609, + "step": 178 + }, + { + "epoch": 0.01, + "grad_norm": 0.21079555432541505, + "learning_rate": 3.8004246284501065e-07, + "loss": 0.2711, + "step": 179 + }, + { + "epoch": 0.01, + "grad_norm": 0.22056801853242466, + "learning_rate": 3.8216560509554143e-07, + "loss": 0.1321, + "step": 180 + }, + { + "epoch": 0.01, + "grad_norm": 0.934771789969799, + "learning_rate": 3.842887473460722e-07, + "loss": 0.3233, + "step": 181 + }, + { + "epoch": 0.01, + "grad_norm": 0.21372971984552552, + "learning_rate": 3.86411889596603e-07, + "loss": 0.3842, + "step": 182 + }, + { + "epoch": 0.01, + "grad_norm": 0.18177493199129321, + "learning_rate": 3.885350318471338e-07, + "loss": 0.1263, + "step": 183 + }, + { + "epoch": 0.01, + "grad_norm": 0.20539564694344178, + "learning_rate": 3.9065817409766457e-07, + "loss": 0.099, + "step": 184 + }, + { + "epoch": 0.01, + "grad_norm": 0.24517169926202315, + "learning_rate": 3.9278131634819535e-07, + "loss": 0.1368, + "step": 185 + }, + { + "epoch": 0.01, + "grad_norm": 0.1843016081672305, + "learning_rate": 3.9490445859872613e-07, + "loss": 0.0505, + "step": 186 + }, + { + "epoch": 0.01, + "grad_norm": 0.30375760425814136, + "learning_rate": 3.970276008492569e-07, + "loss": 0.2151, + "step": 187 + }, + { + "epoch": 0.01, + "grad_norm": 0.23789039380870222, + "learning_rate": 3.991507430997877e-07, + "loss": 0.0694, + "step": 188 + }, + { + "epoch": 0.01, + "grad_norm": 0.24787341950436864, + "learning_rate": 4.012738853503185e-07, + "loss": 0.3139, + "step": 189 + }, + { + "epoch": 0.01, + "grad_norm": 0.24142286451697284, + "learning_rate": 4.0339702760084927e-07, + "loss": 0.1738, + "step": 190 + }, + { + "epoch": 0.01, + "grad_norm": 0.20276550233245788, + "learning_rate": 4.0552016985138005e-07, + "loss": 0.0852, + "step": 191 + }, + { + "epoch": 0.01, + "grad_norm": 0.4582239097053706, + "learning_rate": 4.0764331210191083e-07, + "loss": 0.219, + "step": 192 + }, + { + "epoch": 0.01, + "grad_norm": 0.3893721484305137, + "learning_rate": 4.097664543524416e-07, + "loss": 0.2916, + "step": 193 + }, + { + "epoch": 0.01, + "grad_norm": 0.46880224835496076, + "learning_rate": 4.118895966029724e-07, + "loss": 0.2952, + "step": 194 + }, + { + "epoch": 0.01, + "grad_norm": 0.13977758365510837, + "learning_rate": 4.140127388535032e-07, + "loss": 0.1819, + "step": 195 + }, + { + "epoch": 0.01, + "grad_norm": 0.13470332419651734, + "learning_rate": 4.1613588110403397e-07, + "loss": 0.1215, + "step": 196 + }, + { + "epoch": 0.01, + "grad_norm": 0.2307751266090855, + "learning_rate": 4.1825902335456475e-07, + "loss": 0.2767, + "step": 197 + }, + { + "epoch": 0.01, + "grad_norm": 0.271595431624329, + "learning_rate": 4.2038216560509554e-07, + "loss": 0.1746, + "step": 198 + }, + { + "epoch": 0.01, + "grad_norm": 0.23079560235515081, + "learning_rate": 4.225053078556263e-07, + "loss": 0.1271, + "step": 199 + }, + { + "epoch": 0.01, + "grad_norm": 0.21300416406820002, + "learning_rate": 4.246284501061571e-07, + "loss": 0.2239, + "step": 200 + }, + { + "epoch": 0.01, + "grad_norm": 0.11774526827543365, + "learning_rate": 4.267515923566879e-07, + "loss": 0.1041, + "step": 201 + }, + { + "epoch": 0.01, + "grad_norm": 0.14021883811615443, + "learning_rate": 4.2887473460721867e-07, + "loss": 0.3552, + "step": 202 + }, + { + "epoch": 0.01, + "grad_norm": 0.15598643532331874, + "learning_rate": 4.3099787685774946e-07, + "loss": 0.0227, + "step": 203 + }, + { + "epoch": 0.01, + "grad_norm": 0.19572744171804726, + "learning_rate": 4.3312101910828024e-07, + "loss": 0.1274, + "step": 204 + }, + { + "epoch": 0.01, + "grad_norm": 0.27843730774946024, + "learning_rate": 4.35244161358811e-07, + "loss": 0.3357, + "step": 205 + }, + { + "epoch": 0.01, + "grad_norm": 0.2768138090599354, + "learning_rate": 4.373673036093418e-07, + "loss": 0.2559, + "step": 206 + }, + { + "epoch": 0.01, + "grad_norm": 0.30303560402599317, + "learning_rate": 4.394904458598726e-07, + "loss": 0.2036, + "step": 207 + }, + { + "epoch": 0.01, + "grad_norm": 0.3574035759761096, + "learning_rate": 4.416135881104034e-07, + "loss": 0.0257, + "step": 208 + }, + { + "epoch": 0.01, + "grad_norm": 0.513602085503996, + "learning_rate": 4.4373673036093416e-07, + "loss": 0.1839, + "step": 209 + }, + { + "epoch": 0.01, + "grad_norm": 0.23574962632529362, + "learning_rate": 4.4585987261146494e-07, + "loss": 0.131, + "step": 210 + }, + { + "epoch": 0.01, + "grad_norm": 0.301502611286239, + "learning_rate": 4.479830148619957e-07, + "loss": 0.1754, + "step": 211 + }, + { + "epoch": 0.01, + "grad_norm": 0.12831294475009764, + "learning_rate": 4.501061571125265e-07, + "loss": 0.0413, + "step": 212 + }, + { + "epoch": 0.01, + "grad_norm": 0.44448268764246396, + "learning_rate": 4.522292993630573e-07, + "loss": 0.0187, + "step": 213 + }, + { + "epoch": 0.01, + "grad_norm": 0.16541563293031547, + "learning_rate": 4.543524416135881e-07, + "loss": 0.0696, + "step": 214 + }, + { + "epoch": 0.01, + "grad_norm": 0.4156837966875057, + "learning_rate": 4.5647558386411886e-07, + "loss": 0.1157, + "step": 215 + }, + { + "epoch": 0.01, + "grad_norm": 0.27956594396333684, + "learning_rate": 4.585987261146497e-07, + "loss": 0.0714, + "step": 216 + }, + { + "epoch": 0.01, + "grad_norm": 0.1141071927218783, + "learning_rate": 4.607218683651805e-07, + "loss": 0.0591, + "step": 217 + }, + { + "epoch": 0.01, + "grad_norm": 0.168783943119781, + "learning_rate": 4.6284501061571126e-07, + "loss": 0.2204, + "step": 218 + }, + { + "epoch": 0.01, + "grad_norm": 0.1561287939942871, + "learning_rate": 4.6496815286624205e-07, + "loss": 0.0735, + "step": 219 + }, + { + "epoch": 0.01, + "grad_norm": 0.30322314822119223, + "learning_rate": 4.6709129511677283e-07, + "loss": 0.1685, + "step": 220 + }, + { + "epoch": 0.01, + "grad_norm": 0.3697363694305286, + "learning_rate": 4.692144373673036e-07, + "loss": 0.3518, + "step": 221 + }, + { + "epoch": 0.01, + "grad_norm": 0.2906261422640796, + "learning_rate": 4.713375796178344e-07, + "loss": 0.1692, + "step": 222 + }, + { + "epoch": 0.01, + "grad_norm": 0.26954453234957454, + "learning_rate": 4.734607218683652e-07, + "loss": 0.1832, + "step": 223 + }, + { + "epoch": 0.01, + "grad_norm": 0.27235398428229896, + "learning_rate": 4.7558386411889597e-07, + "loss": 0.3283, + "step": 224 + }, + { + "epoch": 0.01, + "grad_norm": 0.280661228991786, + "learning_rate": 4.777070063694267e-07, + "loss": 0.314, + "step": 225 + }, + { + "epoch": 0.01, + "grad_norm": 0.2218287050986227, + "learning_rate": 4.798301486199575e-07, + "loss": 0.1771, + "step": 226 + }, + { + "epoch": 0.01, + "grad_norm": 0.12591559039362718, + "learning_rate": 4.819532908704883e-07, + "loss": 0.0138, + "step": 227 + }, + { + "epoch": 0.01, + "grad_norm": 0.13838304163340706, + "learning_rate": 4.840764331210191e-07, + "loss": 0.0261, + "step": 228 + }, + { + "epoch": 0.01, + "grad_norm": 0.11523289156706043, + "learning_rate": 4.861995753715499e-07, + "loss": 0.1658, + "step": 229 + }, + { + "epoch": 0.01, + "grad_norm": 0.24297355446371952, + "learning_rate": 4.883227176220807e-07, + "loss": 0.3567, + "step": 230 + }, + { + "epoch": 0.01, + "grad_norm": 0.19210771351944797, + "learning_rate": 4.904458598726115e-07, + "loss": 0.1158, + "step": 231 + }, + { + "epoch": 0.01, + "grad_norm": 0.17117268316319734, + "learning_rate": 4.925690021231422e-07, + "loss": 0.0205, + "step": 232 + }, + { + "epoch": 0.01, + "grad_norm": 0.5389672372369377, + "learning_rate": 4.94692144373673e-07, + "loss": 0.1087, + "step": 233 + }, + { + "epoch": 0.01, + "grad_norm": 0.10165201346621026, + "learning_rate": 4.968152866242038e-07, + "loss": 0.021, + "step": 234 + }, + { + "epoch": 0.01, + "grad_norm": 0.15794862575618773, + "learning_rate": 4.989384288747346e-07, + "loss": 0.1067, + "step": 235 + }, + { + "epoch": 0.02, + "grad_norm": 0.0857455512752192, + "learning_rate": 5.010615711252654e-07, + "loss": 0.0126, + "step": 236 + }, + { + "epoch": 0.02, + "grad_norm": 0.15674156646349519, + "learning_rate": 5.031847133757962e-07, + "loss": 0.17, + "step": 237 + }, + { + "epoch": 0.02, + "grad_norm": 0.23624012189202026, + "learning_rate": 5.053078556263269e-07, + "loss": 0.0348, + "step": 238 + }, + { + "epoch": 0.02, + "grad_norm": 0.20950870199232627, + "learning_rate": 5.074309978768577e-07, + "loss": 0.1308, + "step": 239 + }, + { + "epoch": 0.02, + "grad_norm": 0.182355080636987, + "learning_rate": 5.095541401273885e-07, + "loss": 0.2538, + "step": 240 + }, + { + "epoch": 0.02, + "grad_norm": 0.19906201837253645, + "learning_rate": 5.116772823779193e-07, + "loss": 0.1013, + "step": 241 + }, + { + "epoch": 0.02, + "grad_norm": 0.31748922822807213, + "learning_rate": 5.138004246284501e-07, + "loss": 0.1983, + "step": 242 + }, + { + "epoch": 0.02, + "grad_norm": 0.2879583102322477, + "learning_rate": 5.159235668789809e-07, + "loss": 0.3305, + "step": 243 + }, + { + "epoch": 0.02, + "grad_norm": 0.2848482889330333, + "learning_rate": 5.180467091295116e-07, + "loss": 0.1683, + "step": 244 + }, + { + "epoch": 0.02, + "grad_norm": 0.38641844976427114, + "learning_rate": 5.201698513800424e-07, + "loss": 0.1918, + "step": 245 + }, + { + "epoch": 0.02, + "grad_norm": 0.29791922743032145, + "learning_rate": 5.222929936305732e-07, + "loss": 0.2141, + "step": 246 + }, + { + "epoch": 0.02, + "grad_norm": 0.37328860142096176, + "learning_rate": 5.24416135881104e-07, + "loss": 0.324, + "step": 247 + }, + { + "epoch": 0.02, + "grad_norm": 0.11778988848411559, + "learning_rate": 5.265392781316348e-07, + "loss": 0.197, + "step": 248 + }, + { + "epoch": 0.02, + "grad_norm": 0.33803319607529914, + "learning_rate": 5.286624203821656e-07, + "loss": 0.1999, + "step": 249 + }, + { + "epoch": 0.02, + "grad_norm": 0.09994307309166663, + "learning_rate": 5.307855626326963e-07, + "loss": 0.0133, + "step": 250 + }, + { + "epoch": 0.02, + "grad_norm": 0.18546932643609904, + "learning_rate": 5.329087048832271e-07, + "loss": 0.1493, + "step": 251 + }, + { + "epoch": 0.02, + "grad_norm": 0.24972406037114434, + "learning_rate": 5.350318471337579e-07, + "loss": 0.2281, + "step": 252 + }, + { + "epoch": 0.02, + "grad_norm": 0.22815217212037803, + "learning_rate": 5.371549893842887e-07, + "loss": 0.3029, + "step": 253 + }, + { + "epoch": 0.02, + "grad_norm": 0.35664572216679813, + "learning_rate": 5.392781316348195e-07, + "loss": 0.2978, + "step": 254 + }, + { + "epoch": 0.02, + "grad_norm": 0.36878615874476434, + "learning_rate": 5.414012738853503e-07, + "loss": 0.1935, + "step": 255 + }, + { + "epoch": 0.02, + "grad_norm": 0.12433161555769073, + "learning_rate": 5.43524416135881e-07, + "loss": 0.0079, + "step": 256 + }, + { + "epoch": 0.02, + "grad_norm": 0.7621643835905139, + "learning_rate": 5.456475583864118e-07, + "loss": 0.2617, + "step": 257 + }, + { + "epoch": 0.02, + "grad_norm": 0.19432234383374675, + "learning_rate": 5.477707006369426e-07, + "loss": 0.1661, + "step": 258 + }, + { + "epoch": 0.02, + "grad_norm": 0.15478225006265445, + "learning_rate": 5.498938428874734e-07, + "loss": 0.1457, + "step": 259 + }, + { + "epoch": 0.02, + "grad_norm": 0.4026440623845048, + "learning_rate": 5.520169851380042e-07, + "loss": 0.2138, + "step": 260 + }, + { + "epoch": 0.02, + "grad_norm": 0.14132590741784368, + "learning_rate": 5.54140127388535e-07, + "loss": 0.3274, + "step": 261 + }, + { + "epoch": 0.02, + "grad_norm": 0.2744286830896772, + "learning_rate": 5.562632696390657e-07, + "loss": 0.319, + "step": 262 + }, + { + "epoch": 0.02, + "grad_norm": 0.32236468776382265, + "learning_rate": 5.583864118895965e-07, + "loss": 0.256, + "step": 263 + }, + { + "epoch": 0.02, + "grad_norm": 0.17485595559343078, + "learning_rate": 5.605095541401273e-07, + "loss": 0.2133, + "step": 264 + }, + { + "epoch": 0.02, + "grad_norm": 0.29452707985083865, + "learning_rate": 5.626326963906581e-07, + "loss": 0.0416, + "step": 265 + }, + { + "epoch": 0.02, + "grad_norm": 0.3424552020931376, + "learning_rate": 5.647558386411889e-07, + "loss": 0.1472, + "step": 266 + }, + { + "epoch": 0.02, + "grad_norm": 0.26338706700603143, + "learning_rate": 5.668789808917197e-07, + "loss": 0.296, + "step": 267 + }, + { + "epoch": 0.02, + "grad_norm": 0.14705260009709054, + "learning_rate": 5.690021231422504e-07, + "loss": 0.1678, + "step": 268 + }, + { + "epoch": 0.02, + "grad_norm": 0.7860159787237884, + "learning_rate": 5.711252653927812e-07, + "loss": 0.1939, + "step": 269 + }, + { + "epoch": 0.02, + "grad_norm": 0.13624004223090053, + "learning_rate": 5.73248407643312e-07, + "loss": 0.1554, + "step": 270 + }, + { + "epoch": 0.02, + "grad_norm": 0.09898779142099313, + "learning_rate": 5.753715498938428e-07, + "loss": 0.192, + "step": 271 + }, + { + "epoch": 0.02, + "grad_norm": 0.18706483719575634, + "learning_rate": 5.774946921443736e-07, + "loss": 0.2032, + "step": 272 + }, + { + "epoch": 0.02, + "grad_norm": 0.29195550701482514, + "learning_rate": 5.796178343949044e-07, + "loss": 0.2077, + "step": 273 + }, + { + "epoch": 0.02, + "grad_norm": 0.3392739493646915, + "learning_rate": 5.817409766454351e-07, + "loss": 0.2532, + "step": 274 + }, + { + "epoch": 0.02, + "grad_norm": 0.3041085515552397, + "learning_rate": 5.838641188959659e-07, + "loss": 0.0918, + "step": 275 + }, + { + "epoch": 0.02, + "grad_norm": 0.4770793774693724, + "learning_rate": 5.859872611464968e-07, + "loss": 0.1106, + "step": 276 + }, + { + "epoch": 0.02, + "grad_norm": 0.19562567545163273, + "learning_rate": 5.881104033970276e-07, + "loss": 0.1333, + "step": 277 + }, + { + "epoch": 0.02, + "grad_norm": 0.2707278306013383, + "learning_rate": 5.902335456475584e-07, + "loss": 0.1306, + "step": 278 + }, + { + "epoch": 0.02, + "grad_norm": 0.16296943278549977, + "learning_rate": 5.923566878980892e-07, + "loss": 0.0364, + "step": 279 + }, + { + "epoch": 0.02, + "grad_norm": 0.5386648312096373, + "learning_rate": 5.9447983014862e-07, + "loss": 0.0808, + "step": 280 + }, + { + "epoch": 0.02, + "grad_norm": 0.2942127275073548, + "learning_rate": 5.966029723991507e-07, + "loss": 0.2626, + "step": 281 + }, + { + "epoch": 0.02, + "grad_norm": 0.3119903160409143, + "learning_rate": 5.987261146496815e-07, + "loss": 0.1895, + "step": 282 + }, + { + "epoch": 0.02, + "grad_norm": 0.2737013844339624, + "learning_rate": 6.008492569002123e-07, + "loss": 0.1385, + "step": 283 + }, + { + "epoch": 0.02, + "grad_norm": 0.16855653755832362, + "learning_rate": 6.029723991507431e-07, + "loss": 0.1529, + "step": 284 + }, + { + "epoch": 0.02, + "grad_norm": 0.49718223528016353, + "learning_rate": 6.050955414012739e-07, + "loss": 0.4262, + "step": 285 + }, + { + "epoch": 0.02, + "grad_norm": 0.2627780618572779, + "learning_rate": 6.072186836518047e-07, + "loss": 0.3654, + "step": 286 + }, + { + "epoch": 0.02, + "grad_norm": 0.2270233033606091, + "learning_rate": 6.093418259023354e-07, + "loss": 0.1655, + "step": 287 + }, + { + "epoch": 0.02, + "grad_norm": 0.32556073415138226, + "learning_rate": 6.114649681528662e-07, + "loss": 0.1, + "step": 288 + }, + { + "epoch": 0.02, + "grad_norm": 0.0688499303724151, + "learning_rate": 6.13588110403397e-07, + "loss": 0.0131, + "step": 289 + }, + { + "epoch": 0.02, + "grad_norm": 0.19737684668244823, + "learning_rate": 6.157112526539278e-07, + "loss": 0.2689, + "step": 290 + }, + { + "epoch": 0.02, + "grad_norm": 0.22678346062623986, + "learning_rate": 6.178343949044586e-07, + "loss": 0.2251, + "step": 291 + }, + { + "epoch": 0.02, + "grad_norm": 0.21751630427029792, + "learning_rate": 6.199575371549894e-07, + "loss": 0.2584, + "step": 292 + }, + { + "epoch": 0.02, + "grad_norm": 0.2141418417559859, + "learning_rate": 6.220806794055201e-07, + "loss": 0.0784, + "step": 293 + }, + { + "epoch": 0.02, + "grad_norm": 0.1430074561073039, + "learning_rate": 6.242038216560509e-07, + "loss": 0.1433, + "step": 294 + }, + { + "epoch": 0.02, + "grad_norm": 0.2786748881959617, + "learning_rate": 6.263269639065817e-07, + "loss": 0.2284, + "step": 295 + }, + { + "epoch": 0.02, + "grad_norm": 0.2433691630645959, + "learning_rate": 6.284501061571125e-07, + "loss": 0.1846, + "step": 296 + }, + { + "epoch": 0.02, + "grad_norm": 0.7923395523484135, + "learning_rate": 6.305732484076433e-07, + "loss": 0.3029, + "step": 297 + }, + { + "epoch": 0.02, + "grad_norm": 0.1963409180306421, + "learning_rate": 6.326963906581741e-07, + "loss": 0.1897, + "step": 298 + }, + { + "epoch": 0.02, + "grad_norm": 0.28965013571514175, + "learning_rate": 6.348195329087048e-07, + "loss": 0.0649, + "step": 299 + }, + { + "epoch": 0.02, + "grad_norm": 0.10288304663737333, + "learning_rate": 6.369426751592356e-07, + "loss": 0.0935, + "step": 300 + }, + { + "epoch": 0.02, + "grad_norm": 0.2386140993298097, + "learning_rate": 6.390658174097664e-07, + "loss": 0.1989, + "step": 301 + }, + { + "epoch": 0.02, + "grad_norm": 0.2001752439033513, + "learning_rate": 6.411889596602972e-07, + "loss": 0.2559, + "step": 302 + }, + { + "epoch": 0.02, + "grad_norm": 0.14419532926564962, + "learning_rate": 6.43312101910828e-07, + "loss": 0.1163, + "step": 303 + }, + { + "epoch": 0.02, + "grad_norm": 0.20905738114675854, + "learning_rate": 6.454352441613588e-07, + "loss": 0.015, + "step": 304 + }, + { + "epoch": 0.02, + "grad_norm": 0.24380431007046924, + "learning_rate": 6.475583864118895e-07, + "loss": 0.112, + "step": 305 + }, + { + "epoch": 0.02, + "grad_norm": 0.3023639200080016, + "learning_rate": 6.496815286624203e-07, + "loss": 0.2968, + "step": 306 + }, + { + "epoch": 0.02, + "grad_norm": 0.2606866540139771, + "learning_rate": 6.518046709129511e-07, + "loss": 0.1304, + "step": 307 + }, + { + "epoch": 0.02, + "grad_norm": 0.2656805524127184, + "learning_rate": 6.539278131634819e-07, + "loss": 0.2697, + "step": 308 + }, + { + "epoch": 0.02, + "grad_norm": 0.3193198432455117, + "learning_rate": 6.560509554140127e-07, + "loss": 0.1832, + "step": 309 + }, + { + "epoch": 0.02, + "grad_norm": 0.2874561835870219, + "learning_rate": 6.581740976645435e-07, + "loss": 0.0569, + "step": 310 + }, + { + "epoch": 0.02, + "grad_norm": 0.3174031017726217, + "learning_rate": 6.602972399150743e-07, + "loss": 0.1652, + "step": 311 + }, + { + "epoch": 0.02, + "grad_norm": 0.20165045009761387, + "learning_rate": 6.62420382165605e-07, + "loss": 0.0533, + "step": 312 + }, + { + "epoch": 0.02, + "grad_norm": 0.26014267892389276, + "learning_rate": 6.645435244161358e-07, + "loss": 0.3167, + "step": 313 + }, + { + "epoch": 0.02, + "grad_norm": 0.1251119388682503, + "learning_rate": 6.666666666666666e-07, + "loss": 0.1834, + "step": 314 + }, + { + "epoch": 0.02, + "grad_norm": 0.3945081001141345, + "learning_rate": 6.687898089171974e-07, + "loss": 0.2264, + "step": 315 + }, + { + "epoch": 0.02, + "grad_norm": 0.3474798985267554, + "learning_rate": 6.709129511677282e-07, + "loss": 0.2383, + "step": 316 + }, + { + "epoch": 0.02, + "grad_norm": 0.1947402716044741, + "learning_rate": 6.73036093418259e-07, + "loss": 0.2622, + "step": 317 + }, + { + "epoch": 0.02, + "grad_norm": 0.22362536364169935, + "learning_rate": 6.751592356687897e-07, + "loss": 0.0834, + "step": 318 + }, + { + "epoch": 0.02, + "grad_norm": 0.11661468027254816, + "learning_rate": 6.772823779193205e-07, + "loss": 0.0487, + "step": 319 + }, + { + "epoch": 0.02, + "grad_norm": 0.17541144954253332, + "learning_rate": 6.794055201698513e-07, + "loss": 0.1103, + "step": 320 + }, + { + "epoch": 0.02, + "grad_norm": 0.13308947738335472, + "learning_rate": 6.815286624203821e-07, + "loss": 0.0908, + "step": 321 + }, + { + "epoch": 0.02, + "grad_norm": 0.28308382340263966, + "learning_rate": 6.836518046709129e-07, + "loss": 0.2402, + "step": 322 + }, + { + "epoch": 0.02, + "grad_norm": 0.16304676460641138, + "learning_rate": 6.857749469214437e-07, + "loss": 0.1215, + "step": 323 + }, + { + "epoch": 0.02, + "grad_norm": 0.2690315548677506, + "learning_rate": 6.878980891719744e-07, + "loss": 0.1385, + "step": 324 + }, + { + "epoch": 0.02, + "grad_norm": 0.28782689849209325, + "learning_rate": 6.900212314225052e-07, + "loss": 0.1083, + "step": 325 + }, + { + "epoch": 0.02, + "grad_norm": 0.32365798582335054, + "learning_rate": 6.92144373673036e-07, + "loss": 0.1046, + "step": 326 + }, + { + "epoch": 0.02, + "grad_norm": 0.3017609449576566, + "learning_rate": 6.942675159235668e-07, + "loss": 0.2847, + "step": 327 + }, + { + "epoch": 0.02, + "grad_norm": 0.26305840526601176, + "learning_rate": 6.963906581740976e-07, + "loss": 0.132, + "step": 328 + }, + { + "epoch": 0.02, + "grad_norm": 0.527745134158186, + "learning_rate": 6.985138004246284e-07, + "loss": 0.3415, + "step": 329 + }, + { + "epoch": 0.02, + "grad_norm": 0.2400082999037914, + "learning_rate": 7.006369426751591e-07, + "loss": 0.224, + "step": 330 + }, + { + "epoch": 0.02, + "grad_norm": 0.31498865831609935, + "learning_rate": 7.027600849256899e-07, + "loss": 0.2224, + "step": 331 + }, + { + "epoch": 0.02, + "grad_norm": 0.23024024876382654, + "learning_rate": 7.048832271762207e-07, + "loss": 0.1899, + "step": 332 + }, + { + "epoch": 0.02, + "grad_norm": 0.4186895240598085, + "learning_rate": 7.070063694267515e-07, + "loss": 0.2308, + "step": 333 + }, + { + "epoch": 0.02, + "grad_norm": 0.13259159510712912, + "learning_rate": 7.091295116772823e-07, + "loss": 0.1469, + "step": 334 + }, + { + "epoch": 0.02, + "grad_norm": 0.2373762968766546, + "learning_rate": 7.112526539278131e-07, + "loss": 0.1811, + "step": 335 + }, + { + "epoch": 0.02, + "grad_norm": 0.2953297267584623, + "learning_rate": 7.133757961783438e-07, + "loss": 0.2956, + "step": 336 + }, + { + "epoch": 0.02, + "grad_norm": 0.2963767478220723, + "learning_rate": 7.154989384288746e-07, + "loss": 0.27, + "step": 337 + }, + { + "epoch": 0.02, + "grad_norm": 0.22335625807264956, + "learning_rate": 7.176220806794054e-07, + "loss": 0.1269, + "step": 338 + }, + { + "epoch": 0.02, + "grad_norm": 0.32139447372756724, + "learning_rate": 7.197452229299362e-07, + "loss": 0.2346, + "step": 339 + }, + { + "epoch": 0.02, + "grad_norm": 0.23705093106999992, + "learning_rate": 7.21868365180467e-07, + "loss": 0.2248, + "step": 340 + }, + { + "epoch": 0.02, + "grad_norm": 0.22767202512578014, + "learning_rate": 7.239915074309978e-07, + "loss": 0.2066, + "step": 341 + }, + { + "epoch": 0.02, + "grad_norm": 0.18397356381653115, + "learning_rate": 7.261146496815285e-07, + "loss": 0.1799, + "step": 342 + }, + { + "epoch": 0.02, + "grad_norm": 0.20673006822064957, + "learning_rate": 7.282377919320593e-07, + "loss": 0.4484, + "step": 343 + }, + { + "epoch": 0.02, + "grad_norm": 0.0557207926549818, + "learning_rate": 7.303609341825901e-07, + "loss": 0.0687, + "step": 344 + }, + { + "epoch": 0.02, + "grad_norm": 0.3081173807962164, + "learning_rate": 7.324840764331209e-07, + "loss": 0.2799, + "step": 345 + }, + { + "epoch": 0.02, + "grad_norm": 0.18776167618511286, + "learning_rate": 7.346072186836517e-07, + "loss": 0.2008, + "step": 346 + }, + { + "epoch": 0.02, + "grad_norm": 0.3296962816092498, + "learning_rate": 7.367303609341825e-07, + "loss": 0.4462, + "step": 347 + }, + { + "epoch": 0.02, + "grad_norm": 0.1485189730693178, + "learning_rate": 7.388535031847132e-07, + "loss": 0.0807, + "step": 348 + }, + { + "epoch": 0.02, + "grad_norm": 0.6852474737176507, + "learning_rate": 7.40976645435244e-07, + "loss": 0.2873, + "step": 349 + }, + { + "epoch": 0.02, + "grad_norm": 0.3262034889242401, + "learning_rate": 7.430997876857748e-07, + "loss": 0.1459, + "step": 350 + }, + { + "epoch": 0.02, + "grad_norm": 0.15031259529226046, + "learning_rate": 7.452229299363056e-07, + "loss": 0.2377, + "step": 351 + }, + { + "epoch": 0.02, + "grad_norm": 0.16892482959317795, + "learning_rate": 7.473460721868364e-07, + "loss": 0.2246, + "step": 352 + }, + { + "epoch": 0.02, + "grad_norm": 0.25425496343673953, + "learning_rate": 7.494692144373672e-07, + "loss": 0.0964, + "step": 353 + }, + { + "epoch": 0.02, + "grad_norm": 0.13793263456492666, + "learning_rate": 7.515923566878982e-07, + "loss": 0.1007, + "step": 354 + }, + { + "epoch": 0.02, + "grad_norm": 0.2097891423154737, + "learning_rate": 7.537154989384289e-07, + "loss": 0.2488, + "step": 355 + }, + { + "epoch": 0.02, + "grad_norm": 0.08684897361559107, + "learning_rate": 7.558386411889597e-07, + "loss": 0.1011, + "step": 356 + }, + { + "epoch": 0.02, + "grad_norm": 0.14510536271973992, + "learning_rate": 7.579617834394905e-07, + "loss": 0.0371, + "step": 357 + }, + { + "epoch": 0.02, + "grad_norm": 0.2233939359582414, + "learning_rate": 7.600849256900213e-07, + "loss": 0.2513, + "step": 358 + }, + { + "epoch": 0.02, + "grad_norm": 0.1778509190137439, + "learning_rate": 7.622080679405521e-07, + "loss": 0.1493, + "step": 359 + }, + { + "epoch": 0.02, + "grad_norm": 0.29882084943077136, + "learning_rate": 7.643312101910829e-07, + "loss": 0.0833, + "step": 360 + }, + { + "epoch": 0.02, + "grad_norm": 0.27539509978181104, + "learning_rate": 7.664543524416136e-07, + "loss": 0.4315, + "step": 361 + }, + { + "epoch": 0.02, + "grad_norm": 0.43024944450808494, + "learning_rate": 7.685774946921444e-07, + "loss": 0.2586, + "step": 362 + }, + { + "epoch": 0.02, + "grad_norm": 0.29794875496527123, + "learning_rate": 7.707006369426752e-07, + "loss": 0.1013, + "step": 363 + }, + { + "epoch": 0.02, + "grad_norm": 0.11911472604816405, + "learning_rate": 7.72823779193206e-07, + "loss": 0.1041, + "step": 364 + }, + { + "epoch": 0.02, + "grad_norm": 0.16868684281299706, + "learning_rate": 7.749469214437368e-07, + "loss": 0.2312, + "step": 365 + }, + { + "epoch": 0.02, + "grad_norm": 0.1845638602526959, + "learning_rate": 7.770700636942676e-07, + "loss": 0.322, + "step": 366 + }, + { + "epoch": 0.02, + "grad_norm": 0.26984042118586926, + "learning_rate": 7.791932059447983e-07, + "loss": 0.2608, + "step": 367 + }, + { + "epoch": 0.02, + "grad_norm": 0.16810527194085234, + "learning_rate": 7.813163481953291e-07, + "loss": 0.2321, + "step": 368 + }, + { + "epoch": 0.02, + "grad_norm": 0.06283977236084466, + "learning_rate": 7.834394904458599e-07, + "loss": 0.0854, + "step": 369 + }, + { + "epoch": 0.02, + "grad_norm": 0.3218832078829887, + "learning_rate": 7.855626326963907e-07, + "loss": 0.2296, + "step": 370 + }, + { + "epoch": 0.02, + "grad_norm": 0.1727750015726721, + "learning_rate": 7.876857749469215e-07, + "loss": 0.0982, + "step": 371 + }, + { + "epoch": 0.02, + "grad_norm": 0.2994955471352695, + "learning_rate": 7.898089171974523e-07, + "loss": 0.0651, + "step": 372 + }, + { + "epoch": 0.02, + "grad_norm": 0.3306255736205435, + "learning_rate": 7.91932059447983e-07, + "loss": 0.2255, + "step": 373 + }, + { + "epoch": 0.02, + "grad_norm": 0.3446522069251835, + "learning_rate": 7.940552016985138e-07, + "loss": 0.2778, + "step": 374 + }, + { + "epoch": 0.02, + "grad_norm": 0.15815947749538553, + "learning_rate": 7.961783439490446e-07, + "loss": 0.0535, + "step": 375 + }, + { + "epoch": 0.02, + "grad_norm": 0.10367095348687529, + "learning_rate": 7.983014861995754e-07, + "loss": 0.049, + "step": 376 + }, + { + "epoch": 0.02, + "grad_norm": 0.249563603920521, + "learning_rate": 8.004246284501062e-07, + "loss": 0.2211, + "step": 377 + }, + { + "epoch": 0.02, + "grad_norm": 0.3102621403983259, + "learning_rate": 8.02547770700637e-07, + "loss": 0.158, + "step": 378 + }, + { + "epoch": 0.02, + "grad_norm": 0.45287223375211105, + "learning_rate": 8.046709129511678e-07, + "loss": 0.1806, + "step": 379 + }, + { + "epoch": 0.02, + "grad_norm": 0.24097223015886662, + "learning_rate": 8.067940552016985e-07, + "loss": 0.171, + "step": 380 + }, + { + "epoch": 0.02, + "grad_norm": 0.2050006159478445, + "learning_rate": 8.089171974522293e-07, + "loss": 0.0992, + "step": 381 + }, + { + "epoch": 0.02, + "grad_norm": 0.20793762739204616, + "learning_rate": 8.110403397027601e-07, + "loss": 0.1235, + "step": 382 + }, + { + "epoch": 0.02, + "grad_norm": 0.19965927601019257, + "learning_rate": 8.131634819532909e-07, + "loss": 0.5675, + "step": 383 + }, + { + "epoch": 0.02, + "grad_norm": 0.4643045643537523, + "learning_rate": 8.152866242038217e-07, + "loss": 0.1717, + "step": 384 + }, + { + "epoch": 0.02, + "grad_norm": 0.12032396043051378, + "learning_rate": 8.174097664543525e-07, + "loss": 0.1565, + "step": 385 + }, + { + "epoch": 0.02, + "grad_norm": 0.3713133529651683, + "learning_rate": 8.195329087048832e-07, + "loss": 0.0515, + "step": 386 + }, + { + "epoch": 0.02, + "grad_norm": 0.3202619840308792, + "learning_rate": 8.21656050955414e-07, + "loss": 0.291, + "step": 387 + }, + { + "epoch": 0.02, + "grad_norm": 0.19443370285639025, + "learning_rate": 8.237791932059448e-07, + "loss": 0.1902, + "step": 388 + }, + { + "epoch": 0.02, + "grad_norm": 0.19849023334196433, + "learning_rate": 8.259023354564756e-07, + "loss": 0.1461, + "step": 389 + }, + { + "epoch": 0.02, + "grad_norm": 0.214838462555769, + "learning_rate": 8.280254777070064e-07, + "loss": 0.1556, + "step": 390 + }, + { + "epoch": 0.02, + "grad_norm": 0.13309551953092594, + "learning_rate": 8.301486199575372e-07, + "loss": 0.0695, + "step": 391 + }, + { + "epoch": 0.02, + "grad_norm": 0.47221558686874704, + "learning_rate": 8.322717622080679e-07, + "loss": 0.3792, + "step": 392 + }, + { + "epoch": 0.03, + "grad_norm": 0.3251883958033443, + "learning_rate": 8.343949044585987e-07, + "loss": 0.1702, + "step": 393 + }, + { + "epoch": 0.03, + "grad_norm": 0.13748152191871976, + "learning_rate": 8.365180467091295e-07, + "loss": 0.0414, + "step": 394 + }, + { + "epoch": 0.03, + "grad_norm": 0.2577512517607871, + "learning_rate": 8.386411889596603e-07, + "loss": 0.2511, + "step": 395 + }, + { + "epoch": 0.03, + "grad_norm": 0.18287998914848866, + "learning_rate": 8.407643312101911e-07, + "loss": 0.1164, + "step": 396 + }, + { + "epoch": 0.03, + "grad_norm": 0.17709339849932243, + "learning_rate": 8.428874734607219e-07, + "loss": 0.0207, + "step": 397 + }, + { + "epoch": 0.03, + "grad_norm": 0.2298605814218488, + "learning_rate": 8.450106157112526e-07, + "loss": 0.2123, + "step": 398 + }, + { + "epoch": 0.03, + "grad_norm": 0.3962396303985176, + "learning_rate": 8.471337579617834e-07, + "loss": 0.4138, + "step": 399 + }, + { + "epoch": 0.03, + "grad_norm": 0.2808567887236285, + "learning_rate": 8.492569002123142e-07, + "loss": 0.4247, + "step": 400 + }, + { + "epoch": 0.03, + "grad_norm": 0.23215587936104334, + "learning_rate": 8.51380042462845e-07, + "loss": 0.1985, + "step": 401 + }, + { + "epoch": 0.03, + "grad_norm": 0.2939695074098435, + "learning_rate": 8.535031847133758e-07, + "loss": 0.1631, + "step": 402 + }, + { + "epoch": 0.03, + "grad_norm": 0.23766967230175637, + "learning_rate": 8.556263269639066e-07, + "loss": 0.1726, + "step": 403 + }, + { + "epoch": 0.03, + "grad_norm": 0.30850612413413675, + "learning_rate": 8.577494692144373e-07, + "loss": 0.2422, + "step": 404 + }, + { + "epoch": 0.03, + "grad_norm": 0.2413195428537783, + "learning_rate": 8.598726114649681e-07, + "loss": 0.5659, + "step": 405 + }, + { + "epoch": 0.03, + "grad_norm": 0.13921330227742174, + "learning_rate": 8.619957537154989e-07, + "loss": 0.0168, + "step": 406 + }, + { + "epoch": 0.03, + "grad_norm": 0.2163843683745216, + "learning_rate": 8.641188959660297e-07, + "loss": 0.3101, + "step": 407 + }, + { + "epoch": 0.03, + "grad_norm": 0.2196577154163935, + "learning_rate": 8.662420382165605e-07, + "loss": 0.3257, + "step": 408 + }, + { + "epoch": 0.03, + "grad_norm": 0.6236373549005418, + "learning_rate": 8.683651804670913e-07, + "loss": 0.3986, + "step": 409 + }, + { + "epoch": 0.03, + "grad_norm": 0.12046982288675889, + "learning_rate": 8.70488322717622e-07, + "loss": 0.125, + "step": 410 + }, + { + "epoch": 0.03, + "grad_norm": 0.2581329329149649, + "learning_rate": 8.726114649681528e-07, + "loss": 0.4828, + "step": 411 + }, + { + "epoch": 0.03, + "grad_norm": 0.2851014489197922, + "learning_rate": 8.747346072186836e-07, + "loss": 0.2913, + "step": 412 + }, + { + "epoch": 0.03, + "grad_norm": 0.2410155492065017, + "learning_rate": 8.768577494692144e-07, + "loss": 0.1909, + "step": 413 + }, + { + "epoch": 0.03, + "grad_norm": 0.17536848451664563, + "learning_rate": 8.789808917197452e-07, + "loss": 0.1295, + "step": 414 + }, + { + "epoch": 0.03, + "grad_norm": 0.38772909706184794, + "learning_rate": 8.81104033970276e-07, + "loss": 0.188, + "step": 415 + }, + { + "epoch": 0.03, + "grad_norm": 0.28915557836813943, + "learning_rate": 8.832271762208067e-07, + "loss": 0.1589, + "step": 416 + }, + { + "epoch": 0.03, + "grad_norm": 0.28975293174705274, + "learning_rate": 8.853503184713375e-07, + "loss": 0.1225, + "step": 417 + }, + { + "epoch": 0.03, + "grad_norm": 0.22249415674474898, + "learning_rate": 8.874734607218683e-07, + "loss": 0.2004, + "step": 418 + }, + { + "epoch": 0.03, + "grad_norm": 0.2940454009135285, + "learning_rate": 8.895966029723991e-07, + "loss": 0.1272, + "step": 419 + }, + { + "epoch": 0.03, + "grad_norm": 0.42358785034734187, + "learning_rate": 8.917197452229299e-07, + "loss": 0.0576, + "step": 420 + }, + { + "epoch": 0.03, + "grad_norm": 0.21235244241456386, + "learning_rate": 8.938428874734607e-07, + "loss": 0.2632, + "step": 421 + }, + { + "epoch": 0.03, + "grad_norm": 0.37224640019004696, + "learning_rate": 8.959660297239914e-07, + "loss": 0.2549, + "step": 422 + }, + { + "epoch": 0.03, + "grad_norm": 0.36378654543030986, + "learning_rate": 8.980891719745222e-07, + "loss": 0.3476, + "step": 423 + }, + { + "epoch": 0.03, + "grad_norm": 0.2566243356722543, + "learning_rate": 9.00212314225053e-07, + "loss": 0.2631, + "step": 424 + }, + { + "epoch": 0.03, + "grad_norm": 0.4315065982171834, + "learning_rate": 9.023354564755838e-07, + "loss": 0.2681, + "step": 425 + }, + { + "epoch": 0.03, + "grad_norm": 0.13878811119352702, + "learning_rate": 9.044585987261146e-07, + "loss": 0.0866, + "step": 426 + }, + { + "epoch": 0.03, + "grad_norm": 0.098010200776135, + "learning_rate": 9.065817409766454e-07, + "loss": 0.0576, + "step": 427 + }, + { + "epoch": 0.03, + "grad_norm": 0.18421455741569376, + "learning_rate": 9.087048832271762e-07, + "loss": 0.1636, + "step": 428 + }, + { + "epoch": 0.03, + "grad_norm": 0.22135093664942307, + "learning_rate": 9.108280254777069e-07, + "loss": 0.1907, + "step": 429 + }, + { + "epoch": 0.03, + "grad_norm": 0.2613270863068073, + "learning_rate": 9.129511677282377e-07, + "loss": 0.0568, + "step": 430 + }, + { + "epoch": 0.03, + "grad_norm": 0.4885576563006937, + "learning_rate": 9.150743099787685e-07, + "loss": 0.1314, + "step": 431 + }, + { + "epoch": 0.03, + "grad_norm": 0.3843438674385242, + "learning_rate": 9.171974522292994e-07, + "loss": 0.2748, + "step": 432 + }, + { + "epoch": 0.03, + "grad_norm": 0.3162268006181552, + "learning_rate": 9.193205944798302e-07, + "loss": 0.2064, + "step": 433 + }, + { + "epoch": 0.03, + "grad_norm": 0.18415232052742567, + "learning_rate": 9.21443736730361e-07, + "loss": 0.1955, + "step": 434 + }, + { + "epoch": 0.03, + "grad_norm": 0.1977546707271865, + "learning_rate": 9.235668789808917e-07, + "loss": 0.1213, + "step": 435 + }, + { + "epoch": 0.03, + "grad_norm": 0.17277134525309018, + "learning_rate": 9.256900212314225e-07, + "loss": 0.288, + "step": 436 + }, + { + "epoch": 0.03, + "grad_norm": 0.25520185550781, + "learning_rate": 9.278131634819533e-07, + "loss": 0.1035, + "step": 437 + }, + { + "epoch": 0.03, + "grad_norm": 0.17704131808478485, + "learning_rate": 9.299363057324841e-07, + "loss": 0.0181, + "step": 438 + }, + { + "epoch": 0.03, + "grad_norm": 0.3245504988404467, + "learning_rate": 9.320594479830149e-07, + "loss": 0.2044, + "step": 439 + }, + { + "epoch": 0.03, + "grad_norm": 0.18244120441800724, + "learning_rate": 9.341825902335457e-07, + "loss": 0.2578, + "step": 440 + }, + { + "epoch": 0.03, + "grad_norm": 0.21902255767587192, + "learning_rate": 9.363057324840764e-07, + "loss": 0.1456, + "step": 441 + }, + { + "epoch": 0.03, + "grad_norm": 0.1695979665834792, + "learning_rate": 9.384288747346072e-07, + "loss": 0.1774, + "step": 442 + }, + { + "epoch": 0.03, + "grad_norm": 0.2676615780508529, + "learning_rate": 9.40552016985138e-07, + "loss": 0.2396, + "step": 443 + }, + { + "epoch": 0.03, + "grad_norm": 0.34088965282414585, + "learning_rate": 9.426751592356688e-07, + "loss": 0.1528, + "step": 444 + }, + { + "epoch": 0.03, + "grad_norm": 0.07925269437411404, + "learning_rate": 9.447983014861996e-07, + "loss": 0.0918, + "step": 445 + }, + { + "epoch": 0.03, + "grad_norm": 0.13870357480386233, + "learning_rate": 9.469214437367304e-07, + "loss": 0.0852, + "step": 446 + }, + { + "epoch": 0.03, + "grad_norm": 0.32870650312044764, + "learning_rate": 9.490445859872611e-07, + "loss": 0.1525, + "step": 447 + }, + { + "epoch": 0.03, + "grad_norm": 0.24305219592733074, + "learning_rate": 9.511677282377919e-07, + "loss": 0.2167, + "step": 448 + }, + { + "epoch": 0.03, + "grad_norm": 0.31525378059408554, + "learning_rate": 9.532908704883227e-07, + "loss": 0.364, + "step": 449 + }, + { + "epoch": 0.03, + "grad_norm": 0.4545081071608993, + "learning_rate": 9.554140127388535e-07, + "loss": 0.0589, + "step": 450 + }, + { + "epoch": 0.03, + "grad_norm": 0.1864295918607807, + "learning_rate": 9.575371549893843e-07, + "loss": 0.1124, + "step": 451 + }, + { + "epoch": 0.03, + "grad_norm": 0.1497477565593572, + "learning_rate": 9.59660297239915e-07, + "loss": 0.2063, + "step": 452 + }, + { + "epoch": 0.03, + "grad_norm": 0.15682831310264794, + "learning_rate": 9.617834394904458e-07, + "loss": 0.1217, + "step": 453 + }, + { + "epoch": 0.03, + "grad_norm": 0.23565246524026945, + "learning_rate": 9.639065817409766e-07, + "loss": 0.2472, + "step": 454 + }, + { + "epoch": 0.03, + "grad_norm": 0.21338418196035058, + "learning_rate": 9.660297239915074e-07, + "loss": 0.1409, + "step": 455 + }, + { + "epoch": 0.03, + "grad_norm": 0.32363018578448816, + "learning_rate": 9.681528662420382e-07, + "loss": 0.0964, + "step": 456 + }, + { + "epoch": 0.03, + "grad_norm": 0.36305631286103945, + "learning_rate": 9.70276008492569e-07, + "loss": 0.2474, + "step": 457 + }, + { + "epoch": 0.03, + "grad_norm": 0.1315636111272709, + "learning_rate": 9.723991507430998e-07, + "loss": 0.0133, + "step": 458 + }, + { + "epoch": 0.03, + "grad_norm": 0.36665855866774594, + "learning_rate": 9.745222929936306e-07, + "loss": 0.0556, + "step": 459 + }, + { + "epoch": 0.03, + "grad_norm": 0.2932467384725678, + "learning_rate": 9.766454352441613e-07, + "loss": 0.4045, + "step": 460 + }, + { + "epoch": 0.03, + "grad_norm": 0.19772647917989614, + "learning_rate": 9.787685774946921e-07, + "loss": 0.1258, + "step": 461 + }, + { + "epoch": 0.03, + "grad_norm": 0.1944562471708232, + "learning_rate": 9.80891719745223e-07, + "loss": 0.1912, + "step": 462 + }, + { + "epoch": 0.03, + "grad_norm": 0.5129004712182335, + "learning_rate": 9.830148619957537e-07, + "loss": 0.0369, + "step": 463 + }, + { + "epoch": 0.03, + "grad_norm": 0.27134534580369873, + "learning_rate": 9.851380042462845e-07, + "loss": 0.2076, + "step": 464 + }, + { + "epoch": 0.03, + "grad_norm": 0.6036942576112112, + "learning_rate": 9.872611464968153e-07, + "loss": 0.1239, + "step": 465 + }, + { + "epoch": 0.03, + "grad_norm": 0.4238546988928349, + "learning_rate": 9.89384288747346e-07, + "loss": 0.1025, + "step": 466 + }, + { + "epoch": 0.03, + "grad_norm": 0.27541270253213207, + "learning_rate": 9.915074309978768e-07, + "loss": 0.194, + "step": 467 + }, + { + "epoch": 0.03, + "grad_norm": 0.2546318338738281, + "learning_rate": 9.936305732484076e-07, + "loss": 0.2443, + "step": 468 + }, + { + "epoch": 0.03, + "grad_norm": 0.18805689752215418, + "learning_rate": 9.957537154989384e-07, + "loss": 0.1765, + "step": 469 + }, + { + "epoch": 0.03, + "grad_norm": 0.29960419963214957, + "learning_rate": 9.978768577494692e-07, + "loss": 0.2171, + "step": 470 + }, + { + "epoch": 0.03, + "grad_norm": 0.34422612896806737, + "learning_rate": 1e-06, + "loss": 0.4547, + "step": 471 + }, + { + "epoch": 0.03, + "grad_norm": 0.21025726996204436, + "learning_rate": 9.999999893344976e-07, + "loss": 0.2975, + "step": 472 + }, + { + "epoch": 0.03, + "grad_norm": 0.3184464372094676, + "learning_rate": 9.999999573379902e-07, + "loss": 0.3635, + "step": 473 + }, + { + "epoch": 0.03, + "grad_norm": 0.22529194348745743, + "learning_rate": 9.999999040104795e-07, + "loss": 0.1233, + "step": 474 + }, + { + "epoch": 0.03, + "grad_norm": 0.17920512378839853, + "learning_rate": 9.99999829351968e-07, + "loss": 0.1818, + "step": 475 + }, + { + "epoch": 0.03, + "grad_norm": 0.21146143920686314, + "learning_rate": 9.999997333624587e-07, + "loss": 0.1197, + "step": 476 + }, + { + "epoch": 0.03, + "grad_norm": 0.21073388057143283, + "learning_rate": 9.999996160419555e-07, + "loss": 0.1825, + "step": 477 + }, + { + "epoch": 0.03, + "grad_norm": 0.16078577139204342, + "learning_rate": 9.999994773904636e-07, + "loss": 0.2614, + "step": 478 + }, + { + "epoch": 0.03, + "grad_norm": 0.2761128508668464, + "learning_rate": 9.999993174079888e-07, + "loss": 0.1577, + "step": 479 + }, + { + "epoch": 0.03, + "grad_norm": 0.2885503791210814, + "learning_rate": 9.999991360945382e-07, + "loss": 0.1658, + "step": 480 + }, + { + "epoch": 0.03, + "grad_norm": 0.5056550557135914, + "learning_rate": 9.99998933450119e-07, + "loss": 0.1356, + "step": 481 + }, + { + "epoch": 0.03, + "grad_norm": 0.2587423302824715, + "learning_rate": 9.999987094747404e-07, + "loss": 0.1906, + "step": 482 + }, + { + "epoch": 0.03, + "grad_norm": 0.27175524731807416, + "learning_rate": 9.999984641684116e-07, + "loss": 0.1844, + "step": 483 + }, + { + "epoch": 0.03, + "grad_norm": 0.16102534545447217, + "learning_rate": 9.999981975311433e-07, + "loss": 0.1798, + "step": 484 + }, + { + "epoch": 0.03, + "grad_norm": 0.13692929348976293, + "learning_rate": 9.999979095629469e-07, + "loss": 0.1092, + "step": 485 + }, + { + "epoch": 0.03, + "grad_norm": 0.19888672041303426, + "learning_rate": 9.999976002638344e-07, + "loss": 0.3372, + "step": 486 + }, + { + "epoch": 0.03, + "grad_norm": 0.26586280153311215, + "learning_rate": 9.99997269633819e-07, + "loss": 0.1877, + "step": 487 + }, + { + "epoch": 0.03, + "grad_norm": 0.10917284468967871, + "learning_rate": 9.999969176729153e-07, + "loss": 0.1221, + "step": 488 + }, + { + "epoch": 0.03, + "grad_norm": 0.650651973475683, + "learning_rate": 9.999965443811376e-07, + "loss": 0.1607, + "step": 489 + }, + { + "epoch": 0.03, + "grad_norm": 0.2145075448697716, + "learning_rate": 9.999961497585024e-07, + "loss": 0.0983, + "step": 490 + }, + { + "epoch": 0.03, + "grad_norm": 0.6452394084802534, + "learning_rate": 9.999957338050265e-07, + "loss": 0.3028, + "step": 491 + }, + { + "epoch": 0.03, + "grad_norm": 0.29862832309920007, + "learning_rate": 9.999952965207273e-07, + "loss": 0.1307, + "step": 492 + }, + { + "epoch": 0.03, + "grad_norm": 0.32137994202195397, + "learning_rate": 9.999948379056235e-07, + "loss": 0.1424, + "step": 493 + }, + { + "epoch": 0.03, + "grad_norm": 0.38034356609412096, + "learning_rate": 9.99994357959735e-07, + "loss": 0.3911, + "step": 494 + }, + { + "epoch": 0.03, + "grad_norm": 0.15666426264417455, + "learning_rate": 9.99993856683082e-07, + "loss": 0.2112, + "step": 495 + }, + { + "epoch": 0.03, + "grad_norm": 0.12027522270375962, + "learning_rate": 9.99993334075686e-07, + "loss": 0.0216, + "step": 496 + }, + { + "epoch": 0.03, + "grad_norm": 0.10923574165943764, + "learning_rate": 9.99992790137569e-07, + "loss": 0.1166, + "step": 497 + }, + { + "epoch": 0.03, + "grad_norm": 0.14919427521926085, + "learning_rate": 9.999922248687548e-07, + "loss": 0.1056, + "step": 498 + }, + { + "epoch": 0.03, + "grad_norm": 0.3044315277531875, + "learning_rate": 9.99991638269267e-07, + "loss": 0.1135, + "step": 499 + }, + { + "epoch": 0.03, + "grad_norm": 0.6105525827468412, + "learning_rate": 9.999910303391308e-07, + "loss": 0.1656, + "step": 500 + }, + { + "epoch": 0.03, + "grad_norm": 0.23900556126543304, + "learning_rate": 9.999904010783723e-07, + "loss": 0.1068, + "step": 501 + }, + { + "epoch": 0.03, + "grad_norm": 0.394750778990148, + "learning_rate": 9.999897504870182e-07, + "loss": 0.2906, + "step": 502 + }, + { + "epoch": 0.03, + "grad_norm": 0.12818377999218414, + "learning_rate": 9.99989078565096e-07, + "loss": 0.1879, + "step": 503 + }, + { + "epoch": 0.03, + "grad_norm": 0.3565725372269199, + "learning_rate": 9.999883853126348e-07, + "loss": 0.134, + "step": 504 + }, + { + "epoch": 0.03, + "grad_norm": 0.26619545869125333, + "learning_rate": 9.999876707296637e-07, + "loss": 0.2273, + "step": 505 + }, + { + "epoch": 0.03, + "grad_norm": 0.2532572699129161, + "learning_rate": 9.999869348162139e-07, + "loss": 0.021, + "step": 506 + }, + { + "epoch": 0.03, + "grad_norm": 0.16057755469215762, + "learning_rate": 9.999861775723161e-07, + "loss": 0.038, + "step": 507 + }, + { + "epoch": 0.03, + "grad_norm": 0.14580176791619404, + "learning_rate": 9.999853989980027e-07, + "loss": 0.1291, + "step": 508 + }, + { + "epoch": 0.03, + "grad_norm": 0.23289785982715136, + "learning_rate": 9.999845990933073e-07, + "loss": 0.1479, + "step": 509 + }, + { + "epoch": 0.03, + "grad_norm": 0.33869594797132546, + "learning_rate": 9.999837778582638e-07, + "loss": 0.1374, + "step": 510 + }, + { + "epoch": 0.03, + "grad_norm": 0.1817354272596778, + "learning_rate": 9.999829352929074e-07, + "loss": 0.0891, + "step": 511 + }, + { + "epoch": 0.03, + "grad_norm": 0.3110651313704324, + "learning_rate": 9.999820713972737e-07, + "loss": 0.0974, + "step": 512 + }, + { + "epoch": 0.03, + "grad_norm": 0.23108817473383325, + "learning_rate": 9.999811861713998e-07, + "loss": 0.2463, + "step": 513 + }, + { + "epoch": 0.03, + "grad_norm": 0.15727001210456104, + "learning_rate": 9.999802796153234e-07, + "loss": 0.015, + "step": 514 + }, + { + "epoch": 0.03, + "grad_norm": 0.26637691734140634, + "learning_rate": 9.99979351729083e-07, + "loss": 0.3334, + "step": 515 + }, + { + "epoch": 0.03, + "grad_norm": 0.2124085837214984, + "learning_rate": 9.999784025127185e-07, + "loss": 0.0869, + "step": 516 + }, + { + "epoch": 0.03, + "grad_norm": 0.199038642426334, + "learning_rate": 9.999774319662703e-07, + "loss": 0.1575, + "step": 517 + }, + { + "epoch": 0.03, + "grad_norm": 0.27989246765365144, + "learning_rate": 9.999764400897798e-07, + "loss": 0.0992, + "step": 518 + }, + { + "epoch": 0.03, + "grad_norm": 0.24058349608902685, + "learning_rate": 9.999754268832893e-07, + "loss": 0.4741, + "step": 519 + }, + { + "epoch": 0.03, + "grad_norm": 0.21542311327582966, + "learning_rate": 9.99974392346842e-07, + "loss": 0.2214, + "step": 520 + }, + { + "epoch": 0.03, + "grad_norm": 0.3549191787539132, + "learning_rate": 9.999733364804818e-07, + "loss": 0.0226, + "step": 521 + }, + { + "epoch": 0.03, + "grad_norm": 0.2232365983566046, + "learning_rate": 9.999722592842543e-07, + "loss": 0.1226, + "step": 522 + }, + { + "epoch": 0.03, + "grad_norm": 0.2583979267337865, + "learning_rate": 9.999711607582052e-07, + "loss": 0.2159, + "step": 523 + }, + { + "epoch": 0.03, + "grad_norm": 0.2723403267606881, + "learning_rate": 9.99970040902381e-07, + "loss": 0.3245, + "step": 524 + }, + { + "epoch": 0.03, + "grad_norm": 0.2542485452458295, + "learning_rate": 9.999688997168301e-07, + "loss": 0.2327, + "step": 525 + }, + { + "epoch": 0.03, + "grad_norm": 0.41809755135138027, + "learning_rate": 9.999677372016007e-07, + "loss": 0.1287, + "step": 526 + }, + { + "epoch": 0.03, + "grad_norm": 0.27112163543158724, + "learning_rate": 9.999665533567426e-07, + "loss": 0.3427, + "step": 527 + }, + { + "epoch": 0.03, + "grad_norm": 0.26191221466291176, + "learning_rate": 9.999653481823063e-07, + "loss": 0.2782, + "step": 528 + }, + { + "epoch": 0.03, + "grad_norm": 0.2102654999678496, + "learning_rate": 9.99964121678343e-07, + "loss": 0.3097, + "step": 529 + }, + { + "epoch": 0.03, + "grad_norm": 0.11716270764715653, + "learning_rate": 9.999628738449055e-07, + "loss": 0.0224, + "step": 530 + }, + { + "epoch": 0.03, + "grad_norm": 0.2342432126524212, + "learning_rate": 9.999616046820466e-07, + "loss": 0.1761, + "step": 531 + }, + { + "epoch": 0.03, + "grad_norm": 0.528700125502073, + "learning_rate": 9.999603141898207e-07, + "loss": 0.3711, + "step": 532 + }, + { + "epoch": 0.03, + "grad_norm": 0.22774259370215127, + "learning_rate": 9.999590023682826e-07, + "loss": 0.1242, + "step": 533 + }, + { + "epoch": 0.03, + "grad_norm": 0.25913121938300426, + "learning_rate": 9.999576692174884e-07, + "loss": 0.3121, + "step": 534 + }, + { + "epoch": 0.03, + "grad_norm": 0.2695919605172889, + "learning_rate": 9.999563147374952e-07, + "loss": 0.2923, + "step": 535 + }, + { + "epoch": 0.03, + "grad_norm": 0.17673669493872907, + "learning_rate": 9.999549389283606e-07, + "loss": 0.135, + "step": 536 + }, + { + "epoch": 0.03, + "grad_norm": 0.23203256134608177, + "learning_rate": 9.99953541790143e-07, + "loss": 0.0209, + "step": 537 + }, + { + "epoch": 0.03, + "grad_norm": 0.39836612884069494, + "learning_rate": 9.999521233229025e-07, + "loss": 0.351, + "step": 538 + }, + { + "epoch": 0.03, + "grad_norm": 0.34472204539733897, + "learning_rate": 9.999506835266992e-07, + "loss": 0.2228, + "step": 539 + }, + { + "epoch": 0.03, + "grad_norm": 0.264377801633734, + "learning_rate": 9.999492224015948e-07, + "loss": 0.0628, + "step": 540 + }, + { + "epoch": 0.03, + "grad_norm": 0.20910132050425237, + "learning_rate": 9.999477399476516e-07, + "loss": 0.1251, + "step": 541 + }, + { + "epoch": 0.03, + "grad_norm": 0.23636435994906232, + "learning_rate": 9.999462361649327e-07, + "loss": 0.2616, + "step": 542 + }, + { + "epoch": 0.03, + "grad_norm": 0.40297277864631065, + "learning_rate": 9.999447110535024e-07, + "loss": 0.3826, + "step": 543 + }, + { + "epoch": 0.03, + "grad_norm": 0.28148856030908415, + "learning_rate": 9.999431646134257e-07, + "loss": 0.1426, + "step": 544 + }, + { + "epoch": 0.03, + "grad_norm": 0.26535308731379725, + "learning_rate": 9.999415968447687e-07, + "loss": 0.4973, + "step": 545 + }, + { + "epoch": 0.03, + "grad_norm": 0.14706290474883787, + "learning_rate": 9.99940007747598e-07, + "loss": 0.1553, + "step": 546 + }, + { + "epoch": 0.03, + "grad_norm": 0.17983524372845375, + "learning_rate": 9.999383973219816e-07, + "loss": 0.2053, + "step": 547 + }, + { + "epoch": 0.03, + "grad_norm": 0.1621418678808146, + "learning_rate": 9.999367655679881e-07, + "loss": 0.0513, + "step": 548 + }, + { + "epoch": 0.04, + "grad_norm": 0.06329448454305941, + "learning_rate": 9.999351124856873e-07, + "loss": 0.0875, + "step": 549 + }, + { + "epoch": 0.04, + "grad_norm": 0.20894730328341823, + "learning_rate": 9.999334380751497e-07, + "loss": 0.0524, + "step": 550 + }, + { + "epoch": 0.04, + "grad_norm": 0.3801867943102373, + "learning_rate": 9.999317423364465e-07, + "loss": 0.2819, + "step": 551 + }, + { + "epoch": 0.04, + "grad_norm": 0.12575459885606005, + "learning_rate": 9.9993002526965e-07, + "loss": 0.174, + "step": 552 + }, + { + "epoch": 0.04, + "grad_norm": 0.19373342460912157, + "learning_rate": 9.99928286874834e-07, + "loss": 0.2381, + "step": 553 + }, + { + "epoch": 0.04, + "grad_norm": 0.290343647970398, + "learning_rate": 9.99926527152072e-07, + "loss": 0.2266, + "step": 554 + }, + { + "epoch": 0.04, + "grad_norm": 0.21789424364573587, + "learning_rate": 9.999247461014395e-07, + "loss": 0.1309, + "step": 555 + }, + { + "epoch": 0.04, + "grad_norm": 0.3589600866174122, + "learning_rate": 9.999229437230123e-07, + "loss": 0.1268, + "step": 556 + }, + { + "epoch": 0.04, + "grad_norm": 0.32039536701754084, + "learning_rate": 9.999211200168675e-07, + "loss": 0.0894, + "step": 557 + }, + { + "epoch": 0.04, + "grad_norm": 0.602307055588835, + "learning_rate": 9.999192749830827e-07, + "loss": 0.2676, + "step": 558 + }, + { + "epoch": 0.04, + "grad_norm": 0.1731340196071816, + "learning_rate": 9.999174086217368e-07, + "loss": 0.2218, + "step": 559 + }, + { + "epoch": 0.04, + "grad_norm": 0.21485030799997704, + "learning_rate": 9.99915520932909e-07, + "loss": 0.197, + "step": 560 + }, + { + "epoch": 0.04, + "grad_norm": 0.09116657996386676, + "learning_rate": 9.999136119166803e-07, + "loss": 0.0076, + "step": 561 + }, + { + "epoch": 0.04, + "grad_norm": 0.2595362787199046, + "learning_rate": 9.999116815731318e-07, + "loss": 0.1199, + "step": 562 + }, + { + "epoch": 0.04, + "grad_norm": 0.2943434903221813, + "learning_rate": 9.999097299023463e-07, + "loss": 0.2567, + "step": 563 + }, + { + "epoch": 0.04, + "grad_norm": 0.2549772125656584, + "learning_rate": 9.999077569044066e-07, + "loss": 0.2859, + "step": 564 + }, + { + "epoch": 0.04, + "grad_norm": 0.11980427571428975, + "learning_rate": 9.999057625793969e-07, + "loss": 0.0486, + "step": 565 + }, + { + "epoch": 0.04, + "grad_norm": 0.23365723416624679, + "learning_rate": 9.999037469274026e-07, + "loss": 0.1725, + "step": 566 + }, + { + "epoch": 0.04, + "grad_norm": 0.07751232484547327, + "learning_rate": 9.999017099485095e-07, + "loss": 0.0058, + "step": 567 + }, + { + "epoch": 0.04, + "grad_norm": 0.47467674056310083, + "learning_rate": 9.998996516428045e-07, + "loss": 0.2391, + "step": 568 + }, + { + "epoch": 0.04, + "grad_norm": 0.30761449261074536, + "learning_rate": 9.998975720103756e-07, + "loss": 0.2047, + "step": 569 + }, + { + "epoch": 0.04, + "grad_norm": 0.18044750339675913, + "learning_rate": 9.99895471051311e-07, + "loss": 0.2432, + "step": 570 + }, + { + "epoch": 0.04, + "grad_norm": 0.28473339587930035, + "learning_rate": 9.99893348765701e-07, + "loss": 0.1458, + "step": 571 + }, + { + "epoch": 0.04, + "grad_norm": 0.20667850273765226, + "learning_rate": 9.998912051536358e-07, + "loss": 0.1582, + "step": 572 + }, + { + "epoch": 0.04, + "grad_norm": 0.3232007620220984, + "learning_rate": 9.998890402152067e-07, + "loss": 0.2699, + "step": 573 + }, + { + "epoch": 0.04, + "grad_norm": 0.2747885272567359, + "learning_rate": 9.998868539505065e-07, + "loss": 0.061, + "step": 574 + }, + { + "epoch": 0.04, + "grad_norm": 0.3028579312727913, + "learning_rate": 9.99884646359628e-07, + "loss": 0.4315, + "step": 575 + }, + { + "epoch": 0.04, + "grad_norm": 0.26332033672438776, + "learning_rate": 9.998824174426656e-07, + "loss": 0.0787, + "step": 576 + }, + { + "epoch": 0.04, + "grad_norm": 0.35321268776480974, + "learning_rate": 9.998801671997146e-07, + "loss": 0.3332, + "step": 577 + }, + { + "epoch": 0.04, + "grad_norm": 0.24897210129496727, + "learning_rate": 9.998778956308707e-07, + "loss": 0.0626, + "step": 578 + }, + { + "epoch": 0.04, + "grad_norm": 0.47698351985136805, + "learning_rate": 9.998756027362308e-07, + "loss": 0.3223, + "step": 579 + }, + { + "epoch": 0.04, + "grad_norm": 0.19134790383960557, + "learning_rate": 9.998732885158927e-07, + "loss": 0.2695, + "step": 580 + }, + { + "epoch": 0.04, + "grad_norm": 0.4879603420778916, + "learning_rate": 9.998709529699555e-07, + "loss": 0.1466, + "step": 581 + }, + { + "epoch": 0.04, + "grad_norm": 0.3128280251335007, + "learning_rate": 9.998685960985186e-07, + "loss": 0.4176, + "step": 582 + }, + { + "epoch": 0.04, + "grad_norm": 0.1838525897173844, + "learning_rate": 9.998662179016821e-07, + "loss": 0.1679, + "step": 583 + }, + { + "epoch": 0.04, + "grad_norm": 0.19676301033745802, + "learning_rate": 9.998638183795483e-07, + "loss": 0.2997, + "step": 584 + }, + { + "epoch": 0.04, + "grad_norm": 0.1403235253701055, + "learning_rate": 9.99861397532219e-07, + "loss": 0.1538, + "step": 585 + }, + { + "epoch": 0.04, + "grad_norm": 0.2713200681540278, + "learning_rate": 9.998589553597976e-07, + "loss": 0.2481, + "step": 586 + }, + { + "epoch": 0.04, + "grad_norm": 0.31210448436474636, + "learning_rate": 9.998564918623884e-07, + "loss": 0.3303, + "step": 587 + }, + { + "epoch": 0.04, + "grad_norm": 0.19441767560085377, + "learning_rate": 9.998540070400965e-07, + "loss": 0.1644, + "step": 588 + }, + { + "epoch": 0.04, + "grad_norm": 0.1307191376988514, + "learning_rate": 9.998515008930277e-07, + "loss": 0.202, + "step": 589 + }, + { + "epoch": 0.04, + "grad_norm": 0.7254158512474338, + "learning_rate": 9.99848973421289e-07, + "loss": 0.3441, + "step": 590 + }, + { + "epoch": 0.04, + "grad_norm": 0.27264410203686634, + "learning_rate": 9.998464246249883e-07, + "loss": 0.2037, + "step": 591 + }, + { + "epoch": 0.04, + "grad_norm": 0.2911967943345156, + "learning_rate": 9.998438545042345e-07, + "loss": 0.0963, + "step": 592 + }, + { + "epoch": 0.04, + "grad_norm": 0.327683277210842, + "learning_rate": 9.998412630591369e-07, + "loss": 0.1515, + "step": 593 + }, + { + "epoch": 0.04, + "grad_norm": 0.2219327938303094, + "learning_rate": 9.998386502898061e-07, + "loss": 0.1894, + "step": 594 + }, + { + "epoch": 0.04, + "grad_norm": 0.18622294921364274, + "learning_rate": 9.99836016196354e-07, + "loss": 0.1695, + "step": 595 + }, + { + "epoch": 0.04, + "grad_norm": 0.2925134602430663, + "learning_rate": 9.998333607788922e-07, + "loss": 0.1408, + "step": 596 + }, + { + "epoch": 0.04, + "grad_norm": 0.40499870275179556, + "learning_rate": 9.998306840375349e-07, + "loss": 0.1963, + "step": 597 + }, + { + "epoch": 0.04, + "grad_norm": 0.40439244999887497, + "learning_rate": 9.998279859723954e-07, + "loss": 0.2666, + "step": 598 + }, + { + "epoch": 0.04, + "grad_norm": 0.30494083909444997, + "learning_rate": 9.998252665835895e-07, + "loss": 0.1802, + "step": 599 + }, + { + "epoch": 0.04, + "grad_norm": 0.29650189969662283, + "learning_rate": 9.99822525871233e-07, + "loss": 0.1154, + "step": 600 + }, + { + "epoch": 0.04, + "grad_norm": 0.3300319194535553, + "learning_rate": 9.998197638354427e-07, + "loss": 0.0357, + "step": 601 + }, + { + "epoch": 0.04, + "grad_norm": 0.29748899925065037, + "learning_rate": 9.998169804763365e-07, + "loss": 0.3921, + "step": 602 + }, + { + "epoch": 0.04, + "grad_norm": 0.2842270442567387, + "learning_rate": 9.99814175794033e-07, + "loss": 0.1997, + "step": 603 + }, + { + "epoch": 0.04, + "grad_norm": 0.17694701192442455, + "learning_rate": 9.998113497886522e-07, + "loss": 0.0825, + "step": 604 + }, + { + "epoch": 0.04, + "grad_norm": 0.19775297135044023, + "learning_rate": 9.998085024603144e-07, + "loss": 0.2962, + "step": 605 + }, + { + "epoch": 0.04, + "grad_norm": 0.4098023299487314, + "learning_rate": 9.998056338091413e-07, + "loss": 0.1432, + "step": 606 + }, + { + "epoch": 0.04, + "grad_norm": 0.2230885301929253, + "learning_rate": 9.99802743835255e-07, + "loss": 0.3254, + "step": 607 + }, + { + "epoch": 0.04, + "grad_norm": 0.12637777237273629, + "learning_rate": 9.997998325387788e-07, + "loss": 0.0687, + "step": 608 + }, + { + "epoch": 0.04, + "grad_norm": 0.3196469626868912, + "learning_rate": 9.997968999198371e-07, + "loss": 0.0266, + "step": 609 + }, + { + "epoch": 0.04, + "grad_norm": 0.2310289775253604, + "learning_rate": 9.997939459785552e-07, + "loss": 0.2164, + "step": 610 + }, + { + "epoch": 0.04, + "grad_norm": 0.26611036284271894, + "learning_rate": 9.997909707150584e-07, + "loss": 0.3605, + "step": 611 + }, + { + "epoch": 0.04, + "grad_norm": 0.2614133341008609, + "learning_rate": 9.997879741294744e-07, + "loss": 0.0632, + "step": 612 + }, + { + "epoch": 0.04, + "grad_norm": 0.26078759864867007, + "learning_rate": 9.997849562219307e-07, + "loss": 0.1604, + "step": 613 + }, + { + "epoch": 0.04, + "grad_norm": 0.44154590338271715, + "learning_rate": 9.99781916992556e-07, + "loss": 0.2396, + "step": 614 + }, + { + "epoch": 0.04, + "grad_norm": 0.14067566349075272, + "learning_rate": 9.9977885644148e-07, + "loss": 0.2071, + "step": 615 + }, + { + "epoch": 0.04, + "grad_norm": 0.3575616851456624, + "learning_rate": 9.997757745688334e-07, + "loss": 0.3583, + "step": 616 + }, + { + "epoch": 0.04, + "grad_norm": 0.33687583636678, + "learning_rate": 9.997726713747475e-07, + "loss": 0.1614, + "step": 617 + }, + { + "epoch": 0.04, + "grad_norm": 0.1287137337645716, + "learning_rate": 9.997695468593547e-07, + "loss": 0.0285, + "step": 618 + }, + { + "epoch": 0.04, + "grad_norm": 0.22629252964500782, + "learning_rate": 9.997664010227885e-07, + "loss": 0.1418, + "step": 619 + }, + { + "epoch": 0.04, + "grad_norm": 0.2244281138166755, + "learning_rate": 9.997632338651828e-07, + "loss": 0.2675, + "step": 620 + }, + { + "epoch": 0.04, + "grad_norm": 0.4123947347059294, + "learning_rate": 9.997600453866732e-07, + "loss": 0.3591, + "step": 621 + }, + { + "epoch": 0.04, + "grad_norm": 0.11262626115015796, + "learning_rate": 9.997568355873953e-07, + "loss": 0.1854, + "step": 622 + }, + { + "epoch": 0.04, + "grad_norm": 0.19934916889725812, + "learning_rate": 9.997536044674862e-07, + "loss": 0.0942, + "step": 623 + }, + { + "epoch": 0.04, + "grad_norm": 0.4240857699247332, + "learning_rate": 9.997503520270835e-07, + "loss": 0.1083, + "step": 624 + }, + { + "epoch": 0.04, + "grad_norm": 0.2192476132653266, + "learning_rate": 9.997470782663262e-07, + "loss": 0.2008, + "step": 625 + }, + { + "epoch": 0.04, + "grad_norm": 0.47564784426711987, + "learning_rate": 9.99743783185354e-07, + "loss": 0.1532, + "step": 626 + }, + { + "epoch": 0.04, + "grad_norm": 0.33507348750716026, + "learning_rate": 9.997404667843074e-07, + "loss": 0.0814, + "step": 627 + }, + { + "epoch": 0.04, + "grad_norm": 0.20428464002313707, + "learning_rate": 9.997371290633278e-07, + "loss": 0.3139, + "step": 628 + }, + { + "epoch": 0.04, + "grad_norm": 0.3458666228430019, + "learning_rate": 9.997337700225578e-07, + "loss": 0.1347, + "step": 629 + }, + { + "epoch": 0.04, + "grad_norm": 0.1155058198126477, + "learning_rate": 9.997303896621404e-07, + "loss": 0.1214, + "step": 630 + }, + { + "epoch": 0.04, + "grad_norm": 0.2868224800278387, + "learning_rate": 9.997269879822204e-07, + "loss": 0.1442, + "step": 631 + }, + { + "epoch": 0.04, + "grad_norm": 0.21098114298120535, + "learning_rate": 9.99723564982942e-07, + "loss": 0.4881, + "step": 632 + }, + { + "epoch": 0.04, + "grad_norm": 0.28539541990386547, + "learning_rate": 9.99720120664452e-07, + "loss": 0.212, + "step": 633 + }, + { + "epoch": 0.04, + "grad_norm": 0.15762416827649048, + "learning_rate": 9.997166550268972e-07, + "loss": 0.0298, + "step": 634 + }, + { + "epoch": 0.04, + "grad_norm": 0.17249326656550196, + "learning_rate": 9.997131680704251e-07, + "loss": 0.1836, + "step": 635 + }, + { + "epoch": 0.04, + "grad_norm": 0.26370313587358046, + "learning_rate": 9.99709659795185e-07, + "loss": 0.3679, + "step": 636 + }, + { + "epoch": 0.04, + "grad_norm": 0.23461575570233473, + "learning_rate": 9.99706130201326e-07, + "loss": 0.2692, + "step": 637 + }, + { + "epoch": 0.04, + "grad_norm": 0.2979758421620129, + "learning_rate": 9.99702579288999e-07, + "loss": 0.2391, + "step": 638 + }, + { + "epoch": 0.04, + "grad_norm": 0.1770101807416079, + "learning_rate": 9.996990070583555e-07, + "loss": 0.2021, + "step": 639 + }, + { + "epoch": 0.04, + "grad_norm": 0.3250207900693622, + "learning_rate": 9.996954135095478e-07, + "loss": 0.1972, + "step": 640 + }, + { + "epoch": 0.04, + "grad_norm": 0.29805102006964246, + "learning_rate": 9.996917986427293e-07, + "loss": 0.1832, + "step": 641 + }, + { + "epoch": 0.04, + "grad_norm": 0.34048618804477215, + "learning_rate": 9.99688162458054e-07, + "loss": 0.0834, + "step": 642 + }, + { + "epoch": 0.04, + "grad_norm": 0.276125773326266, + "learning_rate": 9.996845049556775e-07, + "loss": 0.0967, + "step": 643 + }, + { + "epoch": 0.04, + "grad_norm": 0.3643296711012014, + "learning_rate": 9.996808261357553e-07, + "loss": 0.4839, + "step": 644 + }, + { + "epoch": 0.04, + "grad_norm": 0.3594869795982038, + "learning_rate": 9.996771259984446e-07, + "loss": 0.2085, + "step": 645 + }, + { + "epoch": 0.04, + "grad_norm": 0.25612813685311064, + "learning_rate": 9.996734045439032e-07, + "loss": 0.2249, + "step": 646 + }, + { + "epoch": 0.04, + "grad_norm": 0.5294491242173487, + "learning_rate": 9.9966966177229e-07, + "loss": 0.2232, + "step": 647 + }, + { + "epoch": 0.04, + "grad_norm": 0.223394109480478, + "learning_rate": 9.996658976837644e-07, + "loss": 0.2066, + "step": 648 + }, + { + "epoch": 0.04, + "grad_norm": 0.0964490455397499, + "learning_rate": 9.996621122784872e-07, + "loss": 0.0176, + "step": 649 + }, + { + "epoch": 0.04, + "grad_norm": 0.9916404275370678, + "learning_rate": 9.996583055566199e-07, + "loss": 0.2847, + "step": 650 + }, + { + "epoch": 0.04, + "grad_norm": 0.4085104775813451, + "learning_rate": 9.99654477518325e-07, + "loss": 0.1965, + "step": 651 + }, + { + "epoch": 0.04, + "grad_norm": 0.26930592600432185, + "learning_rate": 9.996506281637653e-07, + "loss": 0.1166, + "step": 652 + }, + { + "epoch": 0.04, + "grad_norm": 0.20570651628474856, + "learning_rate": 9.996467574931058e-07, + "loss": 0.2854, + "step": 653 + }, + { + "epoch": 0.04, + "grad_norm": 0.23019324170520708, + "learning_rate": 9.996428655065108e-07, + "loss": 0.238, + "step": 654 + }, + { + "epoch": 0.04, + "grad_norm": 0.28456364609373247, + "learning_rate": 9.99638952204147e-07, + "loss": 0.3202, + "step": 655 + }, + { + "epoch": 0.04, + "grad_norm": 0.33711673215738003, + "learning_rate": 9.996350175861809e-07, + "loss": 0.2112, + "step": 656 + }, + { + "epoch": 0.04, + "grad_norm": 0.2628481495394333, + "learning_rate": 9.996310616527808e-07, + "loss": 0.097, + "step": 657 + }, + { + "epoch": 0.04, + "grad_norm": 0.14330502365215603, + "learning_rate": 9.99627084404115e-07, + "loss": 0.1624, + "step": 658 + }, + { + "epoch": 0.04, + "grad_norm": 0.14064107397795123, + "learning_rate": 9.996230858403536e-07, + "loss": 0.0913, + "step": 659 + }, + { + "epoch": 0.04, + "grad_norm": 0.3607971179922472, + "learning_rate": 9.99619065961667e-07, + "loss": 0.0854, + "step": 660 + }, + { + "epoch": 0.04, + "grad_norm": 0.23005910265204066, + "learning_rate": 9.996150247682265e-07, + "loss": 0.2756, + "step": 661 + }, + { + "epoch": 0.04, + "grad_norm": 0.24785734359247824, + "learning_rate": 9.996109622602048e-07, + "loss": 0.1309, + "step": 662 + }, + { + "epoch": 0.04, + "grad_norm": 0.4946815559380326, + "learning_rate": 9.99606878437775e-07, + "loss": 0.3128, + "step": 663 + }, + { + "epoch": 0.04, + "grad_norm": 0.18161354687738152, + "learning_rate": 9.996027733011117e-07, + "loss": 0.1091, + "step": 664 + }, + { + "epoch": 0.04, + "grad_norm": 0.40597640467228835, + "learning_rate": 9.995986468503894e-07, + "loss": 0.3422, + "step": 665 + }, + { + "epoch": 0.04, + "grad_norm": 0.10124882585850081, + "learning_rate": 9.995944990857847e-07, + "loss": 0.1157, + "step": 666 + }, + { + "epoch": 0.04, + "grad_norm": 0.31196022827667125, + "learning_rate": 9.995903300074744e-07, + "loss": 0.3875, + "step": 667 + }, + { + "epoch": 0.04, + "grad_norm": 0.31771538807403404, + "learning_rate": 9.995861396156362e-07, + "loss": 0.21, + "step": 668 + }, + { + "epoch": 0.04, + "grad_norm": 0.3493822437991453, + "learning_rate": 9.995819279104491e-07, + "loss": 0.0353, + "step": 669 + }, + { + "epoch": 0.04, + "grad_norm": 0.4421172995145604, + "learning_rate": 9.995776948920927e-07, + "loss": 0.1876, + "step": 670 + }, + { + "epoch": 0.04, + "grad_norm": 0.19443582410038723, + "learning_rate": 9.995734405607474e-07, + "loss": 0.0966, + "step": 671 + }, + { + "epoch": 0.04, + "grad_norm": 0.3275481656952158, + "learning_rate": 9.99569164916595e-07, + "loss": 0.4793, + "step": 672 + }, + { + "epoch": 0.04, + "grad_norm": 0.22924677843911015, + "learning_rate": 9.995648679598176e-07, + "loss": 0.2181, + "step": 673 + }, + { + "epoch": 0.04, + "grad_norm": 0.3008393077777119, + "learning_rate": 9.99560549690599e-07, + "loss": 0.4154, + "step": 674 + }, + { + "epoch": 0.04, + "grad_norm": 0.5001730906408609, + "learning_rate": 9.99556210109123e-07, + "loss": 0.0971, + "step": 675 + }, + { + "epoch": 0.04, + "grad_norm": 0.25220347513813307, + "learning_rate": 9.995518492155746e-07, + "loss": 0.1822, + "step": 676 + }, + { + "epoch": 0.04, + "grad_norm": 0.21326006771705913, + "learning_rate": 9.9954746701014e-07, + "loss": 0.0362, + "step": 677 + }, + { + "epoch": 0.04, + "grad_norm": 0.12982483839799883, + "learning_rate": 9.995430634930066e-07, + "loss": 0.1691, + "step": 678 + }, + { + "epoch": 0.04, + "grad_norm": 0.33526482948055236, + "learning_rate": 9.99538638664362e-07, + "loss": 0.5494, + "step": 679 + }, + { + "epoch": 0.04, + "grad_norm": 0.20965784744111918, + "learning_rate": 9.995341925243944e-07, + "loss": 0.1009, + "step": 680 + }, + { + "epoch": 0.04, + "grad_norm": 0.4229254242252686, + "learning_rate": 9.99529725073294e-07, + "loss": 0.1194, + "step": 681 + }, + { + "epoch": 0.04, + "grad_norm": 0.08776084595143485, + "learning_rate": 9.995252363112518e-07, + "loss": 0.065, + "step": 682 + }, + { + "epoch": 0.04, + "grad_norm": 0.16284348597463982, + "learning_rate": 9.995207262384585e-07, + "loss": 0.0518, + "step": 683 + }, + { + "epoch": 0.04, + "grad_norm": 0.20175325495331153, + "learning_rate": 9.995161948551069e-07, + "loss": 0.0596, + "step": 684 + }, + { + "epoch": 0.04, + "grad_norm": 0.41025397147531095, + "learning_rate": 9.995116421613905e-07, + "loss": 0.2243, + "step": 685 + }, + { + "epoch": 0.04, + "grad_norm": 0.27349920467101396, + "learning_rate": 9.995070681575032e-07, + "loss": 0.3185, + "step": 686 + }, + { + "epoch": 0.04, + "grad_norm": 0.39644977768940787, + "learning_rate": 9.995024728436401e-07, + "loss": 0.2763, + "step": 687 + }, + { + "epoch": 0.04, + "grad_norm": 0.1399900794362018, + "learning_rate": 9.994978562199973e-07, + "loss": 0.0156, + "step": 688 + }, + { + "epoch": 0.04, + "grad_norm": 0.3472303888369003, + "learning_rate": 9.99493218286772e-07, + "loss": 0.182, + "step": 689 + }, + { + "epoch": 0.04, + "grad_norm": 0.2636485704732172, + "learning_rate": 9.99488559044162e-07, + "loss": 0.159, + "step": 690 + }, + { + "epoch": 0.04, + "grad_norm": 0.44025629657577875, + "learning_rate": 9.994838784923657e-07, + "loss": 0.13, + "step": 691 + }, + { + "epoch": 0.04, + "grad_norm": 0.5309811169145624, + "learning_rate": 9.994791766315833e-07, + "loss": 0.2435, + "step": 692 + }, + { + "epoch": 0.04, + "grad_norm": 0.3815430042476451, + "learning_rate": 9.99474453462015e-07, + "loss": 0.0291, + "step": 693 + }, + { + "epoch": 0.04, + "grad_norm": 0.6649567885730471, + "learning_rate": 9.994697089838626e-07, + "loss": 0.068, + "step": 694 + }, + { + "epoch": 0.04, + "grad_norm": 0.3355111021093103, + "learning_rate": 9.994649431973283e-07, + "loss": 0.0949, + "step": 695 + }, + { + "epoch": 0.04, + "grad_norm": 0.22658701080249202, + "learning_rate": 9.994601561026153e-07, + "loss": 0.0596, + "step": 696 + }, + { + "epoch": 0.04, + "grad_norm": 0.23806279247513487, + "learning_rate": 9.994553476999281e-07, + "loss": 0.2143, + "step": 697 + }, + { + "epoch": 0.04, + "grad_norm": 0.3346821326673875, + "learning_rate": 9.994505179894718e-07, + "loss": 0.2204, + "step": 698 + }, + { + "epoch": 0.04, + "grad_norm": 0.3255836963757066, + "learning_rate": 9.994456669714523e-07, + "loss": 0.117, + "step": 699 + }, + { + "epoch": 0.04, + "grad_norm": 0.17276392954244343, + "learning_rate": 9.994407946460768e-07, + "loss": 0.0316, + "step": 700 + }, + { + "epoch": 0.04, + "grad_norm": 0.40489743890177804, + "learning_rate": 9.99435901013553e-07, + "loss": 0.1418, + "step": 701 + }, + { + "epoch": 0.04, + "grad_norm": 0.5784755364483929, + "learning_rate": 9.994309860740894e-07, + "loss": 0.2431, + "step": 702 + }, + { + "epoch": 0.04, + "grad_norm": 0.41512771975633733, + "learning_rate": 9.994260498278962e-07, + "loss": 0.2355, + "step": 703 + }, + { + "epoch": 0.04, + "grad_norm": 0.3187904283921282, + "learning_rate": 9.994210922751836e-07, + "loss": 0.3454, + "step": 704 + }, + { + "epoch": 0.04, + "grad_norm": 0.10930687596375602, + "learning_rate": 9.994161134161632e-07, + "loss": 0.0869, + "step": 705 + }, + { + "epoch": 0.05, + "grad_norm": 0.21349147939129332, + "learning_rate": 9.994111132510477e-07, + "loss": 0.0998, + "step": 706 + }, + { + "epoch": 0.05, + "grad_norm": 0.29174491502664607, + "learning_rate": 9.994060917800499e-07, + "loss": 0.1836, + "step": 707 + }, + { + "epoch": 0.05, + "grad_norm": 0.267973293440664, + "learning_rate": 9.994010490033843e-07, + "loss": 0.0963, + "step": 708 + }, + { + "epoch": 0.05, + "grad_norm": 0.2959997574431314, + "learning_rate": 9.993959849212662e-07, + "loss": 0.1727, + "step": 709 + }, + { + "epoch": 0.05, + "grad_norm": 0.37866755142864683, + "learning_rate": 9.993908995339114e-07, + "loss": 0.3663, + "step": 710 + }, + { + "epoch": 0.05, + "grad_norm": 0.2138710941633362, + "learning_rate": 9.993857928415368e-07, + "loss": 0.2025, + "step": 711 + }, + { + "epoch": 0.05, + "grad_norm": 0.3630054849873252, + "learning_rate": 9.993806648443606e-07, + "loss": 0.3173, + "step": 712 + }, + { + "epoch": 0.05, + "grad_norm": 0.21645649187262386, + "learning_rate": 9.993755155426014e-07, + "loss": 0.1201, + "step": 713 + }, + { + "epoch": 0.05, + "grad_norm": 0.4298086689210384, + "learning_rate": 9.993703449364785e-07, + "loss": 0.2854, + "step": 714 + }, + { + "epoch": 0.05, + "grad_norm": 0.6737470060141839, + "learning_rate": 9.99365153026213e-07, + "loss": 0.2807, + "step": 715 + }, + { + "epoch": 0.05, + "grad_norm": 0.46771804450657134, + "learning_rate": 9.993599398120263e-07, + "loss": 0.3458, + "step": 716 + }, + { + "epoch": 0.05, + "grad_norm": 0.13088635700932683, + "learning_rate": 9.993547052941407e-07, + "loss": 0.1335, + "step": 717 + }, + { + "epoch": 0.05, + "grad_norm": 0.38230493975198065, + "learning_rate": 9.993494494727795e-07, + "loss": 0.2078, + "step": 718 + }, + { + "epoch": 0.05, + "grad_norm": 0.5303441806830073, + "learning_rate": 9.99344172348167e-07, + "loss": 0.3254, + "step": 719 + }, + { + "epoch": 0.05, + "grad_norm": 0.41501870663594365, + "learning_rate": 9.993388739205283e-07, + "loss": 0.2984, + "step": 720 + }, + { + "epoch": 0.05, + "grad_norm": 0.4466052256901839, + "learning_rate": 9.993335541900893e-07, + "loss": 0.326, + "step": 721 + }, + { + "epoch": 0.05, + "grad_norm": 0.3664984275313783, + "learning_rate": 9.993282131570772e-07, + "loss": 0.408, + "step": 722 + }, + { + "epoch": 0.05, + "grad_norm": 0.42448567094972184, + "learning_rate": 9.9932285082172e-07, + "loss": 0.2234, + "step": 723 + }, + { + "epoch": 0.05, + "grad_norm": 0.17745007270794735, + "learning_rate": 9.99317467184246e-07, + "loss": 0.1807, + "step": 724 + }, + { + "epoch": 0.05, + "grad_norm": 0.44935936650217406, + "learning_rate": 9.993120622448849e-07, + "loss": 0.2376, + "step": 725 + }, + { + "epoch": 0.05, + "grad_norm": 0.6126905176000934, + "learning_rate": 9.993066360038678e-07, + "loss": 0.2615, + "step": 726 + }, + { + "epoch": 0.05, + "grad_norm": 0.7901474797504096, + "learning_rate": 9.993011884614256e-07, + "loss": 0.0211, + "step": 727 + }, + { + "epoch": 0.05, + "grad_norm": 0.26384046180651166, + "learning_rate": 9.99295719617791e-07, + "loss": 0.1528, + "step": 728 + }, + { + "epoch": 0.05, + "grad_norm": 0.1624837760476425, + "learning_rate": 9.992902294731974e-07, + "loss": 0.0162, + "step": 729 + }, + { + "epoch": 0.05, + "grad_norm": 0.3153028508242539, + "learning_rate": 9.992847180278792e-07, + "loss": 0.3626, + "step": 730 + }, + { + "epoch": 0.05, + "grad_norm": 0.3489224986364525, + "learning_rate": 9.992791852820708e-07, + "loss": 0.1076, + "step": 731 + }, + { + "epoch": 0.05, + "grad_norm": 0.38130284266583797, + "learning_rate": 9.992736312360089e-07, + "loss": 0.3346, + "step": 732 + }, + { + "epoch": 0.05, + "grad_norm": 0.8070432281761301, + "learning_rate": 9.992680558899303e-07, + "loss": 0.3271, + "step": 733 + }, + { + "epoch": 0.05, + "grad_norm": 0.43492179476209475, + "learning_rate": 9.992624592440725e-07, + "loss": 0.2262, + "step": 734 + }, + { + "epoch": 0.05, + "grad_norm": 0.15688682197524684, + "learning_rate": 9.992568412986748e-07, + "loss": 0.1294, + "step": 735 + }, + { + "epoch": 0.05, + "grad_norm": 0.32013111372934266, + "learning_rate": 9.992512020539765e-07, + "loss": 0.2633, + "step": 736 + }, + { + "epoch": 0.05, + "grad_norm": 0.3857147256820132, + "learning_rate": 9.992455415102182e-07, + "loss": 0.2661, + "step": 737 + }, + { + "epoch": 0.05, + "grad_norm": 0.4282079559143247, + "learning_rate": 9.992398596676417e-07, + "loss": 0.1998, + "step": 738 + }, + { + "epoch": 0.05, + "grad_norm": 0.08828878270340566, + "learning_rate": 9.99234156526489e-07, + "loss": 0.0104, + "step": 739 + }, + { + "epoch": 0.05, + "grad_norm": 0.3516698478793413, + "learning_rate": 9.992284320870037e-07, + "loss": 0.045, + "step": 740 + }, + { + "epoch": 0.05, + "grad_norm": 0.2377456378438165, + "learning_rate": 9.9922268634943e-07, + "loss": 0.141, + "step": 741 + }, + { + "epoch": 0.05, + "grad_norm": 0.3916903296743999, + "learning_rate": 9.992169193140127e-07, + "loss": 0.2449, + "step": 742 + }, + { + "epoch": 0.05, + "grad_norm": 0.2735201883748301, + "learning_rate": 9.992111309809982e-07, + "loss": 0.35, + "step": 743 + }, + { + "epoch": 0.05, + "grad_norm": 0.3854829590577924, + "learning_rate": 9.992053213506333e-07, + "loss": 0.1347, + "step": 744 + }, + { + "epoch": 0.05, + "grad_norm": 0.2756410028217851, + "learning_rate": 9.99199490423166e-07, + "loss": 0.0884, + "step": 745 + }, + { + "epoch": 0.05, + "grad_norm": 0.27719372195536185, + "learning_rate": 9.991936381988447e-07, + "loss": 0.1471, + "step": 746 + }, + { + "epoch": 0.05, + "grad_norm": 0.21292987230744306, + "learning_rate": 9.991877646779194e-07, + "loss": 0.2303, + "step": 747 + }, + { + "epoch": 0.05, + "grad_norm": 0.561405462595992, + "learning_rate": 9.991818698606404e-07, + "loss": 0.1685, + "step": 748 + }, + { + "epoch": 0.05, + "grad_norm": 0.26414580347221267, + "learning_rate": 9.991759537472597e-07, + "loss": 0.3096, + "step": 749 + }, + { + "epoch": 0.05, + "grad_norm": 0.25524290308172876, + "learning_rate": 9.99170016338029e-07, + "loss": 0.1581, + "step": 750 + }, + { + "epoch": 0.05, + "grad_norm": 1.148642376017468, + "learning_rate": 9.99164057633202e-07, + "loss": 0.4003, + "step": 751 + }, + { + "epoch": 0.05, + "grad_norm": 0.3809899101312142, + "learning_rate": 9.99158077633033e-07, + "loss": 0.3427, + "step": 752 + }, + { + "epoch": 0.05, + "grad_norm": 0.335677292084092, + "learning_rate": 9.99152076337777e-07, + "loss": 0.2826, + "step": 753 + }, + { + "epoch": 0.05, + "grad_norm": 0.3201739427658402, + "learning_rate": 9.9914605374769e-07, + "loss": 0.367, + "step": 754 + }, + { + "epoch": 0.05, + "grad_norm": 0.4314453049894622, + "learning_rate": 9.991400098630288e-07, + "loss": 0.3241, + "step": 755 + }, + { + "epoch": 0.05, + "grad_norm": 0.39924962518783, + "learning_rate": 9.991339446840515e-07, + "loss": 0.1677, + "step": 756 + }, + { + "epoch": 0.05, + "grad_norm": 0.23400593462214092, + "learning_rate": 9.991278582110166e-07, + "loss": 0.2681, + "step": 757 + }, + { + "epoch": 0.05, + "grad_norm": 0.8307402139466588, + "learning_rate": 9.99121750444184e-07, + "loss": 0.1325, + "step": 758 + }, + { + "epoch": 0.05, + "grad_norm": 0.43037593280946795, + "learning_rate": 9.99115621383814e-07, + "loss": 0.3028, + "step": 759 + }, + { + "epoch": 0.05, + "grad_norm": 0.2593830448653943, + "learning_rate": 9.991094710301686e-07, + "loss": 0.0571, + "step": 760 + }, + { + "epoch": 0.05, + "grad_norm": 0.32735630380268255, + "learning_rate": 9.991032993835095e-07, + "loss": 0.0852, + "step": 761 + }, + { + "epoch": 0.05, + "grad_norm": 0.2900643412585144, + "learning_rate": 9.990971064441004e-07, + "loss": 0.2157, + "step": 762 + }, + { + "epoch": 0.05, + "grad_norm": 0.29488411113941265, + "learning_rate": 9.990908922122056e-07, + "loss": 0.1244, + "step": 763 + }, + { + "epoch": 0.05, + "grad_norm": 1.0569396764700685, + "learning_rate": 9.990846566880899e-07, + "loss": 0.1896, + "step": 764 + }, + { + "epoch": 0.05, + "grad_norm": 0.17865347993534295, + "learning_rate": 9.990783998720193e-07, + "loss": 0.0746, + "step": 765 + }, + { + "epoch": 0.05, + "grad_norm": 0.5269334730810578, + "learning_rate": 9.990721217642612e-07, + "loss": 0.0305, + "step": 766 + }, + { + "epoch": 0.05, + "grad_norm": 0.3051566717293546, + "learning_rate": 9.99065822365083e-07, + "loss": 0.314, + "step": 767 + }, + { + "epoch": 0.05, + "grad_norm": 0.9345007572272831, + "learning_rate": 9.990595016747535e-07, + "loss": 0.5108, + "step": 768 + }, + { + "epoch": 0.05, + "grad_norm": 0.5998423994556221, + "learning_rate": 9.990531596935424e-07, + "loss": 0.3033, + "step": 769 + }, + { + "epoch": 0.05, + "grad_norm": 0.45181850336453955, + "learning_rate": 9.990467964217204e-07, + "loss": 0.2796, + "step": 770 + }, + { + "epoch": 0.05, + "grad_norm": 0.4850613814989678, + "learning_rate": 9.990404118595587e-07, + "loss": 0.1074, + "step": 771 + }, + { + "epoch": 0.05, + "grad_norm": 0.18496526534645386, + "learning_rate": 9.990340060073302e-07, + "loss": 0.1308, + "step": 772 + }, + { + "epoch": 0.05, + "grad_norm": 0.19599395432566782, + "learning_rate": 9.990275788653074e-07, + "loss": 0.1107, + "step": 773 + }, + { + "epoch": 0.05, + "grad_norm": 0.38743758561766645, + "learning_rate": 9.99021130433765e-07, + "loss": 0.281, + "step": 774 + }, + { + "epoch": 0.05, + "grad_norm": 0.29353828597777165, + "learning_rate": 9.99014660712978e-07, + "loss": 0.1374, + "step": 775 + }, + { + "epoch": 0.05, + "grad_norm": 0.38702552932139134, + "learning_rate": 9.990081697032226e-07, + "loss": 0.3723, + "step": 776 + }, + { + "epoch": 0.05, + "grad_norm": 0.5511249787455814, + "learning_rate": 9.990016574047755e-07, + "loss": 0.3179, + "step": 777 + }, + { + "epoch": 0.05, + "grad_norm": 0.19069921374797308, + "learning_rate": 9.989951238179146e-07, + "loss": 0.1863, + "step": 778 + }, + { + "epoch": 0.05, + "grad_norm": 0.5477637035431495, + "learning_rate": 9.989885689429187e-07, + "loss": 0.2018, + "step": 779 + }, + { + "epoch": 0.05, + "grad_norm": 0.23073750329632237, + "learning_rate": 9.989819927800671e-07, + "loss": 0.1418, + "step": 780 + }, + { + "epoch": 0.05, + "grad_norm": 0.030750283996359234, + "learning_rate": 9.989753953296408e-07, + "loss": 0.0027, + "step": 781 + }, + { + "epoch": 0.05, + "grad_norm": 0.1145215007398251, + "learning_rate": 9.98968776591921e-07, + "loss": 0.147, + "step": 782 + }, + { + "epoch": 0.05, + "grad_norm": 0.25940045290858127, + "learning_rate": 9.989621365671902e-07, + "loss": 0.0713, + "step": 783 + }, + { + "epoch": 0.05, + "grad_norm": 0.43694900746613674, + "learning_rate": 9.989554752557314e-07, + "loss": 0.1064, + "step": 784 + }, + { + "epoch": 0.05, + "grad_norm": 0.3971010506393241, + "learning_rate": 9.989487926578291e-07, + "loss": 0.1655, + "step": 785 + }, + { + "epoch": 0.05, + "grad_norm": 0.1909020753793466, + "learning_rate": 9.989420887737683e-07, + "loss": 0.2461, + "step": 786 + }, + { + "epoch": 0.05, + "grad_norm": 0.3099298618303888, + "learning_rate": 9.989353636038351e-07, + "loss": 0.2658, + "step": 787 + }, + { + "epoch": 0.05, + "grad_norm": 0.9535739179506793, + "learning_rate": 9.989286171483161e-07, + "loss": 0.271, + "step": 788 + }, + { + "epoch": 0.05, + "grad_norm": 0.36664103535127324, + "learning_rate": 9.989218494074995e-07, + "loss": 0.2692, + "step": 789 + }, + { + "epoch": 0.05, + "grad_norm": 0.36033331083904596, + "learning_rate": 9.989150603816738e-07, + "loss": 0.1979, + "step": 790 + }, + { + "epoch": 0.05, + "grad_norm": 1.0563505863364127, + "learning_rate": 9.989082500711287e-07, + "loss": 0.2973, + "step": 791 + }, + { + "epoch": 0.05, + "grad_norm": 0.35337153350301953, + "learning_rate": 9.989014184761546e-07, + "loss": 0.0985, + "step": 792 + }, + { + "epoch": 0.05, + "grad_norm": 0.41319436850610153, + "learning_rate": 9.988945655970434e-07, + "loss": 0.1578, + "step": 793 + }, + { + "epoch": 0.05, + "grad_norm": 0.428415309646363, + "learning_rate": 9.988876914340868e-07, + "loss": 0.3707, + "step": 794 + }, + { + "epoch": 0.05, + "grad_norm": 0.216637732229521, + "learning_rate": 9.988807959875786e-07, + "loss": 0.0188, + "step": 795 + }, + { + "epoch": 0.05, + "grad_norm": 0.4181888867376164, + "learning_rate": 9.988738792578126e-07, + "loss": 0.2444, + "step": 796 + }, + { + "epoch": 0.05, + "grad_norm": 0.2000092320601304, + "learning_rate": 9.98866941245084e-07, + "loss": 0.1637, + "step": 797 + }, + { + "epoch": 0.05, + "grad_norm": 0.41985990604917495, + "learning_rate": 9.98859981949689e-07, + "loss": 0.2563, + "step": 798 + }, + { + "epoch": 0.05, + "grad_norm": 0.3699535318356039, + "learning_rate": 9.988530013719243e-07, + "loss": 0.1085, + "step": 799 + }, + { + "epoch": 0.05, + "grad_norm": 0.26470185162460874, + "learning_rate": 9.988459995120877e-07, + "loss": 0.1144, + "step": 800 + }, + { + "epoch": 0.05, + "grad_norm": 0.40079513827952984, + "learning_rate": 9.988389763704778e-07, + "loss": 0.3005, + "step": 801 + }, + { + "epoch": 0.05, + "grad_norm": 0.7079814755560667, + "learning_rate": 9.988319319473947e-07, + "loss": 0.2295, + "step": 802 + }, + { + "epoch": 0.05, + "grad_norm": 0.47031904416782294, + "learning_rate": 9.988248662431385e-07, + "loss": 0.4155, + "step": 803 + }, + { + "epoch": 0.05, + "grad_norm": 0.35989836180944096, + "learning_rate": 9.988177792580106e-07, + "loss": 0.0439, + "step": 804 + }, + { + "epoch": 0.05, + "grad_norm": 0.29549006075515255, + "learning_rate": 9.988106709923137e-07, + "loss": 0.09, + "step": 805 + }, + { + "epoch": 0.05, + "grad_norm": 0.17598994399439483, + "learning_rate": 9.988035414463507e-07, + "loss": 0.0525, + "step": 806 + }, + { + "epoch": 0.05, + "grad_norm": 0.5363124292698066, + "learning_rate": 9.98796390620426e-07, + "loss": 0.3053, + "step": 807 + }, + { + "epoch": 0.05, + "grad_norm": 0.6041872608092101, + "learning_rate": 9.987892185148443e-07, + "loss": 0.3073, + "step": 808 + }, + { + "epoch": 0.05, + "grad_norm": 0.4189089818332621, + "learning_rate": 9.98782025129912e-07, + "loss": 0.3295, + "step": 809 + }, + { + "epoch": 0.05, + "grad_norm": 0.4753315947071798, + "learning_rate": 9.987748104659359e-07, + "loss": 0.2363, + "step": 810 + }, + { + "epoch": 0.05, + "grad_norm": 0.6384770671401014, + "learning_rate": 9.987675745232239e-07, + "loss": 0.0567, + "step": 811 + }, + { + "epoch": 0.05, + "grad_norm": 0.39078600515760487, + "learning_rate": 9.987603173020842e-07, + "loss": 0.1865, + "step": 812 + }, + { + "epoch": 0.05, + "grad_norm": 0.41160175106312996, + "learning_rate": 9.987530388028267e-07, + "loss": 0.2448, + "step": 813 + }, + { + "epoch": 0.05, + "grad_norm": 0.2468671563228391, + "learning_rate": 9.987457390257622e-07, + "loss": 0.1582, + "step": 814 + }, + { + "epoch": 0.05, + "grad_norm": 0.2780845450730514, + "learning_rate": 9.987384179712018e-07, + "loss": 0.1083, + "step": 815 + }, + { + "epoch": 0.05, + "grad_norm": 0.5634701131210977, + "learning_rate": 9.987310756394578e-07, + "loss": 0.2131, + "step": 816 + }, + { + "epoch": 0.05, + "grad_norm": 0.378584012441739, + "learning_rate": 9.987237120308435e-07, + "loss": 0.2024, + "step": 817 + }, + { + "epoch": 0.05, + "grad_norm": 0.24450289137278988, + "learning_rate": 9.987163271456733e-07, + "loss": 0.2023, + "step": 818 + }, + { + "epoch": 0.05, + "grad_norm": 0.5248509815392669, + "learning_rate": 9.987089209842618e-07, + "loss": 0.2578, + "step": 819 + }, + { + "epoch": 0.05, + "grad_norm": 0.3488439970835759, + "learning_rate": 9.987014935469254e-07, + "loss": 0.3219, + "step": 820 + }, + { + "epoch": 0.05, + "grad_norm": 0.4434052624292268, + "learning_rate": 9.986940448339807e-07, + "loss": 0.1275, + "step": 821 + }, + { + "epoch": 0.05, + "grad_norm": 0.1639172911274137, + "learning_rate": 9.986865748457455e-07, + "loss": 0.0444, + "step": 822 + }, + { + "epoch": 0.05, + "grad_norm": 0.3198579670432246, + "learning_rate": 9.986790835825385e-07, + "loss": 0.2233, + "step": 823 + }, + { + "epoch": 0.05, + "grad_norm": 0.6046038247945492, + "learning_rate": 9.986715710446795e-07, + "loss": 0.2182, + "step": 824 + }, + { + "epoch": 0.05, + "grad_norm": 0.5503062834527775, + "learning_rate": 9.986640372324887e-07, + "loss": 0.1813, + "step": 825 + }, + { + "epoch": 0.05, + "grad_norm": 0.3311415631719066, + "learning_rate": 9.986564821462876e-07, + "loss": 0.1052, + "step": 826 + }, + { + "epoch": 0.05, + "grad_norm": 0.3745468369706555, + "learning_rate": 9.986489057863986e-07, + "loss": 0.2697, + "step": 827 + }, + { + "epoch": 0.05, + "grad_norm": 0.4019031440647441, + "learning_rate": 9.986413081531448e-07, + "loss": 0.2565, + "step": 828 + }, + { + "epoch": 0.05, + "grad_norm": 0.48819622858764106, + "learning_rate": 9.986336892468506e-07, + "loss": 0.158, + "step": 829 + }, + { + "epoch": 0.05, + "grad_norm": 0.49255007594275985, + "learning_rate": 9.986260490678406e-07, + "loss": 0.2423, + "step": 830 + }, + { + "epoch": 0.05, + "grad_norm": 0.46966914868829196, + "learning_rate": 9.98618387616441e-07, + "loss": 0.3574, + "step": 831 + }, + { + "epoch": 0.05, + "grad_norm": 0.5241945042886459, + "learning_rate": 9.98610704892979e-07, + "loss": 0.1017, + "step": 832 + }, + { + "epoch": 0.05, + "grad_norm": 0.4420068530363283, + "learning_rate": 9.986030008977816e-07, + "loss": 0.1116, + "step": 833 + }, + { + "epoch": 0.05, + "grad_norm": 0.42633745870809486, + "learning_rate": 9.98595275631178e-07, + "loss": 0.1198, + "step": 834 + }, + { + "epoch": 0.05, + "grad_norm": 0.42117115114059506, + "learning_rate": 9.985875290934974e-07, + "loss": 0.2213, + "step": 835 + }, + { + "epoch": 0.05, + "grad_norm": 0.5886583825546735, + "learning_rate": 9.985797612850709e-07, + "loss": 0.2779, + "step": 836 + }, + { + "epoch": 0.05, + "grad_norm": 0.17939168561944074, + "learning_rate": 9.985719722062293e-07, + "loss": 0.0299, + "step": 837 + }, + { + "epoch": 0.05, + "grad_norm": 0.55192345774415, + "learning_rate": 9.98564161857305e-07, + "loss": 0.1841, + "step": 838 + }, + { + "epoch": 0.05, + "grad_norm": 0.9291312498876918, + "learning_rate": 9.985563302386317e-07, + "loss": 0.4546, + "step": 839 + }, + { + "epoch": 0.05, + "grad_norm": 0.6257171085637463, + "learning_rate": 9.985484773505427e-07, + "loss": 0.3799, + "step": 840 + }, + { + "epoch": 0.05, + "grad_norm": 0.6860559930564434, + "learning_rate": 9.985406031933737e-07, + "loss": 0.2958, + "step": 841 + }, + { + "epoch": 0.05, + "grad_norm": 0.6107884200040208, + "learning_rate": 9.985327077674604e-07, + "loss": 0.2403, + "step": 842 + }, + { + "epoch": 0.05, + "grad_norm": 0.5212449988275933, + "learning_rate": 9.985247910731395e-07, + "loss": 0.1581, + "step": 843 + }, + { + "epoch": 0.05, + "grad_norm": 0.5544588418030959, + "learning_rate": 9.985168531107488e-07, + "loss": 0.2562, + "step": 844 + }, + { + "epoch": 0.05, + "grad_norm": 0.9556498835020415, + "learning_rate": 9.985088938806271e-07, + "loss": 0.0674, + "step": 845 + }, + { + "epoch": 0.05, + "grad_norm": 0.06436617935042317, + "learning_rate": 9.98500913383114e-07, + "loss": 0.0047, + "step": 846 + }, + { + "epoch": 0.05, + "grad_norm": 0.6433903061356261, + "learning_rate": 9.984929116185497e-07, + "loss": 0.4003, + "step": 847 + }, + { + "epoch": 0.05, + "grad_norm": 0.18427332635185062, + "learning_rate": 9.984848885872756e-07, + "loss": 0.0985, + "step": 848 + }, + { + "epoch": 0.05, + "grad_norm": 0.326773548187247, + "learning_rate": 9.984768442896341e-07, + "loss": 0.1337, + "step": 849 + }, + { + "epoch": 0.05, + "grad_norm": 0.7734791586844639, + "learning_rate": 9.984687787259683e-07, + "loss": 0.2146, + "step": 850 + }, + { + "epoch": 0.05, + "grad_norm": 0.4573907101064241, + "learning_rate": 9.984606918966226e-07, + "loss": 0.2659, + "step": 851 + }, + { + "epoch": 0.05, + "grad_norm": 0.7185429956776199, + "learning_rate": 9.984525838019415e-07, + "loss": 0.3559, + "step": 852 + }, + { + "epoch": 0.05, + "grad_norm": 0.5786571028021502, + "learning_rate": 9.984444544422712e-07, + "loss": 0.2182, + "step": 853 + }, + { + "epoch": 0.05, + "grad_norm": 0.261193248528463, + "learning_rate": 9.984363038179587e-07, + "loss": 0.0158, + "step": 854 + }, + { + "epoch": 0.05, + "grad_norm": 0.1607690185548112, + "learning_rate": 9.984281319293513e-07, + "loss": 0.0183, + "step": 855 + }, + { + "epoch": 0.05, + "grad_norm": 0.43391240870556813, + "learning_rate": 9.984199387767978e-07, + "loss": 0.3064, + "step": 856 + }, + { + "epoch": 0.05, + "grad_norm": 0.3421269906808885, + "learning_rate": 9.984117243606478e-07, + "loss": 0.2376, + "step": 857 + }, + { + "epoch": 0.05, + "grad_norm": 0.3914337813170445, + "learning_rate": 9.984034886812519e-07, + "loss": 0.1398, + "step": 858 + }, + { + "epoch": 0.05, + "grad_norm": 0.22670431645526753, + "learning_rate": 9.983952317389609e-07, + "loss": 0.1208, + "step": 859 + }, + { + "epoch": 0.05, + "grad_norm": 0.551276633488919, + "learning_rate": 9.983869535341276e-07, + "loss": 0.2405, + "step": 860 + }, + { + "epoch": 0.05, + "grad_norm": 0.5254185456481869, + "learning_rate": 9.98378654067105e-07, + "loss": 0.1736, + "step": 861 + }, + { + "epoch": 0.05, + "grad_norm": 0.1252888813245433, + "learning_rate": 9.98370333338247e-07, + "loss": 0.085, + "step": 862 + }, + { + "epoch": 0.06, + "grad_norm": 0.24475631176376744, + "learning_rate": 9.98361991347909e-07, + "loss": 0.1545, + "step": 863 + }, + { + "epoch": 0.06, + "grad_norm": 0.8985437187589743, + "learning_rate": 9.983536280964463e-07, + "loss": 0.1804, + "step": 864 + }, + { + "epoch": 0.06, + "grad_norm": 0.5835744721955892, + "learning_rate": 9.983452435842161e-07, + "loss": 0.2285, + "step": 865 + }, + { + "epoch": 0.06, + "grad_norm": 0.4454473512156972, + "learning_rate": 9.98336837811576e-07, + "loss": 0.1476, + "step": 866 + }, + { + "epoch": 0.06, + "grad_norm": 0.5508829195580042, + "learning_rate": 9.983284107788849e-07, + "loss": 0.2139, + "step": 867 + }, + { + "epoch": 0.06, + "grad_norm": 0.46970387871543356, + "learning_rate": 9.983199624865019e-07, + "loss": 0.2151, + "step": 868 + }, + { + "epoch": 0.06, + "grad_norm": 0.3177141191160735, + "learning_rate": 9.983114929347875e-07, + "loss": 0.0975, + "step": 869 + }, + { + "epoch": 0.06, + "grad_norm": 0.7368498456487158, + "learning_rate": 9.98303002124103e-07, + "loss": 0.2882, + "step": 870 + }, + { + "epoch": 0.06, + "grad_norm": 0.4349333901871027, + "learning_rate": 9.982944900548106e-07, + "loss": 0.2377, + "step": 871 + }, + { + "epoch": 0.06, + "grad_norm": 0.14115678908192322, + "learning_rate": 9.982859567272738e-07, + "loss": 0.0091, + "step": 872 + }, + { + "epoch": 0.06, + "grad_norm": 0.37967986469455783, + "learning_rate": 9.982774021418564e-07, + "loss": 0.2847, + "step": 873 + }, + { + "epoch": 0.06, + "grad_norm": 0.10136774015629568, + "learning_rate": 9.982688262989235e-07, + "loss": 0.0098, + "step": 874 + }, + { + "epoch": 0.06, + "grad_norm": 0.3824427964589409, + "learning_rate": 9.982602291988404e-07, + "loss": 0.013, + "step": 875 + }, + { + "epoch": 0.06, + "grad_norm": 0.749030640530025, + "learning_rate": 9.982516108419745e-07, + "loss": 0.2335, + "step": 876 + }, + { + "epoch": 0.06, + "grad_norm": 0.5183944961616913, + "learning_rate": 9.982429712286933e-07, + "loss": 0.2373, + "step": 877 + }, + { + "epoch": 0.06, + "grad_norm": 0.24214020912983317, + "learning_rate": 9.982343103593654e-07, + "loss": 0.0894, + "step": 878 + }, + { + "epoch": 0.06, + "grad_norm": 0.4661896309772867, + "learning_rate": 9.982256282343603e-07, + "loss": 0.1716, + "step": 879 + }, + { + "epoch": 0.06, + "grad_norm": 0.416058283139024, + "learning_rate": 9.98216924854048e-07, + "loss": 0.2509, + "step": 880 + }, + { + "epoch": 0.06, + "grad_norm": 1.0653442194630949, + "learning_rate": 9.982082002188004e-07, + "loss": 0.1884, + "step": 881 + }, + { + "epoch": 0.06, + "grad_norm": 0.45427821871010937, + "learning_rate": 9.981994543289895e-07, + "loss": 0.0431, + "step": 882 + }, + { + "epoch": 0.06, + "grad_norm": 0.4367716379240054, + "learning_rate": 9.981906871849884e-07, + "loss": 0.1209, + "step": 883 + }, + { + "epoch": 0.06, + "grad_norm": 1.1250590942922611, + "learning_rate": 9.981818987871708e-07, + "loss": 0.2175, + "step": 884 + }, + { + "epoch": 0.06, + "grad_norm": 0.31067847406138993, + "learning_rate": 9.98173089135912e-07, + "loss": 0.019, + "step": 885 + }, + { + "epoch": 0.06, + "grad_norm": 0.7127744823338241, + "learning_rate": 9.98164258231588e-07, + "loss": 0.4501, + "step": 886 + }, + { + "epoch": 0.06, + "grad_norm": 0.5454463047150027, + "learning_rate": 9.981554060745754e-07, + "loss": 0.2822, + "step": 887 + }, + { + "epoch": 0.06, + "grad_norm": 0.7384467024144157, + "learning_rate": 9.981465326652515e-07, + "loss": 0.1541, + "step": 888 + }, + { + "epoch": 0.06, + "grad_norm": 0.2491223286847421, + "learning_rate": 9.98137638003995e-07, + "loss": 0.1961, + "step": 889 + }, + { + "epoch": 0.06, + "grad_norm": 0.3060126226204753, + "learning_rate": 9.981287220911857e-07, + "loss": 0.1382, + "step": 890 + }, + { + "epoch": 0.06, + "grad_norm": 0.3969689694949935, + "learning_rate": 9.981197849272038e-07, + "loss": 0.2625, + "step": 891 + }, + { + "epoch": 0.06, + "grad_norm": 0.27307865038414464, + "learning_rate": 9.981108265124303e-07, + "loss": 0.2458, + "step": 892 + }, + { + "epoch": 0.06, + "grad_norm": 0.6276549621281022, + "learning_rate": 9.981018468472478e-07, + "loss": 0.2624, + "step": 893 + }, + { + "epoch": 0.06, + "grad_norm": 0.3623102171189811, + "learning_rate": 9.98092845932039e-07, + "loss": 0.1402, + "step": 894 + }, + { + "epoch": 0.06, + "grad_norm": 0.6124999407880806, + "learning_rate": 9.980838237671883e-07, + "loss": 0.1638, + "step": 895 + }, + { + "epoch": 0.06, + "grad_norm": 0.7329529551589599, + "learning_rate": 9.980747803530804e-07, + "loss": 0.1128, + "step": 896 + }, + { + "epoch": 0.06, + "grad_norm": 0.22111671966192745, + "learning_rate": 9.98065715690101e-07, + "loss": 0.1134, + "step": 897 + }, + { + "epoch": 0.06, + "grad_norm": 0.5603531292210565, + "learning_rate": 9.98056629778637e-07, + "loss": 0.117, + "step": 898 + }, + { + "epoch": 0.06, + "grad_norm": 1.1477932072556478, + "learning_rate": 9.98047522619076e-07, + "loss": 0.2562, + "step": 899 + }, + { + "epoch": 0.06, + "grad_norm": 0.5238308145115796, + "learning_rate": 9.980383942118065e-07, + "loss": 0.1441, + "step": 900 + }, + { + "epoch": 0.06, + "grad_norm": 0.5232892018074623, + "learning_rate": 9.980292445572179e-07, + "loss": 0.2693, + "step": 901 + }, + { + "epoch": 0.06, + "grad_norm": 1.5262895713718663, + "learning_rate": 9.980200736557004e-07, + "loss": 0.3365, + "step": 902 + }, + { + "epoch": 0.06, + "grad_norm": 0.25404175535715173, + "learning_rate": 9.980108815076455e-07, + "loss": 0.0636, + "step": 903 + }, + { + "epoch": 0.06, + "grad_norm": 0.14217573604570202, + "learning_rate": 9.980016681134454e-07, + "loss": 0.0077, + "step": 904 + }, + { + "epoch": 0.06, + "grad_norm": 0.27577195936313037, + "learning_rate": 9.979924334734929e-07, + "loss": 0.1928, + "step": 905 + }, + { + "epoch": 0.06, + "grad_norm": 0.44262819970244743, + "learning_rate": 9.979831775881819e-07, + "loss": 0.1663, + "step": 906 + }, + { + "epoch": 0.06, + "grad_norm": 0.44267162341035937, + "learning_rate": 9.979739004579077e-07, + "loss": 0.1427, + "step": 907 + }, + { + "epoch": 0.06, + "grad_norm": 0.6146554850708117, + "learning_rate": 9.97964602083066e-07, + "loss": 0.1161, + "step": 908 + }, + { + "epoch": 0.06, + "grad_norm": 0.9272958776949122, + "learning_rate": 9.979552824640531e-07, + "loss": 0.1589, + "step": 909 + }, + { + "epoch": 0.06, + "grad_norm": 0.21303002979582747, + "learning_rate": 9.97945941601267e-07, + "loss": 0.0127, + "step": 910 + }, + { + "epoch": 0.06, + "grad_norm": 0.41194094422912997, + "learning_rate": 9.979365794951058e-07, + "loss": 0.1445, + "step": 911 + }, + { + "epoch": 0.06, + "grad_norm": 0.46353461993783285, + "learning_rate": 9.979271961459694e-07, + "loss": 0.1449, + "step": 912 + }, + { + "epoch": 0.06, + "grad_norm": 0.20054834219323922, + "learning_rate": 9.979177915542578e-07, + "loss": 0.0069, + "step": 913 + }, + { + "epoch": 0.06, + "grad_norm": 0.2708297641113638, + "learning_rate": 9.979083657203723e-07, + "loss": 0.1858, + "step": 914 + }, + { + "epoch": 0.06, + "grad_norm": 0.6109363257222682, + "learning_rate": 9.978989186447146e-07, + "loss": 0.1778, + "step": 915 + }, + { + "epoch": 0.06, + "grad_norm": 0.5028189692038627, + "learning_rate": 9.978894503276885e-07, + "loss": 0.1534, + "step": 916 + }, + { + "epoch": 0.06, + "grad_norm": 0.465761413142227, + "learning_rate": 9.978799607696975e-07, + "loss": 0.0239, + "step": 917 + }, + { + "epoch": 0.06, + "grad_norm": 0.379612151226625, + "learning_rate": 9.978704499711466e-07, + "loss": 0.296, + "step": 918 + }, + { + "epoch": 0.06, + "grad_norm": 0.36307380563278896, + "learning_rate": 9.978609179324414e-07, + "loss": 0.1662, + "step": 919 + }, + { + "epoch": 0.06, + "grad_norm": 0.5121961709273829, + "learning_rate": 9.978513646539886e-07, + "loss": 0.1688, + "step": 920 + }, + { + "epoch": 0.06, + "grad_norm": 0.5762208409026472, + "learning_rate": 9.978417901361957e-07, + "loss": 0.4547, + "step": 921 + }, + { + "epoch": 0.06, + "grad_norm": 0.7750293229894867, + "learning_rate": 9.978321943794715e-07, + "loss": 0.2777, + "step": 922 + }, + { + "epoch": 0.06, + "grad_norm": 0.1923763827464195, + "learning_rate": 9.978225773842248e-07, + "loss": 0.065, + "step": 923 + }, + { + "epoch": 0.06, + "grad_norm": 0.21859226925702638, + "learning_rate": 9.978129391508663e-07, + "loss": 0.1765, + "step": 924 + }, + { + "epoch": 0.06, + "grad_norm": 0.4779201707557506, + "learning_rate": 9.97803279679807e-07, + "loss": 0.4008, + "step": 925 + }, + { + "epoch": 0.06, + "grad_norm": 0.4703071639715682, + "learning_rate": 9.977935989714594e-07, + "loss": 0.1467, + "step": 926 + }, + { + "epoch": 0.06, + "grad_norm": 0.9150128592447756, + "learning_rate": 9.97783897026236e-07, + "loss": 0.4139, + "step": 927 + }, + { + "epoch": 0.06, + "grad_norm": 0.6190921533181195, + "learning_rate": 9.977741738445507e-07, + "loss": 0.3267, + "step": 928 + }, + { + "epoch": 0.06, + "grad_norm": 1.2143610086216308, + "learning_rate": 9.977644294268187e-07, + "loss": 0.1645, + "step": 929 + }, + { + "epoch": 0.06, + "grad_norm": 0.6172031300911652, + "learning_rate": 9.977546637734556e-07, + "loss": 0.1381, + "step": 930 + }, + { + "epoch": 0.06, + "grad_norm": 0.6070008665072504, + "learning_rate": 9.977448768848777e-07, + "loss": 0.426, + "step": 931 + }, + { + "epoch": 0.06, + "grad_norm": 0.5518807559900349, + "learning_rate": 9.977350687615027e-07, + "loss": 0.1151, + "step": 932 + }, + { + "epoch": 0.06, + "grad_norm": 0.5046020486044156, + "learning_rate": 9.977252394037492e-07, + "loss": 0.1508, + "step": 933 + }, + { + "epoch": 0.06, + "grad_norm": 0.21914492471035532, + "learning_rate": 9.977153888120366e-07, + "loss": 0.1708, + "step": 934 + }, + { + "epoch": 0.06, + "grad_norm": 0.6464788062297534, + "learning_rate": 9.977055169867848e-07, + "loss": 0.3474, + "step": 935 + }, + { + "epoch": 0.06, + "grad_norm": 0.33618861236827735, + "learning_rate": 9.976956239284151e-07, + "loss": 0.1884, + "step": 936 + }, + { + "epoch": 0.06, + "grad_norm": 0.6457467960715154, + "learning_rate": 9.976857096373499e-07, + "loss": 0.1886, + "step": 937 + }, + { + "epoch": 0.06, + "grad_norm": 0.6412063637630472, + "learning_rate": 9.976757741140115e-07, + "loss": 0.1398, + "step": 938 + }, + { + "epoch": 0.06, + "grad_norm": 0.4207704803199653, + "learning_rate": 9.976658173588243e-07, + "loss": 0.2348, + "step": 939 + }, + { + "epoch": 0.06, + "grad_norm": 0.46616906548217213, + "learning_rate": 9.976558393722127e-07, + "loss": 0.338, + "step": 940 + }, + { + "epoch": 0.06, + "grad_norm": 0.4083200801513483, + "learning_rate": 9.976458401546028e-07, + "loss": 0.1094, + "step": 941 + }, + { + "epoch": 0.06, + "grad_norm": 0.5406999137425461, + "learning_rate": 9.97635819706421e-07, + "loss": 0.1616, + "step": 942 + }, + { + "epoch": 0.06, + "grad_norm": 0.3111717063994098, + "learning_rate": 9.976257780280945e-07, + "loss": 0.2098, + "step": 943 + }, + { + "epoch": 0.06, + "grad_norm": 0.587578293857706, + "learning_rate": 9.976157151200522e-07, + "loss": 0.1851, + "step": 944 + }, + { + "epoch": 0.06, + "grad_norm": 0.3849442286716842, + "learning_rate": 9.97605630982723e-07, + "loss": 0.3162, + "step": 945 + }, + { + "epoch": 0.06, + "grad_norm": 0.40198988524598256, + "learning_rate": 9.975955256165374e-07, + "loss": 0.3014, + "step": 946 + }, + { + "epoch": 0.06, + "grad_norm": 0.28406903973403613, + "learning_rate": 9.975853990219264e-07, + "loss": 0.053, + "step": 947 + }, + { + "epoch": 0.06, + "grad_norm": 0.2876604500665912, + "learning_rate": 9.975752511993218e-07, + "loss": 0.1911, + "step": 948 + }, + { + "epoch": 0.06, + "grad_norm": 0.46743378594402885, + "learning_rate": 9.97565082149157e-07, + "loss": 0.2186, + "step": 949 + }, + { + "epoch": 0.06, + "grad_norm": 0.6255651517071519, + "learning_rate": 9.975548918718653e-07, + "loss": 0.2079, + "step": 950 + }, + { + "epoch": 0.06, + "grad_norm": 0.5948784357183413, + "learning_rate": 9.975446803678817e-07, + "loss": 0.23, + "step": 951 + }, + { + "epoch": 0.06, + "grad_norm": 0.7860708313336866, + "learning_rate": 9.97534447637642e-07, + "loss": 0.1739, + "step": 952 + }, + { + "epoch": 0.06, + "grad_norm": 0.6335474927518889, + "learning_rate": 9.975241936815826e-07, + "loss": 0.3164, + "step": 953 + }, + { + "epoch": 0.06, + "grad_norm": 0.34379862224453245, + "learning_rate": 9.97513918500141e-07, + "loss": 0.0258, + "step": 954 + }, + { + "epoch": 0.06, + "grad_norm": 0.7167515453188937, + "learning_rate": 9.975036220937553e-07, + "loss": 0.2723, + "step": 955 + }, + { + "epoch": 0.06, + "grad_norm": 0.6856200757666084, + "learning_rate": 9.974933044628652e-07, + "loss": 0.1118, + "step": 956 + }, + { + "epoch": 0.06, + "grad_norm": 0.21543958132484572, + "learning_rate": 9.974829656079104e-07, + "loss": 0.1057, + "step": 957 + }, + { + "epoch": 0.06, + "grad_norm": 1.1921174412206408, + "learning_rate": 9.974726055293325e-07, + "loss": 0.1333, + "step": 958 + }, + { + "epoch": 0.06, + "grad_norm": 0.33487542565410683, + "learning_rate": 9.97462224227573e-07, + "loss": 0.0938, + "step": 959 + }, + { + "epoch": 0.06, + "grad_norm": 0.6336804368411803, + "learning_rate": 9.97451821703075e-07, + "loss": 0.1854, + "step": 960 + }, + { + "epoch": 0.06, + "grad_norm": 0.5751970727441501, + "learning_rate": 9.974413979562823e-07, + "loss": 0.2631, + "step": 961 + }, + { + "epoch": 0.06, + "grad_norm": 0.6658528447001772, + "learning_rate": 9.974309529876396e-07, + "loss": 0.1177, + "step": 962 + }, + { + "epoch": 0.06, + "grad_norm": 0.7153463446728332, + "learning_rate": 9.974204867975926e-07, + "loss": 0.2134, + "step": 963 + }, + { + "epoch": 0.06, + "grad_norm": 0.556809473274764, + "learning_rate": 9.974099993865877e-07, + "loss": 0.0977, + "step": 964 + }, + { + "epoch": 0.06, + "grad_norm": 0.4850417655109695, + "learning_rate": 9.97399490755072e-07, + "loss": 0.3, + "step": 965 + }, + { + "epoch": 0.06, + "grad_norm": 0.393049910278882, + "learning_rate": 9.973889609034944e-07, + "loss": 0.1262, + "step": 966 + }, + { + "epoch": 0.06, + "grad_norm": 0.21258533130898213, + "learning_rate": 9.973784098323038e-07, + "loss": 0.0703, + "step": 967 + }, + { + "epoch": 0.06, + "grad_norm": 0.7971081695534867, + "learning_rate": 9.973678375419504e-07, + "loss": 0.1148, + "step": 968 + }, + { + "epoch": 0.06, + "grad_norm": 0.6055105582764296, + "learning_rate": 9.97357244032885e-07, + "loss": 0.1464, + "step": 969 + }, + { + "epoch": 0.06, + "grad_norm": 0.98527191471172, + "learning_rate": 9.973466293055602e-07, + "loss": 0.1548, + "step": 970 + }, + { + "epoch": 0.06, + "grad_norm": 0.8005624121428562, + "learning_rate": 9.97335993360428e-07, + "loss": 0.2071, + "step": 971 + }, + { + "epoch": 0.06, + "grad_norm": 0.7641223533278209, + "learning_rate": 9.973253361979427e-07, + "loss": 0.2064, + "step": 972 + }, + { + "epoch": 0.06, + "grad_norm": 0.5262702698149297, + "learning_rate": 9.973146578185588e-07, + "loss": 0.3942, + "step": 973 + }, + { + "epoch": 0.06, + "grad_norm": 0.44707227252640414, + "learning_rate": 9.973039582227319e-07, + "loss": 0.5105, + "step": 974 + }, + { + "epoch": 0.06, + "grad_norm": 1.459651156249005, + "learning_rate": 9.972932374109182e-07, + "loss": 0.3136, + "step": 975 + }, + { + "epoch": 0.06, + "grad_norm": 0.8895166874354026, + "learning_rate": 9.972824953835756e-07, + "loss": 0.2322, + "step": 976 + }, + { + "epoch": 0.06, + "grad_norm": 0.29423753592284146, + "learning_rate": 9.97271732141162e-07, + "loss": 0.1969, + "step": 977 + }, + { + "epoch": 0.06, + "grad_norm": 0.8961796479193413, + "learning_rate": 9.972609476841365e-07, + "loss": 0.0635, + "step": 978 + }, + { + "epoch": 0.06, + "grad_norm": 0.27029945533215904, + "learning_rate": 9.972501420129595e-07, + "loss": 0.0962, + "step": 979 + }, + { + "epoch": 0.06, + "grad_norm": 0.4174636997470639, + "learning_rate": 9.97239315128092e-07, + "loss": 0.1965, + "step": 980 + }, + { + "epoch": 0.06, + "grad_norm": 1.5576671873511183, + "learning_rate": 9.972284670299955e-07, + "loss": 0.1702, + "step": 981 + }, + { + "epoch": 0.06, + "grad_norm": 0.8479045439406689, + "learning_rate": 9.972175977191332e-07, + "loss": 0.4184, + "step": 982 + }, + { + "epoch": 0.06, + "grad_norm": 1.4090059840814684, + "learning_rate": 9.972067071959685e-07, + "loss": 0.5159, + "step": 983 + }, + { + "epoch": 0.06, + "grad_norm": 0.7488220660318784, + "learning_rate": 9.971957954609662e-07, + "loss": 0.2249, + "step": 984 + }, + { + "epoch": 0.06, + "grad_norm": 1.0557110648302839, + "learning_rate": 9.971848625145919e-07, + "loss": 0.1822, + "step": 985 + }, + { + "epoch": 0.06, + "grad_norm": 0.6860523194772445, + "learning_rate": 9.97173908357312e-07, + "loss": 0.2638, + "step": 986 + }, + { + "epoch": 0.06, + "grad_norm": 0.5146231791989171, + "learning_rate": 9.971629329895934e-07, + "loss": 0.2392, + "step": 987 + }, + { + "epoch": 0.06, + "grad_norm": 0.6204843920733871, + "learning_rate": 9.971519364119048e-07, + "loss": 0.1253, + "step": 988 + }, + { + "epoch": 0.06, + "grad_norm": 0.6831185038719864, + "learning_rate": 9.971409186247151e-07, + "loss": 0.1564, + "step": 989 + }, + { + "epoch": 0.06, + "grad_norm": 0.5145580830055223, + "learning_rate": 9.971298796284947e-07, + "loss": 0.1791, + "step": 990 + }, + { + "epoch": 0.06, + "grad_norm": 0.3080111600263855, + "learning_rate": 9.97118819423714e-07, + "loss": 0.1089, + "step": 991 + }, + { + "epoch": 0.06, + "grad_norm": 1.026427565716047, + "learning_rate": 9.97107738010845e-07, + "loss": 0.1174, + "step": 992 + }, + { + "epoch": 0.06, + "grad_norm": 0.556784534737722, + "learning_rate": 9.97096635390361e-07, + "loss": 0.0677, + "step": 993 + }, + { + "epoch": 0.06, + "grad_norm": 0.3100620984644595, + "learning_rate": 9.97085511562735e-07, + "loss": 0.1863, + "step": 994 + }, + { + "epoch": 0.06, + "grad_norm": 0.6374941098365919, + "learning_rate": 9.970743665284418e-07, + "loss": 0.0677, + "step": 995 + }, + { + "epoch": 0.06, + "grad_norm": 0.4408399328420816, + "learning_rate": 9.97063200287957e-07, + "loss": 0.0862, + "step": 996 + }, + { + "epoch": 0.06, + "grad_norm": 0.4235597994350044, + "learning_rate": 9.970520128417567e-07, + "loss": 0.3549, + "step": 997 + }, + { + "epoch": 0.06, + "grad_norm": 0.5956360358683012, + "learning_rate": 9.970408041903185e-07, + "loss": 0.165, + "step": 998 + }, + { + "epoch": 0.06, + "grad_norm": 0.5629107483257062, + "learning_rate": 9.970295743341205e-07, + "loss": 0.1385, + "step": 999 + }, + { + "epoch": 0.06, + "grad_norm": 0.2858967054426137, + "learning_rate": 9.970183232736414e-07, + "loss": 0.0118, + "step": 1000 + }, + { + "epoch": 0.06, + "grad_norm": 0.5001823321468504, + "learning_rate": 9.970070510093616e-07, + "loss": 0.3272, + "step": 1001 + }, + { + "epoch": 0.06, + "grad_norm": 0.8804405518087354, + "learning_rate": 9.96995757541762e-07, + "loss": 0.3571, + "step": 1002 + }, + { + "epoch": 0.06, + "grad_norm": 0.5344573086959735, + "learning_rate": 9.969844428713242e-07, + "loss": 0.2215, + "step": 1003 + }, + { + "epoch": 0.06, + "grad_norm": 1.7997486073780382, + "learning_rate": 9.96973106998531e-07, + "loss": 0.2332, + "step": 1004 + }, + { + "epoch": 0.06, + "grad_norm": 0.6109746072637622, + "learning_rate": 9.96961749923866e-07, + "loss": 0.4813, + "step": 1005 + }, + { + "epoch": 0.06, + "grad_norm": 0.31869720912455685, + "learning_rate": 9.969503716478138e-07, + "loss": 0.2018, + "step": 1006 + }, + { + "epoch": 0.06, + "grad_norm": 0.5995328400952173, + "learning_rate": 9.9693897217086e-07, + "loss": 0.1437, + "step": 1007 + }, + { + "epoch": 0.06, + "grad_norm": 1.1796887188031795, + "learning_rate": 9.969275514934903e-07, + "loss": 0.1808, + "step": 1008 + }, + { + "epoch": 0.06, + "grad_norm": 0.5658398713474428, + "learning_rate": 9.969161096161924e-07, + "loss": 0.1765, + "step": 1009 + }, + { + "epoch": 0.06, + "grad_norm": 0.4490361150588233, + "learning_rate": 9.969046465394544e-07, + "loss": 0.1756, + "step": 1010 + }, + { + "epoch": 0.06, + "grad_norm": 0.5629561790593844, + "learning_rate": 9.968931622637651e-07, + "loss": 0.0333, + "step": 1011 + }, + { + "epoch": 0.06, + "grad_norm": 0.5323933653130326, + "learning_rate": 9.968816567896148e-07, + "loss": 0.2166, + "step": 1012 + }, + { + "epoch": 0.06, + "grad_norm": 0.3418637697195922, + "learning_rate": 9.96870130117494e-07, + "loss": 0.2775, + "step": 1013 + }, + { + "epoch": 0.06, + "grad_norm": 0.5571011756013858, + "learning_rate": 9.968585822478948e-07, + "loss": 0.3092, + "step": 1014 + }, + { + "epoch": 0.06, + "grad_norm": 1.470738521801038, + "learning_rate": 9.968470131813096e-07, + "loss": 0.21, + "step": 1015 + }, + { + "epoch": 0.06, + "grad_norm": 0.2805505635889866, + "learning_rate": 9.968354229182319e-07, + "loss": 0.0548, + "step": 1016 + }, + { + "epoch": 0.06, + "grad_norm": 0.6034060234356918, + "learning_rate": 9.968238114591565e-07, + "loss": 0.3312, + "step": 1017 + }, + { + "epoch": 0.06, + "grad_norm": 0.5386064080951632, + "learning_rate": 9.968121788045784e-07, + "loss": 0.2165, + "step": 1018 + }, + { + "epoch": 0.06, + "grad_norm": 0.31641615699874504, + "learning_rate": 9.968005249549942e-07, + "loss": 0.0959, + "step": 1019 + }, + { + "epoch": 0.07, + "grad_norm": 1.3656653627529067, + "learning_rate": 9.967888499109007e-07, + "loss": 0.2005, + "step": 1020 + }, + { + "epoch": 0.07, + "grad_norm": 0.34502538080044065, + "learning_rate": 9.967771536727963e-07, + "loss": 0.2918, + "step": 1021 + }, + { + "epoch": 0.07, + "grad_norm": 0.6263498899361163, + "learning_rate": 9.967654362411798e-07, + "loss": 0.3505, + "step": 1022 + }, + { + "epoch": 0.07, + "grad_norm": 0.22916347440126475, + "learning_rate": 9.967536976165515e-07, + "loss": 0.0495, + "step": 1023 + }, + { + "epoch": 0.07, + "grad_norm": 0.36512885889328833, + "learning_rate": 9.967419377994116e-07, + "loss": 0.2453, + "step": 1024 + }, + { + "epoch": 0.07, + "grad_norm": 0.6159879197644346, + "learning_rate": 9.967301567902619e-07, + "loss": 0.0777, + "step": 1025 + }, + { + "epoch": 0.07, + "grad_norm": 0.5848397426298377, + "learning_rate": 9.967183545896055e-07, + "loss": 0.1947, + "step": 1026 + }, + { + "epoch": 0.07, + "grad_norm": 0.3063476765681408, + "learning_rate": 9.967065311979452e-07, + "loss": 0.0198, + "step": 1027 + }, + { + "epoch": 0.07, + "grad_norm": 0.12462664133673665, + "learning_rate": 9.96694686615786e-07, + "loss": 0.0142, + "step": 1028 + }, + { + "epoch": 0.07, + "grad_norm": 0.33720844756144086, + "learning_rate": 9.96682820843633e-07, + "loss": 0.1659, + "step": 1029 + }, + { + "epoch": 0.07, + "grad_norm": 0.5796779985194487, + "learning_rate": 9.966709338819924e-07, + "loss": 0.1904, + "step": 1030 + }, + { + "epoch": 0.07, + "grad_norm": 0.4410799052200912, + "learning_rate": 9.966590257313713e-07, + "loss": 0.2489, + "step": 1031 + }, + { + "epoch": 0.07, + "grad_norm": 0.5506144502649098, + "learning_rate": 9.966470963922778e-07, + "loss": 0.1857, + "step": 1032 + }, + { + "epoch": 0.07, + "grad_norm": 0.8245813418322644, + "learning_rate": 9.966351458652207e-07, + "loss": 0.0588, + "step": 1033 + }, + { + "epoch": 0.07, + "grad_norm": 0.7573390896270649, + "learning_rate": 9.9662317415071e-07, + "loss": 0.2642, + "step": 1034 + }, + { + "epoch": 0.07, + "grad_norm": 0.6741845741174953, + "learning_rate": 9.96611181249256e-07, + "loss": 0.1297, + "step": 1035 + }, + { + "epoch": 0.07, + "grad_norm": 0.5176367178481315, + "learning_rate": 9.965991671613712e-07, + "loss": 0.1497, + "step": 1036 + }, + { + "epoch": 0.07, + "grad_norm": 0.24016814478297469, + "learning_rate": 9.965871318875674e-07, + "loss": 0.1805, + "step": 1037 + }, + { + "epoch": 0.07, + "grad_norm": 0.5975343726265553, + "learning_rate": 9.965750754283581e-07, + "loss": 0.3547, + "step": 1038 + }, + { + "epoch": 0.07, + "grad_norm": 0.48165162801011546, + "learning_rate": 9.965629977842583e-07, + "loss": 0.0807, + "step": 1039 + }, + { + "epoch": 0.07, + "grad_norm": 0.3968531251659212, + "learning_rate": 9.965508989557825e-07, + "loss": 0.3349, + "step": 1040 + }, + { + "epoch": 0.07, + "grad_norm": 0.27580191225200595, + "learning_rate": 9.965387789434473e-07, + "loss": 0.011, + "step": 1041 + }, + { + "epoch": 0.07, + "grad_norm": 0.44955503042974304, + "learning_rate": 9.965266377477694e-07, + "loss": 0.2546, + "step": 1042 + }, + { + "epoch": 0.07, + "grad_norm": 0.3241293984826068, + "learning_rate": 9.965144753692672e-07, + "loss": 0.0935, + "step": 1043 + }, + { + "epoch": 0.07, + "grad_norm": 0.3984129298375122, + "learning_rate": 9.965022918084591e-07, + "loss": 0.0321, + "step": 1044 + }, + { + "epoch": 0.07, + "grad_norm": 0.22176833947800315, + "learning_rate": 9.964900870658653e-07, + "loss": 0.1971, + "step": 1045 + }, + { + "epoch": 0.07, + "grad_norm": 0.4374868508187839, + "learning_rate": 9.964778611420063e-07, + "loss": 0.1572, + "step": 1046 + }, + { + "epoch": 0.07, + "grad_norm": 0.6606499413383291, + "learning_rate": 9.964656140374038e-07, + "loss": 0.1915, + "step": 1047 + }, + { + "epoch": 0.07, + "grad_norm": 0.6208050665040733, + "learning_rate": 9.9645334575258e-07, + "loss": 0.0846, + "step": 1048 + }, + { + "epoch": 0.07, + "grad_norm": 0.3445588204772651, + "learning_rate": 9.964410562880587e-07, + "loss": 0.0497, + "step": 1049 + }, + { + "epoch": 0.07, + "grad_norm": 0.4460992279579737, + "learning_rate": 9.964287456443639e-07, + "loss": 0.3039, + "step": 1050 + }, + { + "epoch": 0.07, + "grad_norm": 0.3220445790766819, + "learning_rate": 9.964164138220207e-07, + "loss": 0.0138, + "step": 1051 + }, + { + "epoch": 0.07, + "grad_norm": 0.6758744160821144, + "learning_rate": 9.964040608215557e-07, + "loss": 0.2955, + "step": 1052 + }, + { + "epoch": 0.07, + "grad_norm": 0.3339475440618883, + "learning_rate": 9.963916866434952e-07, + "loss": 0.0671, + "step": 1053 + }, + { + "epoch": 0.07, + "grad_norm": 1.1073303272937873, + "learning_rate": 9.963792912883676e-07, + "loss": 0.2129, + "step": 1054 + }, + { + "epoch": 0.07, + "grad_norm": 0.6059768593260848, + "learning_rate": 9.963668747567018e-07, + "loss": 0.2493, + "step": 1055 + }, + { + "epoch": 0.07, + "grad_norm": 0.4349080240496784, + "learning_rate": 9.963544370490268e-07, + "loss": 0.408, + "step": 1056 + }, + { + "epoch": 0.07, + "grad_norm": 0.6005730259160773, + "learning_rate": 9.963419781658742e-07, + "loss": 0.095, + "step": 1057 + }, + { + "epoch": 0.07, + "grad_norm": 0.6750176494329789, + "learning_rate": 9.963294981077747e-07, + "loss": 0.3842, + "step": 1058 + }, + { + "epoch": 0.07, + "grad_norm": 0.3515007208986595, + "learning_rate": 9.963169968752613e-07, + "loss": 0.128, + "step": 1059 + }, + { + "epoch": 0.07, + "grad_norm": 0.8845299459356517, + "learning_rate": 9.96304474468867e-07, + "loss": 0.3846, + "step": 1060 + }, + { + "epoch": 0.07, + "grad_norm": 0.7393735816661642, + "learning_rate": 9.962919308891263e-07, + "loss": 0.3981, + "step": 1061 + }, + { + "epoch": 0.07, + "grad_norm": 0.44839149358820335, + "learning_rate": 9.96279366136574e-07, + "loss": 0.2503, + "step": 1062 + }, + { + "epoch": 0.07, + "grad_norm": 0.5404046180966962, + "learning_rate": 9.962667802117463e-07, + "loss": 0.1419, + "step": 1063 + }, + { + "epoch": 0.07, + "grad_norm": 0.16238469118518242, + "learning_rate": 9.962541731151802e-07, + "loss": 0.0093, + "step": 1064 + }, + { + "epoch": 0.07, + "grad_norm": 0.5738195741439209, + "learning_rate": 9.962415448474133e-07, + "loss": 0.2628, + "step": 1065 + }, + { + "epoch": 0.07, + "grad_norm": 0.25126688476863107, + "learning_rate": 9.962288954089845e-07, + "loss": 0.0424, + "step": 1066 + }, + { + "epoch": 0.07, + "grad_norm": 0.35598606064613925, + "learning_rate": 9.962162248004337e-07, + "loss": 0.1254, + "step": 1067 + }, + { + "epoch": 0.07, + "grad_norm": 0.46534900163923154, + "learning_rate": 9.96203533022301e-07, + "loss": 0.0417, + "step": 1068 + }, + { + "epoch": 0.07, + "grad_norm": 0.469743894149503, + "learning_rate": 9.961908200751283e-07, + "loss": 0.1853, + "step": 1069 + }, + { + "epoch": 0.07, + "grad_norm": 0.5671191935457041, + "learning_rate": 9.961780859594578e-07, + "loss": 0.1609, + "step": 1070 + }, + { + "epoch": 0.07, + "grad_norm": 0.5871653801245706, + "learning_rate": 9.961653306758325e-07, + "loss": 0.2998, + "step": 1071 + }, + { + "epoch": 0.07, + "grad_norm": 0.5748645345770612, + "learning_rate": 9.961525542247968e-07, + "loss": 0.1314, + "step": 1072 + }, + { + "epoch": 0.07, + "grad_norm": 0.8097576392881596, + "learning_rate": 9.961397566068958e-07, + "loss": 0.2395, + "step": 1073 + }, + { + "epoch": 0.07, + "grad_norm": 0.3934771452321142, + "learning_rate": 9.961269378226755e-07, + "loss": 0.0575, + "step": 1074 + }, + { + "epoch": 0.07, + "grad_norm": 0.7713689273114455, + "learning_rate": 9.961140978726827e-07, + "loss": 0.3685, + "step": 1075 + }, + { + "epoch": 0.07, + "grad_norm": 0.6497760757747596, + "learning_rate": 9.96101236757465e-07, + "loss": 0.2248, + "step": 1076 + }, + { + "epoch": 0.07, + "grad_norm": 1.141576142325507, + "learning_rate": 9.960883544775713e-07, + "loss": 0.1828, + "step": 1077 + }, + { + "epoch": 0.07, + "grad_norm": 1.348158927625783, + "learning_rate": 9.960754510335513e-07, + "loss": 0.2588, + "step": 1078 + }, + { + "epoch": 0.07, + "grad_norm": 0.7260341348660807, + "learning_rate": 9.960625264259552e-07, + "loss": 0.3234, + "step": 1079 + }, + { + "epoch": 0.07, + "grad_norm": 0.23468077749983832, + "learning_rate": 9.960495806553345e-07, + "loss": 0.1675, + "step": 1080 + }, + { + "epoch": 0.07, + "grad_norm": 1.140786165743733, + "learning_rate": 9.960366137222413e-07, + "loss": 0.0529, + "step": 1081 + }, + { + "epoch": 0.07, + "grad_norm": 0.5568296291928382, + "learning_rate": 9.960236256272293e-07, + "loss": 0.1255, + "step": 1082 + }, + { + "epoch": 0.07, + "grad_norm": 0.5604993219180562, + "learning_rate": 9.960106163708522e-07, + "loss": 0.3344, + "step": 1083 + }, + { + "epoch": 0.07, + "grad_norm": 0.4688437202999613, + "learning_rate": 9.959975859536652e-07, + "loss": 0.0343, + "step": 1084 + }, + { + "epoch": 0.07, + "grad_norm": 0.42635694146014674, + "learning_rate": 9.95984534376224e-07, + "loss": 0.1534, + "step": 1085 + }, + { + "epoch": 0.07, + "grad_norm": 0.17090463730875813, + "learning_rate": 9.959714616390855e-07, + "loss": 0.0088, + "step": 1086 + }, + { + "epoch": 0.07, + "grad_norm": 0.5101292094283796, + "learning_rate": 9.959583677428074e-07, + "loss": 0.2843, + "step": 1087 + }, + { + "epoch": 0.07, + "grad_norm": 0.6069779617803412, + "learning_rate": 9.959452526879484e-07, + "loss": 0.1746, + "step": 1088 + }, + { + "epoch": 0.07, + "grad_norm": 1.1833347923100763, + "learning_rate": 9.959321164750678e-07, + "loss": 0.2469, + "step": 1089 + }, + { + "epoch": 0.07, + "grad_norm": 0.5081377136538748, + "learning_rate": 9.959189591047264e-07, + "loss": 0.3232, + "step": 1090 + }, + { + "epoch": 0.07, + "grad_norm": 1.4076384920062124, + "learning_rate": 9.959057805774851e-07, + "loss": 0.2807, + "step": 1091 + }, + { + "epoch": 0.07, + "grad_norm": 0.3374042679952573, + "learning_rate": 9.958925808939063e-07, + "loss": 0.1914, + "step": 1092 + }, + { + "epoch": 0.07, + "grad_norm": 0.6500977046862573, + "learning_rate": 9.958793600545531e-07, + "loss": 0.4011, + "step": 1093 + }, + { + "epoch": 0.07, + "grad_norm": 0.17172487725261068, + "learning_rate": 9.958661180599898e-07, + "loss": 0.0724, + "step": 1094 + }, + { + "epoch": 0.07, + "grad_norm": 0.5200887718325509, + "learning_rate": 9.95852854910781e-07, + "loss": 0.109, + "step": 1095 + }, + { + "epoch": 0.07, + "grad_norm": 1.0243269067900986, + "learning_rate": 9.958395706074925e-07, + "loss": 0.3678, + "step": 1096 + }, + { + "epoch": 0.07, + "grad_norm": 0.21424244960150624, + "learning_rate": 9.958262651506913e-07, + "loss": 0.0498, + "step": 1097 + }, + { + "epoch": 0.07, + "grad_norm": 0.43342568758247785, + "learning_rate": 9.958129385409447e-07, + "loss": 0.2587, + "step": 1098 + }, + { + "epoch": 0.07, + "grad_norm": 0.44996930849030936, + "learning_rate": 9.957995907788217e-07, + "loss": 0.0959, + "step": 1099 + }, + { + "epoch": 0.07, + "grad_norm": 0.3958412973964965, + "learning_rate": 9.95786221864891e-07, + "loss": 0.2296, + "step": 1100 + }, + { + "epoch": 0.07, + "grad_norm": 0.6945523383410414, + "learning_rate": 9.95772831799724e-07, + "loss": 0.2633, + "step": 1101 + }, + { + "epoch": 0.07, + "grad_norm": 0.5773111265925281, + "learning_rate": 9.95759420583891e-07, + "loss": 0.0299, + "step": 1102 + }, + { + "epoch": 0.07, + "grad_norm": 0.49091192136696843, + "learning_rate": 9.957459882179647e-07, + "loss": 0.2164, + "step": 1103 + }, + { + "epoch": 0.07, + "grad_norm": 0.6774248050977482, + "learning_rate": 9.957325347025178e-07, + "loss": 0.3325, + "step": 1104 + }, + { + "epoch": 0.07, + "grad_norm": 0.5824953378957358, + "learning_rate": 9.957190600381245e-07, + "loss": 0.2733, + "step": 1105 + }, + { + "epoch": 0.07, + "grad_norm": 0.565278403467531, + "learning_rate": 9.957055642253596e-07, + "loss": 0.376, + "step": 1106 + }, + { + "epoch": 0.07, + "grad_norm": 0.27982614461170324, + "learning_rate": 9.95692047264799e-07, + "loss": 0.0116, + "step": 1107 + }, + { + "epoch": 0.07, + "grad_norm": 0.6146706873778525, + "learning_rate": 9.956785091570189e-07, + "loss": 0.2508, + "step": 1108 + }, + { + "epoch": 0.07, + "grad_norm": 0.6467482493773037, + "learning_rate": 9.956649499025973e-07, + "loss": 0.2231, + "step": 1109 + }, + { + "epoch": 0.07, + "grad_norm": 0.41524716018566904, + "learning_rate": 9.956513695021124e-07, + "loss": 0.3621, + "step": 1110 + }, + { + "epoch": 0.07, + "grad_norm": 0.39463424844312994, + "learning_rate": 9.956377679561439e-07, + "loss": 0.3311, + "step": 1111 + }, + { + "epoch": 0.07, + "grad_norm": 0.20254282800084308, + "learning_rate": 9.956241452652717e-07, + "loss": 0.1893, + "step": 1112 + }, + { + "epoch": 0.07, + "grad_norm": 0.6996924986838188, + "learning_rate": 9.95610501430077e-07, + "loss": 0.2643, + "step": 1113 + }, + { + "epoch": 0.07, + "grad_norm": 0.45664575882710157, + "learning_rate": 9.955968364511425e-07, + "loss": 0.2615, + "step": 1114 + }, + { + "epoch": 0.07, + "grad_norm": 0.4008275526455585, + "learning_rate": 9.955831503290502e-07, + "loss": 0.2214, + "step": 1115 + }, + { + "epoch": 0.07, + "grad_norm": 0.6087192129796571, + "learning_rate": 9.955694430643847e-07, + "loss": 0.2212, + "step": 1116 + }, + { + "epoch": 0.07, + "grad_norm": 0.30902609825608435, + "learning_rate": 9.955557146577305e-07, + "loss": 0.2269, + "step": 1117 + }, + { + "epoch": 0.07, + "grad_norm": 0.6729231869763611, + "learning_rate": 9.955419651096733e-07, + "loss": 0.155, + "step": 1118 + }, + { + "epoch": 0.07, + "grad_norm": 0.6423072684982292, + "learning_rate": 9.955281944207996e-07, + "loss": 0.2317, + "step": 1119 + }, + { + "epoch": 0.07, + "grad_norm": 0.3614719727445751, + "learning_rate": 9.955144025916972e-07, + "loss": 0.0284, + "step": 1120 + }, + { + "epoch": 0.07, + "grad_norm": 0.31847134873224486, + "learning_rate": 9.955005896229541e-07, + "loss": 0.059, + "step": 1121 + }, + { + "epoch": 0.07, + "grad_norm": 0.6640613090116112, + "learning_rate": 9.954867555151599e-07, + "loss": 0.1176, + "step": 1122 + }, + { + "epoch": 0.07, + "grad_norm": 0.2870218793153667, + "learning_rate": 9.954729002689046e-07, + "loss": 0.1116, + "step": 1123 + }, + { + "epoch": 0.07, + "grad_norm": 2.4299444902143463, + "learning_rate": 9.954590238847792e-07, + "loss": 0.177, + "step": 1124 + }, + { + "epoch": 0.07, + "grad_norm": 0.5042856873929278, + "learning_rate": 9.954451263633761e-07, + "loss": 0.2856, + "step": 1125 + }, + { + "epoch": 0.07, + "grad_norm": 0.5624574567029191, + "learning_rate": 9.95431207705288e-07, + "loss": 0.2922, + "step": 1126 + }, + { + "epoch": 0.07, + "grad_norm": 0.7033341472119826, + "learning_rate": 9.954172679111083e-07, + "loss": 0.19, + "step": 1127 + }, + { + "epoch": 0.07, + "grad_norm": 0.5166063605840293, + "learning_rate": 9.954033069814323e-07, + "loss": 0.1983, + "step": 1128 + }, + { + "epoch": 0.07, + "grad_norm": 0.3111036588703034, + "learning_rate": 9.953893249168552e-07, + "loss": 0.2572, + "step": 1129 + }, + { + "epoch": 0.07, + "grad_norm": 0.5850201954455693, + "learning_rate": 9.953753217179737e-07, + "loss": 0.3981, + "step": 1130 + }, + { + "epoch": 0.07, + "grad_norm": 0.31148056278206443, + "learning_rate": 9.95361297385385e-07, + "loss": 0.1387, + "step": 1131 + }, + { + "epoch": 0.07, + "grad_norm": 0.5011527650666843, + "learning_rate": 9.953472519196876e-07, + "loss": 0.1981, + "step": 1132 + }, + { + "epoch": 0.07, + "grad_norm": 0.2952964681892801, + "learning_rate": 9.95333185321481e-07, + "loss": 0.0314, + "step": 1133 + }, + { + "epoch": 0.07, + "grad_norm": 0.2969760901552938, + "learning_rate": 9.953190975913645e-07, + "loss": 0.0142, + "step": 1134 + }, + { + "epoch": 0.07, + "grad_norm": 0.303357352799507, + "learning_rate": 9.953049887299399e-07, + "loss": 0.0174, + "step": 1135 + }, + { + "epoch": 0.07, + "grad_norm": 0.5223668998963273, + "learning_rate": 9.952908587378088e-07, + "loss": 0.0619, + "step": 1136 + }, + { + "epoch": 0.07, + "grad_norm": 0.4011876020727392, + "learning_rate": 9.95276707615574e-07, + "loss": 0.1629, + "step": 1137 + }, + { + "epoch": 0.07, + "grad_norm": 0.6661033093782415, + "learning_rate": 9.95262535363839e-07, + "loss": 0.3111, + "step": 1138 + }, + { + "epoch": 0.07, + "grad_norm": 0.20087502918394676, + "learning_rate": 9.952483419832087e-07, + "loss": 0.0984, + "step": 1139 + }, + { + "epoch": 0.07, + "grad_norm": 0.5523294888749696, + "learning_rate": 9.952341274742885e-07, + "loss": 0.3158, + "step": 1140 + }, + { + "epoch": 0.07, + "grad_norm": 0.4577651432739096, + "learning_rate": 9.952198918376852e-07, + "loss": 0.233, + "step": 1141 + }, + { + "epoch": 0.07, + "grad_norm": 0.6495563574623383, + "learning_rate": 9.952056350740055e-07, + "loss": 0.231, + "step": 1142 + }, + { + "epoch": 0.07, + "grad_norm": 0.1953624200184034, + "learning_rate": 9.95191357183858e-07, + "loss": 0.1671, + "step": 1143 + }, + { + "epoch": 0.07, + "grad_norm": 0.656737350758839, + "learning_rate": 9.951770581678517e-07, + "loss": 0.2997, + "step": 1144 + }, + { + "epoch": 0.07, + "grad_norm": 0.4068216497777626, + "learning_rate": 9.951627380265966e-07, + "loss": 0.4044, + "step": 1145 + }, + { + "epoch": 0.07, + "grad_norm": 0.7505436657249192, + "learning_rate": 9.95148396760704e-07, + "loss": 0.1125, + "step": 1146 + }, + { + "epoch": 0.07, + "grad_norm": 2.649769295135062, + "learning_rate": 9.95134034370785e-07, + "loss": 0.0573, + "step": 1147 + }, + { + "epoch": 0.07, + "grad_norm": 0.3886561767416595, + "learning_rate": 9.95119650857453e-07, + "loss": 0.0267, + "step": 1148 + }, + { + "epoch": 0.07, + "grad_norm": 0.3897086233521129, + "learning_rate": 9.951052462213214e-07, + "loss": 0.133, + "step": 1149 + }, + { + "epoch": 0.07, + "grad_norm": 0.29633369766252005, + "learning_rate": 9.950908204630047e-07, + "loss": 0.1935, + "step": 1150 + }, + { + "epoch": 0.07, + "grad_norm": 0.4656970089994235, + "learning_rate": 9.950763735831182e-07, + "loss": 0.3169, + "step": 1151 + }, + { + "epoch": 0.07, + "grad_norm": 0.3022354535153058, + "learning_rate": 9.950619055822786e-07, + "loss": 0.0651, + "step": 1152 + }, + { + "epoch": 0.07, + "grad_norm": 0.7774838289878337, + "learning_rate": 9.950474164611028e-07, + "loss": 0.1482, + "step": 1153 + }, + { + "epoch": 0.07, + "grad_norm": 0.5852672827206604, + "learning_rate": 9.95032906220209e-07, + "loss": 0.1979, + "step": 1154 + }, + { + "epoch": 0.07, + "grad_norm": 0.4731884099599208, + "learning_rate": 9.950183748602163e-07, + "loss": 0.2213, + "step": 1155 + }, + { + "epoch": 0.07, + "grad_norm": 0.6184415335068383, + "learning_rate": 9.950038223817447e-07, + "loss": 0.073, + "step": 1156 + }, + { + "epoch": 0.07, + "grad_norm": 0.47959352443439546, + "learning_rate": 9.949892487854149e-07, + "loss": 0.252, + "step": 1157 + }, + { + "epoch": 0.07, + "grad_norm": 0.7619017568194408, + "learning_rate": 9.949746540718487e-07, + "loss": 0.2122, + "step": 1158 + }, + { + "epoch": 0.07, + "grad_norm": 0.4304366317713998, + "learning_rate": 9.949600382416685e-07, + "loss": 0.0904, + "step": 1159 + }, + { + "epoch": 0.07, + "grad_norm": 0.6021043497073343, + "learning_rate": 9.949454012954985e-07, + "loss": 0.1185, + "step": 1160 + }, + { + "epoch": 0.07, + "grad_norm": 0.6665744147667394, + "learning_rate": 9.949307432339624e-07, + "loss": 0.1825, + "step": 1161 + }, + { + "epoch": 0.07, + "grad_norm": 0.47650417959695823, + "learning_rate": 9.94916064057686e-07, + "loss": 0.3691, + "step": 1162 + }, + { + "epoch": 0.07, + "grad_norm": 0.15356554982909978, + "learning_rate": 9.949013637672953e-07, + "loss": 0.1171, + "step": 1163 + }, + { + "epoch": 0.07, + "grad_norm": 0.5157615623872764, + "learning_rate": 9.948866423634176e-07, + "loss": 0.1202, + "step": 1164 + }, + { + "epoch": 0.07, + "grad_norm": 0.4315926796169466, + "learning_rate": 9.94871899846681e-07, + "loss": 0.152, + "step": 1165 + }, + { + "epoch": 0.07, + "grad_norm": 0.40499839612104527, + "learning_rate": 9.94857136217714e-07, + "loss": 0.3212, + "step": 1166 + }, + { + "epoch": 0.07, + "grad_norm": 0.2255758734858464, + "learning_rate": 9.948423514771472e-07, + "loss": 0.1532, + "step": 1167 + }, + { + "epoch": 0.07, + "grad_norm": 0.6682922622170951, + "learning_rate": 9.948275456256108e-07, + "loss": 0.2367, + "step": 1168 + }, + { + "epoch": 0.07, + "grad_norm": 0.7060494908892788, + "learning_rate": 9.948127186637365e-07, + "loss": 0.0218, + "step": 1169 + }, + { + "epoch": 0.07, + "grad_norm": 0.3161321311597232, + "learning_rate": 9.94797870592157e-07, + "loss": 0.2219, + "step": 1170 + }, + { + "epoch": 0.07, + "grad_norm": 0.3369016471937861, + "learning_rate": 9.947830014115056e-07, + "loss": 0.2064, + "step": 1171 + }, + { + "epoch": 0.07, + "grad_norm": 1.1991222681659008, + "learning_rate": 9.947681111224166e-07, + "loss": 0.4302, + "step": 1172 + }, + { + "epoch": 0.07, + "grad_norm": 1.4410691788944219, + "learning_rate": 9.947531997255256e-07, + "loss": 0.2865, + "step": 1173 + }, + { + "epoch": 0.07, + "grad_norm": 0.3624040415600144, + "learning_rate": 9.947382672214684e-07, + "loss": 0.2802, + "step": 1174 + }, + { + "epoch": 0.07, + "grad_norm": 0.5548278217941871, + "learning_rate": 9.947233136108822e-07, + "loss": 0.1076, + "step": 1175 + }, + { + "epoch": 0.07, + "grad_norm": 0.5653568715153751, + "learning_rate": 9.94708338894405e-07, + "loss": 0.3068, + "step": 1176 + }, + { + "epoch": 0.08, + "grad_norm": 0.18076817660824668, + "learning_rate": 9.946933430726753e-07, + "loss": 0.0119, + "step": 1177 + }, + { + "epoch": 0.08, + "grad_norm": 0.4426239335022008, + "learning_rate": 9.946783261463333e-07, + "loss": 0.1841, + "step": 1178 + }, + { + "epoch": 0.08, + "grad_norm": 0.582521438640547, + "learning_rate": 9.946632881160196e-07, + "loss": 0.2756, + "step": 1179 + }, + { + "epoch": 0.08, + "grad_norm": 0.7782034172939633, + "learning_rate": 9.946482289823755e-07, + "loss": 0.1649, + "step": 1180 + }, + { + "epoch": 0.08, + "grad_norm": 0.3751614071969384, + "learning_rate": 9.946331487460435e-07, + "loss": 0.048, + "step": 1181 + }, + { + "epoch": 0.08, + "grad_norm": 0.6767317790794916, + "learning_rate": 9.946180474076673e-07, + "loss": 0.1142, + "step": 1182 + }, + { + "epoch": 0.08, + "grad_norm": 0.40650271265656646, + "learning_rate": 9.946029249678906e-07, + "loss": 0.0159, + "step": 1183 + }, + { + "epoch": 0.08, + "grad_norm": 0.4988330139152289, + "learning_rate": 9.94587781427359e-07, + "loss": 0.1236, + "step": 1184 + }, + { + "epoch": 0.08, + "grad_norm": 0.741740164161934, + "learning_rate": 9.945726167867184e-07, + "loss": 0.3357, + "step": 1185 + }, + { + "epoch": 0.08, + "grad_norm": 0.6184543088109902, + "learning_rate": 9.945574310466159e-07, + "loss": 0.2253, + "step": 1186 + }, + { + "epoch": 0.08, + "grad_norm": 0.5727246845437999, + "learning_rate": 9.945422242076989e-07, + "loss": 0.3845, + "step": 1187 + }, + { + "epoch": 0.08, + "grad_norm": 0.5432437340533075, + "learning_rate": 9.945269962706167e-07, + "loss": 0.3971, + "step": 1188 + }, + { + "epoch": 0.08, + "grad_norm": 0.29682050896542284, + "learning_rate": 9.945117472360184e-07, + "loss": 0.1106, + "step": 1189 + }, + { + "epoch": 0.08, + "grad_norm": 0.8166999080191234, + "learning_rate": 9.944964771045552e-07, + "loss": 0.3071, + "step": 1190 + }, + { + "epoch": 0.08, + "grad_norm": 0.7471517122493586, + "learning_rate": 9.944811858768782e-07, + "loss": 0.1709, + "step": 1191 + }, + { + "epoch": 0.08, + "grad_norm": 0.2994779575470051, + "learning_rate": 9.944658735536395e-07, + "loss": 0.2743, + "step": 1192 + }, + { + "epoch": 0.08, + "grad_norm": 0.23381810826721885, + "learning_rate": 9.94450540135493e-07, + "loss": 0.0987, + "step": 1193 + }, + { + "epoch": 0.08, + "grad_norm": 0.39938123805950754, + "learning_rate": 9.94435185623092e-07, + "loss": 0.2987, + "step": 1194 + }, + { + "epoch": 0.08, + "grad_norm": 0.9688135724386572, + "learning_rate": 9.944198100170927e-07, + "loss": 0.152, + "step": 1195 + }, + { + "epoch": 0.08, + "grad_norm": 0.6587060853941012, + "learning_rate": 9.9440441331815e-07, + "loss": 0.1525, + "step": 1196 + }, + { + "epoch": 0.08, + "grad_norm": 0.16281682683361928, + "learning_rate": 9.943889955269212e-07, + "loss": 0.0904, + "step": 1197 + }, + { + "epoch": 0.08, + "grad_norm": 0.33697811875794936, + "learning_rate": 9.94373556644064e-07, + "loss": 0.059, + "step": 1198 + }, + { + "epoch": 0.08, + "grad_norm": 0.3749015720899504, + "learning_rate": 9.94358096670237e-07, + "loss": 0.2188, + "step": 1199 + }, + { + "epoch": 0.08, + "grad_norm": 1.010358437978261, + "learning_rate": 9.943426156061e-07, + "loss": 0.2657, + "step": 1200 + }, + { + "epoch": 0.08, + "grad_norm": 0.5161456376184853, + "learning_rate": 9.94327113452313e-07, + "loss": 0.0139, + "step": 1201 + }, + { + "epoch": 0.08, + "grad_norm": 0.2871394945981592, + "learning_rate": 9.943115902095378e-07, + "loss": 0.008, + "step": 1202 + }, + { + "epoch": 0.08, + "grad_norm": 0.6177966823720042, + "learning_rate": 9.942960458784364e-07, + "loss": 0.3859, + "step": 1203 + }, + { + "epoch": 0.08, + "grad_norm": 0.3777364649984627, + "learning_rate": 9.942804804596722e-07, + "loss": 0.1185, + "step": 1204 + }, + { + "epoch": 0.08, + "grad_norm": 0.969121238221322, + "learning_rate": 9.942648939539086e-07, + "loss": 0.306, + "step": 1205 + }, + { + "epoch": 0.08, + "grad_norm": 0.146732018703517, + "learning_rate": 9.942492863618114e-07, + "loss": 0.156, + "step": 1206 + }, + { + "epoch": 0.08, + "grad_norm": 0.41643345430756434, + "learning_rate": 9.942336576840462e-07, + "loss": 0.1954, + "step": 1207 + }, + { + "epoch": 0.08, + "grad_norm": 0.19056838451877037, + "learning_rate": 9.942180079212793e-07, + "loss": 0.093, + "step": 1208 + }, + { + "epoch": 0.08, + "grad_norm": 0.39719466309769197, + "learning_rate": 9.94202337074179e-07, + "loss": 0.1966, + "step": 1209 + }, + { + "epoch": 0.08, + "grad_norm": 0.7120739258858446, + "learning_rate": 9.941866451434131e-07, + "loss": 0.2299, + "step": 1210 + }, + { + "epoch": 0.08, + "grad_norm": 2.394798508834025, + "learning_rate": 9.94170932129652e-07, + "loss": 0.2869, + "step": 1211 + }, + { + "epoch": 0.08, + "grad_norm": 0.5192994332437146, + "learning_rate": 9.941551980335652e-07, + "loss": 0.2403, + "step": 1212 + }, + { + "epoch": 0.08, + "grad_norm": 0.4367852588222532, + "learning_rate": 9.941394428558244e-07, + "loss": 0.1468, + "step": 1213 + }, + { + "epoch": 0.08, + "grad_norm": 0.74335549781076, + "learning_rate": 9.941236665971015e-07, + "loss": 0.2553, + "step": 1214 + }, + { + "epoch": 0.08, + "grad_norm": 0.3262131805085108, + "learning_rate": 9.941078692580698e-07, + "loss": 0.1993, + "step": 1215 + }, + { + "epoch": 0.08, + "grad_norm": 0.3145273998877605, + "learning_rate": 9.94092050839403e-07, + "loss": 0.1359, + "step": 1216 + }, + { + "epoch": 0.08, + "grad_norm": 0.43561714541066404, + "learning_rate": 9.94076211341776e-07, + "loss": 0.0249, + "step": 1217 + }, + { + "epoch": 0.08, + "grad_norm": 0.2378825638605044, + "learning_rate": 9.940603507658648e-07, + "loss": 0.0156, + "step": 1218 + }, + { + "epoch": 0.08, + "grad_norm": 0.6998547645217111, + "learning_rate": 9.940444691123458e-07, + "loss": 0.3879, + "step": 1219 + }, + { + "epoch": 0.08, + "grad_norm": 0.9523241648092471, + "learning_rate": 9.940285663818967e-07, + "loss": 0.3711, + "step": 1220 + }, + { + "epoch": 0.08, + "grad_norm": 0.8063499293177215, + "learning_rate": 9.940126425751956e-07, + "loss": 0.0743, + "step": 1221 + }, + { + "epoch": 0.08, + "grad_norm": 1.6669138093510243, + "learning_rate": 9.939966976929222e-07, + "loss": 0.4219, + "step": 1222 + }, + { + "epoch": 0.08, + "grad_norm": 0.5257776310542316, + "learning_rate": 9.939807317357566e-07, + "loss": 0.3033, + "step": 1223 + }, + { + "epoch": 0.08, + "grad_norm": 0.23137183935848096, + "learning_rate": 9.939647447043798e-07, + "loss": 0.0905, + "step": 1224 + }, + { + "epoch": 0.08, + "grad_norm": 0.1987866191319867, + "learning_rate": 9.939487365994741e-07, + "loss": 0.0111, + "step": 1225 + }, + { + "epoch": 0.08, + "grad_norm": 0.4266213404580218, + "learning_rate": 9.939327074217225e-07, + "loss": 0.1496, + "step": 1226 + }, + { + "epoch": 0.08, + "grad_norm": 0.8760138859606063, + "learning_rate": 9.939166571718084e-07, + "loss": 0.5151, + "step": 1227 + }, + { + "epoch": 0.08, + "grad_norm": 0.46525354145023406, + "learning_rate": 9.93900585850417e-07, + "loss": 0.15, + "step": 1228 + }, + { + "epoch": 0.08, + "grad_norm": 0.48283921693480103, + "learning_rate": 9.938844934582337e-07, + "loss": 0.2343, + "step": 1229 + }, + { + "epoch": 0.08, + "grad_norm": 0.6795500594554379, + "learning_rate": 9.938683799959452e-07, + "loss": 0.2176, + "step": 1230 + }, + { + "epoch": 0.08, + "grad_norm": 0.4507303170281007, + "learning_rate": 9.938522454642387e-07, + "loss": 0.1081, + "step": 1231 + }, + { + "epoch": 0.08, + "grad_norm": 0.4182381054561944, + "learning_rate": 9.938360898638026e-07, + "loss": 0.1998, + "step": 1232 + }, + { + "epoch": 0.08, + "grad_norm": 0.1807866975834911, + "learning_rate": 9.938199131953263e-07, + "loss": 0.0178, + "step": 1233 + }, + { + "epoch": 0.08, + "grad_norm": 0.6328066547132252, + "learning_rate": 9.938037154594996e-07, + "loss": 0.1559, + "step": 1234 + }, + { + "epoch": 0.08, + "grad_norm": 0.4285963389738819, + "learning_rate": 9.937874966570139e-07, + "loss": 0.1104, + "step": 1235 + }, + { + "epoch": 0.08, + "grad_norm": 0.689788062574801, + "learning_rate": 9.937712567885608e-07, + "loss": 0.1247, + "step": 1236 + }, + { + "epoch": 0.08, + "grad_norm": 0.4376887501103226, + "learning_rate": 9.937549958548335e-07, + "loss": 0.3802, + "step": 1237 + }, + { + "epoch": 0.08, + "grad_norm": 0.42673112844316574, + "learning_rate": 9.937387138565255e-07, + "loss": 0.2974, + "step": 1238 + }, + { + "epoch": 0.08, + "grad_norm": 0.48626121488061697, + "learning_rate": 9.93722410794331e-07, + "loss": 0.3471, + "step": 1239 + }, + { + "epoch": 0.08, + "grad_norm": 0.6003795063350325, + "learning_rate": 9.937060866689463e-07, + "loss": 0.2365, + "step": 1240 + }, + { + "epoch": 0.08, + "grad_norm": 0.5696774630629393, + "learning_rate": 9.936897414810676e-07, + "loss": 0.2721, + "step": 1241 + }, + { + "epoch": 0.08, + "grad_norm": 0.20690602525821974, + "learning_rate": 9.936733752313918e-07, + "loss": 0.1156, + "step": 1242 + }, + { + "epoch": 0.08, + "grad_norm": 0.7178905498679734, + "learning_rate": 9.936569879206175e-07, + "loss": 0.2117, + "step": 1243 + }, + { + "epoch": 0.08, + "grad_norm": 0.5812984740921069, + "learning_rate": 9.936405795494438e-07, + "loss": 0.0232, + "step": 1244 + }, + { + "epoch": 0.08, + "grad_norm": 0.12484029407011618, + "learning_rate": 9.936241501185705e-07, + "loss": 0.0247, + "step": 1245 + }, + { + "epoch": 0.08, + "grad_norm": 0.5954917585734902, + "learning_rate": 9.936076996286987e-07, + "loss": 0.0639, + "step": 1246 + }, + { + "epoch": 0.08, + "grad_norm": 0.4379080912973191, + "learning_rate": 9.935912280805302e-07, + "loss": 0.1497, + "step": 1247 + }, + { + "epoch": 0.08, + "grad_norm": 0.8882229683532059, + "learning_rate": 9.935747354747677e-07, + "loss": 0.0951, + "step": 1248 + }, + { + "epoch": 0.08, + "grad_norm": 0.29518732119725033, + "learning_rate": 9.935582218121147e-07, + "loss": 0.0143, + "step": 1249 + }, + { + "epoch": 0.08, + "grad_norm": 0.4637263032560623, + "learning_rate": 9.935416870932757e-07, + "loss": 0.1727, + "step": 1250 + }, + { + "epoch": 0.08, + "grad_norm": 0.28543800842752126, + "learning_rate": 9.935251313189563e-07, + "loss": 0.0648, + "step": 1251 + }, + { + "epoch": 0.08, + "grad_norm": 0.3134866725587153, + "learning_rate": 9.935085544898627e-07, + "loss": 0.0368, + "step": 1252 + }, + { + "epoch": 0.08, + "grad_norm": 1.0373680656726496, + "learning_rate": 9.93491956606702e-07, + "loss": 0.1571, + "step": 1253 + }, + { + "epoch": 0.08, + "grad_norm": 0.484000544280834, + "learning_rate": 9.934753376701825e-07, + "loss": 0.2467, + "step": 1254 + }, + { + "epoch": 0.08, + "grad_norm": 0.5171668737774494, + "learning_rate": 9.93458697681013e-07, + "loss": 0.2099, + "step": 1255 + }, + { + "epoch": 0.08, + "grad_norm": 1.02310506054813, + "learning_rate": 9.934420366399036e-07, + "loss": 0.2822, + "step": 1256 + }, + { + "epoch": 0.08, + "grad_norm": 0.42189408668989653, + "learning_rate": 9.934253545475648e-07, + "loss": 0.1756, + "step": 1257 + }, + { + "epoch": 0.08, + "grad_norm": 0.15533316374032066, + "learning_rate": 9.934086514047086e-07, + "loss": 0.0806, + "step": 1258 + }, + { + "epoch": 0.08, + "grad_norm": 0.6559099183399568, + "learning_rate": 9.933919272120474e-07, + "loss": 0.2921, + "step": 1259 + }, + { + "epoch": 0.08, + "grad_norm": 0.17634704006158938, + "learning_rate": 9.933751819702948e-07, + "loss": 0.0823, + "step": 1260 + }, + { + "epoch": 0.08, + "grad_norm": 0.5371146377850167, + "learning_rate": 9.93358415680165e-07, + "loss": 0.194, + "step": 1261 + }, + { + "epoch": 0.08, + "grad_norm": 0.6832922633310711, + "learning_rate": 9.933416283423736e-07, + "loss": 0.154, + "step": 1262 + }, + { + "epoch": 0.08, + "grad_norm": 0.5445353980456783, + "learning_rate": 9.933248199576364e-07, + "loss": 0.1312, + "step": 1263 + }, + { + "epoch": 0.08, + "grad_norm": 0.7814843978025754, + "learning_rate": 9.933079905266707e-07, + "loss": 0.1132, + "step": 1264 + }, + { + "epoch": 0.08, + "grad_norm": 0.952130812885761, + "learning_rate": 9.932911400501947e-07, + "loss": 0.1473, + "step": 1265 + }, + { + "epoch": 0.08, + "grad_norm": 0.56061281154319, + "learning_rate": 9.932742685289269e-07, + "loss": 0.2857, + "step": 1266 + }, + { + "epoch": 0.08, + "grad_norm": 0.8866076182489778, + "learning_rate": 9.93257375963587e-07, + "loss": 0.1433, + "step": 1267 + }, + { + "epoch": 0.08, + "grad_norm": 0.21904820241433778, + "learning_rate": 9.93240462354896e-07, + "loss": 0.0023, + "step": 1268 + }, + { + "epoch": 0.08, + "grad_norm": 0.7858922196798822, + "learning_rate": 9.932235277035754e-07, + "loss": 0.291, + "step": 1269 + }, + { + "epoch": 0.08, + "grad_norm": 0.7868040611739603, + "learning_rate": 9.932065720103476e-07, + "loss": 0.4161, + "step": 1270 + }, + { + "epoch": 0.08, + "grad_norm": 0.657228617660802, + "learning_rate": 9.931895952759357e-07, + "loss": 0.1842, + "step": 1271 + }, + { + "epoch": 0.08, + "grad_norm": 0.5977616164189764, + "learning_rate": 9.931725975010646e-07, + "loss": 0.1011, + "step": 1272 + }, + { + "epoch": 0.08, + "grad_norm": 0.6084631591727283, + "learning_rate": 9.931555786864589e-07, + "loss": 0.2752, + "step": 1273 + }, + { + "epoch": 0.08, + "grad_norm": 0.5876481628805643, + "learning_rate": 9.931385388328448e-07, + "loss": 0.5346, + "step": 1274 + }, + { + "epoch": 0.08, + "grad_norm": 0.35573169640134594, + "learning_rate": 9.931214779409494e-07, + "loss": 0.2471, + "step": 1275 + }, + { + "epoch": 0.08, + "grad_norm": 0.5684956930967539, + "learning_rate": 9.931043960115005e-07, + "loss": 0.2068, + "step": 1276 + }, + { + "epoch": 0.08, + "grad_norm": 0.5446491705101372, + "learning_rate": 9.930872930452267e-07, + "loss": 0.0741, + "step": 1277 + }, + { + "epoch": 0.08, + "grad_norm": 0.4690054081002771, + "learning_rate": 9.930701690428577e-07, + "loss": 0.2964, + "step": 1278 + }, + { + "epoch": 0.08, + "grad_norm": 0.21813429221611408, + "learning_rate": 9.930530240051243e-07, + "loss": 0.0949, + "step": 1279 + }, + { + "epoch": 0.08, + "grad_norm": 0.8295842997198223, + "learning_rate": 9.930358579327576e-07, + "loss": 0.4008, + "step": 1280 + }, + { + "epoch": 0.08, + "grad_norm": 1.498212880447362, + "learning_rate": 9.9301867082649e-07, + "loss": 0.2155, + "step": 1281 + }, + { + "epoch": 0.08, + "grad_norm": 0.9418525856185737, + "learning_rate": 9.93001462687055e-07, + "loss": 0.1317, + "step": 1282 + }, + { + "epoch": 0.08, + "grad_norm": 0.16956843695330423, + "learning_rate": 9.929842335151863e-07, + "loss": 0.0192, + "step": 1283 + }, + { + "epoch": 0.08, + "grad_norm": 0.45637433018782536, + "learning_rate": 9.929669833116194e-07, + "loss": 0.2064, + "step": 1284 + }, + { + "epoch": 0.08, + "grad_norm": 0.25788325094536546, + "learning_rate": 9.9294971207709e-07, + "loss": 0.0865, + "step": 1285 + }, + { + "epoch": 0.08, + "grad_norm": 0.8106476754548811, + "learning_rate": 9.929324198123347e-07, + "loss": 0.1804, + "step": 1286 + }, + { + "epoch": 0.08, + "grad_norm": 0.48021474596666486, + "learning_rate": 9.929151065180915e-07, + "loss": 0.3005, + "step": 1287 + }, + { + "epoch": 0.08, + "grad_norm": 0.29667317885857586, + "learning_rate": 9.928977721950992e-07, + "loss": 0.2073, + "step": 1288 + }, + { + "epoch": 0.08, + "grad_norm": 0.43856265730787586, + "learning_rate": 9.928804168440969e-07, + "loss": 0.1377, + "step": 1289 + }, + { + "epoch": 0.08, + "grad_norm": 1.0318322581898132, + "learning_rate": 9.928630404658254e-07, + "loss": 0.1567, + "step": 1290 + }, + { + "epoch": 0.08, + "grad_norm": 0.49896780895805287, + "learning_rate": 9.928456430610257e-07, + "loss": 0.1008, + "step": 1291 + }, + { + "epoch": 0.08, + "grad_norm": 0.4054032864642662, + "learning_rate": 9.9282822463044e-07, + "loss": 0.2169, + "step": 1292 + }, + { + "epoch": 0.08, + "grad_norm": 0.5500438098269558, + "learning_rate": 9.928107851748118e-07, + "loss": 0.2038, + "step": 1293 + }, + { + "epoch": 0.08, + "grad_norm": 0.5264338135127226, + "learning_rate": 9.927933246948846e-07, + "loss": 0.1568, + "step": 1294 + }, + { + "epoch": 0.08, + "grad_norm": 0.5709867696264319, + "learning_rate": 9.927758431914036e-07, + "loss": 0.2531, + "step": 1295 + }, + { + "epoch": 0.08, + "grad_norm": 0.3732442385885163, + "learning_rate": 9.927583406651145e-07, + "loss": 0.1685, + "step": 1296 + }, + { + "epoch": 0.08, + "grad_norm": 0.13061876615576834, + "learning_rate": 9.927408171167641e-07, + "loss": 0.0074, + "step": 1297 + }, + { + "epoch": 0.08, + "grad_norm": 0.5839412601840739, + "learning_rate": 9.927232725470998e-07, + "loss": 0.4051, + "step": 1298 + }, + { + "epoch": 0.08, + "grad_norm": 0.41830401854019367, + "learning_rate": 9.927057069568702e-07, + "loss": 0.134, + "step": 1299 + }, + { + "epoch": 0.08, + "grad_norm": 0.9675478024352526, + "learning_rate": 9.926881203468248e-07, + "loss": 0.1277, + "step": 1300 + }, + { + "epoch": 0.08, + "grad_norm": 0.6413055047498372, + "learning_rate": 9.926705127177137e-07, + "loss": 0.1473, + "step": 1301 + }, + { + "epoch": 0.08, + "grad_norm": 0.5766913693417655, + "learning_rate": 9.92652884070288e-07, + "loss": 0.0879, + "step": 1302 + }, + { + "epoch": 0.08, + "grad_norm": 0.6926279196508893, + "learning_rate": 9.926352344053001e-07, + "loss": 0.0861, + "step": 1303 + }, + { + "epoch": 0.08, + "grad_norm": 0.8660050060857681, + "learning_rate": 9.926175637235026e-07, + "loss": 0.1214, + "step": 1304 + }, + { + "epoch": 0.08, + "grad_norm": 0.6410176098271414, + "learning_rate": 9.925998720256496e-07, + "loss": 0.0194, + "step": 1305 + }, + { + "epoch": 0.08, + "grad_norm": 0.36678590115652, + "learning_rate": 9.925821593124959e-07, + "loss": 0.0723, + "step": 1306 + }, + { + "epoch": 0.08, + "grad_norm": 0.64978428647516, + "learning_rate": 9.92564425584797e-07, + "loss": 0.2001, + "step": 1307 + }, + { + "epoch": 0.08, + "grad_norm": 0.5857836993663871, + "learning_rate": 9.925466708433097e-07, + "loss": 0.1756, + "step": 1308 + }, + { + "epoch": 0.08, + "grad_norm": 0.6154155311141734, + "learning_rate": 9.925288950887912e-07, + "loss": 0.147, + "step": 1309 + }, + { + "epoch": 0.08, + "grad_norm": 0.2220469057151436, + "learning_rate": 9.925110983219998e-07, + "loss": 0.0138, + "step": 1310 + }, + { + "epoch": 0.08, + "grad_norm": 0.44819777842306247, + "learning_rate": 9.924932805436948e-07, + "loss": 0.2058, + "step": 1311 + }, + { + "epoch": 0.08, + "grad_norm": 0.9549652290680253, + "learning_rate": 9.924754417546367e-07, + "loss": 0.123, + "step": 1312 + }, + { + "epoch": 0.08, + "grad_norm": 0.6976385869512235, + "learning_rate": 9.924575819555861e-07, + "loss": 0.2911, + "step": 1313 + }, + { + "epoch": 0.08, + "grad_norm": 0.5428886474113841, + "learning_rate": 9.92439701147305e-07, + "loss": 0.4047, + "step": 1314 + }, + { + "epoch": 0.08, + "grad_norm": 0.522051202353723, + "learning_rate": 9.924217993305563e-07, + "loss": 0.3151, + "step": 1315 + }, + { + "epoch": 0.08, + "grad_norm": 0.4459646433200106, + "learning_rate": 9.92403876506104e-07, + "loss": 0.1648, + "step": 1316 + }, + { + "epoch": 0.08, + "grad_norm": 0.5060715812929723, + "learning_rate": 9.923859326747124e-07, + "loss": 0.2347, + "step": 1317 + }, + { + "epoch": 0.08, + "grad_norm": 0.31687712905787674, + "learning_rate": 9.92367967837147e-07, + "loss": 0.0478, + "step": 1318 + }, + { + "epoch": 0.08, + "grad_norm": 0.6603126960062933, + "learning_rate": 9.923499819941744e-07, + "loss": 0.2607, + "step": 1319 + }, + { + "epoch": 0.08, + "grad_norm": 0.6912679532061217, + "learning_rate": 9.923319751465615e-07, + "loss": 0.2336, + "step": 1320 + }, + { + "epoch": 0.08, + "grad_norm": 0.4936629031027776, + "learning_rate": 9.923139472950772e-07, + "loss": 0.0684, + "step": 1321 + }, + { + "epoch": 0.08, + "grad_norm": 0.4074506017586614, + "learning_rate": 9.922958984404901e-07, + "loss": 0.1365, + "step": 1322 + }, + { + "epoch": 0.08, + "grad_norm": 0.06931220971631999, + "learning_rate": 9.922778285835704e-07, + "loss": 0.0039, + "step": 1323 + }, + { + "epoch": 0.08, + "grad_norm": 1.0584991047923242, + "learning_rate": 9.92259737725089e-07, + "loss": 0.1751, + "step": 1324 + }, + { + "epoch": 0.08, + "grad_norm": 0.42512537728409516, + "learning_rate": 9.922416258658173e-07, + "loss": 0.0221, + "step": 1325 + }, + { + "epoch": 0.08, + "grad_norm": 0.7174700906380793, + "learning_rate": 9.922234930065285e-07, + "loss": 0.1311, + "step": 1326 + }, + { + "epoch": 0.08, + "grad_norm": 0.5040507640209686, + "learning_rate": 9.922053391479961e-07, + "loss": 0.1044, + "step": 1327 + }, + { + "epoch": 0.08, + "grad_norm": 0.47708684920990274, + "learning_rate": 9.921871642909944e-07, + "loss": 0.097, + "step": 1328 + }, + { + "epoch": 0.08, + "grad_norm": 0.505050356460811, + "learning_rate": 9.92168968436299e-07, + "loss": 0.1123, + "step": 1329 + }, + { + "epoch": 0.08, + "grad_norm": 0.5969726272276742, + "learning_rate": 9.921507515846856e-07, + "loss": 0.1421, + "step": 1330 + }, + { + "epoch": 0.08, + "grad_norm": 0.606016186383965, + "learning_rate": 9.92132513736932e-07, + "loss": 0.2951, + "step": 1331 + }, + { + "epoch": 0.08, + "grad_norm": 0.5882241937129672, + "learning_rate": 9.92114254893816e-07, + "loss": 0.2291, + "step": 1332 + }, + { + "epoch": 0.09, + "grad_norm": 0.26234108070725803, + "learning_rate": 9.920959750561169e-07, + "loss": 0.069, + "step": 1333 + }, + { + "epoch": 0.09, + "grad_norm": 0.26844022808681833, + "learning_rate": 9.920776742246142e-07, + "loss": 0.0168, + "step": 1334 + }, + { + "epoch": 0.09, + "grad_norm": 0.8127077765743058, + "learning_rate": 9.920593524000885e-07, + "loss": 0.4992, + "step": 1335 + }, + { + "epoch": 0.09, + "grad_norm": 0.4332594733357405, + "learning_rate": 9.920410095833217e-07, + "loss": 0.3344, + "step": 1336 + }, + { + "epoch": 0.09, + "grad_norm": 0.15581514468417898, + "learning_rate": 9.920226457750964e-07, + "loss": 0.0103, + "step": 1337 + }, + { + "epoch": 0.09, + "grad_norm": 0.5601738146747562, + "learning_rate": 9.920042609761961e-07, + "loss": 0.3145, + "step": 1338 + }, + { + "epoch": 0.09, + "grad_norm": 0.7410327563077606, + "learning_rate": 9.919858551874048e-07, + "loss": 0.4673, + "step": 1339 + }, + { + "epoch": 0.09, + "grad_norm": 0.05010858722366252, + "learning_rate": 9.919674284095078e-07, + "loss": 0.0021, + "step": 1340 + }, + { + "epoch": 0.09, + "grad_norm": 0.28155993225973236, + "learning_rate": 9.919489806432914e-07, + "loss": 0.1986, + "step": 1341 + }, + { + "epoch": 0.09, + "grad_norm": 0.9192755585612271, + "learning_rate": 9.919305118895424e-07, + "loss": 0.4798, + "step": 1342 + }, + { + "epoch": 0.09, + "grad_norm": 1.0614597046619378, + "learning_rate": 9.919120221490492e-07, + "loss": 0.2899, + "step": 1343 + }, + { + "epoch": 0.09, + "grad_norm": 0.16533559228097142, + "learning_rate": 9.918935114226e-07, + "loss": 0.0089, + "step": 1344 + }, + { + "epoch": 0.09, + "grad_norm": 0.8236466378681705, + "learning_rate": 9.918749797109848e-07, + "loss": 0.2607, + "step": 1345 + }, + { + "epoch": 0.09, + "grad_norm": 0.26840855921958284, + "learning_rate": 9.918564270149942e-07, + "loss": 0.1592, + "step": 1346 + }, + { + "epoch": 0.09, + "grad_norm": 0.3758093577499254, + "learning_rate": 9.918378533354197e-07, + "loss": 0.115, + "step": 1347 + }, + { + "epoch": 0.09, + "grad_norm": 0.2804836172746173, + "learning_rate": 9.918192586730538e-07, + "loss": 0.2261, + "step": 1348 + }, + { + "epoch": 0.09, + "grad_norm": 0.8855784879112487, + "learning_rate": 9.918006430286893e-07, + "loss": 0.1226, + "step": 1349 + }, + { + "epoch": 0.09, + "grad_norm": 1.2574103117154745, + "learning_rate": 9.917820064031211e-07, + "loss": 0.1307, + "step": 1350 + }, + { + "epoch": 0.09, + "grad_norm": 0.6820728818513073, + "learning_rate": 9.917633487971438e-07, + "loss": 0.255, + "step": 1351 + }, + { + "epoch": 0.09, + "grad_norm": 0.4839934500545702, + "learning_rate": 9.917446702115533e-07, + "loss": 0.218, + "step": 1352 + }, + { + "epoch": 0.09, + "grad_norm": 0.9974031137381807, + "learning_rate": 9.917259706471467e-07, + "loss": 0.1613, + "step": 1353 + }, + { + "epoch": 0.09, + "grad_norm": 0.6288966585844071, + "learning_rate": 9.917072501047217e-07, + "loss": 0.3266, + "step": 1354 + }, + { + "epoch": 0.09, + "grad_norm": 0.5899033809330453, + "learning_rate": 9.91688508585077e-07, + "loss": 0.0135, + "step": 1355 + }, + { + "epoch": 0.09, + "grad_norm": 0.48974260277611126, + "learning_rate": 9.91669746089012e-07, + "loss": 0.2727, + "step": 1356 + }, + { + "epoch": 0.09, + "grad_norm": 0.892130586122228, + "learning_rate": 9.916509626173275e-07, + "loss": 0.2681, + "step": 1357 + }, + { + "epoch": 0.09, + "grad_norm": 0.5759666173641834, + "learning_rate": 9.916321581708245e-07, + "loss": 0.2102, + "step": 1358 + }, + { + "epoch": 0.09, + "grad_norm": 0.1415334650198193, + "learning_rate": 9.916133327503052e-07, + "loss": 0.0707, + "step": 1359 + }, + { + "epoch": 0.09, + "grad_norm": 0.2950143180306834, + "learning_rate": 9.915944863565728e-07, + "loss": 0.0184, + "step": 1360 + }, + { + "epoch": 0.09, + "grad_norm": 0.622270261736345, + "learning_rate": 9.915756189904316e-07, + "loss": 0.2563, + "step": 1361 + }, + { + "epoch": 0.09, + "grad_norm": 0.6687547593629255, + "learning_rate": 9.915567306526862e-07, + "loss": 0.2629, + "step": 1362 + }, + { + "epoch": 0.09, + "grad_norm": 1.813456436767718, + "learning_rate": 9.915378213441425e-07, + "loss": 0.059, + "step": 1363 + }, + { + "epoch": 0.09, + "grad_norm": 0.8296119799260199, + "learning_rate": 9.915188910656073e-07, + "loss": 0.1444, + "step": 1364 + }, + { + "epoch": 0.09, + "grad_norm": 0.7155707282550502, + "learning_rate": 9.91499939817888e-07, + "loss": 0.0603, + "step": 1365 + }, + { + "epoch": 0.09, + "grad_norm": 0.4621222371647438, + "learning_rate": 9.914809676017935e-07, + "loss": 0.1742, + "step": 1366 + }, + { + "epoch": 0.09, + "grad_norm": 1.1300100528341723, + "learning_rate": 9.914619744181326e-07, + "loss": 0.5811, + "step": 1367 + }, + { + "epoch": 0.09, + "grad_norm": 0.9614231224301606, + "learning_rate": 9.914429602677161e-07, + "loss": 0.3166, + "step": 1368 + }, + { + "epoch": 0.09, + "grad_norm": 1.0794458569950924, + "learning_rate": 9.914239251513549e-07, + "loss": 0.18, + "step": 1369 + }, + { + "epoch": 0.09, + "grad_norm": 0.6467544649663229, + "learning_rate": 9.91404869069861e-07, + "loss": 0.2362, + "step": 1370 + }, + { + "epoch": 0.09, + "grad_norm": 0.6154057398537042, + "learning_rate": 9.91385792024048e-07, + "loss": 0.084, + "step": 1371 + }, + { + "epoch": 0.09, + "grad_norm": 0.5645728368622048, + "learning_rate": 9.913666940147289e-07, + "loss": 0.3736, + "step": 1372 + }, + { + "epoch": 0.09, + "grad_norm": 0.6206794190429861, + "learning_rate": 9.91347575042719e-07, + "loss": 0.1729, + "step": 1373 + }, + { + "epoch": 0.09, + "grad_norm": 0.46476261881865205, + "learning_rate": 9.913284351088338e-07, + "loss": 0.0415, + "step": 1374 + }, + { + "epoch": 0.09, + "grad_norm": 0.9571329913919955, + "learning_rate": 9.9130927421389e-07, + "loss": 0.4317, + "step": 1375 + }, + { + "epoch": 0.09, + "grad_norm": 0.5332665697299472, + "learning_rate": 9.912900923587047e-07, + "loss": 0.2812, + "step": 1376 + }, + { + "epoch": 0.09, + "grad_norm": 0.3770817447669728, + "learning_rate": 9.912708895440966e-07, + "loss": 0.0834, + "step": 1377 + }, + { + "epoch": 0.09, + "grad_norm": 0.6260119806099472, + "learning_rate": 9.912516657708847e-07, + "loss": 0.0474, + "step": 1378 + }, + { + "epoch": 0.09, + "grad_norm": 0.5076190706164107, + "learning_rate": 9.912324210398892e-07, + "loss": 0.2409, + "step": 1379 + }, + { + "epoch": 0.09, + "grad_norm": 0.5911689240300607, + "learning_rate": 9.91213155351931e-07, + "loss": 0.3276, + "step": 1380 + }, + { + "epoch": 0.09, + "grad_norm": 0.527706445220485, + "learning_rate": 9.911938687078323e-07, + "loss": 0.1643, + "step": 1381 + }, + { + "epoch": 0.09, + "grad_norm": 0.6533182877886919, + "learning_rate": 9.911745611084156e-07, + "loss": 0.1749, + "step": 1382 + }, + { + "epoch": 0.09, + "grad_norm": 1.2121348501912437, + "learning_rate": 9.91155232554505e-07, + "loss": 0.2173, + "step": 1383 + }, + { + "epoch": 0.09, + "grad_norm": 0.5489449013024819, + "learning_rate": 9.911358830469247e-07, + "loss": 0.1405, + "step": 1384 + }, + { + "epoch": 0.09, + "grad_norm": 0.4199252225993026, + "learning_rate": 9.911165125865001e-07, + "loss": 0.2352, + "step": 1385 + }, + { + "epoch": 0.09, + "grad_norm": 0.5663838266086673, + "learning_rate": 9.91097121174058e-07, + "loss": 0.208, + "step": 1386 + }, + { + "epoch": 0.09, + "grad_norm": 0.41766510964406783, + "learning_rate": 9.910777088104256e-07, + "loss": 0.0624, + "step": 1387 + }, + { + "epoch": 0.09, + "grad_norm": 0.7570359673996523, + "learning_rate": 9.910582754964306e-07, + "loss": 0.3536, + "step": 1388 + }, + { + "epoch": 0.09, + "grad_norm": 0.8684024534790326, + "learning_rate": 9.910388212329027e-07, + "loss": 0.2251, + "step": 1389 + }, + { + "epoch": 0.09, + "grad_norm": 1.0843837871077118, + "learning_rate": 9.910193460206716e-07, + "loss": 0.163, + "step": 1390 + }, + { + "epoch": 0.09, + "grad_norm": 0.6591320256330364, + "learning_rate": 9.90999849860568e-07, + "loss": 0.4135, + "step": 1391 + }, + { + "epoch": 0.09, + "grad_norm": 0.38250931088260687, + "learning_rate": 9.909803327534239e-07, + "loss": 0.0429, + "step": 1392 + }, + { + "epoch": 0.09, + "grad_norm": 0.5092062891712076, + "learning_rate": 9.909607947000717e-07, + "loss": 0.2099, + "step": 1393 + }, + { + "epoch": 0.09, + "grad_norm": 0.5321071463340533, + "learning_rate": 9.90941235701345e-07, + "loss": 0.089, + "step": 1394 + }, + { + "epoch": 0.09, + "grad_norm": 0.5105319453775725, + "learning_rate": 9.909216557580784e-07, + "loss": 0.1912, + "step": 1395 + }, + { + "epoch": 0.09, + "grad_norm": 1.0240034951819497, + "learning_rate": 9.90902054871107e-07, + "loss": 0.2655, + "step": 1396 + }, + { + "epoch": 0.09, + "grad_norm": 0.35749500684136865, + "learning_rate": 9.90882433041267e-07, + "loss": 0.0129, + "step": 1397 + }, + { + "epoch": 0.09, + "grad_norm": 0.7430532137923848, + "learning_rate": 9.908627902693957e-07, + "loss": 0.1514, + "step": 1398 + }, + { + "epoch": 0.09, + "grad_norm": 0.4790887423592187, + "learning_rate": 9.908431265563313e-07, + "loss": 0.1774, + "step": 1399 + }, + { + "epoch": 0.09, + "grad_norm": 0.8233676472290283, + "learning_rate": 9.90823441902912e-07, + "loss": 0.1219, + "step": 1400 + }, + { + "epoch": 0.09, + "grad_norm": 0.5939786949019252, + "learning_rate": 9.908037363099782e-07, + "loss": 0.0689, + "step": 1401 + }, + { + "epoch": 0.09, + "grad_norm": 0.5939627617523665, + "learning_rate": 9.907840097783704e-07, + "loss": 0.1783, + "step": 1402 + }, + { + "epoch": 0.09, + "grad_norm": 0.49141165452616525, + "learning_rate": 9.9076426230893e-07, + "loss": 0.1545, + "step": 1403 + }, + { + "epoch": 0.09, + "grad_norm": 0.7278504336264148, + "learning_rate": 9.907444939024997e-07, + "loss": 0.2825, + "step": 1404 + }, + { + "epoch": 0.09, + "grad_norm": 0.28442755361920813, + "learning_rate": 9.907247045599226e-07, + "loss": 0.1271, + "step": 1405 + }, + { + "epoch": 0.09, + "grad_norm": 0.45846238394230227, + "learning_rate": 9.907048942820432e-07, + "loss": 0.0251, + "step": 1406 + }, + { + "epoch": 0.09, + "grad_norm": 0.5143526782461665, + "learning_rate": 9.906850630697066e-07, + "loss": 0.2793, + "step": 1407 + }, + { + "epoch": 0.09, + "grad_norm": 0.6207469331297467, + "learning_rate": 9.906652109237588e-07, + "loss": 0.0464, + "step": 1408 + }, + { + "epoch": 0.09, + "grad_norm": 0.874109918494243, + "learning_rate": 9.90645337845047e-07, + "loss": 0.2319, + "step": 1409 + }, + { + "epoch": 0.09, + "grad_norm": 0.4302973166798125, + "learning_rate": 9.906254438344185e-07, + "loss": 0.2829, + "step": 1410 + }, + { + "epoch": 0.09, + "grad_norm": 0.37521259962395315, + "learning_rate": 9.906055288927221e-07, + "loss": 0.1903, + "step": 1411 + }, + { + "epoch": 0.09, + "grad_norm": 0.9284162240136209, + "learning_rate": 9.90585593020808e-07, + "loss": 0.359, + "step": 1412 + }, + { + "epoch": 0.09, + "grad_norm": 0.7100888419280851, + "learning_rate": 9.905656362195261e-07, + "loss": 0.2603, + "step": 1413 + }, + { + "epoch": 0.09, + "grad_norm": 0.34959608011317916, + "learning_rate": 9.90545658489728e-07, + "loss": 0.2394, + "step": 1414 + }, + { + "epoch": 0.09, + "grad_norm": 0.4230948498712605, + "learning_rate": 9.90525659832266e-07, + "loss": 0.2377, + "step": 1415 + }, + { + "epoch": 0.09, + "grad_norm": 0.48756417643515504, + "learning_rate": 9.905056402479933e-07, + "loss": 0.142, + "step": 1416 + }, + { + "epoch": 0.09, + "grad_norm": 0.27732749330665324, + "learning_rate": 9.904855997377638e-07, + "loss": 0.1023, + "step": 1417 + }, + { + "epoch": 0.09, + "grad_norm": 0.21866130982516352, + "learning_rate": 9.904655383024327e-07, + "loss": 0.0283, + "step": 1418 + }, + { + "epoch": 0.09, + "grad_norm": 0.4805676435876558, + "learning_rate": 9.90445455942856e-07, + "loss": 0.3483, + "step": 1419 + }, + { + "epoch": 0.09, + "grad_norm": 0.6717010660990245, + "learning_rate": 9.9042535265989e-07, + "loss": 0.0295, + "step": 1420 + }, + { + "epoch": 0.09, + "grad_norm": 0.42242652013031196, + "learning_rate": 9.904052284543925e-07, + "loss": 0.3359, + "step": 1421 + }, + { + "epoch": 0.09, + "grad_norm": 0.9660930421018699, + "learning_rate": 9.903850833272222e-07, + "loss": 0.4354, + "step": 1422 + }, + { + "epoch": 0.09, + "grad_norm": 0.4133245581699803, + "learning_rate": 9.903649172792386e-07, + "loss": 0.1681, + "step": 1423 + }, + { + "epoch": 0.09, + "grad_norm": 0.8389060031095329, + "learning_rate": 9.903447303113017e-07, + "loss": 0.3308, + "step": 1424 + }, + { + "epoch": 0.09, + "grad_norm": 0.2505562843920614, + "learning_rate": 9.90324522424273e-07, + "loss": 0.1088, + "step": 1425 + }, + { + "epoch": 0.09, + "grad_norm": 0.46521286222480235, + "learning_rate": 9.903042936190145e-07, + "loss": 0.1423, + "step": 1426 + }, + { + "epoch": 0.09, + "grad_norm": 0.5018964774218596, + "learning_rate": 9.90284043896389e-07, + "loss": 0.2328, + "step": 1427 + }, + { + "epoch": 0.09, + "grad_norm": 0.5915417271474269, + "learning_rate": 9.90263773257261e-07, + "loss": 0.4089, + "step": 1428 + }, + { + "epoch": 0.09, + "grad_norm": 0.524245641467094, + "learning_rate": 9.902434817024945e-07, + "loss": 0.3119, + "step": 1429 + }, + { + "epoch": 0.09, + "grad_norm": 1.4189219922182432, + "learning_rate": 9.902231692329556e-07, + "loss": 0.1085, + "step": 1430 + }, + { + "epoch": 0.09, + "grad_norm": 0.276746429325565, + "learning_rate": 9.90202835849511e-07, + "loss": 0.3031, + "step": 1431 + }, + { + "epoch": 0.09, + "grad_norm": 0.8251646253861777, + "learning_rate": 9.901824815530277e-07, + "loss": 0.1873, + "step": 1432 + }, + { + "epoch": 0.09, + "grad_norm": 0.5937526854920018, + "learning_rate": 9.901621063443746e-07, + "loss": 0.1396, + "step": 1433 + }, + { + "epoch": 0.09, + "grad_norm": 0.4455253693138965, + "learning_rate": 9.901417102244207e-07, + "loss": 0.2131, + "step": 1434 + }, + { + "epoch": 0.09, + "grad_norm": 0.582796698479084, + "learning_rate": 9.90121293194036e-07, + "loss": 0.5477, + "step": 1435 + }, + { + "epoch": 0.09, + "grad_norm": 0.4848414087429192, + "learning_rate": 9.901008552540918e-07, + "loss": 0.1672, + "step": 1436 + }, + { + "epoch": 0.09, + "grad_norm": 0.28593497405984064, + "learning_rate": 9.900803964054597e-07, + "loss": 0.0338, + "step": 1437 + }, + { + "epoch": 0.09, + "grad_norm": 0.6782419152092031, + "learning_rate": 9.900599166490129e-07, + "loss": 0.2368, + "step": 1438 + }, + { + "epoch": 0.09, + "grad_norm": 0.5081115265738643, + "learning_rate": 9.900394159856249e-07, + "loss": 0.3658, + "step": 1439 + }, + { + "epoch": 0.09, + "grad_norm": 0.9275593554404851, + "learning_rate": 9.900188944161701e-07, + "loss": 0.2623, + "step": 1440 + }, + { + "epoch": 0.09, + "grad_norm": 0.400317797796568, + "learning_rate": 9.899983519415244e-07, + "loss": 0.1618, + "step": 1441 + }, + { + "epoch": 0.09, + "grad_norm": 0.8491504770105957, + "learning_rate": 9.89977788562564e-07, + "loss": 0.2061, + "step": 1442 + }, + { + "epoch": 0.09, + "grad_norm": 0.6179973314813763, + "learning_rate": 9.899572042801661e-07, + "loss": 0.1308, + "step": 1443 + }, + { + "epoch": 0.09, + "grad_norm": 0.6112656200925553, + "learning_rate": 9.89936599095209e-07, + "loss": 0.0589, + "step": 1444 + }, + { + "epoch": 0.09, + "grad_norm": 0.48155520352962544, + "learning_rate": 9.899159730085714e-07, + "loss": 0.197, + "step": 1445 + }, + { + "epoch": 0.09, + "grad_norm": 1.2113338467664378, + "learning_rate": 9.898953260211337e-07, + "loss": 0.1121, + "step": 1446 + }, + { + "epoch": 0.09, + "grad_norm": 0.5039106787821244, + "learning_rate": 9.898746581337766e-07, + "loss": 0.2074, + "step": 1447 + }, + { + "epoch": 0.09, + "grad_norm": 0.4526936579946449, + "learning_rate": 9.89853969347382e-07, + "loss": 0.1398, + "step": 1448 + }, + { + "epoch": 0.09, + "grad_norm": 0.5788236603227206, + "learning_rate": 9.898332596628322e-07, + "loss": 0.1798, + "step": 1449 + }, + { + "epoch": 0.09, + "grad_norm": 0.20559894082176153, + "learning_rate": 9.898125290810107e-07, + "loss": 0.1567, + "step": 1450 + }, + { + "epoch": 0.09, + "grad_norm": 0.6075111709699883, + "learning_rate": 9.897917776028022e-07, + "loss": 0.2502, + "step": 1451 + }, + { + "epoch": 0.09, + "grad_norm": 1.31621268799203, + "learning_rate": 9.89771005229092e-07, + "loss": 0.0991, + "step": 1452 + }, + { + "epoch": 0.09, + "grad_norm": 0.27102036808554036, + "learning_rate": 9.89750211960766e-07, + "loss": 0.1116, + "step": 1453 + }, + { + "epoch": 0.09, + "grad_norm": 0.19562214729796376, + "learning_rate": 9.897293977987112e-07, + "loss": 0.1442, + "step": 1454 + }, + { + "epoch": 0.09, + "grad_norm": 0.2897769074366858, + "learning_rate": 9.897085627438162e-07, + "loss": 0.1174, + "step": 1455 + }, + { + "epoch": 0.09, + "grad_norm": 0.9699879807136473, + "learning_rate": 9.896877067969694e-07, + "loss": 0.3481, + "step": 1456 + }, + { + "epoch": 0.09, + "grad_norm": 0.4334282701949201, + "learning_rate": 9.896668299590606e-07, + "loss": 0.1089, + "step": 1457 + }, + { + "epoch": 0.09, + "grad_norm": 0.263748101801689, + "learning_rate": 9.896459322309802e-07, + "loss": 0.0101, + "step": 1458 + }, + { + "epoch": 0.09, + "grad_norm": 0.5811943469647456, + "learning_rate": 9.896250136136203e-07, + "loss": 0.0801, + "step": 1459 + }, + { + "epoch": 0.09, + "grad_norm": 0.8318265996009885, + "learning_rate": 9.89604074107873e-07, + "loss": 0.1777, + "step": 1460 + }, + { + "epoch": 0.09, + "grad_norm": 0.2212493744552875, + "learning_rate": 9.895831137146318e-07, + "loss": 0.0888, + "step": 1461 + }, + { + "epoch": 0.09, + "grad_norm": 0.26580603276275505, + "learning_rate": 9.895621324347908e-07, + "loss": 0.178, + "step": 1462 + }, + { + "epoch": 0.09, + "grad_norm": 0.277147620795641, + "learning_rate": 9.895411302692448e-07, + "loss": 0.051, + "step": 1463 + }, + { + "epoch": 0.09, + "grad_norm": 0.5628289413299168, + "learning_rate": 9.895201072188903e-07, + "loss": 0.3389, + "step": 1464 + }, + { + "epoch": 0.09, + "grad_norm": 0.6444211883774726, + "learning_rate": 9.89499063284624e-07, + "loss": 0.1766, + "step": 1465 + }, + { + "epoch": 0.09, + "grad_norm": 0.4995024859452174, + "learning_rate": 9.894779984673433e-07, + "loss": 0.266, + "step": 1466 + }, + { + "epoch": 0.09, + "grad_norm": 0.5980645482596011, + "learning_rate": 9.894569127679476e-07, + "loss": 0.3054, + "step": 1467 + }, + { + "epoch": 0.09, + "grad_norm": 0.4433154938139935, + "learning_rate": 9.894358061873358e-07, + "loss": 0.0984, + "step": 1468 + }, + { + "epoch": 0.09, + "grad_norm": 0.2491109776587946, + "learning_rate": 9.894146787264088e-07, + "loss": 0.0295, + "step": 1469 + }, + { + "epoch": 0.09, + "grad_norm": 0.36524548467967133, + "learning_rate": 9.893935303860677e-07, + "loss": 0.0157, + "step": 1470 + }, + { + "epoch": 0.09, + "grad_norm": 0.3223149265797452, + "learning_rate": 9.893723611672147e-07, + "loss": 0.4159, + "step": 1471 + }, + { + "epoch": 0.09, + "grad_norm": 0.5802469108703582, + "learning_rate": 9.89351171070753e-07, + "loss": 0.1877, + "step": 1472 + }, + { + "epoch": 0.09, + "grad_norm": 0.3402492212622589, + "learning_rate": 9.89329960097587e-07, + "loss": 0.1217, + "step": 1473 + }, + { + "epoch": 0.09, + "grad_norm": 0.7272394862500775, + "learning_rate": 9.893087282486208e-07, + "loss": 0.2007, + "step": 1474 + }, + { + "epoch": 0.09, + "grad_norm": 0.1860676868954452, + "learning_rate": 9.892874755247608e-07, + "loss": 0.1, + "step": 1475 + }, + { + "epoch": 0.09, + "grad_norm": 0.533114804369231, + "learning_rate": 9.892662019269136e-07, + "loss": 0.2238, + "step": 1476 + }, + { + "epoch": 0.09, + "grad_norm": 0.6122280427665033, + "learning_rate": 9.892449074559864e-07, + "loss": 0.039, + "step": 1477 + }, + { + "epoch": 0.09, + "grad_norm": 0.6477653508701525, + "learning_rate": 9.892235921128881e-07, + "loss": 0.2751, + "step": 1478 + }, + { + "epoch": 0.09, + "grad_norm": 0.6317612010148661, + "learning_rate": 9.89202255898528e-07, + "loss": 0.2251, + "step": 1479 + }, + { + "epoch": 0.09, + "grad_norm": 0.47801365439872195, + "learning_rate": 9.89180898813816e-07, + "loss": 0.1295, + "step": 1480 + }, + { + "epoch": 0.09, + "grad_norm": 0.3439080532007902, + "learning_rate": 9.891595208596638e-07, + "loss": 0.161, + "step": 1481 + }, + { + "epoch": 0.09, + "grad_norm": 0.2094726595885454, + "learning_rate": 9.891381220369827e-07, + "loss": 0.0166, + "step": 1482 + }, + { + "epoch": 0.09, + "grad_norm": 0.4794332543917334, + "learning_rate": 9.891167023466864e-07, + "loss": 0.1335, + "step": 1483 + }, + { + "epoch": 0.09, + "grad_norm": 0.6044568815496913, + "learning_rate": 9.890952617896882e-07, + "loss": 0.3535, + "step": 1484 + }, + { + "epoch": 0.09, + "grad_norm": 0.36745720152941697, + "learning_rate": 9.890738003669027e-07, + "loss": 0.0173, + "step": 1485 + }, + { + "epoch": 0.09, + "grad_norm": 0.7129460235783216, + "learning_rate": 9.89052318079246e-07, + "loss": 0.0579, + "step": 1486 + }, + { + "epoch": 0.09, + "grad_norm": 1.0361603130817678, + "learning_rate": 9.890308149276342e-07, + "loss": 0.1412, + "step": 1487 + }, + { + "epoch": 0.09, + "grad_norm": 0.18823015993972705, + "learning_rate": 9.890092909129848e-07, + "loss": 0.0465, + "step": 1488 + }, + { + "epoch": 0.09, + "grad_norm": 1.3016928228509086, + "learning_rate": 9.88987746036216e-07, + "loss": 0.1725, + "step": 1489 + }, + { + "epoch": 0.1, + "grad_norm": 0.3818409880923631, + "learning_rate": 9.889661802982468e-07, + "loss": 0.251, + "step": 1490 + }, + { + "epoch": 0.1, + "grad_norm": 0.3649233229388234, + "learning_rate": 9.889445936999976e-07, + "loss": 0.2045, + "step": 1491 + }, + { + "epoch": 0.1, + "grad_norm": 0.9295062256605674, + "learning_rate": 9.889229862423892e-07, + "loss": 0.1487, + "step": 1492 + }, + { + "epoch": 0.1, + "grad_norm": 0.6602399601320765, + "learning_rate": 9.889013579263433e-07, + "loss": 0.2343, + "step": 1493 + }, + { + "epoch": 0.1, + "grad_norm": 0.6796608325296203, + "learning_rate": 9.888797087527826e-07, + "loss": 0.2295, + "step": 1494 + }, + { + "epoch": 0.1, + "grad_norm": 0.3587957490130416, + "learning_rate": 9.888580387226307e-07, + "loss": 0.0863, + "step": 1495 + }, + { + "epoch": 0.1, + "grad_norm": 0.5379181859428669, + "learning_rate": 9.88836347836812e-07, + "loss": 0.2108, + "step": 1496 + }, + { + "epoch": 0.1, + "grad_norm": 1.2348044214295755, + "learning_rate": 9.88814636096252e-07, + "loss": 0.3544, + "step": 1497 + }, + { + "epoch": 0.1, + "grad_norm": 0.294621699605234, + "learning_rate": 9.887929035018773e-07, + "loss": 0.1577, + "step": 1498 + }, + { + "epoch": 0.1, + "grad_norm": 1.1004985743456288, + "learning_rate": 9.887711500546147e-07, + "loss": 0.1324, + "step": 1499 + }, + { + "epoch": 0.1, + "grad_norm": 0.44690192051575695, + "learning_rate": 9.887493757553923e-07, + "loss": 0.3278, + "step": 1500 + }, + { + "epoch": 0.1, + "grad_norm": 0.7855658632665525, + "learning_rate": 9.887275806051388e-07, + "loss": 0.3881, + "step": 1501 + }, + { + "epoch": 0.1, + "grad_norm": 0.9292787724738459, + "learning_rate": 9.887057646047846e-07, + "loss": 0.2462, + "step": 1502 + }, + { + "epoch": 0.1, + "grad_norm": 0.712761817448678, + "learning_rate": 9.886839277552597e-07, + "loss": 0.0251, + "step": 1503 + }, + { + "epoch": 0.1, + "grad_norm": 0.5377398737419191, + "learning_rate": 9.886620700574962e-07, + "loss": 0.2777, + "step": 1504 + }, + { + "epoch": 0.1, + "grad_norm": 0.3413670328124251, + "learning_rate": 9.886401915124263e-07, + "loss": 0.1447, + "step": 1505 + }, + { + "epoch": 0.1, + "grad_norm": 0.5039823574572356, + "learning_rate": 9.886182921209839e-07, + "loss": 0.2739, + "step": 1506 + }, + { + "epoch": 0.1, + "grad_norm": 0.4076212630655461, + "learning_rate": 9.885963718841025e-07, + "loss": 0.2969, + "step": 1507 + }, + { + "epoch": 0.1, + "grad_norm": 0.44300072813030744, + "learning_rate": 9.88574430802718e-07, + "loss": 0.2196, + "step": 1508 + }, + { + "epoch": 0.1, + "grad_norm": 0.5676336561032818, + "learning_rate": 9.88552468877766e-07, + "loss": 0.2615, + "step": 1509 + }, + { + "epoch": 0.1, + "grad_norm": 0.93740836315491, + "learning_rate": 9.885304861101835e-07, + "loss": 0.2727, + "step": 1510 + }, + { + "epoch": 0.1, + "grad_norm": 0.8297037804825235, + "learning_rate": 9.885084825009084e-07, + "loss": 0.3854, + "step": 1511 + }, + { + "epoch": 0.1, + "grad_norm": 0.640175390536145, + "learning_rate": 9.884864580508795e-07, + "loss": 0.4684, + "step": 1512 + }, + { + "epoch": 0.1, + "grad_norm": 0.861467133913177, + "learning_rate": 9.884644127610365e-07, + "loss": 0.2164, + "step": 1513 + }, + { + "epoch": 0.1, + "grad_norm": 0.2962211416169949, + "learning_rate": 9.884423466323192e-07, + "loss": 0.1421, + "step": 1514 + }, + { + "epoch": 0.1, + "grad_norm": 0.22972637966336357, + "learning_rate": 9.8842025966567e-07, + "loss": 0.1644, + "step": 1515 + }, + { + "epoch": 0.1, + "grad_norm": 0.26510495004736484, + "learning_rate": 9.883981518620304e-07, + "loss": 0.104, + "step": 1516 + }, + { + "epoch": 0.1, + "grad_norm": 0.4411577210188472, + "learning_rate": 9.88376023222344e-07, + "loss": 0.0874, + "step": 1517 + }, + { + "epoch": 0.1, + "grad_norm": 0.4871797030131832, + "learning_rate": 9.883538737475544e-07, + "loss": 0.2601, + "step": 1518 + }, + { + "epoch": 0.1, + "grad_norm": 0.7531400614799489, + "learning_rate": 9.883317034386073e-07, + "loss": 0.3019, + "step": 1519 + }, + { + "epoch": 0.1, + "grad_norm": 0.7616916491544177, + "learning_rate": 9.883095122964477e-07, + "loss": 0.4258, + "step": 1520 + }, + { + "epoch": 0.1, + "grad_norm": 0.4315192948478817, + "learning_rate": 9.882873003220227e-07, + "loss": 0.2612, + "step": 1521 + }, + { + "epoch": 0.1, + "grad_norm": 0.33962815028522214, + "learning_rate": 9.8826506751628e-07, + "loss": 0.0532, + "step": 1522 + }, + { + "epoch": 0.1, + "grad_norm": 0.31640458172068525, + "learning_rate": 9.88242813880168e-07, + "loss": 0.2545, + "step": 1523 + }, + { + "epoch": 0.1, + "grad_norm": 0.24711460479640185, + "learning_rate": 9.88220539414636e-07, + "loss": 0.1543, + "step": 1524 + }, + { + "epoch": 0.1, + "grad_norm": 0.5478387891792209, + "learning_rate": 9.881982441206343e-07, + "loss": 0.4665, + "step": 1525 + }, + { + "epoch": 0.1, + "grad_norm": 1.097502839498695, + "learning_rate": 9.881759279991143e-07, + "loss": 0.0322, + "step": 1526 + }, + { + "epoch": 0.1, + "grad_norm": 0.3476195653312959, + "learning_rate": 9.881535910510276e-07, + "loss": 0.0366, + "step": 1527 + }, + { + "epoch": 0.1, + "grad_norm": 0.5333026791706978, + "learning_rate": 9.881312332773275e-07, + "loss": 0.4375, + "step": 1528 + }, + { + "epoch": 0.1, + "grad_norm": 0.3112252104360692, + "learning_rate": 9.881088546789677e-07, + "loss": 0.1351, + "step": 1529 + }, + { + "epoch": 0.1, + "grad_norm": 0.4399471696451268, + "learning_rate": 9.88086455256903e-07, + "loss": 0.1491, + "step": 1530 + }, + { + "epoch": 0.1, + "grad_norm": 0.6295850881492374, + "learning_rate": 9.88064035012089e-07, + "loss": 0.0689, + "step": 1531 + }, + { + "epoch": 0.1, + "grad_norm": 0.38561365987020313, + "learning_rate": 9.88041593945482e-07, + "loss": 0.2467, + "step": 1532 + }, + { + "epoch": 0.1, + "grad_norm": 0.5938699720510816, + "learning_rate": 9.880191320580396e-07, + "loss": 0.3254, + "step": 1533 + }, + { + "epoch": 0.1, + "grad_norm": 0.1697906908430658, + "learning_rate": 9.879966493507198e-07, + "loss": 0.0077, + "step": 1534 + }, + { + "epoch": 0.1, + "grad_norm": 0.3496812653264161, + "learning_rate": 9.879741458244822e-07, + "loss": 0.0746, + "step": 1535 + }, + { + "epoch": 0.1, + "grad_norm": 0.31029222651159144, + "learning_rate": 9.879516214802866e-07, + "loss": 0.216, + "step": 1536 + }, + { + "epoch": 0.1, + "grad_norm": 0.4859938049276853, + "learning_rate": 9.879290763190937e-07, + "loss": 0.15, + "step": 1537 + }, + { + "epoch": 0.1, + "grad_norm": 0.37417699629811346, + "learning_rate": 9.879065103418657e-07, + "loss": 0.0962, + "step": 1538 + }, + { + "epoch": 0.1, + "grad_norm": 0.5437353087596696, + "learning_rate": 9.87883923549565e-07, + "loss": 0.145, + "step": 1539 + }, + { + "epoch": 0.1, + "grad_norm": 0.8925268366823884, + "learning_rate": 9.878613159431554e-07, + "loss": 0.2945, + "step": 1540 + }, + { + "epoch": 0.1, + "grad_norm": 0.5877766008896336, + "learning_rate": 9.878386875236016e-07, + "loss": 0.17, + "step": 1541 + }, + { + "epoch": 0.1, + "grad_norm": 0.13553921863758533, + "learning_rate": 9.878160382918683e-07, + "loss": 0.0674, + "step": 1542 + }, + { + "epoch": 0.1, + "grad_norm": 0.614686376491471, + "learning_rate": 9.877933682489224e-07, + "loss": 0.154, + "step": 1543 + }, + { + "epoch": 0.1, + "grad_norm": 0.4224147413842552, + "learning_rate": 9.877706773957308e-07, + "loss": 0.2191, + "step": 1544 + }, + { + "epoch": 0.1, + "grad_norm": 0.6719867186441478, + "learning_rate": 9.877479657332617e-07, + "loss": 0.2382, + "step": 1545 + }, + { + "epoch": 0.1, + "grad_norm": 0.20322204392227114, + "learning_rate": 9.877252332624837e-07, + "loss": 0.0097, + "step": 1546 + }, + { + "epoch": 0.1, + "grad_norm": 0.8891487395054584, + "learning_rate": 9.877024799843667e-07, + "loss": 0.0891, + "step": 1547 + }, + { + "epoch": 0.1, + "grad_norm": 1.1298489880351104, + "learning_rate": 9.876797058998817e-07, + "loss": 0.165, + "step": 1548 + }, + { + "epoch": 0.1, + "grad_norm": 0.514652923651593, + "learning_rate": 9.8765691101e-07, + "loss": 0.1222, + "step": 1549 + }, + { + "epoch": 0.1, + "grad_norm": 0.6397997624940633, + "learning_rate": 9.876340953156943e-07, + "loss": 0.2687, + "step": 1550 + }, + { + "epoch": 0.1, + "grad_norm": 0.7785255189564557, + "learning_rate": 9.876112588179376e-07, + "loss": 0.1656, + "step": 1551 + }, + { + "epoch": 0.1, + "grad_norm": 0.7121913562734088, + "learning_rate": 9.875884015177045e-07, + "loss": 0.2444, + "step": 1552 + }, + { + "epoch": 0.1, + "grad_norm": 0.6605143159514306, + "learning_rate": 9.8756552341597e-07, + "loss": 0.2287, + "step": 1553 + }, + { + "epoch": 0.1, + "grad_norm": 0.36886705029343614, + "learning_rate": 9.875426245137101e-07, + "loss": 0.1895, + "step": 1554 + }, + { + "epoch": 0.1, + "grad_norm": 0.7679070950832787, + "learning_rate": 9.87519704811902e-07, + "loss": 0.2849, + "step": 1555 + }, + { + "epoch": 0.1, + "grad_norm": 0.6701865236270271, + "learning_rate": 9.87496764311523e-07, + "loss": 0.2261, + "step": 1556 + }, + { + "epoch": 0.1, + "grad_norm": 0.42004633192356616, + "learning_rate": 9.874738030135521e-07, + "loss": 0.2492, + "step": 1557 + }, + { + "epoch": 0.1, + "grad_norm": 0.339808480879583, + "learning_rate": 9.874508209189689e-07, + "loss": 0.2003, + "step": 1558 + }, + { + "epoch": 0.1, + "grad_norm": 0.7761775197322209, + "learning_rate": 9.874278180287536e-07, + "loss": 0.2574, + "step": 1559 + }, + { + "epoch": 0.1, + "grad_norm": 0.9695043231557823, + "learning_rate": 9.874047943438878e-07, + "loss": 0.2385, + "step": 1560 + }, + { + "epoch": 0.1, + "grad_norm": 1.004971613166862, + "learning_rate": 9.873817498653537e-07, + "loss": 0.2249, + "step": 1561 + }, + { + "epoch": 0.1, + "grad_norm": 0.5756796278033042, + "learning_rate": 9.873586845941344e-07, + "loss": 0.1366, + "step": 1562 + }, + { + "epoch": 0.1, + "grad_norm": 1.2083389990401696, + "learning_rate": 9.87335598531214e-07, + "loss": 0.1626, + "step": 1563 + }, + { + "epoch": 0.1, + "grad_norm": 0.9287886982072556, + "learning_rate": 9.87312491677577e-07, + "loss": 0.0379, + "step": 1564 + }, + { + "epoch": 0.1, + "grad_norm": 0.4611163777873519, + "learning_rate": 9.872893640342098e-07, + "loss": 0.1681, + "step": 1565 + }, + { + "epoch": 0.1, + "grad_norm": 0.6474301274319696, + "learning_rate": 9.872662156020986e-07, + "loss": 0.2599, + "step": 1566 + }, + { + "epoch": 0.1, + "grad_norm": 0.41372820467170063, + "learning_rate": 9.872430463822312e-07, + "loss": 0.2448, + "step": 1567 + }, + { + "epoch": 0.1, + "grad_norm": 0.5639025728086583, + "learning_rate": 9.872198563755959e-07, + "loss": 0.2386, + "step": 1568 + }, + { + "epoch": 0.1, + "grad_norm": 0.3475531518259124, + "learning_rate": 9.87196645583182e-07, + "loss": 0.1789, + "step": 1569 + }, + { + "epoch": 0.1, + "grad_norm": 0.6908444271896197, + "learning_rate": 9.8717341400598e-07, + "loss": 0.1217, + "step": 1570 + }, + { + "epoch": 0.1, + "grad_norm": 0.5832040661187229, + "learning_rate": 9.871501616449806e-07, + "loss": 0.1626, + "step": 1571 + }, + { + "epoch": 0.1, + "grad_norm": 0.3531856618380542, + "learning_rate": 9.87126888501176e-07, + "loss": 0.1372, + "step": 1572 + }, + { + "epoch": 0.1, + "grad_norm": 0.48973224252772235, + "learning_rate": 9.871035945755593e-07, + "loss": 0.098, + "step": 1573 + }, + { + "epoch": 0.1, + "grad_norm": 0.40065693580039274, + "learning_rate": 9.87080279869124e-07, + "loss": 0.0152, + "step": 1574 + }, + { + "epoch": 0.1, + "grad_norm": 0.6889147943802033, + "learning_rate": 9.87056944382865e-07, + "loss": 0.3114, + "step": 1575 + }, + { + "epoch": 0.1, + "grad_norm": 0.3608933592686679, + "learning_rate": 9.870335881177773e-07, + "loss": 0.2242, + "step": 1576 + }, + { + "epoch": 0.1, + "grad_norm": 0.7705750279452553, + "learning_rate": 9.870102110748577e-07, + "loss": 0.4654, + "step": 1577 + }, + { + "epoch": 0.1, + "grad_norm": 0.5027787671169268, + "learning_rate": 9.869868132551036e-07, + "loss": 0.0991, + "step": 1578 + }, + { + "epoch": 0.1, + "grad_norm": 0.45869951418894883, + "learning_rate": 9.86963394659513e-07, + "loss": 0.1745, + "step": 1579 + }, + { + "epoch": 0.1, + "grad_norm": 0.5005370133604695, + "learning_rate": 9.869399552890852e-07, + "loss": 0.1244, + "step": 1580 + }, + { + "epoch": 0.1, + "grad_norm": 0.6014441781373475, + "learning_rate": 9.8691649514482e-07, + "loss": 0.522, + "step": 1581 + }, + { + "epoch": 0.1, + "grad_norm": 0.4004666041309846, + "learning_rate": 9.868930142277183e-07, + "loss": 0.0131, + "step": 1582 + }, + { + "epoch": 0.1, + "grad_norm": 0.503409687553413, + "learning_rate": 9.868695125387817e-07, + "loss": 0.2341, + "step": 1583 + }, + { + "epoch": 0.1, + "grad_norm": 2.226301001193882, + "learning_rate": 9.868459900790131e-07, + "loss": 0.2083, + "step": 1584 + }, + { + "epoch": 0.1, + "grad_norm": 0.36025836315339727, + "learning_rate": 9.868224468494158e-07, + "loss": 0.2862, + "step": 1585 + }, + { + "epoch": 0.1, + "grad_norm": 0.6341257291602369, + "learning_rate": 9.867988828509943e-07, + "loss": 0.0529, + "step": 1586 + }, + { + "epoch": 0.1, + "grad_norm": 0.6686417639942993, + "learning_rate": 9.86775298084754e-07, + "loss": 0.2086, + "step": 1587 + }, + { + "epoch": 0.1, + "grad_norm": 0.15439815770014853, + "learning_rate": 9.867516925517008e-07, + "loss": 0.0469, + "step": 1588 + }, + { + "epoch": 0.1, + "grad_norm": 0.6521696811939802, + "learning_rate": 9.867280662528418e-07, + "loss": 0.2327, + "step": 1589 + }, + { + "epoch": 0.1, + "grad_norm": 0.4687022140046831, + "learning_rate": 9.867044191891853e-07, + "loss": 0.016, + "step": 1590 + }, + { + "epoch": 0.1, + "grad_norm": 0.42445870053570794, + "learning_rate": 9.866807513617396e-07, + "loss": 0.2598, + "step": 1591 + }, + { + "epoch": 0.1, + "grad_norm": 0.4178296998710727, + "learning_rate": 9.86657062771515e-07, + "loss": 0.3339, + "step": 1592 + }, + { + "epoch": 0.1, + "grad_norm": 1.1190587607436973, + "learning_rate": 9.866333534195214e-07, + "loss": 0.0508, + "step": 1593 + }, + { + "epoch": 0.1, + "grad_norm": 0.7492173303820396, + "learning_rate": 9.86609623306771e-07, + "loss": 0.0173, + "step": 1594 + }, + { + "epoch": 0.1, + "grad_norm": 0.17268774248057586, + "learning_rate": 9.86585872434276e-07, + "loss": 0.0098, + "step": 1595 + }, + { + "epoch": 0.1, + "grad_norm": 0.6882161608394176, + "learning_rate": 9.865621008030492e-07, + "loss": 0.4026, + "step": 1596 + }, + { + "epoch": 0.1, + "grad_norm": 0.18914030733008877, + "learning_rate": 9.865383084141051e-07, + "loss": 0.0209, + "step": 1597 + }, + { + "epoch": 0.1, + "grad_norm": 0.8754944883215988, + "learning_rate": 9.865144952684588e-07, + "loss": 0.064, + "step": 1598 + }, + { + "epoch": 0.1, + "grad_norm": 0.2758435168021218, + "learning_rate": 9.864906613671262e-07, + "loss": 0.1013, + "step": 1599 + }, + { + "epoch": 0.1, + "grad_norm": 0.6569159055960563, + "learning_rate": 9.864668067111238e-07, + "loss": 0.2068, + "step": 1600 + }, + { + "epoch": 0.1, + "grad_norm": 0.4573223639888256, + "learning_rate": 9.864429313014698e-07, + "loss": 0.245, + "step": 1601 + }, + { + "epoch": 0.1, + "grad_norm": 0.6549498989924214, + "learning_rate": 9.86419035139182e-07, + "loss": 0.0784, + "step": 1602 + }, + { + "epoch": 0.1, + "grad_norm": 0.4307212536134745, + "learning_rate": 9.863951182252808e-07, + "loss": 0.0911, + "step": 1603 + }, + { + "epoch": 0.1, + "grad_norm": 0.9246695738009061, + "learning_rate": 9.863711805607858e-07, + "loss": 0.1848, + "step": 1604 + }, + { + "epoch": 0.1, + "grad_norm": 0.8584093011942867, + "learning_rate": 9.863472221467188e-07, + "loss": 0.2089, + "step": 1605 + }, + { + "epoch": 0.1, + "grad_norm": 0.1959506485643055, + "learning_rate": 9.863232429841013e-07, + "loss": 0.0927, + "step": 1606 + }, + { + "epoch": 0.1, + "grad_norm": 1.1312076550436183, + "learning_rate": 9.862992430739569e-07, + "loss": 0.2254, + "step": 1607 + }, + { + "epoch": 0.1, + "grad_norm": 0.7194644325458003, + "learning_rate": 9.862752224173089e-07, + "loss": 0.1336, + "step": 1608 + }, + { + "epoch": 0.1, + "grad_norm": 0.35806296749475824, + "learning_rate": 9.862511810151827e-07, + "loss": 0.091, + "step": 1609 + }, + { + "epoch": 0.1, + "grad_norm": 0.3224249152615377, + "learning_rate": 9.862271188686036e-07, + "loss": 0.1694, + "step": 1610 + }, + { + "epoch": 0.1, + "grad_norm": 0.35529476439744356, + "learning_rate": 9.86203035978598e-07, + "loss": 0.084, + "step": 1611 + }, + { + "epoch": 0.1, + "grad_norm": 0.4637912150396062, + "learning_rate": 9.861789323461936e-07, + "loss": 0.1505, + "step": 1612 + }, + { + "epoch": 0.1, + "grad_norm": 0.695547042266105, + "learning_rate": 9.861548079724184e-07, + "loss": 0.1855, + "step": 1613 + }, + { + "epoch": 0.1, + "grad_norm": 0.9573012386923306, + "learning_rate": 9.86130662858302e-07, + "loss": 0.4129, + "step": 1614 + }, + { + "epoch": 0.1, + "grad_norm": 0.8909234017172247, + "learning_rate": 9.861064970048742e-07, + "loss": 0.3612, + "step": 1615 + }, + { + "epoch": 0.1, + "grad_norm": 0.5461979307132618, + "learning_rate": 9.860823104131661e-07, + "loss": 0.1456, + "step": 1616 + }, + { + "epoch": 0.1, + "grad_norm": 1.450693533453506, + "learning_rate": 9.860581030842094e-07, + "loss": 0.3491, + "step": 1617 + }, + { + "epoch": 0.1, + "grad_norm": 0.5700827078183561, + "learning_rate": 9.86033875019037e-07, + "loss": 0.236, + "step": 1618 + }, + { + "epoch": 0.1, + "grad_norm": 0.6427023992944825, + "learning_rate": 9.860096262186822e-07, + "loss": 0.1428, + "step": 1619 + }, + { + "epoch": 0.1, + "grad_norm": 0.5089999767887992, + "learning_rate": 9.8598535668418e-07, + "loss": 0.1872, + "step": 1620 + }, + { + "epoch": 0.1, + "grad_norm": 0.22674102403414914, + "learning_rate": 9.859610664165657e-07, + "loss": 0.0153, + "step": 1621 + }, + { + "epoch": 0.1, + "grad_norm": 0.6010049723526243, + "learning_rate": 9.85936755416875e-07, + "loss": 0.2255, + "step": 1622 + }, + { + "epoch": 0.1, + "grad_norm": 0.46927568870917785, + "learning_rate": 9.859124236861458e-07, + "loss": 0.2207, + "step": 1623 + }, + { + "epoch": 0.1, + "grad_norm": 0.6100554901334297, + "learning_rate": 9.858880712254156e-07, + "loss": 0.3904, + "step": 1624 + }, + { + "epoch": 0.1, + "grad_norm": 0.5253195391980957, + "learning_rate": 9.858636980357236e-07, + "loss": 0.1064, + "step": 1625 + }, + { + "epoch": 0.1, + "grad_norm": 0.3186766994435842, + "learning_rate": 9.858393041181094e-07, + "loss": 0.0091, + "step": 1626 + }, + { + "epoch": 0.1, + "grad_norm": 0.49194077776033, + "learning_rate": 9.85814889473614e-07, + "loss": 0.1151, + "step": 1627 + }, + { + "epoch": 0.1, + "grad_norm": 0.3014511666074654, + "learning_rate": 9.857904541032788e-07, + "loss": 0.0477, + "step": 1628 + }, + { + "epoch": 0.1, + "grad_norm": 0.6800381451103563, + "learning_rate": 9.857659980081462e-07, + "loss": 0.2274, + "step": 1629 + }, + { + "epoch": 0.1, + "grad_norm": 0.5048897009278337, + "learning_rate": 9.857415211892597e-07, + "loss": 0.2173, + "step": 1630 + }, + { + "epoch": 0.1, + "grad_norm": 0.38878687723503236, + "learning_rate": 9.857170236476634e-07, + "loss": 0.1196, + "step": 1631 + }, + { + "epoch": 0.1, + "grad_norm": 0.4921853470158305, + "learning_rate": 9.856925053844024e-07, + "loss": 0.2371, + "step": 1632 + }, + { + "epoch": 0.1, + "grad_norm": 0.6741135439542971, + "learning_rate": 9.856679664005227e-07, + "loss": 0.2392, + "step": 1633 + }, + { + "epoch": 0.1, + "grad_norm": 0.7293732326588783, + "learning_rate": 9.856434066970713e-07, + "loss": 0.4094, + "step": 1634 + }, + { + "epoch": 0.1, + "grad_norm": 1.2996126678591897, + "learning_rate": 9.85618826275096e-07, + "loss": 0.3037, + "step": 1635 + }, + { + "epoch": 0.1, + "grad_norm": 0.10446453277853311, + "learning_rate": 9.855942251356452e-07, + "loss": 0.0599, + "step": 1636 + }, + { + "epoch": 0.1, + "grad_norm": 0.7036396263243663, + "learning_rate": 9.855696032797687e-07, + "loss": 0.1621, + "step": 1637 + }, + { + "epoch": 0.1, + "grad_norm": 0.6439254075434799, + "learning_rate": 9.855449607085168e-07, + "loss": 0.146, + "step": 1638 + }, + { + "epoch": 0.1, + "grad_norm": 1.0861699865315655, + "learning_rate": 9.855202974229407e-07, + "loss": 0.1874, + "step": 1639 + }, + { + "epoch": 0.1, + "grad_norm": 0.7652036360756149, + "learning_rate": 9.854956134240929e-07, + "loss": 0.2367, + "step": 1640 + }, + { + "epoch": 0.1, + "grad_norm": 0.491924900336481, + "learning_rate": 9.85470908713026e-07, + "loss": 0.2121, + "step": 1641 + }, + { + "epoch": 0.1, + "grad_norm": 1.09698943161845, + "learning_rate": 9.854461832907943e-07, + "loss": 0.2639, + "step": 1642 + }, + { + "epoch": 0.1, + "grad_norm": 0.3646403434772786, + "learning_rate": 9.854214371584526e-07, + "loss": 0.1764, + "step": 1643 + }, + { + "epoch": 0.1, + "grad_norm": 0.5071519005871684, + "learning_rate": 9.853966703170566e-07, + "loss": 0.1177, + "step": 1644 + }, + { + "epoch": 0.1, + "grad_norm": 0.6921879817661868, + "learning_rate": 9.853718827676628e-07, + "loss": 0.3415, + "step": 1645 + }, + { + "epoch": 0.1, + "grad_norm": 1.1163835110493454, + "learning_rate": 9.853470745113288e-07, + "loss": 0.1166, + "step": 1646 + }, + { + "epoch": 0.11, + "grad_norm": 0.25022652331553863, + "learning_rate": 9.85322245549113e-07, + "loss": 0.0072, + "step": 1647 + }, + { + "epoch": 0.11, + "grad_norm": 0.6749871432923831, + "learning_rate": 9.852973958820746e-07, + "loss": 0.176, + "step": 1648 + }, + { + "epoch": 0.11, + "grad_norm": 0.4975505186456713, + "learning_rate": 9.852725255112734e-07, + "loss": 0.1886, + "step": 1649 + }, + { + "epoch": 0.11, + "grad_norm": 0.6232905693402038, + "learning_rate": 9.85247634437771e-07, + "loss": 0.3126, + "step": 1650 + }, + { + "epoch": 0.11, + "grad_norm": 0.8002825993038455, + "learning_rate": 9.852227226626292e-07, + "loss": 0.1474, + "step": 1651 + }, + { + "epoch": 0.11, + "grad_norm": 0.10381943547798866, + "learning_rate": 9.851977901869105e-07, + "loss": 0.0839, + "step": 1652 + }, + { + "epoch": 0.11, + "grad_norm": 0.7543358346895345, + "learning_rate": 9.851728370116786e-07, + "loss": 0.3721, + "step": 1653 + }, + { + "epoch": 0.11, + "grad_norm": 0.8141386475697114, + "learning_rate": 9.851478631379982e-07, + "loss": 0.1532, + "step": 1654 + }, + { + "epoch": 0.11, + "grad_norm": 0.6566212458996052, + "learning_rate": 9.851228685669347e-07, + "loss": 0.2333, + "step": 1655 + }, + { + "epoch": 0.11, + "grad_norm": 0.3501251624972066, + "learning_rate": 9.850978532995545e-07, + "loss": 0.1383, + "step": 1656 + }, + { + "epoch": 0.11, + "grad_norm": 0.7267284790649643, + "learning_rate": 9.850728173369246e-07, + "loss": 0.228, + "step": 1657 + }, + { + "epoch": 0.11, + "grad_norm": 0.581595390627329, + "learning_rate": 9.850477606801132e-07, + "loss": 0.2769, + "step": 1658 + }, + { + "epoch": 0.11, + "grad_norm": 0.4579902043564122, + "learning_rate": 9.850226833301892e-07, + "loss": 0.2163, + "step": 1659 + }, + { + "epoch": 0.11, + "grad_norm": 0.9122505674135036, + "learning_rate": 9.849975852882226e-07, + "loss": 0.057, + "step": 1660 + }, + { + "epoch": 0.11, + "grad_norm": 0.3111394917274574, + "learning_rate": 9.84972466555284e-07, + "loss": 0.0714, + "step": 1661 + }, + { + "epoch": 0.11, + "grad_norm": 0.8783409127064085, + "learning_rate": 9.849473271324452e-07, + "loss": 0.3154, + "step": 1662 + }, + { + "epoch": 0.11, + "grad_norm": 0.8169232392745213, + "learning_rate": 9.849221670207783e-07, + "loss": 0.1238, + "step": 1663 + }, + { + "epoch": 0.11, + "grad_norm": 0.34584369689229244, + "learning_rate": 9.848969862213572e-07, + "loss": 0.0439, + "step": 1664 + }, + { + "epoch": 0.11, + "grad_norm": 0.20439358764503512, + "learning_rate": 9.848717847352557e-07, + "loss": 0.0118, + "step": 1665 + }, + { + "epoch": 0.11, + "grad_norm": 0.8297774778868178, + "learning_rate": 9.848465625635494e-07, + "loss": 0.0591, + "step": 1666 + }, + { + "epoch": 0.11, + "grad_norm": 0.21684237184432614, + "learning_rate": 9.848213197073138e-07, + "loss": 0.0772, + "step": 1667 + }, + { + "epoch": 0.11, + "grad_norm": 0.5148735373998339, + "learning_rate": 9.847960561676263e-07, + "loss": 0.3261, + "step": 1668 + }, + { + "epoch": 0.11, + "grad_norm": 0.6875334741208333, + "learning_rate": 9.847707719455643e-07, + "loss": 0.2342, + "step": 1669 + }, + { + "epoch": 0.11, + "grad_norm": 0.21305224593370214, + "learning_rate": 9.847454670422067e-07, + "loss": 0.0965, + "step": 1670 + }, + { + "epoch": 0.11, + "grad_norm": 0.30857334239297074, + "learning_rate": 9.84720141458633e-07, + "loss": 0.1509, + "step": 1671 + }, + { + "epoch": 0.11, + "grad_norm": 0.5982979108595999, + "learning_rate": 9.846947951959237e-07, + "loss": 0.1248, + "step": 1672 + }, + { + "epoch": 0.11, + "grad_norm": 0.3881707818725426, + "learning_rate": 9.846694282551601e-07, + "loss": 0.0306, + "step": 1673 + }, + { + "epoch": 0.11, + "grad_norm": 0.7056799214476155, + "learning_rate": 9.846440406374244e-07, + "loss": 0.1658, + "step": 1674 + }, + { + "epoch": 0.11, + "grad_norm": 0.8837410663483006, + "learning_rate": 9.846186323437998e-07, + "loss": 0.2022, + "step": 1675 + }, + { + "epoch": 0.11, + "grad_norm": 0.8431493174441403, + "learning_rate": 9.8459320337537e-07, + "loss": 0.3112, + "step": 1676 + }, + { + "epoch": 0.11, + "grad_norm": 0.24393374157755976, + "learning_rate": 9.8456775373322e-07, + "loss": 0.0037, + "step": 1677 + }, + { + "epoch": 0.11, + "grad_norm": 0.5249565529368295, + "learning_rate": 9.845422834184354e-07, + "loss": 0.2402, + "step": 1678 + }, + { + "epoch": 0.11, + "grad_norm": 0.5373256341212908, + "learning_rate": 9.84516792432103e-07, + "loss": 0.2174, + "step": 1679 + }, + { + "epoch": 0.11, + "grad_norm": 0.5148799176302914, + "learning_rate": 9.844912807753102e-07, + "loss": 0.0178, + "step": 1680 + }, + { + "epoch": 0.11, + "grad_norm": 1.2654821914220962, + "learning_rate": 9.844657484491455e-07, + "loss": 0.5374, + "step": 1681 + }, + { + "epoch": 0.11, + "grad_norm": 0.34012022508386563, + "learning_rate": 9.844401954546982e-07, + "loss": 0.2204, + "step": 1682 + }, + { + "epoch": 0.11, + "grad_norm": 0.6504776717129892, + "learning_rate": 9.84414621793058e-07, + "loss": 0.4022, + "step": 1683 + }, + { + "epoch": 0.11, + "grad_norm": 0.5094296043064444, + "learning_rate": 9.843890274653164e-07, + "loss": 0.2673, + "step": 1684 + }, + { + "epoch": 0.11, + "grad_norm": 0.9684770030115625, + "learning_rate": 9.843634124725653e-07, + "loss": 0.1952, + "step": 1685 + }, + { + "epoch": 0.11, + "grad_norm": 0.6669948245222741, + "learning_rate": 9.843377768158971e-07, + "loss": 0.2342, + "step": 1686 + }, + { + "epoch": 0.11, + "grad_norm": 0.4466624802946475, + "learning_rate": 9.843121204964057e-07, + "loss": 0.1335, + "step": 1687 + }, + { + "epoch": 0.11, + "grad_norm": 0.60892533892139, + "learning_rate": 9.842864435151859e-07, + "loss": 0.3891, + "step": 1688 + }, + { + "epoch": 0.11, + "grad_norm": 0.4968500382457546, + "learning_rate": 9.842607458733325e-07, + "loss": 0.2572, + "step": 1689 + }, + { + "epoch": 0.11, + "grad_norm": 1.670389115941225, + "learning_rate": 9.842350275719426e-07, + "loss": 0.0334, + "step": 1690 + }, + { + "epoch": 0.11, + "grad_norm": 0.6025367438324835, + "learning_rate": 9.842092886121127e-07, + "loss": 0.1054, + "step": 1691 + }, + { + "epoch": 0.11, + "grad_norm": 0.4752594685453358, + "learning_rate": 9.841835289949412e-07, + "loss": 0.2078, + "step": 1692 + }, + { + "epoch": 0.11, + "grad_norm": 0.7298000651694033, + "learning_rate": 9.84157748721527e-07, + "loss": 0.0499, + "step": 1693 + }, + { + "epoch": 0.11, + "grad_norm": 0.44105453765342617, + "learning_rate": 9.841319477929702e-07, + "loss": 0.2132, + "step": 1694 + }, + { + "epoch": 0.11, + "grad_norm": 0.5848802588162659, + "learning_rate": 9.841061262103712e-07, + "loss": 0.3941, + "step": 1695 + }, + { + "epoch": 0.11, + "grad_norm": 0.5989033355167053, + "learning_rate": 9.840802839748313e-07, + "loss": 0.2946, + "step": 1696 + }, + { + "epoch": 0.11, + "grad_norm": 0.4800551168081081, + "learning_rate": 9.84054421087454e-07, + "loss": 0.1525, + "step": 1697 + }, + { + "epoch": 0.11, + "grad_norm": 0.17172942266235122, + "learning_rate": 9.840285375493416e-07, + "loss": 0.0979, + "step": 1698 + }, + { + "epoch": 0.11, + "grad_norm": 0.6392176185366084, + "learning_rate": 9.840026333615987e-07, + "loss": 0.0169, + "step": 1699 + }, + { + "epoch": 0.11, + "grad_norm": 1.846588000790528, + "learning_rate": 9.839767085253307e-07, + "loss": 0.2467, + "step": 1700 + }, + { + "epoch": 0.11, + "grad_norm": 0.9009554247070751, + "learning_rate": 9.839507630416436e-07, + "loss": 0.1856, + "step": 1701 + }, + { + "epoch": 0.11, + "grad_norm": 0.30164687180032423, + "learning_rate": 9.839247969116437e-07, + "loss": 0.2101, + "step": 1702 + }, + { + "epoch": 0.11, + "grad_norm": 1.0101689422412143, + "learning_rate": 9.838988101364394e-07, + "loss": 0.0621, + "step": 1703 + }, + { + "epoch": 0.11, + "grad_norm": 0.6279014376251323, + "learning_rate": 9.838728027171388e-07, + "loss": 0.3739, + "step": 1704 + }, + { + "epoch": 0.11, + "grad_norm": 1.4724623998971567, + "learning_rate": 9.83846774654852e-07, + "loss": 0.3858, + "step": 1705 + }, + { + "epoch": 0.11, + "grad_norm": 0.3424337059410447, + "learning_rate": 9.83820725950689e-07, + "loss": 0.343, + "step": 1706 + }, + { + "epoch": 0.11, + "grad_norm": 0.2760635915213745, + "learning_rate": 9.837946566057614e-07, + "loss": 0.1629, + "step": 1707 + }, + { + "epoch": 0.11, + "grad_norm": 0.7490635728472637, + "learning_rate": 9.83768566621181e-07, + "loss": 0.1155, + "step": 1708 + }, + { + "epoch": 0.11, + "grad_norm": 0.8384511238522884, + "learning_rate": 9.837424559980612e-07, + "loss": 0.1987, + "step": 1709 + }, + { + "epoch": 0.11, + "grad_norm": 0.8215913083693003, + "learning_rate": 9.837163247375157e-07, + "loss": 0.2006, + "step": 1710 + }, + { + "epoch": 0.11, + "grad_norm": 0.5060243417944514, + "learning_rate": 9.836901728406594e-07, + "loss": 0.3762, + "step": 1711 + }, + { + "epoch": 0.11, + "grad_norm": 0.5476501914212019, + "learning_rate": 9.83664000308608e-07, + "loss": 0.2034, + "step": 1712 + }, + { + "epoch": 0.11, + "grad_norm": 0.44823886681977815, + "learning_rate": 9.83637807142478e-07, + "loss": 0.2989, + "step": 1713 + }, + { + "epoch": 0.11, + "grad_norm": 0.5106181915582283, + "learning_rate": 9.83611593343387e-07, + "loss": 0.2934, + "step": 1714 + }, + { + "epoch": 0.11, + "grad_norm": 0.282226581431579, + "learning_rate": 9.835853589124531e-07, + "loss": 0.2696, + "step": 1715 + }, + { + "epoch": 0.11, + "grad_norm": 0.6971792264132003, + "learning_rate": 9.83559103850796e-07, + "loss": 0.2182, + "step": 1716 + }, + { + "epoch": 0.11, + "grad_norm": 0.18208110199973887, + "learning_rate": 9.835328281595351e-07, + "loss": 0.1607, + "step": 1717 + }, + { + "epoch": 0.11, + "grad_norm": 0.7076367669794615, + "learning_rate": 9.83506531839792e-07, + "loss": 0.3098, + "step": 1718 + }, + { + "epoch": 0.11, + "grad_norm": 0.7060300322733915, + "learning_rate": 9.834802148926882e-07, + "loss": 0.1326, + "step": 1719 + }, + { + "epoch": 0.11, + "grad_norm": 0.5726378716837955, + "learning_rate": 9.834538773193463e-07, + "loss": 0.2372, + "step": 1720 + }, + { + "epoch": 0.11, + "grad_norm": 0.09794177964706838, + "learning_rate": 9.834275191208902e-07, + "loss": 0.004, + "step": 1721 + }, + { + "epoch": 0.11, + "grad_norm": 0.33448780003817236, + "learning_rate": 9.834011402984445e-07, + "loss": 0.119, + "step": 1722 + }, + { + "epoch": 0.11, + "grad_norm": 0.570240963744505, + "learning_rate": 9.833747408531344e-07, + "loss": 0.4368, + "step": 1723 + }, + { + "epoch": 0.11, + "grad_norm": 0.553949900799787, + "learning_rate": 9.833483207860859e-07, + "loss": 0.0991, + "step": 1724 + }, + { + "epoch": 0.11, + "grad_norm": 0.29333690283732444, + "learning_rate": 9.833218800984266e-07, + "loss": 0.0875, + "step": 1725 + }, + { + "epoch": 0.11, + "grad_norm": 0.3069001815477144, + "learning_rate": 9.832954187912843e-07, + "loss": 0.0407, + "step": 1726 + }, + { + "epoch": 0.11, + "grad_norm": 0.5057713466762443, + "learning_rate": 9.832689368657879e-07, + "loss": 0.2316, + "step": 1727 + }, + { + "epoch": 0.11, + "grad_norm": 0.4303615533563246, + "learning_rate": 9.83242434323067e-07, + "loss": 0.0915, + "step": 1728 + }, + { + "epoch": 0.11, + "grad_norm": 0.6236747153961416, + "learning_rate": 9.832159111642526e-07, + "loss": 0.0566, + "step": 1729 + }, + { + "epoch": 0.11, + "grad_norm": 0.3078982598280512, + "learning_rate": 9.831893673904759e-07, + "loss": 0.1209, + "step": 1730 + }, + { + "epoch": 0.11, + "grad_norm": 1.2862679414751008, + "learning_rate": 9.831628030028696e-07, + "loss": 0.2588, + "step": 1731 + }, + { + "epoch": 0.11, + "grad_norm": 0.3677271815970271, + "learning_rate": 9.831362180025666e-07, + "loss": 0.3474, + "step": 1732 + }, + { + "epoch": 0.11, + "grad_norm": 0.6741049769726314, + "learning_rate": 9.831096123907015e-07, + "loss": 0.2151, + "step": 1733 + }, + { + "epoch": 0.11, + "grad_norm": 0.6127332879081245, + "learning_rate": 9.83082986168409e-07, + "loss": 0.2339, + "step": 1734 + }, + { + "epoch": 0.11, + "grad_norm": 0.31925479786507616, + "learning_rate": 9.830563393368255e-07, + "loss": 0.0889, + "step": 1735 + }, + { + "epoch": 0.11, + "grad_norm": 0.48704935384966197, + "learning_rate": 9.830296718970872e-07, + "loss": 0.1856, + "step": 1736 + }, + { + "epoch": 0.11, + "grad_norm": 0.7564513724340943, + "learning_rate": 9.830029838503322e-07, + "loss": 0.1297, + "step": 1737 + }, + { + "epoch": 0.11, + "grad_norm": 0.8743928928257639, + "learning_rate": 9.829762751976991e-07, + "loss": 0.3113, + "step": 1738 + }, + { + "epoch": 0.11, + "grad_norm": 0.4996887780505269, + "learning_rate": 9.82949545940327e-07, + "loss": 0.2354, + "step": 1739 + }, + { + "epoch": 0.11, + "grad_norm": 0.7579165889952131, + "learning_rate": 9.829227960793564e-07, + "loss": 0.196, + "step": 1740 + }, + { + "epoch": 0.11, + "grad_norm": 0.2615387161682434, + "learning_rate": 9.828960256159287e-07, + "loss": 0.0289, + "step": 1741 + }, + { + "epoch": 0.11, + "grad_norm": 0.3813425408339454, + "learning_rate": 9.828692345511857e-07, + "loss": 0.1092, + "step": 1742 + }, + { + "epoch": 0.11, + "grad_norm": 0.27618223710505885, + "learning_rate": 9.828424228862703e-07, + "loss": 0.0345, + "step": 1743 + }, + { + "epoch": 0.11, + "grad_norm": 0.5707560816531704, + "learning_rate": 9.828155906223267e-07, + "loss": 0.0679, + "step": 1744 + }, + { + "epoch": 0.11, + "grad_norm": 0.8166632332306311, + "learning_rate": 9.827887377604995e-07, + "loss": 0.2064, + "step": 1745 + }, + { + "epoch": 0.11, + "grad_norm": 0.13638838624686567, + "learning_rate": 9.827618643019339e-07, + "loss": 0.081, + "step": 1746 + }, + { + "epoch": 0.11, + "grad_norm": 0.10007512364566283, + "learning_rate": 9.82734970247777e-07, + "loss": 0.0394, + "step": 1747 + }, + { + "epoch": 0.11, + "grad_norm": 0.6577255730064722, + "learning_rate": 9.827080555991759e-07, + "loss": 0.4635, + "step": 1748 + }, + { + "epoch": 0.11, + "grad_norm": 0.30082004733304346, + "learning_rate": 9.826811203572785e-07, + "loss": 0.051, + "step": 1749 + }, + { + "epoch": 0.11, + "grad_norm": 1.970628457968785, + "learning_rate": 9.826541645232344e-07, + "loss": 0.1083, + "step": 1750 + }, + { + "epoch": 0.11, + "grad_norm": 0.45907943999897577, + "learning_rate": 9.826271880981934e-07, + "loss": 0.2844, + "step": 1751 + }, + { + "epoch": 0.11, + "grad_norm": 0.34774585567832594, + "learning_rate": 9.826001910833062e-07, + "loss": 0.4047, + "step": 1752 + }, + { + "epoch": 0.11, + "grad_norm": 0.5476944428879885, + "learning_rate": 9.825731734797246e-07, + "loss": 0.2125, + "step": 1753 + }, + { + "epoch": 0.11, + "grad_norm": 0.5065585280997207, + "learning_rate": 9.825461352886016e-07, + "loss": 0.3552, + "step": 1754 + }, + { + "epoch": 0.11, + "grad_norm": 0.5620406168997668, + "learning_rate": 9.825190765110904e-07, + "loss": 0.1863, + "step": 1755 + }, + { + "epoch": 0.11, + "grad_norm": 0.8806592318224842, + "learning_rate": 9.824919971483451e-07, + "loss": 0.3026, + "step": 1756 + }, + { + "epoch": 0.11, + "grad_norm": 0.647794205861562, + "learning_rate": 9.824648972015218e-07, + "loss": 0.1532, + "step": 1757 + }, + { + "epoch": 0.11, + "grad_norm": 0.3841639771848569, + "learning_rate": 9.824377766717758e-07, + "loss": 0.1178, + "step": 1758 + }, + { + "epoch": 0.11, + "grad_norm": 0.47163025371174877, + "learning_rate": 9.824106355602643e-07, + "loss": 0.0146, + "step": 1759 + }, + { + "epoch": 0.11, + "grad_norm": 0.917001845279771, + "learning_rate": 9.823834738681454e-07, + "loss": 0.1631, + "step": 1760 + }, + { + "epoch": 0.11, + "grad_norm": 0.3793247641315772, + "learning_rate": 9.823562915965779e-07, + "loss": 0.265, + "step": 1761 + }, + { + "epoch": 0.11, + "grad_norm": 0.9563626167521188, + "learning_rate": 9.823290887467213e-07, + "loss": 0.3483, + "step": 1762 + }, + { + "epoch": 0.11, + "grad_norm": 0.42631993786465006, + "learning_rate": 9.82301865319736e-07, + "loss": 0.1072, + "step": 1763 + }, + { + "epoch": 0.11, + "grad_norm": 0.4259209625898786, + "learning_rate": 9.82274621316784e-07, + "loss": 0.1468, + "step": 1764 + }, + { + "epoch": 0.11, + "grad_norm": 0.4871455260003286, + "learning_rate": 9.822473567390269e-07, + "loss": 0.4063, + "step": 1765 + }, + { + "epoch": 0.11, + "grad_norm": 0.5626342707483665, + "learning_rate": 9.82220071587628e-07, + "loss": 0.2594, + "step": 1766 + }, + { + "epoch": 0.11, + "grad_norm": 0.3353387516102091, + "learning_rate": 9.821927658637517e-07, + "loss": 0.0104, + "step": 1767 + }, + { + "epoch": 0.11, + "grad_norm": 0.7804416552276265, + "learning_rate": 9.821654395685626e-07, + "loss": 0.1929, + "step": 1768 + }, + { + "epoch": 0.11, + "grad_norm": 0.23512946681285746, + "learning_rate": 9.821380927032264e-07, + "loss": 0.1229, + "step": 1769 + }, + { + "epoch": 0.11, + "grad_norm": 0.5555530016103668, + "learning_rate": 9.821107252689102e-07, + "loss": 0.2054, + "step": 1770 + }, + { + "epoch": 0.11, + "grad_norm": 0.5443962688924795, + "learning_rate": 9.820833372667812e-07, + "loss": 0.1442, + "step": 1771 + }, + { + "epoch": 0.11, + "grad_norm": 0.39619915615185125, + "learning_rate": 9.82055928698008e-07, + "loss": 0.2412, + "step": 1772 + }, + { + "epoch": 0.11, + "grad_norm": 0.3305805628162596, + "learning_rate": 9.820284995637595e-07, + "loss": 0.2055, + "step": 1773 + }, + { + "epoch": 0.11, + "grad_norm": 0.6046842976387699, + "learning_rate": 9.820010498652064e-07, + "loss": 0.4141, + "step": 1774 + }, + { + "epoch": 0.11, + "grad_norm": 0.4031768210777634, + "learning_rate": 9.819735796035197e-07, + "loss": 0.1269, + "step": 1775 + }, + { + "epoch": 0.11, + "grad_norm": 0.6229167909879683, + "learning_rate": 9.819460887798713e-07, + "loss": 0.1951, + "step": 1776 + }, + { + "epoch": 0.11, + "grad_norm": 0.40434643232921913, + "learning_rate": 9.819185773954335e-07, + "loss": 0.1349, + "step": 1777 + }, + { + "epoch": 0.11, + "grad_norm": 0.433914946149375, + "learning_rate": 9.818910454513808e-07, + "loss": 0.2466, + "step": 1778 + }, + { + "epoch": 0.11, + "grad_norm": 0.5308097070441203, + "learning_rate": 9.818634929488872e-07, + "loss": 0.1349, + "step": 1779 + }, + { + "epoch": 0.11, + "grad_norm": 0.3651917806208901, + "learning_rate": 9.818359198891284e-07, + "loss": 0.083, + "step": 1780 + }, + { + "epoch": 0.11, + "grad_norm": 0.4163734535240564, + "learning_rate": 9.818083262732806e-07, + "loss": 0.2195, + "step": 1781 + }, + { + "epoch": 0.11, + "grad_norm": 0.16223345234529754, + "learning_rate": 9.81780712102521e-07, + "loss": 0.2932, + "step": 1782 + }, + { + "epoch": 0.11, + "grad_norm": 0.6132952793487771, + "learning_rate": 9.817530773780276e-07, + "loss": 0.1361, + "step": 1783 + }, + { + "epoch": 0.11, + "grad_norm": 0.45207397666049487, + "learning_rate": 9.817254221009798e-07, + "loss": 0.448, + "step": 1784 + }, + { + "epoch": 0.11, + "grad_norm": 0.34689781529461494, + "learning_rate": 9.81697746272557e-07, + "loss": 0.1557, + "step": 1785 + }, + { + "epoch": 0.11, + "grad_norm": 0.5331420713596847, + "learning_rate": 9.816700498939399e-07, + "loss": 0.0829, + "step": 1786 + }, + { + "epoch": 0.11, + "grad_norm": 0.49071365729277694, + "learning_rate": 9.8164233296631e-07, + "loss": 0.124, + "step": 1787 + }, + { + "epoch": 0.11, + "grad_norm": 0.596526687363129, + "learning_rate": 9.816145954908503e-07, + "loss": 0.154, + "step": 1788 + }, + { + "epoch": 0.11, + "grad_norm": 0.39534540517531486, + "learning_rate": 9.815868374687436e-07, + "loss": 0.0784, + "step": 1789 + }, + { + "epoch": 0.11, + "grad_norm": 0.42546356363112214, + "learning_rate": 9.815590589011746e-07, + "loss": 0.1693, + "step": 1790 + }, + { + "epoch": 0.11, + "grad_norm": 0.31141271989092173, + "learning_rate": 9.815312597893278e-07, + "loss": 0.0798, + "step": 1791 + }, + { + "epoch": 0.11, + "grad_norm": 0.4670777628172246, + "learning_rate": 9.815034401343896e-07, + "loss": 0.1561, + "step": 1792 + }, + { + "epoch": 0.11, + "grad_norm": 0.6577163324054129, + "learning_rate": 9.814755999375466e-07, + "loss": 0.2182, + "step": 1793 + }, + { + "epoch": 0.11, + "grad_norm": 0.624311653075215, + "learning_rate": 9.814477391999867e-07, + "loss": 0.3685, + "step": 1794 + }, + { + "epoch": 0.11, + "grad_norm": 0.4001669059106104, + "learning_rate": 9.814198579228985e-07, + "loss": 0.2468, + "step": 1795 + }, + { + "epoch": 0.11, + "grad_norm": 0.6506927806465743, + "learning_rate": 9.81391956107471e-07, + "loss": 0.2173, + "step": 1796 + }, + { + "epoch": 0.11, + "grad_norm": 0.6632104392930209, + "learning_rate": 9.813640337548954e-07, + "loss": 0.213, + "step": 1797 + }, + { + "epoch": 0.11, + "grad_norm": 0.5259042393102924, + "learning_rate": 9.813360908663621e-07, + "loss": 0.1574, + "step": 1798 + }, + { + "epoch": 0.11, + "grad_norm": 0.6201969188977602, + "learning_rate": 9.813081274430636e-07, + "loss": 0.1324, + "step": 1799 + }, + { + "epoch": 0.11, + "grad_norm": 1.1144665506804912, + "learning_rate": 9.81280143486193e-07, + "loss": 0.1556, + "step": 1800 + }, + { + "epoch": 0.11, + "grad_norm": 0.4889335289184677, + "learning_rate": 9.81252138996944e-07, + "loss": 0.1724, + "step": 1801 + }, + { + "epoch": 0.11, + "grad_norm": 0.18328905865515413, + "learning_rate": 9.812241139765112e-07, + "loss": 0.0105, + "step": 1802 + }, + { + "epoch": 0.11, + "grad_norm": 1.1410813212247752, + "learning_rate": 9.811960684260906e-07, + "loss": 0.1477, + "step": 1803 + }, + { + "epoch": 0.12, + "grad_norm": 0.24938386629140047, + "learning_rate": 9.81168002346878e-07, + "loss": 0.0134, + "step": 1804 + }, + { + "epoch": 0.12, + "grad_norm": 0.5242348551646572, + "learning_rate": 9.811399157400712e-07, + "loss": 0.315, + "step": 1805 + }, + { + "epoch": 0.12, + "grad_norm": 0.19186140756534406, + "learning_rate": 9.811118086068687e-07, + "loss": 0.0754, + "step": 1806 + }, + { + "epoch": 0.12, + "grad_norm": 0.3366882759389052, + "learning_rate": 9.810836809484689e-07, + "loss": 0.1253, + "step": 1807 + }, + { + "epoch": 0.12, + "grad_norm": 0.31379195803906235, + "learning_rate": 9.810555327660723e-07, + "loss": 0.3143, + "step": 1808 + }, + { + "epoch": 0.12, + "grad_norm": 0.798839770675499, + "learning_rate": 9.810273640608798e-07, + "loss": 0.151, + "step": 1809 + }, + { + "epoch": 0.12, + "grad_norm": 0.5005194608080867, + "learning_rate": 9.809991748340926e-07, + "loss": 0.1507, + "step": 1810 + }, + { + "epoch": 0.12, + "grad_norm": 0.32667542686374074, + "learning_rate": 9.80970965086914e-07, + "loss": 0.1517, + "step": 1811 + }, + { + "epoch": 0.12, + "grad_norm": 1.2176304917892244, + "learning_rate": 9.80942734820547e-07, + "loss": 0.0326, + "step": 1812 + }, + { + "epoch": 0.12, + "grad_norm": 0.7991156169284748, + "learning_rate": 9.809144840361963e-07, + "loss": 0.3402, + "step": 1813 + }, + { + "epoch": 0.12, + "grad_norm": 0.7622023192574338, + "learning_rate": 9.808862127350668e-07, + "loss": 0.4426, + "step": 1814 + }, + { + "epoch": 0.12, + "grad_norm": 0.4018101359851353, + "learning_rate": 9.808579209183648e-07, + "loss": 0.088, + "step": 1815 + }, + { + "epoch": 0.12, + "grad_norm": 0.42093073883317284, + "learning_rate": 9.808296085872971e-07, + "loss": 0.158, + "step": 1816 + }, + { + "epoch": 0.12, + "grad_norm": 0.31664305537217885, + "learning_rate": 9.80801275743072e-07, + "loss": 0.1224, + "step": 1817 + }, + { + "epoch": 0.12, + "grad_norm": 0.34672074617306375, + "learning_rate": 9.807729223868978e-07, + "loss": 0.1798, + "step": 1818 + }, + { + "epoch": 0.12, + "grad_norm": 0.3266763970395985, + "learning_rate": 9.807445485199842e-07, + "loss": 0.0133, + "step": 1819 + }, + { + "epoch": 0.12, + "grad_norm": 0.7941496523634355, + "learning_rate": 9.807161541435417e-07, + "loss": 0.4453, + "step": 1820 + }, + { + "epoch": 0.12, + "grad_norm": 0.5296019648078376, + "learning_rate": 9.80687739258782e-07, + "loss": 0.205, + "step": 1821 + }, + { + "epoch": 0.12, + "grad_norm": 0.6639361373691804, + "learning_rate": 9.806593038669167e-07, + "loss": 0.3553, + "step": 1822 + }, + { + "epoch": 0.12, + "grad_norm": 0.20200816961047624, + "learning_rate": 9.806308479691594e-07, + "loss": 0.1893, + "step": 1823 + }, + { + "epoch": 0.12, + "grad_norm": 0.4058296945525068, + "learning_rate": 9.80602371566724e-07, + "loss": 0.3192, + "step": 1824 + }, + { + "epoch": 0.12, + "grad_norm": 0.31605906656818067, + "learning_rate": 9.805738746608251e-07, + "loss": 0.1119, + "step": 1825 + }, + { + "epoch": 0.12, + "grad_norm": 0.22778975778087518, + "learning_rate": 9.805453572526787e-07, + "loss": 0.0946, + "step": 1826 + }, + { + "epoch": 0.12, + "grad_norm": 0.4639766727675487, + "learning_rate": 9.805168193435014e-07, + "loss": 0.1883, + "step": 1827 + }, + { + "epoch": 0.12, + "grad_norm": 0.6688244519223763, + "learning_rate": 9.804882609345106e-07, + "loss": 0.0828, + "step": 1828 + }, + { + "epoch": 0.12, + "grad_norm": 0.4956690050397604, + "learning_rate": 9.804596820269249e-07, + "loss": 0.1604, + "step": 1829 + }, + { + "epoch": 0.12, + "grad_norm": 0.441950423118919, + "learning_rate": 9.80431082621963e-07, + "loss": 0.2277, + "step": 1830 + }, + { + "epoch": 0.12, + "grad_norm": 0.4657042253417374, + "learning_rate": 9.804024627208456e-07, + "loss": 0.1976, + "step": 1831 + }, + { + "epoch": 0.12, + "grad_norm": 0.4808679457578324, + "learning_rate": 9.80373822324793e-07, + "loss": 0.0222, + "step": 1832 + }, + { + "epoch": 0.12, + "grad_norm": 0.6024860333388573, + "learning_rate": 9.803451614350278e-07, + "loss": 0.1906, + "step": 1833 + }, + { + "epoch": 0.12, + "grad_norm": 0.37904837274505715, + "learning_rate": 9.803164800527723e-07, + "loss": 0.2039, + "step": 1834 + }, + { + "epoch": 0.12, + "grad_norm": 0.3253697459300525, + "learning_rate": 9.802877781792503e-07, + "loss": 0.1406, + "step": 1835 + }, + { + "epoch": 0.12, + "grad_norm": 0.3089051198714158, + "learning_rate": 9.802590558156861e-07, + "loss": 0.1815, + "step": 1836 + }, + { + "epoch": 0.12, + "grad_norm": 1.7715997059014499, + "learning_rate": 9.802303129633052e-07, + "loss": 0.3338, + "step": 1837 + }, + { + "epoch": 0.12, + "grad_norm": 0.9373057696770362, + "learning_rate": 9.802015496233336e-07, + "loss": 0.3327, + "step": 1838 + }, + { + "epoch": 0.12, + "grad_norm": 0.39383632011750946, + "learning_rate": 9.801727657969987e-07, + "loss": 0.1205, + "step": 1839 + }, + { + "epoch": 0.12, + "grad_norm": 0.4442621155258366, + "learning_rate": 9.801439614855285e-07, + "loss": 0.086, + "step": 1840 + }, + { + "epoch": 0.12, + "grad_norm": 0.36569334529732345, + "learning_rate": 9.801151366901514e-07, + "loss": 0.1447, + "step": 1841 + }, + { + "epoch": 0.12, + "grad_norm": 0.6557356390290756, + "learning_rate": 9.800862914120975e-07, + "loss": 0.3614, + "step": 1842 + }, + { + "epoch": 0.12, + "grad_norm": 0.2791723331236794, + "learning_rate": 9.800574256525974e-07, + "loss": 0.0677, + "step": 1843 + }, + { + "epoch": 0.12, + "grad_norm": 0.4177283348243584, + "learning_rate": 9.800285394128824e-07, + "loss": 0.1132, + "step": 1844 + }, + { + "epoch": 0.12, + "grad_norm": 0.5267707960046631, + "learning_rate": 9.799996326941849e-07, + "loss": 0.1599, + "step": 1845 + }, + { + "epoch": 0.12, + "grad_norm": 1.349914681365973, + "learning_rate": 9.799707054977382e-07, + "loss": 0.3591, + "step": 1846 + }, + { + "epoch": 0.12, + "grad_norm": 0.22452490998338034, + "learning_rate": 9.799417578247764e-07, + "loss": 0.1453, + "step": 1847 + }, + { + "epoch": 0.12, + "grad_norm": 0.6393751850329222, + "learning_rate": 9.799127896765344e-07, + "loss": 0.2642, + "step": 1848 + }, + { + "epoch": 0.12, + "grad_norm": 0.27700682909654584, + "learning_rate": 9.798838010542482e-07, + "loss": 0.1942, + "step": 1849 + }, + { + "epoch": 0.12, + "grad_norm": 0.8556120715818918, + "learning_rate": 9.79854791959154e-07, + "loss": 0.1704, + "step": 1850 + }, + { + "epoch": 0.12, + "grad_norm": 0.285872714361692, + "learning_rate": 9.798257623924899e-07, + "loss": 0.0514, + "step": 1851 + }, + { + "epoch": 0.12, + "grad_norm": 0.4804723649355166, + "learning_rate": 9.797967123554943e-07, + "loss": 0.2293, + "step": 1852 + }, + { + "epoch": 0.12, + "grad_norm": 0.8830097789972284, + "learning_rate": 9.797676418494063e-07, + "loss": 0.1981, + "step": 1853 + }, + { + "epoch": 0.12, + "grad_norm": 0.39143972961572615, + "learning_rate": 9.797385508754664e-07, + "loss": 0.1841, + "step": 1854 + }, + { + "epoch": 0.12, + "grad_norm": 0.5713067759027882, + "learning_rate": 9.797094394349152e-07, + "loss": 0.4405, + "step": 1855 + }, + { + "epoch": 0.12, + "grad_norm": 0.1961522534586305, + "learning_rate": 9.796803075289953e-07, + "loss": 0.1621, + "step": 1856 + }, + { + "epoch": 0.12, + "grad_norm": 0.39600435921384797, + "learning_rate": 9.79651155158949e-07, + "loss": 0.1969, + "step": 1857 + }, + { + "epoch": 0.12, + "grad_norm": 0.28276205103716984, + "learning_rate": 9.796219823260203e-07, + "loss": 0.1092, + "step": 1858 + }, + { + "epoch": 0.12, + "grad_norm": 0.18106232221884677, + "learning_rate": 9.795927890314536e-07, + "loss": 0.1051, + "step": 1859 + }, + { + "epoch": 0.12, + "grad_norm": 0.49736151366559794, + "learning_rate": 9.795635752764947e-07, + "loss": 0.2484, + "step": 1860 + }, + { + "epoch": 0.12, + "grad_norm": 0.6512118286875864, + "learning_rate": 9.795343410623893e-07, + "loss": 0.3593, + "step": 1861 + }, + { + "epoch": 0.12, + "grad_norm": 0.6993336560272034, + "learning_rate": 9.795050863903851e-07, + "loss": 0.1588, + "step": 1862 + }, + { + "epoch": 0.12, + "grad_norm": 0.7299234256592234, + "learning_rate": 9.7947581126173e-07, + "loss": 0.43, + "step": 1863 + }, + { + "epoch": 0.12, + "grad_norm": 0.8753700912961243, + "learning_rate": 9.794465156776727e-07, + "loss": 0.2569, + "step": 1864 + }, + { + "epoch": 0.12, + "grad_norm": 0.7229558812877958, + "learning_rate": 9.794171996394636e-07, + "loss": 0.4534, + "step": 1865 + }, + { + "epoch": 0.12, + "grad_norm": 0.3767583159896383, + "learning_rate": 9.793878631483528e-07, + "loss": 0.2364, + "step": 1866 + }, + { + "epoch": 0.12, + "grad_norm": 1.0080189319702062, + "learning_rate": 9.79358506205592e-07, + "loss": 0.2474, + "step": 1867 + }, + { + "epoch": 0.12, + "grad_norm": 0.32366932783785746, + "learning_rate": 9.793291288124339e-07, + "loss": 0.2354, + "step": 1868 + }, + { + "epoch": 0.12, + "grad_norm": 0.41794399068829174, + "learning_rate": 9.792997309701314e-07, + "loss": 0.2055, + "step": 1869 + }, + { + "epoch": 0.12, + "grad_norm": 0.8669801849769293, + "learning_rate": 9.79270312679939e-07, + "loss": 0.3206, + "step": 1870 + }, + { + "epoch": 0.12, + "grad_norm": 1.1628062491234834, + "learning_rate": 9.792408739431115e-07, + "loss": 0.1916, + "step": 1871 + }, + { + "epoch": 0.12, + "grad_norm": 0.22763899811533927, + "learning_rate": 9.79211414760905e-07, + "loss": 0.0115, + "step": 1872 + }, + { + "epoch": 0.12, + "grad_norm": 0.8178078855246522, + "learning_rate": 9.79181935134576e-07, + "loss": 0.2622, + "step": 1873 + }, + { + "epoch": 0.12, + "grad_norm": 0.5902380529764373, + "learning_rate": 9.791524350653825e-07, + "loss": 0.3348, + "step": 1874 + }, + { + "epoch": 0.12, + "grad_norm": 0.3034829566721241, + "learning_rate": 9.791229145545832e-07, + "loss": 0.163, + "step": 1875 + }, + { + "epoch": 0.12, + "grad_norm": 0.6596373558294532, + "learning_rate": 9.790933736034367e-07, + "loss": 0.2436, + "step": 1876 + }, + { + "epoch": 0.12, + "grad_norm": 1.2110218039827778, + "learning_rate": 9.790638122132042e-07, + "loss": 0.3134, + "step": 1877 + }, + { + "epoch": 0.12, + "grad_norm": 0.6319633247353574, + "learning_rate": 9.790342303851462e-07, + "loss": 0.6044, + "step": 1878 + }, + { + "epoch": 0.12, + "grad_norm": 0.678909030370214, + "learning_rate": 9.79004628120525e-07, + "loss": 0.3747, + "step": 1879 + }, + { + "epoch": 0.12, + "grad_norm": 0.31815956324995, + "learning_rate": 9.789750054206035e-07, + "loss": 0.1646, + "step": 1880 + }, + { + "epoch": 0.12, + "grad_norm": 0.8783055343424494, + "learning_rate": 9.789453622866453e-07, + "loss": 0.1787, + "step": 1881 + }, + { + "epoch": 0.12, + "grad_norm": 0.20925826497237446, + "learning_rate": 9.789156987199154e-07, + "loss": 0.0833, + "step": 1882 + }, + { + "epoch": 0.12, + "grad_norm": 0.5741813855041612, + "learning_rate": 9.788860147216788e-07, + "loss": 0.3773, + "step": 1883 + }, + { + "epoch": 0.12, + "grad_norm": 0.9213884749639869, + "learning_rate": 9.788563102932021e-07, + "loss": 0.2624, + "step": 1884 + }, + { + "epoch": 0.12, + "grad_norm": 0.4642293176264571, + "learning_rate": 9.788265854357527e-07, + "loss": 0.2079, + "step": 1885 + }, + { + "epoch": 0.12, + "grad_norm": 0.7433205464784334, + "learning_rate": 9.787968401505987e-07, + "loss": 0.2458, + "step": 1886 + }, + { + "epoch": 0.12, + "grad_norm": 1.3769902608715658, + "learning_rate": 9.787670744390088e-07, + "loss": 0.4972, + "step": 1887 + }, + { + "epoch": 0.12, + "grad_norm": 0.2847382639762413, + "learning_rate": 9.787372883022531e-07, + "loss": 0.2299, + "step": 1888 + }, + { + "epoch": 0.12, + "grad_norm": 0.4660825408621928, + "learning_rate": 9.787074817416023e-07, + "loss": 0.233, + "step": 1889 + }, + { + "epoch": 0.12, + "grad_norm": 0.8257495471447167, + "learning_rate": 9.786776547583281e-07, + "loss": 0.2907, + "step": 1890 + }, + { + "epoch": 0.12, + "grad_norm": 0.4428450110946486, + "learning_rate": 9.786478073537029e-07, + "loss": 0.1748, + "step": 1891 + }, + { + "epoch": 0.12, + "grad_norm": 0.9232552991844433, + "learning_rate": 9.78617939529e-07, + "loss": 0.0626, + "step": 1892 + }, + { + "epoch": 0.12, + "grad_norm": 1.3262947492739359, + "learning_rate": 9.785880512854935e-07, + "loss": 0.1915, + "step": 1893 + }, + { + "epoch": 0.12, + "grad_norm": 0.42104398880951505, + "learning_rate": 9.785581426244588e-07, + "loss": 0.0546, + "step": 1894 + }, + { + "epoch": 0.12, + "grad_norm": 0.4887469321417179, + "learning_rate": 9.785282135471718e-07, + "loss": 0.0486, + "step": 1895 + }, + { + "epoch": 0.12, + "grad_norm": 0.7994460322423196, + "learning_rate": 9.78498264054909e-07, + "loss": 0.1747, + "step": 1896 + }, + { + "epoch": 0.12, + "grad_norm": 0.43210748121813813, + "learning_rate": 9.784682941489484e-07, + "loss": 0.4854, + "step": 1897 + }, + { + "epoch": 0.12, + "grad_norm": 0.4616134580844671, + "learning_rate": 9.784383038305687e-07, + "loss": 0.2818, + "step": 1898 + }, + { + "epoch": 0.12, + "grad_norm": 0.4322884457982743, + "learning_rate": 9.78408293101049e-07, + "loss": 0.2974, + "step": 1899 + }, + { + "epoch": 0.12, + "grad_norm": 0.569075289211957, + "learning_rate": 9.7837826196167e-07, + "loss": 0.0971, + "step": 1900 + }, + { + "epoch": 0.12, + "grad_norm": 0.5319529424582372, + "learning_rate": 9.783482104137127e-07, + "loss": 0.1652, + "step": 1901 + }, + { + "epoch": 0.12, + "grad_norm": 0.41210358606979236, + "learning_rate": 9.783181384584589e-07, + "loss": 0.0598, + "step": 1902 + }, + { + "epoch": 0.12, + "grad_norm": 0.48852613679855594, + "learning_rate": 9.782880460971918e-07, + "loss": 0.2678, + "step": 1903 + }, + { + "epoch": 0.12, + "grad_norm": 0.6702462521981983, + "learning_rate": 9.782579333311954e-07, + "loss": 0.1188, + "step": 1904 + }, + { + "epoch": 0.12, + "grad_norm": 0.5641643685730272, + "learning_rate": 9.78227800161754e-07, + "loss": 0.039, + "step": 1905 + }, + { + "epoch": 0.12, + "grad_norm": 0.6205175895376341, + "learning_rate": 9.781976465901532e-07, + "loss": 0.288, + "step": 1906 + }, + { + "epoch": 0.12, + "grad_norm": 0.7442542022079933, + "learning_rate": 9.781674726176797e-07, + "loss": 0.1192, + "step": 1907 + }, + { + "epoch": 0.12, + "grad_norm": 0.4395414020640999, + "learning_rate": 9.781372782456204e-07, + "loss": 0.2025, + "step": 1908 + }, + { + "epoch": 0.12, + "grad_norm": 0.2712850025784868, + "learning_rate": 9.781070634752637e-07, + "loss": 0.2414, + "step": 1909 + }, + { + "epoch": 0.12, + "grad_norm": 0.459579176644862, + "learning_rate": 9.780768283078986e-07, + "loss": 0.1618, + "step": 1910 + }, + { + "epoch": 0.12, + "grad_norm": 0.37994028366968474, + "learning_rate": 9.780465727448149e-07, + "loss": 0.1511, + "step": 1911 + }, + { + "epoch": 0.12, + "grad_norm": 1.332826870108756, + "learning_rate": 9.780162967873034e-07, + "loss": 0.1548, + "step": 1912 + }, + { + "epoch": 0.12, + "grad_norm": 0.7704599714078209, + "learning_rate": 9.779860004366559e-07, + "loss": 0.4119, + "step": 1913 + }, + { + "epoch": 0.12, + "grad_norm": 0.6648900381526484, + "learning_rate": 9.779556836941645e-07, + "loss": 0.2656, + "step": 1914 + }, + { + "epoch": 0.12, + "grad_norm": 0.4220809764577071, + "learning_rate": 9.77925346561123e-07, + "loss": 0.2177, + "step": 1915 + }, + { + "epoch": 0.12, + "grad_norm": 0.6930813390204651, + "learning_rate": 9.778949890388254e-07, + "loss": 0.1372, + "step": 1916 + }, + { + "epoch": 0.12, + "grad_norm": 0.5281738830083382, + "learning_rate": 9.778646111285667e-07, + "loss": 0.1023, + "step": 1917 + }, + { + "epoch": 0.12, + "grad_norm": 0.14482613321771215, + "learning_rate": 9.778342128316432e-07, + "loss": 0.0347, + "step": 1918 + }, + { + "epoch": 0.12, + "grad_norm": 0.3616251398336747, + "learning_rate": 9.778037941493518e-07, + "loss": 0.1497, + "step": 1919 + }, + { + "epoch": 0.12, + "grad_norm": 2.3712443218442867, + "learning_rate": 9.7777335508299e-07, + "loss": 0.4352, + "step": 1920 + }, + { + "epoch": 0.12, + "grad_norm": 0.6481102215820237, + "learning_rate": 9.777428956338562e-07, + "loss": 0.2932, + "step": 1921 + }, + { + "epoch": 0.12, + "grad_norm": 0.6974521070601287, + "learning_rate": 9.777124158032502e-07, + "loss": 0.244, + "step": 1922 + }, + { + "epoch": 0.12, + "grad_norm": 0.8303239942182551, + "learning_rate": 9.776819155924724e-07, + "loss": 0.1638, + "step": 1923 + }, + { + "epoch": 0.12, + "grad_norm": 0.3098626885906418, + "learning_rate": 9.776513950028235e-07, + "loss": 0.2514, + "step": 1924 + }, + { + "epoch": 0.12, + "grad_norm": 0.6372159472920058, + "learning_rate": 9.77620854035606e-07, + "loss": 0.3313, + "step": 1925 + }, + { + "epoch": 0.12, + "grad_norm": 0.41749912640256737, + "learning_rate": 9.775902926921228e-07, + "loss": 0.1376, + "step": 1926 + }, + { + "epoch": 0.12, + "grad_norm": 0.7259535137665333, + "learning_rate": 9.775597109736774e-07, + "loss": 0.1202, + "step": 1927 + }, + { + "epoch": 0.12, + "grad_norm": 0.7724715954337328, + "learning_rate": 9.77529108881575e-07, + "loss": 0.3796, + "step": 1928 + }, + { + "epoch": 0.12, + "grad_norm": 0.36036089208488853, + "learning_rate": 9.77498486417121e-07, + "loss": 0.3772, + "step": 1929 + }, + { + "epoch": 0.12, + "grad_norm": 0.3278680317156285, + "learning_rate": 9.774678435816212e-07, + "loss": 0.3091, + "step": 1930 + }, + { + "epoch": 0.12, + "grad_norm": 0.8595227782650138, + "learning_rate": 9.774371803763837e-07, + "loss": 0.2001, + "step": 1931 + }, + { + "epoch": 0.12, + "grad_norm": 0.4584120445869711, + "learning_rate": 9.774064968027162e-07, + "loss": 0.0665, + "step": 1932 + }, + { + "epoch": 0.12, + "grad_norm": 0.3755435083092351, + "learning_rate": 9.77375792861928e-07, + "loss": 0.0687, + "step": 1933 + }, + { + "epoch": 0.12, + "grad_norm": 1.2137723759695154, + "learning_rate": 9.773450685553285e-07, + "loss": 0.1917, + "step": 1934 + }, + { + "epoch": 0.12, + "grad_norm": 0.9278592061036838, + "learning_rate": 9.773143238842293e-07, + "loss": 0.1079, + "step": 1935 + }, + { + "epoch": 0.12, + "grad_norm": 0.8340899317370496, + "learning_rate": 9.77283558849941e-07, + "loss": 0.1827, + "step": 1936 + }, + { + "epoch": 0.12, + "grad_norm": 0.3445475578282629, + "learning_rate": 9.77252773453777e-07, + "loss": 0.1261, + "step": 1937 + }, + { + "epoch": 0.12, + "grad_norm": 0.4587220861021094, + "learning_rate": 9.7722196769705e-07, + "loss": 0.2183, + "step": 1938 + }, + { + "epoch": 0.12, + "grad_norm": 0.5146792374457008, + "learning_rate": 9.771911415810746e-07, + "loss": 0.4111, + "step": 1939 + }, + { + "epoch": 0.12, + "grad_norm": 0.5514524419459433, + "learning_rate": 9.77160295107166e-07, + "loss": 0.1075, + "step": 1940 + }, + { + "epoch": 0.12, + "grad_norm": 1.2040956696373666, + "learning_rate": 9.771294282766399e-07, + "loss": 0.1792, + "step": 1941 + }, + { + "epoch": 0.12, + "grad_norm": 1.3809124391047485, + "learning_rate": 9.77098541090813e-07, + "loss": 0.155, + "step": 1942 + }, + { + "epoch": 0.12, + "grad_norm": 0.6677320796447382, + "learning_rate": 9.770676335510036e-07, + "loss": 0.0639, + "step": 1943 + }, + { + "epoch": 0.12, + "grad_norm": 0.6887584945356219, + "learning_rate": 9.770367056585298e-07, + "loss": 0.0538, + "step": 1944 + }, + { + "epoch": 0.12, + "grad_norm": 1.1499177396189217, + "learning_rate": 9.77005757414711e-07, + "loss": 0.042, + "step": 1945 + }, + { + "epoch": 0.12, + "grad_norm": 0.508774266944597, + "learning_rate": 9.769747888208678e-07, + "loss": 0.1313, + "step": 1946 + }, + { + "epoch": 0.12, + "grad_norm": 0.3653623836731963, + "learning_rate": 9.769437998783214e-07, + "loss": 0.2212, + "step": 1947 + }, + { + "epoch": 0.12, + "grad_norm": 0.2563068904160823, + "learning_rate": 9.769127905883937e-07, + "loss": 0.0084, + "step": 1948 + }, + { + "epoch": 0.12, + "grad_norm": 0.8803497788997626, + "learning_rate": 9.768817609524075e-07, + "loss": 0.2744, + "step": 1949 + }, + { + "epoch": 0.12, + "grad_norm": 0.4595795259897732, + "learning_rate": 9.768507109716868e-07, + "loss": 0.1341, + "step": 1950 + }, + { + "epoch": 0.12, + "grad_norm": 0.6557764103477656, + "learning_rate": 9.768196406475563e-07, + "loss": 0.3394, + "step": 1951 + }, + { + "epoch": 0.12, + "grad_norm": 0.6006556222801455, + "learning_rate": 9.76788549981341e-07, + "loss": 0.0615, + "step": 1952 + }, + { + "epoch": 0.12, + "grad_norm": 0.2228324990992887, + "learning_rate": 9.767574389743681e-07, + "loss": 0.1342, + "step": 1953 + }, + { + "epoch": 0.12, + "grad_norm": 0.7179146374553077, + "learning_rate": 9.767263076279643e-07, + "loss": 0.1997, + "step": 1954 + }, + { + "epoch": 0.12, + "grad_norm": 0.9693337983360613, + "learning_rate": 9.76695155943458e-07, + "loss": 0.112, + "step": 1955 + }, + { + "epoch": 0.12, + "grad_norm": 0.667522800453616, + "learning_rate": 9.76663983922178e-07, + "loss": 0.1727, + "step": 1956 + }, + { + "epoch": 0.12, + "grad_norm": 0.5507093670357065, + "learning_rate": 9.766327915654541e-07, + "loss": 0.1278, + "step": 1957 + }, + { + "epoch": 0.12, + "grad_norm": 0.6860862337370913, + "learning_rate": 9.766015788746173e-07, + "loss": 0.3761, + "step": 1958 + }, + { + "epoch": 0.12, + "grad_norm": 0.40121908942416445, + "learning_rate": 9.76570345850999e-07, + "loss": 0.1011, + "step": 1959 + }, + { + "epoch": 0.12, + "grad_norm": 0.411413968367731, + "learning_rate": 9.765390924959317e-07, + "loss": 0.368, + "step": 1960 + }, + { + "epoch": 0.13, + "grad_norm": 0.2786316838571697, + "learning_rate": 9.765078188107487e-07, + "loss": 0.1497, + "step": 1961 + }, + { + "epoch": 0.13, + "grad_norm": 1.059609911889508, + "learning_rate": 9.764765247967843e-07, + "loss": 0.4543, + "step": 1962 + }, + { + "epoch": 0.13, + "grad_norm": 1.1661323197525828, + "learning_rate": 9.764452104553736e-07, + "loss": 0.2113, + "step": 1963 + }, + { + "epoch": 0.13, + "grad_norm": 0.534199010122491, + "learning_rate": 9.764138757878524e-07, + "loss": 0.1222, + "step": 1964 + }, + { + "epoch": 0.13, + "grad_norm": 0.24197334755360245, + "learning_rate": 9.763825207955577e-07, + "loss": 0.027, + "step": 1965 + }, + { + "epoch": 0.13, + "grad_norm": 0.46587338170337794, + "learning_rate": 9.763511454798266e-07, + "loss": 0.478, + "step": 1966 + }, + { + "epoch": 0.13, + "grad_norm": 0.32889061524493196, + "learning_rate": 9.763197498419984e-07, + "loss": 0.1475, + "step": 1967 + }, + { + "epoch": 0.13, + "grad_norm": 0.42973472107561617, + "learning_rate": 9.76288333883412e-07, + "loss": 0.1379, + "step": 1968 + }, + { + "epoch": 0.13, + "grad_norm": 0.34327729390832834, + "learning_rate": 9.76256897605408e-07, + "loss": 0.1892, + "step": 1969 + }, + { + "epoch": 0.13, + "grad_norm": 0.8733412381690852, + "learning_rate": 9.76225441009327e-07, + "loss": 0.1151, + "step": 1970 + }, + { + "epoch": 0.13, + "grad_norm": 0.6990838412039561, + "learning_rate": 9.761939640965118e-07, + "loss": 0.2961, + "step": 1971 + }, + { + "epoch": 0.13, + "grad_norm": 1.103030258430781, + "learning_rate": 9.761624668683044e-07, + "loss": 0.184, + "step": 1972 + }, + { + "epoch": 0.13, + "grad_norm": 0.47537482875737563, + "learning_rate": 9.761309493260492e-07, + "loss": 0.2184, + "step": 1973 + }, + { + "epoch": 0.13, + "grad_norm": 0.8109631954245425, + "learning_rate": 9.760994114710904e-07, + "loss": 0.1935, + "step": 1974 + }, + { + "epoch": 0.13, + "grad_norm": 0.7905566985070605, + "learning_rate": 9.76067853304774e-07, + "loss": 0.1877, + "step": 1975 + }, + { + "epoch": 0.13, + "grad_norm": 0.31555085957082785, + "learning_rate": 9.760362748284456e-07, + "loss": 0.1956, + "step": 1976 + }, + { + "epoch": 0.13, + "grad_norm": 2.145092587492478, + "learning_rate": 9.760046760434529e-07, + "loss": 0.485, + "step": 1977 + }, + { + "epoch": 0.13, + "grad_norm": 0.7701666012878458, + "learning_rate": 9.759730569511438e-07, + "loss": 0.3231, + "step": 1978 + }, + { + "epoch": 0.13, + "grad_norm": 0.7721610642616265, + "learning_rate": 9.759414175528671e-07, + "loss": 0.189, + "step": 1979 + }, + { + "epoch": 0.13, + "grad_norm": 1.0110930986742899, + "learning_rate": 9.75909757849973e-07, + "loss": 0.1744, + "step": 1980 + }, + { + "epoch": 0.13, + "grad_norm": 0.6197373928767853, + "learning_rate": 9.758780778438117e-07, + "loss": 0.0371, + "step": 1981 + }, + { + "epoch": 0.13, + "grad_norm": 0.9999683879013466, + "learning_rate": 9.758463775357352e-07, + "loss": 0.133, + "step": 1982 + }, + { + "epoch": 0.13, + "grad_norm": 0.5769867949119858, + "learning_rate": 9.758146569270956e-07, + "loss": 0.0779, + "step": 1983 + }, + { + "epoch": 0.13, + "grad_norm": 0.5880562635464869, + "learning_rate": 9.757829160192462e-07, + "loss": 0.2503, + "step": 1984 + }, + { + "epoch": 0.13, + "grad_norm": 0.3987440069337375, + "learning_rate": 9.757511548135411e-07, + "loss": 0.1259, + "step": 1985 + }, + { + "epoch": 0.13, + "grad_norm": 0.4624906612642204, + "learning_rate": 9.757193733113355e-07, + "loss": 0.141, + "step": 1986 + }, + { + "epoch": 0.13, + "grad_norm": 0.45979987576149456, + "learning_rate": 9.75687571513985e-07, + "loss": 0.0776, + "step": 1987 + }, + { + "epoch": 0.13, + "grad_norm": 0.5147345228133103, + "learning_rate": 9.756557494228464e-07, + "loss": 0.1651, + "step": 1988 + }, + { + "epoch": 0.13, + "grad_norm": 0.6293209370364992, + "learning_rate": 9.756239070392776e-07, + "loss": 0.1009, + "step": 1989 + }, + { + "epoch": 0.13, + "grad_norm": 0.5651852931621907, + "learning_rate": 9.755920443646364e-07, + "loss": 0.3339, + "step": 1990 + }, + { + "epoch": 0.13, + "grad_norm": 0.9320924431987775, + "learning_rate": 9.755601614002828e-07, + "loss": 0.0483, + "step": 1991 + }, + { + "epoch": 0.13, + "grad_norm": 0.8842636608675487, + "learning_rate": 9.755282581475767e-07, + "loss": 0.2291, + "step": 1992 + }, + { + "epoch": 0.13, + "grad_norm": 0.43139096736901833, + "learning_rate": 9.754963346078792e-07, + "loss": 0.1092, + "step": 1993 + }, + { + "epoch": 0.13, + "grad_norm": 0.6004594186436094, + "learning_rate": 9.754643907825522e-07, + "loss": 0.2012, + "step": 1994 + }, + { + "epoch": 0.13, + "grad_norm": 0.7253309011594199, + "learning_rate": 9.754324266729583e-07, + "loss": 0.5189, + "step": 1995 + }, + { + "epoch": 0.13, + "grad_norm": 0.26812043890197435, + "learning_rate": 9.754004422804616e-07, + "loss": 0.1146, + "step": 1996 + }, + { + "epoch": 0.13, + "grad_norm": 0.6805844637181703, + "learning_rate": 9.753684376064262e-07, + "loss": 0.317, + "step": 1997 + }, + { + "epoch": 0.13, + "grad_norm": 0.9486966659565942, + "learning_rate": 9.753364126522177e-07, + "loss": 0.4689, + "step": 1998 + }, + { + "epoch": 0.13, + "grad_norm": 0.5718265247519756, + "learning_rate": 9.753043674192022e-07, + "loss": 0.2353, + "step": 1999 + }, + { + "epoch": 0.13, + "grad_norm": 0.8737459338518712, + "learning_rate": 9.75272301908747e-07, + "loss": 0.2111, + "step": 2000 + }, + { + "epoch": 0.13, + "grad_norm": 0.39156741362398484, + "learning_rate": 9.7524021612222e-07, + "loss": 0.201, + "step": 2001 + }, + { + "epoch": 0.13, + "grad_norm": 0.7781864891728763, + "learning_rate": 9.7520811006099e-07, + "loss": 0.0336, + "step": 2002 + }, + { + "epoch": 0.13, + "grad_norm": 0.5638928565719611, + "learning_rate": 9.751759837264267e-07, + "loss": 0.2304, + "step": 2003 + }, + { + "epoch": 0.13, + "grad_norm": 0.7209768030488014, + "learning_rate": 9.751438371199006e-07, + "loss": 0.3343, + "step": 2004 + }, + { + "epoch": 0.13, + "grad_norm": 0.519758722621288, + "learning_rate": 9.751116702427833e-07, + "loss": 0.332, + "step": 2005 + }, + { + "epoch": 0.13, + "grad_norm": 0.7667403647507963, + "learning_rate": 9.750794830964472e-07, + "loss": 0.205, + "step": 2006 + }, + { + "epoch": 0.13, + "grad_norm": 0.4957310847385013, + "learning_rate": 9.750472756822652e-07, + "loss": 0.2274, + "step": 2007 + }, + { + "epoch": 0.13, + "grad_norm": 0.29985426137213894, + "learning_rate": 9.750150480016114e-07, + "loss": 0.1888, + "step": 2008 + }, + { + "epoch": 0.13, + "grad_norm": 0.4795539366511488, + "learning_rate": 9.74982800055861e-07, + "loss": 0.2566, + "step": 2009 + }, + { + "epoch": 0.13, + "grad_norm": 0.6106050578516774, + "learning_rate": 9.749505318463894e-07, + "loss": 0.2971, + "step": 2010 + }, + { + "epoch": 0.13, + "grad_norm": 0.3604109131880696, + "learning_rate": 9.749182433745732e-07, + "loss": 0.1832, + "step": 2011 + }, + { + "epoch": 0.13, + "grad_norm": 0.7558736410748353, + "learning_rate": 9.7488593464179e-07, + "loss": 0.3393, + "step": 2012 + }, + { + "epoch": 0.13, + "grad_norm": 0.515464229619306, + "learning_rate": 9.748536056494186e-07, + "loss": 0.238, + "step": 2013 + }, + { + "epoch": 0.13, + "grad_norm": 0.7572105402428722, + "learning_rate": 9.748212563988375e-07, + "loss": 0.1486, + "step": 2014 + }, + { + "epoch": 0.13, + "grad_norm": 0.6228214472009372, + "learning_rate": 9.74788886891427e-07, + "loss": 0.1377, + "step": 2015 + }, + { + "epoch": 0.13, + "grad_norm": 1.5848550671694532, + "learning_rate": 9.747564971285684e-07, + "loss": 0.2804, + "step": 2016 + }, + { + "epoch": 0.13, + "grad_norm": 0.6466063998668526, + "learning_rate": 9.747240871116432e-07, + "loss": 0.2071, + "step": 2017 + }, + { + "epoch": 0.13, + "grad_norm": 0.6349211533366779, + "learning_rate": 9.74691656842034e-07, + "loss": 0.1981, + "step": 2018 + }, + { + "epoch": 0.13, + "grad_norm": 0.3721453325045769, + "learning_rate": 9.746592063211246e-07, + "loss": 0.2082, + "step": 2019 + }, + { + "epoch": 0.13, + "grad_norm": 0.15608617273693193, + "learning_rate": 9.746267355502991e-07, + "loss": 0.0133, + "step": 2020 + }, + { + "epoch": 0.13, + "grad_norm": 0.6938914285732772, + "learning_rate": 9.74594244530943e-07, + "loss": 0.1737, + "step": 2021 + }, + { + "epoch": 0.13, + "grad_norm": 0.3990814093962295, + "learning_rate": 9.745617332644424e-07, + "loss": 0.2083, + "step": 2022 + }, + { + "epoch": 0.13, + "grad_norm": 0.3651754782509643, + "learning_rate": 9.745292017521842e-07, + "loss": 0.2143, + "step": 2023 + }, + { + "epoch": 0.13, + "grad_norm": 0.4697368479888514, + "learning_rate": 9.744966499955565e-07, + "loss": 0.2256, + "step": 2024 + }, + { + "epoch": 0.13, + "grad_norm": 0.43175384708877673, + "learning_rate": 9.744640779959477e-07, + "loss": 0.3387, + "step": 2025 + }, + { + "epoch": 0.13, + "grad_norm": 0.47758884710700433, + "learning_rate": 9.744314857547476e-07, + "loss": 0.1533, + "step": 2026 + }, + { + "epoch": 0.13, + "grad_norm": 0.9890250844300368, + "learning_rate": 9.743988732733466e-07, + "loss": 0.2262, + "step": 2027 + }, + { + "epoch": 0.13, + "grad_norm": 0.2470302772005145, + "learning_rate": 9.743662405531359e-07, + "loss": 0.0653, + "step": 2028 + }, + { + "epoch": 0.13, + "grad_norm": 1.0875355284708483, + "learning_rate": 9.74333587595508e-07, + "loss": 0.2291, + "step": 2029 + }, + { + "epoch": 0.13, + "grad_norm": 0.753026424832712, + "learning_rate": 9.743009144018556e-07, + "loss": 0.3202, + "step": 2030 + }, + { + "epoch": 0.13, + "grad_norm": 0.7075134086327713, + "learning_rate": 9.742682209735727e-07, + "loss": 0.1801, + "step": 2031 + }, + { + "epoch": 0.13, + "grad_norm": 0.4536577664712018, + "learning_rate": 9.742355073120542e-07, + "loss": 0.2078, + "step": 2032 + }, + { + "epoch": 0.13, + "grad_norm": 0.8891863029578857, + "learning_rate": 9.742027734186955e-07, + "loss": 0.3736, + "step": 2033 + }, + { + "epoch": 0.13, + "grad_norm": 0.38843643792649585, + "learning_rate": 9.741700192948934e-07, + "loss": 0.1576, + "step": 2034 + }, + { + "epoch": 0.13, + "grad_norm": 0.4305876125032848, + "learning_rate": 9.741372449420448e-07, + "loss": 0.3007, + "step": 2035 + }, + { + "epoch": 0.13, + "grad_norm": 0.8447349856615494, + "learning_rate": 9.741044503615484e-07, + "loss": 0.2363, + "step": 2036 + }, + { + "epoch": 0.13, + "grad_norm": 1.066234023029225, + "learning_rate": 9.740716355548028e-07, + "loss": 0.3245, + "step": 2037 + }, + { + "epoch": 0.13, + "grad_norm": 0.49150715679904, + "learning_rate": 9.740388005232085e-07, + "loss": 0.0474, + "step": 2038 + }, + { + "epoch": 0.13, + "grad_norm": 0.41928467630095934, + "learning_rate": 9.74005945268166e-07, + "loss": 0.1455, + "step": 2039 + }, + { + "epoch": 0.13, + "grad_norm": 0.42009305044203954, + "learning_rate": 9.73973069791077e-07, + "loss": 0.0243, + "step": 2040 + }, + { + "epoch": 0.13, + "grad_norm": 0.39566816871536853, + "learning_rate": 9.73940174093344e-07, + "loss": 0.1708, + "step": 2041 + }, + { + "epoch": 0.13, + "grad_norm": 0.5963582248153219, + "learning_rate": 9.739072581763704e-07, + "loss": 0.04, + "step": 2042 + }, + { + "epoch": 0.13, + "grad_norm": 1.0059675591387007, + "learning_rate": 9.738743220415607e-07, + "loss": 0.2017, + "step": 2043 + }, + { + "epoch": 0.13, + "grad_norm": 0.46461982600549445, + "learning_rate": 9.738413656903197e-07, + "loss": 0.2319, + "step": 2044 + }, + { + "epoch": 0.13, + "grad_norm": 1.8383189109340576, + "learning_rate": 9.738083891240534e-07, + "loss": 0.2545, + "step": 2045 + }, + { + "epoch": 0.13, + "grad_norm": 0.5723536110817462, + "learning_rate": 9.737753923441687e-07, + "loss": 0.2316, + "step": 2046 + }, + { + "epoch": 0.13, + "grad_norm": 0.5474228215810266, + "learning_rate": 9.737423753520734e-07, + "loss": 0.1426, + "step": 2047 + }, + { + "epoch": 0.13, + "grad_norm": 0.48171784417932656, + "learning_rate": 9.737093381491761e-07, + "loss": 0.2324, + "step": 2048 + }, + { + "epoch": 0.13, + "grad_norm": 0.3155652055117869, + "learning_rate": 9.73676280736886e-07, + "loss": 0.2159, + "step": 2049 + }, + { + "epoch": 0.13, + "grad_norm": 0.8692302547729482, + "learning_rate": 9.736432031166138e-07, + "loss": 0.2041, + "step": 2050 + }, + { + "epoch": 0.13, + "grad_norm": 0.4674316385906084, + "learning_rate": 9.736101052897704e-07, + "loss": 0.3144, + "step": 2051 + }, + { + "epoch": 0.13, + "grad_norm": 0.7351636609704858, + "learning_rate": 9.735769872577677e-07, + "loss": 0.0644, + "step": 2052 + }, + { + "epoch": 0.13, + "grad_norm": 0.4570046401607926, + "learning_rate": 9.735438490220186e-07, + "loss": 0.022, + "step": 2053 + }, + { + "epoch": 0.13, + "grad_norm": 0.8104489250680039, + "learning_rate": 9.735106905839372e-07, + "loss": 0.2011, + "step": 2054 + }, + { + "epoch": 0.13, + "grad_norm": 0.4759229744029291, + "learning_rate": 9.734775119449378e-07, + "loss": 0.2255, + "step": 2055 + }, + { + "epoch": 0.13, + "grad_norm": 0.594143220537935, + "learning_rate": 9.73444313106436e-07, + "loss": 0.1352, + "step": 2056 + }, + { + "epoch": 0.13, + "grad_norm": 0.6073301301142678, + "learning_rate": 9.73411094069848e-07, + "loss": 0.1248, + "step": 2057 + }, + { + "epoch": 0.13, + "grad_norm": 0.5823100815634501, + "learning_rate": 9.73377854836591e-07, + "loss": 0.2456, + "step": 2058 + }, + { + "epoch": 0.13, + "grad_norm": 0.7276842922878295, + "learning_rate": 9.73344595408083e-07, + "loss": 0.2906, + "step": 2059 + }, + { + "epoch": 0.13, + "grad_norm": 0.4718660706642817, + "learning_rate": 9.733113157857433e-07, + "loss": 0.1925, + "step": 2060 + }, + { + "epoch": 0.13, + "grad_norm": 0.3890053909921082, + "learning_rate": 9.732780159709912e-07, + "loss": 0.0667, + "step": 2061 + }, + { + "epoch": 0.13, + "grad_norm": 0.5305031062772791, + "learning_rate": 9.732446959652475e-07, + "loss": 0.1155, + "step": 2062 + }, + { + "epoch": 0.13, + "grad_norm": 0.7979789520933933, + "learning_rate": 9.732113557699337e-07, + "loss": 0.0327, + "step": 2063 + }, + { + "epoch": 0.13, + "grad_norm": 0.34268174794172396, + "learning_rate": 9.731779953864723e-07, + "loss": 0.0952, + "step": 2064 + }, + { + "epoch": 0.13, + "grad_norm": 0.6587714619738644, + "learning_rate": 9.731446148162866e-07, + "loss": 0.1926, + "step": 2065 + }, + { + "epoch": 0.13, + "grad_norm": 0.5532658298962123, + "learning_rate": 9.731112140608003e-07, + "loss": 0.19, + "step": 2066 + }, + { + "epoch": 0.13, + "grad_norm": 1.2003756135881918, + "learning_rate": 9.730777931214383e-07, + "loss": 0.4008, + "step": 2067 + }, + { + "epoch": 0.13, + "grad_norm": 0.6556766157080656, + "learning_rate": 9.730443519996269e-07, + "loss": 0.2006, + "step": 2068 + }, + { + "epoch": 0.13, + "grad_norm": 0.899811363077273, + "learning_rate": 9.730108906967923e-07, + "loss": 0.2034, + "step": 2069 + }, + { + "epoch": 0.13, + "grad_norm": 1.1664057054531014, + "learning_rate": 9.729774092143626e-07, + "loss": 0.1783, + "step": 2070 + }, + { + "epoch": 0.13, + "grad_norm": 0.18792340268741953, + "learning_rate": 9.729439075537655e-07, + "loss": 0.099, + "step": 2071 + }, + { + "epoch": 0.13, + "grad_norm": 0.3933597350424269, + "learning_rate": 9.729103857164308e-07, + "loss": 0.1759, + "step": 2072 + }, + { + "epoch": 0.13, + "grad_norm": 0.8924769614688599, + "learning_rate": 9.728768437037882e-07, + "loss": 0.1931, + "step": 2073 + }, + { + "epoch": 0.13, + "grad_norm": 0.7779664685121285, + "learning_rate": 9.728432815172688e-07, + "loss": 0.1541, + "step": 2074 + }, + { + "epoch": 0.13, + "grad_norm": 0.5050616246557152, + "learning_rate": 9.728096991583047e-07, + "loss": 0.2209, + "step": 2075 + }, + { + "epoch": 0.13, + "grad_norm": 0.9552024925810386, + "learning_rate": 9.727760966283283e-07, + "loss": 0.1533, + "step": 2076 + }, + { + "epoch": 0.13, + "grad_norm": 1.2126531806060135, + "learning_rate": 9.727424739287731e-07, + "loss": 0.2265, + "step": 2077 + }, + { + "epoch": 0.13, + "grad_norm": 0.2747233731372835, + "learning_rate": 9.727088310610738e-07, + "loss": 0.1232, + "step": 2078 + }, + { + "epoch": 0.13, + "grad_norm": 0.6592036867883576, + "learning_rate": 9.726751680266652e-07, + "loss": 0.3436, + "step": 2079 + }, + { + "epoch": 0.13, + "grad_norm": 0.6027860660602337, + "learning_rate": 9.72641484826984e-07, + "loss": 0.2032, + "step": 2080 + }, + { + "epoch": 0.13, + "grad_norm": 0.7120638297149764, + "learning_rate": 9.72607781463467e-07, + "loss": 0.39, + "step": 2081 + }, + { + "epoch": 0.13, + "grad_norm": 0.7869806889949933, + "learning_rate": 9.725740579375516e-07, + "loss": 0.1947, + "step": 2082 + }, + { + "epoch": 0.13, + "grad_norm": 0.7442396837728845, + "learning_rate": 9.725403142506772e-07, + "loss": 0.0312, + "step": 2083 + }, + { + "epoch": 0.13, + "grad_norm": 1.2172920833957057, + "learning_rate": 9.725065504042832e-07, + "loss": 0.2183, + "step": 2084 + }, + { + "epoch": 0.13, + "grad_norm": 0.2403537132885519, + "learning_rate": 9.724727663998096e-07, + "loss": 0.0849, + "step": 2085 + }, + { + "epoch": 0.13, + "grad_norm": 0.4990650843509188, + "learning_rate": 9.724389622386982e-07, + "loss": 0.19, + "step": 2086 + }, + { + "epoch": 0.13, + "grad_norm": 1.4999932947105477, + "learning_rate": 9.724051379223908e-07, + "loss": 0.4414, + "step": 2087 + }, + { + "epoch": 0.13, + "grad_norm": 0.9431875913326456, + "learning_rate": 9.723712934523306e-07, + "loss": 0.2161, + "step": 2088 + }, + { + "epoch": 0.13, + "grad_norm": 0.9023665974170813, + "learning_rate": 9.723374288299614e-07, + "loss": 0.3656, + "step": 2089 + }, + { + "epoch": 0.13, + "grad_norm": 0.21688367093736713, + "learning_rate": 9.72303544056728e-07, + "loss": 0.1726, + "step": 2090 + }, + { + "epoch": 0.13, + "grad_norm": 0.5682566962497361, + "learning_rate": 9.72269639134076e-07, + "loss": 0.2856, + "step": 2091 + }, + { + "epoch": 0.13, + "grad_norm": 0.24899213121243702, + "learning_rate": 9.722357140634518e-07, + "loss": 0.0443, + "step": 2092 + }, + { + "epoch": 0.13, + "grad_norm": 0.8775025550605597, + "learning_rate": 9.722017688463026e-07, + "loss": 0.2971, + "step": 2093 + }, + { + "epoch": 0.13, + "grad_norm": 0.4040510763057725, + "learning_rate": 9.72167803484077e-07, + "loss": 0.1474, + "step": 2094 + }, + { + "epoch": 0.13, + "grad_norm": 0.5417357806702155, + "learning_rate": 9.721338179782235e-07, + "loss": 0.1652, + "step": 2095 + }, + { + "epoch": 0.13, + "grad_norm": 0.38910553878469045, + "learning_rate": 9.720998123301922e-07, + "loss": 0.1546, + "step": 2096 + }, + { + "epoch": 0.13, + "grad_norm": 0.7782408360767257, + "learning_rate": 9.720657865414338e-07, + "loss": 0.1887, + "step": 2097 + }, + { + "epoch": 0.13, + "grad_norm": 0.8654013443872984, + "learning_rate": 9.720317406134002e-07, + "loss": 0.1594, + "step": 2098 + }, + { + "epoch": 0.13, + "grad_norm": 0.6049097490226051, + "learning_rate": 9.719976745475435e-07, + "loss": 0.1257, + "step": 2099 + }, + { + "epoch": 0.13, + "grad_norm": 1.2215042007525334, + "learning_rate": 9.719635883453174e-07, + "loss": 0.2492, + "step": 2100 + }, + { + "epoch": 0.13, + "grad_norm": 0.6352060577546463, + "learning_rate": 9.719294820081755e-07, + "loss": 0.1563, + "step": 2101 + }, + { + "epoch": 0.13, + "grad_norm": 0.627771964044561, + "learning_rate": 9.718953555375734e-07, + "loss": 0.1785, + "step": 2102 + }, + { + "epoch": 0.13, + "grad_norm": 0.9944669068364482, + "learning_rate": 9.718612089349669e-07, + "loss": 0.2305, + "step": 2103 + }, + { + "epoch": 0.13, + "grad_norm": 0.5532890803846539, + "learning_rate": 9.718270422018124e-07, + "loss": 0.1413, + "step": 2104 + }, + { + "epoch": 0.13, + "grad_norm": 0.6236204924539288, + "learning_rate": 9.71792855339568e-07, + "loss": 0.0515, + "step": 2105 + }, + { + "epoch": 0.13, + "grad_norm": 0.7052950318639188, + "learning_rate": 9.71758648349692e-07, + "loss": 0.203, + "step": 2106 + }, + { + "epoch": 0.13, + "grad_norm": 1.42809166385761, + "learning_rate": 9.717244212336436e-07, + "loss": 0.0337, + "step": 2107 + }, + { + "epoch": 0.13, + "grad_norm": 0.41174206684281756, + "learning_rate": 9.716901739928831e-07, + "loss": 0.1834, + "step": 2108 + }, + { + "epoch": 0.13, + "grad_norm": 0.665161076998366, + "learning_rate": 9.716559066288714e-07, + "loss": 0.2682, + "step": 2109 + }, + { + "epoch": 0.13, + "grad_norm": 0.4224887983642203, + "learning_rate": 9.716216191430708e-07, + "loss": 0.1781, + "step": 2110 + }, + { + "epoch": 0.13, + "grad_norm": 0.2939570486585365, + "learning_rate": 9.715873115369439e-07, + "loss": 0.1703, + "step": 2111 + }, + { + "epoch": 0.13, + "grad_norm": 0.6171708726170932, + "learning_rate": 9.71552983811954e-07, + "loss": 0.1676, + "step": 2112 + }, + { + "epoch": 0.13, + "grad_norm": 0.8375808528722852, + "learning_rate": 9.71518635969566e-07, + "loss": 0.1593, + "step": 2113 + }, + { + "epoch": 0.13, + "grad_norm": 0.5639128985805464, + "learning_rate": 9.714842680112455e-07, + "loss": 0.3167, + "step": 2114 + }, + { + "epoch": 0.13, + "grad_norm": 0.550662219443129, + "learning_rate": 9.714498799384578e-07, + "loss": 0.2902, + "step": 2115 + }, + { + "epoch": 0.13, + "grad_norm": 0.5102697864545089, + "learning_rate": 9.714154717526708e-07, + "loss": 0.2088, + "step": 2116 + }, + { + "epoch": 0.14, + "grad_norm": 0.5303789548006846, + "learning_rate": 9.713810434553519e-07, + "loss": 0.1205, + "step": 2117 + }, + { + "epoch": 0.14, + "grad_norm": 0.42365031034017475, + "learning_rate": 9.713465950479703e-07, + "loss": 0.1049, + "step": 2118 + }, + { + "epoch": 0.14, + "grad_norm": 0.31375022282377896, + "learning_rate": 9.713121265319952e-07, + "loss": 0.2105, + "step": 2119 + }, + { + "epoch": 0.14, + "grad_norm": 0.44697664741525117, + "learning_rate": 9.712776379088975e-07, + "loss": 0.2007, + "step": 2120 + }, + { + "epoch": 0.14, + "grad_norm": 0.63302274232392, + "learning_rate": 9.712431291801482e-07, + "loss": 0.2584, + "step": 2121 + }, + { + "epoch": 0.14, + "grad_norm": 0.9660387719003406, + "learning_rate": 9.7120860034722e-07, + "loss": 0.1291, + "step": 2122 + }, + { + "epoch": 0.14, + "grad_norm": 1.706858797654844, + "learning_rate": 9.711740514115853e-07, + "loss": 0.1969, + "step": 2123 + }, + { + "epoch": 0.14, + "grad_norm": 0.4814523309403148, + "learning_rate": 9.711394823747185e-07, + "loss": 0.1819, + "step": 2124 + }, + { + "epoch": 0.14, + "grad_norm": 0.36670555841369007, + "learning_rate": 9.711048932380944e-07, + "loss": 0.1673, + "step": 2125 + }, + { + "epoch": 0.14, + "grad_norm": 0.6323553144994762, + "learning_rate": 9.710702840031885e-07, + "loss": 0.4136, + "step": 2126 + }, + { + "epoch": 0.14, + "grad_norm": 0.5761880647757363, + "learning_rate": 9.710356546714772e-07, + "loss": 0.1918, + "step": 2127 + }, + { + "epoch": 0.14, + "grad_norm": 0.49994957241813537, + "learning_rate": 9.71001005244438e-07, + "loss": 0.1708, + "step": 2128 + }, + { + "epoch": 0.14, + "grad_norm": 0.6593229239558963, + "learning_rate": 9.709663357235492e-07, + "loss": 0.1929, + "step": 2129 + }, + { + "epoch": 0.14, + "grad_norm": 0.608489529045219, + "learning_rate": 9.709316461102897e-07, + "loss": 0.125, + "step": 2130 + }, + { + "epoch": 0.14, + "grad_norm": 1.468802224090361, + "learning_rate": 9.708969364061394e-07, + "loss": 0.2931, + "step": 2131 + }, + { + "epoch": 0.14, + "grad_norm": 0.49492412495848126, + "learning_rate": 9.708622066125793e-07, + "loss": 0.2214, + "step": 2132 + }, + { + "epoch": 0.14, + "grad_norm": 0.18891886191960253, + "learning_rate": 9.708274567310908e-07, + "loss": 0.0844, + "step": 2133 + }, + { + "epoch": 0.14, + "grad_norm": 0.7454011611673332, + "learning_rate": 9.707926867631566e-07, + "loss": 0.211, + "step": 2134 + }, + { + "epoch": 0.14, + "grad_norm": 0.6777257527568694, + "learning_rate": 9.707578967102598e-07, + "loss": 0.2128, + "step": 2135 + }, + { + "epoch": 0.14, + "grad_norm": 0.7101565817484278, + "learning_rate": 9.70723086573885e-07, + "loss": 0.0073, + "step": 2136 + }, + { + "epoch": 0.14, + "grad_norm": 0.5642559087375111, + "learning_rate": 9.70688256355517e-07, + "loss": 0.0641, + "step": 2137 + }, + { + "epoch": 0.14, + "grad_norm": 0.44727627988655927, + "learning_rate": 9.706534060566418e-07, + "loss": 0.1266, + "step": 2138 + }, + { + "epoch": 0.14, + "grad_norm": 0.27122243561276915, + "learning_rate": 9.70618535678746e-07, + "loss": 0.0889, + "step": 2139 + }, + { + "epoch": 0.14, + "grad_norm": 1.1558674339226116, + "learning_rate": 9.705836452233174e-07, + "loss": 0.4229, + "step": 2140 + }, + { + "epoch": 0.14, + "grad_norm": 0.36806759305293124, + "learning_rate": 9.705487346918447e-07, + "loss": 0.3808, + "step": 2141 + }, + { + "epoch": 0.14, + "grad_norm": 0.6306939368757434, + "learning_rate": 9.705138040858169e-07, + "loss": 0.0952, + "step": 2142 + }, + { + "epoch": 0.14, + "grad_norm": 1.2239928866755898, + "learning_rate": 9.704788534067246e-07, + "loss": 0.2595, + "step": 2143 + }, + { + "epoch": 0.14, + "grad_norm": 0.6745385669592355, + "learning_rate": 9.704438826560584e-07, + "loss": 0.1144, + "step": 2144 + }, + { + "epoch": 0.14, + "grad_norm": 0.8328907271771963, + "learning_rate": 9.704088918353107e-07, + "loss": 0.0156, + "step": 2145 + }, + { + "epoch": 0.14, + "grad_norm": 0.9253891868737265, + "learning_rate": 9.703738809459738e-07, + "loss": 0.1115, + "step": 2146 + }, + { + "epoch": 0.14, + "grad_norm": 0.8604095819465852, + "learning_rate": 9.703388499895414e-07, + "loss": 0.3714, + "step": 2147 + }, + { + "epoch": 0.14, + "grad_norm": 0.5903399009294115, + "learning_rate": 9.703037989675086e-07, + "loss": 0.2533, + "step": 2148 + }, + { + "epoch": 0.14, + "grad_norm": 0.6681768886008498, + "learning_rate": 9.7026872788137e-07, + "loss": 0.1211, + "step": 2149 + }, + { + "epoch": 0.14, + "grad_norm": 0.9656416768461943, + "learning_rate": 9.702336367326222e-07, + "loss": 0.3824, + "step": 2150 + }, + { + "epoch": 0.14, + "grad_norm": 0.8419538800878464, + "learning_rate": 9.701985255227624e-07, + "loss": 0.1727, + "step": 2151 + }, + { + "epoch": 0.14, + "grad_norm": 0.6389175782612608, + "learning_rate": 9.701633942532879e-07, + "loss": 0.3651, + "step": 2152 + }, + { + "epoch": 0.14, + "grad_norm": 0.5482342468742762, + "learning_rate": 9.701282429256982e-07, + "loss": 0.175, + "step": 2153 + }, + { + "epoch": 0.14, + "grad_norm": 0.9234820789636007, + "learning_rate": 9.700930715414923e-07, + "loss": 0.1268, + "step": 2154 + }, + { + "epoch": 0.14, + "grad_norm": 0.4561882955178448, + "learning_rate": 9.70057880102171e-07, + "loss": 0.2099, + "step": 2155 + }, + { + "epoch": 0.14, + "grad_norm": 0.6178272401758675, + "learning_rate": 9.700226686092357e-07, + "loss": 0.2364, + "step": 2156 + }, + { + "epoch": 0.14, + "grad_norm": 0.3762372836218014, + "learning_rate": 9.699874370641885e-07, + "loss": 0.1661, + "step": 2157 + }, + { + "epoch": 0.14, + "grad_norm": 0.7340596125330184, + "learning_rate": 9.699521854685324e-07, + "loss": 0.4084, + "step": 2158 + }, + { + "epoch": 0.14, + "grad_norm": 0.4741231741355246, + "learning_rate": 9.699169138237714e-07, + "loss": 0.2054, + "step": 2159 + }, + { + "epoch": 0.14, + "grad_norm": 0.7000852701286417, + "learning_rate": 9.6988162213141e-07, + "loss": 0.1913, + "step": 2160 + }, + { + "epoch": 0.14, + "grad_norm": 0.8700922457417857, + "learning_rate": 9.698463103929541e-07, + "loss": 0.148, + "step": 2161 + }, + { + "epoch": 0.14, + "grad_norm": 0.21168941718294554, + "learning_rate": 9.698109786099103e-07, + "loss": 0.0822, + "step": 2162 + }, + { + "epoch": 0.14, + "grad_norm": 0.8618181796727525, + "learning_rate": 9.697756267837855e-07, + "loss": 0.2028, + "step": 2163 + }, + { + "epoch": 0.14, + "grad_norm": 0.33734627838567355, + "learning_rate": 9.69740254916088e-07, + "loss": 0.0987, + "step": 2164 + }, + { + "epoch": 0.14, + "grad_norm": 0.7313910951546378, + "learning_rate": 9.697048630083271e-07, + "loss": 0.1143, + "step": 2165 + }, + { + "epoch": 0.14, + "grad_norm": 0.5104355299439599, + "learning_rate": 9.696694510620126e-07, + "loss": 0.1054, + "step": 2166 + }, + { + "epoch": 0.14, + "grad_norm": 0.26417956245786783, + "learning_rate": 9.696340190786551e-07, + "loss": 0.013, + "step": 2167 + }, + { + "epoch": 0.14, + "grad_norm": 1.1226080990347633, + "learning_rate": 9.695985670597662e-07, + "loss": 0.2777, + "step": 2168 + }, + { + "epoch": 0.14, + "grad_norm": 0.5572058221991034, + "learning_rate": 9.695630950068585e-07, + "loss": 0.0854, + "step": 2169 + }, + { + "epoch": 0.14, + "grad_norm": 0.5143647653811192, + "learning_rate": 9.695276029214452e-07, + "loss": 0.074, + "step": 2170 + }, + { + "epoch": 0.14, + "grad_norm": 0.4317050042711363, + "learning_rate": 9.694920908050405e-07, + "loss": 0.1415, + "step": 2171 + }, + { + "epoch": 0.14, + "grad_norm": 0.3311311852121749, + "learning_rate": 9.694565586591593e-07, + "loss": 0.118, + "step": 2172 + }, + { + "epoch": 0.14, + "grad_norm": 0.6783385015164943, + "learning_rate": 9.694210064853176e-07, + "loss": 0.2808, + "step": 2173 + }, + { + "epoch": 0.14, + "grad_norm": 0.3257814175320826, + "learning_rate": 9.693854342850322e-07, + "loss": 0.1957, + "step": 2174 + }, + { + "epoch": 0.14, + "grad_norm": 0.7204555437328546, + "learning_rate": 9.693498420598206e-07, + "loss": 0.0513, + "step": 2175 + }, + { + "epoch": 0.14, + "grad_norm": 0.49848661994604865, + "learning_rate": 9.693142298112012e-07, + "loss": 0.0496, + "step": 2176 + }, + { + "epoch": 0.14, + "grad_norm": 0.4716172545068549, + "learning_rate": 9.692785975406933e-07, + "loss": 0.2037, + "step": 2177 + }, + { + "epoch": 0.14, + "grad_norm": 1.0155619549011925, + "learning_rate": 9.692429452498171e-07, + "loss": 0.1208, + "step": 2178 + }, + { + "epoch": 0.14, + "grad_norm": 0.5177398619634386, + "learning_rate": 9.692072729400936e-07, + "loss": 0.3334, + "step": 2179 + }, + { + "epoch": 0.14, + "grad_norm": 0.6641217851323812, + "learning_rate": 9.691715806130445e-07, + "loss": 0.3065, + "step": 2180 + }, + { + "epoch": 0.14, + "grad_norm": 0.24003933327188864, + "learning_rate": 9.691358682701926e-07, + "loss": 0.1132, + "step": 2181 + }, + { + "epoch": 0.14, + "grad_norm": 0.9836596156748919, + "learning_rate": 9.691001359130614e-07, + "loss": 0.2834, + "step": 2182 + }, + { + "epoch": 0.14, + "grad_norm": 0.3591091840601756, + "learning_rate": 9.690643835431756e-07, + "loss": 0.0909, + "step": 2183 + }, + { + "epoch": 0.14, + "grad_norm": 0.7763253196946548, + "learning_rate": 9.690286111620601e-07, + "loss": 0.3036, + "step": 2184 + }, + { + "epoch": 0.14, + "grad_norm": 0.6384863997698524, + "learning_rate": 9.689928187712414e-07, + "loss": 0.3303, + "step": 2185 + }, + { + "epoch": 0.14, + "grad_norm": 1.0370332070148198, + "learning_rate": 9.68957006372246e-07, + "loss": 0.1551, + "step": 2186 + }, + { + "epoch": 0.14, + "grad_norm": 0.6683096996660153, + "learning_rate": 9.689211739666022e-07, + "loss": 0.1618, + "step": 2187 + }, + { + "epoch": 0.14, + "grad_norm": 0.6021385146080815, + "learning_rate": 9.688853215558384e-07, + "loss": 0.0951, + "step": 2188 + }, + { + "epoch": 0.14, + "grad_norm": 0.4977924246182438, + "learning_rate": 9.688494491414842e-07, + "loss": 0.1774, + "step": 2189 + }, + { + "epoch": 0.14, + "grad_norm": 0.6280800769891541, + "learning_rate": 9.6881355672507e-07, + "loss": 0.3283, + "step": 2190 + }, + { + "epoch": 0.14, + "grad_norm": 0.49895923109815055, + "learning_rate": 9.687776443081268e-07, + "loss": 0.2626, + "step": 2191 + }, + { + "epoch": 0.14, + "grad_norm": 0.5822084227475768, + "learning_rate": 9.687417118921872e-07, + "loss": 0.2192, + "step": 2192 + }, + { + "epoch": 0.14, + "grad_norm": 0.40429637307764493, + "learning_rate": 9.68705759478784e-07, + "loss": 0.0835, + "step": 2193 + }, + { + "epoch": 0.14, + "grad_norm": 0.3349807403298228, + "learning_rate": 9.686697870694507e-07, + "loss": 0.0688, + "step": 2194 + }, + { + "epoch": 0.14, + "grad_norm": 0.7251416008599905, + "learning_rate": 9.68633794665722e-07, + "loss": 0.1115, + "step": 2195 + }, + { + "epoch": 0.14, + "grad_norm": 0.6856654549230466, + "learning_rate": 9.685977822691338e-07, + "loss": 0.0439, + "step": 2196 + }, + { + "epoch": 0.14, + "grad_norm": 0.4043057449881499, + "learning_rate": 9.685617498812221e-07, + "loss": 0.1153, + "step": 2197 + }, + { + "epoch": 0.14, + "grad_norm": 0.24902936667567233, + "learning_rate": 9.68525697503524e-07, + "loss": 0.0159, + "step": 2198 + }, + { + "epoch": 0.14, + "grad_norm": 0.3618034565837879, + "learning_rate": 9.684896251375783e-07, + "loss": 0.0098, + "step": 2199 + }, + { + "epoch": 0.14, + "grad_norm": 0.838832455790905, + "learning_rate": 9.684535327849231e-07, + "loss": 0.1777, + "step": 2200 + }, + { + "epoch": 0.14, + "grad_norm": 1.0008950477903942, + "learning_rate": 9.684174204470985e-07, + "loss": 0.44, + "step": 2201 + }, + { + "epoch": 0.14, + "grad_norm": 0.17549592469279013, + "learning_rate": 9.68381288125645e-07, + "loss": 0.0694, + "step": 2202 + }, + { + "epoch": 0.14, + "grad_norm": 0.5686764235835852, + "learning_rate": 9.683451358221044e-07, + "loss": 0.1453, + "step": 2203 + }, + { + "epoch": 0.14, + "grad_norm": 0.5265235396622149, + "learning_rate": 9.683089635380185e-07, + "loss": 0.167, + "step": 2204 + }, + { + "epoch": 0.14, + "grad_norm": 1.2578997011400992, + "learning_rate": 9.682727712749311e-07, + "loss": 0.0425, + "step": 2205 + }, + { + "epoch": 0.14, + "grad_norm": 1.786053876989963, + "learning_rate": 9.68236559034386e-07, + "loss": 0.1647, + "step": 2206 + }, + { + "epoch": 0.14, + "grad_norm": 0.4285561195686105, + "learning_rate": 9.682003268179276e-07, + "loss": 0.2602, + "step": 2207 + }, + { + "epoch": 0.14, + "grad_norm": 0.05317384824930782, + "learning_rate": 9.681640746271026e-07, + "loss": 0.0014, + "step": 2208 + }, + { + "epoch": 0.14, + "grad_norm": 0.3058819496896352, + "learning_rate": 9.681278024634568e-07, + "loss": 0.0511, + "step": 2209 + }, + { + "epoch": 0.14, + "grad_norm": 0.48222105925547604, + "learning_rate": 9.680915103285376e-07, + "loss": 0.2603, + "step": 2210 + }, + { + "epoch": 0.14, + "grad_norm": 0.35396914352780745, + "learning_rate": 9.68055198223894e-07, + "loss": 0.1367, + "step": 2211 + }, + { + "epoch": 0.14, + "grad_norm": 0.8654080235768975, + "learning_rate": 9.680188661510746e-07, + "loss": 0.3336, + "step": 2212 + }, + { + "epoch": 0.14, + "grad_norm": 0.5383644093618157, + "learning_rate": 9.679825141116294e-07, + "loss": 0.2385, + "step": 2213 + }, + { + "epoch": 0.14, + "grad_norm": 0.5615487289719443, + "learning_rate": 9.679461421071096e-07, + "loss": 0.0606, + "step": 2214 + }, + { + "epoch": 0.14, + "grad_norm": 0.8546193412942816, + "learning_rate": 9.679097501390666e-07, + "loss": 0.084, + "step": 2215 + }, + { + "epoch": 0.14, + "grad_norm": 0.4234781567013046, + "learning_rate": 9.67873338209053e-07, + "loss": 0.1631, + "step": 2216 + }, + { + "epoch": 0.14, + "grad_norm": 0.5009648123432349, + "learning_rate": 9.678369063186222e-07, + "loss": 0.3513, + "step": 2217 + }, + { + "epoch": 0.14, + "grad_norm": 0.46509501902431477, + "learning_rate": 9.678004544693285e-07, + "loss": 0.1144, + "step": 2218 + }, + { + "epoch": 0.14, + "grad_norm": 0.41297550452765464, + "learning_rate": 9.677639826627271e-07, + "loss": 0.1154, + "step": 2219 + }, + { + "epoch": 0.14, + "grad_norm": 0.3134990194762429, + "learning_rate": 9.67727490900374e-07, + "loss": 0.2109, + "step": 2220 + }, + { + "epoch": 0.14, + "grad_norm": 0.6121985257256408, + "learning_rate": 9.676909791838257e-07, + "loss": 0.2242, + "step": 2221 + }, + { + "epoch": 0.14, + "grad_norm": 4.245129433818458, + "learning_rate": 9.676544475146402e-07, + "loss": 0.439, + "step": 2222 + }, + { + "epoch": 0.14, + "grad_norm": 0.6649483068722379, + "learning_rate": 9.676178958943756e-07, + "loss": 0.1686, + "step": 2223 + }, + { + "epoch": 0.14, + "grad_norm": 0.35886327928050354, + "learning_rate": 9.675813243245918e-07, + "loss": 0.131, + "step": 2224 + }, + { + "epoch": 0.14, + "grad_norm": 0.4399401243529808, + "learning_rate": 9.675447328068488e-07, + "loss": 0.128, + "step": 2225 + }, + { + "epoch": 0.14, + "grad_norm": 1.0505089769550253, + "learning_rate": 9.675081213427074e-07, + "loss": 0.2093, + "step": 2226 + }, + { + "epoch": 0.14, + "grad_norm": 1.0391963369372383, + "learning_rate": 9.674714899337298e-07, + "loss": 0.1625, + "step": 2227 + }, + { + "epoch": 0.14, + "grad_norm": 0.8679782758021442, + "learning_rate": 9.674348385814786e-07, + "loss": 0.1818, + "step": 2228 + }, + { + "epoch": 0.14, + "grad_norm": 0.34390251896014173, + "learning_rate": 9.673981672875178e-07, + "loss": 0.1262, + "step": 2229 + }, + { + "epoch": 0.14, + "grad_norm": 0.550592707040225, + "learning_rate": 9.673614760534114e-07, + "loss": 0.0191, + "step": 2230 + }, + { + "epoch": 0.14, + "grad_norm": 0.4974107876985201, + "learning_rate": 9.673247648807248e-07, + "loss": 0.2038, + "step": 2231 + }, + { + "epoch": 0.14, + "grad_norm": 0.5077694190800893, + "learning_rate": 9.672880337710246e-07, + "loss": 0.2585, + "step": 2232 + }, + { + "epoch": 0.14, + "grad_norm": 0.37577197112213945, + "learning_rate": 9.672512827258773e-07, + "loss": 0.1661, + "step": 2233 + }, + { + "epoch": 0.14, + "grad_norm": 0.5223572975879749, + "learning_rate": 9.67214511746851e-07, + "loss": 0.1267, + "step": 2234 + }, + { + "epoch": 0.14, + "grad_norm": 0.31077655176892444, + "learning_rate": 9.671777208355145e-07, + "loss": 0.0997, + "step": 2235 + }, + { + "epoch": 0.14, + "grad_norm": 0.4371748183684701, + "learning_rate": 9.671409099934372e-07, + "loss": 0.1151, + "step": 2236 + }, + { + "epoch": 0.14, + "grad_norm": 0.28810427124386667, + "learning_rate": 9.671040792221896e-07, + "loss": 0.1184, + "step": 2237 + }, + { + "epoch": 0.14, + "grad_norm": 0.6674952069968723, + "learning_rate": 9.67067228523343e-07, + "loss": 0.2114, + "step": 2238 + }, + { + "epoch": 0.14, + "grad_norm": 0.6422148799994809, + "learning_rate": 9.670303578984696e-07, + "loss": 0.3188, + "step": 2239 + }, + { + "epoch": 0.14, + "grad_norm": 0.6988796102022719, + "learning_rate": 9.669934673491423e-07, + "loss": 0.3983, + "step": 2240 + }, + { + "epoch": 0.14, + "grad_norm": 1.3211078676716037, + "learning_rate": 9.669565568769347e-07, + "loss": 0.1478, + "step": 2241 + }, + { + "epoch": 0.14, + "grad_norm": 0.882084442419305, + "learning_rate": 9.669196264834219e-07, + "loss": 0.3688, + "step": 2242 + }, + { + "epoch": 0.14, + "grad_norm": 0.3245513035662919, + "learning_rate": 9.668826761701791e-07, + "loss": 0.1576, + "step": 2243 + }, + { + "epoch": 0.14, + "grad_norm": 0.8463906820598696, + "learning_rate": 9.668457059387826e-07, + "loss": 0.1455, + "step": 2244 + }, + { + "epoch": 0.14, + "grad_norm": 0.7189558460863216, + "learning_rate": 9.6680871579081e-07, + "loss": 0.1305, + "step": 2245 + }, + { + "epoch": 0.14, + "grad_norm": 0.5163374702417902, + "learning_rate": 9.667717057278393e-07, + "loss": 0.2395, + "step": 2246 + }, + { + "epoch": 0.14, + "grad_norm": 0.7378784092646128, + "learning_rate": 9.667346757514492e-07, + "loss": 0.1645, + "step": 2247 + }, + { + "epoch": 0.14, + "grad_norm": 0.5902167984125476, + "learning_rate": 9.666976258632195e-07, + "loss": 0.2945, + "step": 2248 + }, + { + "epoch": 0.14, + "grad_norm": 0.6015192434654577, + "learning_rate": 9.66660556064731e-07, + "loss": 0.2368, + "step": 2249 + }, + { + "epoch": 0.14, + "grad_norm": 0.5495202520755843, + "learning_rate": 9.66623466357565e-07, + "loss": 0.1012, + "step": 2250 + }, + { + "epoch": 0.14, + "grad_norm": 0.5985745751093206, + "learning_rate": 9.66586356743304e-07, + "loss": 0.0957, + "step": 2251 + }, + { + "epoch": 0.14, + "grad_norm": 0.5664242279788742, + "learning_rate": 9.665492272235309e-07, + "loss": 0.4868, + "step": 2252 + }, + { + "epoch": 0.14, + "grad_norm": 0.3543322807729264, + "learning_rate": 9.665120777998302e-07, + "loss": 0.2164, + "step": 2253 + }, + { + "epoch": 0.14, + "grad_norm": 1.012639129934071, + "learning_rate": 9.664749084737862e-07, + "loss": 0.1544, + "step": 2254 + }, + { + "epoch": 0.14, + "grad_norm": 0.81779050790222, + "learning_rate": 9.664377192469848e-07, + "loss": 0.3724, + "step": 2255 + }, + { + "epoch": 0.14, + "grad_norm": 0.8172736485518265, + "learning_rate": 9.664005101210128e-07, + "loss": 0.0195, + "step": 2256 + }, + { + "epoch": 0.14, + "grad_norm": 0.2967269382121912, + "learning_rate": 9.663632810974574e-07, + "loss": 0.148, + "step": 2257 + }, + { + "epoch": 0.14, + "grad_norm": 0.45317727513112177, + "learning_rate": 9.66326032177907e-07, + "loss": 0.1707, + "step": 2258 + }, + { + "epoch": 0.14, + "grad_norm": 0.7766974709362029, + "learning_rate": 9.662887633639505e-07, + "loss": 0.266, + "step": 2259 + }, + { + "epoch": 0.14, + "grad_norm": 0.4984038311157407, + "learning_rate": 9.66251474657178e-07, + "loss": 0.0535, + "step": 2260 + }, + { + "epoch": 0.14, + "grad_norm": 0.1891626931876308, + "learning_rate": 9.662141660591804e-07, + "loss": 0.0147, + "step": 2261 + }, + { + "epoch": 0.14, + "grad_norm": 0.3164793955238129, + "learning_rate": 9.661768375715491e-07, + "loss": 0.2204, + "step": 2262 + }, + { + "epoch": 0.14, + "grad_norm": 0.49853749721826085, + "learning_rate": 9.66139489195877e-07, + "loss": 0.1248, + "step": 2263 + }, + { + "epoch": 0.14, + "grad_norm": 0.976423822686046, + "learning_rate": 9.661021209337573e-07, + "loss": 0.1819, + "step": 2264 + }, + { + "epoch": 0.14, + "grad_norm": 0.32536603027870525, + "learning_rate": 9.660647327867838e-07, + "loss": 0.213, + "step": 2265 + }, + { + "epoch": 0.14, + "grad_norm": 0.41416220367858053, + "learning_rate": 9.660273247565522e-07, + "loss": 0.1143, + "step": 2266 + }, + { + "epoch": 0.14, + "grad_norm": 0.7376481330398571, + "learning_rate": 9.659898968446578e-07, + "loss": 0.1566, + "step": 2267 + }, + { + "epoch": 0.14, + "grad_norm": 0.3830419810598921, + "learning_rate": 9.659524490526979e-07, + "loss": 0.0325, + "step": 2268 + }, + { + "epoch": 0.14, + "grad_norm": 0.9413732448889611, + "learning_rate": 9.659149813822697e-07, + "loss": 0.3643, + "step": 2269 + }, + { + "epoch": 0.14, + "grad_norm": 0.5660978306764866, + "learning_rate": 9.658774938349718e-07, + "loss": 0.4425, + "step": 2270 + }, + { + "epoch": 0.14, + "grad_norm": 0.6829693757465235, + "learning_rate": 9.658399864124035e-07, + "loss": 0.2359, + "step": 2271 + }, + { + "epoch": 0.14, + "grad_norm": 0.4514127261456764, + "learning_rate": 9.658024591161651e-07, + "loss": 0.1984, + "step": 2272 + }, + { + "epoch": 0.14, + "grad_norm": 0.4498924244285713, + "learning_rate": 9.657649119478572e-07, + "loss": 0.0579, + "step": 2273 + }, + { + "epoch": 0.15, + "grad_norm": 0.7614178656215915, + "learning_rate": 9.65727344909082e-07, + "loss": 0.2248, + "step": 2274 + }, + { + "epoch": 0.15, + "grad_norm": 0.6076176180082014, + "learning_rate": 9.65689758001442e-07, + "loss": 0.121, + "step": 2275 + }, + { + "epoch": 0.15, + "grad_norm": 0.28470817960489736, + "learning_rate": 9.656521512265405e-07, + "loss": 0.266, + "step": 2276 + }, + { + "epoch": 0.15, + "grad_norm": 0.3198352442511579, + "learning_rate": 9.656145245859823e-07, + "loss": 0.1984, + "step": 2277 + }, + { + "epoch": 0.15, + "grad_norm": 0.38328997388644065, + "learning_rate": 9.655768780813727e-07, + "loss": 0.2533, + "step": 2278 + }, + { + "epoch": 0.15, + "grad_norm": 1.0348423197625813, + "learning_rate": 9.655392117143172e-07, + "loss": 0.2776, + "step": 2279 + }, + { + "epoch": 0.15, + "grad_norm": 0.8350534328849376, + "learning_rate": 9.655015254864233e-07, + "loss": 0.0954, + "step": 2280 + }, + { + "epoch": 0.15, + "grad_norm": 0.6564781601015938, + "learning_rate": 9.654638193992986e-07, + "loss": 0.3759, + "step": 2281 + }, + { + "epoch": 0.15, + "grad_norm": 0.4569331204775347, + "learning_rate": 9.654260934545518e-07, + "loss": 0.0432, + "step": 2282 + }, + { + "epoch": 0.15, + "grad_norm": 0.8782757935857414, + "learning_rate": 9.653883476537918e-07, + "loss": 0.0517, + "step": 2283 + }, + { + "epoch": 0.15, + "grad_norm": 0.27739462172948237, + "learning_rate": 9.653505819986297e-07, + "loss": 0.1759, + "step": 2284 + }, + { + "epoch": 0.15, + "grad_norm": 0.9304604462430499, + "learning_rate": 9.653127964906762e-07, + "loss": 0.0239, + "step": 2285 + }, + { + "epoch": 0.15, + "grad_norm": 0.6302011693903139, + "learning_rate": 9.652749911315433e-07, + "loss": 0.1394, + "step": 2286 + }, + { + "epoch": 0.15, + "grad_norm": 0.7351225115861103, + "learning_rate": 9.65237165922844e-07, + "loss": 0.3691, + "step": 2287 + }, + { + "epoch": 0.15, + "grad_norm": 0.7122696323469048, + "learning_rate": 9.65199320866192e-07, + "loss": 0.2106, + "step": 2288 + }, + { + "epoch": 0.15, + "grad_norm": 0.2624750293746841, + "learning_rate": 9.651614559632017e-07, + "loss": 0.0104, + "step": 2289 + }, + { + "epoch": 0.15, + "grad_norm": 1.6971316836718546, + "learning_rate": 9.651235712154887e-07, + "loss": 0.331, + "step": 2290 + }, + { + "epoch": 0.15, + "grad_norm": 0.552145582263909, + "learning_rate": 9.650856666246692e-07, + "loss": 0.2856, + "step": 2291 + }, + { + "epoch": 0.15, + "grad_norm": 0.8679718247585241, + "learning_rate": 9.650477421923602e-07, + "loss": 0.1551, + "step": 2292 + }, + { + "epoch": 0.15, + "grad_norm": 0.44643504149248464, + "learning_rate": 9.650097979201795e-07, + "loss": 0.0522, + "step": 2293 + }, + { + "epoch": 0.15, + "grad_norm": 0.8235716914964851, + "learning_rate": 9.649718338097462e-07, + "loss": 0.1463, + "step": 2294 + }, + { + "epoch": 0.15, + "grad_norm": 1.0343432708886806, + "learning_rate": 9.649338498626795e-07, + "loss": 0.3718, + "step": 2295 + }, + { + "epoch": 0.15, + "grad_norm": 0.7726073379068243, + "learning_rate": 9.648958460806003e-07, + "loss": 0.2415, + "step": 2296 + }, + { + "epoch": 0.15, + "grad_norm": 0.9999413965119969, + "learning_rate": 9.648578224651299e-07, + "loss": 0.2055, + "step": 2297 + }, + { + "epoch": 0.15, + "grad_norm": 0.5979325658254782, + "learning_rate": 9.6481977901789e-07, + "loss": 0.306, + "step": 2298 + }, + { + "epoch": 0.15, + "grad_norm": 0.3014136895704547, + "learning_rate": 9.647817157405043e-07, + "loss": 0.1481, + "step": 2299 + }, + { + "epoch": 0.15, + "grad_norm": 0.4454559117522062, + "learning_rate": 9.64743632634596e-07, + "loss": 0.2429, + "step": 2300 + }, + { + "epoch": 0.15, + "grad_norm": 0.6919515419081649, + "learning_rate": 9.647055297017902e-07, + "loss": 0.0867, + "step": 2301 + }, + { + "epoch": 0.15, + "grad_norm": 0.5852256753102256, + "learning_rate": 9.646674069437122e-07, + "loss": 0.2837, + "step": 2302 + }, + { + "epoch": 0.15, + "grad_norm": 0.8964603662872278, + "learning_rate": 9.646292643619887e-07, + "loss": 0.3714, + "step": 2303 + }, + { + "epoch": 0.15, + "grad_norm": 0.8299760482054624, + "learning_rate": 9.645911019582465e-07, + "loss": 0.3254, + "step": 2304 + }, + { + "epoch": 0.15, + "grad_norm": 0.7201842646276856, + "learning_rate": 9.645529197341142e-07, + "loss": 0.2404, + "step": 2305 + }, + { + "epoch": 0.15, + "grad_norm": 0.4962911567973602, + "learning_rate": 9.645147176912203e-07, + "loss": 0.0751, + "step": 2306 + }, + { + "epoch": 0.15, + "grad_norm": 0.9255300170601806, + "learning_rate": 9.64476495831195e-07, + "loss": 0.395, + "step": 2307 + }, + { + "epoch": 0.15, + "grad_norm": 0.5418588930887981, + "learning_rate": 9.644382541556684e-07, + "loss": 0.119, + "step": 2308 + }, + { + "epoch": 0.15, + "grad_norm": 0.8536579121334795, + "learning_rate": 9.643999926662723e-07, + "loss": 0.1972, + "step": 2309 + }, + { + "epoch": 0.15, + "grad_norm": 0.9897793684100308, + "learning_rate": 9.643617113646392e-07, + "loss": 0.2981, + "step": 2310 + }, + { + "epoch": 0.15, + "grad_norm": 0.6466825868896282, + "learning_rate": 9.643234102524017e-07, + "loss": 0.1807, + "step": 2311 + }, + { + "epoch": 0.15, + "grad_norm": 0.33705126163694304, + "learning_rate": 9.642850893311942e-07, + "loss": 0.2265, + "step": 2312 + }, + { + "epoch": 0.15, + "grad_norm": 0.5435083251237673, + "learning_rate": 9.642467486026516e-07, + "loss": 0.287, + "step": 2313 + }, + { + "epoch": 0.15, + "grad_norm": 0.3867512644692106, + "learning_rate": 9.642083880684093e-07, + "loss": 0.1887, + "step": 2314 + }, + { + "epoch": 0.15, + "grad_norm": 1.0036525121321793, + "learning_rate": 9.64170007730104e-07, + "loss": 0.2691, + "step": 2315 + }, + { + "epoch": 0.15, + "grad_norm": 1.5478909722228598, + "learning_rate": 9.641316075893731e-07, + "loss": 0.1671, + "step": 2316 + }, + { + "epoch": 0.15, + "grad_norm": 0.9728080261290837, + "learning_rate": 9.640931876478546e-07, + "loss": 0.0473, + "step": 2317 + }, + { + "epoch": 0.15, + "grad_norm": 0.47594745891612705, + "learning_rate": 9.64054747907188e-07, + "loss": 0.3223, + "step": 2318 + }, + { + "epoch": 0.15, + "grad_norm": 1.3104477542687964, + "learning_rate": 9.64016288369013e-07, + "loss": 0.1943, + "step": 2319 + }, + { + "epoch": 0.15, + "grad_norm": 1.1109103011879944, + "learning_rate": 9.639778090349705e-07, + "loss": 0.3341, + "step": 2320 + }, + { + "epoch": 0.15, + "grad_norm": 0.7667004818078823, + "learning_rate": 9.639393099067015e-07, + "loss": 0.2947, + "step": 2321 + }, + { + "epoch": 0.15, + "grad_norm": 0.33847381927963693, + "learning_rate": 9.639007909858493e-07, + "loss": 0.3664, + "step": 2322 + }, + { + "epoch": 0.15, + "grad_norm": 0.4719779746135633, + "learning_rate": 9.638622522740565e-07, + "loss": 0.2952, + "step": 2323 + }, + { + "epoch": 0.15, + "grad_norm": 0.32241616455743904, + "learning_rate": 9.638236937729678e-07, + "loss": 0.1506, + "step": 2324 + }, + { + "epoch": 0.15, + "grad_norm": 0.39757686248786983, + "learning_rate": 9.637851154842279e-07, + "loss": 0.1362, + "step": 2325 + }, + { + "epoch": 0.15, + "grad_norm": 0.04706264417641275, + "learning_rate": 9.637465174094825e-07, + "loss": 0.002, + "step": 2326 + }, + { + "epoch": 0.15, + "grad_norm": 0.48218257664301306, + "learning_rate": 9.637078995503784e-07, + "loss": 0.3685, + "step": 2327 + }, + { + "epoch": 0.15, + "grad_norm": 1.6935869139609587, + "learning_rate": 9.636692619085633e-07, + "loss": 0.2605, + "step": 2328 + }, + { + "epoch": 0.15, + "grad_norm": 1.0172783780874317, + "learning_rate": 9.636306044856853e-07, + "loss": 0.2496, + "step": 2329 + }, + { + "epoch": 0.15, + "grad_norm": 0.7990364493575447, + "learning_rate": 9.635919272833937e-07, + "loss": 0.2556, + "step": 2330 + }, + { + "epoch": 0.15, + "grad_norm": 0.6195611723475155, + "learning_rate": 9.635532303033385e-07, + "loss": 0.2784, + "step": 2331 + }, + { + "epoch": 0.15, + "grad_norm": 0.969221530484212, + "learning_rate": 9.635145135471708e-07, + "loss": 0.1225, + "step": 2332 + }, + { + "epoch": 0.15, + "grad_norm": 0.4353391986370538, + "learning_rate": 9.63475777016542e-07, + "loss": 0.2233, + "step": 2333 + }, + { + "epoch": 0.15, + "grad_norm": 0.4104028272530483, + "learning_rate": 9.634370207131047e-07, + "loss": 0.1183, + "step": 2334 + }, + { + "epoch": 0.15, + "grad_norm": 0.8507423609935023, + "learning_rate": 9.633982446385129e-07, + "loss": 0.1365, + "step": 2335 + }, + { + "epoch": 0.15, + "grad_norm": 0.4205435335862078, + "learning_rate": 9.633594487944202e-07, + "loss": 0.0591, + "step": 2336 + }, + { + "epoch": 0.15, + "grad_norm": 0.5029415640201254, + "learning_rate": 9.63320633182482e-07, + "loss": 0.1185, + "step": 2337 + }, + { + "epoch": 0.15, + "grad_norm": 0.6350187922324667, + "learning_rate": 9.632817978043542e-07, + "loss": 0.3711, + "step": 2338 + }, + { + "epoch": 0.15, + "grad_norm": 0.6837763626589852, + "learning_rate": 9.632429426616935e-07, + "loss": 0.2258, + "step": 2339 + }, + { + "epoch": 0.15, + "grad_norm": 1.1247648723040353, + "learning_rate": 9.632040677561577e-07, + "loss": 0.1972, + "step": 2340 + }, + { + "epoch": 0.15, + "grad_norm": 0.5807963694314374, + "learning_rate": 9.63165173089405e-07, + "loss": 0.2031, + "step": 2341 + }, + { + "epoch": 0.15, + "grad_norm": 0.6817570329315167, + "learning_rate": 9.631262586630952e-07, + "loss": 0.1247, + "step": 2342 + }, + { + "epoch": 0.15, + "grad_norm": 0.6135669974590318, + "learning_rate": 9.630873244788882e-07, + "loss": 0.341, + "step": 2343 + }, + { + "epoch": 0.15, + "grad_norm": 0.6181313290501207, + "learning_rate": 9.63048370538445e-07, + "loss": 0.2672, + "step": 2344 + }, + { + "epoch": 0.15, + "grad_norm": 0.4007863289709619, + "learning_rate": 9.630093968434275e-07, + "loss": 0.1462, + "step": 2345 + }, + { + "epoch": 0.15, + "grad_norm": 1.0453733218873578, + "learning_rate": 9.629704033954981e-07, + "loss": 0.0337, + "step": 2346 + }, + { + "epoch": 0.15, + "grad_norm": 0.4082676620728667, + "learning_rate": 9.62931390196321e-07, + "loss": 0.115, + "step": 2347 + }, + { + "epoch": 0.15, + "grad_norm": 0.6161421143518172, + "learning_rate": 9.628923572475598e-07, + "loss": 0.3096, + "step": 2348 + }, + { + "epoch": 0.15, + "grad_norm": 0.2977332789299097, + "learning_rate": 9.628533045508803e-07, + "loss": 0.0868, + "step": 2349 + }, + { + "epoch": 0.15, + "grad_norm": 0.8224576365366434, + "learning_rate": 9.628142321079485e-07, + "loss": 0.0306, + "step": 2350 + }, + { + "epoch": 0.15, + "grad_norm": 0.6319045401424148, + "learning_rate": 9.62775139920431e-07, + "loss": 0.2043, + "step": 2351 + }, + { + "epoch": 0.15, + "grad_norm": 1.6003548495530138, + "learning_rate": 9.627360279899956e-07, + "loss": 0.06, + "step": 2352 + }, + { + "epoch": 0.15, + "grad_norm": 0.985280074315826, + "learning_rate": 9.626968963183113e-07, + "loss": 0.1847, + "step": 2353 + }, + { + "epoch": 0.15, + "grad_norm": 0.2260737403352769, + "learning_rate": 9.626577449070473e-07, + "loss": 0.0967, + "step": 2354 + }, + { + "epoch": 0.15, + "grad_norm": 0.33288818264685915, + "learning_rate": 9.626185737578737e-07, + "loss": 0.1423, + "step": 2355 + }, + { + "epoch": 0.15, + "grad_norm": 1.076048043320009, + "learning_rate": 9.625793828724618e-07, + "loss": 0.2161, + "step": 2356 + }, + { + "epoch": 0.15, + "grad_norm": 0.28497596382596746, + "learning_rate": 9.625401722524835e-07, + "loss": 0.0466, + "step": 2357 + }, + { + "epoch": 0.15, + "grad_norm": 0.8612805387264523, + "learning_rate": 9.625009418996115e-07, + "loss": 0.4057, + "step": 2358 + }, + { + "epoch": 0.15, + "grad_norm": 0.896418765709942, + "learning_rate": 9.624616918155197e-07, + "loss": 0.1871, + "step": 2359 + }, + { + "epoch": 0.15, + "grad_norm": 1.1315802655313565, + "learning_rate": 9.624224220018826e-07, + "loss": 0.0523, + "step": 2360 + }, + { + "epoch": 0.15, + "grad_norm": 0.3753553941282118, + "learning_rate": 9.623831324603752e-07, + "loss": 0.1157, + "step": 2361 + }, + { + "epoch": 0.15, + "grad_norm": 0.36621201404913223, + "learning_rate": 9.623438231926739e-07, + "loss": 0.1089, + "step": 2362 + }, + { + "epoch": 0.15, + "grad_norm": 0.5573117921494466, + "learning_rate": 9.623044942004557e-07, + "loss": 0.1621, + "step": 2363 + }, + { + "epoch": 0.15, + "grad_norm": 0.78492953971227, + "learning_rate": 9.622651454853985e-07, + "loss": 0.3114, + "step": 2364 + }, + { + "epoch": 0.15, + "grad_norm": 0.3565610982750958, + "learning_rate": 9.62225777049181e-07, + "loss": 0.2818, + "step": 2365 + }, + { + "epoch": 0.15, + "grad_norm": 0.3618723782921361, + "learning_rate": 9.621863888934824e-07, + "loss": 0.1987, + "step": 2366 + }, + { + "epoch": 0.15, + "grad_norm": 0.5948519558003095, + "learning_rate": 9.621469810199833e-07, + "loss": 0.3976, + "step": 2367 + }, + { + "epoch": 0.15, + "grad_norm": 0.657854215589905, + "learning_rate": 9.621075534303654e-07, + "loss": 0.5349, + "step": 2368 + }, + { + "epoch": 0.15, + "grad_norm": 1.1712196716947927, + "learning_rate": 9.620681061263098e-07, + "loss": 0.3451, + "step": 2369 + }, + { + "epoch": 0.15, + "grad_norm": 0.16272161629774692, + "learning_rate": 9.620286391095003e-07, + "loss": 0.1046, + "step": 2370 + }, + { + "epoch": 0.15, + "grad_norm": 0.5854611664085622, + "learning_rate": 9.619891523816202e-07, + "loss": 0.4168, + "step": 2371 + }, + { + "epoch": 0.15, + "grad_norm": 0.3570465267301881, + "learning_rate": 9.61949645944354e-07, + "loss": 0.0927, + "step": 2372 + }, + { + "epoch": 0.15, + "grad_norm": 0.5179421249236883, + "learning_rate": 9.619101197993874e-07, + "loss": 0.2437, + "step": 2373 + }, + { + "epoch": 0.15, + "grad_norm": 0.8658125477907648, + "learning_rate": 9.618705739484064e-07, + "loss": 0.4816, + "step": 2374 + }, + { + "epoch": 0.15, + "grad_norm": 0.6956739182159973, + "learning_rate": 9.618310083930983e-07, + "loss": 0.1711, + "step": 2375 + }, + { + "epoch": 0.15, + "grad_norm": 0.7735803198214356, + "learning_rate": 9.617914231351508e-07, + "loss": 0.2274, + "step": 2376 + }, + { + "epoch": 0.15, + "grad_norm": 0.775590982406566, + "learning_rate": 9.617518181762531e-07, + "loss": 0.4208, + "step": 2377 + }, + { + "epoch": 0.15, + "grad_norm": 0.39703609197852757, + "learning_rate": 9.617121935180946e-07, + "loss": 0.1647, + "step": 2378 + }, + { + "epoch": 0.15, + "grad_norm": 0.3233245062122669, + "learning_rate": 9.616725491623657e-07, + "loss": 0.1064, + "step": 2379 + }, + { + "epoch": 0.15, + "grad_norm": 0.5076525687307184, + "learning_rate": 9.616328851107578e-07, + "loss": 0.2319, + "step": 2380 + }, + { + "epoch": 0.15, + "grad_norm": 0.42553805338670764, + "learning_rate": 9.61593201364963e-07, + "loss": 0.1853, + "step": 2381 + }, + { + "epoch": 0.15, + "grad_norm": 0.9205030596838517, + "learning_rate": 9.615534979266744e-07, + "loss": 0.4135, + "step": 2382 + }, + { + "epoch": 0.15, + "grad_norm": 0.5379667422988014, + "learning_rate": 9.615137747975857e-07, + "loss": 0.3133, + "step": 2383 + }, + { + "epoch": 0.15, + "grad_norm": 0.5742670032412414, + "learning_rate": 9.614740319793915e-07, + "loss": 0.2185, + "step": 2384 + }, + { + "epoch": 0.15, + "grad_norm": 0.2402152679106932, + "learning_rate": 9.614342694737876e-07, + "loss": 0.2124, + "step": 2385 + }, + { + "epoch": 0.15, + "grad_norm": 0.7856792205453577, + "learning_rate": 9.6139448728247e-07, + "loss": 0.3608, + "step": 2386 + }, + { + "epoch": 0.15, + "grad_norm": 0.4222629909090372, + "learning_rate": 9.613546854071362e-07, + "loss": 0.1599, + "step": 2387 + }, + { + "epoch": 0.15, + "grad_norm": 0.8568168659751535, + "learning_rate": 9.613148638494839e-07, + "loss": 0.0264, + "step": 2388 + }, + { + "epoch": 0.15, + "grad_norm": 0.248520687448224, + "learning_rate": 9.612750226112122e-07, + "loss": 0.25, + "step": 2389 + }, + { + "epoch": 0.15, + "grad_norm": 1.1878111336995245, + "learning_rate": 9.612351616940209e-07, + "loss": 0.2917, + "step": 2390 + }, + { + "epoch": 0.15, + "grad_norm": 0.40762221093022905, + "learning_rate": 9.611952810996102e-07, + "loss": 0.287, + "step": 2391 + }, + { + "epoch": 0.15, + "grad_norm": 0.3397480071058737, + "learning_rate": 9.611553808296818e-07, + "loss": 0.1617, + "step": 2392 + }, + { + "epoch": 0.15, + "grad_norm": 0.689809580736861, + "learning_rate": 9.611154608859378e-07, + "loss": 0.1587, + "step": 2393 + }, + { + "epoch": 0.15, + "grad_norm": 0.5182216233375155, + "learning_rate": 9.610755212700814e-07, + "loss": 0.0876, + "step": 2394 + }, + { + "epoch": 0.15, + "grad_norm": 1.223606523800506, + "learning_rate": 9.610355619838162e-07, + "loss": 0.0617, + "step": 2395 + }, + { + "epoch": 0.15, + "grad_norm": 1.10688526188776, + "learning_rate": 9.609955830288471e-07, + "loss": 0.0832, + "step": 2396 + }, + { + "epoch": 0.15, + "grad_norm": 0.5905240026624127, + "learning_rate": 9.6095558440688e-07, + "loss": 0.2074, + "step": 2397 + }, + { + "epoch": 0.15, + "grad_norm": 0.8937538166310866, + "learning_rate": 9.609155661196209e-07, + "loss": 0.2132, + "step": 2398 + }, + { + "epoch": 0.15, + "grad_norm": 0.6143162927375626, + "learning_rate": 9.60875528168777e-07, + "loss": 0.3599, + "step": 2399 + }, + { + "epoch": 0.15, + "grad_norm": 0.433598283328746, + "learning_rate": 9.608354705560567e-07, + "loss": 0.1139, + "step": 2400 + }, + { + "epoch": 0.15, + "grad_norm": 0.811439499470575, + "learning_rate": 9.60795393283169e-07, + "loss": 0.1481, + "step": 2401 + }, + { + "epoch": 0.15, + "grad_norm": 0.7920695747098897, + "learning_rate": 9.607552963518232e-07, + "loss": 0.2026, + "step": 2402 + }, + { + "epoch": 0.15, + "grad_norm": 0.9343304919547966, + "learning_rate": 9.607151797637305e-07, + "loss": 0.2416, + "step": 2403 + }, + { + "epoch": 0.15, + "grad_norm": 0.44922222916730686, + "learning_rate": 9.606750435206019e-07, + "loss": 0.0723, + "step": 2404 + }, + { + "epoch": 0.15, + "grad_norm": 0.9007662843357239, + "learning_rate": 9.6063488762415e-07, + "loss": 0.2407, + "step": 2405 + }, + { + "epoch": 0.15, + "grad_norm": 1.2592833143028035, + "learning_rate": 9.605947120760877e-07, + "loss": 0.2817, + "step": 2406 + }, + { + "epoch": 0.15, + "grad_norm": 0.8598279190527072, + "learning_rate": 9.605545168781291e-07, + "loss": 0.0815, + "step": 2407 + }, + { + "epoch": 0.15, + "grad_norm": 1.4779322437716764, + "learning_rate": 9.60514302031989e-07, + "loss": 0.1776, + "step": 2408 + }, + { + "epoch": 0.15, + "grad_norm": 0.5904274798959935, + "learning_rate": 9.60474067539383e-07, + "loss": 0.345, + "step": 2409 + }, + { + "epoch": 0.15, + "grad_norm": 0.7881322561250321, + "learning_rate": 9.604338134020278e-07, + "loss": 0.2772, + "step": 2410 + }, + { + "epoch": 0.15, + "grad_norm": 0.6838207405531467, + "learning_rate": 9.603935396216404e-07, + "loss": 0.3399, + "step": 2411 + }, + { + "epoch": 0.15, + "grad_norm": 0.5401657859931392, + "learning_rate": 9.60353246199939e-07, + "loss": 0.2189, + "step": 2412 + }, + { + "epoch": 0.15, + "grad_norm": 0.3508163251196253, + "learning_rate": 9.603129331386425e-07, + "loss": 0.1616, + "step": 2413 + }, + { + "epoch": 0.15, + "grad_norm": 0.6713099313881182, + "learning_rate": 9.602726004394713e-07, + "loss": 0.2643, + "step": 2414 + }, + { + "epoch": 0.15, + "grad_norm": 1.1978295176181208, + "learning_rate": 9.602322481041457e-07, + "loss": 0.1534, + "step": 2415 + }, + { + "epoch": 0.15, + "grad_norm": 0.8870010417763626, + "learning_rate": 9.60191876134387e-07, + "loss": 0.088, + "step": 2416 + }, + { + "epoch": 0.15, + "grad_norm": 0.37156038596610425, + "learning_rate": 9.601514845319178e-07, + "loss": 0.0091, + "step": 2417 + }, + { + "epoch": 0.15, + "grad_norm": 0.5673089570323873, + "learning_rate": 9.601110732984611e-07, + "loss": 0.0818, + "step": 2418 + }, + { + "epoch": 0.15, + "grad_norm": 0.8794165913808688, + "learning_rate": 9.600706424357414e-07, + "loss": 0.1543, + "step": 2419 + }, + { + "epoch": 0.15, + "grad_norm": 1.8860515292708557, + "learning_rate": 9.600301919454832e-07, + "loss": 0.1811, + "step": 2420 + }, + { + "epoch": 0.15, + "grad_norm": 0.41049406078086303, + "learning_rate": 9.599897218294121e-07, + "loss": 0.2829, + "step": 2421 + }, + { + "epoch": 0.15, + "grad_norm": 0.4387965860439652, + "learning_rate": 9.599492320892548e-07, + "loss": 0.0282, + "step": 2422 + }, + { + "epoch": 0.15, + "grad_norm": 0.08653651633786842, + "learning_rate": 9.599087227267386e-07, + "loss": 0.0023, + "step": 2423 + }, + { + "epoch": 0.15, + "grad_norm": 0.5448737842767393, + "learning_rate": 9.598681937435918e-07, + "loss": 0.3058, + "step": 2424 + }, + { + "epoch": 0.15, + "grad_norm": 0.7449450919035487, + "learning_rate": 9.598276451415436e-07, + "loss": 0.1391, + "step": 2425 + }, + { + "epoch": 0.15, + "grad_norm": 0.5389733243028549, + "learning_rate": 9.597870769223234e-07, + "loss": 0.202, + "step": 2426 + }, + { + "epoch": 0.15, + "grad_norm": 0.9581193222381307, + "learning_rate": 9.597464890876623e-07, + "loss": 0.0766, + "step": 2427 + }, + { + "epoch": 0.15, + "grad_norm": 0.7255173443642816, + "learning_rate": 9.597058816392917e-07, + "loss": 0.2205, + "step": 2428 + }, + { + "epoch": 0.15, + "grad_norm": 0.5465860010520189, + "learning_rate": 9.596652545789441e-07, + "loss": 0.1827, + "step": 2429 + }, + { + "epoch": 0.15, + "grad_norm": 0.44458216987835736, + "learning_rate": 9.59624607908353e-07, + "loss": 0.0578, + "step": 2430 + }, + { + "epoch": 0.16, + "grad_norm": 0.9803309693892769, + "learning_rate": 9.595839416292516e-07, + "loss": 0.1905, + "step": 2431 + }, + { + "epoch": 0.16, + "grad_norm": 0.4503185098868732, + "learning_rate": 9.595432557433758e-07, + "loss": 0.1836, + "step": 2432 + }, + { + "epoch": 0.16, + "grad_norm": 0.4490701976502874, + "learning_rate": 9.595025502524607e-07, + "loss": 0.3053, + "step": 2433 + }, + { + "epoch": 0.16, + "grad_norm": 0.5301201782137251, + "learning_rate": 9.594618251582433e-07, + "loss": 0.1949, + "step": 2434 + }, + { + "epoch": 0.16, + "grad_norm": 0.4630095575443994, + "learning_rate": 9.594210804624608e-07, + "loss": 0.1619, + "step": 2435 + }, + { + "epoch": 0.16, + "grad_norm": 0.5665595307354613, + "learning_rate": 9.593803161668511e-07, + "loss": 0.1135, + "step": 2436 + }, + { + "epoch": 0.16, + "grad_norm": 0.4015014708988468, + "learning_rate": 9.59339532273154e-07, + "loss": 0.1545, + "step": 2437 + }, + { + "epoch": 0.16, + "grad_norm": 0.7615373792978193, + "learning_rate": 9.59298728783109e-07, + "loss": 0.2178, + "step": 2438 + }, + { + "epoch": 0.16, + "grad_norm": 0.3709279526175329, + "learning_rate": 9.59257905698457e-07, + "loss": 0.1777, + "step": 2439 + }, + { + "epoch": 0.16, + "grad_norm": 0.16915273068108697, + "learning_rate": 9.592170630209393e-07, + "loss": 0.0898, + "step": 2440 + }, + { + "epoch": 0.16, + "grad_norm": 0.4554536599748477, + "learning_rate": 9.591762007522986e-07, + "loss": 0.2354, + "step": 2441 + }, + { + "epoch": 0.16, + "grad_norm": 0.6277257649198804, + "learning_rate": 9.591353188942782e-07, + "loss": 0.2001, + "step": 2442 + }, + { + "epoch": 0.16, + "grad_norm": 0.7339984100844941, + "learning_rate": 9.59094417448622e-07, + "loss": 0.1953, + "step": 2443 + }, + { + "epoch": 0.16, + "grad_norm": 0.3210775223225176, + "learning_rate": 9.590534964170751e-07, + "loss": 0.0939, + "step": 2444 + }, + { + "epoch": 0.16, + "grad_norm": 0.4799228540673482, + "learning_rate": 9.590125558013833e-07, + "loss": 0.2605, + "step": 2445 + }, + { + "epoch": 0.16, + "grad_norm": 0.5003199535506793, + "learning_rate": 9.589715956032931e-07, + "loss": 0.1503, + "step": 2446 + }, + { + "epoch": 0.16, + "grad_norm": 1.2093919587016295, + "learning_rate": 9.589306158245519e-07, + "loss": 0.0823, + "step": 2447 + }, + { + "epoch": 0.16, + "grad_norm": 1.211062135323147, + "learning_rate": 9.588896164669083e-07, + "loss": 0.0798, + "step": 2448 + }, + { + "epoch": 0.16, + "grad_norm": 0.8255251624391429, + "learning_rate": 9.58848597532111e-07, + "loss": 0.3699, + "step": 2449 + }, + { + "epoch": 0.16, + "grad_norm": 0.7480180932204308, + "learning_rate": 9.5880755902191e-07, + "loss": 0.1169, + "step": 2450 + }, + { + "epoch": 0.16, + "grad_norm": 0.5885945131637391, + "learning_rate": 9.587665009380564e-07, + "loss": 0.2496, + "step": 2451 + }, + { + "epoch": 0.16, + "grad_norm": 0.6911377401986551, + "learning_rate": 9.587254232823017e-07, + "loss": 0.1732, + "step": 2452 + }, + { + "epoch": 0.16, + "grad_norm": 0.2802853173063493, + "learning_rate": 9.586843260563981e-07, + "loss": 0.2303, + "step": 2453 + }, + { + "epoch": 0.16, + "grad_norm": 0.5706821042495621, + "learning_rate": 9.586432092620993e-07, + "loss": 0.1628, + "step": 2454 + }, + { + "epoch": 0.16, + "grad_norm": 0.5149533528375522, + "learning_rate": 9.586020729011591e-07, + "loss": 0.0887, + "step": 2455 + }, + { + "epoch": 0.16, + "grad_norm": 0.8646381630516424, + "learning_rate": 9.585609169753323e-07, + "loss": 0.1842, + "step": 2456 + }, + { + "epoch": 0.16, + "grad_norm": 0.5109092992198471, + "learning_rate": 9.585197414863754e-07, + "loss": 0.1276, + "step": 2457 + }, + { + "epoch": 0.16, + "grad_norm": 0.34629214369350364, + "learning_rate": 9.584785464360442e-07, + "loss": 0.0915, + "step": 2458 + }, + { + "epoch": 0.16, + "grad_norm": 0.7161245704669781, + "learning_rate": 9.584373318260968e-07, + "loss": 0.1963, + "step": 2459 + }, + { + "epoch": 0.16, + "grad_norm": 0.4039344109309137, + "learning_rate": 9.583960976582913e-07, + "loss": 0.1153, + "step": 2460 + }, + { + "epoch": 0.16, + "grad_norm": 0.7991041837940455, + "learning_rate": 9.583548439343864e-07, + "loss": 0.2508, + "step": 2461 + }, + { + "epoch": 0.16, + "grad_norm": 0.39917016746617423, + "learning_rate": 9.583135706561427e-07, + "loss": 0.1676, + "step": 2462 + }, + { + "epoch": 0.16, + "grad_norm": 0.4471927740248249, + "learning_rate": 9.582722778253209e-07, + "loss": 0.0567, + "step": 2463 + }, + { + "epoch": 0.16, + "grad_norm": 0.5560947222512375, + "learning_rate": 9.582309654436824e-07, + "loss": 0.0954, + "step": 2464 + }, + { + "epoch": 0.16, + "grad_norm": 0.7713230345468085, + "learning_rate": 9.581896335129896e-07, + "loss": 0.1478, + "step": 2465 + }, + { + "epoch": 0.16, + "grad_norm": 0.5992686563278361, + "learning_rate": 9.581482820350062e-07, + "loss": 0.199, + "step": 2466 + }, + { + "epoch": 0.16, + "grad_norm": 0.5839880897771591, + "learning_rate": 9.581069110114959e-07, + "loss": 0.0778, + "step": 2467 + }, + { + "epoch": 0.16, + "grad_norm": 0.7665261128257583, + "learning_rate": 9.58065520444224e-07, + "loss": 0.2288, + "step": 2468 + }, + { + "epoch": 0.16, + "grad_norm": 0.3608221879220619, + "learning_rate": 9.58024110334956e-07, + "loss": 0.1416, + "step": 2469 + }, + { + "epoch": 0.16, + "grad_norm": 0.6580402781787531, + "learning_rate": 9.579826806854587e-07, + "loss": 0.2062, + "step": 2470 + }, + { + "epoch": 0.16, + "grad_norm": 0.616037261545377, + "learning_rate": 9.579412314974996e-07, + "loss": 0.1413, + "step": 2471 + }, + { + "epoch": 0.16, + "grad_norm": 0.3845961631809863, + "learning_rate": 9.578997627728472e-07, + "loss": 0.1416, + "step": 2472 + }, + { + "epoch": 0.16, + "grad_norm": 0.6196184136165247, + "learning_rate": 9.578582745132702e-07, + "loss": 0.4322, + "step": 2473 + }, + { + "epoch": 0.16, + "grad_norm": 0.6062145541363272, + "learning_rate": 9.57816766720539e-07, + "loss": 0.0266, + "step": 2474 + }, + { + "epoch": 0.16, + "grad_norm": 0.4826225528256578, + "learning_rate": 9.57775239396424e-07, + "loss": 0.0091, + "step": 2475 + }, + { + "epoch": 0.16, + "grad_norm": 0.39785134353497276, + "learning_rate": 9.577336925426972e-07, + "loss": 0.1898, + "step": 2476 + }, + { + "epoch": 0.16, + "grad_norm": 0.6171680118428898, + "learning_rate": 9.576921261611308e-07, + "loss": 0.1495, + "step": 2477 + }, + { + "epoch": 0.16, + "grad_norm": 0.7923348973742145, + "learning_rate": 9.576505402534984e-07, + "loss": 0.1283, + "step": 2478 + }, + { + "epoch": 0.16, + "grad_norm": 0.45345901198394867, + "learning_rate": 9.576089348215738e-07, + "loss": 0.2268, + "step": 2479 + }, + { + "epoch": 0.16, + "grad_norm": 0.48493822051526925, + "learning_rate": 9.575673098671322e-07, + "loss": 0.2149, + "step": 2480 + }, + { + "epoch": 0.16, + "grad_norm": 0.7903018784896051, + "learning_rate": 9.575256653919492e-07, + "loss": 0.1499, + "step": 2481 + }, + { + "epoch": 0.16, + "grad_norm": 0.3313231203351894, + "learning_rate": 9.574840013978016e-07, + "loss": 0.0865, + "step": 2482 + }, + { + "epoch": 0.16, + "grad_norm": 0.37845476055638494, + "learning_rate": 9.57442317886467e-07, + "loss": 0.3144, + "step": 2483 + }, + { + "epoch": 0.16, + "grad_norm": 1.8310784224755157, + "learning_rate": 9.574006148597237e-07, + "loss": 0.2645, + "step": 2484 + }, + { + "epoch": 0.16, + "grad_norm": 0.5913650808489613, + "learning_rate": 9.573588923193504e-07, + "loss": 0.1877, + "step": 2485 + }, + { + "epoch": 0.16, + "grad_norm": 0.39724934213196095, + "learning_rate": 9.573171502671272e-07, + "loss": 0.1766, + "step": 2486 + }, + { + "epoch": 0.16, + "grad_norm": 0.49356068507374784, + "learning_rate": 9.572753887048352e-07, + "loss": 0.2308, + "step": 2487 + }, + { + "epoch": 0.16, + "grad_norm": 0.6912161352337316, + "learning_rate": 9.57233607634256e-07, + "loss": 0.2348, + "step": 2488 + }, + { + "epoch": 0.16, + "grad_norm": 0.6673408530739752, + "learning_rate": 9.57191807057172e-07, + "loss": 0.182, + "step": 2489 + }, + { + "epoch": 0.16, + "grad_norm": 0.11580191835952931, + "learning_rate": 9.57149986975366e-07, + "loss": 0.0047, + "step": 2490 + }, + { + "epoch": 0.16, + "grad_norm": 0.3003908593523906, + "learning_rate": 9.571081473906231e-07, + "loss": 0.0654, + "step": 2491 + }, + { + "epoch": 0.16, + "grad_norm": 0.43900193929379083, + "learning_rate": 9.570662883047273e-07, + "loss": 0.131, + "step": 2492 + }, + { + "epoch": 0.16, + "grad_norm": 0.5021973118101629, + "learning_rate": 9.570244097194652e-07, + "loss": 0.1366, + "step": 2493 + }, + { + "epoch": 0.16, + "grad_norm": 0.513413478436164, + "learning_rate": 9.569825116366229e-07, + "loss": 0.1909, + "step": 2494 + }, + { + "epoch": 0.16, + "grad_norm": 0.975627560256554, + "learning_rate": 9.569405940579879e-07, + "loss": 0.1584, + "step": 2495 + }, + { + "epoch": 0.16, + "grad_norm": 0.9213510930240534, + "learning_rate": 9.568986569853487e-07, + "loss": 0.356, + "step": 2496 + }, + { + "epoch": 0.16, + "grad_norm": 0.5005513591545933, + "learning_rate": 9.568567004204942e-07, + "loss": 0.424, + "step": 2497 + }, + { + "epoch": 0.16, + "grad_norm": 0.3215748174192554, + "learning_rate": 9.568147243652145e-07, + "loss": 0.2754, + "step": 2498 + }, + { + "epoch": 0.16, + "grad_norm": 0.592545580116721, + "learning_rate": 9.567727288213004e-07, + "loss": 0.2503, + "step": 2499 + }, + { + "epoch": 0.16, + "grad_norm": 0.32801051559945615, + "learning_rate": 9.567307137905433e-07, + "loss": 0.239, + "step": 2500 + }, + { + "epoch": 0.16, + "grad_norm": 0.7970070432843507, + "learning_rate": 9.56688679274736e-07, + "loss": 0.2516, + "step": 2501 + }, + { + "epoch": 0.16, + "grad_norm": 0.6526024385015151, + "learning_rate": 9.566466252756717e-07, + "loss": 0.2077, + "step": 2502 + }, + { + "epoch": 0.16, + "grad_norm": 0.27808512487183307, + "learning_rate": 9.566045517951443e-07, + "loss": 0.2123, + "step": 2503 + }, + { + "epoch": 0.16, + "grad_norm": 0.6026659699645546, + "learning_rate": 9.565624588349486e-07, + "loss": 0.3233, + "step": 2504 + }, + { + "epoch": 0.16, + "grad_norm": 0.421675142640018, + "learning_rate": 9.565203463968807e-07, + "loss": 0.2718, + "step": 2505 + }, + { + "epoch": 0.16, + "grad_norm": 0.9978363566766066, + "learning_rate": 9.564782144827372e-07, + "loss": 0.2785, + "step": 2506 + }, + { + "epoch": 0.16, + "grad_norm": 0.5901030590545348, + "learning_rate": 9.564360630943153e-07, + "loss": 0.143, + "step": 2507 + }, + { + "epoch": 0.16, + "grad_norm": 0.8699076113639252, + "learning_rate": 9.563938922334136e-07, + "loss": 0.3662, + "step": 2508 + }, + { + "epoch": 0.16, + "grad_norm": 0.5544078938630749, + "learning_rate": 9.563517019018308e-07, + "loss": 0.2223, + "step": 2509 + }, + { + "epoch": 0.16, + "grad_norm": 0.9647226312710868, + "learning_rate": 9.563094921013672e-07, + "loss": 0.0912, + "step": 2510 + }, + { + "epoch": 0.16, + "grad_norm": 0.4990970038813086, + "learning_rate": 9.562672628338231e-07, + "loss": 0.1941, + "step": 2511 + }, + { + "epoch": 0.16, + "grad_norm": 0.15629989285131096, + "learning_rate": 9.562250141010006e-07, + "loss": 0.1158, + "step": 2512 + }, + { + "epoch": 0.16, + "grad_norm": 0.9254353943652026, + "learning_rate": 9.561827459047017e-07, + "loss": 0.2113, + "step": 2513 + }, + { + "epoch": 0.16, + "grad_norm": 0.3158188291288584, + "learning_rate": 9.5614045824673e-07, + "loss": 0.1268, + "step": 2514 + }, + { + "epoch": 0.16, + "grad_norm": 1.4468005883944894, + "learning_rate": 9.560981511288892e-07, + "loss": 0.3139, + "step": 2515 + }, + { + "epoch": 0.16, + "grad_norm": 0.3096031642338491, + "learning_rate": 9.560558245529847e-07, + "loss": 0.0759, + "step": 2516 + }, + { + "epoch": 0.16, + "grad_norm": 0.5855834841151715, + "learning_rate": 9.560134785208215e-07, + "loss": 0.2879, + "step": 2517 + }, + { + "epoch": 0.16, + "grad_norm": 0.6364203530360595, + "learning_rate": 9.55971113034207e-07, + "loss": 0.3691, + "step": 2518 + }, + { + "epoch": 0.16, + "grad_norm": 0.5258385288479613, + "learning_rate": 9.55928728094948e-07, + "loss": 0.1821, + "step": 2519 + }, + { + "epoch": 0.16, + "grad_norm": 0.31328462882732316, + "learning_rate": 9.558863237048528e-07, + "loss": 0.2089, + "step": 2520 + }, + { + "epoch": 0.16, + "grad_norm": 0.4188983438063981, + "learning_rate": 9.55843899865731e-07, + "loss": 0.2094, + "step": 2521 + }, + { + "epoch": 0.16, + "grad_norm": 0.7633257798796065, + "learning_rate": 9.558014565793917e-07, + "loss": 0.2042, + "step": 2522 + }, + { + "epoch": 0.16, + "grad_norm": 0.3434334507804224, + "learning_rate": 9.557589938476462e-07, + "loss": 0.0868, + "step": 2523 + }, + { + "epoch": 0.16, + "grad_norm": 1.4018431164075182, + "learning_rate": 9.557165116723056e-07, + "loss": 0.3267, + "step": 2524 + }, + { + "epoch": 0.16, + "grad_norm": 0.44637835500273204, + "learning_rate": 9.556740100551827e-07, + "loss": 0.1597, + "step": 2525 + }, + { + "epoch": 0.16, + "grad_norm": 0.2952894519625469, + "learning_rate": 9.556314889980906e-07, + "loss": 0.1849, + "step": 2526 + }, + { + "epoch": 0.16, + "grad_norm": 0.606974143033971, + "learning_rate": 9.55588948502843e-07, + "loss": 0.2545, + "step": 2527 + }, + { + "epoch": 0.16, + "grad_norm": 0.808011575425698, + "learning_rate": 9.55546388571255e-07, + "loss": 0.4429, + "step": 2528 + }, + { + "epoch": 0.16, + "grad_norm": 0.38018435236573644, + "learning_rate": 9.555038092051424e-07, + "loss": 0.1824, + "step": 2529 + }, + { + "epoch": 0.16, + "grad_norm": 0.5634119634022953, + "learning_rate": 9.554612104063218e-07, + "loss": 0.2054, + "step": 2530 + }, + { + "epoch": 0.16, + "grad_norm": 0.4168603419719356, + "learning_rate": 9.5541859217661e-07, + "loss": 0.055, + "step": 2531 + }, + { + "epoch": 0.16, + "grad_norm": 0.7034347148926832, + "learning_rate": 9.55375954517826e-07, + "loss": 0.4097, + "step": 2532 + }, + { + "epoch": 0.16, + "grad_norm": 0.6641957985703538, + "learning_rate": 9.553332974317881e-07, + "loss": 0.0477, + "step": 2533 + }, + { + "epoch": 0.16, + "grad_norm": 0.16677956900373406, + "learning_rate": 9.552906209203164e-07, + "loss": 0.093, + "step": 2534 + }, + { + "epoch": 0.16, + "grad_norm": 0.672648093655843, + "learning_rate": 9.552479249852314e-07, + "loss": 0.2203, + "step": 2535 + }, + { + "epoch": 0.16, + "grad_norm": 0.9316536883359925, + "learning_rate": 9.55205209628355e-07, + "loss": 0.0684, + "step": 2536 + }, + { + "epoch": 0.16, + "grad_norm": 0.7425895926971369, + "learning_rate": 9.551624748515093e-07, + "loss": 0.3351, + "step": 2537 + }, + { + "epoch": 0.16, + "grad_norm": 0.5577282767065026, + "learning_rate": 9.551197206565172e-07, + "loss": 0.25, + "step": 2538 + }, + { + "epoch": 0.16, + "grad_norm": 0.7144229171110558, + "learning_rate": 9.55076947045203e-07, + "loss": 0.0449, + "step": 2539 + }, + { + "epoch": 0.16, + "grad_norm": 0.6754350692010835, + "learning_rate": 9.550341540193915e-07, + "loss": 0.239, + "step": 2540 + }, + { + "epoch": 0.16, + "grad_norm": 0.48017233439651535, + "learning_rate": 9.549913415809083e-07, + "loss": 0.1089, + "step": 2541 + }, + { + "epoch": 0.16, + "grad_norm": 0.2157533579531004, + "learning_rate": 9.549485097315797e-07, + "loss": 0.0096, + "step": 2542 + }, + { + "epoch": 0.16, + "grad_norm": 0.593163716860367, + "learning_rate": 9.549056584732332e-07, + "loss": 0.1677, + "step": 2543 + }, + { + "epoch": 0.16, + "grad_norm": 0.9026525850890124, + "learning_rate": 9.54862787807697e-07, + "loss": 0.0445, + "step": 2544 + }, + { + "epoch": 0.16, + "grad_norm": 0.7539484755867265, + "learning_rate": 9.548198977367997e-07, + "loss": 0.3909, + "step": 2545 + }, + { + "epoch": 0.16, + "grad_norm": 0.5651341354415033, + "learning_rate": 9.547769882623711e-07, + "loss": 0.2107, + "step": 2546 + }, + { + "epoch": 0.16, + "grad_norm": 0.5613177694374492, + "learning_rate": 9.54734059386242e-07, + "loss": 0.0349, + "step": 2547 + }, + { + "epoch": 0.16, + "grad_norm": 1.7745166693905394, + "learning_rate": 9.54691111110244e-07, + "loss": 0.1507, + "step": 2548 + }, + { + "epoch": 0.16, + "grad_norm": 0.5372854238568204, + "learning_rate": 9.54648143436209e-07, + "loss": 0.2093, + "step": 2549 + }, + { + "epoch": 0.16, + "grad_norm": 0.9249320537401026, + "learning_rate": 9.546051563659703e-07, + "loss": 0.1975, + "step": 2550 + }, + { + "epoch": 0.16, + "grad_norm": 0.7554143629377637, + "learning_rate": 9.545621499013618e-07, + "loss": 0.1915, + "step": 2551 + }, + { + "epoch": 0.16, + "grad_norm": 0.7802398204245109, + "learning_rate": 9.545191240442181e-07, + "loss": 0.1643, + "step": 2552 + }, + { + "epoch": 0.16, + "grad_norm": 0.8053293945513413, + "learning_rate": 9.54476078796375e-07, + "loss": 0.1896, + "step": 2553 + }, + { + "epoch": 0.16, + "grad_norm": 0.6111219177554211, + "learning_rate": 9.544330141596687e-07, + "loss": 0.1185, + "step": 2554 + }, + { + "epoch": 0.16, + "grad_norm": 0.7438516675839351, + "learning_rate": 9.543899301359365e-07, + "loss": 0.2293, + "step": 2555 + }, + { + "epoch": 0.16, + "grad_norm": 1.210898332409302, + "learning_rate": 9.543468267270164e-07, + "loss": 0.1928, + "step": 2556 + }, + { + "epoch": 0.16, + "grad_norm": 0.5612228283646611, + "learning_rate": 9.543037039347473e-07, + "loss": 0.2255, + "step": 2557 + }, + { + "epoch": 0.16, + "grad_norm": 0.6662477311234537, + "learning_rate": 9.54260561760969e-07, + "loss": 0.1428, + "step": 2558 + }, + { + "epoch": 0.16, + "grad_norm": 0.37963751492009234, + "learning_rate": 9.54217400207522e-07, + "loss": 0.2283, + "step": 2559 + }, + { + "epoch": 0.16, + "grad_norm": 0.48889752434070105, + "learning_rate": 9.541742192762476e-07, + "loss": 0.2742, + "step": 2560 + }, + { + "epoch": 0.16, + "grad_norm": 0.7399252688970139, + "learning_rate": 9.541310189689879e-07, + "loss": 0.3073, + "step": 2561 + }, + { + "epoch": 0.16, + "grad_norm": 0.8568202974158361, + "learning_rate": 9.54087799287586e-07, + "loss": 0.116, + "step": 2562 + }, + { + "epoch": 0.16, + "grad_norm": 0.35524417309513245, + "learning_rate": 9.540445602338859e-07, + "loss": 0.1225, + "step": 2563 + }, + { + "epoch": 0.16, + "grad_norm": 0.8459374834491684, + "learning_rate": 9.54001301809732e-07, + "loss": 0.2193, + "step": 2564 + }, + { + "epoch": 0.16, + "grad_norm": 0.36609078980197424, + "learning_rate": 9.5395802401697e-07, + "loss": 0.0446, + "step": 2565 + }, + { + "epoch": 0.16, + "grad_norm": 0.6045336995181586, + "learning_rate": 9.539147268574459e-07, + "loss": 0.3463, + "step": 2566 + }, + { + "epoch": 0.16, + "grad_norm": 0.6689902923021831, + "learning_rate": 9.538714103330073e-07, + "loss": 0.2253, + "step": 2567 + }, + { + "epoch": 0.16, + "grad_norm": 0.9679760657231808, + "learning_rate": 9.53828074445502e-07, + "loss": 0.0427, + "step": 2568 + }, + { + "epoch": 0.16, + "grad_norm": 0.11885571743426666, + "learning_rate": 9.537847191967785e-07, + "loss": 0.0941, + "step": 2569 + }, + { + "epoch": 0.16, + "grad_norm": 0.5651051388610489, + "learning_rate": 9.537413445886868e-07, + "loss": 0.0448, + "step": 2570 + }, + { + "epoch": 0.16, + "grad_norm": 0.924577730359375, + "learning_rate": 9.53697950623077e-07, + "loss": 0.3433, + "step": 2571 + }, + { + "epoch": 0.16, + "grad_norm": 1.1859239450215893, + "learning_rate": 9.536545373018009e-07, + "loss": 0.1671, + "step": 2572 + }, + { + "epoch": 0.16, + "grad_norm": 0.6339655036031436, + "learning_rate": 9.536111046267102e-07, + "loss": 0.1357, + "step": 2573 + }, + { + "epoch": 0.16, + "grad_norm": 0.9970807259261836, + "learning_rate": 9.535676525996577e-07, + "loss": 0.1535, + "step": 2574 + }, + { + "epoch": 0.16, + "grad_norm": 1.262159878337895, + "learning_rate": 9.535241812224975e-07, + "loss": 0.2395, + "step": 2575 + }, + { + "epoch": 0.16, + "grad_norm": 0.3263401335027573, + "learning_rate": 9.53480690497084e-07, + "loss": 0.1853, + "step": 2576 + }, + { + "epoch": 0.16, + "grad_norm": 0.7486868284212916, + "learning_rate": 9.534371804252726e-07, + "loss": 0.1686, + "step": 2577 + }, + { + "epoch": 0.16, + "grad_norm": 0.8823603113615602, + "learning_rate": 9.533936510089197e-07, + "loss": 0.3605, + "step": 2578 + }, + { + "epoch": 0.16, + "grad_norm": 0.5844132652817103, + "learning_rate": 9.533501022498821e-07, + "loss": 0.2721, + "step": 2579 + }, + { + "epoch": 0.16, + "grad_norm": 0.3938624575027053, + "learning_rate": 9.533065341500178e-07, + "loss": 0.0185, + "step": 2580 + }, + { + "epoch": 0.16, + "grad_norm": 1.0230096021188217, + "learning_rate": 9.532629467111855e-07, + "loss": 0.206, + "step": 2581 + }, + { + "epoch": 0.16, + "grad_norm": 0.47886346771279825, + "learning_rate": 9.532193399352448e-07, + "loss": 0.1239, + "step": 2582 + }, + { + "epoch": 0.16, + "grad_norm": 0.41974794747755956, + "learning_rate": 9.531757138240559e-07, + "loss": 0.231, + "step": 2583 + }, + { + "epoch": 0.16, + "grad_norm": 1.4259696823285501, + "learning_rate": 9.5313206837948e-07, + "loss": 0.3168, + "step": 2584 + }, + { + "epoch": 0.16, + "grad_norm": 0.21675652071370596, + "learning_rate": 9.530884036033793e-07, + "loss": 0.1029, + "step": 2585 + }, + { + "epoch": 0.16, + "grad_norm": 0.163548726262206, + "learning_rate": 9.530447194976163e-07, + "loss": 0.0652, + "step": 2586 + }, + { + "epoch": 0.16, + "grad_norm": 0.44003913761455, + "learning_rate": 9.53001016064055e-07, + "loss": 0.2302, + "step": 2587 + }, + { + "epoch": 0.17, + "grad_norm": 0.20866023849578044, + "learning_rate": 9.529572933045595e-07, + "loss": 0.0638, + "step": 2588 + }, + { + "epoch": 0.17, + "grad_norm": 0.7631673046005106, + "learning_rate": 9.529135512209955e-07, + "loss": 0.3408, + "step": 2589 + }, + { + "epoch": 0.17, + "grad_norm": 1.371969954717516, + "learning_rate": 9.528697898152288e-07, + "loss": 0.3842, + "step": 2590 + }, + { + "epoch": 0.17, + "grad_norm": 0.6568255870713744, + "learning_rate": 9.528260090891266e-07, + "loss": 0.2441, + "step": 2591 + }, + { + "epoch": 0.17, + "grad_norm": 0.41472603573328204, + "learning_rate": 9.527822090445565e-07, + "loss": 0.1132, + "step": 2592 + }, + { + "epoch": 0.17, + "grad_norm": 0.8264338215602798, + "learning_rate": 9.527383896833872e-07, + "loss": 0.3289, + "step": 2593 + }, + { + "epoch": 0.17, + "grad_norm": 0.7378902329751158, + "learning_rate": 9.52694551007488e-07, + "loss": 0.1971, + "step": 2594 + }, + { + "epoch": 0.17, + "grad_norm": 0.5173460944303994, + "learning_rate": 9.526506930187292e-07, + "loss": 0.1739, + "step": 2595 + }, + { + "epoch": 0.17, + "grad_norm": 0.8927570683629182, + "learning_rate": 9.526068157189819e-07, + "loss": 0.4867, + "step": 2596 + }, + { + "epoch": 0.17, + "grad_norm": 0.522756344053833, + "learning_rate": 9.525629191101181e-07, + "loss": 0.0821, + "step": 2597 + }, + { + "epoch": 0.17, + "grad_norm": 0.9624025756487392, + "learning_rate": 9.525190031940105e-07, + "loss": 0.1431, + "step": 2598 + }, + { + "epoch": 0.17, + "grad_norm": 0.9228900074603312, + "learning_rate": 9.524750679725323e-07, + "loss": 0.3242, + "step": 2599 + }, + { + "epoch": 0.17, + "grad_norm": 0.31772156343051866, + "learning_rate": 9.524311134475582e-07, + "loss": 0.2404, + "step": 2600 + }, + { + "epoch": 0.17, + "grad_norm": 0.5661715722613065, + "learning_rate": 9.523871396209633e-07, + "loss": 0.1122, + "step": 2601 + }, + { + "epoch": 0.17, + "grad_norm": 1.888294519797739, + "learning_rate": 9.523431464946236e-07, + "loss": 0.2555, + "step": 2602 + }, + { + "epoch": 0.17, + "grad_norm": 1.4107675325749358, + "learning_rate": 9.522991340704161e-07, + "loss": 0.3525, + "step": 2603 + }, + { + "epoch": 0.17, + "grad_norm": 0.7403158963458948, + "learning_rate": 9.522551023502181e-07, + "loss": 0.2662, + "step": 2604 + }, + { + "epoch": 0.17, + "grad_norm": 0.5424851219006864, + "learning_rate": 9.522110513359083e-07, + "loss": 0.1412, + "step": 2605 + }, + { + "epoch": 0.17, + "grad_norm": 1.4147819856967123, + "learning_rate": 9.521669810293661e-07, + "loss": 0.0952, + "step": 2606 + }, + { + "epoch": 0.17, + "grad_norm": 0.5717881857910588, + "learning_rate": 9.521228914324715e-07, + "loss": 0.0613, + "step": 2607 + }, + { + "epoch": 0.17, + "grad_norm": 1.4241907311844593, + "learning_rate": 9.520787825471055e-07, + "loss": 0.1777, + "step": 2608 + }, + { + "epoch": 0.17, + "grad_norm": 0.2865823410065802, + "learning_rate": 9.520346543751496e-07, + "loss": 0.0107, + "step": 2609 + }, + { + "epoch": 0.17, + "grad_norm": 0.8866862834291714, + "learning_rate": 9.519905069184869e-07, + "loss": 0.2322, + "step": 2610 + }, + { + "epoch": 0.17, + "grad_norm": 0.8196263524169246, + "learning_rate": 9.519463401790004e-07, + "loss": 0.2045, + "step": 2611 + }, + { + "epoch": 0.17, + "grad_norm": 0.7649383541668356, + "learning_rate": 9.519021541585748e-07, + "loss": 0.3651, + "step": 2612 + }, + { + "epoch": 0.17, + "grad_norm": 0.44313444995716733, + "learning_rate": 9.518579488590946e-07, + "loss": 0.3697, + "step": 2613 + }, + { + "epoch": 0.17, + "grad_norm": 0.5828627418819751, + "learning_rate": 9.51813724282446e-07, + "loss": 0.2626, + "step": 2614 + }, + { + "epoch": 0.17, + "grad_norm": 0.2969889590099513, + "learning_rate": 9.517694804305156e-07, + "loss": 0.182, + "step": 2615 + }, + { + "epoch": 0.17, + "grad_norm": 0.20698729443897265, + "learning_rate": 9.517252173051911e-07, + "loss": 0.004, + "step": 2616 + }, + { + "epoch": 0.17, + "grad_norm": 0.9516054489380347, + "learning_rate": 9.516809349083608e-07, + "loss": 0.1901, + "step": 2617 + }, + { + "epoch": 0.17, + "grad_norm": 0.3040730187488041, + "learning_rate": 9.516366332419137e-07, + "loss": 0.07, + "step": 2618 + }, + { + "epoch": 0.17, + "grad_norm": 1.7427736545565709, + "learning_rate": 9.515923123077399e-07, + "loss": 0.3721, + "step": 2619 + }, + { + "epoch": 0.17, + "grad_norm": 1.4227813936269338, + "learning_rate": 9.515479721077303e-07, + "loss": 0.1311, + "step": 2620 + }, + { + "epoch": 0.17, + "grad_norm": 0.9890664145268651, + "learning_rate": 9.515036126437766e-07, + "loss": 0.1963, + "step": 2621 + }, + { + "epoch": 0.17, + "grad_norm": 0.4338410237920062, + "learning_rate": 9.514592339177709e-07, + "loss": 0.1358, + "step": 2622 + }, + { + "epoch": 0.17, + "grad_norm": 0.6461712191502266, + "learning_rate": 9.514148359316069e-07, + "loss": 0.1093, + "step": 2623 + }, + { + "epoch": 0.17, + "grad_norm": 1.0781316168167339, + "learning_rate": 9.513704186871785e-07, + "loss": 0.5929, + "step": 2624 + }, + { + "epoch": 0.17, + "grad_norm": 0.5937921628955494, + "learning_rate": 9.513259821863806e-07, + "loss": 0.025, + "step": 2625 + }, + { + "epoch": 0.17, + "grad_norm": 2.2050441558443006, + "learning_rate": 9.512815264311092e-07, + "loss": 0.3322, + "step": 2626 + }, + { + "epoch": 0.17, + "grad_norm": 0.6447535671899982, + "learning_rate": 9.512370514232606e-07, + "loss": 0.2322, + "step": 2627 + }, + { + "epoch": 0.17, + "grad_norm": 0.38982095788275767, + "learning_rate": 9.511925571647322e-07, + "loss": 0.1749, + "step": 2628 + }, + { + "epoch": 0.17, + "grad_norm": 1.2553585210304645, + "learning_rate": 9.511480436574224e-07, + "loss": 0.2043, + "step": 2629 + }, + { + "epoch": 0.17, + "grad_norm": 0.6165582812152801, + "learning_rate": 9.511035109032301e-07, + "loss": 0.0743, + "step": 2630 + }, + { + "epoch": 0.17, + "grad_norm": 1.3624701319014514, + "learning_rate": 9.510589589040552e-07, + "loss": 0.3819, + "step": 2631 + }, + { + "epoch": 0.17, + "grad_norm": 0.4195544643620065, + "learning_rate": 9.510143876617985e-07, + "loss": 0.1449, + "step": 2632 + }, + { + "epoch": 0.17, + "grad_norm": 0.2960278562831275, + "learning_rate": 9.509697971783612e-07, + "loss": 0.0074, + "step": 2633 + }, + { + "epoch": 0.17, + "grad_norm": 0.45621861203351, + "learning_rate": 9.509251874556459e-07, + "loss": 0.2407, + "step": 2634 + }, + { + "epoch": 0.17, + "grad_norm": 0.9044462460962452, + "learning_rate": 9.508805584955555e-07, + "loss": 0.0322, + "step": 2635 + }, + { + "epoch": 0.17, + "grad_norm": 0.6058690813013976, + "learning_rate": 9.508359102999941e-07, + "loss": 0.2087, + "step": 2636 + }, + { + "epoch": 0.17, + "grad_norm": 0.7613827905763582, + "learning_rate": 9.507912428708666e-07, + "loss": 0.3585, + "step": 2637 + }, + { + "epoch": 0.17, + "grad_norm": 0.6886394154987239, + "learning_rate": 9.507465562100784e-07, + "loss": 0.2638, + "step": 2638 + }, + { + "epoch": 0.17, + "grad_norm": 0.7294311137348841, + "learning_rate": 9.507018503195361e-07, + "loss": 0.1544, + "step": 2639 + }, + { + "epoch": 0.17, + "grad_norm": 0.5800813312282401, + "learning_rate": 9.506571252011466e-07, + "loss": 0.2576, + "step": 2640 + }, + { + "epoch": 0.17, + "grad_norm": 0.982939595532959, + "learning_rate": 9.506123808568185e-07, + "loss": 0.1237, + "step": 2641 + }, + { + "epoch": 0.17, + "grad_norm": 0.49116126395133736, + "learning_rate": 9.505676172884601e-07, + "loss": 0.1741, + "step": 2642 + }, + { + "epoch": 0.17, + "grad_norm": 0.8696972031988437, + "learning_rate": 9.505228344979817e-07, + "loss": 0.2839, + "step": 2643 + }, + { + "epoch": 0.17, + "grad_norm": 0.45227011206891493, + "learning_rate": 9.504780324872932e-07, + "loss": 0.1896, + "step": 2644 + }, + { + "epoch": 0.17, + "grad_norm": 0.9564678587797452, + "learning_rate": 9.504332112583064e-07, + "loss": 0.3192, + "step": 2645 + }, + { + "epoch": 0.17, + "grad_norm": 0.6422774084741891, + "learning_rate": 9.503883708129334e-07, + "loss": 0.2231, + "step": 2646 + }, + { + "epoch": 0.17, + "grad_norm": 0.9070393905137192, + "learning_rate": 9.50343511153087e-07, + "loss": 0.2916, + "step": 2647 + }, + { + "epoch": 0.17, + "grad_norm": 0.7973637615785845, + "learning_rate": 9.502986322806811e-07, + "loss": 0.5046, + "step": 2648 + }, + { + "epoch": 0.17, + "grad_norm": 0.7107042441039871, + "learning_rate": 9.502537341976305e-07, + "loss": 0.1953, + "step": 2649 + }, + { + "epoch": 0.17, + "grad_norm": 0.6169596807696893, + "learning_rate": 9.502088169058503e-07, + "loss": 0.1974, + "step": 2650 + }, + { + "epoch": 0.17, + "grad_norm": 0.5224394109098526, + "learning_rate": 9.501638804072569e-07, + "loss": 0.2402, + "step": 2651 + }, + { + "epoch": 0.17, + "grad_norm": 0.3155133421120215, + "learning_rate": 9.501189247037675e-07, + "loss": 0.0924, + "step": 2652 + }, + { + "epoch": 0.17, + "grad_norm": 0.4912769084674077, + "learning_rate": 9.500739497973e-07, + "loss": 0.1181, + "step": 2653 + }, + { + "epoch": 0.17, + "grad_norm": 0.8258919552020955, + "learning_rate": 9.500289556897729e-07, + "loss": 0.254, + "step": 2654 + }, + { + "epoch": 0.17, + "grad_norm": 0.5399399057606297, + "learning_rate": 9.49983942383106e-07, + "loss": 0.2062, + "step": 2655 + }, + { + "epoch": 0.17, + "grad_norm": 0.48393195306922543, + "learning_rate": 9.499389098792196e-07, + "loss": 0.1076, + "step": 2656 + }, + { + "epoch": 0.17, + "grad_norm": 1.0377411795960845, + "learning_rate": 9.498938581800347e-07, + "loss": 0.1113, + "step": 2657 + }, + { + "epoch": 0.17, + "grad_norm": 0.4560969694650829, + "learning_rate": 9.498487872874733e-07, + "loss": 0.3368, + "step": 2658 + }, + { + "epoch": 0.17, + "grad_norm": 0.9862958881707448, + "learning_rate": 9.498036972034584e-07, + "loss": 0.1763, + "step": 2659 + }, + { + "epoch": 0.17, + "grad_norm": 0.6833198751285896, + "learning_rate": 9.497585879299137e-07, + "loss": 0.28, + "step": 2660 + }, + { + "epoch": 0.17, + "grad_norm": 1.4888347324512454, + "learning_rate": 9.497134594687634e-07, + "loss": 0.2217, + "step": 2661 + }, + { + "epoch": 0.17, + "grad_norm": 0.5719202825316656, + "learning_rate": 9.496683118219328e-07, + "loss": 0.2603, + "step": 2662 + }, + { + "epoch": 0.17, + "grad_norm": 0.7848817116848156, + "learning_rate": 9.496231449913482e-07, + "loss": 0.0865, + "step": 2663 + }, + { + "epoch": 0.17, + "grad_norm": 0.5583436160757501, + "learning_rate": 9.495779589789364e-07, + "loss": 0.1155, + "step": 2664 + }, + { + "epoch": 0.17, + "grad_norm": 0.29229710747176135, + "learning_rate": 9.495327537866249e-07, + "loss": 0.0897, + "step": 2665 + }, + { + "epoch": 0.17, + "grad_norm": 0.6378875468827189, + "learning_rate": 9.494875294163427e-07, + "loss": 0.3491, + "step": 2666 + }, + { + "epoch": 0.17, + "grad_norm": 0.7831721155048247, + "learning_rate": 9.494422858700187e-07, + "loss": 0.2136, + "step": 2667 + }, + { + "epoch": 0.17, + "grad_norm": 0.7366203540518024, + "learning_rate": 9.493970231495834e-07, + "loss": 0.3171, + "step": 2668 + }, + { + "epoch": 0.17, + "grad_norm": 0.3410124132656613, + "learning_rate": 9.493517412569678e-07, + "loss": 0.0108, + "step": 2669 + }, + { + "epoch": 0.17, + "grad_norm": 2.0068122108176234, + "learning_rate": 9.493064401941034e-07, + "loss": 0.1751, + "step": 2670 + }, + { + "epoch": 0.17, + "grad_norm": 0.8529491418660656, + "learning_rate": 9.492611199629232e-07, + "loss": 0.2523, + "step": 2671 + }, + { + "epoch": 0.17, + "grad_norm": 0.25242722415960495, + "learning_rate": 9.492157805653604e-07, + "loss": 0.1797, + "step": 2672 + }, + { + "epoch": 0.17, + "grad_norm": 1.085744296075026, + "learning_rate": 9.491704220033494e-07, + "loss": 0.1487, + "step": 2673 + }, + { + "epoch": 0.17, + "grad_norm": 0.5368273019639394, + "learning_rate": 9.491250442788252e-07, + "loss": 0.1409, + "step": 2674 + }, + { + "epoch": 0.17, + "grad_norm": 0.7625266156549131, + "learning_rate": 9.490796473937238e-07, + "loss": 0.2027, + "step": 2675 + }, + { + "epoch": 0.17, + "grad_norm": 0.48853974831983377, + "learning_rate": 9.49034231349982e-07, + "loss": 0.0527, + "step": 2676 + }, + { + "epoch": 0.17, + "grad_norm": 0.5674460569210614, + "learning_rate": 9.48988796149537e-07, + "loss": 0.1686, + "step": 2677 + }, + { + "epoch": 0.17, + "grad_norm": 0.3302941832085532, + "learning_rate": 9.489433417943275e-07, + "loss": 0.2061, + "step": 2678 + }, + { + "epoch": 0.17, + "grad_norm": 0.6617601654386271, + "learning_rate": 9.488978682862925e-07, + "loss": 0.0978, + "step": 2679 + }, + { + "epoch": 0.17, + "grad_norm": 1.2120206993789706, + "learning_rate": 9.488523756273721e-07, + "loss": 0.1629, + "step": 2680 + }, + { + "epoch": 0.17, + "grad_norm": 0.27118500248326494, + "learning_rate": 9.488068638195071e-07, + "loss": 0.1445, + "step": 2681 + }, + { + "epoch": 0.17, + "grad_norm": 1.2673674372884545, + "learning_rate": 9.487613328646389e-07, + "loss": 0.1109, + "step": 2682 + }, + { + "epoch": 0.17, + "grad_norm": 0.39638542360387147, + "learning_rate": 9.487157827647101e-07, + "loss": 0.1716, + "step": 2683 + }, + { + "epoch": 0.17, + "grad_norm": 0.8976279257317973, + "learning_rate": 9.486702135216643e-07, + "loss": 0.3226, + "step": 2684 + }, + { + "epoch": 0.17, + "grad_norm": 0.8303920226304105, + "learning_rate": 9.486246251374449e-07, + "loss": 0.253, + "step": 2685 + }, + { + "epoch": 0.17, + "grad_norm": 1.5509054405484801, + "learning_rate": 9.485790176139972e-07, + "loss": 0.2561, + "step": 2686 + }, + { + "epoch": 0.17, + "grad_norm": 0.4830660775150088, + "learning_rate": 9.485333909532669e-07, + "loss": 0.1147, + "step": 2687 + }, + { + "epoch": 0.17, + "grad_norm": 0.8118934988535611, + "learning_rate": 9.484877451572007e-07, + "loss": 0.4417, + "step": 2688 + }, + { + "epoch": 0.17, + "grad_norm": 0.3074240992680236, + "learning_rate": 9.484420802277454e-07, + "loss": 0.0119, + "step": 2689 + }, + { + "epoch": 0.17, + "grad_norm": 0.4397118420494491, + "learning_rate": 9.483963961668496e-07, + "loss": 0.3114, + "step": 2690 + }, + { + "epoch": 0.17, + "grad_norm": 0.5599381421398959, + "learning_rate": 9.483506929764622e-07, + "loss": 0.1553, + "step": 2691 + }, + { + "epoch": 0.17, + "grad_norm": 0.5900923425201272, + "learning_rate": 9.483049706585329e-07, + "loss": 0.0354, + "step": 2692 + }, + { + "epoch": 0.17, + "grad_norm": 0.7315701933263555, + "learning_rate": 9.482592292150122e-07, + "loss": 0.1048, + "step": 2693 + }, + { + "epoch": 0.17, + "grad_norm": 0.5995646513358464, + "learning_rate": 9.482134686478518e-07, + "loss": 0.2526, + "step": 2694 + }, + { + "epoch": 0.17, + "grad_norm": 0.5594556337693576, + "learning_rate": 9.481676889590038e-07, + "loss": 0.2493, + "step": 2695 + }, + { + "epoch": 0.17, + "grad_norm": 1.006129048690075, + "learning_rate": 9.481218901504214e-07, + "loss": 0.4051, + "step": 2696 + }, + { + "epoch": 0.17, + "grad_norm": 0.46685641387092186, + "learning_rate": 9.480760722240582e-07, + "loss": 0.2618, + "step": 2697 + }, + { + "epoch": 0.17, + "grad_norm": 1.2050649689990065, + "learning_rate": 9.480302351818689e-07, + "loss": 0.1889, + "step": 2698 + }, + { + "epoch": 0.17, + "grad_norm": 0.35983640486842944, + "learning_rate": 9.479843790258093e-07, + "loss": 0.1236, + "step": 2699 + }, + { + "epoch": 0.17, + "grad_norm": 0.6665101621244433, + "learning_rate": 9.479385037578354e-07, + "loss": 0.3935, + "step": 2700 + }, + { + "epoch": 0.17, + "grad_norm": 0.7461767823578016, + "learning_rate": 9.478926093799045e-07, + "loss": 0.0802, + "step": 2701 + }, + { + "epoch": 0.17, + "grad_norm": 0.729629630723568, + "learning_rate": 9.478466958939745e-07, + "loss": 0.2092, + "step": 2702 + }, + { + "epoch": 0.17, + "grad_norm": 0.5086910171019614, + "learning_rate": 9.478007633020042e-07, + "loss": 0.5959, + "step": 2703 + }, + { + "epoch": 0.17, + "grad_norm": 1.004591934260547, + "learning_rate": 9.47754811605953e-07, + "loss": 0.2812, + "step": 2704 + }, + { + "epoch": 0.17, + "grad_norm": 0.46586024315715674, + "learning_rate": 9.477088408077816e-07, + "loss": 0.1034, + "step": 2705 + }, + { + "epoch": 0.17, + "grad_norm": 0.6870177141639254, + "learning_rate": 9.476628509094511e-07, + "loss": 0.0949, + "step": 2706 + }, + { + "epoch": 0.17, + "grad_norm": 0.890797400301904, + "learning_rate": 9.476168419129234e-07, + "loss": 0.2011, + "step": 2707 + }, + { + "epoch": 0.17, + "grad_norm": 0.4284635485018434, + "learning_rate": 9.475708138201612e-07, + "loss": 0.0761, + "step": 2708 + }, + { + "epoch": 0.17, + "grad_norm": 0.49520868939639545, + "learning_rate": 9.475247666331285e-07, + "loss": 0.1412, + "step": 2709 + }, + { + "epoch": 0.17, + "grad_norm": 0.7249237889122515, + "learning_rate": 9.474787003537897e-07, + "loss": 0.305, + "step": 2710 + }, + { + "epoch": 0.17, + "grad_norm": 0.3673553451629228, + "learning_rate": 9.474326149841099e-07, + "loss": 0.1136, + "step": 2711 + }, + { + "epoch": 0.17, + "grad_norm": 0.9513134842408071, + "learning_rate": 9.473865105260554e-07, + "loss": 0.0823, + "step": 2712 + }, + { + "epoch": 0.17, + "grad_norm": 0.275082949196564, + "learning_rate": 9.47340386981593e-07, + "loss": 0.0768, + "step": 2713 + }, + { + "epoch": 0.17, + "grad_norm": 0.38376522592074097, + "learning_rate": 9.472942443526903e-07, + "loss": 0.016, + "step": 2714 + }, + { + "epoch": 0.17, + "grad_norm": 0.6354252208438675, + "learning_rate": 9.472480826413161e-07, + "loss": 0.2452, + "step": 2715 + }, + { + "epoch": 0.17, + "grad_norm": 0.7838325564258156, + "learning_rate": 9.472019018494395e-07, + "loss": 0.2635, + "step": 2716 + }, + { + "epoch": 0.17, + "grad_norm": 0.7046123557223944, + "learning_rate": 9.471557019790308e-07, + "loss": 0.1751, + "step": 2717 + }, + { + "epoch": 0.17, + "grad_norm": 0.1994700866665963, + "learning_rate": 9.471094830320609e-07, + "loss": 0.0422, + "step": 2718 + }, + { + "epoch": 0.17, + "grad_norm": 1.0426057348591593, + "learning_rate": 9.470632450105018e-07, + "loss": 0.1409, + "step": 2719 + }, + { + "epoch": 0.17, + "grad_norm": 0.2426962738025242, + "learning_rate": 9.470169879163258e-07, + "loss": 0.1847, + "step": 2720 + }, + { + "epoch": 0.17, + "grad_norm": 1.1177294538934748, + "learning_rate": 9.469707117515066e-07, + "loss": 0.2524, + "step": 2721 + }, + { + "epoch": 0.17, + "grad_norm": 0.843378487215984, + "learning_rate": 9.469244165180183e-07, + "loss": 0.1706, + "step": 2722 + }, + { + "epoch": 0.17, + "grad_norm": 0.526731905595799, + "learning_rate": 9.46878102217836e-07, + "loss": 0.3975, + "step": 2723 + }, + { + "epoch": 0.17, + "grad_norm": 0.37146034006379797, + "learning_rate": 9.468317688529354e-07, + "loss": 0.1483, + "step": 2724 + }, + { + "epoch": 0.17, + "grad_norm": 0.1941258775726189, + "learning_rate": 9.467854164252934e-07, + "loss": 0.0108, + "step": 2725 + }, + { + "epoch": 0.17, + "grad_norm": 0.6248765243759603, + "learning_rate": 9.467390449368873e-07, + "loss": 0.1815, + "step": 2726 + }, + { + "epoch": 0.17, + "grad_norm": 0.5849445940729799, + "learning_rate": 9.466926543896954e-07, + "loss": 0.2543, + "step": 2727 + }, + { + "epoch": 0.17, + "grad_norm": 0.6364373563334739, + "learning_rate": 9.466462447856971e-07, + "loss": 0.1878, + "step": 2728 + }, + { + "epoch": 0.17, + "grad_norm": 0.6116281503984071, + "learning_rate": 9.46599816126872e-07, + "loss": 0.1425, + "step": 2729 + }, + { + "epoch": 0.17, + "grad_norm": 0.8351472099940584, + "learning_rate": 9.465533684152009e-07, + "loss": 0.1702, + "step": 2730 + }, + { + "epoch": 0.17, + "grad_norm": 0.22066066406570445, + "learning_rate": 9.465069016526657e-07, + "loss": 0.0086, + "step": 2731 + }, + { + "epoch": 0.17, + "grad_norm": 0.5012185405558273, + "learning_rate": 9.464604158412483e-07, + "loss": 0.3493, + "step": 2732 + }, + { + "epoch": 0.17, + "grad_norm": 0.7384931294052169, + "learning_rate": 9.46413910982932e-07, + "loss": 0.1253, + "step": 2733 + }, + { + "epoch": 0.17, + "grad_norm": 0.55046884653624, + "learning_rate": 9.463673870797008e-07, + "loss": 0.1763, + "step": 2734 + }, + { + "epoch": 0.17, + "grad_norm": 0.6388104129942009, + "learning_rate": 9.463208441335398e-07, + "loss": 0.3693, + "step": 2735 + }, + { + "epoch": 0.17, + "grad_norm": 0.7641560230791178, + "learning_rate": 9.462742821464342e-07, + "loss": 0.2299, + "step": 2736 + }, + { + "epoch": 0.17, + "grad_norm": 0.2596875295580591, + "learning_rate": 9.462277011203707e-07, + "loss": 0.1694, + "step": 2737 + }, + { + "epoch": 0.17, + "grad_norm": 0.551082981625449, + "learning_rate": 9.461811010573364e-07, + "loss": 0.0953, + "step": 2738 + }, + { + "epoch": 0.17, + "grad_norm": 0.4609512229823886, + "learning_rate": 9.461344819593193e-07, + "loss": 0.0984, + "step": 2739 + }, + { + "epoch": 0.17, + "grad_norm": 0.4749796653783331, + "learning_rate": 9.460878438283085e-07, + "loss": 0.1183, + "step": 2740 + }, + { + "epoch": 0.17, + "grad_norm": 0.7080032973932419, + "learning_rate": 9.460411866662935e-07, + "loss": 0.3335, + "step": 2741 + }, + { + "epoch": 0.17, + "grad_norm": 0.3570665673311762, + "learning_rate": 9.459945104752648e-07, + "loss": 0.1678, + "step": 2742 + }, + { + "epoch": 0.17, + "grad_norm": 1.0982488308184704, + "learning_rate": 9.459478152572138e-07, + "loss": 0.1659, + "step": 2743 + }, + { + "epoch": 0.17, + "grad_norm": 0.7899017343218295, + "learning_rate": 9.459011010141324e-07, + "loss": 0.1995, + "step": 2744 + }, + { + "epoch": 0.18, + "grad_norm": 0.87676712175917, + "learning_rate": 9.458543677480137e-07, + "loss": 0.1066, + "step": 2745 + }, + { + "epoch": 0.18, + "grad_norm": 0.6629090266480245, + "learning_rate": 9.458076154608515e-07, + "loss": 0.2194, + "step": 2746 + }, + { + "epoch": 0.18, + "grad_norm": 0.6939508591988042, + "learning_rate": 9.457608441546401e-07, + "loss": 0.2803, + "step": 2747 + }, + { + "epoch": 0.18, + "grad_norm": 1.0121285258968407, + "learning_rate": 9.45714053831375e-07, + "loss": 0.148, + "step": 2748 + }, + { + "epoch": 0.18, + "grad_norm": 0.7661209179901457, + "learning_rate": 9.456672444930524e-07, + "loss": 0.0927, + "step": 2749 + }, + { + "epoch": 0.18, + "grad_norm": 1.7991497350903882, + "learning_rate": 9.456204161416692e-07, + "loss": 0.0553, + "step": 2750 + }, + { + "epoch": 0.18, + "grad_norm": 0.7160695877161593, + "learning_rate": 9.455735687792232e-07, + "loss": 0.3314, + "step": 2751 + }, + { + "epoch": 0.18, + "grad_norm": 1.0368872036386638, + "learning_rate": 9.455267024077131e-07, + "loss": 0.1253, + "step": 2752 + }, + { + "epoch": 0.18, + "grad_norm": 0.4879145975404485, + "learning_rate": 9.454798170291384e-07, + "loss": 0.396, + "step": 2753 + }, + { + "epoch": 0.18, + "grad_norm": 0.8331207912215928, + "learning_rate": 9.454329126454991e-07, + "loss": 0.0842, + "step": 2754 + }, + { + "epoch": 0.18, + "grad_norm": 0.47135914673985724, + "learning_rate": 9.453859892587962e-07, + "loss": 0.2683, + "step": 2755 + }, + { + "epoch": 0.18, + "grad_norm": 0.5011380936496737, + "learning_rate": 9.453390468710317e-07, + "loss": 0.2232, + "step": 2756 + }, + { + "epoch": 0.18, + "grad_norm": 0.5455151754771657, + "learning_rate": 9.452920854842084e-07, + "loss": 0.0862, + "step": 2757 + }, + { + "epoch": 0.18, + "grad_norm": 0.4558905703488411, + "learning_rate": 9.452451051003294e-07, + "loss": 0.2177, + "step": 2758 + }, + { + "epoch": 0.18, + "grad_norm": 0.7678508793696209, + "learning_rate": 9.451981057213991e-07, + "loss": 0.4681, + "step": 2759 + }, + { + "epoch": 0.18, + "grad_norm": 0.7600235819809289, + "learning_rate": 9.451510873494228e-07, + "loss": 0.2709, + "step": 2760 + }, + { + "epoch": 0.18, + "grad_norm": 1.2709085011050159, + "learning_rate": 9.451040499864061e-07, + "loss": 0.34, + "step": 2761 + }, + { + "epoch": 0.18, + "grad_norm": 2.0000470715173306, + "learning_rate": 9.45056993634356e-07, + "loss": 0.1959, + "step": 2762 + }, + { + "epoch": 0.18, + "grad_norm": 3.0978895279532295, + "learning_rate": 9.450099182952797e-07, + "loss": 0.2758, + "step": 2763 + }, + { + "epoch": 0.18, + "grad_norm": 0.15437705270089677, + "learning_rate": 9.449628239711859e-07, + "loss": 0.0788, + "step": 2764 + }, + { + "epoch": 0.18, + "grad_norm": 0.6322614890421141, + "learning_rate": 9.449157106640834e-07, + "loss": 0.2018, + "step": 2765 + }, + { + "epoch": 0.18, + "grad_norm": 0.4956315422057888, + "learning_rate": 9.448685783759825e-07, + "loss": 0.1677, + "step": 2766 + }, + { + "epoch": 0.18, + "grad_norm": 0.6298075310166793, + "learning_rate": 9.448214271088936e-07, + "loss": 0.1066, + "step": 2767 + }, + { + "epoch": 0.18, + "grad_norm": 1.0729077712714175, + "learning_rate": 9.447742568648285e-07, + "loss": 0.0818, + "step": 2768 + }, + { + "epoch": 0.18, + "grad_norm": 0.47651090783674244, + "learning_rate": 9.447270676457994e-07, + "loss": 0.3393, + "step": 2769 + }, + { + "epoch": 0.18, + "grad_norm": 0.5555946853100611, + "learning_rate": 9.446798594538194e-07, + "loss": 0.356, + "step": 2770 + }, + { + "epoch": 0.18, + "grad_norm": 0.47069576540041774, + "learning_rate": 9.446326322909031e-07, + "loss": 0.177, + "step": 2771 + }, + { + "epoch": 0.18, + "grad_norm": 1.022874544414808, + "learning_rate": 9.445853861590646e-07, + "loss": 0.1407, + "step": 2772 + }, + { + "epoch": 0.18, + "grad_norm": 0.76619778994947, + "learning_rate": 9.445381210603198e-07, + "loss": 0.1177, + "step": 2773 + }, + { + "epoch": 0.18, + "grad_norm": 0.816355372826185, + "learning_rate": 9.444908369966852e-07, + "loss": 0.3085, + "step": 2774 + }, + { + "epoch": 0.18, + "grad_norm": 1.0502165711859144, + "learning_rate": 9.444435339701779e-07, + "loss": 0.1333, + "step": 2775 + }, + { + "epoch": 0.18, + "grad_norm": 0.4496538403505556, + "learning_rate": 9.443962119828161e-07, + "loss": 0.0114, + "step": 2776 + }, + { + "epoch": 0.18, + "grad_norm": 0.7821882847088305, + "learning_rate": 9.443488710366184e-07, + "loss": 0.2463, + "step": 2777 + }, + { + "epoch": 0.18, + "grad_norm": 1.2748034457238213, + "learning_rate": 9.443015111336048e-07, + "loss": 0.4336, + "step": 2778 + }, + { + "epoch": 0.18, + "grad_norm": 0.46986003249958325, + "learning_rate": 9.442541322757953e-07, + "loss": 0.281, + "step": 2779 + }, + { + "epoch": 0.18, + "grad_norm": 1.0312067062208974, + "learning_rate": 9.442067344652117e-07, + "loss": 0.3935, + "step": 2780 + }, + { + "epoch": 0.18, + "grad_norm": 0.8290958117572361, + "learning_rate": 9.441593177038758e-07, + "loss": 0.098, + "step": 2781 + }, + { + "epoch": 0.18, + "grad_norm": 0.4844868718260467, + "learning_rate": 9.441118819938104e-07, + "loss": 0.1814, + "step": 2782 + }, + { + "epoch": 0.18, + "grad_norm": 0.6000533724746735, + "learning_rate": 9.440644273370395e-07, + "loss": 0.0846, + "step": 2783 + }, + { + "epoch": 0.18, + "grad_norm": 1.2138730908567696, + "learning_rate": 9.440169537355873e-07, + "loss": 0.0233, + "step": 2784 + }, + { + "epoch": 0.18, + "grad_norm": 0.5013900350971062, + "learning_rate": 9.439694611914795e-07, + "loss": 0.1756, + "step": 2785 + }, + { + "epoch": 0.18, + "grad_norm": 0.36931333974677744, + "learning_rate": 9.439219497067417e-07, + "loss": 0.0095, + "step": 2786 + }, + { + "epoch": 0.18, + "grad_norm": 0.3750648838577161, + "learning_rate": 9.438744192834013e-07, + "loss": 0.1656, + "step": 2787 + }, + { + "epoch": 0.18, + "grad_norm": 0.605318119090337, + "learning_rate": 9.438268699234858e-07, + "loss": 0.0886, + "step": 2788 + }, + { + "epoch": 0.18, + "grad_norm": 0.2959328351693167, + "learning_rate": 9.437793016290239e-07, + "loss": 0.115, + "step": 2789 + }, + { + "epoch": 0.18, + "grad_norm": 0.4474127652051423, + "learning_rate": 9.43731714402045e-07, + "loss": 0.0102, + "step": 2790 + }, + { + "epoch": 0.18, + "grad_norm": 0.6136794135858379, + "learning_rate": 9.436841082445788e-07, + "loss": 0.0886, + "step": 2791 + }, + { + "epoch": 0.18, + "grad_norm": 1.1717763396087342, + "learning_rate": 9.436364831586569e-07, + "loss": 0.3371, + "step": 2792 + }, + { + "epoch": 0.18, + "grad_norm": 0.7635911988360328, + "learning_rate": 9.435888391463107e-07, + "loss": 0.3099, + "step": 2793 + }, + { + "epoch": 0.18, + "grad_norm": 0.4099248645231432, + "learning_rate": 9.435411762095729e-07, + "loss": 0.0585, + "step": 2794 + }, + { + "epoch": 0.18, + "grad_norm": 1.011576269807772, + "learning_rate": 9.43493494350477e-07, + "loss": 0.1689, + "step": 2795 + }, + { + "epoch": 0.18, + "grad_norm": 0.22326415266088995, + "learning_rate": 9.434457935710569e-07, + "loss": 0.0892, + "step": 2796 + }, + { + "epoch": 0.18, + "grad_norm": 0.19328249836507289, + "learning_rate": 9.43398073873348e-07, + "loss": 0.0992, + "step": 2797 + }, + { + "epoch": 0.18, + "grad_norm": 0.9261282826568348, + "learning_rate": 9.433503352593859e-07, + "loss": 0.2951, + "step": 2798 + }, + { + "epoch": 0.18, + "grad_norm": 0.40628338413841864, + "learning_rate": 9.433025777312072e-07, + "loss": 0.0931, + "step": 2799 + }, + { + "epoch": 0.18, + "grad_norm": 0.8013568734705643, + "learning_rate": 9.432548012908495e-07, + "loss": 0.2714, + "step": 2800 + }, + { + "epoch": 0.18, + "grad_norm": 0.7317670146233269, + "learning_rate": 9.432070059403507e-07, + "loss": 0.1874, + "step": 2801 + }, + { + "epoch": 0.18, + "grad_norm": 0.9686810419542108, + "learning_rate": 9.431591916817502e-07, + "loss": 0.1488, + "step": 2802 + }, + { + "epoch": 0.18, + "grad_norm": 0.8012002521344556, + "learning_rate": 9.431113585170877e-07, + "loss": 0.1894, + "step": 2803 + }, + { + "epoch": 0.18, + "grad_norm": 0.8454033223737674, + "learning_rate": 9.430635064484038e-07, + "loss": 0.2057, + "step": 2804 + }, + { + "epoch": 0.18, + "grad_norm": 0.7072694430681157, + "learning_rate": 9.430156354777402e-07, + "loss": 0.4607, + "step": 2805 + }, + { + "epoch": 0.18, + "grad_norm": 0.4874392147836718, + "learning_rate": 9.42967745607139e-07, + "loss": 0.1571, + "step": 2806 + }, + { + "epoch": 0.18, + "grad_norm": 0.5946393715378242, + "learning_rate": 9.429198368386433e-07, + "loss": 0.0976, + "step": 2807 + }, + { + "epoch": 0.18, + "grad_norm": 1.51813433087222, + "learning_rate": 9.428719091742968e-07, + "loss": 0.2874, + "step": 2808 + }, + { + "epoch": 0.18, + "grad_norm": 0.4208942596352635, + "learning_rate": 9.428239626161445e-07, + "loss": 0.0901, + "step": 2809 + }, + { + "epoch": 0.18, + "grad_norm": 0.23755054114355956, + "learning_rate": 9.427759971662318e-07, + "loss": 0.1652, + "step": 2810 + }, + { + "epoch": 0.18, + "grad_norm": 0.8589361478411537, + "learning_rate": 9.427280128266049e-07, + "loss": 0.1959, + "step": 2811 + }, + { + "epoch": 0.18, + "grad_norm": 0.6991347503127461, + "learning_rate": 9.426800095993111e-07, + "loss": 0.3735, + "step": 2812 + }, + { + "epoch": 0.18, + "grad_norm": 0.7716760270043876, + "learning_rate": 9.426319874863981e-07, + "loss": 0.1688, + "step": 2813 + }, + { + "epoch": 0.18, + "grad_norm": 0.7459044544613995, + "learning_rate": 9.425839464899145e-07, + "loss": 0.5136, + "step": 2814 + }, + { + "epoch": 0.18, + "grad_norm": 0.7709215179212534, + "learning_rate": 9.425358866119104e-07, + "loss": 0.2338, + "step": 2815 + }, + { + "epoch": 0.18, + "grad_norm": 0.3334461217493869, + "learning_rate": 9.424878078544356e-07, + "loss": 0.2086, + "step": 2816 + }, + { + "epoch": 0.18, + "grad_norm": 0.7471230427630468, + "learning_rate": 9.424397102195414e-07, + "loss": 0.2253, + "step": 2817 + }, + { + "epoch": 0.18, + "grad_norm": 0.8213268370779977, + "learning_rate": 9.423915937092798e-07, + "loss": 0.1502, + "step": 2818 + }, + { + "epoch": 0.18, + "grad_norm": 0.4561214217091379, + "learning_rate": 9.423434583257035e-07, + "loss": 0.1782, + "step": 2819 + }, + { + "epoch": 0.18, + "grad_norm": 0.807199616528415, + "learning_rate": 9.42295304070866e-07, + "loss": 0.154, + "step": 2820 + }, + { + "epoch": 0.18, + "grad_norm": 0.41297631557583814, + "learning_rate": 9.422471309468216e-07, + "loss": 0.1923, + "step": 2821 + }, + { + "epoch": 0.18, + "grad_norm": 0.5799683306955786, + "learning_rate": 9.421989389556258e-07, + "loss": 0.2438, + "step": 2822 + }, + { + "epoch": 0.18, + "grad_norm": 1.772270986558091, + "learning_rate": 9.421507280993341e-07, + "loss": 0.2805, + "step": 2823 + }, + { + "epoch": 0.18, + "grad_norm": 1.1538887519345769, + "learning_rate": 9.421024983800037e-07, + "loss": 0.1897, + "step": 2824 + }, + { + "epoch": 0.18, + "grad_norm": 0.3294123485329724, + "learning_rate": 9.42054249799692e-07, + "loss": 0.1754, + "step": 2825 + }, + { + "epoch": 0.18, + "grad_norm": 0.43713400313651707, + "learning_rate": 9.420059823604571e-07, + "loss": 0.0802, + "step": 2826 + }, + { + "epoch": 0.18, + "grad_norm": 1.6594140278897644, + "learning_rate": 9.419576960643587e-07, + "loss": 0.2665, + "step": 2827 + }, + { + "epoch": 0.18, + "grad_norm": 0.8470930858443656, + "learning_rate": 9.419093909134563e-07, + "loss": 0.2339, + "step": 2828 + }, + { + "epoch": 0.18, + "grad_norm": 0.4306736376756811, + "learning_rate": 9.418610669098113e-07, + "loss": 0.1545, + "step": 2829 + }, + { + "epoch": 0.18, + "grad_norm": 1.365434225331647, + "learning_rate": 9.418127240554845e-07, + "loss": 0.2879, + "step": 2830 + }, + { + "epoch": 0.18, + "grad_norm": 0.7651480910982791, + "learning_rate": 9.417643623525391e-07, + "loss": 0.2465, + "step": 2831 + }, + { + "epoch": 0.18, + "grad_norm": 0.4917679686000983, + "learning_rate": 9.417159818030378e-07, + "loss": 0.0449, + "step": 2832 + }, + { + "epoch": 0.18, + "grad_norm": 1.1656930051141519, + "learning_rate": 9.416675824090448e-07, + "loss": 0.1094, + "step": 2833 + }, + { + "epoch": 0.18, + "grad_norm": 0.41184955136228063, + "learning_rate": 9.416191641726248e-07, + "loss": 0.1345, + "step": 2834 + }, + { + "epoch": 0.18, + "grad_norm": 0.4588694991771305, + "learning_rate": 9.415707270958435e-07, + "loss": 0.1318, + "step": 2835 + }, + { + "epoch": 0.18, + "grad_norm": 1.807216627429904, + "learning_rate": 9.415222711807673e-07, + "loss": 0.3597, + "step": 2836 + }, + { + "epoch": 0.18, + "grad_norm": 0.7407881832879403, + "learning_rate": 9.414737964294634e-07, + "loss": 0.2336, + "step": 2837 + }, + { + "epoch": 0.18, + "grad_norm": 0.5398464107680441, + "learning_rate": 9.41425302844e-07, + "loss": 0.1482, + "step": 2838 + }, + { + "epoch": 0.18, + "grad_norm": 1.2362484444063, + "learning_rate": 9.413767904264457e-07, + "loss": 0.128, + "step": 2839 + }, + { + "epoch": 0.18, + "grad_norm": 0.45437418205698415, + "learning_rate": 9.413282591788703e-07, + "loss": 0.0624, + "step": 2840 + }, + { + "epoch": 0.18, + "grad_norm": 0.37334653359496567, + "learning_rate": 9.412797091033442e-07, + "loss": 0.0551, + "step": 2841 + }, + { + "epoch": 0.18, + "grad_norm": 0.9241025056858144, + "learning_rate": 9.412311402019387e-07, + "loss": 0.0881, + "step": 2842 + }, + { + "epoch": 0.18, + "grad_norm": 1.2378366271539436, + "learning_rate": 9.411825524767255e-07, + "loss": 0.0359, + "step": 2843 + }, + { + "epoch": 0.18, + "grad_norm": 1.0344821750785687, + "learning_rate": 9.411339459297779e-07, + "loss": 0.0915, + "step": 2844 + }, + { + "epoch": 0.18, + "grad_norm": 0.9693798456261467, + "learning_rate": 9.410853205631693e-07, + "loss": 0.3174, + "step": 2845 + }, + { + "epoch": 0.18, + "grad_norm": 0.5421670416631592, + "learning_rate": 9.410366763789743e-07, + "loss": 0.1069, + "step": 2846 + }, + { + "epoch": 0.18, + "grad_norm": 0.2709530149224765, + "learning_rate": 9.409880133792682e-07, + "loss": 0.1021, + "step": 2847 + }, + { + "epoch": 0.18, + "grad_norm": 0.6347101893040504, + "learning_rate": 9.409393315661268e-07, + "loss": 0.1221, + "step": 2848 + }, + { + "epoch": 0.18, + "grad_norm": 0.8951731804134153, + "learning_rate": 9.408906309416271e-07, + "loss": 0.1671, + "step": 2849 + }, + { + "epoch": 0.18, + "grad_norm": 0.7422000431521963, + "learning_rate": 9.40841911507847e-07, + "loss": 0.2216, + "step": 2850 + }, + { + "epoch": 0.18, + "grad_norm": 0.670706390499011, + "learning_rate": 9.407931732668645e-07, + "loss": 0.185, + "step": 2851 + }, + { + "epoch": 0.18, + "grad_norm": 0.2784967892109064, + "learning_rate": 9.407444162207591e-07, + "loss": 0.0595, + "step": 2852 + }, + { + "epoch": 0.18, + "grad_norm": 0.6966841610794263, + "learning_rate": 9.40695640371611e-07, + "loss": 0.4388, + "step": 2853 + }, + { + "epoch": 0.18, + "grad_norm": 0.8689733905563939, + "learning_rate": 9.406468457215011e-07, + "loss": 0.3751, + "step": 2854 + }, + { + "epoch": 0.18, + "grad_norm": 1.3656774912938305, + "learning_rate": 9.405980322725109e-07, + "loss": 0.1813, + "step": 2855 + }, + { + "epoch": 0.18, + "grad_norm": 0.6905757753799551, + "learning_rate": 9.405492000267228e-07, + "loss": 0.2256, + "step": 2856 + }, + { + "epoch": 0.18, + "grad_norm": 1.1243153878301837, + "learning_rate": 9.405003489862202e-07, + "loss": 0.1498, + "step": 2857 + }, + { + "epoch": 0.18, + "grad_norm": 1.7253835102938415, + "learning_rate": 9.404514791530873e-07, + "loss": 0.4478, + "step": 2858 + }, + { + "epoch": 0.18, + "grad_norm": 0.4978015300325407, + "learning_rate": 9.404025905294088e-07, + "loss": 0.1165, + "step": 2859 + }, + { + "epoch": 0.18, + "grad_norm": 0.44617374946937977, + "learning_rate": 9.403536831172706e-07, + "loss": 0.0725, + "step": 2860 + }, + { + "epoch": 0.18, + "grad_norm": 0.6314616661282472, + "learning_rate": 9.40304756918759e-07, + "loss": 0.1408, + "step": 2861 + }, + { + "epoch": 0.18, + "grad_norm": 0.6965625633951027, + "learning_rate": 9.402558119359614e-07, + "loss": 0.1106, + "step": 2862 + }, + { + "epoch": 0.18, + "grad_norm": 0.22985774636408715, + "learning_rate": 9.402068481709657e-07, + "loss": 0.1237, + "step": 2863 + }, + { + "epoch": 0.18, + "grad_norm": 0.6809065562402292, + "learning_rate": 9.40157865625861e-07, + "loss": 0.2483, + "step": 2864 + }, + { + "epoch": 0.18, + "grad_norm": 0.9062160343423031, + "learning_rate": 9.401088643027369e-07, + "loss": 0.196, + "step": 2865 + }, + { + "epoch": 0.18, + "grad_norm": 1.7100489542755926, + "learning_rate": 9.400598442036839e-07, + "loss": 0.063, + "step": 2866 + }, + { + "epoch": 0.18, + "grad_norm": 0.42267334696169123, + "learning_rate": 9.400108053307934e-07, + "loss": 0.0933, + "step": 2867 + }, + { + "epoch": 0.18, + "grad_norm": 0.22334642641603122, + "learning_rate": 9.399617476861573e-07, + "loss": 0.2095, + "step": 2868 + }, + { + "epoch": 0.18, + "grad_norm": 1.2243792781462794, + "learning_rate": 9.399126712718687e-07, + "loss": 0.1065, + "step": 2869 + }, + { + "epoch": 0.18, + "grad_norm": 0.362080305206458, + "learning_rate": 9.398635760900211e-07, + "loss": 0.0415, + "step": 2870 + }, + { + "epoch": 0.18, + "grad_norm": 0.8801455382905071, + "learning_rate": 9.398144621427093e-07, + "loss": 0.2932, + "step": 2871 + }, + { + "epoch": 0.18, + "grad_norm": 1.0103986879678948, + "learning_rate": 9.397653294320282e-07, + "loss": 0.2368, + "step": 2872 + }, + { + "epoch": 0.18, + "grad_norm": 0.625343372049108, + "learning_rate": 9.397161779600742e-07, + "loss": 0.216, + "step": 2873 + }, + { + "epoch": 0.18, + "grad_norm": 0.1910556190819925, + "learning_rate": 9.396670077289441e-07, + "loss": 0.1636, + "step": 2874 + }, + { + "epoch": 0.18, + "grad_norm": 0.5439027466683719, + "learning_rate": 9.396178187407356e-07, + "loss": 0.3946, + "step": 2875 + }, + { + "epoch": 0.18, + "grad_norm": 1.3236032792196466, + "learning_rate": 9.395686109975473e-07, + "loss": 0.1851, + "step": 2876 + }, + { + "epoch": 0.18, + "grad_norm": 0.2515487957717811, + "learning_rate": 9.395193845014784e-07, + "loss": 0.0908, + "step": 2877 + }, + { + "epoch": 0.18, + "grad_norm": 0.8810352395602445, + "learning_rate": 9.39470139254629e-07, + "loss": 0.2022, + "step": 2878 + }, + { + "epoch": 0.18, + "grad_norm": 0.6101326355028606, + "learning_rate": 9.394208752590997e-07, + "loss": 0.0564, + "step": 2879 + }, + { + "epoch": 0.18, + "grad_norm": 0.8622326528124292, + "learning_rate": 9.393715925169929e-07, + "loss": 0.1853, + "step": 2880 + }, + { + "epoch": 0.18, + "grad_norm": 0.6068834151155335, + "learning_rate": 9.393222910304106e-07, + "loss": 0.3243, + "step": 2881 + }, + { + "epoch": 0.18, + "grad_norm": 0.7504937712945394, + "learning_rate": 9.392729708014562e-07, + "loss": 0.1388, + "step": 2882 + }, + { + "epoch": 0.18, + "grad_norm": 0.8035627430314166, + "learning_rate": 9.392236318322337e-07, + "loss": 0.2444, + "step": 2883 + }, + { + "epoch": 0.18, + "grad_norm": 0.5813276585105368, + "learning_rate": 9.391742741248483e-07, + "loss": 0.0145, + "step": 2884 + }, + { + "epoch": 0.18, + "grad_norm": 1.3399477342637993, + "learning_rate": 9.391248976814054e-07, + "loss": 0.3238, + "step": 2885 + }, + { + "epoch": 0.18, + "grad_norm": 0.35618492077932135, + "learning_rate": 9.390755025040118e-07, + "loss": 0.1506, + "step": 2886 + }, + { + "epoch": 0.18, + "grad_norm": 1.10978232209715, + "learning_rate": 9.390260885947745e-07, + "loss": 0.1978, + "step": 2887 + }, + { + "epoch": 0.18, + "grad_norm": 0.9966430862469495, + "learning_rate": 9.389766559558017e-07, + "loss": 0.0761, + "step": 2888 + }, + { + "epoch": 0.18, + "grad_norm": 0.5628107443022811, + "learning_rate": 9.389272045892023e-07, + "loss": 0.1672, + "step": 2889 + }, + { + "epoch": 0.18, + "grad_norm": 0.5324227945428179, + "learning_rate": 9.38877734497086e-07, + "loss": 0.098, + "step": 2890 + }, + { + "epoch": 0.18, + "grad_norm": 1.1450986980950073, + "learning_rate": 9.388282456815632e-07, + "loss": 0.0692, + "step": 2891 + }, + { + "epoch": 0.18, + "grad_norm": 0.2870732688804849, + "learning_rate": 9.387787381447454e-07, + "loss": 0.1587, + "step": 2892 + }, + { + "epoch": 0.18, + "grad_norm": 1.1026915337376895, + "learning_rate": 9.387292118887444e-07, + "loss": 0.2455, + "step": 2893 + }, + { + "epoch": 0.18, + "grad_norm": 0.6398918224849218, + "learning_rate": 9.386796669156735e-07, + "loss": 0.3308, + "step": 2894 + }, + { + "epoch": 0.18, + "grad_norm": 0.9118360693252334, + "learning_rate": 9.386301032276461e-07, + "loss": 0.1218, + "step": 2895 + }, + { + "epoch": 0.18, + "grad_norm": 0.44660226248487206, + "learning_rate": 9.385805208267766e-07, + "loss": 0.0959, + "step": 2896 + }, + { + "epoch": 0.18, + "grad_norm": 1.05321455559004, + "learning_rate": 9.385309197151805e-07, + "loss": 0.1418, + "step": 2897 + }, + { + "epoch": 0.18, + "grad_norm": 0.6948435045112532, + "learning_rate": 9.384812998949739e-07, + "loss": 0.2146, + "step": 2898 + }, + { + "epoch": 0.18, + "grad_norm": 0.45948361273294575, + "learning_rate": 9.384316613682735e-07, + "loss": 0.2599, + "step": 2899 + }, + { + "epoch": 0.18, + "grad_norm": 0.3149086661546606, + "learning_rate": 9.38382004137197e-07, + "loss": 0.129, + "step": 2900 + }, + { + "epoch": 0.19, + "grad_norm": 0.6401148842503128, + "learning_rate": 9.383323282038631e-07, + "loss": 0.2047, + "step": 2901 + }, + { + "epoch": 0.19, + "grad_norm": 1.1647592754413674, + "learning_rate": 9.382826335703908e-07, + "loss": 0.1428, + "step": 2902 + }, + { + "epoch": 0.19, + "grad_norm": 3.349040834073399, + "learning_rate": 9.382329202389003e-07, + "loss": 0.2031, + "step": 2903 + }, + { + "epoch": 0.19, + "grad_norm": 0.80803777930169, + "learning_rate": 9.381831882115126e-07, + "loss": 0.0875, + "step": 2904 + }, + { + "epoch": 0.19, + "grad_norm": 0.6901241070061314, + "learning_rate": 9.381334374903491e-07, + "loss": 0.3223, + "step": 2905 + }, + { + "epoch": 0.19, + "grad_norm": 0.44340982598308587, + "learning_rate": 9.380836680775324e-07, + "loss": 0.1118, + "step": 2906 + }, + { + "epoch": 0.19, + "grad_norm": 0.36681370962138576, + "learning_rate": 9.380338799751858e-07, + "loss": 0.3782, + "step": 2907 + }, + { + "epoch": 0.19, + "grad_norm": 0.6694382343922336, + "learning_rate": 9.379840731854334e-07, + "loss": 0.3132, + "step": 2908 + }, + { + "epoch": 0.19, + "grad_norm": 0.7581315571969631, + "learning_rate": 9.379342477103998e-07, + "loss": 0.1568, + "step": 2909 + }, + { + "epoch": 0.19, + "grad_norm": 0.41683816538272617, + "learning_rate": 9.37884403552211e-07, + "loss": 0.2433, + "step": 2910 + }, + { + "epoch": 0.19, + "grad_norm": 1.0928002264705523, + "learning_rate": 9.378345407129931e-07, + "loss": 0.356, + "step": 2911 + }, + { + "epoch": 0.19, + "grad_norm": 1.6464702572252623, + "learning_rate": 9.377846591948737e-07, + "loss": 0.2987, + "step": 2912 + }, + { + "epoch": 0.19, + "grad_norm": 2.213944566539613, + "learning_rate": 9.377347589999806e-07, + "loss": 0.4899, + "step": 2913 + }, + { + "epoch": 0.19, + "grad_norm": 0.2569801615400901, + "learning_rate": 9.376848401304428e-07, + "loss": 0.215, + "step": 2914 + }, + { + "epoch": 0.19, + "grad_norm": 0.7564381394785538, + "learning_rate": 9.376349025883899e-07, + "loss": 0.1616, + "step": 2915 + }, + { + "epoch": 0.19, + "grad_norm": 0.9610078231646705, + "learning_rate": 9.375849463759522e-07, + "loss": 0.1232, + "step": 2916 + }, + { + "epoch": 0.19, + "grad_norm": 0.3820691631100241, + "learning_rate": 9.375349714952609e-07, + "loss": 0.0886, + "step": 2917 + }, + { + "epoch": 0.19, + "grad_norm": 0.5951588990461749, + "learning_rate": 9.374849779484484e-07, + "loss": 0.2954, + "step": 2918 + }, + { + "epoch": 0.19, + "grad_norm": 0.44345965442842933, + "learning_rate": 9.374349657376472e-07, + "loss": 0.2337, + "step": 2919 + }, + { + "epoch": 0.19, + "grad_norm": 0.47022348751950854, + "learning_rate": 9.373849348649909e-07, + "loss": 0.0834, + "step": 2920 + }, + { + "epoch": 0.19, + "grad_norm": 0.5549102258857699, + "learning_rate": 9.373348853326142e-07, + "loss": 0.3269, + "step": 2921 + }, + { + "epoch": 0.19, + "grad_norm": 1.3390363419600777, + "learning_rate": 9.372848171426522e-07, + "loss": 0.0601, + "step": 2922 + }, + { + "epoch": 0.19, + "grad_norm": 0.4553715406375695, + "learning_rate": 9.372347302972407e-07, + "loss": 0.1892, + "step": 2923 + }, + { + "epoch": 0.19, + "grad_norm": 2.315082537773244, + "learning_rate": 9.371846247985166e-07, + "loss": 0.1059, + "step": 2924 + }, + { + "epoch": 0.19, + "grad_norm": 0.7620149541053305, + "learning_rate": 9.371345006486176e-07, + "loss": 0.1488, + "step": 2925 + }, + { + "epoch": 0.19, + "grad_norm": 0.4819073686687152, + "learning_rate": 9.37084357849682e-07, + "loss": 0.2861, + "step": 2926 + }, + { + "epoch": 0.19, + "grad_norm": 0.7497531244440875, + "learning_rate": 9.370341964038492e-07, + "loss": 0.1649, + "step": 2927 + }, + { + "epoch": 0.19, + "grad_norm": 0.9647720741092505, + "learning_rate": 9.36984016313259e-07, + "loss": 0.4131, + "step": 2928 + }, + { + "epoch": 0.19, + "grad_norm": 0.7008518156276032, + "learning_rate": 9.369338175800521e-07, + "loss": 0.3531, + "step": 2929 + }, + { + "epoch": 0.19, + "grad_norm": 0.5904076191224823, + "learning_rate": 9.368836002063703e-07, + "loss": 0.2638, + "step": 2930 + }, + { + "epoch": 0.19, + "grad_norm": 1.1396464800581088, + "learning_rate": 9.368333641943558e-07, + "loss": 0.0155, + "step": 2931 + }, + { + "epoch": 0.19, + "grad_norm": 1.2158797865005662, + "learning_rate": 9.367831095461518e-07, + "loss": 0.4174, + "step": 2932 + }, + { + "epoch": 0.19, + "grad_norm": 0.42457405895374317, + "learning_rate": 9.367328362639024e-07, + "loss": 0.0814, + "step": 2933 + }, + { + "epoch": 0.19, + "grad_norm": 0.1301786172208941, + "learning_rate": 9.366825443497522e-07, + "loss": 0.008, + "step": 2934 + }, + { + "epoch": 0.19, + "grad_norm": 0.9093203437447465, + "learning_rate": 9.366322338058469e-07, + "loss": 0.4459, + "step": 2935 + }, + { + "epoch": 0.19, + "grad_norm": 0.49447847109671217, + "learning_rate": 9.365819046343328e-07, + "loss": 0.1887, + "step": 2936 + }, + { + "epoch": 0.19, + "grad_norm": 3.0850813480109034, + "learning_rate": 9.365315568373568e-07, + "loss": 0.0585, + "step": 2937 + }, + { + "epoch": 0.19, + "grad_norm": 0.4610127694846558, + "learning_rate": 9.364811904170672e-07, + "loss": 0.0876, + "step": 2938 + }, + { + "epoch": 0.19, + "grad_norm": 1.2660699869751306, + "learning_rate": 9.364308053756126e-07, + "loss": 0.1479, + "step": 2939 + }, + { + "epoch": 0.19, + "grad_norm": 0.5483732666481342, + "learning_rate": 9.363804017151424e-07, + "loss": 0.1685, + "step": 2940 + }, + { + "epoch": 0.19, + "grad_norm": 0.3343442458788976, + "learning_rate": 9.363299794378071e-07, + "loss": 0.0201, + "step": 2941 + }, + { + "epoch": 0.19, + "grad_norm": 1.001151416510769, + "learning_rate": 9.362795385457578e-07, + "loss": 0.2791, + "step": 2942 + }, + { + "epoch": 0.19, + "grad_norm": 0.8030334382420369, + "learning_rate": 9.362290790411463e-07, + "loss": 0.1531, + "step": 2943 + }, + { + "epoch": 0.19, + "grad_norm": 0.2317266383697126, + "learning_rate": 9.361786009261252e-07, + "loss": 0.0073, + "step": 2944 + }, + { + "epoch": 0.19, + "grad_norm": 1.047789694645446, + "learning_rate": 9.361281042028484e-07, + "loss": 0.2145, + "step": 2945 + }, + { + "epoch": 0.19, + "grad_norm": 0.4685516692379248, + "learning_rate": 9.360775888734697e-07, + "loss": 0.3019, + "step": 2946 + }, + { + "epoch": 0.19, + "grad_norm": 0.7773936351369717, + "learning_rate": 9.360270549401445e-07, + "loss": 0.3539, + "step": 2947 + }, + { + "epoch": 0.19, + "grad_norm": 0.5734200535640486, + "learning_rate": 9.359765024050288e-07, + "loss": 0.1606, + "step": 2948 + }, + { + "epoch": 0.19, + "grad_norm": 0.7189535865509294, + "learning_rate": 9.35925931270279e-07, + "loss": 0.4715, + "step": 2949 + }, + { + "epoch": 0.19, + "grad_norm": 0.3679596216667496, + "learning_rate": 9.358753415380527e-07, + "loss": 0.0498, + "step": 2950 + }, + { + "epoch": 0.19, + "grad_norm": 1.1985638110274803, + "learning_rate": 9.35824733210508e-07, + "loss": 0.2713, + "step": 2951 + }, + { + "epoch": 0.19, + "grad_norm": 0.41398293651477885, + "learning_rate": 9.35774106289804e-07, + "loss": 0.2053, + "step": 2952 + }, + { + "epoch": 0.19, + "grad_norm": 0.6099238968783113, + "learning_rate": 9.357234607781008e-07, + "loss": 0.1821, + "step": 2953 + }, + { + "epoch": 0.19, + "grad_norm": 0.8709531253950371, + "learning_rate": 9.356727966775587e-07, + "loss": 0.3359, + "step": 2954 + }, + { + "epoch": 0.19, + "grad_norm": 0.2278086119851056, + "learning_rate": 9.356221139903393e-07, + "loss": 0.005, + "step": 2955 + }, + { + "epoch": 0.19, + "grad_norm": 0.9272032589655799, + "learning_rate": 9.355714127186048e-07, + "loss": 0.3826, + "step": 2956 + }, + { + "epoch": 0.19, + "grad_norm": 1.0236446162888828, + "learning_rate": 9.355206928645183e-07, + "loss": 0.2564, + "step": 2957 + }, + { + "epoch": 0.19, + "grad_norm": 0.8801309550772775, + "learning_rate": 9.354699544302435e-07, + "loss": 0.272, + "step": 2958 + }, + { + "epoch": 0.19, + "grad_norm": 0.7152614180118375, + "learning_rate": 9.354191974179451e-07, + "loss": 0.1935, + "step": 2959 + }, + { + "epoch": 0.19, + "grad_norm": 1.242140104161178, + "learning_rate": 9.353684218297884e-07, + "loss": 0.1313, + "step": 2960 + }, + { + "epoch": 0.19, + "grad_norm": 0.4786856065967177, + "learning_rate": 9.353176276679395e-07, + "loss": 0.1123, + "step": 2961 + }, + { + "epoch": 0.19, + "grad_norm": 0.6804098926691428, + "learning_rate": 9.352668149345657e-07, + "loss": 0.3683, + "step": 2962 + }, + { + "epoch": 0.19, + "grad_norm": 0.3810212011607888, + "learning_rate": 9.352159836318345e-07, + "loss": 0.3455, + "step": 2963 + }, + { + "epoch": 0.19, + "grad_norm": 1.5196253006025056, + "learning_rate": 9.351651337619144e-07, + "loss": 0.2195, + "step": 2964 + }, + { + "epoch": 0.19, + "grad_norm": 0.7831762124216888, + "learning_rate": 9.351142653269752e-07, + "loss": 0.1341, + "step": 2965 + }, + { + "epoch": 0.19, + "grad_norm": 0.401533870177438, + "learning_rate": 9.350633783291866e-07, + "loss": 0.0186, + "step": 2966 + }, + { + "epoch": 0.19, + "grad_norm": 0.2830055046461243, + "learning_rate": 9.350124727707196e-07, + "loss": 0.0125, + "step": 2967 + }, + { + "epoch": 0.19, + "grad_norm": 0.9275444900338184, + "learning_rate": 9.349615486537461e-07, + "loss": 0.1766, + "step": 2968 + }, + { + "epoch": 0.19, + "grad_norm": 1.4813644088477882, + "learning_rate": 9.349106059804386e-07, + "loss": 0.2933, + "step": 2969 + }, + { + "epoch": 0.19, + "grad_norm": 0.5543308751412848, + "learning_rate": 9.348596447529702e-07, + "loss": 0.3231, + "step": 2970 + }, + { + "epoch": 0.19, + "grad_norm": 0.3809371773300341, + "learning_rate": 9.348086649735154e-07, + "loss": 0.086, + "step": 2971 + }, + { + "epoch": 0.19, + "grad_norm": 1.1614202413974644, + "learning_rate": 9.347576666442487e-07, + "loss": 0.1066, + "step": 2972 + }, + { + "epoch": 0.19, + "grad_norm": 0.5684829679157605, + "learning_rate": 9.347066497673461e-07, + "loss": 0.0861, + "step": 2973 + }, + { + "epoch": 0.19, + "grad_norm": 0.8791000751341507, + "learning_rate": 9.34655614344984e-07, + "loss": 0.2266, + "step": 2974 + }, + { + "epoch": 0.19, + "grad_norm": 1.5369128656478885, + "learning_rate": 9.346045603793394e-07, + "loss": 0.1987, + "step": 2975 + }, + { + "epoch": 0.19, + "grad_norm": 1.2612040497881984, + "learning_rate": 9.345534878725907e-07, + "loss": 0.0356, + "step": 2976 + }, + { + "epoch": 0.19, + "grad_norm": 0.5417937297154061, + "learning_rate": 9.345023968269167e-07, + "loss": 0.1972, + "step": 2977 + }, + { + "epoch": 0.19, + "grad_norm": 1.427665547698021, + "learning_rate": 9.344512872444969e-07, + "loss": 0.2863, + "step": 2978 + }, + { + "epoch": 0.19, + "grad_norm": 1.0737672201175752, + "learning_rate": 9.344001591275119e-07, + "loss": 0.1526, + "step": 2979 + }, + { + "epoch": 0.19, + "grad_norm": 0.7145115557551948, + "learning_rate": 9.343490124781428e-07, + "loss": 0.3481, + "step": 2980 + }, + { + "epoch": 0.19, + "grad_norm": 1.119609581908684, + "learning_rate": 9.342978472985718e-07, + "loss": 0.2666, + "step": 2981 + }, + { + "epoch": 0.19, + "grad_norm": 0.7464592130899599, + "learning_rate": 9.342466635909815e-07, + "loss": 0.2481, + "step": 2982 + }, + { + "epoch": 0.19, + "grad_norm": 0.12397808819221302, + "learning_rate": 9.341954613575555e-07, + "loss": 0.0045, + "step": 2983 + }, + { + "epoch": 0.19, + "grad_norm": 0.40409144562835825, + "learning_rate": 9.341442406004784e-07, + "loss": 0.1173, + "step": 2984 + }, + { + "epoch": 0.19, + "grad_norm": 0.3297218341450486, + "learning_rate": 9.340930013219352e-07, + "loss": 0.0489, + "step": 2985 + }, + { + "epoch": 0.19, + "grad_norm": 0.382178387832423, + "learning_rate": 9.340417435241119e-07, + "loss": 0.0356, + "step": 2986 + }, + { + "epoch": 0.19, + "grad_norm": 0.7806082011581832, + "learning_rate": 9.339904672091953e-07, + "loss": 0.1423, + "step": 2987 + }, + { + "epoch": 0.19, + "grad_norm": 0.5177359123813741, + "learning_rate": 9.33939172379373e-07, + "loss": 0.2016, + "step": 2988 + }, + { + "epoch": 0.19, + "grad_norm": 0.31598636718736156, + "learning_rate": 9.338878590368333e-07, + "loss": 0.076, + "step": 2989 + }, + { + "epoch": 0.19, + "grad_norm": 1.3549764100792365, + "learning_rate": 9.338365271837654e-07, + "loss": 0.1827, + "step": 2990 + }, + { + "epoch": 0.19, + "grad_norm": 1.1013235744070637, + "learning_rate": 9.337851768223588e-07, + "loss": 0.3258, + "step": 2991 + }, + { + "epoch": 0.19, + "grad_norm": 0.772870087027144, + "learning_rate": 9.337338079548048e-07, + "loss": 0.4289, + "step": 2992 + }, + { + "epoch": 0.19, + "grad_norm": 0.5294229312283212, + "learning_rate": 9.336824205832947e-07, + "loss": 0.2537, + "step": 2993 + }, + { + "epoch": 0.19, + "grad_norm": 1.1287496183325993, + "learning_rate": 9.336310147100205e-07, + "loss": 0.1337, + "step": 2994 + }, + { + "epoch": 0.19, + "grad_norm": 0.44638174383621354, + "learning_rate": 9.335795903371755e-07, + "loss": 0.2678, + "step": 2995 + }, + { + "epoch": 0.19, + "grad_norm": 0.593454638131262, + "learning_rate": 9.335281474669538e-07, + "loss": 0.2518, + "step": 2996 + }, + { + "epoch": 0.19, + "grad_norm": 0.44976684179108467, + "learning_rate": 9.334766861015496e-07, + "loss": 0.0079, + "step": 2997 + }, + { + "epoch": 0.19, + "grad_norm": 0.4337280332740164, + "learning_rate": 9.334252062431587e-07, + "loss": 0.3132, + "step": 2998 + }, + { + "epoch": 0.19, + "grad_norm": 0.2867560830291846, + "learning_rate": 9.333737078939772e-07, + "loss": 0.1556, + "step": 2999 + }, + { + "epoch": 0.19, + "grad_norm": 0.4448942958304109, + "learning_rate": 9.333221910562022e-07, + "loss": 0.1751, + "step": 3000 + }, + { + "epoch": 0.19, + "grad_norm": 1.263005784438112, + "learning_rate": 9.332706557320314e-07, + "loss": 0.1779, + "step": 3001 + }, + { + "epoch": 0.19, + "grad_norm": 0.5466353098286592, + "learning_rate": 9.332191019236632e-07, + "loss": 0.099, + "step": 3002 + }, + { + "epoch": 0.19, + "grad_norm": 0.7976615097556736, + "learning_rate": 9.331675296332975e-07, + "loss": 0.1148, + "step": 3003 + }, + { + "epoch": 0.19, + "grad_norm": 0.8784737078696657, + "learning_rate": 9.331159388631341e-07, + "loss": 0.2069, + "step": 3004 + }, + { + "epoch": 0.19, + "grad_norm": 0.535056162166755, + "learning_rate": 9.330643296153742e-07, + "loss": 0.1783, + "step": 3005 + }, + { + "epoch": 0.19, + "grad_norm": 0.239662280674292, + "learning_rate": 9.330127018922193e-07, + "loss": 0.1772, + "step": 3006 + }, + { + "epoch": 0.19, + "grad_norm": 1.4329142584792107, + "learning_rate": 9.329610556958722e-07, + "loss": 0.2907, + "step": 3007 + }, + { + "epoch": 0.19, + "grad_norm": 0.28893600436723366, + "learning_rate": 9.32909391028536e-07, + "loss": 0.1061, + "step": 3008 + }, + { + "epoch": 0.19, + "grad_norm": 0.6270169813375313, + "learning_rate": 9.32857707892415e-07, + "loss": 0.1671, + "step": 3009 + }, + { + "epoch": 0.19, + "grad_norm": 0.7369538297638233, + "learning_rate": 9.328060062897138e-07, + "loss": 0.0758, + "step": 3010 + }, + { + "epoch": 0.19, + "grad_norm": 0.4241868162913618, + "learning_rate": 9.327542862226386e-07, + "loss": 0.1742, + "step": 3011 + }, + { + "epoch": 0.19, + "grad_norm": 0.34388789875246983, + "learning_rate": 9.327025476933954e-07, + "loss": 0.0318, + "step": 3012 + }, + { + "epoch": 0.19, + "grad_norm": 0.5254738570119284, + "learning_rate": 9.326507907041918e-07, + "loss": 0.067, + "step": 3013 + }, + { + "epoch": 0.19, + "grad_norm": 0.3090281030578117, + "learning_rate": 9.325990152572358e-07, + "loss": 0.1806, + "step": 3014 + }, + { + "epoch": 0.19, + "grad_norm": 0.573799180637348, + "learning_rate": 9.32547221354736e-07, + "loss": 0.1102, + "step": 3015 + }, + { + "epoch": 0.19, + "grad_norm": 1.5302728596888309, + "learning_rate": 9.324954089989023e-07, + "loss": 0.2121, + "step": 3016 + }, + { + "epoch": 0.19, + "grad_norm": 0.33753031745126294, + "learning_rate": 9.324435781919449e-07, + "loss": 0.1485, + "step": 3017 + }, + { + "epoch": 0.19, + "grad_norm": 0.998994753179129, + "learning_rate": 9.323917289360753e-07, + "loss": 0.2142, + "step": 3018 + }, + { + "epoch": 0.19, + "grad_norm": 0.19605758441521368, + "learning_rate": 9.323398612335054e-07, + "loss": 0.0065, + "step": 3019 + }, + { + "epoch": 0.19, + "grad_norm": 0.4283434558037134, + "learning_rate": 9.322879750864476e-07, + "loss": 0.0639, + "step": 3020 + }, + { + "epoch": 0.19, + "grad_norm": 0.8714663833200217, + "learning_rate": 9.32236070497116e-07, + "loss": 0.2954, + "step": 3021 + }, + { + "epoch": 0.19, + "grad_norm": 0.858049201538456, + "learning_rate": 9.321841474677247e-07, + "loss": 0.1448, + "step": 3022 + }, + { + "epoch": 0.19, + "grad_norm": 1.1389143844340983, + "learning_rate": 9.321322060004888e-07, + "loss": 0.3223, + "step": 3023 + }, + { + "epoch": 0.19, + "grad_norm": 0.5234024329695265, + "learning_rate": 9.320802460976245e-07, + "loss": 0.1825, + "step": 3024 + }, + { + "epoch": 0.19, + "grad_norm": 0.9722075998628446, + "learning_rate": 9.32028267761348e-07, + "loss": 0.2889, + "step": 3025 + }, + { + "epoch": 0.19, + "grad_norm": 0.9437832253730604, + "learning_rate": 9.319762709938775e-07, + "loss": 0.3222, + "step": 3026 + }, + { + "epoch": 0.19, + "grad_norm": 0.830519395355223, + "learning_rate": 9.319242557974305e-07, + "loss": 0.3216, + "step": 3027 + }, + { + "epoch": 0.19, + "grad_norm": 0.46865160802583294, + "learning_rate": 9.318722221742267e-07, + "loss": 0.1342, + "step": 3028 + }, + { + "epoch": 0.19, + "grad_norm": 0.7704922069197175, + "learning_rate": 9.318201701264857e-07, + "loss": 0.2531, + "step": 3029 + }, + { + "epoch": 0.19, + "grad_norm": 0.9057929614247713, + "learning_rate": 9.317680996564281e-07, + "loss": 0.1031, + "step": 3030 + }, + { + "epoch": 0.19, + "grad_norm": 0.6183863786282269, + "learning_rate": 9.317160107662754e-07, + "loss": 0.0579, + "step": 3031 + }, + { + "epoch": 0.19, + "grad_norm": 0.6514179057007335, + "learning_rate": 9.316639034582498e-07, + "loss": 0.1904, + "step": 3032 + }, + { + "epoch": 0.19, + "grad_norm": 1.2777188197898017, + "learning_rate": 9.316117777345746e-07, + "loss": 0.036, + "step": 3033 + }, + { + "epoch": 0.19, + "grad_norm": 0.7732503085369049, + "learning_rate": 9.315596335974731e-07, + "loss": 0.4658, + "step": 3034 + }, + { + "epoch": 0.19, + "grad_norm": 0.8356075156372184, + "learning_rate": 9.3150747104917e-07, + "loss": 0.2359, + "step": 3035 + }, + { + "epoch": 0.19, + "grad_norm": 0.4573641176035874, + "learning_rate": 9.314552900918908e-07, + "loss": 0.0818, + "step": 3036 + }, + { + "epoch": 0.19, + "grad_norm": 1.085807918386808, + "learning_rate": 9.314030907278618e-07, + "loss": 0.4449, + "step": 3037 + }, + { + "epoch": 0.19, + "grad_norm": 0.8291466502959902, + "learning_rate": 9.313508729593096e-07, + "loss": 0.2132, + "step": 3038 + }, + { + "epoch": 0.19, + "grad_norm": 3.2470427402668394, + "learning_rate": 9.312986367884619e-07, + "loss": 0.1351, + "step": 3039 + }, + { + "epoch": 0.19, + "grad_norm": 0.8357623921356458, + "learning_rate": 9.312463822175474e-07, + "loss": 0.2655, + "step": 3040 + }, + { + "epoch": 0.19, + "grad_norm": 0.6289347384014446, + "learning_rate": 9.311941092487954e-07, + "loss": 0.1375, + "step": 3041 + }, + { + "epoch": 0.19, + "grad_norm": 0.6852238162700425, + "learning_rate": 9.31141817884436e-07, + "loss": 0.1956, + "step": 3042 + }, + { + "epoch": 0.19, + "grad_norm": 1.2071504666592814, + "learning_rate": 9.310895081266996e-07, + "loss": 0.4316, + "step": 3043 + }, + { + "epoch": 0.19, + "grad_norm": 0.672865138314787, + "learning_rate": 9.310371799778184e-07, + "loss": 0.1289, + "step": 3044 + }, + { + "epoch": 0.19, + "grad_norm": 1.15820142374764, + "learning_rate": 9.309848334400245e-07, + "loss": 0.1639, + "step": 3045 + }, + { + "epoch": 0.19, + "grad_norm": 0.5501107913660684, + "learning_rate": 9.309324685155513e-07, + "loss": 0.2278, + "step": 3046 + }, + { + "epoch": 0.19, + "grad_norm": 0.8376405818747217, + "learning_rate": 9.308800852066328e-07, + "loss": 0.2784, + "step": 3047 + }, + { + "epoch": 0.19, + "grad_norm": 0.8444680283046364, + "learning_rate": 9.308276835155036e-07, + "loss": 0.1992, + "step": 3048 + }, + { + "epoch": 0.19, + "grad_norm": 0.9159911788834041, + "learning_rate": 9.307752634443992e-07, + "loss": 0.3426, + "step": 3049 + }, + { + "epoch": 0.19, + "grad_norm": 1.2841327552971487, + "learning_rate": 9.307228249955563e-07, + "loss": 0.2586, + "step": 3050 + }, + { + "epoch": 0.19, + "grad_norm": 0.3766795409358213, + "learning_rate": 9.306703681712118e-07, + "loss": 0.2657, + "step": 3051 + }, + { + "epoch": 0.19, + "grad_norm": 0.6124945721716994, + "learning_rate": 9.306178929736037e-07, + "loss": 0.1604, + "step": 3052 + }, + { + "epoch": 0.19, + "grad_norm": 1.0815748384024368, + "learning_rate": 9.305653994049705e-07, + "loss": 0.2442, + "step": 3053 + }, + { + "epoch": 0.19, + "grad_norm": 1.2882148876130204, + "learning_rate": 9.305128874675519e-07, + "loss": 0.1155, + "step": 3054 + }, + { + "epoch": 0.19, + "grad_norm": 0.3607427777409394, + "learning_rate": 9.304603571635879e-07, + "loss": 0.2395, + "step": 3055 + }, + { + "epoch": 0.19, + "grad_norm": 0.6693896201291227, + "learning_rate": 9.3040780849532e-07, + "loss": 0.2456, + "step": 3056 + }, + { + "epoch": 0.19, + "grad_norm": 0.31469024781178107, + "learning_rate": 9.303552414649896e-07, + "loss": 0.1758, + "step": 3057 + }, + { + "epoch": 0.2, + "grad_norm": 0.4505317171746773, + "learning_rate": 9.303026560748395e-07, + "loss": 0.297, + "step": 3058 + }, + { + "epoch": 0.2, + "grad_norm": 0.6597112278091493, + "learning_rate": 9.302500523271131e-07, + "loss": 0.1577, + "step": 3059 + }, + { + "epoch": 0.2, + "grad_norm": 0.7323757246455531, + "learning_rate": 9.301974302240545e-07, + "loss": 0.2976, + "step": 3060 + }, + { + "epoch": 0.2, + "grad_norm": 1.7159980576059044, + "learning_rate": 9.301447897679087e-07, + "loss": 0.2212, + "step": 3061 + }, + { + "epoch": 0.2, + "grad_norm": 0.6263325445076474, + "learning_rate": 9.300921309609215e-07, + "loss": 0.2819, + "step": 3062 + }, + { + "epoch": 0.2, + "grad_norm": 0.7504126780496241, + "learning_rate": 9.300394538053394e-07, + "loss": 0.203, + "step": 3063 + }, + { + "epoch": 0.2, + "grad_norm": 0.8346360508181327, + "learning_rate": 9.299867583034098e-07, + "loss": 0.2596, + "step": 3064 + }, + { + "epoch": 0.2, + "grad_norm": 0.296066955293461, + "learning_rate": 9.299340444573807e-07, + "loss": 0.1294, + "step": 3065 + }, + { + "epoch": 0.2, + "grad_norm": 0.7019518256264955, + "learning_rate": 9.298813122695009e-07, + "loss": 0.1725, + "step": 3066 + }, + { + "epoch": 0.2, + "grad_norm": 0.7518499734295849, + "learning_rate": 9.298285617420202e-07, + "loss": 0.1102, + "step": 3067 + }, + { + "epoch": 0.2, + "grad_norm": 1.3364157984802234, + "learning_rate": 9.29775792877189e-07, + "loss": 0.1319, + "step": 3068 + }, + { + "epoch": 0.2, + "grad_norm": 0.33961466118762706, + "learning_rate": 9.297230056772585e-07, + "loss": 0.191, + "step": 3069 + }, + { + "epoch": 0.2, + "grad_norm": 0.5594391802486758, + "learning_rate": 9.296702001444807e-07, + "loss": 0.1917, + "step": 3070 + }, + { + "epoch": 0.2, + "grad_norm": 1.2721794436919729, + "learning_rate": 9.296173762811084e-07, + "loss": 0.3695, + "step": 3071 + }, + { + "epoch": 0.2, + "grad_norm": 0.9533258107642867, + "learning_rate": 9.295645340893952e-07, + "loss": 0.2842, + "step": 3072 + }, + { + "epoch": 0.2, + "grad_norm": 1.073891511275848, + "learning_rate": 9.295116735715955e-07, + "loss": 0.518, + "step": 3073 + }, + { + "epoch": 0.2, + "grad_norm": 0.9572885740850663, + "learning_rate": 9.294587947299644e-07, + "loss": 0.0734, + "step": 3074 + }, + { + "epoch": 0.2, + "grad_norm": 1.2986012575020074, + "learning_rate": 9.294058975667575e-07, + "loss": 0.3546, + "step": 3075 + }, + { + "epoch": 0.2, + "grad_norm": 0.4913653469206503, + "learning_rate": 9.293529820842322e-07, + "loss": 0.1909, + "step": 3076 + }, + { + "epoch": 0.2, + "grad_norm": 0.7482298590180433, + "learning_rate": 9.293000482846453e-07, + "loss": 0.141, + "step": 3077 + }, + { + "epoch": 0.2, + "grad_norm": 0.4274984032690888, + "learning_rate": 9.292470961702555e-07, + "loss": 0.273, + "step": 3078 + }, + { + "epoch": 0.2, + "grad_norm": 1.0737864233379126, + "learning_rate": 9.291941257433217e-07, + "loss": 0.1208, + "step": 3079 + }, + { + "epoch": 0.2, + "grad_norm": 0.6277167426847763, + "learning_rate": 9.291411370061036e-07, + "loss": 0.314, + "step": 3080 + }, + { + "epoch": 0.2, + "grad_norm": 0.490536857820418, + "learning_rate": 9.29088129960862e-07, + "loss": 0.1106, + "step": 3081 + }, + { + "epoch": 0.2, + "grad_norm": 0.9631473008227158, + "learning_rate": 9.290351046098581e-07, + "loss": 0.1021, + "step": 3082 + }, + { + "epoch": 0.2, + "grad_norm": 1.040663390524604, + "learning_rate": 9.289820609553542e-07, + "loss": 0.0639, + "step": 3083 + }, + { + "epoch": 0.2, + "grad_norm": 0.5977638524386644, + "learning_rate": 9.289289989996132e-07, + "loss": 0.2086, + "step": 3084 + }, + { + "epoch": 0.2, + "grad_norm": 0.7715634833014068, + "learning_rate": 9.288759187448989e-07, + "loss": 0.1186, + "step": 3085 + }, + { + "epoch": 0.2, + "grad_norm": 2.0831515958975575, + "learning_rate": 9.288228201934758e-07, + "loss": 0.1367, + "step": 3086 + }, + { + "epoch": 0.2, + "grad_norm": 0.59423851730006, + "learning_rate": 9.287697033476091e-07, + "loss": 0.4256, + "step": 3087 + }, + { + "epoch": 0.2, + "grad_norm": 2.4208760634114546, + "learning_rate": 9.287165682095649e-07, + "loss": 0.3848, + "step": 3088 + }, + { + "epoch": 0.2, + "grad_norm": 0.23706709793319297, + "learning_rate": 9.286634147816102e-07, + "loss": 0.2232, + "step": 3089 + }, + { + "epoch": 0.2, + "grad_norm": 1.3404176398453622, + "learning_rate": 9.286102430660123e-07, + "loss": 0.2283, + "step": 3090 + }, + { + "epoch": 0.2, + "grad_norm": 1.1211225236398437, + "learning_rate": 9.285570530650399e-07, + "loss": 0.2886, + "step": 3091 + }, + { + "epoch": 0.2, + "grad_norm": 0.8640493538302704, + "learning_rate": 9.285038447809621e-07, + "loss": 0.1125, + "step": 3092 + }, + { + "epoch": 0.2, + "grad_norm": 0.7734970842558655, + "learning_rate": 9.284506182160489e-07, + "loss": 0.2469, + "step": 3093 + }, + { + "epoch": 0.2, + "grad_norm": 0.6817783569138928, + "learning_rate": 9.283973733725709e-07, + "loss": 0.0149, + "step": 3094 + }, + { + "epoch": 0.2, + "grad_norm": 0.746164700972667, + "learning_rate": 9.283441102528e-07, + "loss": 0.4098, + "step": 3095 + }, + { + "epoch": 0.2, + "grad_norm": 0.6777464618768618, + "learning_rate": 9.282908288590082e-07, + "loss": 0.1893, + "step": 3096 + }, + { + "epoch": 0.2, + "grad_norm": 0.631082316411457, + "learning_rate": 9.282375291934685e-07, + "loss": 0.1531, + "step": 3097 + }, + { + "epoch": 0.2, + "grad_norm": 0.4404346136544487, + "learning_rate": 9.281842112584552e-07, + "loss": 0.0854, + "step": 3098 + }, + { + "epoch": 0.2, + "grad_norm": 0.6649865624253605, + "learning_rate": 9.281308750562425e-07, + "loss": 0.1809, + "step": 3099 + }, + { + "epoch": 0.2, + "grad_norm": 0.553830471140561, + "learning_rate": 9.280775205891062e-07, + "loss": 0.2709, + "step": 3100 + }, + { + "epoch": 0.2, + "grad_norm": 0.787758787758307, + "learning_rate": 9.280241478593221e-07, + "loss": 0.31, + "step": 3101 + }, + { + "epoch": 0.2, + "grad_norm": 0.36627677398165226, + "learning_rate": 9.279707568691676e-07, + "loss": 0.2286, + "step": 3102 + }, + { + "epoch": 0.2, + "grad_norm": 0.5283829688432854, + "learning_rate": 9.279173476209202e-07, + "loss": 0.3185, + "step": 3103 + }, + { + "epoch": 0.2, + "grad_norm": 1.6095925885429754, + "learning_rate": 9.278639201168585e-07, + "loss": 0.0636, + "step": 3104 + }, + { + "epoch": 0.2, + "grad_norm": 1.4672601309115711, + "learning_rate": 9.27810474359262e-07, + "loss": 0.2262, + "step": 3105 + }, + { + "epoch": 0.2, + "grad_norm": 0.8983970054704082, + "learning_rate": 9.277570103504104e-07, + "loss": 0.1639, + "step": 3106 + }, + { + "epoch": 0.2, + "grad_norm": 0.6342052380139972, + "learning_rate": 9.277035280925852e-07, + "loss": 0.1156, + "step": 3107 + }, + { + "epoch": 0.2, + "grad_norm": 1.0159746958773266, + "learning_rate": 9.276500275880675e-07, + "loss": 0.2254, + "step": 3108 + }, + { + "epoch": 0.2, + "grad_norm": 1.431654174276687, + "learning_rate": 9.275965088391397e-07, + "loss": 0.2284, + "step": 3109 + }, + { + "epoch": 0.2, + "grad_norm": 1.1935745659077637, + "learning_rate": 9.275429718480856e-07, + "loss": 0.1168, + "step": 3110 + }, + { + "epoch": 0.2, + "grad_norm": 0.4555984205316003, + "learning_rate": 9.274894166171887e-07, + "loss": 0.06, + "step": 3111 + }, + { + "epoch": 0.2, + "grad_norm": 1.1012201002196862, + "learning_rate": 9.274358431487339e-07, + "loss": 0.2859, + "step": 3112 + }, + { + "epoch": 0.2, + "grad_norm": 0.9332877898438722, + "learning_rate": 9.27382251445007e-07, + "loss": 0.2569, + "step": 3113 + }, + { + "epoch": 0.2, + "grad_norm": 0.7874397460911885, + "learning_rate": 9.273286415082939e-07, + "loss": 0.3878, + "step": 3114 + }, + { + "epoch": 0.2, + "grad_norm": 0.7959578824051317, + "learning_rate": 9.272750133408819e-07, + "loss": 0.1595, + "step": 3115 + }, + { + "epoch": 0.2, + "grad_norm": 0.46285371181947127, + "learning_rate": 9.27221366945059e-07, + "loss": 0.1919, + "step": 3116 + }, + { + "epoch": 0.2, + "grad_norm": 2.1482722745697806, + "learning_rate": 9.271677023231137e-07, + "loss": 0.2325, + "step": 3117 + }, + { + "epoch": 0.2, + "grad_norm": 1.7778395195759555, + "learning_rate": 9.271140194773355e-07, + "loss": 0.3524, + "step": 3118 + }, + { + "epoch": 0.2, + "grad_norm": 0.5775770200927671, + "learning_rate": 9.270603184100148e-07, + "loss": 0.2144, + "step": 3119 + }, + { + "epoch": 0.2, + "grad_norm": 0.857440018690515, + "learning_rate": 9.270065991234421e-07, + "loss": 0.2713, + "step": 3120 + }, + { + "epoch": 0.2, + "grad_norm": 0.5705712224451414, + "learning_rate": 9.269528616199097e-07, + "loss": 0.2801, + "step": 3121 + }, + { + "epoch": 0.2, + "grad_norm": 0.4706964326756469, + "learning_rate": 9.2689910590171e-07, + "loss": 0.0459, + "step": 3122 + }, + { + "epoch": 0.2, + "grad_norm": 0.42521057349481917, + "learning_rate": 9.268453319711362e-07, + "loss": 0.1089, + "step": 3123 + }, + { + "epoch": 0.2, + "grad_norm": 0.40657016370441595, + "learning_rate": 9.267915398304823e-07, + "loss": 0.1521, + "step": 3124 + }, + { + "epoch": 0.2, + "grad_norm": 0.46261893384349917, + "learning_rate": 9.267377294820435e-07, + "loss": 0.1424, + "step": 3125 + }, + { + "epoch": 0.2, + "grad_norm": 0.6833032160213107, + "learning_rate": 9.266839009281153e-07, + "loss": 0.1854, + "step": 3126 + }, + { + "epoch": 0.2, + "grad_norm": 1.7736416060711386, + "learning_rate": 9.266300541709942e-07, + "loss": 0.3294, + "step": 3127 + }, + { + "epoch": 0.2, + "grad_norm": 1.2219178648628992, + "learning_rate": 9.265761892129773e-07, + "loss": 0.1536, + "step": 3128 + }, + { + "epoch": 0.2, + "grad_norm": 0.6033314381868828, + "learning_rate": 9.265223060563626e-07, + "loss": 0.1082, + "step": 3129 + }, + { + "epoch": 0.2, + "grad_norm": 0.262137011317028, + "learning_rate": 9.26468404703449e-07, + "loss": 0.1726, + "step": 3130 + }, + { + "epoch": 0.2, + "grad_norm": 0.7195111645442679, + "learning_rate": 9.264144851565359e-07, + "loss": 0.1617, + "step": 3131 + }, + { + "epoch": 0.2, + "grad_norm": 0.3230029545712064, + "learning_rate": 9.263605474179237e-07, + "loss": 0.0923, + "step": 3132 + }, + { + "epoch": 0.2, + "grad_norm": 2.956611115674006, + "learning_rate": 9.263065914899133e-07, + "loss": 0.3059, + "step": 3133 + }, + { + "epoch": 0.2, + "grad_norm": 0.359438984188222, + "learning_rate": 9.262526173748069e-07, + "loss": 0.0781, + "step": 3134 + }, + { + "epoch": 0.2, + "grad_norm": 0.36141643563992937, + "learning_rate": 9.261986250749067e-07, + "loss": 0.1543, + "step": 3135 + }, + { + "epoch": 0.2, + "grad_norm": 0.9103019185795369, + "learning_rate": 9.261446145925167e-07, + "loss": 0.2738, + "step": 3136 + }, + { + "epoch": 0.2, + "grad_norm": 1.3150390424059855, + "learning_rate": 9.260905859299407e-07, + "loss": 0.3519, + "step": 3137 + }, + { + "epoch": 0.2, + "grad_norm": 0.4051852584242612, + "learning_rate": 9.260365390894837e-07, + "loss": 0.0811, + "step": 3138 + }, + { + "epoch": 0.2, + "grad_norm": 0.8460199513689234, + "learning_rate": 9.259824740734516e-07, + "loss": 0.122, + "step": 3139 + }, + { + "epoch": 0.2, + "grad_norm": 0.9071104525547069, + "learning_rate": 9.259283908841506e-07, + "loss": 0.3743, + "step": 3140 + }, + { + "epoch": 0.2, + "grad_norm": 0.9524789406700434, + "learning_rate": 9.258742895238885e-07, + "loss": 0.1818, + "step": 3141 + }, + { + "epoch": 0.2, + "grad_norm": 0.6664822300195525, + "learning_rate": 9.25820169994973e-07, + "loss": 0.1822, + "step": 3142 + }, + { + "epoch": 0.2, + "grad_norm": 0.6172938015588753, + "learning_rate": 9.25766032299713e-07, + "loss": 0.2441, + "step": 3143 + }, + { + "epoch": 0.2, + "grad_norm": 1.849651525952206, + "learning_rate": 9.257118764404181e-07, + "loss": 0.107, + "step": 3144 + }, + { + "epoch": 0.2, + "grad_norm": 0.7729040587341335, + "learning_rate": 9.256577024193989e-07, + "loss": 0.3767, + "step": 3145 + }, + { + "epoch": 0.2, + "grad_norm": 0.6564250188192128, + "learning_rate": 9.256035102389663e-07, + "loss": 0.0797, + "step": 3146 + }, + { + "epoch": 0.2, + "grad_norm": 3.4430327980532014, + "learning_rate": 9.255492999014324e-07, + "loss": 0.2038, + "step": 3147 + }, + { + "epoch": 0.2, + "grad_norm": 0.6259749299365427, + "learning_rate": 9.254950714091101e-07, + "loss": 0.0472, + "step": 3148 + }, + { + "epoch": 0.2, + "grad_norm": 1.575732959471976, + "learning_rate": 9.254408247643125e-07, + "loss": 0.123, + "step": 3149 + }, + { + "epoch": 0.2, + "grad_norm": 0.7782952435832244, + "learning_rate": 9.253865599693543e-07, + "loss": 0.1398, + "step": 3150 + }, + { + "epoch": 0.2, + "grad_norm": 0.5640252414464564, + "learning_rate": 9.253322770265501e-07, + "loss": 0.1967, + "step": 3151 + }, + { + "epoch": 0.2, + "grad_norm": 0.6524550108989349, + "learning_rate": 9.252779759382158e-07, + "loss": 0.268, + "step": 3152 + }, + { + "epoch": 0.2, + "grad_norm": 0.640403865390597, + "learning_rate": 9.252236567066685e-07, + "loss": 0.2419, + "step": 3153 + }, + { + "epoch": 0.2, + "grad_norm": 1.1765964313625707, + "learning_rate": 9.251693193342249e-07, + "loss": 0.3511, + "step": 3154 + }, + { + "epoch": 0.2, + "grad_norm": 1.7874003331630535, + "learning_rate": 9.251149638232037e-07, + "loss": 0.1543, + "step": 3155 + }, + { + "epoch": 0.2, + "grad_norm": 1.1031380268003386, + "learning_rate": 9.250605901759233e-07, + "loss": 0.2093, + "step": 3156 + }, + { + "epoch": 0.2, + "grad_norm": 0.7965048234082887, + "learning_rate": 9.250061983947038e-07, + "loss": 0.3321, + "step": 3157 + }, + { + "epoch": 0.2, + "grad_norm": 0.6196023978433362, + "learning_rate": 9.249517884818654e-07, + "loss": 0.4197, + "step": 3158 + }, + { + "epoch": 0.2, + "grad_norm": 0.5654349449389893, + "learning_rate": 9.248973604397295e-07, + "loss": 0.1719, + "step": 3159 + }, + { + "epoch": 0.2, + "grad_norm": 0.698012012240679, + "learning_rate": 9.248429142706181e-07, + "loss": 0.2616, + "step": 3160 + }, + { + "epoch": 0.2, + "grad_norm": 0.2680825715026603, + "learning_rate": 9.247884499768539e-07, + "loss": 0.1239, + "step": 3161 + }, + { + "epoch": 0.2, + "grad_norm": 0.4076732361684476, + "learning_rate": 9.247339675607605e-07, + "loss": 0.1892, + "step": 3162 + }, + { + "epoch": 0.2, + "grad_norm": 0.3428337471960096, + "learning_rate": 9.246794670246623e-07, + "loss": 0.0122, + "step": 3163 + }, + { + "epoch": 0.2, + "grad_norm": 0.6622795978962026, + "learning_rate": 9.246249483708842e-07, + "loss": 0.4124, + "step": 3164 + }, + { + "epoch": 0.2, + "grad_norm": 0.43485167127355784, + "learning_rate": 9.24570411601752e-07, + "loss": 0.1117, + "step": 3165 + }, + { + "epoch": 0.2, + "grad_norm": 0.5284284378374157, + "learning_rate": 9.24515856719593e-07, + "loss": 0.1605, + "step": 3166 + }, + { + "epoch": 0.2, + "grad_norm": 0.451271690731251, + "learning_rate": 9.244612837267338e-07, + "loss": 0.1461, + "step": 3167 + }, + { + "epoch": 0.2, + "grad_norm": 0.41283869920204125, + "learning_rate": 9.244066926255031e-07, + "loss": 0.0468, + "step": 3168 + }, + { + "epoch": 0.2, + "grad_norm": 0.859544709261444, + "learning_rate": 9.243520834182297e-07, + "loss": 0.0915, + "step": 3169 + }, + { + "epoch": 0.2, + "grad_norm": 0.5412777735384318, + "learning_rate": 9.242974561072436e-07, + "loss": 0.1003, + "step": 3170 + }, + { + "epoch": 0.2, + "grad_norm": 0.8852762174299448, + "learning_rate": 9.242428106948748e-07, + "loss": 0.035, + "step": 3171 + }, + { + "epoch": 0.2, + "grad_norm": 0.46427628358900325, + "learning_rate": 9.241881471834549e-07, + "loss": 0.2438, + "step": 3172 + }, + { + "epoch": 0.2, + "grad_norm": 0.3303918204521397, + "learning_rate": 9.24133465575316e-07, + "loss": 0.1165, + "step": 3173 + }, + { + "epoch": 0.2, + "grad_norm": 0.3878750718711695, + "learning_rate": 9.240787658727909e-07, + "loss": 0.3216, + "step": 3174 + }, + { + "epoch": 0.2, + "grad_norm": 0.5824463727444047, + "learning_rate": 9.240240480782129e-07, + "loss": 0.1372, + "step": 3175 + }, + { + "epoch": 0.2, + "grad_norm": 0.4037450558744446, + "learning_rate": 9.239693121939168e-07, + "loss": 0.1651, + "step": 3176 + }, + { + "epoch": 0.2, + "grad_norm": 0.8191030154457623, + "learning_rate": 9.239145582222376e-07, + "loss": 0.2761, + "step": 3177 + }, + { + "epoch": 0.2, + "grad_norm": 0.4330613171027677, + "learning_rate": 9.238597861655111e-07, + "loss": 0.1141, + "step": 3178 + }, + { + "epoch": 0.2, + "grad_norm": 1.9178433176483896, + "learning_rate": 9.23804996026074e-07, + "loss": 0.2528, + "step": 3179 + }, + { + "epoch": 0.2, + "grad_norm": 0.8536592757608799, + "learning_rate": 9.237501878062638e-07, + "loss": 0.4177, + "step": 3180 + }, + { + "epoch": 0.2, + "grad_norm": 0.5320097823585402, + "learning_rate": 9.236953615084189e-07, + "loss": 0.3115, + "step": 3181 + }, + { + "epoch": 0.2, + "grad_norm": 0.5953625757265195, + "learning_rate": 9.23640517134878e-07, + "loss": 0.1737, + "step": 3182 + }, + { + "epoch": 0.2, + "grad_norm": 0.8007362504841291, + "learning_rate": 9.23585654687981e-07, + "loss": 0.0391, + "step": 3183 + }, + { + "epoch": 0.2, + "grad_norm": 0.8458625569678068, + "learning_rate": 9.235307741700685e-07, + "loss": 0.0691, + "step": 3184 + }, + { + "epoch": 0.2, + "grad_norm": 0.7845374611852582, + "learning_rate": 9.234758755834818e-07, + "loss": 0.4652, + "step": 3185 + }, + { + "epoch": 0.2, + "grad_norm": 0.9736760960263428, + "learning_rate": 9.234209589305629e-07, + "loss": 0.1573, + "step": 3186 + }, + { + "epoch": 0.2, + "grad_norm": 1.2100880068248985, + "learning_rate": 9.233660242136548e-07, + "loss": 0.2671, + "step": 3187 + }, + { + "epoch": 0.2, + "grad_norm": 1.8863558069875754, + "learning_rate": 9.233110714351009e-07, + "loss": 0.285, + "step": 3188 + }, + { + "epoch": 0.2, + "grad_norm": 0.387485326428505, + "learning_rate": 9.232561005972459e-07, + "loss": 0.1589, + "step": 3189 + }, + { + "epoch": 0.2, + "grad_norm": 0.7961383788030786, + "learning_rate": 9.232011117024348e-07, + "loss": 0.2707, + "step": 3190 + }, + { + "epoch": 0.2, + "grad_norm": 1.9879379142928602, + "learning_rate": 9.231461047530133e-07, + "loss": 0.2961, + "step": 3191 + }, + { + "epoch": 0.2, + "grad_norm": 0.7928095490159558, + "learning_rate": 9.230910797513285e-07, + "loss": 0.143, + "step": 3192 + }, + { + "epoch": 0.2, + "grad_norm": 0.7905290172441409, + "learning_rate": 9.230360366997277e-07, + "loss": 0.3047, + "step": 3193 + }, + { + "epoch": 0.2, + "grad_norm": 1.171167431695269, + "learning_rate": 9.229809756005592e-07, + "loss": 0.0411, + "step": 3194 + }, + { + "epoch": 0.2, + "grad_norm": 0.7363608158765037, + "learning_rate": 9.229258964561719e-07, + "loss": 0.2254, + "step": 3195 + }, + { + "epoch": 0.2, + "grad_norm": 1.054144945273629, + "learning_rate": 9.228707992689157e-07, + "loss": 0.2566, + "step": 3196 + }, + { + "epoch": 0.2, + "grad_norm": 0.1728080823178616, + "learning_rate": 9.228156840411411e-07, + "loss": 0.0789, + "step": 3197 + }, + { + "epoch": 0.2, + "grad_norm": 0.4382911752143824, + "learning_rate": 9.227605507751997e-07, + "loss": 0.0144, + "step": 3198 + }, + { + "epoch": 0.2, + "grad_norm": 1.4021783476774723, + "learning_rate": 9.227053994734431e-07, + "loss": 0.4258, + "step": 3199 + }, + { + "epoch": 0.2, + "grad_norm": 0.469696850201351, + "learning_rate": 9.226502301382244e-07, + "loss": 0.0046, + "step": 3200 + }, + { + "epoch": 0.2, + "grad_norm": 0.7784926479436306, + "learning_rate": 9.225950427718974e-07, + "loss": 0.187, + "step": 3201 + }, + { + "epoch": 0.2, + "grad_norm": 0.8830903774871972, + "learning_rate": 9.225398373768163e-07, + "loss": 0.2908, + "step": 3202 + }, + { + "epoch": 0.2, + "grad_norm": 0.7850095466288786, + "learning_rate": 9.224846139553362e-07, + "loss": 0.2986, + "step": 3203 + }, + { + "epoch": 0.2, + "grad_norm": 0.7171573984108145, + "learning_rate": 9.224293725098132e-07, + "loss": 0.136, + "step": 3204 + }, + { + "epoch": 0.2, + "grad_norm": 0.7147338692888996, + "learning_rate": 9.223741130426041e-07, + "loss": 0.2694, + "step": 3205 + }, + { + "epoch": 0.2, + "grad_norm": 0.5449525920143408, + "learning_rate": 9.223188355560663e-07, + "loss": 0.1357, + "step": 3206 + }, + { + "epoch": 0.2, + "grad_norm": 0.8683597603208748, + "learning_rate": 9.222635400525578e-07, + "loss": 0.0655, + "step": 3207 + }, + { + "epoch": 0.2, + "grad_norm": 0.7005967370533396, + "learning_rate": 9.22208226534438e-07, + "loss": 0.1175, + "step": 3208 + }, + { + "epoch": 0.2, + "grad_norm": 0.43738091256188927, + "learning_rate": 9.221528950040663e-07, + "loss": 0.1073, + "step": 3209 + }, + { + "epoch": 0.2, + "grad_norm": 0.8581921996171491, + "learning_rate": 9.220975454638036e-07, + "loss": 0.081, + "step": 3210 + }, + { + "epoch": 0.2, + "grad_norm": 1.1351383388479153, + "learning_rate": 9.220421779160111e-07, + "loss": 0.0515, + "step": 3211 + }, + { + "epoch": 0.2, + "grad_norm": 0.650205733208947, + "learning_rate": 9.219867923630509e-07, + "loss": 0.1263, + "step": 3212 + }, + { + "epoch": 0.2, + "grad_norm": 0.5094960839844335, + "learning_rate": 9.219313888072859e-07, + "loss": 0.1935, + "step": 3213 + }, + { + "epoch": 0.2, + "grad_norm": 2.3775566066303138, + "learning_rate": 9.218759672510795e-07, + "loss": 0.29, + "step": 3214 + }, + { + "epoch": 0.21, + "grad_norm": 0.5839181438936653, + "learning_rate": 9.218205276967963e-07, + "loss": 0.121, + "step": 3215 + }, + { + "epoch": 0.21, + "grad_norm": 0.619965388979965, + "learning_rate": 9.217650701468015e-07, + "loss": 0.2328, + "step": 3216 + }, + { + "epoch": 0.21, + "grad_norm": 0.4594208800221896, + "learning_rate": 9.217095946034609e-07, + "loss": 0.1759, + "step": 3217 + }, + { + "epoch": 0.21, + "grad_norm": 0.894595064160595, + "learning_rate": 9.216541010691412e-07, + "loss": 0.0834, + "step": 3218 + }, + { + "epoch": 0.21, + "grad_norm": 0.6950540322671729, + "learning_rate": 9.215985895462101e-07, + "loss": 0.0271, + "step": 3219 + }, + { + "epoch": 0.21, + "grad_norm": 0.7352186234647778, + "learning_rate": 9.215430600370356e-07, + "loss": 0.1694, + "step": 3220 + }, + { + "epoch": 0.21, + "grad_norm": 0.44343233299035845, + "learning_rate": 9.214875125439865e-07, + "loss": 0.1934, + "step": 3221 + }, + { + "epoch": 0.21, + "grad_norm": 1.4432183421572362, + "learning_rate": 9.21431947069433e-07, + "loss": 0.1698, + "step": 3222 + }, + { + "epoch": 0.21, + "grad_norm": 0.48105287720482787, + "learning_rate": 9.213763636157454e-07, + "loss": 0.1607, + "step": 3223 + }, + { + "epoch": 0.21, + "grad_norm": 0.31192982752598486, + "learning_rate": 9.213207621852952e-07, + "loss": 0.0758, + "step": 3224 + }, + { + "epoch": 0.21, + "grad_norm": 0.6726849577678, + "learning_rate": 9.212651427804543e-07, + "loss": 0.1775, + "step": 3225 + }, + { + "epoch": 0.21, + "grad_norm": 0.6263180529191299, + "learning_rate": 9.212095054035955e-07, + "loss": 0.0706, + "step": 3226 + }, + { + "epoch": 0.21, + "grad_norm": 0.4635604648015689, + "learning_rate": 9.211538500570923e-07, + "loss": 0.1287, + "step": 3227 + }, + { + "epoch": 0.21, + "grad_norm": 0.8474045210307591, + "learning_rate": 9.210981767433195e-07, + "loss": 0.3819, + "step": 3228 + }, + { + "epoch": 0.21, + "grad_norm": 0.6874972538717036, + "learning_rate": 9.210424854646519e-07, + "loss": 0.2192, + "step": 3229 + }, + { + "epoch": 0.21, + "grad_norm": 0.7691094876067304, + "learning_rate": 9.209867762234653e-07, + "loss": 0.022, + "step": 3230 + }, + { + "epoch": 0.21, + "grad_norm": 0.696244205564151, + "learning_rate": 9.209310490221367e-07, + "loss": 0.3037, + "step": 3231 + }, + { + "epoch": 0.21, + "grad_norm": 0.8153250419759999, + "learning_rate": 9.208753038630434e-07, + "loss": 0.1564, + "step": 3232 + }, + { + "epoch": 0.21, + "grad_norm": 0.9541435679121205, + "learning_rate": 9.208195407485634e-07, + "loss": 0.0973, + "step": 3233 + }, + { + "epoch": 0.21, + "grad_norm": 0.8509587442576104, + "learning_rate": 9.20763759681076e-07, + "loss": 0.1268, + "step": 3234 + }, + { + "epoch": 0.21, + "grad_norm": 0.2920801299379249, + "learning_rate": 9.207079606629606e-07, + "loss": 0.1542, + "step": 3235 + }, + { + "epoch": 0.21, + "grad_norm": 1.1822533011617806, + "learning_rate": 9.206521436965981e-07, + "loss": 0.1604, + "step": 3236 + }, + { + "epoch": 0.21, + "grad_norm": 1.0059318658786853, + "learning_rate": 9.205963087843693e-07, + "loss": 0.2971, + "step": 3237 + }, + { + "epoch": 0.21, + "grad_norm": 0.6283855641188356, + "learning_rate": 9.205404559286567e-07, + "loss": 0.2701, + "step": 3238 + }, + { + "epoch": 0.21, + "grad_norm": 0.6352247159513633, + "learning_rate": 9.204845851318427e-07, + "loss": 0.2549, + "step": 3239 + }, + { + "epoch": 0.21, + "grad_norm": 1.0524122518502468, + "learning_rate": 9.204286963963111e-07, + "loss": 0.3879, + "step": 3240 + }, + { + "epoch": 0.21, + "grad_norm": 0.9223261899788481, + "learning_rate": 9.20372789724446e-07, + "loss": 0.1511, + "step": 3241 + }, + { + "epoch": 0.21, + "grad_norm": 1.0375883721197088, + "learning_rate": 9.203168651186329e-07, + "loss": 0.1093, + "step": 3242 + }, + { + "epoch": 0.21, + "grad_norm": 0.6249258115181283, + "learning_rate": 9.202609225812571e-07, + "loss": 0.1495, + "step": 3243 + }, + { + "epoch": 0.21, + "grad_norm": 0.3723623613833438, + "learning_rate": 9.202049621147055e-07, + "loss": 0.2153, + "step": 3244 + }, + { + "epoch": 0.21, + "grad_norm": 1.116965781580051, + "learning_rate": 9.201489837213658e-07, + "loss": 0.1362, + "step": 3245 + }, + { + "epoch": 0.21, + "grad_norm": 1.5384575646385588, + "learning_rate": 9.200929874036257e-07, + "loss": 0.2366, + "step": 3246 + }, + { + "epoch": 0.21, + "grad_norm": 1.0716566854155367, + "learning_rate": 9.200369731638741e-07, + "loss": 0.0693, + "step": 3247 + }, + { + "epoch": 0.21, + "grad_norm": 0.6249185693333995, + "learning_rate": 9.19980941004501e-07, + "loss": 0.1311, + "step": 3248 + }, + { + "epoch": 0.21, + "grad_norm": 0.7468108907428338, + "learning_rate": 9.199248909278967e-07, + "loss": 0.1688, + "step": 3249 + }, + { + "epoch": 0.21, + "grad_norm": 1.3635267021327973, + "learning_rate": 9.198688229364524e-07, + "loss": 0.0944, + "step": 3250 + }, + { + "epoch": 0.21, + "grad_norm": 0.49514635687869285, + "learning_rate": 9.198127370325601e-07, + "loss": 0.1282, + "step": 3251 + }, + { + "epoch": 0.21, + "grad_norm": 0.5079375544928111, + "learning_rate": 9.197566332186124e-07, + "loss": 0.106, + "step": 3252 + }, + { + "epoch": 0.21, + "grad_norm": 1.4813235463590242, + "learning_rate": 9.197005114970029e-07, + "loss": 0.0962, + "step": 3253 + }, + { + "epoch": 0.21, + "grad_norm": 0.4454324149510626, + "learning_rate": 9.19644371870126e-07, + "loss": 0.2286, + "step": 3254 + }, + { + "epoch": 0.21, + "grad_norm": 0.4247545785031421, + "learning_rate": 9.195882143403766e-07, + "loss": 0.0118, + "step": 3255 + }, + { + "epoch": 0.21, + "grad_norm": 0.6139334715147486, + "learning_rate": 9.195320389101504e-07, + "loss": 0.3141, + "step": 3256 + }, + { + "epoch": 0.21, + "grad_norm": 0.8679100346942271, + "learning_rate": 9.194758455818441e-07, + "loss": 0.1987, + "step": 3257 + }, + { + "epoch": 0.21, + "grad_norm": 0.7583339385851732, + "learning_rate": 9.194196343578549e-07, + "loss": 0.4295, + "step": 3258 + }, + { + "epoch": 0.21, + "grad_norm": 0.5185716101109296, + "learning_rate": 9.193634052405811e-07, + "loss": 0.2579, + "step": 3259 + }, + { + "epoch": 0.21, + "grad_norm": 2.2887558954245897, + "learning_rate": 9.193071582324213e-07, + "loss": 0.0536, + "step": 3260 + }, + { + "epoch": 0.21, + "grad_norm": 0.7758585411009997, + "learning_rate": 9.192508933357752e-07, + "loss": 0.171, + "step": 3261 + }, + { + "epoch": 0.21, + "grad_norm": 0.2598361492938699, + "learning_rate": 9.191946105530432e-07, + "loss": 0.0823, + "step": 3262 + }, + { + "epoch": 0.21, + "grad_norm": 0.5276822194378585, + "learning_rate": 9.191383098866265e-07, + "loss": 0.1623, + "step": 3263 + }, + { + "epoch": 0.21, + "grad_norm": 0.6953044661041045, + "learning_rate": 9.190819913389269e-07, + "loss": 0.2203, + "step": 3264 + }, + { + "epoch": 0.21, + "grad_norm": 0.27018655248726875, + "learning_rate": 9.190256549123471e-07, + "loss": 0.2548, + "step": 3265 + }, + { + "epoch": 0.21, + "grad_norm": 0.6395899427747753, + "learning_rate": 9.189693006092905e-07, + "loss": 0.0718, + "step": 3266 + }, + { + "epoch": 0.21, + "grad_norm": 0.22490555307165516, + "learning_rate": 9.189129284321614e-07, + "loss": 0.0889, + "step": 3267 + }, + { + "epoch": 0.21, + "grad_norm": 0.9837568311853337, + "learning_rate": 9.188565383833647e-07, + "loss": 0.1677, + "step": 3268 + }, + { + "epoch": 0.21, + "grad_norm": 0.6026477433004154, + "learning_rate": 9.188001304653058e-07, + "loss": 0.1218, + "step": 3269 + }, + { + "epoch": 0.21, + "grad_norm": 0.8915853470145135, + "learning_rate": 9.187437046803915e-07, + "loss": 0.1409, + "step": 3270 + }, + { + "epoch": 0.21, + "grad_norm": 0.9289081429824383, + "learning_rate": 9.18687261031029e-07, + "loss": 0.2263, + "step": 3271 + }, + { + "epoch": 0.21, + "grad_norm": 0.7102800021580643, + "learning_rate": 9.186307995196263e-07, + "loss": 0.0423, + "step": 3272 + }, + { + "epoch": 0.21, + "grad_norm": 0.40979456304112044, + "learning_rate": 9.185743201485922e-07, + "loss": 0.2591, + "step": 3273 + }, + { + "epoch": 0.21, + "grad_norm": 0.8633246845478875, + "learning_rate": 9.185178229203361e-07, + "loss": 0.1852, + "step": 3274 + }, + { + "epoch": 0.21, + "grad_norm": 0.36422496372365826, + "learning_rate": 9.184613078372685e-07, + "loss": 0.2195, + "step": 3275 + }, + { + "epoch": 0.21, + "grad_norm": 0.6511353010145003, + "learning_rate": 9.184047749018e-07, + "loss": 0.1936, + "step": 3276 + }, + { + "epoch": 0.21, + "grad_norm": 0.6264697924714663, + "learning_rate": 9.183482241163429e-07, + "loss": 0.1882, + "step": 3277 + }, + { + "epoch": 0.21, + "grad_norm": 0.6652060262657263, + "learning_rate": 9.182916554833096e-07, + "loss": 0.2467, + "step": 3278 + }, + { + "epoch": 0.21, + "grad_norm": 2.013861147834192, + "learning_rate": 9.182350690051132e-07, + "loss": 0.2543, + "step": 3279 + }, + { + "epoch": 0.21, + "grad_norm": 0.5113679897931713, + "learning_rate": 9.181784646841683e-07, + "loss": 0.317, + "step": 3280 + }, + { + "epoch": 0.21, + "grad_norm": 0.5882751125353862, + "learning_rate": 9.181218425228893e-07, + "loss": 0.1704, + "step": 3281 + }, + { + "epoch": 0.21, + "grad_norm": 0.9396655086336284, + "learning_rate": 9.180652025236919e-07, + "loss": 0.1283, + "step": 3282 + }, + { + "epoch": 0.21, + "grad_norm": 0.40541933418651066, + "learning_rate": 9.180085446889927e-07, + "loss": 0.1214, + "step": 3283 + }, + { + "epoch": 0.21, + "grad_norm": 1.1760734602911802, + "learning_rate": 9.179518690212085e-07, + "loss": 0.1954, + "step": 3284 + }, + { + "epoch": 0.21, + "grad_norm": 0.46856952626879744, + "learning_rate": 9.178951755227575e-07, + "loss": 0.0292, + "step": 3285 + }, + { + "epoch": 0.21, + "grad_norm": 0.7802914955954653, + "learning_rate": 9.178384641960582e-07, + "loss": 0.4202, + "step": 3286 + }, + { + "epoch": 0.21, + "grad_norm": 0.7315321948269884, + "learning_rate": 9.1778173504353e-07, + "loss": 0.1565, + "step": 3287 + }, + { + "epoch": 0.21, + "grad_norm": 1.222393850481576, + "learning_rate": 9.177249880675933e-07, + "loss": 0.2781, + "step": 3288 + }, + { + "epoch": 0.21, + "grad_norm": 0.666436708168778, + "learning_rate": 9.176682232706687e-07, + "loss": 0.1222, + "step": 3289 + }, + { + "epoch": 0.21, + "grad_norm": 1.9280795230892156, + "learning_rate": 9.176114406551783e-07, + "loss": 0.1178, + "step": 3290 + }, + { + "epoch": 0.21, + "grad_norm": 0.7398973842010642, + "learning_rate": 9.175546402235441e-07, + "loss": 0.326, + "step": 3291 + }, + { + "epoch": 0.21, + "grad_norm": 0.4088669314826049, + "learning_rate": 9.174978219781897e-07, + "loss": 0.2255, + "step": 3292 + }, + { + "epoch": 0.21, + "grad_norm": 0.175157809384668, + "learning_rate": 9.174409859215388e-07, + "loss": 0.0047, + "step": 3293 + }, + { + "epoch": 0.21, + "grad_norm": 0.9253171979955276, + "learning_rate": 9.173841320560164e-07, + "loss": 0.0849, + "step": 3294 + }, + { + "epoch": 0.21, + "grad_norm": 0.7178912350948362, + "learning_rate": 9.173272603840478e-07, + "loss": 0.115, + "step": 3295 + }, + { + "epoch": 0.21, + "grad_norm": 0.8153411201239142, + "learning_rate": 9.172703709080593e-07, + "loss": 0.1246, + "step": 3296 + }, + { + "epoch": 0.21, + "grad_norm": 0.5726353685004575, + "learning_rate": 9.172134636304782e-07, + "loss": 0.37, + "step": 3297 + }, + { + "epoch": 0.21, + "grad_norm": 0.460500779942402, + "learning_rate": 9.171565385537318e-07, + "loss": 0.1719, + "step": 3298 + }, + { + "epoch": 0.21, + "grad_norm": 1.2420851980112937, + "learning_rate": 9.17099595680249e-07, + "loss": 0.182, + "step": 3299 + }, + { + "epoch": 0.21, + "grad_norm": 0.4513015773792623, + "learning_rate": 9.170426350124589e-07, + "loss": 0.2616, + "step": 3300 + }, + { + "epoch": 0.21, + "grad_norm": 0.5483890191093747, + "learning_rate": 9.169856565527916e-07, + "loss": 0.0853, + "step": 3301 + }, + { + "epoch": 0.21, + "grad_norm": 1.3191579094376202, + "learning_rate": 9.169286603036781e-07, + "loss": 0.2658, + "step": 3302 + }, + { + "epoch": 0.21, + "grad_norm": 0.49489582655847625, + "learning_rate": 9.168716462675497e-07, + "loss": 0.2081, + "step": 3303 + }, + { + "epoch": 0.21, + "grad_norm": 0.40434010307233975, + "learning_rate": 9.168146144468388e-07, + "loss": 0.1118, + "step": 3304 + }, + { + "epoch": 0.21, + "grad_norm": 1.965669223985352, + "learning_rate": 9.167575648439787e-07, + "loss": 0.0482, + "step": 3305 + }, + { + "epoch": 0.21, + "grad_norm": 0.8037598988165185, + "learning_rate": 9.16700497461403e-07, + "loss": 0.1939, + "step": 3306 + }, + { + "epoch": 0.21, + "grad_norm": 1.0228184358035761, + "learning_rate": 9.166434123015462e-07, + "loss": 0.1571, + "step": 3307 + }, + { + "epoch": 0.21, + "grad_norm": 1.106800100492122, + "learning_rate": 9.165863093668442e-07, + "loss": 0.2343, + "step": 3308 + }, + { + "epoch": 0.21, + "grad_norm": 0.13701452775901876, + "learning_rate": 9.165291886597328e-07, + "loss": 0.0035, + "step": 3309 + }, + { + "epoch": 0.21, + "grad_norm": 0.6820776536663483, + "learning_rate": 9.164720501826488e-07, + "loss": 0.1915, + "step": 3310 + }, + { + "epoch": 0.21, + "grad_norm": 0.5055141620505854, + "learning_rate": 9.164148939380299e-07, + "loss": 0.3251, + "step": 3311 + }, + { + "epoch": 0.21, + "grad_norm": 0.730238558757734, + "learning_rate": 9.163577199283144e-07, + "loss": 0.261, + "step": 3312 + }, + { + "epoch": 0.21, + "grad_norm": 1.3908591679499098, + "learning_rate": 9.163005281559418e-07, + "loss": 0.2514, + "step": 3313 + }, + { + "epoch": 0.21, + "grad_norm": 0.7861487704535053, + "learning_rate": 9.162433186233516e-07, + "loss": 0.2354, + "step": 3314 + }, + { + "epoch": 0.21, + "grad_norm": 0.6252351727346455, + "learning_rate": 9.161860913329848e-07, + "loss": 0.144, + "step": 3315 + }, + { + "epoch": 0.21, + "grad_norm": 4.939100655055246, + "learning_rate": 9.161288462872827e-07, + "loss": 0.2447, + "step": 3316 + }, + { + "epoch": 0.21, + "grad_norm": 0.9484435047301384, + "learning_rate": 9.160715834886876e-07, + "loss": 0.3138, + "step": 3317 + }, + { + "epoch": 0.21, + "grad_norm": 0.9739724864529796, + "learning_rate": 9.160143029396421e-07, + "loss": 0.1434, + "step": 3318 + }, + { + "epoch": 0.21, + "grad_norm": 1.4506480021370867, + "learning_rate": 9.159570046425902e-07, + "loss": 0.0983, + "step": 3319 + }, + { + "epoch": 0.21, + "grad_norm": 0.5178633961922393, + "learning_rate": 9.158996885999763e-07, + "loss": 0.1196, + "step": 3320 + }, + { + "epoch": 0.21, + "grad_norm": 0.9579539480681765, + "learning_rate": 9.158423548142458e-07, + "loss": 0.2286, + "step": 3321 + }, + { + "epoch": 0.21, + "grad_norm": 0.3888085819064977, + "learning_rate": 9.157850032878445e-07, + "loss": 0.0631, + "step": 3322 + }, + { + "epoch": 0.21, + "grad_norm": 0.845659596007509, + "learning_rate": 9.157276340232189e-07, + "loss": 0.167, + "step": 3323 + }, + { + "epoch": 0.21, + "grad_norm": 0.3534485994896798, + "learning_rate": 9.156702470228168e-07, + "loss": 0.0901, + "step": 3324 + }, + { + "epoch": 0.21, + "grad_norm": 1.1605635089106716, + "learning_rate": 9.156128422890864e-07, + "loss": 0.3227, + "step": 3325 + }, + { + "epoch": 0.21, + "grad_norm": 0.7428022722660884, + "learning_rate": 9.155554198244766e-07, + "loss": 0.1184, + "step": 3326 + }, + { + "epoch": 0.21, + "grad_norm": 1.2918648195664704, + "learning_rate": 9.154979796314373e-07, + "loss": 0.3346, + "step": 3327 + }, + { + "epoch": 0.21, + "grad_norm": 0.5856853294500018, + "learning_rate": 9.15440521712419e-07, + "loss": 0.1636, + "step": 3328 + }, + { + "epoch": 0.21, + "grad_norm": 0.9437189009731898, + "learning_rate": 9.153830460698727e-07, + "loss": 0.0998, + "step": 3329 + }, + { + "epoch": 0.21, + "grad_norm": 0.11880129973324605, + "learning_rate": 9.153255527062508e-07, + "loss": 0.0034, + "step": 3330 + }, + { + "epoch": 0.21, + "grad_norm": 0.7106187310767957, + "learning_rate": 9.152680416240058e-07, + "loss": 0.1795, + "step": 3331 + }, + { + "epoch": 0.21, + "grad_norm": 1.165237470034804, + "learning_rate": 9.152105128255913e-07, + "loss": 0.1602, + "step": 3332 + }, + { + "epoch": 0.21, + "grad_norm": 0.5943122879158438, + "learning_rate": 9.151529663134618e-07, + "loss": 0.2253, + "step": 3333 + }, + { + "epoch": 0.21, + "grad_norm": 0.6323695057504283, + "learning_rate": 9.150954020900721e-07, + "loss": 0.27, + "step": 3334 + }, + { + "epoch": 0.21, + "grad_norm": 0.6057181664144572, + "learning_rate": 9.150378201578783e-07, + "loss": 0.0806, + "step": 3335 + }, + { + "epoch": 0.21, + "grad_norm": 0.8036668462639066, + "learning_rate": 9.149802205193364e-07, + "loss": 0.3558, + "step": 3336 + }, + { + "epoch": 0.21, + "grad_norm": 0.9407285413340083, + "learning_rate": 9.149226031769044e-07, + "loss": 0.5969, + "step": 3337 + }, + { + "epoch": 0.21, + "grad_norm": 1.0538561578320227, + "learning_rate": 9.148649681330399e-07, + "loss": 0.2665, + "step": 3338 + }, + { + "epoch": 0.21, + "grad_norm": 0.9601589805852963, + "learning_rate": 9.148073153902018e-07, + "loss": 0.1061, + "step": 3339 + }, + { + "epoch": 0.21, + "grad_norm": 1.2809480204308987, + "learning_rate": 9.1474964495085e-07, + "loss": 0.3411, + "step": 3340 + }, + { + "epoch": 0.21, + "grad_norm": 0.9292083215413531, + "learning_rate": 9.146919568174443e-07, + "loss": 0.4065, + "step": 3341 + }, + { + "epoch": 0.21, + "grad_norm": 0.8696332294107295, + "learning_rate": 9.146342509924463e-07, + "loss": 0.1685, + "step": 3342 + }, + { + "epoch": 0.21, + "grad_norm": 0.8419173746837841, + "learning_rate": 9.145765274783176e-07, + "loss": 0.3281, + "step": 3343 + }, + { + "epoch": 0.21, + "grad_norm": 0.7068294963354437, + "learning_rate": 9.145187862775208e-07, + "loss": 0.3983, + "step": 3344 + }, + { + "epoch": 0.21, + "grad_norm": 2.7314959557750793, + "learning_rate": 9.144610273925193e-07, + "loss": 0.1848, + "step": 3345 + }, + { + "epoch": 0.21, + "grad_norm": 0.33549357986216816, + "learning_rate": 9.144032508257772e-07, + "loss": 0.0108, + "step": 3346 + }, + { + "epoch": 0.21, + "grad_norm": 1.2287606074263027, + "learning_rate": 9.143454565797593e-07, + "loss": 0.3093, + "step": 3347 + }, + { + "epoch": 0.21, + "grad_norm": 0.5120194785272824, + "learning_rate": 9.142876446569314e-07, + "loss": 0.1189, + "step": 3348 + }, + { + "epoch": 0.21, + "grad_norm": 0.7351605139084882, + "learning_rate": 9.142298150597597e-07, + "loss": 0.2479, + "step": 3349 + }, + { + "epoch": 0.21, + "grad_norm": 0.8778220062242337, + "learning_rate": 9.141719677907114e-07, + "loss": 0.2634, + "step": 3350 + }, + { + "epoch": 0.21, + "grad_norm": 0.6896967732039392, + "learning_rate": 9.141141028522544e-07, + "loss": 0.4148, + "step": 3351 + }, + { + "epoch": 0.21, + "grad_norm": 0.14780695521541093, + "learning_rate": 9.140562202468571e-07, + "loss": 0.0727, + "step": 3352 + }, + { + "epoch": 0.21, + "grad_norm": 0.33412376814886113, + "learning_rate": 9.139983199769894e-07, + "loss": 0.1369, + "step": 3353 + }, + { + "epoch": 0.21, + "grad_norm": 0.4228120604743642, + "learning_rate": 9.139404020451209e-07, + "loss": 0.3021, + "step": 3354 + }, + { + "epoch": 0.21, + "grad_norm": 0.5890829771220137, + "learning_rate": 9.138824664537228e-07, + "loss": 0.3531, + "step": 3355 + }, + { + "epoch": 0.21, + "grad_norm": 0.9781033684600996, + "learning_rate": 9.138245132052667e-07, + "loss": 0.1932, + "step": 3356 + }, + { + "epoch": 0.21, + "grad_norm": 0.224919100610762, + "learning_rate": 9.13766542302225e-07, + "loss": 0.1376, + "step": 3357 + }, + { + "epoch": 0.21, + "grad_norm": 0.6696608362091178, + "learning_rate": 9.137085537470706e-07, + "loss": 0.3244, + "step": 3358 + }, + { + "epoch": 0.21, + "grad_norm": 0.43231195722747895, + "learning_rate": 9.136505475422778e-07, + "loss": 0.0095, + "step": 3359 + }, + { + "epoch": 0.21, + "grad_norm": 0.6223939786173917, + "learning_rate": 9.135925236903212e-07, + "loss": 0.1022, + "step": 3360 + }, + { + "epoch": 0.21, + "grad_norm": 0.5430507895496046, + "learning_rate": 9.135344821936759e-07, + "loss": 0.3908, + "step": 3361 + }, + { + "epoch": 0.21, + "grad_norm": 0.4061619624480858, + "learning_rate": 9.134764230548184e-07, + "loss": 0.0202, + "step": 3362 + }, + { + "epoch": 0.21, + "grad_norm": 0.7312764284057695, + "learning_rate": 9.134183462762255e-07, + "loss": 0.132, + "step": 3363 + }, + { + "epoch": 0.21, + "grad_norm": 0.4608353146168241, + "learning_rate": 9.133602518603749e-07, + "loss": 0.4335, + "step": 3364 + }, + { + "epoch": 0.21, + "grad_norm": 0.726778267690586, + "learning_rate": 9.133021398097449e-07, + "loss": 0.3721, + "step": 3365 + }, + { + "epoch": 0.21, + "grad_norm": 0.3008911729782962, + "learning_rate": 9.132440101268149e-07, + "loss": 0.0331, + "step": 3366 + }, + { + "epoch": 0.21, + "grad_norm": 0.34111277979521704, + "learning_rate": 9.131858628140647e-07, + "loss": 0.1128, + "step": 3367 + }, + { + "epoch": 0.21, + "grad_norm": 0.6935948489725566, + "learning_rate": 9.131276978739748e-07, + "loss": 0.0844, + "step": 3368 + }, + { + "epoch": 0.21, + "grad_norm": 1.2863030783735738, + "learning_rate": 9.130695153090271e-07, + "loss": 0.1082, + "step": 3369 + }, + { + "epoch": 0.21, + "grad_norm": 0.28620158719811806, + "learning_rate": 9.130113151217033e-07, + "loss": 0.0241, + "step": 3370 + }, + { + "epoch": 0.21, + "grad_norm": 0.8812706385404825, + "learning_rate": 9.129530973144866e-07, + "loss": 0.2179, + "step": 3371 + }, + { + "epoch": 0.22, + "grad_norm": 0.56321103348772, + "learning_rate": 9.128948618898606e-07, + "loss": 0.3122, + "step": 3372 + }, + { + "epoch": 0.22, + "grad_norm": 0.7809217075713873, + "learning_rate": 9.128366088503098e-07, + "loss": 0.2553, + "step": 3373 + }, + { + "epoch": 0.22, + "grad_norm": 2.487430715921176, + "learning_rate": 9.127783381983194e-07, + "loss": 0.1502, + "step": 3374 + }, + { + "epoch": 0.22, + "grad_norm": 0.5128932989123394, + "learning_rate": 9.127200499363752e-07, + "loss": 0.0713, + "step": 3375 + }, + { + "epoch": 0.22, + "grad_norm": 0.686968120630709, + "learning_rate": 9.126617440669641e-07, + "loss": 0.2624, + "step": 3376 + }, + { + "epoch": 0.22, + "grad_norm": 0.9774532076092192, + "learning_rate": 9.126034205925733e-07, + "loss": 0.1044, + "step": 3377 + }, + { + "epoch": 0.22, + "grad_norm": 0.7031551533349818, + "learning_rate": 9.125450795156912e-07, + "loss": 0.0799, + "step": 3378 + }, + { + "epoch": 0.22, + "grad_norm": 0.550485547328557, + "learning_rate": 9.124867208388067e-07, + "loss": 0.2822, + "step": 3379 + }, + { + "epoch": 0.22, + "grad_norm": 0.6038386615340078, + "learning_rate": 9.124283445644097e-07, + "loss": 0.1177, + "step": 3380 + }, + { + "epoch": 0.22, + "grad_norm": 0.3482277354928197, + "learning_rate": 9.123699506949901e-07, + "loss": 0.0413, + "step": 3381 + }, + { + "epoch": 0.22, + "grad_norm": 0.5948768393364658, + "learning_rate": 9.123115392330396e-07, + "loss": 0.1652, + "step": 3382 + }, + { + "epoch": 0.22, + "grad_norm": 0.3894779516719231, + "learning_rate": 9.1225311018105e-07, + "loss": 0.1263, + "step": 3383 + }, + { + "epoch": 0.22, + "grad_norm": 0.4785026138141128, + "learning_rate": 9.121946635415139e-07, + "loss": 0.0096, + "step": 3384 + }, + { + "epoch": 0.22, + "grad_norm": 0.8184672825827085, + "learning_rate": 9.121361993169249e-07, + "loss": 0.0957, + "step": 3385 + }, + { + "epoch": 0.22, + "grad_norm": 1.246811980644127, + "learning_rate": 9.120777175097771e-07, + "loss": 0.2403, + "step": 3386 + }, + { + "epoch": 0.22, + "grad_norm": 0.9640966324191959, + "learning_rate": 9.120192181225656e-07, + "loss": 0.3179, + "step": 3387 + }, + { + "epoch": 0.22, + "grad_norm": 0.6895051692302739, + "learning_rate": 9.119607011577859e-07, + "loss": 0.2984, + "step": 3388 + }, + { + "epoch": 0.22, + "grad_norm": 0.29280048192975544, + "learning_rate": 9.119021666179345e-07, + "loss": 0.134, + "step": 3389 + }, + { + "epoch": 0.22, + "grad_norm": 2.137237951547494, + "learning_rate": 9.118436145055089e-07, + "loss": 0.1653, + "step": 3390 + }, + { + "epoch": 0.22, + "grad_norm": 0.5688838425370352, + "learning_rate": 9.117850448230065e-07, + "loss": 0.1128, + "step": 3391 + }, + { + "epoch": 0.22, + "grad_norm": 0.4400530005215225, + "learning_rate": 9.117264575729265e-07, + "loss": 0.0096, + "step": 3392 + }, + { + "epoch": 0.22, + "grad_norm": 0.7937961745154153, + "learning_rate": 9.116678527577679e-07, + "loss": 0.1149, + "step": 3393 + }, + { + "epoch": 0.22, + "grad_norm": 1.2148334514226202, + "learning_rate": 9.116092303800314e-07, + "loss": 0.1107, + "step": 3394 + }, + { + "epoch": 0.22, + "grad_norm": 0.9967046538924188, + "learning_rate": 9.115505904422175e-07, + "loss": 0.4086, + "step": 3395 + }, + { + "epoch": 0.22, + "grad_norm": 1.4439013094625506, + "learning_rate": 9.114919329468282e-07, + "loss": 0.2029, + "step": 3396 + }, + { + "epoch": 0.22, + "grad_norm": 0.5850941191041946, + "learning_rate": 9.114332578963657e-07, + "loss": 0.0204, + "step": 3397 + }, + { + "epoch": 0.22, + "grad_norm": 0.5596567682767649, + "learning_rate": 9.113745652933336e-07, + "loss": 0.2591, + "step": 3398 + }, + { + "epoch": 0.22, + "grad_norm": 0.7153097443238435, + "learning_rate": 9.113158551402354e-07, + "loss": 0.3206, + "step": 3399 + }, + { + "epoch": 0.22, + "grad_norm": 0.660737198000805, + "learning_rate": 9.11257127439576e-07, + "loss": 0.3275, + "step": 3400 + }, + { + "epoch": 0.22, + "grad_norm": 0.8420471598318644, + "learning_rate": 9.111983821938607e-07, + "loss": 0.2957, + "step": 3401 + }, + { + "epoch": 0.22, + "grad_norm": 0.3974637733227828, + "learning_rate": 9.11139619405596e-07, + "loss": 0.1292, + "step": 3402 + }, + { + "epoch": 0.22, + "grad_norm": 0.8618460911519606, + "learning_rate": 9.110808390772885e-07, + "loss": 0.2779, + "step": 3403 + }, + { + "epoch": 0.22, + "grad_norm": 0.5857774826771328, + "learning_rate": 9.110220412114461e-07, + "loss": 0.2278, + "step": 3404 + }, + { + "epoch": 0.22, + "grad_norm": 1.1973854569132587, + "learning_rate": 9.10963225810577e-07, + "loss": 0.1789, + "step": 3405 + }, + { + "epoch": 0.22, + "grad_norm": 0.6026320213695966, + "learning_rate": 9.109043928771909e-07, + "loss": 0.1854, + "step": 3406 + }, + { + "epoch": 0.22, + "grad_norm": 0.7815164386452087, + "learning_rate": 9.10845542413797e-07, + "loss": 0.2431, + "step": 3407 + }, + { + "epoch": 0.22, + "grad_norm": 0.4577216473190088, + "learning_rate": 9.107866744229066e-07, + "loss": 0.1924, + "step": 3408 + }, + { + "epoch": 0.22, + "grad_norm": 0.40027900877199785, + "learning_rate": 9.107277889070309e-07, + "loss": 0.0155, + "step": 3409 + }, + { + "epoch": 0.22, + "grad_norm": 2.293075533621946, + "learning_rate": 9.106688858686819e-07, + "loss": 0.1404, + "step": 3410 + }, + { + "epoch": 0.22, + "grad_norm": 1.0836027098191086, + "learning_rate": 9.106099653103727e-07, + "loss": 0.2441, + "step": 3411 + }, + { + "epoch": 0.22, + "grad_norm": 0.23668753584277077, + "learning_rate": 9.10551027234617e-07, + "loss": 0.1216, + "step": 3412 + }, + { + "epoch": 0.22, + "grad_norm": 0.58709382814804, + "learning_rate": 9.104920716439293e-07, + "loss": 0.2864, + "step": 3413 + }, + { + "epoch": 0.22, + "grad_norm": 0.5794583746193465, + "learning_rate": 9.104330985408244e-07, + "loss": 0.2686, + "step": 3414 + }, + { + "epoch": 0.22, + "grad_norm": 0.8895225246835184, + "learning_rate": 9.103741079278186e-07, + "loss": 0.5283, + "step": 3415 + }, + { + "epoch": 0.22, + "grad_norm": 0.39283921122768195, + "learning_rate": 9.103150998074283e-07, + "loss": 0.2204, + "step": 3416 + }, + { + "epoch": 0.22, + "grad_norm": 1.804370593211932, + "learning_rate": 9.10256074182171e-07, + "loss": 0.1024, + "step": 3417 + }, + { + "epoch": 0.22, + "grad_norm": 0.7469095195242345, + "learning_rate": 9.101970310545649e-07, + "loss": 0.1825, + "step": 3418 + }, + { + "epoch": 0.22, + "grad_norm": 0.4301125149502763, + "learning_rate": 9.101379704271288e-07, + "loss": 0.1813, + "step": 3419 + }, + { + "epoch": 0.22, + "grad_norm": 0.43787974520146306, + "learning_rate": 9.100788923023826e-07, + "loss": 0.1324, + "step": 3420 + }, + { + "epoch": 0.22, + "grad_norm": 0.9814626283804289, + "learning_rate": 9.100197966828462e-07, + "loss": 0.1023, + "step": 3421 + }, + { + "epoch": 0.22, + "grad_norm": 0.6053470594817225, + "learning_rate": 9.099606835710413e-07, + "loss": 0.305, + "step": 3422 + }, + { + "epoch": 0.22, + "grad_norm": 0.43098791540108977, + "learning_rate": 9.099015529694893e-07, + "loss": 0.0927, + "step": 3423 + }, + { + "epoch": 0.22, + "grad_norm": 0.30846510323918225, + "learning_rate": 9.098424048807131e-07, + "loss": 0.2379, + "step": 3424 + }, + { + "epoch": 0.22, + "grad_norm": 0.3731493271743987, + "learning_rate": 9.097832393072362e-07, + "loss": 0.1818, + "step": 3425 + }, + { + "epoch": 0.22, + "grad_norm": 0.6252877550256376, + "learning_rate": 9.097240562515824e-07, + "loss": 0.1592, + "step": 3426 + }, + { + "epoch": 0.22, + "grad_norm": 0.5703630961706992, + "learning_rate": 9.096648557162767e-07, + "loss": 0.3112, + "step": 3427 + }, + { + "epoch": 0.22, + "grad_norm": 1.0771455845190558, + "learning_rate": 9.096056377038448e-07, + "loss": 0.2474, + "step": 3428 + }, + { + "epoch": 0.22, + "grad_norm": 0.6758815883228315, + "learning_rate": 9.095464022168129e-07, + "loss": 0.2477, + "step": 3429 + }, + { + "epoch": 0.22, + "grad_norm": 0.3800595338507114, + "learning_rate": 9.094871492577081e-07, + "loss": 0.2124, + "step": 3430 + }, + { + "epoch": 0.22, + "grad_norm": 0.6086154417384398, + "learning_rate": 9.094278788290586e-07, + "loss": 0.142, + "step": 3431 + }, + { + "epoch": 0.22, + "grad_norm": 0.8861271522221894, + "learning_rate": 9.093685909333925e-07, + "loss": 0.081, + "step": 3432 + }, + { + "epoch": 0.22, + "grad_norm": 0.5888344856273661, + "learning_rate": 9.093092855732395e-07, + "loss": 0.5064, + "step": 3433 + }, + { + "epoch": 0.22, + "grad_norm": 0.5248938488932281, + "learning_rate": 9.092499627511295e-07, + "loss": 0.0358, + "step": 3434 + }, + { + "epoch": 0.22, + "grad_norm": 0.6760893053749472, + "learning_rate": 9.091906224695935e-07, + "loss": 0.1522, + "step": 3435 + }, + { + "epoch": 0.22, + "grad_norm": 0.5565931103970779, + "learning_rate": 9.091312647311629e-07, + "loss": 0.2553, + "step": 3436 + }, + { + "epoch": 0.22, + "grad_norm": 0.43821355945024576, + "learning_rate": 9.0907188953837e-07, + "loss": 0.115, + "step": 3437 + }, + { + "epoch": 0.22, + "grad_norm": 2.0487219325921577, + "learning_rate": 9.09012496893748e-07, + "loss": 0.2093, + "step": 3438 + }, + { + "epoch": 0.22, + "grad_norm": 0.710696025931875, + "learning_rate": 9.089530867998307e-07, + "loss": 0.1013, + "step": 3439 + }, + { + "epoch": 0.22, + "grad_norm": 3.76015426839559, + "learning_rate": 9.088936592591528e-07, + "loss": 0.2028, + "step": 3440 + }, + { + "epoch": 0.22, + "grad_norm": 0.6190455024758854, + "learning_rate": 9.088342142742492e-07, + "loss": 0.3572, + "step": 3441 + }, + { + "epoch": 0.22, + "grad_norm": 1.0968735196658128, + "learning_rate": 9.087747518476561e-07, + "loss": 0.0332, + "step": 3442 + }, + { + "epoch": 0.22, + "grad_norm": 0.5591905620825228, + "learning_rate": 9.087152719819104e-07, + "loss": 0.2026, + "step": 3443 + }, + { + "epoch": 0.22, + "grad_norm": 0.8116329837785369, + "learning_rate": 9.086557746795497e-07, + "loss": 0.2132, + "step": 3444 + }, + { + "epoch": 0.22, + "grad_norm": 0.5818075691140604, + "learning_rate": 9.085962599431121e-07, + "loss": 0.2607, + "step": 3445 + }, + { + "epoch": 0.22, + "grad_norm": 0.7708689610124019, + "learning_rate": 9.085367277751366e-07, + "loss": 0.29, + "step": 3446 + }, + { + "epoch": 0.22, + "grad_norm": 0.8386997305596612, + "learning_rate": 9.084771781781631e-07, + "loss": 0.0975, + "step": 3447 + }, + { + "epoch": 0.22, + "grad_norm": 0.2847712370763844, + "learning_rate": 9.08417611154732e-07, + "loss": 0.0138, + "step": 3448 + }, + { + "epoch": 0.22, + "grad_norm": 0.5567018486648099, + "learning_rate": 9.083580267073846e-07, + "loss": 0.2348, + "step": 3449 + }, + { + "epoch": 0.22, + "grad_norm": 0.14978507688318166, + "learning_rate": 9.082984248386629e-07, + "loss": 0.0757, + "step": 3450 + }, + { + "epoch": 0.22, + "grad_norm": 0.8480235839216598, + "learning_rate": 9.082388055511096e-07, + "loss": 0.1654, + "step": 3451 + }, + { + "epoch": 0.22, + "grad_norm": 0.758476965822796, + "learning_rate": 9.081791688472682e-07, + "loss": 0.0173, + "step": 3452 + }, + { + "epoch": 0.22, + "grad_norm": 0.28901291714694527, + "learning_rate": 9.081195147296829e-07, + "loss": 0.1958, + "step": 3453 + }, + { + "epoch": 0.22, + "grad_norm": 1.5632435502419064, + "learning_rate": 9.080598432008986e-07, + "loss": 0.1931, + "step": 3454 + }, + { + "epoch": 0.22, + "grad_norm": 0.20853929637542068, + "learning_rate": 9.080001542634612e-07, + "loss": 0.0078, + "step": 3455 + }, + { + "epoch": 0.22, + "grad_norm": 0.8147606927698112, + "learning_rate": 9.079404479199168e-07, + "loss": 0.2416, + "step": 3456 + }, + { + "epoch": 0.22, + "grad_norm": 0.5896403903375568, + "learning_rate": 9.078807241728131e-07, + "loss": 0.0463, + "step": 3457 + }, + { + "epoch": 0.22, + "grad_norm": 0.6193532850598485, + "learning_rate": 9.078209830246977e-07, + "loss": 0.3053, + "step": 3458 + }, + { + "epoch": 0.22, + "grad_norm": 0.3673911368542017, + "learning_rate": 9.077612244781195e-07, + "loss": 0.095, + "step": 3459 + }, + { + "epoch": 0.22, + "grad_norm": 0.4741756122471617, + "learning_rate": 9.077014485356274e-07, + "loss": 0.2309, + "step": 3460 + }, + { + "epoch": 0.22, + "grad_norm": 0.6886654317580722, + "learning_rate": 9.076416551997721e-07, + "loss": 0.1479, + "step": 3461 + }, + { + "epoch": 0.22, + "grad_norm": 0.3884602630498089, + "learning_rate": 9.075818444731044e-07, + "loss": 0.0143, + "step": 3462 + }, + { + "epoch": 0.22, + "grad_norm": 0.9800609006038236, + "learning_rate": 9.075220163581758e-07, + "loss": 0.2561, + "step": 3463 + }, + { + "epoch": 0.22, + "grad_norm": 0.38942150556075106, + "learning_rate": 9.074621708575387e-07, + "loss": 0.103, + "step": 3464 + }, + { + "epoch": 0.22, + "grad_norm": 1.3611265364250646, + "learning_rate": 9.074023079737465e-07, + "loss": 0.2226, + "step": 3465 + }, + { + "epoch": 0.22, + "grad_norm": 0.8001913346072398, + "learning_rate": 9.073424277093527e-07, + "loss": 0.4161, + "step": 3466 + }, + { + "epoch": 0.22, + "grad_norm": 0.44584243740872953, + "learning_rate": 9.07282530066912e-07, + "loss": 0.1643, + "step": 3467 + }, + { + "epoch": 0.22, + "grad_norm": 0.44721985428880234, + "learning_rate": 9.0722261504898e-07, + "loss": 0.0276, + "step": 3468 + }, + { + "epoch": 0.22, + "grad_norm": 0.5476807711359044, + "learning_rate": 9.071626826581124e-07, + "loss": 0.3451, + "step": 3469 + }, + { + "epoch": 0.22, + "grad_norm": 0.4448469967113406, + "learning_rate": 9.071027328968665e-07, + "loss": 0.2502, + "step": 3470 + }, + { + "epoch": 0.22, + "grad_norm": 0.9780446987211733, + "learning_rate": 9.070427657677994e-07, + "loss": 0.1259, + "step": 3471 + }, + { + "epoch": 0.22, + "grad_norm": 0.6524682311126996, + "learning_rate": 9.069827812734698e-07, + "loss": 0.1111, + "step": 3472 + }, + { + "epoch": 0.22, + "grad_norm": 0.8261236053892801, + "learning_rate": 9.069227794164366e-07, + "loss": 0.273, + "step": 3473 + }, + { + "epoch": 0.22, + "grad_norm": 0.8456024771978315, + "learning_rate": 9.068627601992598e-07, + "loss": 0.1448, + "step": 3474 + }, + { + "epoch": 0.22, + "grad_norm": 0.4122556009766779, + "learning_rate": 9.068027236244995e-07, + "loss": 0.2236, + "step": 3475 + }, + { + "epoch": 0.22, + "grad_norm": 0.8702978327038665, + "learning_rate": 9.067426696947174e-07, + "loss": 0.3429, + "step": 3476 + }, + { + "epoch": 0.22, + "grad_norm": 1.6658536681051026, + "learning_rate": 9.066825984124751e-07, + "loss": 0.2262, + "step": 3477 + }, + { + "epoch": 0.22, + "grad_norm": 0.8015228154440659, + "learning_rate": 9.066225097803358e-07, + "loss": 0.1998, + "step": 3478 + }, + { + "epoch": 0.22, + "grad_norm": 0.674366712686864, + "learning_rate": 9.065624038008628e-07, + "loss": 0.1484, + "step": 3479 + }, + { + "epoch": 0.22, + "grad_norm": 0.6267862760521721, + "learning_rate": 9.065022804766204e-07, + "loss": 0.107, + "step": 3480 + }, + { + "epoch": 0.22, + "grad_norm": 1.0302001191660715, + "learning_rate": 9.064421398101733e-07, + "loss": 0.3326, + "step": 3481 + }, + { + "epoch": 0.22, + "grad_norm": 0.7957291490691056, + "learning_rate": 9.063819818040878e-07, + "loss": 0.1871, + "step": 3482 + }, + { + "epoch": 0.22, + "grad_norm": 1.1249923368632135, + "learning_rate": 9.063218064609299e-07, + "loss": 0.2669, + "step": 3483 + }, + { + "epoch": 0.22, + "grad_norm": 0.7913429341548515, + "learning_rate": 9.062616137832668e-07, + "loss": 0.4369, + "step": 3484 + }, + { + "epoch": 0.22, + "grad_norm": 0.4232030837204642, + "learning_rate": 9.062014037736667e-07, + "loss": 0.3205, + "step": 3485 + }, + { + "epoch": 0.22, + "grad_norm": 0.5464013784120936, + "learning_rate": 9.061411764346982e-07, + "loss": 0.0086, + "step": 3486 + }, + { + "epoch": 0.22, + "grad_norm": 0.47319299397048015, + "learning_rate": 9.060809317689306e-07, + "loss": 0.2368, + "step": 3487 + }, + { + "epoch": 0.22, + "grad_norm": 1.461021949249322, + "learning_rate": 9.060206697789341e-07, + "loss": 0.1098, + "step": 3488 + }, + { + "epoch": 0.22, + "grad_norm": 0.50489660755318, + "learning_rate": 9.059603904672797e-07, + "loss": 0.1486, + "step": 3489 + }, + { + "epoch": 0.22, + "grad_norm": 0.7062686015839136, + "learning_rate": 9.059000938365388e-07, + "loss": 0.378, + "step": 3490 + }, + { + "epoch": 0.22, + "grad_norm": 0.2318923336010885, + "learning_rate": 9.058397798892841e-07, + "loss": 0.0119, + "step": 3491 + }, + { + "epoch": 0.22, + "grad_norm": 6.119039946903587, + "learning_rate": 9.057794486280885e-07, + "loss": 0.1205, + "step": 3492 + }, + { + "epoch": 0.22, + "grad_norm": 1.1552504432905428, + "learning_rate": 9.057191000555259e-07, + "loss": 0.2216, + "step": 3493 + }, + { + "epoch": 0.22, + "grad_norm": 0.18240345717577552, + "learning_rate": 9.056587341741708e-07, + "loss": 0.0044, + "step": 3494 + }, + { + "epoch": 0.22, + "grad_norm": 1.920239819563848, + "learning_rate": 9.055983509865988e-07, + "loss": 0.1902, + "step": 3495 + }, + { + "epoch": 0.22, + "grad_norm": 0.5578636167243833, + "learning_rate": 9.055379504953857e-07, + "loss": 0.2409, + "step": 3496 + }, + { + "epoch": 0.22, + "grad_norm": 0.32673939571085087, + "learning_rate": 9.054775327031084e-07, + "loss": 0.1857, + "step": 3497 + }, + { + "epoch": 0.22, + "grad_norm": 0.3054597557355878, + "learning_rate": 9.054170976123444e-07, + "loss": 0.074, + "step": 3498 + }, + { + "epoch": 0.22, + "grad_norm": 0.47660839361366525, + "learning_rate": 9.05356645225672e-07, + "loss": 0.1878, + "step": 3499 + }, + { + "epoch": 0.22, + "grad_norm": 0.8488780011047519, + "learning_rate": 9.052961755456704e-07, + "loss": 0.2275, + "step": 3500 + }, + { + "epoch": 0.22, + "grad_norm": 0.4759906315349322, + "learning_rate": 9.052356885749191e-07, + "loss": 0.2249, + "step": 3501 + }, + { + "epoch": 0.22, + "grad_norm": 0.18884537303613233, + "learning_rate": 9.051751843159987e-07, + "loss": 0.1303, + "step": 3502 + }, + { + "epoch": 0.22, + "grad_norm": 0.8426818040273766, + "learning_rate": 9.051146627714905e-07, + "loss": 0.188, + "step": 3503 + }, + { + "epoch": 0.22, + "grad_norm": 1.1731863384220949, + "learning_rate": 9.050541239439763e-07, + "loss": 0.2124, + "step": 3504 + }, + { + "epoch": 0.22, + "grad_norm": 2.3368678322200815, + "learning_rate": 9.04993567836039e-07, + "loss": 0.4364, + "step": 3505 + }, + { + "epoch": 0.22, + "grad_norm": 0.621984000338872, + "learning_rate": 9.049329944502619e-07, + "loss": 0.3331, + "step": 3506 + }, + { + "epoch": 0.22, + "grad_norm": 1.0557024485074031, + "learning_rate": 9.048724037892293e-07, + "loss": 0.2421, + "step": 3507 + }, + { + "epoch": 0.22, + "grad_norm": 1.034919908349202, + "learning_rate": 9.04811795855526e-07, + "loss": 0.216, + "step": 3508 + }, + { + "epoch": 0.22, + "grad_norm": 0.5535103811207018, + "learning_rate": 9.047511706517377e-07, + "loss": 0.2032, + "step": 3509 + }, + { + "epoch": 0.22, + "grad_norm": 0.422120351200221, + "learning_rate": 9.046905281804509e-07, + "loss": 0.2738, + "step": 3510 + }, + { + "epoch": 0.22, + "grad_norm": 1.001188790890657, + "learning_rate": 9.046298684442525e-07, + "loss": 0.3171, + "step": 3511 + }, + { + "epoch": 0.22, + "grad_norm": 0.7747187881056723, + "learning_rate": 9.045691914457305e-07, + "loss": 0.0981, + "step": 3512 + }, + { + "epoch": 0.22, + "grad_norm": 1.279504743833741, + "learning_rate": 9.045084971874737e-07, + "loss": 0.0469, + "step": 3513 + }, + { + "epoch": 0.22, + "grad_norm": 0.5824157956468737, + "learning_rate": 9.044477856720712e-07, + "loss": 0.2563, + "step": 3514 + }, + { + "epoch": 0.22, + "grad_norm": 3.0540488650606123, + "learning_rate": 9.04387056902113e-07, + "loss": 0.199, + "step": 3515 + }, + { + "epoch": 0.22, + "grad_norm": 1.0224820944299369, + "learning_rate": 9.043263108801901e-07, + "loss": 0.3534, + "step": 3516 + }, + { + "epoch": 0.22, + "grad_norm": 0.480155624685507, + "learning_rate": 9.04265547608894e-07, + "loss": 0.0166, + "step": 3517 + }, + { + "epoch": 0.22, + "grad_norm": 2.1695620230090724, + "learning_rate": 9.042047670908169e-07, + "loss": 0.1602, + "step": 3518 + }, + { + "epoch": 0.22, + "grad_norm": 0.616620524514955, + "learning_rate": 9.041439693285519e-07, + "loss": 0.1087, + "step": 3519 + }, + { + "epoch": 0.22, + "grad_norm": 1.0588946185105392, + "learning_rate": 9.040831543246928e-07, + "loss": 0.1376, + "step": 3520 + }, + { + "epoch": 0.22, + "grad_norm": 0.43301704960439397, + "learning_rate": 9.040223220818339e-07, + "loss": 0.1515, + "step": 3521 + }, + { + "epoch": 0.22, + "grad_norm": 0.9372290561504759, + "learning_rate": 9.039614726025706e-07, + "loss": 0.0296, + "step": 3522 + }, + { + "epoch": 0.22, + "grad_norm": 0.7290055521646502, + "learning_rate": 9.039006058894988e-07, + "loss": 0.1478, + "step": 3523 + }, + { + "epoch": 0.22, + "grad_norm": 0.6009524441824383, + "learning_rate": 9.038397219452154e-07, + "loss": 0.0492, + "step": 3524 + }, + { + "epoch": 0.22, + "grad_norm": 0.3510990732077877, + "learning_rate": 9.037788207723174e-07, + "loss": 0.1141, + "step": 3525 + }, + { + "epoch": 0.22, + "grad_norm": 0.1877054520282975, + "learning_rate": 9.037179023734034e-07, + "loss": 0.0938, + "step": 3526 + }, + { + "epoch": 0.22, + "grad_norm": 0.38774478221254344, + "learning_rate": 9.036569667510719e-07, + "loss": 0.2048, + "step": 3527 + }, + { + "epoch": 0.22, + "grad_norm": 0.2650309446210396, + "learning_rate": 9.035960139079229e-07, + "loss": 0.1019, + "step": 3528 + }, + { + "epoch": 0.23, + "grad_norm": 1.4097990888608176, + "learning_rate": 9.035350438465566e-07, + "loss": 0.0862, + "step": 3529 + }, + { + "epoch": 0.23, + "grad_norm": 0.4170593866752147, + "learning_rate": 9.034740565695741e-07, + "loss": 0.0864, + "step": 3530 + }, + { + "epoch": 0.23, + "grad_norm": 0.49179527875071183, + "learning_rate": 9.034130520795773e-07, + "loss": 0.0727, + "step": 3531 + }, + { + "epoch": 0.23, + "grad_norm": 0.4844474975087014, + "learning_rate": 9.033520303791686e-07, + "loss": 0.1681, + "step": 3532 + }, + { + "epoch": 0.23, + "grad_norm": 1.7568547366344485, + "learning_rate": 9.032909914709516e-07, + "loss": 0.065, + "step": 3533 + }, + { + "epoch": 0.23, + "grad_norm": 0.439624171337634, + "learning_rate": 9.032299353575301e-07, + "loss": 0.1562, + "step": 3534 + }, + { + "epoch": 0.23, + "grad_norm": 0.40402549015647976, + "learning_rate": 9.03168862041509e-07, + "loss": 0.108, + "step": 3535 + }, + { + "epoch": 0.23, + "grad_norm": 0.7998233890169933, + "learning_rate": 9.031077715254937e-07, + "loss": 0.2988, + "step": 3536 + }, + { + "epoch": 0.23, + "grad_norm": 0.3299364680259563, + "learning_rate": 9.030466638120907e-07, + "loss": 0.1662, + "step": 3537 + }, + { + "epoch": 0.23, + "grad_norm": 0.33398480048584933, + "learning_rate": 9.029855389039067e-07, + "loss": 0.0746, + "step": 3538 + }, + { + "epoch": 0.23, + "grad_norm": 0.5046925690501163, + "learning_rate": 9.029243968035495e-07, + "loss": 0.1415, + "step": 3539 + }, + { + "epoch": 0.23, + "grad_norm": 1.0899228427771, + "learning_rate": 9.028632375136276e-07, + "loss": 0.2744, + "step": 3540 + }, + { + "epoch": 0.23, + "grad_norm": 1.4820328028191452, + "learning_rate": 9.028020610367499e-07, + "loss": 0.0907, + "step": 3541 + }, + { + "epoch": 0.23, + "grad_norm": 1.6686129981117492, + "learning_rate": 9.027408673755268e-07, + "loss": 0.1155, + "step": 3542 + }, + { + "epoch": 0.23, + "grad_norm": 0.7304402378877142, + "learning_rate": 9.026796565325687e-07, + "loss": 0.0507, + "step": 3543 + }, + { + "epoch": 0.23, + "grad_norm": 0.6368166231070489, + "learning_rate": 9.026184285104867e-07, + "loss": 0.3022, + "step": 3544 + }, + { + "epoch": 0.23, + "grad_norm": 0.6703179137627501, + "learning_rate": 9.025571833118935e-07, + "loss": 0.2356, + "step": 3545 + }, + { + "epoch": 0.23, + "grad_norm": 0.8085542338999495, + "learning_rate": 9.024959209394014e-07, + "loss": 0.2126, + "step": 3546 + }, + { + "epoch": 0.23, + "grad_norm": 2.228714239918408, + "learning_rate": 9.024346413956243e-07, + "loss": 0.2395, + "step": 3547 + }, + { + "epoch": 0.23, + "grad_norm": 1.0296238013379133, + "learning_rate": 9.023733446831762e-07, + "loss": 0.1334, + "step": 3548 + }, + { + "epoch": 0.23, + "grad_norm": 0.8240975558791944, + "learning_rate": 9.023120308046726e-07, + "loss": 0.2793, + "step": 3549 + }, + { + "epoch": 0.23, + "grad_norm": 0.5901663841476886, + "learning_rate": 9.022506997627289e-07, + "loss": 0.1835, + "step": 3550 + }, + { + "epoch": 0.23, + "grad_norm": 0.4576929666443724, + "learning_rate": 9.021893515599618e-07, + "loss": 0.107, + "step": 3551 + }, + { + "epoch": 0.23, + "grad_norm": 0.7028228227971861, + "learning_rate": 9.021279861989884e-07, + "loss": 0.1563, + "step": 3552 + }, + { + "epoch": 0.23, + "grad_norm": 0.49652258624001755, + "learning_rate": 9.020666036824267e-07, + "loss": 0.1915, + "step": 3553 + }, + { + "epoch": 0.23, + "grad_norm": 1.0911481865292731, + "learning_rate": 9.020052040128955e-07, + "loss": 0.2879, + "step": 3554 + }, + { + "epoch": 0.23, + "grad_norm": 0.9767828769964186, + "learning_rate": 9.019437871930143e-07, + "loss": 0.2459, + "step": 3555 + }, + { + "epoch": 0.23, + "grad_norm": 0.6713058897409105, + "learning_rate": 9.018823532254028e-07, + "loss": 0.1628, + "step": 3556 + }, + { + "epoch": 0.23, + "grad_norm": 0.8181149095416154, + "learning_rate": 9.018209021126824e-07, + "loss": 0.1507, + "step": 3557 + }, + { + "epoch": 0.23, + "grad_norm": 1.2652378181965303, + "learning_rate": 9.017594338574745e-07, + "loss": 0.2984, + "step": 3558 + }, + { + "epoch": 0.23, + "grad_norm": 0.5574580406275796, + "learning_rate": 9.016979484624017e-07, + "loss": 0.2797, + "step": 3559 + }, + { + "epoch": 0.23, + "grad_norm": 1.085960464776571, + "learning_rate": 9.016364459300867e-07, + "loss": 0.0731, + "step": 3560 + }, + { + "epoch": 0.23, + "grad_norm": 0.9820920115828107, + "learning_rate": 9.015749262631535e-07, + "loss": 0.1038, + "step": 3561 + }, + { + "epoch": 0.23, + "grad_norm": 0.7786666678562189, + "learning_rate": 9.015133894642268e-07, + "loss": 0.1636, + "step": 3562 + }, + { + "epoch": 0.23, + "grad_norm": 1.8077732798249364, + "learning_rate": 9.014518355359318e-07, + "loss": 0.2149, + "step": 3563 + }, + { + "epoch": 0.23, + "grad_norm": 0.8379422447571268, + "learning_rate": 9.013902644808944e-07, + "loss": 0.0319, + "step": 3564 + }, + { + "epoch": 0.23, + "grad_norm": 0.8020231600160077, + "learning_rate": 9.013286763017414e-07, + "loss": 0.0149, + "step": 3565 + }, + { + "epoch": 0.23, + "grad_norm": 0.7451250982089039, + "learning_rate": 9.012670710011003e-07, + "loss": 0.1621, + "step": 3566 + }, + { + "epoch": 0.23, + "grad_norm": 0.5775504911964549, + "learning_rate": 9.012054485815993e-07, + "loss": 0.0363, + "step": 3567 + }, + { + "epoch": 0.23, + "grad_norm": 0.36901694160006543, + "learning_rate": 9.011438090458674e-07, + "loss": 0.1155, + "step": 3568 + }, + { + "epoch": 0.23, + "grad_norm": 0.4530151377449247, + "learning_rate": 9.010821523965342e-07, + "loss": 0.4444, + "step": 3569 + }, + { + "epoch": 0.23, + "grad_norm": 0.7453481219095509, + "learning_rate": 9.0102047863623e-07, + "loss": 0.401, + "step": 3570 + }, + { + "epoch": 0.23, + "grad_norm": 0.6608854831066162, + "learning_rate": 9.009587877675862e-07, + "loss": 0.0626, + "step": 3571 + }, + { + "epoch": 0.23, + "grad_norm": 1.6146155111136735, + "learning_rate": 9.008970797932343e-07, + "loss": 0.1994, + "step": 3572 + }, + { + "epoch": 0.23, + "grad_norm": 0.6827877979600937, + "learning_rate": 9.008353547158072e-07, + "loss": 0.1298, + "step": 3573 + }, + { + "epoch": 0.23, + "grad_norm": 0.44509956643235676, + "learning_rate": 9.00773612537938e-07, + "loss": 0.0597, + "step": 3574 + }, + { + "epoch": 0.23, + "grad_norm": 0.9247401941630845, + "learning_rate": 9.007118532622608e-07, + "loss": 0.3066, + "step": 3575 + }, + { + "epoch": 0.23, + "grad_norm": 0.3500663477620438, + "learning_rate": 9.006500768914106e-07, + "loss": 0.2817, + "step": 3576 + }, + { + "epoch": 0.23, + "grad_norm": 1.147320025553286, + "learning_rate": 9.005882834280226e-07, + "loss": 0.335, + "step": 3577 + }, + { + "epoch": 0.23, + "grad_norm": 0.5504171426043073, + "learning_rate": 9.005264728747331e-07, + "loss": 0.3613, + "step": 3578 + }, + { + "epoch": 0.23, + "grad_norm": 0.8790360380964415, + "learning_rate": 9.004646452341792e-07, + "loss": 0.0743, + "step": 3579 + }, + { + "epoch": 0.23, + "grad_norm": 1.3644271116628688, + "learning_rate": 9.004028005089985e-07, + "loss": 0.1868, + "step": 3580 + }, + { + "epoch": 0.23, + "grad_norm": 0.34683762899779425, + "learning_rate": 9.003409387018293e-07, + "loss": 0.0939, + "step": 3581 + }, + { + "epoch": 0.23, + "grad_norm": 1.7087190263976482, + "learning_rate": 9.00279059815311e-07, + "loss": 0.1915, + "step": 3582 + }, + { + "epoch": 0.23, + "grad_norm": 0.9889991485407398, + "learning_rate": 9.002171638520833e-07, + "loss": 0.2909, + "step": 3583 + }, + { + "epoch": 0.23, + "grad_norm": 0.6306306038822274, + "learning_rate": 9.001552508147868e-07, + "loss": 0.3875, + "step": 3584 + }, + { + "epoch": 0.23, + "grad_norm": 0.8414396902575082, + "learning_rate": 9.000933207060629e-07, + "loss": 0.2494, + "step": 3585 + }, + { + "epoch": 0.23, + "grad_norm": 0.3442777423239455, + "learning_rate": 9.000313735285537e-07, + "loss": 0.241, + "step": 3586 + }, + { + "epoch": 0.23, + "grad_norm": 0.7755224462971616, + "learning_rate": 8.999694092849019e-07, + "loss": 0.0217, + "step": 3587 + }, + { + "epoch": 0.23, + "grad_norm": 0.7497411582538355, + "learning_rate": 8.999074279777511e-07, + "loss": 0.269, + "step": 3588 + }, + { + "epoch": 0.23, + "grad_norm": 0.9658463537603914, + "learning_rate": 8.998454296097456e-07, + "loss": 0.216, + "step": 3589 + }, + { + "epoch": 0.23, + "grad_norm": 1.156068956355357, + "learning_rate": 8.997834141835301e-07, + "loss": 0.0136, + "step": 3590 + }, + { + "epoch": 0.23, + "grad_norm": 1.0337821522188728, + "learning_rate": 8.997213817017506e-07, + "loss": 0.1946, + "step": 3591 + }, + { + "epoch": 0.23, + "grad_norm": 0.5773588168066646, + "learning_rate": 8.996593321670533e-07, + "loss": 0.203, + "step": 3592 + }, + { + "epoch": 0.23, + "grad_norm": 0.5244756314694453, + "learning_rate": 8.995972655820856e-07, + "loss": 0.2456, + "step": 3593 + }, + { + "epoch": 0.23, + "grad_norm": 0.4893468963190216, + "learning_rate": 8.995351819494952e-07, + "loss": 0.0508, + "step": 3594 + }, + { + "epoch": 0.23, + "grad_norm": 0.8466957374097075, + "learning_rate": 8.994730812719307e-07, + "loss": 0.1263, + "step": 3595 + }, + { + "epoch": 0.23, + "grad_norm": 0.7415694023113186, + "learning_rate": 8.994109635520416e-07, + "loss": 0.2457, + "step": 3596 + }, + { + "epoch": 0.23, + "grad_norm": 0.7099841326077181, + "learning_rate": 8.99348828792478e-07, + "loss": 0.0197, + "step": 3597 + }, + { + "epoch": 0.23, + "grad_norm": 0.38683694772595284, + "learning_rate": 8.992866769958904e-07, + "loss": 0.1075, + "step": 3598 + }, + { + "epoch": 0.23, + "grad_norm": 0.17439063080097897, + "learning_rate": 8.992245081649304e-07, + "loss": 0.0243, + "step": 3599 + }, + { + "epoch": 0.23, + "grad_norm": 2.970067514021893, + "learning_rate": 8.991623223022505e-07, + "loss": 0.1047, + "step": 3600 + }, + { + "epoch": 0.23, + "grad_norm": 1.4837965208476653, + "learning_rate": 8.991001194105034e-07, + "loss": 0.0782, + "step": 3601 + }, + { + "epoch": 0.23, + "grad_norm": 0.9435958016207453, + "learning_rate": 8.990378994923431e-07, + "loss": 0.2051, + "step": 3602 + }, + { + "epoch": 0.23, + "grad_norm": 1.2429304708913325, + "learning_rate": 8.989756625504237e-07, + "loss": 0.1081, + "step": 3603 + }, + { + "epoch": 0.23, + "grad_norm": 1.055464231568157, + "learning_rate": 8.989134085874006e-07, + "loss": 0.2532, + "step": 3604 + }, + { + "epoch": 0.23, + "grad_norm": 2.569220773678353, + "learning_rate": 8.988511376059295e-07, + "loss": 0.1961, + "step": 3605 + }, + { + "epoch": 0.23, + "grad_norm": 0.24562876287549157, + "learning_rate": 8.98788849608667e-07, + "loss": 0.0949, + "step": 3606 + }, + { + "epoch": 0.23, + "grad_norm": 0.5056491008363143, + "learning_rate": 8.987265445982706e-07, + "loss": 0.1763, + "step": 3607 + }, + { + "epoch": 0.23, + "grad_norm": 0.9208834173216598, + "learning_rate": 8.986642225773984e-07, + "loss": 0.3181, + "step": 3608 + }, + { + "epoch": 0.23, + "grad_norm": 0.5927033106464928, + "learning_rate": 8.986018835487088e-07, + "loss": 0.1896, + "step": 3609 + }, + { + "epoch": 0.23, + "grad_norm": 0.8667824774310198, + "learning_rate": 8.985395275148618e-07, + "loss": 0.1819, + "step": 3610 + }, + { + "epoch": 0.23, + "grad_norm": 0.5911908766796049, + "learning_rate": 8.984771544785172e-07, + "loss": 0.4034, + "step": 3611 + }, + { + "epoch": 0.23, + "grad_norm": 0.730668819028163, + "learning_rate": 8.984147644423361e-07, + "loss": 0.1424, + "step": 3612 + }, + { + "epoch": 0.23, + "grad_norm": 0.5961925230821833, + "learning_rate": 8.983523574089805e-07, + "loss": 0.3239, + "step": 3613 + }, + { + "epoch": 0.23, + "grad_norm": 0.3782501076675327, + "learning_rate": 8.982899333811123e-07, + "loss": 0.2761, + "step": 3614 + }, + { + "epoch": 0.23, + "grad_norm": 1.225164008271313, + "learning_rate": 8.98227492361395e-07, + "loss": 0.1838, + "step": 3615 + }, + { + "epoch": 0.23, + "grad_norm": 0.6473365017567108, + "learning_rate": 8.981650343524923e-07, + "loss": 0.0759, + "step": 3616 + }, + { + "epoch": 0.23, + "grad_norm": 1.7565623573857385, + "learning_rate": 8.981025593570689e-07, + "loss": 0.2515, + "step": 3617 + }, + { + "epoch": 0.23, + "grad_norm": 0.4702202017001563, + "learning_rate": 8.980400673777899e-07, + "loss": 0.0947, + "step": 3618 + }, + { + "epoch": 0.23, + "grad_norm": 0.9656628407236097, + "learning_rate": 8.979775584173215e-07, + "loss": 0.2115, + "step": 3619 + }, + { + "epoch": 0.23, + "grad_norm": 0.860858477890622, + "learning_rate": 8.979150324783304e-07, + "loss": 0.0973, + "step": 3620 + }, + { + "epoch": 0.23, + "grad_norm": 1.0181446681122626, + "learning_rate": 8.978524895634842e-07, + "loss": 0.1194, + "step": 3621 + }, + { + "epoch": 0.23, + "grad_norm": 2.53271215351201, + "learning_rate": 8.97789929675451e-07, + "loss": 0.4167, + "step": 3622 + }, + { + "epoch": 0.23, + "grad_norm": 0.6576937400855337, + "learning_rate": 8.977273528168995e-07, + "loss": 0.2884, + "step": 3623 + }, + { + "epoch": 0.23, + "grad_norm": 0.4513117586626686, + "learning_rate": 8.976647589905e-07, + "loss": 0.1806, + "step": 3624 + }, + { + "epoch": 0.23, + "grad_norm": 0.8760306204582416, + "learning_rate": 8.976021481989222e-07, + "loss": 0.2436, + "step": 3625 + }, + { + "epoch": 0.23, + "grad_norm": 0.5021420846358876, + "learning_rate": 8.975395204448375e-07, + "loss": 0.2186, + "step": 3626 + }, + { + "epoch": 0.23, + "grad_norm": 0.6725821463708842, + "learning_rate": 8.974768757309178e-07, + "loss": 0.234, + "step": 3627 + }, + { + "epoch": 0.23, + "grad_norm": 0.5985002495509291, + "learning_rate": 8.974142140598355e-07, + "loss": 0.2526, + "step": 3628 + }, + { + "epoch": 0.23, + "grad_norm": 0.07129142280051656, + "learning_rate": 8.97351535434264e-07, + "loss": 0.003, + "step": 3629 + }, + { + "epoch": 0.23, + "grad_norm": 0.7747541248706957, + "learning_rate": 8.972888398568771e-07, + "loss": 0.0597, + "step": 3630 + }, + { + "epoch": 0.23, + "grad_norm": 0.42375668303332786, + "learning_rate": 8.972261273303496e-07, + "loss": 0.1236, + "step": 3631 + }, + { + "epoch": 0.23, + "grad_norm": 0.20468387464282548, + "learning_rate": 8.971633978573572e-07, + "loss": 0.151, + "step": 3632 + }, + { + "epoch": 0.23, + "grad_norm": 0.650357402418647, + "learning_rate": 8.971006514405757e-07, + "loss": 0.1957, + "step": 3633 + }, + { + "epoch": 0.23, + "grad_norm": 0.5187919697079516, + "learning_rate": 8.970378880826821e-07, + "loss": 0.1054, + "step": 3634 + }, + { + "epoch": 0.23, + "grad_norm": 1.0042025818428884, + "learning_rate": 8.969751077863541e-07, + "loss": 0.2455, + "step": 3635 + }, + { + "epoch": 0.23, + "grad_norm": 3.1524725878424387, + "learning_rate": 8.969123105542701e-07, + "loss": 0.1395, + "step": 3636 + }, + { + "epoch": 0.23, + "grad_norm": 0.5317960185564282, + "learning_rate": 8.968494963891088e-07, + "loss": 0.1611, + "step": 3637 + }, + { + "epoch": 0.23, + "grad_norm": 0.2729045166459671, + "learning_rate": 8.967866652935505e-07, + "loss": 0.19, + "step": 3638 + }, + { + "epoch": 0.23, + "grad_norm": 1.7719217778106593, + "learning_rate": 8.967238172702752e-07, + "loss": 0.2054, + "step": 3639 + }, + { + "epoch": 0.23, + "grad_norm": 1.0165043542325558, + "learning_rate": 8.966609523219644e-07, + "loss": 0.1063, + "step": 3640 + }, + { + "epoch": 0.23, + "grad_norm": 0.9964120352545338, + "learning_rate": 8.965980704513001e-07, + "loss": 0.4258, + "step": 3641 + }, + { + "epoch": 0.23, + "grad_norm": 0.5789623426836484, + "learning_rate": 8.965351716609646e-07, + "loss": 0.2288, + "step": 3642 + }, + { + "epoch": 0.23, + "grad_norm": 0.4845448513474563, + "learning_rate": 8.964722559536417e-07, + "loss": 0.1012, + "step": 3643 + }, + { + "epoch": 0.23, + "grad_norm": 0.6260612248673366, + "learning_rate": 8.964093233320154e-07, + "loss": 0.1403, + "step": 3644 + }, + { + "epoch": 0.23, + "grad_norm": 0.7093553891900518, + "learning_rate": 8.963463737987705e-07, + "loss": 0.1666, + "step": 3645 + }, + { + "epoch": 0.23, + "grad_norm": 0.9548331172676473, + "learning_rate": 8.962834073565923e-07, + "loss": 0.1197, + "step": 3646 + }, + { + "epoch": 0.23, + "grad_norm": 0.6320170371849005, + "learning_rate": 8.962204240081675e-07, + "loss": 0.2755, + "step": 3647 + }, + { + "epoch": 0.23, + "grad_norm": 0.8136771418377878, + "learning_rate": 8.961574237561829e-07, + "loss": 0.178, + "step": 3648 + }, + { + "epoch": 0.23, + "grad_norm": 0.6243822576557541, + "learning_rate": 8.960944066033262e-07, + "loss": 0.336, + "step": 3649 + }, + { + "epoch": 0.23, + "grad_norm": 0.5325262107975081, + "learning_rate": 8.960313725522859e-07, + "loss": 0.131, + "step": 3650 + }, + { + "epoch": 0.23, + "grad_norm": 0.18395353413330576, + "learning_rate": 8.959683216057511e-07, + "loss": 0.1292, + "step": 3651 + }, + { + "epoch": 0.23, + "grad_norm": 0.9914847779468045, + "learning_rate": 8.959052537664117e-07, + "loss": 0.1831, + "step": 3652 + }, + { + "epoch": 0.23, + "grad_norm": 0.38738104396809164, + "learning_rate": 8.958421690369583e-07, + "loss": 0.1754, + "step": 3653 + }, + { + "epoch": 0.23, + "grad_norm": 0.836668169740809, + "learning_rate": 8.957790674200822e-07, + "loss": 0.2014, + "step": 3654 + }, + { + "epoch": 0.23, + "grad_norm": 0.6888123401638223, + "learning_rate": 8.957159489184756e-07, + "loss": 0.2075, + "step": 3655 + }, + { + "epoch": 0.23, + "grad_norm": 0.40301539098331646, + "learning_rate": 8.956528135348309e-07, + "loss": 0.049, + "step": 3656 + }, + { + "epoch": 0.23, + "grad_norm": 0.1883763894185228, + "learning_rate": 8.955896612718419e-07, + "loss": 0.0882, + "step": 3657 + }, + { + "epoch": 0.23, + "grad_norm": 0.41368478323780855, + "learning_rate": 8.955264921322028e-07, + "loss": 0.273, + "step": 3658 + }, + { + "epoch": 0.23, + "grad_norm": 0.8492349494450537, + "learning_rate": 8.954633061186085e-07, + "loss": 0.1135, + "step": 3659 + }, + { + "epoch": 0.23, + "grad_norm": 0.9084133800749307, + "learning_rate": 8.954001032337544e-07, + "loss": 0.237, + "step": 3660 + }, + { + "epoch": 0.23, + "grad_norm": 0.9855625340864884, + "learning_rate": 8.953368834803371e-07, + "loss": 0.2454, + "step": 3661 + }, + { + "epoch": 0.23, + "grad_norm": 0.5264867254320804, + "learning_rate": 8.952736468610537e-07, + "loss": 0.1048, + "step": 3662 + }, + { + "epoch": 0.23, + "grad_norm": 0.3548027213828942, + "learning_rate": 8.952103933786018e-07, + "loss": 0.1395, + "step": 3663 + }, + { + "epoch": 0.23, + "grad_norm": 0.20752820812496128, + "learning_rate": 8.951471230356802e-07, + "loss": 0.0762, + "step": 3664 + }, + { + "epoch": 0.23, + "grad_norm": 0.2104146494183132, + "learning_rate": 8.950838358349879e-07, + "loss": 0.0713, + "step": 3665 + }, + { + "epoch": 0.23, + "grad_norm": 0.5097759774422576, + "learning_rate": 8.950205317792248e-07, + "loss": 0.149, + "step": 3666 + }, + { + "epoch": 0.23, + "grad_norm": 0.8002322322693497, + "learning_rate": 8.949572108710919e-07, + "loss": 0.1094, + "step": 3667 + }, + { + "epoch": 0.23, + "grad_norm": 0.7567734818145712, + "learning_rate": 8.948938731132905e-07, + "loss": 0.1855, + "step": 3668 + }, + { + "epoch": 0.23, + "grad_norm": 0.5485931173642535, + "learning_rate": 8.948305185085224e-07, + "loss": 0.1999, + "step": 3669 + }, + { + "epoch": 0.23, + "grad_norm": 0.7758814921664122, + "learning_rate": 8.947671470594909e-07, + "loss": 0.253, + "step": 3670 + }, + { + "epoch": 0.23, + "grad_norm": 1.548740515899794, + "learning_rate": 8.947037587688991e-07, + "loss": 0.1348, + "step": 3671 + }, + { + "epoch": 0.23, + "grad_norm": 0.8562468637051345, + "learning_rate": 8.946403536394517e-07, + "loss": 0.2659, + "step": 3672 + }, + { + "epoch": 0.23, + "grad_norm": 0.5972999892965324, + "learning_rate": 8.945769316738534e-07, + "loss": 0.3071, + "step": 3673 + }, + { + "epoch": 0.23, + "grad_norm": 0.7637563834327782, + "learning_rate": 8.945134928748099e-07, + "loss": 0.2423, + "step": 3674 + }, + { + "epoch": 0.23, + "grad_norm": 1.724880431718904, + "learning_rate": 8.944500372450279e-07, + "loss": 0.0644, + "step": 3675 + }, + { + "epoch": 0.23, + "grad_norm": 0.29369274739420687, + "learning_rate": 8.943865647872142e-07, + "loss": 0.0833, + "step": 3676 + }, + { + "epoch": 0.23, + "grad_norm": 0.5174022107305446, + "learning_rate": 8.943230755040769e-07, + "loss": 0.0989, + "step": 3677 + }, + { + "epoch": 0.23, + "grad_norm": 0.3386383112516127, + "learning_rate": 8.942595693983246e-07, + "loss": 0.1398, + "step": 3678 + }, + { + "epoch": 0.23, + "grad_norm": 1.2894587813608773, + "learning_rate": 8.941960464726664e-07, + "loss": 0.2248, + "step": 3679 + }, + { + "epoch": 0.23, + "grad_norm": 0.9263771756047618, + "learning_rate": 8.941325067298125e-07, + "loss": 0.1979, + "step": 3680 + }, + { + "epoch": 0.23, + "grad_norm": 0.8364227497770703, + "learning_rate": 8.940689501724736e-07, + "loss": 0.2238, + "step": 3681 + }, + { + "epoch": 0.23, + "grad_norm": 0.6979653931635937, + "learning_rate": 8.940053768033608e-07, + "loss": 0.1571, + "step": 3682 + }, + { + "epoch": 0.23, + "grad_norm": 0.11305024212531636, + "learning_rate": 8.93941786625187e-07, + "loss": 0.0059, + "step": 3683 + }, + { + "epoch": 0.23, + "grad_norm": 0.5546405488659987, + "learning_rate": 8.938781796406645e-07, + "loss": 0.4754, + "step": 3684 + }, + { + "epoch": 0.23, + "grad_norm": 0.8013374007740004, + "learning_rate": 8.93814555852507e-07, + "loss": 0.223, + "step": 3685 + }, + { + "epoch": 0.24, + "grad_norm": 0.36380398795993935, + "learning_rate": 8.937509152634288e-07, + "loss": 0.1786, + "step": 3686 + }, + { + "epoch": 0.24, + "grad_norm": 0.30294934011358565, + "learning_rate": 8.936872578761452e-07, + "loss": 0.0062, + "step": 3687 + }, + { + "epoch": 0.24, + "grad_norm": 1.113872932046635, + "learning_rate": 8.936235836933716e-07, + "loss": 0.1251, + "step": 3688 + }, + { + "epoch": 0.24, + "grad_norm": 0.927337177241909, + "learning_rate": 8.935598927178247e-07, + "loss": 0.198, + "step": 3689 + }, + { + "epoch": 0.24, + "grad_norm": 0.9345968878431729, + "learning_rate": 8.934961849522218e-07, + "loss": 0.1778, + "step": 3690 + }, + { + "epoch": 0.24, + "grad_norm": 0.7948996619172269, + "learning_rate": 8.934324603992803e-07, + "loss": 0.0913, + "step": 3691 + }, + { + "epoch": 0.24, + "grad_norm": 0.8982354655223829, + "learning_rate": 8.933687190617194e-07, + "loss": 0.1007, + "step": 3692 + }, + { + "epoch": 0.24, + "grad_norm": 0.72415753707952, + "learning_rate": 8.933049609422581e-07, + "loss": 0.3315, + "step": 3693 + }, + { + "epoch": 0.24, + "grad_norm": 0.506559311550516, + "learning_rate": 8.932411860436165e-07, + "loss": 0.1811, + "step": 3694 + }, + { + "epoch": 0.24, + "grad_norm": 0.5399864366168942, + "learning_rate": 8.931773943685155e-07, + "loss": 0.0679, + "step": 3695 + }, + { + "epoch": 0.24, + "grad_norm": 0.5290569932581717, + "learning_rate": 8.931135859196762e-07, + "loss": 0.2001, + "step": 3696 + }, + { + "epoch": 0.24, + "grad_norm": 0.683327435889844, + "learning_rate": 8.930497606998213e-07, + "loss": 0.2632, + "step": 3697 + }, + { + "epoch": 0.24, + "grad_norm": 0.5840598372568698, + "learning_rate": 8.929859187116734e-07, + "loss": 0.1184, + "step": 3698 + }, + { + "epoch": 0.24, + "grad_norm": 0.5773771090751436, + "learning_rate": 8.929220599579562e-07, + "loss": 0.1974, + "step": 3699 + }, + { + "epoch": 0.24, + "grad_norm": 2.7974491427480053, + "learning_rate": 8.92858184441394e-07, + "loss": 0.1526, + "step": 3700 + }, + { + "epoch": 0.24, + "grad_norm": 0.87679290499142, + "learning_rate": 8.92794292164712e-07, + "loss": 0.1365, + "step": 3701 + }, + { + "epoch": 0.24, + "grad_norm": 0.44484177771603206, + "learning_rate": 8.927303831306358e-07, + "loss": 0.0774, + "step": 3702 + }, + { + "epoch": 0.24, + "grad_norm": 0.819815447823791, + "learning_rate": 8.926664573418922e-07, + "loss": 0.177, + "step": 3703 + }, + { + "epoch": 0.24, + "grad_norm": 0.9231950718037863, + "learning_rate": 8.92602514801208e-07, + "loss": 0.2868, + "step": 3704 + }, + { + "epoch": 0.24, + "grad_norm": 0.20792229694152603, + "learning_rate": 8.925385555113111e-07, + "loss": 0.0374, + "step": 3705 + }, + { + "epoch": 0.24, + "grad_norm": 0.6290562676820869, + "learning_rate": 8.924745794749307e-07, + "loss": 0.1801, + "step": 3706 + }, + { + "epoch": 0.24, + "grad_norm": 1.8703829044456655, + "learning_rate": 8.924105866947955e-07, + "loss": 0.1889, + "step": 3707 + }, + { + "epoch": 0.24, + "grad_norm": 0.38764591570315005, + "learning_rate": 8.923465771736359e-07, + "loss": 0.3859, + "step": 3708 + }, + { + "epoch": 0.24, + "grad_norm": 0.7098724276435888, + "learning_rate": 8.922825509141827e-07, + "loss": 0.2385, + "step": 3709 + }, + { + "epoch": 0.24, + "grad_norm": 0.38401416793688486, + "learning_rate": 8.922185079191671e-07, + "loss": 0.0083, + "step": 3710 + }, + { + "epoch": 0.24, + "grad_norm": 1.0987334480298931, + "learning_rate": 8.921544481913217e-07, + "loss": 0.3234, + "step": 3711 + }, + { + "epoch": 0.24, + "grad_norm": 0.5960848589499238, + "learning_rate": 8.920903717333789e-07, + "loss": 0.1536, + "step": 3712 + }, + { + "epoch": 0.24, + "grad_norm": 0.7386602970647814, + "learning_rate": 8.92026278548073e-07, + "loss": 0.2815, + "step": 3713 + }, + { + "epoch": 0.24, + "grad_norm": 0.31023055252201687, + "learning_rate": 8.919621686381378e-07, + "loss": 0.0413, + "step": 3714 + }, + { + "epoch": 0.24, + "grad_norm": 1.0399407344066338, + "learning_rate": 8.918980420063086e-07, + "loss": 0.1246, + "step": 3715 + }, + { + "epoch": 0.24, + "grad_norm": 0.2737335065949902, + "learning_rate": 8.918338986553211e-07, + "loss": 0.0326, + "step": 3716 + }, + { + "epoch": 0.24, + "grad_norm": 1.296067112610091, + "learning_rate": 8.917697385879117e-07, + "loss": 0.3811, + "step": 3717 + }, + { + "epoch": 0.24, + "grad_norm": 0.794927169010393, + "learning_rate": 8.917055618068178e-07, + "loss": 0.2224, + "step": 3718 + }, + { + "epoch": 0.24, + "grad_norm": 0.4387865933496512, + "learning_rate": 8.916413683147772e-07, + "loss": 0.1924, + "step": 3719 + }, + { + "epoch": 0.24, + "grad_norm": 0.5285970623958716, + "learning_rate": 8.915771581145285e-07, + "loss": 0.1733, + "step": 3720 + }, + { + "epoch": 0.24, + "grad_norm": 0.6351532285632555, + "learning_rate": 8.915129312088112e-07, + "loss": 0.0195, + "step": 3721 + }, + { + "epoch": 0.24, + "grad_norm": 0.42340552146390065, + "learning_rate": 8.914486876003649e-07, + "loss": 0.1661, + "step": 3722 + }, + { + "epoch": 0.24, + "grad_norm": 0.8304222957107547, + "learning_rate": 8.913844272919309e-07, + "loss": 0.3068, + "step": 3723 + }, + { + "epoch": 0.24, + "grad_norm": 0.5835541018828425, + "learning_rate": 8.913201502862504e-07, + "loss": 0.1241, + "step": 3724 + }, + { + "epoch": 0.24, + "grad_norm": 1.0814322182376175, + "learning_rate": 8.912558565860657e-07, + "loss": 0.1055, + "step": 3725 + }, + { + "epoch": 0.24, + "grad_norm": 0.5133578413313381, + "learning_rate": 8.911915461941196e-07, + "loss": 0.2432, + "step": 3726 + }, + { + "epoch": 0.24, + "grad_norm": 0.7605543377735867, + "learning_rate": 8.911272191131559e-07, + "loss": 0.016, + "step": 3727 + }, + { + "epoch": 0.24, + "grad_norm": 0.6770768411894138, + "learning_rate": 8.910628753459184e-07, + "loss": 0.1394, + "step": 3728 + }, + { + "epoch": 0.24, + "grad_norm": 0.40152719956764704, + "learning_rate": 8.909985148951528e-07, + "loss": 0.0609, + "step": 3729 + }, + { + "epoch": 0.24, + "grad_norm": 0.7574561410672707, + "learning_rate": 8.909341377636044e-07, + "loss": 0.0764, + "step": 3730 + }, + { + "epoch": 0.24, + "grad_norm": 0.7746749103878682, + "learning_rate": 8.908697439540198e-07, + "loss": 0.0302, + "step": 3731 + }, + { + "epoch": 0.24, + "grad_norm": 0.6624940135187026, + "learning_rate": 8.908053334691463e-07, + "loss": 0.0921, + "step": 3732 + }, + { + "epoch": 0.24, + "grad_norm": 0.328521198911137, + "learning_rate": 8.907409063117317e-07, + "loss": 0.2881, + "step": 3733 + }, + { + "epoch": 0.24, + "grad_norm": 0.9408310520518564, + "learning_rate": 8.906764624845244e-07, + "loss": 0.0253, + "step": 3734 + }, + { + "epoch": 0.24, + "grad_norm": 1.829655235082486, + "learning_rate": 8.906120019902739e-07, + "loss": 0.0305, + "step": 3735 + }, + { + "epoch": 0.24, + "grad_norm": 0.5149580058749262, + "learning_rate": 8.905475248317302e-07, + "loss": 0.3189, + "step": 3736 + }, + { + "epoch": 0.24, + "grad_norm": 0.16660177430372192, + "learning_rate": 8.904830310116439e-07, + "loss": 0.0031, + "step": 3737 + }, + { + "epoch": 0.24, + "grad_norm": 0.3968282733334314, + "learning_rate": 8.904185205327666e-07, + "loss": 0.1126, + "step": 3738 + }, + { + "epoch": 0.24, + "grad_norm": 0.4412258546287834, + "learning_rate": 8.903539933978504e-07, + "loss": 0.2691, + "step": 3739 + }, + { + "epoch": 0.24, + "grad_norm": 0.4155714819587609, + "learning_rate": 8.902894496096481e-07, + "loss": 0.1032, + "step": 3740 + }, + { + "epoch": 0.24, + "grad_norm": 0.6939420353806862, + "learning_rate": 8.902248891709132e-07, + "loss": 0.1945, + "step": 3741 + }, + { + "epoch": 0.24, + "grad_norm": 0.31966914646427064, + "learning_rate": 8.901603120844003e-07, + "loss": 0.108, + "step": 3742 + }, + { + "epoch": 0.24, + "grad_norm": 0.7902264693969758, + "learning_rate": 8.900957183528639e-07, + "loss": 0.3815, + "step": 3743 + }, + { + "epoch": 0.24, + "grad_norm": 1.0140197309363976, + "learning_rate": 8.900311079790601e-07, + "loss": 0.1647, + "step": 3744 + }, + { + "epoch": 0.24, + "grad_norm": 0.3103666139597173, + "learning_rate": 8.899664809657453e-07, + "loss": 0.0161, + "step": 3745 + }, + { + "epoch": 0.24, + "grad_norm": 0.9515611665852027, + "learning_rate": 8.899018373156763e-07, + "loss": 0.1369, + "step": 3746 + }, + { + "epoch": 0.24, + "grad_norm": 0.5987886609029958, + "learning_rate": 8.898371770316111e-07, + "loss": 0.2235, + "step": 3747 + }, + { + "epoch": 0.24, + "grad_norm": 0.6756562521449005, + "learning_rate": 8.897725001163083e-07, + "loss": 0.058, + "step": 3748 + }, + { + "epoch": 0.24, + "grad_norm": 0.31828381595214017, + "learning_rate": 8.897078065725272e-07, + "loss": 0.2259, + "step": 3749 + }, + { + "epoch": 0.24, + "grad_norm": 0.7350619858446041, + "learning_rate": 8.896430964030277e-07, + "loss": 0.0763, + "step": 3750 + }, + { + "epoch": 0.24, + "grad_norm": 1.3275744319752052, + "learning_rate": 8.895783696105703e-07, + "loss": 0.0234, + "step": 3751 + }, + { + "epoch": 0.24, + "grad_norm": 0.9536052494238427, + "learning_rate": 8.895136261979166e-07, + "loss": 0.0572, + "step": 3752 + }, + { + "epoch": 0.24, + "grad_norm": 0.6724851710180773, + "learning_rate": 8.894488661678285e-07, + "loss": 0.0616, + "step": 3753 + }, + { + "epoch": 0.24, + "grad_norm": 0.2867622074934325, + "learning_rate": 8.893840895230689e-07, + "loss": 0.0287, + "step": 3754 + }, + { + "epoch": 0.24, + "grad_norm": 1.579933443117816, + "learning_rate": 8.893192962664012e-07, + "loss": 0.0795, + "step": 3755 + }, + { + "epoch": 0.24, + "grad_norm": 0.7035797738352091, + "learning_rate": 8.892544864005898e-07, + "loss": 0.035, + "step": 3756 + }, + { + "epoch": 0.24, + "grad_norm": 0.42956073102948233, + "learning_rate": 8.891896599283994e-07, + "loss": 0.3451, + "step": 3757 + }, + { + "epoch": 0.24, + "grad_norm": 0.8172843223769409, + "learning_rate": 8.891248168525957e-07, + "loss": 0.1576, + "step": 3758 + }, + { + "epoch": 0.24, + "grad_norm": 0.8120322099943349, + "learning_rate": 8.890599571759454e-07, + "loss": 0.4472, + "step": 3759 + }, + { + "epoch": 0.24, + "grad_norm": 1.6618354528280581, + "learning_rate": 8.88995080901215e-07, + "loss": 0.1247, + "step": 3760 + }, + { + "epoch": 0.24, + "grad_norm": 0.9775245784699352, + "learning_rate": 8.889301880311724e-07, + "loss": 0.1023, + "step": 3761 + }, + { + "epoch": 0.24, + "grad_norm": 0.555236487849536, + "learning_rate": 8.888652785685861e-07, + "loss": 0.1841, + "step": 3762 + }, + { + "epoch": 0.24, + "grad_norm": 0.3382029675723734, + "learning_rate": 8.888003525162256e-07, + "loss": 0.1968, + "step": 3763 + }, + { + "epoch": 0.24, + "grad_norm": 0.8481347461852061, + "learning_rate": 8.887354098768602e-07, + "loss": 0.0487, + "step": 3764 + }, + { + "epoch": 0.24, + "grad_norm": 0.8186472691297958, + "learning_rate": 8.886704506532609e-07, + "loss": 0.1845, + "step": 3765 + }, + { + "epoch": 0.24, + "grad_norm": 0.5869573020823922, + "learning_rate": 8.886054748481988e-07, + "loss": 0.1403, + "step": 3766 + }, + { + "epoch": 0.24, + "grad_norm": 1.4056982892023109, + "learning_rate": 8.885404824644459e-07, + "loss": 0.1476, + "step": 3767 + }, + { + "epoch": 0.24, + "grad_norm": 0.8142308314391018, + "learning_rate": 8.88475473504775e-07, + "loss": 0.2135, + "step": 3768 + }, + { + "epoch": 0.24, + "grad_norm": 0.7653636598760375, + "learning_rate": 8.884104479719594e-07, + "loss": 0.1933, + "step": 3769 + }, + { + "epoch": 0.24, + "grad_norm": 0.6010367539071723, + "learning_rate": 8.883454058687734e-07, + "loss": 0.1709, + "step": 3770 + }, + { + "epoch": 0.24, + "grad_norm": 0.8750845813632222, + "learning_rate": 8.882803471979916e-07, + "loss": 0.2474, + "step": 3771 + }, + { + "epoch": 0.24, + "grad_norm": 0.9290887108245592, + "learning_rate": 8.882152719623898e-07, + "loss": 0.1797, + "step": 3772 + }, + { + "epoch": 0.24, + "grad_norm": 1.1949534146437264, + "learning_rate": 8.881501801647439e-07, + "loss": 0.0995, + "step": 3773 + }, + { + "epoch": 0.24, + "grad_norm": 0.6126499073367744, + "learning_rate": 8.880850718078312e-07, + "loss": 0.3397, + "step": 3774 + }, + { + "epoch": 0.24, + "grad_norm": 0.892068876882014, + "learning_rate": 8.880199468944291e-07, + "loss": 0.1426, + "step": 3775 + }, + { + "epoch": 0.24, + "grad_norm": 0.6419376933464616, + "learning_rate": 8.87954805427316e-07, + "loss": 0.2155, + "step": 3776 + }, + { + "epoch": 0.24, + "grad_norm": 0.20500359466811582, + "learning_rate": 8.878896474092712e-07, + "loss": 0.0904, + "step": 3777 + }, + { + "epoch": 0.24, + "grad_norm": 0.5216916846480123, + "learning_rate": 8.878244728430742e-07, + "loss": 0.1431, + "step": 3778 + }, + { + "epoch": 0.24, + "grad_norm": 2.3122812443037244, + "learning_rate": 8.877592817315054e-07, + "loss": 0.1966, + "step": 3779 + }, + { + "epoch": 0.24, + "grad_norm": 0.7561955810362375, + "learning_rate": 8.876940740773463e-07, + "loss": 0.4138, + "step": 3780 + }, + { + "epoch": 0.24, + "grad_norm": 1.1604726993973071, + "learning_rate": 8.876288498833786e-07, + "loss": 0.3771, + "step": 3781 + }, + { + "epoch": 0.24, + "grad_norm": 0.7917105456265333, + "learning_rate": 8.87563609152385e-07, + "loss": 0.0606, + "step": 3782 + }, + { + "epoch": 0.24, + "grad_norm": 1.3562561210111628, + "learning_rate": 8.874983518871486e-07, + "loss": 0.2973, + "step": 3783 + }, + { + "epoch": 0.24, + "grad_norm": 0.5272699290899061, + "learning_rate": 8.874330780904537e-07, + "loss": 0.1819, + "step": 3784 + }, + { + "epoch": 0.24, + "grad_norm": 0.5617773042352198, + "learning_rate": 8.873677877650847e-07, + "loss": 0.2149, + "step": 3785 + }, + { + "epoch": 0.24, + "grad_norm": 0.7927284887821636, + "learning_rate": 8.873024809138272e-07, + "loss": 0.2816, + "step": 3786 + }, + { + "epoch": 0.24, + "grad_norm": 1.127433478609849, + "learning_rate": 8.872371575394674e-07, + "loss": 0.3469, + "step": 3787 + }, + { + "epoch": 0.24, + "grad_norm": 0.8124809502485485, + "learning_rate": 8.87171817644792e-07, + "loss": 0.1047, + "step": 3788 + }, + { + "epoch": 0.24, + "grad_norm": 0.6961022059067764, + "learning_rate": 8.871064612325885e-07, + "loss": 0.2903, + "step": 3789 + }, + { + "epoch": 0.24, + "grad_norm": 0.8602851144623159, + "learning_rate": 8.870410883056451e-07, + "loss": 0.026, + "step": 3790 + }, + { + "epoch": 0.24, + "grad_norm": 0.9576690687775362, + "learning_rate": 8.869756988667508e-07, + "loss": 0.2327, + "step": 3791 + }, + { + "epoch": 0.24, + "grad_norm": 0.38831604542025516, + "learning_rate": 8.869102929186953e-07, + "loss": 0.1492, + "step": 3792 + }, + { + "epoch": 0.24, + "grad_norm": 1.4334388429938352, + "learning_rate": 8.868448704642691e-07, + "loss": 0.267, + "step": 3793 + }, + { + "epoch": 0.24, + "grad_norm": 1.055976164987003, + "learning_rate": 8.867794315062629e-07, + "loss": 0.3208, + "step": 3794 + }, + { + "epoch": 0.24, + "grad_norm": 0.5403932677561274, + "learning_rate": 8.867139760474687e-07, + "loss": 0.1477, + "step": 3795 + }, + { + "epoch": 0.24, + "grad_norm": 0.21410656598523806, + "learning_rate": 8.866485040906788e-07, + "loss": 0.0974, + "step": 3796 + }, + { + "epoch": 0.24, + "grad_norm": 0.9468380997808159, + "learning_rate": 8.865830156386866e-07, + "loss": 0.2684, + "step": 3797 + }, + { + "epoch": 0.24, + "grad_norm": 0.9282333660947354, + "learning_rate": 8.865175106942857e-07, + "loss": 0.3556, + "step": 3798 + }, + { + "epoch": 0.24, + "grad_norm": 0.33742281295306603, + "learning_rate": 8.864519892602709e-07, + "loss": 0.0425, + "step": 3799 + }, + { + "epoch": 0.24, + "grad_norm": 1.2564181620645343, + "learning_rate": 8.863864513394372e-07, + "loss": 0.227, + "step": 3800 + }, + { + "epoch": 0.24, + "grad_norm": 0.023836021967167408, + "learning_rate": 8.863208969345809e-07, + "loss": 0.0004, + "step": 3801 + }, + { + "epoch": 0.24, + "grad_norm": 1.4932632983783014, + "learning_rate": 8.862553260484984e-07, + "loss": 0.0972, + "step": 3802 + }, + { + "epoch": 0.24, + "grad_norm": 0.8612358583508158, + "learning_rate": 8.861897386839874e-07, + "loss": 0.2424, + "step": 3803 + }, + { + "epoch": 0.24, + "grad_norm": 0.8281799329071436, + "learning_rate": 8.861241348438457e-07, + "loss": 0.2149, + "step": 3804 + }, + { + "epoch": 0.24, + "grad_norm": 0.3504376151949932, + "learning_rate": 8.860585145308722e-07, + "loss": 0.0925, + "step": 3805 + }, + { + "epoch": 0.24, + "grad_norm": 0.7966390297010175, + "learning_rate": 8.859928777478664e-07, + "loss": 0.2819, + "step": 3806 + }, + { + "epoch": 0.24, + "grad_norm": 0.8976778103953185, + "learning_rate": 8.859272244976286e-07, + "loss": 0.0704, + "step": 3807 + }, + { + "epoch": 0.24, + "grad_norm": 0.6317080723498039, + "learning_rate": 8.858615547829594e-07, + "loss": 0.1435, + "step": 3808 + }, + { + "epoch": 0.24, + "grad_norm": 1.633199222329602, + "learning_rate": 8.857958686066607e-07, + "loss": 0.2054, + "step": 3809 + }, + { + "epoch": 0.24, + "grad_norm": 3.2357002728651416, + "learning_rate": 8.857301659715347e-07, + "loss": 0.1984, + "step": 3810 + }, + { + "epoch": 0.24, + "grad_norm": 1.4089302300435955, + "learning_rate": 8.856644468803845e-07, + "loss": 0.3983, + "step": 3811 + }, + { + "epoch": 0.24, + "grad_norm": 1.4135613464162196, + "learning_rate": 8.855987113360134e-07, + "loss": 0.176, + "step": 3812 + }, + { + "epoch": 0.24, + "grad_norm": 0.37719396902233837, + "learning_rate": 8.855329593412264e-07, + "loss": 0.0489, + "step": 3813 + }, + { + "epoch": 0.24, + "grad_norm": 1.8875365007259433, + "learning_rate": 8.854671908988283e-07, + "loss": 0.1269, + "step": 3814 + }, + { + "epoch": 0.24, + "grad_norm": 1.0181689889722372, + "learning_rate": 8.854014060116249e-07, + "loss": 0.2733, + "step": 3815 + }, + { + "epoch": 0.24, + "grad_norm": 0.5273144421567434, + "learning_rate": 8.853356046824228e-07, + "loss": 0.1581, + "step": 3816 + }, + { + "epoch": 0.24, + "grad_norm": 0.9671347036238022, + "learning_rate": 8.852697869140292e-07, + "loss": 0.1124, + "step": 3817 + }, + { + "epoch": 0.24, + "grad_norm": 0.7650637101924961, + "learning_rate": 8.85203952709252e-07, + "loss": 0.2541, + "step": 3818 + }, + { + "epoch": 0.24, + "grad_norm": 0.4237115447194473, + "learning_rate": 8.851381020708998e-07, + "loss": 0.0422, + "step": 3819 + }, + { + "epoch": 0.24, + "grad_norm": 0.6233864538524351, + "learning_rate": 8.850722350017818e-07, + "loss": 0.0644, + "step": 3820 + }, + { + "epoch": 0.24, + "grad_norm": 0.428489971635069, + "learning_rate": 8.850063515047083e-07, + "loss": 0.183, + "step": 3821 + }, + { + "epoch": 0.24, + "grad_norm": 0.8953283052132391, + "learning_rate": 8.8494045158249e-07, + "loss": 0.3399, + "step": 3822 + }, + { + "epoch": 0.24, + "grad_norm": 0.6535801662496468, + "learning_rate": 8.848745352379381e-07, + "loss": 0.1782, + "step": 3823 + }, + { + "epoch": 0.24, + "grad_norm": 0.49997927614318116, + "learning_rate": 8.848086024738648e-07, + "loss": 0.1637, + "step": 3824 + }, + { + "epoch": 0.24, + "grad_norm": 0.6410418209206888, + "learning_rate": 8.847426532930829e-07, + "loss": 0.1569, + "step": 3825 + }, + { + "epoch": 0.24, + "grad_norm": 0.9459033036201963, + "learning_rate": 8.846766876984061e-07, + "loss": 0.195, + "step": 3826 + }, + { + "epoch": 0.24, + "grad_norm": 0.6702740291675603, + "learning_rate": 8.846107056926484e-07, + "loss": 0.4062, + "step": 3827 + }, + { + "epoch": 0.24, + "grad_norm": 0.4524134624553657, + "learning_rate": 8.84544707278625e-07, + "loss": 0.1911, + "step": 3828 + }, + { + "epoch": 0.24, + "grad_norm": 0.9767117423338574, + "learning_rate": 8.844786924591512e-07, + "loss": 0.0574, + "step": 3829 + }, + { + "epoch": 0.24, + "grad_norm": 0.881510543980618, + "learning_rate": 8.844126612370435e-07, + "loss": 0.0707, + "step": 3830 + }, + { + "epoch": 0.24, + "grad_norm": 0.7627701634454506, + "learning_rate": 8.84346613615119e-07, + "loss": 0.057, + "step": 3831 + }, + { + "epoch": 0.24, + "grad_norm": 0.2173122088258248, + "learning_rate": 8.842805495961952e-07, + "loss": 0.002, + "step": 3832 + }, + { + "epoch": 0.24, + "grad_norm": 0.580379117277505, + "learning_rate": 8.842144691830906e-07, + "loss": 0.1342, + "step": 3833 + }, + { + "epoch": 0.24, + "grad_norm": 1.120180075118133, + "learning_rate": 8.841483723786246e-07, + "loss": 0.2354, + "step": 3834 + }, + { + "epoch": 0.24, + "grad_norm": 0.4914165517828563, + "learning_rate": 8.840822591856167e-07, + "loss": 0.4501, + "step": 3835 + }, + { + "epoch": 0.24, + "grad_norm": 1.0371190255382727, + "learning_rate": 8.840161296068876e-07, + "loss": 0.2488, + "step": 3836 + }, + { + "epoch": 0.24, + "grad_norm": 0.5973299600775365, + "learning_rate": 8.839499836452582e-07, + "loss": 0.3872, + "step": 3837 + }, + { + "epoch": 0.24, + "grad_norm": 0.395174487770906, + "learning_rate": 8.838838213035509e-07, + "loss": 0.1156, + "step": 3838 + }, + { + "epoch": 0.24, + "grad_norm": 3.356218367510238, + "learning_rate": 8.83817642584588e-07, + "loss": 0.1328, + "step": 3839 + }, + { + "epoch": 0.24, + "grad_norm": 1.0232113436227914, + "learning_rate": 8.837514474911929e-07, + "loss": 0.0872, + "step": 3840 + }, + { + "epoch": 0.24, + "grad_norm": 0.6967371947473936, + "learning_rate": 8.836852360261895e-07, + "loss": 0.1095, + "step": 3841 + }, + { + "epoch": 0.25, + "grad_norm": 2.6899814270235796, + "learning_rate": 8.836190081924027e-07, + "loss": 0.0583, + "step": 3842 + }, + { + "epoch": 0.25, + "grad_norm": 0.47253467787743975, + "learning_rate": 8.835527639926579e-07, + "loss": 0.1843, + "step": 3843 + }, + { + "epoch": 0.25, + "grad_norm": 0.6822384251805668, + "learning_rate": 8.834865034297812e-07, + "loss": 0.2154, + "step": 3844 + }, + { + "epoch": 0.25, + "grad_norm": 0.5155496044580115, + "learning_rate": 8.834202265065993e-07, + "loss": 0.0798, + "step": 3845 + }, + { + "epoch": 0.25, + "grad_norm": 0.6233805862568065, + "learning_rate": 8.833539332259396e-07, + "loss": 0.2562, + "step": 3846 + }, + { + "epoch": 0.25, + "grad_norm": 0.745255601651051, + "learning_rate": 8.832876235906306e-07, + "loss": 0.1589, + "step": 3847 + }, + { + "epoch": 0.25, + "grad_norm": 0.18299751295070765, + "learning_rate": 8.832212976035012e-07, + "loss": 0.0722, + "step": 3848 + }, + { + "epoch": 0.25, + "grad_norm": 0.37131939273704917, + "learning_rate": 8.831549552673807e-07, + "loss": 0.0921, + "step": 3849 + }, + { + "epoch": 0.25, + "grad_norm": 1.0402267070631617, + "learning_rate": 8.830885965850998e-07, + "loss": 0.1206, + "step": 3850 + }, + { + "epoch": 0.25, + "grad_norm": 0.8733116472389192, + "learning_rate": 8.83022221559489e-07, + "loss": 0.3168, + "step": 3851 + }, + { + "epoch": 0.25, + "grad_norm": 2.158788411492282, + "learning_rate": 8.829558301933804e-07, + "loss": 0.1649, + "step": 3852 + }, + { + "epoch": 0.25, + "grad_norm": 0.7594693834484741, + "learning_rate": 8.828894224896062e-07, + "loss": 0.3117, + "step": 3853 + }, + { + "epoch": 0.25, + "grad_norm": 1.00607259640321, + "learning_rate": 8.828229984509996e-07, + "loss": 0.3921, + "step": 3854 + }, + { + "epoch": 0.25, + "grad_norm": 1.3712145253287933, + "learning_rate": 8.827565580803941e-07, + "loss": 0.1097, + "step": 3855 + }, + { + "epoch": 0.25, + "grad_norm": 1.5518431096903318, + "learning_rate": 8.826901013806245e-07, + "loss": 0.0852, + "step": 3856 + }, + { + "epoch": 0.25, + "grad_norm": 7.260101596858923, + "learning_rate": 8.826236283545259e-07, + "loss": 0.2245, + "step": 3857 + }, + { + "epoch": 0.25, + "grad_norm": 0.9216122789470282, + "learning_rate": 8.825571390049343e-07, + "loss": 0.141, + "step": 3858 + }, + { + "epoch": 0.25, + "grad_norm": 0.6574770388498139, + "learning_rate": 8.824906333346858e-07, + "loss": 0.1655, + "step": 3859 + }, + { + "epoch": 0.25, + "grad_norm": 0.4896930052088437, + "learning_rate": 8.824241113466182e-07, + "loss": 0.129, + "step": 3860 + }, + { + "epoch": 0.25, + "grad_norm": 1.045338539757439, + "learning_rate": 8.823575730435693e-07, + "loss": 0.1246, + "step": 3861 + }, + { + "epoch": 0.25, + "grad_norm": 1.5777459068356137, + "learning_rate": 8.822910184283776e-07, + "loss": 0.2317, + "step": 3862 + }, + { + "epoch": 0.25, + "grad_norm": 1.7493117669883436, + "learning_rate": 8.822244475038825e-07, + "loss": 0.2838, + "step": 3863 + }, + { + "epoch": 0.25, + "grad_norm": 0.7158063307446045, + "learning_rate": 8.821578602729241e-07, + "loss": 0.0901, + "step": 3864 + }, + { + "epoch": 0.25, + "grad_norm": 0.9323696081863153, + "learning_rate": 8.820912567383432e-07, + "loss": 0.4398, + "step": 3865 + }, + { + "epoch": 0.25, + "grad_norm": 0.5992314213267559, + "learning_rate": 8.820246369029812e-07, + "loss": 0.106, + "step": 3866 + }, + { + "epoch": 0.25, + "grad_norm": 0.5060150979246018, + "learning_rate": 8.819580007696802e-07, + "loss": 0.1455, + "step": 3867 + }, + { + "epoch": 0.25, + "grad_norm": 1.1236957189997694, + "learning_rate": 8.818913483412831e-07, + "loss": 0.3267, + "step": 3868 + }, + { + "epoch": 0.25, + "grad_norm": 0.7399667473326889, + "learning_rate": 8.818246796206332e-07, + "loss": 0.0643, + "step": 3869 + }, + { + "epoch": 0.25, + "grad_norm": 0.5064265662293846, + "learning_rate": 8.817579946105751e-07, + "loss": 0.148, + "step": 3870 + }, + { + "epoch": 0.25, + "grad_norm": 0.6474326731435445, + "learning_rate": 8.816912933139535e-07, + "loss": 0.2105, + "step": 3871 + }, + { + "epoch": 0.25, + "grad_norm": 0.8501109047346274, + "learning_rate": 8.81624575733614e-07, + "loss": 0.3672, + "step": 3872 + }, + { + "epoch": 0.25, + "grad_norm": 0.7323724092358175, + "learning_rate": 8.81557841872403e-07, + "loss": 0.317, + "step": 3873 + }, + { + "epoch": 0.25, + "grad_norm": 1.2862806210495028, + "learning_rate": 8.814910917331673e-07, + "loss": 0.0497, + "step": 3874 + }, + { + "epoch": 0.25, + "grad_norm": 0.3478016951604009, + "learning_rate": 8.814243253187548e-07, + "loss": 0.0718, + "step": 3875 + }, + { + "epoch": 0.25, + "grad_norm": 0.38831128658614433, + "learning_rate": 8.813575426320139e-07, + "loss": 0.1864, + "step": 3876 + }, + { + "epoch": 0.25, + "grad_norm": 1.9050264415368248, + "learning_rate": 8.812907436757935e-07, + "loss": 0.2218, + "step": 3877 + }, + { + "epoch": 0.25, + "grad_norm": 0.9137963310217068, + "learning_rate": 8.812239284529435e-07, + "loss": 0.0833, + "step": 3878 + }, + { + "epoch": 0.25, + "grad_norm": 2.2065487505746293, + "learning_rate": 8.811570969663144e-07, + "loss": 0.1434, + "step": 3879 + }, + { + "epoch": 0.25, + "grad_norm": 0.5826485797134789, + "learning_rate": 8.810902492187573e-07, + "loss": 0.1305, + "step": 3880 + }, + { + "epoch": 0.25, + "grad_norm": 0.5433145356257765, + "learning_rate": 8.81023385213124e-07, + "loss": 0.1828, + "step": 3881 + }, + { + "epoch": 0.25, + "grad_norm": 0.8426871437234196, + "learning_rate": 8.809565049522671e-07, + "loss": 0.2539, + "step": 3882 + }, + { + "epoch": 0.25, + "grad_norm": 0.6029357850617248, + "learning_rate": 8.8088960843904e-07, + "loss": 0.082, + "step": 3883 + }, + { + "epoch": 0.25, + "grad_norm": 0.8689748220497369, + "learning_rate": 8.808226956762967e-07, + "loss": 0.3289, + "step": 3884 + }, + { + "epoch": 0.25, + "grad_norm": 0.5100034663399725, + "learning_rate": 8.807557666668914e-07, + "loss": 0.3517, + "step": 3885 + }, + { + "epoch": 0.25, + "grad_norm": 0.9071223821693056, + "learning_rate": 8.806888214136798e-07, + "loss": 0.1622, + "step": 3886 + }, + { + "epoch": 0.25, + "grad_norm": 0.25651385324206816, + "learning_rate": 8.806218599195177e-07, + "loss": 0.1169, + "step": 3887 + }, + { + "epoch": 0.25, + "grad_norm": 1.6205044009298561, + "learning_rate": 8.805548821872619e-07, + "loss": 0.3588, + "step": 3888 + }, + { + "epoch": 0.25, + "grad_norm": 0.2051808882190857, + "learning_rate": 8.8048788821977e-07, + "loss": 0.0084, + "step": 3889 + }, + { + "epoch": 0.25, + "grad_norm": 0.5313437251721552, + "learning_rate": 8.804208780198997e-07, + "loss": 0.156, + "step": 3890 + }, + { + "epoch": 0.25, + "grad_norm": 0.5608721803888229, + "learning_rate": 8.8035385159051e-07, + "loss": 0.1737, + "step": 3891 + }, + { + "epoch": 0.25, + "grad_norm": 2.114285425950021, + "learning_rate": 8.802868089344605e-07, + "loss": 0.0337, + "step": 3892 + }, + { + "epoch": 0.25, + "grad_norm": 1.1108144278333296, + "learning_rate": 8.802197500546112e-07, + "loss": 0.2733, + "step": 3893 + }, + { + "epoch": 0.25, + "grad_norm": 0.6756606510327932, + "learning_rate": 8.801526749538231e-07, + "loss": 0.0114, + "step": 3894 + }, + { + "epoch": 0.25, + "grad_norm": 0.7827544871261352, + "learning_rate": 8.800855836349577e-07, + "loss": 0.1278, + "step": 3895 + }, + { + "epoch": 0.25, + "grad_norm": 0.8810978403334839, + "learning_rate": 8.800184761008771e-07, + "loss": 0.2304, + "step": 3896 + }, + { + "epoch": 0.25, + "grad_norm": 1.0625358915168617, + "learning_rate": 8.799513523544444e-07, + "loss": 0.0853, + "step": 3897 + }, + { + "epoch": 0.25, + "grad_norm": 0.8936459555487971, + "learning_rate": 8.798842123985233e-07, + "loss": 0.1544, + "step": 3898 + }, + { + "epoch": 0.25, + "grad_norm": 0.27459104503815157, + "learning_rate": 8.798170562359779e-07, + "loss": 0.1053, + "step": 3899 + }, + { + "epoch": 0.25, + "grad_norm": 1.0829617560998321, + "learning_rate": 8.797498838696735e-07, + "loss": 0.3077, + "step": 3900 + }, + { + "epoch": 0.25, + "grad_norm": 0.4402056171559821, + "learning_rate": 8.796826953024756e-07, + "loss": 0.1567, + "step": 3901 + }, + { + "epoch": 0.25, + "grad_norm": 0.625065524314885, + "learning_rate": 8.796154905372506e-07, + "loss": 0.1401, + "step": 3902 + }, + { + "epoch": 0.25, + "grad_norm": 0.09889685619163836, + "learning_rate": 8.795482695768656e-07, + "loss": 0.006, + "step": 3903 + }, + { + "epoch": 0.25, + "grad_norm": 0.40167191233341654, + "learning_rate": 8.794810324241886e-07, + "loss": 0.1823, + "step": 3904 + }, + { + "epoch": 0.25, + "grad_norm": 0.9797198670885936, + "learning_rate": 8.794137790820879e-07, + "loss": 0.0842, + "step": 3905 + }, + { + "epoch": 0.25, + "grad_norm": 0.6342464441119157, + "learning_rate": 8.793465095534327e-07, + "loss": 0.0054, + "step": 3906 + }, + { + "epoch": 0.25, + "grad_norm": 0.8674351748185777, + "learning_rate": 8.792792238410926e-07, + "loss": 0.2681, + "step": 3907 + }, + { + "epoch": 0.25, + "grad_norm": 0.8127929286039683, + "learning_rate": 8.792119219479386e-07, + "loss": 0.3621, + "step": 3908 + }, + { + "epoch": 0.25, + "grad_norm": 0.998942057555394, + "learning_rate": 8.791446038768415e-07, + "loss": 0.1985, + "step": 3909 + }, + { + "epoch": 0.25, + "grad_norm": 0.733145838965004, + "learning_rate": 8.790772696306736e-07, + "loss": 0.0689, + "step": 3910 + }, + { + "epoch": 0.25, + "grad_norm": 0.8314182062716361, + "learning_rate": 8.790099192123073e-07, + "loss": 0.3592, + "step": 3911 + }, + { + "epoch": 0.25, + "grad_norm": 1.429362481326908, + "learning_rate": 8.789425526246159e-07, + "loss": 0.0961, + "step": 3912 + }, + { + "epoch": 0.25, + "grad_norm": 0.5162858269850434, + "learning_rate": 8.788751698704734e-07, + "loss": 0.0957, + "step": 3913 + }, + { + "epoch": 0.25, + "grad_norm": 0.778633990635694, + "learning_rate": 8.788077709527546e-07, + "loss": 0.1796, + "step": 3914 + }, + { + "epoch": 0.25, + "grad_norm": 0.2358032788236904, + "learning_rate": 8.787403558743347e-07, + "loss": 0.0903, + "step": 3915 + }, + { + "epoch": 0.25, + "grad_norm": 0.6581315004821515, + "learning_rate": 8.786729246380899e-07, + "loss": 0.006, + "step": 3916 + }, + { + "epoch": 0.25, + "grad_norm": 0.9809002158680535, + "learning_rate": 8.786054772468968e-07, + "loss": 0.1373, + "step": 3917 + }, + { + "epoch": 0.25, + "grad_norm": 1.0868837241043616, + "learning_rate": 8.785380137036331e-07, + "loss": 0.3455, + "step": 3918 + }, + { + "epoch": 0.25, + "grad_norm": 0.33965775754026445, + "learning_rate": 8.784705340111767e-07, + "loss": 0.1726, + "step": 3919 + }, + { + "epoch": 0.25, + "grad_norm": 0.3881610503078679, + "learning_rate": 8.784030381724066e-07, + "loss": 0.1477, + "step": 3920 + }, + { + "epoch": 0.25, + "grad_norm": 1.2779562197772847, + "learning_rate": 8.783355261902021e-07, + "loss": 0.3554, + "step": 3921 + }, + { + "epoch": 0.25, + "grad_norm": 0.46884013910490463, + "learning_rate": 8.782679980674436e-07, + "loss": 0.1424, + "step": 3922 + }, + { + "epoch": 0.25, + "grad_norm": 0.4142878280564671, + "learning_rate": 8.782004538070118e-07, + "loss": 0.049, + "step": 3923 + }, + { + "epoch": 0.25, + "grad_norm": 0.435630693591566, + "learning_rate": 8.781328934117885e-07, + "loss": 0.1453, + "step": 3924 + }, + { + "epoch": 0.25, + "grad_norm": 0.7895980903877988, + "learning_rate": 8.780653168846556e-07, + "loss": 0.1049, + "step": 3925 + }, + { + "epoch": 0.25, + "grad_norm": 1.078528448372595, + "learning_rate": 8.779977242284964e-07, + "loss": 0.1944, + "step": 3926 + }, + { + "epoch": 0.25, + "grad_norm": 0.3513130449104446, + "learning_rate": 8.779301154461945e-07, + "loss": 0.0848, + "step": 3927 + }, + { + "epoch": 0.25, + "grad_norm": 0.6341401280500651, + "learning_rate": 8.778624905406339e-07, + "loss": 0.0727, + "step": 3928 + }, + { + "epoch": 0.25, + "grad_norm": 1.1356196801368217, + "learning_rate": 8.777948495147e-07, + "loss": 0.0199, + "step": 3929 + }, + { + "epoch": 0.25, + "grad_norm": 1.036735641930418, + "learning_rate": 8.777271923712783e-07, + "loss": 0.1751, + "step": 3930 + }, + { + "epoch": 0.25, + "grad_norm": 0.9435075619147448, + "learning_rate": 8.776595191132553e-07, + "loss": 0.3591, + "step": 3931 + }, + { + "epoch": 0.25, + "grad_norm": 0.6932852345179985, + "learning_rate": 8.775918297435181e-07, + "loss": 0.2179, + "step": 3932 + }, + { + "epoch": 0.25, + "grad_norm": 0.6888035519656425, + "learning_rate": 8.775241242649543e-07, + "loss": 0.0518, + "step": 3933 + }, + { + "epoch": 0.25, + "grad_norm": 0.5971998529801377, + "learning_rate": 8.774564026804525e-07, + "loss": 0.231, + "step": 3934 + }, + { + "epoch": 0.25, + "grad_norm": 1.1419428339338078, + "learning_rate": 8.773886649929017e-07, + "loss": 0.1401, + "step": 3935 + }, + { + "epoch": 0.25, + "grad_norm": 0.627408617573541, + "learning_rate": 8.773209112051918e-07, + "loss": 0.1987, + "step": 3936 + }, + { + "epoch": 0.25, + "grad_norm": 0.9453981933126301, + "learning_rate": 8.772531413202133e-07, + "loss": 0.2172, + "step": 3937 + }, + { + "epoch": 0.25, + "grad_norm": 1.3180858625553908, + "learning_rate": 8.771853553408575e-07, + "loss": 0.1285, + "step": 3938 + }, + { + "epoch": 0.25, + "grad_norm": 0.3865235113430008, + "learning_rate": 8.771175532700162e-07, + "loss": 0.158, + "step": 3939 + }, + { + "epoch": 0.25, + "grad_norm": 0.9173031682855154, + "learning_rate": 8.770497351105819e-07, + "loss": 0.2518, + "step": 3940 + }, + { + "epoch": 0.25, + "grad_norm": 0.6388942602289808, + "learning_rate": 8.76981900865448e-07, + "loss": 0.1793, + "step": 3941 + }, + { + "epoch": 0.25, + "grad_norm": 0.5680970316244751, + "learning_rate": 8.769140505375083e-07, + "loss": 0.1978, + "step": 3942 + }, + { + "epoch": 0.25, + "grad_norm": 0.9833493507997556, + "learning_rate": 8.768461841296577e-07, + "loss": 0.2289, + "step": 3943 + }, + { + "epoch": 0.25, + "grad_norm": 0.8079163924290765, + "learning_rate": 8.767783016447912e-07, + "loss": 0.0793, + "step": 3944 + }, + { + "epoch": 0.25, + "grad_norm": 0.5906476527055019, + "learning_rate": 8.767104030858049e-07, + "loss": 0.1805, + "step": 3945 + }, + { + "epoch": 0.25, + "grad_norm": 0.6493009863255066, + "learning_rate": 8.766424884555956e-07, + "loss": 0.2852, + "step": 3946 + }, + { + "epoch": 0.25, + "grad_norm": 1.8273038894585243, + "learning_rate": 8.765745577570606e-07, + "loss": 0.2025, + "step": 3947 + }, + { + "epoch": 0.25, + "grad_norm": 0.44501179601301144, + "learning_rate": 8.765066109930979e-07, + "loss": 0.1043, + "step": 3948 + }, + { + "epoch": 0.25, + "grad_norm": 0.5612978662180629, + "learning_rate": 8.764386481666062e-07, + "loss": 0.0897, + "step": 3949 + }, + { + "epoch": 0.25, + "grad_norm": 0.6603623168005797, + "learning_rate": 8.763706692804852e-07, + "loss": 0.3171, + "step": 3950 + }, + { + "epoch": 0.25, + "grad_norm": 1.1833506083656713, + "learning_rate": 8.763026743376347e-07, + "loss": 0.264, + "step": 3951 + }, + { + "epoch": 0.25, + "grad_norm": 0.456151883137029, + "learning_rate": 8.762346633409559e-07, + "loss": 0.2565, + "step": 3952 + }, + { + "epoch": 0.25, + "grad_norm": 0.9147030031313685, + "learning_rate": 8.761666362933497e-07, + "loss": 0.2614, + "step": 3953 + }, + { + "epoch": 0.25, + "grad_norm": 0.6258556487063822, + "learning_rate": 8.760985931977189e-07, + "loss": 0.3442, + "step": 3954 + }, + { + "epoch": 0.25, + "grad_norm": 0.5065709907473496, + "learning_rate": 8.76030534056966e-07, + "loss": 0.3571, + "step": 3955 + }, + { + "epoch": 0.25, + "grad_norm": 0.7141061511668348, + "learning_rate": 8.759624588739945e-07, + "loss": 0.0737, + "step": 3956 + }, + { + "epoch": 0.25, + "grad_norm": 0.8226379117225698, + "learning_rate": 8.758943676517088e-07, + "loss": 0.1746, + "step": 3957 + }, + { + "epoch": 0.25, + "grad_norm": 0.564638146772833, + "learning_rate": 8.758262603930137e-07, + "loss": 0.1079, + "step": 3958 + }, + { + "epoch": 0.25, + "grad_norm": 0.27651579401960985, + "learning_rate": 8.757581371008148e-07, + "loss": 0.1224, + "step": 3959 + }, + { + "epoch": 0.25, + "grad_norm": 0.3603241503001796, + "learning_rate": 8.756899977780185e-07, + "loss": 0.1384, + "step": 3960 + }, + { + "epoch": 0.25, + "grad_norm": 0.7128421237886637, + "learning_rate": 8.756218424275315e-07, + "loss": 0.3432, + "step": 3961 + }, + { + "epoch": 0.25, + "grad_norm": 0.5427821502755281, + "learning_rate": 8.755536710522617e-07, + "loss": 0.109, + "step": 3962 + }, + { + "epoch": 0.25, + "grad_norm": 0.8949546193401502, + "learning_rate": 8.754854836551173e-07, + "loss": 0.1083, + "step": 3963 + }, + { + "epoch": 0.25, + "grad_norm": 1.7523697441707426, + "learning_rate": 8.754172802390074e-07, + "loss": 0.2896, + "step": 3964 + }, + { + "epoch": 0.25, + "grad_norm": 0.470811108919015, + "learning_rate": 8.753490608068416e-07, + "loss": 0.204, + "step": 3965 + }, + { + "epoch": 0.25, + "grad_norm": 0.859132997973097, + "learning_rate": 8.752808253615305e-07, + "loss": 0.1744, + "step": 3966 + }, + { + "epoch": 0.25, + "grad_norm": 0.7903062271790547, + "learning_rate": 8.752125739059848e-07, + "loss": 0.0353, + "step": 3967 + }, + { + "epoch": 0.25, + "grad_norm": 0.6433512424106262, + "learning_rate": 8.751443064431164e-07, + "loss": 0.3293, + "step": 3968 + }, + { + "epoch": 0.25, + "grad_norm": 0.9272328463302985, + "learning_rate": 8.750760229758378e-07, + "loss": 0.0907, + "step": 3969 + }, + { + "epoch": 0.25, + "grad_norm": 1.0245838538405079, + "learning_rate": 8.750077235070621e-07, + "loss": 0.3167, + "step": 3970 + }, + { + "epoch": 0.25, + "grad_norm": 0.6949014063101808, + "learning_rate": 8.74939408039703e-07, + "loss": 0.3784, + "step": 3971 + }, + { + "epoch": 0.25, + "grad_norm": 0.16563874919046356, + "learning_rate": 8.748710765766752e-07, + "loss": 0.0028, + "step": 3972 + }, + { + "epoch": 0.25, + "grad_norm": 1.2013521468154433, + "learning_rate": 8.748027291208934e-07, + "loss": 0.0904, + "step": 3973 + }, + { + "epoch": 0.25, + "grad_norm": 0.6037748857188424, + "learning_rate": 8.747343656752739e-07, + "loss": 0.4494, + "step": 3974 + }, + { + "epoch": 0.25, + "grad_norm": 0.3484933915549889, + "learning_rate": 8.74665986242733e-07, + "loss": 0.3483, + "step": 3975 + }, + { + "epoch": 0.25, + "grad_norm": 0.8444821672923177, + "learning_rate": 8.74597590826188e-07, + "loss": 0.1801, + "step": 3976 + }, + { + "epoch": 0.25, + "grad_norm": 0.5820251833319969, + "learning_rate": 8.745291794285568e-07, + "loss": 0.1341, + "step": 3977 + }, + { + "epoch": 0.25, + "grad_norm": 1.5458408601507707, + "learning_rate": 8.744607520527577e-07, + "loss": 0.1478, + "step": 3978 + }, + { + "epoch": 0.25, + "grad_norm": 0.5041824124099171, + "learning_rate": 8.743923087017102e-07, + "loss": 0.2239, + "step": 3979 + }, + { + "epoch": 0.25, + "grad_norm": 0.6601179844297378, + "learning_rate": 8.743238493783343e-07, + "loss": 0.3004, + "step": 3980 + }, + { + "epoch": 0.25, + "grad_norm": 0.8275809964913583, + "learning_rate": 8.742553740855505e-07, + "loss": 0.2218, + "step": 3981 + }, + { + "epoch": 0.25, + "grad_norm": 0.3065065175720807, + "learning_rate": 8.741868828262802e-07, + "loss": 0.145, + "step": 3982 + }, + { + "epoch": 0.25, + "grad_norm": 0.8819701262901135, + "learning_rate": 8.741183756034449e-07, + "loss": 0.2202, + "step": 3983 + }, + { + "epoch": 0.25, + "grad_norm": 1.0010176950752636, + "learning_rate": 8.74049852419968e-07, + "loss": 0.3801, + "step": 3984 + }, + { + "epoch": 0.25, + "grad_norm": 0.5025045189329125, + "learning_rate": 8.739813132787723e-07, + "loss": 0.008, + "step": 3985 + }, + { + "epoch": 0.25, + "grad_norm": 0.6235355600080691, + "learning_rate": 8.73912758182782e-07, + "loss": 0.1102, + "step": 3986 + }, + { + "epoch": 0.25, + "grad_norm": 0.9359451485156095, + "learning_rate": 8.738441871349219e-07, + "loss": 0.1621, + "step": 3987 + }, + { + "epoch": 0.25, + "grad_norm": 0.9931543106272482, + "learning_rate": 8.737756001381169e-07, + "loss": 0.1736, + "step": 3988 + }, + { + "epoch": 0.25, + "grad_norm": 0.48887747494279876, + "learning_rate": 8.737069971952937e-07, + "loss": 0.3141, + "step": 3989 + }, + { + "epoch": 0.25, + "grad_norm": 1.3276454607633839, + "learning_rate": 8.736383783093787e-07, + "loss": 0.4351, + "step": 3990 + }, + { + "epoch": 0.25, + "grad_norm": 0.6281135719184645, + "learning_rate": 8.735697434832993e-07, + "loss": 0.1919, + "step": 3991 + }, + { + "epoch": 0.25, + "grad_norm": 0.9004611249581926, + "learning_rate": 8.735010927199837e-07, + "loss": 0.1233, + "step": 3992 + }, + { + "epoch": 0.25, + "grad_norm": 0.6245121951560472, + "learning_rate": 8.734324260223608e-07, + "loss": 0.01, + "step": 3993 + }, + { + "epoch": 0.25, + "grad_norm": 0.8564528634864154, + "learning_rate": 8.733637433933598e-07, + "loss": 0.3092, + "step": 3994 + }, + { + "epoch": 0.25, + "grad_norm": 0.6624523979672308, + "learning_rate": 8.732950448359109e-07, + "loss": 0.2709, + "step": 3995 + }, + { + "epoch": 0.25, + "grad_norm": 0.876141690078714, + "learning_rate": 8.732263303529451e-07, + "loss": 0.2849, + "step": 3996 + }, + { + "epoch": 0.25, + "grad_norm": 1.0187704625173444, + "learning_rate": 8.731575999473936e-07, + "loss": 0.2479, + "step": 3997 + }, + { + "epoch": 0.25, + "grad_norm": 0.9978460960576566, + "learning_rate": 8.73088853622189e-07, + "loss": 0.146, + "step": 3998 + }, + { + "epoch": 0.26, + "grad_norm": 0.8034763387545482, + "learning_rate": 8.730200913802637e-07, + "loss": 0.1926, + "step": 3999 + }, + { + "epoch": 0.26, + "grad_norm": 1.4719686255204056, + "learning_rate": 8.729513132245514e-07, + "loss": 0.231, + "step": 4000 + }, + { + "epoch": 0.26, + "grad_norm": 1.3168863902789227, + "learning_rate": 8.728825191579865e-07, + "loss": 0.0533, + "step": 4001 + }, + { + "epoch": 0.26, + "grad_norm": 0.5907737292904186, + "learning_rate": 8.728137091835038e-07, + "loss": 0.1546, + "step": 4002 + }, + { + "epoch": 0.26, + "grad_norm": 0.43436093732337505, + "learning_rate": 8.727448833040386e-07, + "loss": 0.1933, + "step": 4003 + }, + { + "epoch": 0.26, + "grad_norm": 1.182335143143738, + "learning_rate": 8.726760415225274e-07, + "loss": 0.0765, + "step": 4004 + }, + { + "epoch": 0.26, + "grad_norm": 0.8972510251624722, + "learning_rate": 8.726071838419072e-07, + "loss": 0.1038, + "step": 4005 + }, + { + "epoch": 0.26, + "grad_norm": 0.6094007452363491, + "learning_rate": 8.725383102651153e-07, + "loss": 0.1785, + "step": 4006 + }, + { + "epoch": 0.26, + "grad_norm": 0.29551837758871335, + "learning_rate": 8.724694207950905e-07, + "loss": 0.1089, + "step": 4007 + }, + { + "epoch": 0.26, + "grad_norm": 1.1253683787788942, + "learning_rate": 8.724005154347713e-07, + "loss": 0.2786, + "step": 4008 + }, + { + "epoch": 0.26, + "grad_norm": 0.423186108471077, + "learning_rate": 8.723315941870974e-07, + "loss": 0.1108, + "step": 4009 + }, + { + "epoch": 0.26, + "grad_norm": 0.48852532601750126, + "learning_rate": 8.722626570550093e-07, + "loss": 0.2561, + "step": 4010 + }, + { + "epoch": 0.26, + "grad_norm": 0.7463282720614175, + "learning_rate": 8.72193704041448e-07, + "loss": 0.3525, + "step": 4011 + }, + { + "epoch": 0.26, + "grad_norm": 0.5450143580604226, + "learning_rate": 8.72124735149355e-07, + "loss": 0.2108, + "step": 4012 + }, + { + "epoch": 0.26, + "grad_norm": 0.9253499962593161, + "learning_rate": 8.720557503816728e-07, + "loss": 0.3795, + "step": 4013 + }, + { + "epoch": 0.26, + "grad_norm": 0.46562963399667523, + "learning_rate": 8.719867497413443e-07, + "loss": 0.007, + "step": 4014 + }, + { + "epoch": 0.26, + "grad_norm": 0.5332682655484945, + "learning_rate": 8.719177332313131e-07, + "loss": 0.186, + "step": 4015 + }, + { + "epoch": 0.26, + "grad_norm": 0.5535338783673729, + "learning_rate": 8.71848700854524e-07, + "loss": 0.1613, + "step": 4016 + }, + { + "epoch": 0.26, + "grad_norm": 0.6403299260618557, + "learning_rate": 8.717796526139217e-07, + "loss": 0.2925, + "step": 4017 + }, + { + "epoch": 0.26, + "grad_norm": 0.6254917532022537, + "learning_rate": 8.71710588512452e-07, + "loss": 0.2384, + "step": 4018 + }, + { + "epoch": 0.26, + "grad_norm": 0.5750364342194938, + "learning_rate": 8.716415085530615e-07, + "loss": 0.0331, + "step": 4019 + }, + { + "epoch": 0.26, + "grad_norm": 0.4569063266582358, + "learning_rate": 8.71572412738697e-07, + "loss": 0.3298, + "step": 4020 + }, + { + "epoch": 0.26, + "grad_norm": 0.891163346975924, + "learning_rate": 8.715033010723065e-07, + "loss": 0.0961, + "step": 4021 + }, + { + "epoch": 0.26, + "grad_norm": 1.3413829687791063, + "learning_rate": 8.714341735568384e-07, + "loss": 0.0372, + "step": 4022 + }, + { + "epoch": 0.26, + "grad_norm": 0.49756565405090214, + "learning_rate": 8.713650301952418e-07, + "loss": 0.0951, + "step": 4023 + }, + { + "epoch": 0.26, + "grad_norm": 0.9182125512484611, + "learning_rate": 8.712958709904664e-07, + "loss": 0.2331, + "step": 4024 + }, + { + "epoch": 0.26, + "grad_norm": 0.9725612485583259, + "learning_rate": 8.712266959454628e-07, + "loss": 0.2, + "step": 4025 + }, + { + "epoch": 0.26, + "grad_norm": 1.4010945358686009, + "learning_rate": 8.711575050631821e-07, + "loss": 0.031, + "step": 4026 + }, + { + "epoch": 0.26, + "grad_norm": 0.5290640232322119, + "learning_rate": 8.71088298346576e-07, + "loss": 0.2232, + "step": 4027 + }, + { + "epoch": 0.26, + "grad_norm": 0.22726065289959244, + "learning_rate": 8.710190757985971e-07, + "loss": 0.0997, + "step": 4028 + }, + { + "epoch": 0.26, + "grad_norm": 1.5089060046635212, + "learning_rate": 8.709498374221987e-07, + "loss": 0.0809, + "step": 4029 + }, + { + "epoch": 0.26, + "grad_norm": 0.8505876302845143, + "learning_rate": 8.708805832203345e-07, + "loss": 0.1974, + "step": 4030 + }, + { + "epoch": 0.26, + "grad_norm": 1.5800888585351849, + "learning_rate": 8.708113131959591e-07, + "loss": 0.3174, + "step": 4031 + }, + { + "epoch": 0.26, + "grad_norm": 1.2127749732338962, + "learning_rate": 8.707420273520276e-07, + "loss": 0.2511, + "step": 4032 + }, + { + "epoch": 0.26, + "grad_norm": 0.9619904283254103, + "learning_rate": 8.706727256914959e-07, + "loss": 0.2079, + "step": 4033 + }, + { + "epoch": 0.26, + "grad_norm": 0.788683859357756, + "learning_rate": 8.706034082173205e-07, + "loss": 0.2252, + "step": 4034 + }, + { + "epoch": 0.26, + "grad_norm": 2.8666167015743347, + "learning_rate": 8.70534074932459e-07, + "loss": 0.2964, + "step": 4035 + }, + { + "epoch": 0.26, + "grad_norm": 0.8339818264528626, + "learning_rate": 8.704647258398688e-07, + "loss": 0.1802, + "step": 4036 + }, + { + "epoch": 0.26, + "grad_norm": 1.7500161946388975, + "learning_rate": 8.703953609425087e-07, + "loss": 0.234, + "step": 4037 + }, + { + "epoch": 0.26, + "grad_norm": 1.0373680778071752, + "learning_rate": 8.703259802433378e-07, + "loss": 0.0323, + "step": 4038 + }, + { + "epoch": 0.26, + "grad_norm": 0.7686844705049926, + "learning_rate": 8.702565837453163e-07, + "loss": 0.1601, + "step": 4039 + }, + { + "epoch": 0.26, + "grad_norm": 0.34216488586559624, + "learning_rate": 8.701871714514046e-07, + "loss": 0.0436, + "step": 4040 + }, + { + "epoch": 0.26, + "grad_norm": 0.7622924070033409, + "learning_rate": 8.701177433645639e-07, + "loss": 0.3274, + "step": 4041 + }, + { + "epoch": 0.26, + "grad_norm": 1.876673436189718, + "learning_rate": 8.700482994877563e-07, + "loss": 0.1475, + "step": 4042 + }, + { + "epoch": 0.26, + "grad_norm": 0.3006986467282531, + "learning_rate": 8.699788398239442e-07, + "loss": 0.0065, + "step": 4043 + }, + { + "epoch": 0.26, + "grad_norm": 0.35755714554411305, + "learning_rate": 8.699093643760913e-07, + "loss": 0.1083, + "step": 4044 + }, + { + "epoch": 0.26, + "grad_norm": 1.9828179439260103, + "learning_rate": 8.698398731471612e-07, + "loss": 0.1531, + "step": 4045 + }, + { + "epoch": 0.26, + "grad_norm": 0.39576422761612184, + "learning_rate": 8.697703661401185e-07, + "loss": 0.2006, + "step": 4046 + }, + { + "epoch": 0.26, + "grad_norm": 0.8437275424897863, + "learning_rate": 8.697008433579289e-07, + "loss": 0.2849, + "step": 4047 + }, + { + "epoch": 0.26, + "grad_norm": 1.7544173146891293, + "learning_rate": 8.69631304803558e-07, + "loss": 0.3707, + "step": 4048 + }, + { + "epoch": 0.26, + "grad_norm": 0.7168024231112956, + "learning_rate": 8.695617504799726e-07, + "loss": 0.1958, + "step": 4049 + }, + { + "epoch": 0.26, + "grad_norm": 0.19259379309971653, + "learning_rate": 8.694921803901401e-07, + "loss": 0.0996, + "step": 4050 + }, + { + "epoch": 0.26, + "grad_norm": 0.5194265600276591, + "learning_rate": 8.694225945370282e-07, + "loss": 0.0215, + "step": 4051 + }, + { + "epoch": 0.26, + "grad_norm": 0.6012828951597724, + "learning_rate": 8.693529929236058e-07, + "loss": 0.2555, + "step": 4052 + }, + { + "epoch": 0.26, + "grad_norm": 0.8690152717732422, + "learning_rate": 8.692833755528424e-07, + "loss": 0.0856, + "step": 4053 + }, + { + "epoch": 0.26, + "grad_norm": 1.3277992090830388, + "learning_rate": 8.692137424277079e-07, + "loss": 0.3708, + "step": 4054 + }, + { + "epoch": 0.26, + "grad_norm": 0.5039056616220217, + "learning_rate": 8.691440935511726e-07, + "loss": 0.3282, + "step": 4055 + }, + { + "epoch": 0.26, + "grad_norm": 0.42331557662746544, + "learning_rate": 8.690744289262086e-07, + "loss": 0.3823, + "step": 4056 + }, + { + "epoch": 0.26, + "grad_norm": 0.9142575813248571, + "learning_rate": 8.690047485557873e-07, + "loss": 0.2637, + "step": 4057 + }, + { + "epoch": 0.26, + "grad_norm": 3.7196566229620287, + "learning_rate": 8.689350524428817e-07, + "loss": 0.24, + "step": 4058 + }, + { + "epoch": 0.26, + "grad_norm": 1.2910743427622435, + "learning_rate": 8.688653405904651e-07, + "loss": 0.2829, + "step": 4059 + }, + { + "epoch": 0.26, + "grad_norm": 0.9113426193291279, + "learning_rate": 8.687956130015115e-07, + "loss": 0.2902, + "step": 4060 + }, + { + "epoch": 0.26, + "grad_norm": 0.5598130609352595, + "learning_rate": 8.687258696789957e-07, + "loss": 0.1831, + "step": 4061 + }, + { + "epoch": 0.26, + "grad_norm": 0.5193925121894146, + "learning_rate": 8.686561106258932e-07, + "loss": 0.0891, + "step": 4062 + }, + { + "epoch": 0.26, + "grad_norm": 0.6753090305556863, + "learning_rate": 8.685863358451797e-07, + "loss": 0.2076, + "step": 4063 + }, + { + "epoch": 0.26, + "grad_norm": 0.3933141989346814, + "learning_rate": 8.685165453398323e-07, + "loss": 0.1361, + "step": 4064 + }, + { + "epoch": 0.26, + "grad_norm": 0.9869841152758954, + "learning_rate": 8.684467391128283e-07, + "loss": 0.0818, + "step": 4065 + }, + { + "epoch": 0.26, + "grad_norm": 0.5617361231289953, + "learning_rate": 8.683769171671456e-07, + "loss": 0.0773, + "step": 4066 + }, + { + "epoch": 0.26, + "grad_norm": 0.9474663227051654, + "learning_rate": 8.683070795057632e-07, + "loss": 0.1526, + "step": 4067 + }, + { + "epoch": 0.26, + "grad_norm": 0.08787256262239636, + "learning_rate": 8.682372261316603e-07, + "loss": 0.0033, + "step": 4068 + }, + { + "epoch": 0.26, + "grad_norm": 0.5999741487371676, + "learning_rate": 8.681673570478172e-07, + "loss": 0.0251, + "step": 4069 + }, + { + "epoch": 0.26, + "grad_norm": 0.5186519952801143, + "learning_rate": 8.680974722572144e-07, + "loss": 0.1501, + "step": 4070 + }, + { + "epoch": 0.26, + "grad_norm": 0.4551390871962548, + "learning_rate": 8.680275717628336e-07, + "loss": 0.185, + "step": 4071 + }, + { + "epoch": 0.26, + "grad_norm": 1.3686354176626132, + "learning_rate": 8.679576555676566e-07, + "loss": 0.1716, + "step": 4072 + }, + { + "epoch": 0.26, + "grad_norm": 0.7757396827823145, + "learning_rate": 8.678877236746664e-07, + "loss": 0.1621, + "step": 4073 + }, + { + "epoch": 0.26, + "grad_norm": 0.8828172389719617, + "learning_rate": 8.678177760868466e-07, + "loss": 0.2308, + "step": 4074 + }, + { + "epoch": 0.26, + "grad_norm": 0.6983117036471638, + "learning_rate": 8.677478128071808e-07, + "loss": 0.2866, + "step": 4075 + }, + { + "epoch": 0.26, + "grad_norm": 0.38771195245275397, + "learning_rate": 8.676778338386541e-07, + "loss": 0.1309, + "step": 4076 + }, + { + "epoch": 0.26, + "grad_norm": 0.6801236831527018, + "learning_rate": 8.676078391842518e-07, + "loss": 0.2795, + "step": 4077 + }, + { + "epoch": 0.26, + "grad_norm": 1.5694146176378214, + "learning_rate": 8.675378288469603e-07, + "loss": 0.175, + "step": 4078 + }, + { + "epoch": 0.26, + "grad_norm": 1.6219395904385054, + "learning_rate": 8.674678028297659e-07, + "loss": 0.247, + "step": 4079 + }, + { + "epoch": 0.26, + "grad_norm": 0.34342221249927024, + "learning_rate": 8.673977611356566e-07, + "loss": 0.1954, + "step": 4080 + }, + { + "epoch": 0.26, + "grad_norm": 0.7600292127363575, + "learning_rate": 8.673277037676201e-07, + "loss": 0.3134, + "step": 4081 + }, + { + "epoch": 0.26, + "grad_norm": 0.3481644163119903, + "learning_rate": 8.672576307286455e-07, + "loss": 0.0081, + "step": 4082 + }, + { + "epoch": 0.26, + "grad_norm": 1.6503751638113178, + "learning_rate": 8.671875420217218e-07, + "loss": 0.0743, + "step": 4083 + }, + { + "epoch": 0.26, + "grad_norm": 1.1873273700527, + "learning_rate": 8.671174376498396e-07, + "loss": 0.3154, + "step": 4084 + }, + { + "epoch": 0.26, + "grad_norm": 0.8396698016655777, + "learning_rate": 8.670473176159896e-07, + "loss": 0.0073, + "step": 4085 + }, + { + "epoch": 0.26, + "grad_norm": 0.6112612695587594, + "learning_rate": 8.66977181923163e-07, + "loss": 0.3248, + "step": 4086 + }, + { + "epoch": 0.26, + "grad_norm": 0.7350065130972876, + "learning_rate": 8.669070305743522e-07, + "loss": 0.1309, + "step": 4087 + }, + { + "epoch": 0.26, + "grad_norm": 1.0898361003584103, + "learning_rate": 8.668368635725498e-07, + "loss": 0.2625, + "step": 4088 + }, + { + "epoch": 0.26, + "grad_norm": 0.5157372002681769, + "learning_rate": 8.667666809207494e-07, + "loss": 0.1469, + "step": 4089 + }, + { + "epoch": 0.26, + "grad_norm": 0.7806264136291773, + "learning_rate": 8.666964826219452e-07, + "loss": 0.221, + "step": 4090 + }, + { + "epoch": 0.26, + "grad_norm": 0.913072198088876, + "learning_rate": 8.666262686791317e-07, + "loss": 0.4681, + "step": 4091 + }, + { + "epoch": 0.26, + "grad_norm": 0.764650493860681, + "learning_rate": 8.665560390953048e-07, + "loss": 0.4531, + "step": 4092 + }, + { + "epoch": 0.26, + "grad_norm": 0.8470516436514497, + "learning_rate": 8.664857938734603e-07, + "loss": 0.4609, + "step": 4093 + }, + { + "epoch": 0.26, + "grad_norm": 2.3627619097956996, + "learning_rate": 8.664155330165951e-07, + "loss": 0.315, + "step": 4094 + }, + { + "epoch": 0.26, + "grad_norm": 0.9197571070088587, + "learning_rate": 8.663452565277066e-07, + "loss": 0.4106, + "step": 4095 + }, + { + "epoch": 0.26, + "grad_norm": 0.4429333737292287, + "learning_rate": 8.66274964409793e-07, + "loss": 0.1465, + "step": 4096 + }, + { + "epoch": 0.26, + "grad_norm": 1.3301824337049717, + "learning_rate": 8.662046566658534e-07, + "loss": 0.155, + "step": 4097 + }, + { + "epoch": 0.26, + "grad_norm": 0.8237105848216212, + "learning_rate": 8.661343332988868e-07, + "loss": 0.3154, + "step": 4098 + }, + { + "epoch": 0.26, + "grad_norm": 0.8642760805583275, + "learning_rate": 8.660639943118935e-07, + "loss": 0.2097, + "step": 4099 + }, + { + "epoch": 0.26, + "grad_norm": 0.9988221169075769, + "learning_rate": 8.659936397078742e-07, + "loss": 0.1076, + "step": 4100 + }, + { + "epoch": 0.26, + "grad_norm": 1.08165401706748, + "learning_rate": 8.659232694898306e-07, + "loss": 0.1478, + "step": 4101 + }, + { + "epoch": 0.26, + "grad_norm": 0.9392110429916637, + "learning_rate": 8.658528836607648e-07, + "loss": 0.2727, + "step": 4102 + }, + { + "epoch": 0.26, + "grad_norm": 0.6255102925376163, + "learning_rate": 8.657824822236794e-07, + "loss": 0.1134, + "step": 4103 + }, + { + "epoch": 0.26, + "grad_norm": 1.531671919199018, + "learning_rate": 8.657120651815781e-07, + "loss": 0.097, + "step": 4104 + }, + { + "epoch": 0.26, + "grad_norm": 1.0349825853821242, + "learning_rate": 8.656416325374649e-07, + "loss": 0.248, + "step": 4105 + }, + { + "epoch": 0.26, + "grad_norm": 1.41708388544839, + "learning_rate": 8.655711842943446e-07, + "loss": 0.2295, + "step": 4106 + }, + { + "epoch": 0.26, + "grad_norm": 1.31534042476054, + "learning_rate": 8.655007204552227e-07, + "loss": 0.3639, + "step": 4107 + }, + { + "epoch": 0.26, + "grad_norm": 0.40837283380562023, + "learning_rate": 8.654302410231054e-07, + "loss": 0.2332, + "step": 4108 + }, + { + "epoch": 0.26, + "grad_norm": 0.30675613563980453, + "learning_rate": 8.653597460009993e-07, + "loss": 0.0117, + "step": 4109 + }, + { + "epoch": 0.26, + "grad_norm": 0.5425657633918686, + "learning_rate": 8.652892353919119e-07, + "loss": 0.0441, + "step": 4110 + }, + { + "epoch": 0.26, + "grad_norm": 0.4927982659656652, + "learning_rate": 8.652187091988516e-07, + "loss": 0.1758, + "step": 4111 + }, + { + "epoch": 0.26, + "grad_norm": 0.8854424625965419, + "learning_rate": 8.651481674248267e-07, + "loss": 0.4199, + "step": 4112 + }, + { + "epoch": 0.26, + "grad_norm": 0.7812687945958526, + "learning_rate": 8.650776100728471e-07, + "loss": 0.1715, + "step": 4113 + }, + { + "epoch": 0.26, + "grad_norm": 2.0582769701423547, + "learning_rate": 8.650070371459228e-07, + "loss": 0.0757, + "step": 4114 + }, + { + "epoch": 0.26, + "grad_norm": 0.3886860276188469, + "learning_rate": 8.649364486470646e-07, + "loss": 0.1217, + "step": 4115 + }, + { + "epoch": 0.26, + "grad_norm": 0.7357733802014849, + "learning_rate": 8.648658445792838e-07, + "loss": 0.3296, + "step": 4116 + }, + { + "epoch": 0.26, + "grad_norm": 3.7205992295519654, + "learning_rate": 8.647952249455925e-07, + "loss": 0.1298, + "step": 4117 + }, + { + "epoch": 0.26, + "grad_norm": 1.2698102001347191, + "learning_rate": 8.647245897490036e-07, + "loss": 0.0348, + "step": 4118 + }, + { + "epoch": 0.26, + "grad_norm": 1.0515583970453326, + "learning_rate": 8.646539389925307e-07, + "loss": 0.1677, + "step": 4119 + }, + { + "epoch": 0.26, + "grad_norm": 0.5393363410624852, + "learning_rate": 8.645832726791876e-07, + "loss": 0.1455, + "step": 4120 + }, + { + "epoch": 0.26, + "grad_norm": 0.7251985075075046, + "learning_rate": 8.645125908119892e-07, + "loss": 0.1008, + "step": 4121 + }, + { + "epoch": 0.26, + "grad_norm": 0.8636367474351939, + "learning_rate": 8.644418933939508e-07, + "loss": 0.3264, + "step": 4122 + }, + { + "epoch": 0.26, + "grad_norm": 0.6678994209994875, + "learning_rate": 8.643711804280888e-07, + "loss": 0.2842, + "step": 4123 + }, + { + "epoch": 0.26, + "grad_norm": 0.4542562535018774, + "learning_rate": 8.643004519174197e-07, + "loss": 0.078, + "step": 4124 + }, + { + "epoch": 0.26, + "grad_norm": 0.569213117890287, + "learning_rate": 8.642297078649608e-07, + "loss": 0.2563, + "step": 4125 + }, + { + "epoch": 0.26, + "grad_norm": 0.886986849481526, + "learning_rate": 8.641589482737306e-07, + "loss": 0.1211, + "step": 4126 + }, + { + "epoch": 0.26, + "grad_norm": 1.3409962692945223, + "learning_rate": 8.640881731467475e-07, + "loss": 0.1253, + "step": 4127 + }, + { + "epoch": 0.26, + "grad_norm": 1.0903431349800219, + "learning_rate": 8.640173824870311e-07, + "loss": 0.0725, + "step": 4128 + }, + { + "epoch": 0.26, + "grad_norm": 0.9233903438867485, + "learning_rate": 8.639465762976013e-07, + "loss": 0.2513, + "step": 4129 + }, + { + "epoch": 0.26, + "grad_norm": 0.731236143951161, + "learning_rate": 8.638757545814789e-07, + "loss": 0.0069, + "step": 4130 + }, + { + "epoch": 0.26, + "grad_norm": 0.9535380261485513, + "learning_rate": 8.638049173416855e-07, + "loss": 0.2661, + "step": 4131 + }, + { + "epoch": 0.26, + "grad_norm": 1.341231042585357, + "learning_rate": 8.637340645812429e-07, + "loss": 0.1597, + "step": 4132 + }, + { + "epoch": 0.26, + "grad_norm": 0.8995777054348525, + "learning_rate": 8.636631963031739e-07, + "loss": 0.2024, + "step": 4133 + }, + { + "epoch": 0.26, + "grad_norm": 0.16401430139744694, + "learning_rate": 8.635923125105018e-07, + "loss": 0.0235, + "step": 4134 + }, + { + "epoch": 0.26, + "grad_norm": 1.843344059109508, + "learning_rate": 8.63521413206251e-07, + "loss": 0.0331, + "step": 4135 + }, + { + "epoch": 0.26, + "grad_norm": 0.6263545356874659, + "learning_rate": 8.634504983934456e-07, + "loss": 0.3176, + "step": 4136 + }, + { + "epoch": 0.26, + "grad_norm": 0.4808943140162613, + "learning_rate": 8.633795680751116e-07, + "loss": 0.1106, + "step": 4137 + }, + { + "epoch": 0.26, + "grad_norm": 1.6112905735795604, + "learning_rate": 8.633086222542746e-07, + "loss": 0.1585, + "step": 4138 + }, + { + "epoch": 0.26, + "grad_norm": 0.5330909187707842, + "learning_rate": 8.632376609339615e-07, + "loss": 0.0261, + "step": 4139 + }, + { + "epoch": 0.26, + "grad_norm": 0.5105147410937386, + "learning_rate": 8.631666841171995e-07, + "loss": 0.225, + "step": 4140 + }, + { + "epoch": 0.26, + "grad_norm": 0.4487846225878657, + "learning_rate": 8.630956918070167e-07, + "loss": 0.022, + "step": 4141 + }, + { + "epoch": 0.26, + "grad_norm": 0.8339169275027136, + "learning_rate": 8.630246840064419e-07, + "loss": 0.2695, + "step": 4142 + }, + { + "epoch": 0.26, + "grad_norm": 1.0898084150738283, + "learning_rate": 8.629536607185041e-07, + "loss": 0.3548, + "step": 4143 + }, + { + "epoch": 0.26, + "grad_norm": 1.2029661526671163, + "learning_rate": 8.628826219462337e-07, + "loss": 0.0799, + "step": 4144 + }, + { + "epoch": 0.26, + "grad_norm": 0.9993333429873376, + "learning_rate": 8.62811567692661e-07, + "loss": 0.3005, + "step": 4145 + }, + { + "epoch": 0.26, + "grad_norm": 0.4544022280862907, + "learning_rate": 8.627404979608177e-07, + "loss": 0.1412, + "step": 4146 + }, + { + "epoch": 0.26, + "grad_norm": 2.992980608984845, + "learning_rate": 8.626694127537354e-07, + "loss": 0.1137, + "step": 4147 + }, + { + "epoch": 0.26, + "grad_norm": 1.205693774665616, + "learning_rate": 8.625983120744469e-07, + "loss": 0.1241, + "step": 4148 + }, + { + "epoch": 0.26, + "grad_norm": 1.1628846331352964, + "learning_rate": 8.625271959259856e-07, + "loss": 0.2407, + "step": 4149 + }, + { + "epoch": 0.26, + "grad_norm": 2.680904559539922, + "learning_rate": 8.624560643113852e-07, + "loss": 0.3343, + "step": 4150 + }, + { + "epoch": 0.26, + "grad_norm": 1.0543011777212534, + "learning_rate": 8.623849172336805e-07, + "loss": 0.1855, + "step": 4151 + }, + { + "epoch": 0.26, + "grad_norm": 0.8079673985740236, + "learning_rate": 8.623137546959068e-07, + "loss": 0.3205, + "step": 4152 + }, + { + "epoch": 0.26, + "grad_norm": 0.8083249109115577, + "learning_rate": 8.622425767011e-07, + "loss": 0.2182, + "step": 4153 + }, + { + "epoch": 0.26, + "grad_norm": 0.5073675655689701, + "learning_rate": 8.621713832522966e-07, + "loss": 0.2256, + "step": 4154 + }, + { + "epoch": 0.26, + "grad_norm": 0.6951868169192322, + "learning_rate": 8.62100174352534e-07, + "loss": 0.2103, + "step": 4155 + }, + { + "epoch": 0.27, + "grad_norm": 0.46143325777616173, + "learning_rate": 8.620289500048501e-07, + "loss": 0.0863, + "step": 4156 + }, + { + "epoch": 0.27, + "grad_norm": 0.8654396278114236, + "learning_rate": 8.619577102122833e-07, + "loss": 0.21, + "step": 4157 + }, + { + "epoch": 0.27, + "grad_norm": 4.844883718788873, + "learning_rate": 8.61886454977873e-07, + "loss": 0.1231, + "step": 4158 + }, + { + "epoch": 0.27, + "grad_norm": 0.9131056581684664, + "learning_rate": 8.618151843046591e-07, + "loss": 0.2919, + "step": 4159 + }, + { + "epoch": 0.27, + "grad_norm": 0.7942533845691214, + "learning_rate": 8.617438981956819e-07, + "loss": 0.1762, + "step": 4160 + }, + { + "epoch": 0.27, + "grad_norm": 0.5094814576754809, + "learning_rate": 8.616725966539831e-07, + "loss": 0.1071, + "step": 4161 + }, + { + "epoch": 0.27, + "grad_norm": 0.3741130726984353, + "learning_rate": 8.616012796826041e-07, + "loss": 0.1949, + "step": 4162 + }, + { + "epoch": 0.27, + "grad_norm": 0.2950232822520885, + "learning_rate": 8.615299472845876e-07, + "loss": 0.1617, + "step": 4163 + }, + { + "epoch": 0.27, + "grad_norm": 1.1629432705817841, + "learning_rate": 8.614585994629769e-07, + "loss": 0.072, + "step": 4164 + }, + { + "epoch": 0.27, + "grad_norm": 0.43310283494739926, + "learning_rate": 8.613872362208157e-07, + "loss": 0.2264, + "step": 4165 + }, + { + "epoch": 0.27, + "grad_norm": 0.53920155113599, + "learning_rate": 8.613158575611484e-07, + "loss": 0.1356, + "step": 4166 + }, + { + "epoch": 0.27, + "grad_norm": 0.588776252123757, + "learning_rate": 8.612444634870204e-07, + "loss": 0.1094, + "step": 4167 + }, + { + "epoch": 0.27, + "grad_norm": 1.255630474149303, + "learning_rate": 8.611730540014772e-07, + "loss": 0.125, + "step": 4168 + }, + { + "epoch": 0.27, + "grad_norm": 0.1919129974843073, + "learning_rate": 8.611016291075656e-07, + "loss": 0.1151, + "step": 4169 + }, + { + "epoch": 0.27, + "grad_norm": 0.23080007066420352, + "learning_rate": 8.610301888083327e-07, + "loss": 0.129, + "step": 4170 + }, + { + "epoch": 0.27, + "grad_norm": 0.6946707730211346, + "learning_rate": 8.60958733106826e-07, + "loss": 0.591, + "step": 4171 + }, + { + "epoch": 0.27, + "grad_norm": 0.45516192914906256, + "learning_rate": 8.608872620060943e-07, + "loss": 0.1313, + "step": 4172 + }, + { + "epoch": 0.27, + "grad_norm": 0.8978727104176235, + "learning_rate": 8.608157755091864e-07, + "loss": 0.4418, + "step": 4173 + }, + { + "epoch": 0.27, + "grad_norm": 1.8782263943138546, + "learning_rate": 8.607442736191521e-07, + "loss": 0.2521, + "step": 4174 + }, + { + "epoch": 0.27, + "grad_norm": 0.8028398729137047, + "learning_rate": 8.606727563390421e-07, + "loss": 0.188, + "step": 4175 + }, + { + "epoch": 0.27, + "grad_norm": 0.21838660358954995, + "learning_rate": 8.606012236719073e-07, + "loss": 0.1483, + "step": 4176 + }, + { + "epoch": 0.27, + "grad_norm": 1.1902942487618677, + "learning_rate": 8.605296756207993e-07, + "loss": 0.2007, + "step": 4177 + }, + { + "epoch": 0.27, + "grad_norm": 0.28907669710924344, + "learning_rate": 8.604581121887706e-07, + "loss": 0.1095, + "step": 4178 + }, + { + "epoch": 0.27, + "grad_norm": 1.318027359896157, + "learning_rate": 8.60386533378874e-07, + "loss": 0.3918, + "step": 4179 + }, + { + "epoch": 0.27, + "grad_norm": 1.751457980254541, + "learning_rate": 8.603149391941638e-07, + "loss": 0.2503, + "step": 4180 + }, + { + "epoch": 0.27, + "grad_norm": 0.5886811753906173, + "learning_rate": 8.602433296376937e-07, + "loss": 0.348, + "step": 4181 + }, + { + "epoch": 0.27, + "grad_norm": 1.5060503883283605, + "learning_rate": 8.601717047125191e-07, + "loss": 0.1095, + "step": 4182 + }, + { + "epoch": 0.27, + "grad_norm": 1.2121079761848257, + "learning_rate": 8.601000644216955e-07, + "loss": 0.223, + "step": 4183 + }, + { + "epoch": 0.27, + "grad_norm": 1.4177256732661798, + "learning_rate": 8.600284087682792e-07, + "loss": 0.4123, + "step": 4184 + }, + { + "epoch": 0.27, + "grad_norm": 1.930620527742801, + "learning_rate": 8.599567377553274e-07, + "loss": 0.1856, + "step": 4185 + }, + { + "epoch": 0.27, + "grad_norm": 1.5694074812344485, + "learning_rate": 8.598850513858975e-07, + "loss": 0.1965, + "step": 4186 + }, + { + "epoch": 0.27, + "grad_norm": 1.343892461363804, + "learning_rate": 8.598133496630477e-07, + "loss": 0.1279, + "step": 4187 + }, + { + "epoch": 0.27, + "grad_norm": 0.5437361057509675, + "learning_rate": 8.597416325898373e-07, + "loss": 0.0913, + "step": 4188 + }, + { + "epoch": 0.27, + "grad_norm": 1.159113119356025, + "learning_rate": 8.596699001693255e-07, + "loss": 0.6262, + "step": 4189 + }, + { + "epoch": 0.27, + "grad_norm": 0.4849247105383991, + "learning_rate": 8.595981524045729e-07, + "loss": 0.2941, + "step": 4190 + }, + { + "epoch": 0.27, + "grad_norm": 2.164368321820298, + "learning_rate": 8.595263892986402e-07, + "loss": 0.1936, + "step": 4191 + }, + { + "epoch": 0.27, + "grad_norm": 1.2799470303055762, + "learning_rate": 8.594546108545888e-07, + "loss": 0.4101, + "step": 4192 + }, + { + "epoch": 0.27, + "grad_norm": 1.0023004339422021, + "learning_rate": 8.593828170754813e-07, + "loss": 0.1801, + "step": 4193 + }, + { + "epoch": 0.27, + "grad_norm": 1.4085905230365117, + "learning_rate": 8.593110079643803e-07, + "loss": 0.2139, + "step": 4194 + }, + { + "epoch": 0.27, + "grad_norm": 1.5349107399844006, + "learning_rate": 8.592391835243495e-07, + "loss": 0.4644, + "step": 4195 + }, + { + "epoch": 0.27, + "grad_norm": 0.6752446175755001, + "learning_rate": 8.591673437584528e-07, + "loss": 0.0281, + "step": 4196 + }, + { + "epoch": 0.27, + "grad_norm": 0.18305378645340323, + "learning_rate": 8.590954886697553e-07, + "loss": 0.0904, + "step": 4197 + }, + { + "epoch": 0.27, + "grad_norm": 1.26359003237301, + "learning_rate": 8.590236182613224e-07, + "loss": 0.0882, + "step": 4198 + }, + { + "epoch": 0.27, + "grad_norm": 0.38274271577641134, + "learning_rate": 8.589517325362201e-07, + "loss": 0.0959, + "step": 4199 + }, + { + "epoch": 0.27, + "grad_norm": 0.41296319207016935, + "learning_rate": 8.588798314975154e-07, + "loss": 0.1613, + "step": 4200 + }, + { + "epoch": 0.27, + "grad_norm": 1.2904919767391199, + "learning_rate": 8.588079151482756e-07, + "loss": 0.1069, + "step": 4201 + }, + { + "epoch": 0.27, + "grad_norm": 1.5138270595551349, + "learning_rate": 8.58735983491569e-07, + "loss": 0.2439, + "step": 4202 + }, + { + "epoch": 0.27, + "grad_norm": 2.0077304062080574, + "learning_rate": 8.58664036530464e-07, + "loss": 0.2434, + "step": 4203 + }, + { + "epoch": 0.27, + "grad_norm": 0.81302314365293, + "learning_rate": 8.585920742680304e-07, + "loss": 0.4034, + "step": 4204 + }, + { + "epoch": 0.27, + "grad_norm": 3.4243340845075125, + "learning_rate": 8.585200967073378e-07, + "loss": 0.2181, + "step": 4205 + }, + { + "epoch": 0.27, + "grad_norm": 1.412047738650341, + "learning_rate": 8.584481038514572e-07, + "loss": 0.1865, + "step": 4206 + }, + { + "epoch": 0.27, + "grad_norm": 0.7237220104875748, + "learning_rate": 8.583760957034601e-07, + "loss": 0.3277, + "step": 4207 + }, + { + "epoch": 0.27, + "grad_norm": 0.7640779684273645, + "learning_rate": 8.583040722664183e-07, + "loss": 0.0173, + "step": 4208 + }, + { + "epoch": 0.27, + "grad_norm": 0.6214385840338672, + "learning_rate": 8.582320335434045e-07, + "loss": 0.2223, + "step": 4209 + }, + { + "epoch": 0.27, + "grad_norm": 0.9811865311502546, + "learning_rate": 8.581599795374918e-07, + "loss": 0.1144, + "step": 4210 + }, + { + "epoch": 0.27, + "grad_norm": 0.2670950809825193, + "learning_rate": 8.580879102517547e-07, + "loss": 0.1149, + "step": 4211 + }, + { + "epoch": 0.27, + "grad_norm": 1.1956863785449852, + "learning_rate": 8.580158256892672e-07, + "loss": 0.3023, + "step": 4212 + }, + { + "epoch": 0.27, + "grad_norm": 1.2071355650323525, + "learning_rate": 8.57943725853105e-07, + "loss": 0.4585, + "step": 4213 + }, + { + "epoch": 0.27, + "grad_norm": 0.6563889522905176, + "learning_rate": 8.578716107463439e-07, + "loss": 0.2213, + "step": 4214 + }, + { + "epoch": 0.27, + "grad_norm": 1.215855606348245, + "learning_rate": 8.577994803720605e-07, + "loss": 0.1861, + "step": 4215 + }, + { + "epoch": 0.27, + "grad_norm": 0.8469151423347601, + "learning_rate": 8.57727334733332e-07, + "loss": 0.2754, + "step": 4216 + }, + { + "epoch": 0.27, + "grad_norm": 0.9923423963328847, + "learning_rate": 8.576551738332362e-07, + "loss": 0.2037, + "step": 4217 + }, + { + "epoch": 0.27, + "grad_norm": 1.971817001544454, + "learning_rate": 8.575829976748519e-07, + "loss": 0.1913, + "step": 4218 + }, + { + "epoch": 0.27, + "grad_norm": 1.8981390336960575, + "learning_rate": 8.575108062612579e-07, + "loss": 0.1501, + "step": 4219 + }, + { + "epoch": 0.27, + "grad_norm": 1.2470736773943565, + "learning_rate": 8.574385995955342e-07, + "loss": 0.2306, + "step": 4220 + }, + { + "epoch": 0.27, + "grad_norm": 0.8319293764527479, + "learning_rate": 8.573663776807614e-07, + "loss": 0.318, + "step": 4221 + }, + { + "epoch": 0.27, + "grad_norm": 2.5835820336388515, + "learning_rate": 8.572941405200205e-07, + "loss": 0.2691, + "step": 4222 + }, + { + "epoch": 0.27, + "grad_norm": 1.0016540452258798, + "learning_rate": 8.572218881163932e-07, + "loss": 0.4274, + "step": 4223 + }, + { + "epoch": 0.27, + "grad_norm": 5.360160874124541, + "learning_rate": 8.571496204729623e-07, + "loss": 0.0397, + "step": 4224 + }, + { + "epoch": 0.27, + "grad_norm": 0.42308800478398084, + "learning_rate": 8.570773375928104e-07, + "loss": 0.2743, + "step": 4225 + }, + { + "epoch": 0.27, + "grad_norm": 2.8029133247581695, + "learning_rate": 8.570050394790216e-07, + "loss": 0.4695, + "step": 4226 + }, + { + "epoch": 0.27, + "grad_norm": 0.9014662793121879, + "learning_rate": 8.569327261346801e-07, + "loss": 0.2776, + "step": 4227 + }, + { + "epoch": 0.27, + "grad_norm": 0.420264671288587, + "learning_rate": 8.568603975628709e-07, + "loss": 0.2015, + "step": 4228 + }, + { + "epoch": 0.27, + "grad_norm": 1.1059528845144877, + "learning_rate": 8.567880537666799e-07, + "loss": 0.2098, + "step": 4229 + }, + { + "epoch": 0.27, + "grad_norm": 1.256259511897296, + "learning_rate": 8.567156947491932e-07, + "loss": 0.2977, + "step": 4230 + }, + { + "epoch": 0.27, + "grad_norm": 0.23574349099195155, + "learning_rate": 8.56643320513498e-07, + "loss": 0.0856, + "step": 4231 + }, + { + "epoch": 0.27, + "grad_norm": 0.341729503021465, + "learning_rate": 8.565709310626816e-07, + "loss": 0.1234, + "step": 4232 + }, + { + "epoch": 0.27, + "grad_norm": 1.454600428281282, + "learning_rate": 8.564985263998325e-07, + "loss": 0.2918, + "step": 4233 + }, + { + "epoch": 0.27, + "grad_norm": 0.7518635243267678, + "learning_rate": 8.564261065280398e-07, + "loss": 0.0767, + "step": 4234 + }, + { + "epoch": 0.27, + "grad_norm": 0.8384725035293839, + "learning_rate": 8.563536714503928e-07, + "loss": 0.1559, + "step": 4235 + }, + { + "epoch": 0.27, + "grad_norm": 0.800838341046286, + "learning_rate": 8.562812211699816e-07, + "loss": 0.1743, + "step": 4236 + }, + { + "epoch": 0.27, + "grad_norm": 0.8013847848128576, + "learning_rate": 8.562087556898976e-07, + "loss": 0.2045, + "step": 4237 + }, + { + "epoch": 0.27, + "grad_norm": 0.31272994473471644, + "learning_rate": 8.56136275013232e-07, + "loss": 0.2826, + "step": 4238 + }, + { + "epoch": 0.27, + "grad_norm": 0.49798282015021056, + "learning_rate": 8.560637791430768e-07, + "loss": 0.1582, + "step": 4239 + }, + { + "epoch": 0.27, + "grad_norm": 1.621320727025723, + "learning_rate": 8.559912680825252e-07, + "loss": 0.3903, + "step": 4240 + }, + { + "epoch": 0.27, + "grad_norm": 0.28427224685192864, + "learning_rate": 8.559187418346702e-07, + "loss": 0.166, + "step": 4241 + }, + { + "epoch": 0.27, + "grad_norm": 0.9318865664038639, + "learning_rate": 8.558462004026063e-07, + "loss": 0.2234, + "step": 4242 + }, + { + "epoch": 0.27, + "grad_norm": 0.4894180794341328, + "learning_rate": 8.557736437894283e-07, + "loss": 0.1665, + "step": 4243 + }, + { + "epoch": 0.27, + "grad_norm": 1.5977000699130905, + "learning_rate": 8.557010719982314e-07, + "loss": 0.3489, + "step": 4244 + }, + { + "epoch": 0.27, + "grad_norm": 1.8276768852244196, + "learning_rate": 8.556284850321116e-07, + "loss": 0.4377, + "step": 4245 + }, + { + "epoch": 0.27, + "grad_norm": 0.5917687601179266, + "learning_rate": 8.555558828941658e-07, + "loss": 0.1854, + "step": 4246 + }, + { + "epoch": 0.27, + "grad_norm": 1.0923027668181713, + "learning_rate": 8.554832655874913e-07, + "loss": 0.1182, + "step": 4247 + }, + { + "epoch": 0.27, + "grad_norm": 0.9908405100430573, + "learning_rate": 8.554106331151861e-07, + "loss": 0.1269, + "step": 4248 + }, + { + "epoch": 0.27, + "grad_norm": 0.9048833651868197, + "learning_rate": 8.553379854803488e-07, + "loss": 0.196, + "step": 4249 + }, + { + "epoch": 0.27, + "grad_norm": 7.05747604125339, + "learning_rate": 8.552653226860787e-07, + "loss": 0.1749, + "step": 4250 + }, + { + "epoch": 0.27, + "grad_norm": 0.6215436704290188, + "learning_rate": 8.551926447354758e-07, + "loss": 0.1114, + "step": 4251 + }, + { + "epoch": 0.27, + "grad_norm": 0.5683414048674594, + "learning_rate": 8.551199516316407e-07, + "loss": 0.3162, + "step": 4252 + }, + { + "epoch": 0.27, + "grad_norm": 0.5778537672359699, + "learning_rate": 8.550472433776744e-07, + "loss": 0.1325, + "step": 4253 + }, + { + "epoch": 0.27, + "grad_norm": 0.3504835916634373, + "learning_rate": 8.549745199766791e-07, + "loss": 0.1099, + "step": 4254 + }, + { + "epoch": 0.27, + "grad_norm": 0.18648009972585985, + "learning_rate": 8.549017814317572e-07, + "loss": 0.0076, + "step": 4255 + }, + { + "epoch": 0.27, + "grad_norm": 0.7118260180527443, + "learning_rate": 8.548290277460118e-07, + "loss": 0.4571, + "step": 4256 + }, + { + "epoch": 0.27, + "grad_norm": 2.7519858159781565, + "learning_rate": 8.547562589225468e-07, + "loss": 0.2112, + "step": 4257 + }, + { + "epoch": 0.27, + "grad_norm": 1.2938195918426418, + "learning_rate": 8.546834749644666e-07, + "loss": 0.2733, + "step": 4258 + }, + { + "epoch": 0.27, + "grad_norm": 0.3827592266894607, + "learning_rate": 8.546106758748763e-07, + "loss": 0.0178, + "step": 4259 + }, + { + "epoch": 0.27, + "grad_norm": 0.8115191964053644, + "learning_rate": 8.545378616568819e-07, + "loss": 0.1191, + "step": 4260 + }, + { + "epoch": 0.27, + "grad_norm": 0.756630858578341, + "learning_rate": 8.544650323135895e-07, + "loss": 0.007, + "step": 4261 + }, + { + "epoch": 0.27, + "grad_norm": 0.8229417280241946, + "learning_rate": 8.543921878481063e-07, + "loss": 0.2933, + "step": 4262 + }, + { + "epoch": 0.27, + "grad_norm": 1.1633109845165468, + "learning_rate": 8.543193282635399e-07, + "loss": 0.209, + "step": 4263 + }, + { + "epoch": 0.27, + "grad_norm": 0.7959031999077241, + "learning_rate": 8.542464535629987e-07, + "loss": 0.0123, + "step": 4264 + }, + { + "epoch": 0.27, + "grad_norm": 0.32068562583454563, + "learning_rate": 8.541735637495917e-07, + "loss": 0.161, + "step": 4265 + }, + { + "epoch": 0.27, + "grad_norm": 3.1121281956030695, + "learning_rate": 8.541006588264286e-07, + "loss": 0.1259, + "step": 4266 + }, + { + "epoch": 0.27, + "grad_norm": 0.7871506502797747, + "learning_rate": 8.540277387966193e-07, + "loss": 0.1888, + "step": 4267 + }, + { + "epoch": 0.27, + "grad_norm": 0.3010533321259848, + "learning_rate": 8.539548036632751e-07, + "loss": 0.1007, + "step": 4268 + }, + { + "epoch": 0.27, + "grad_norm": 0.6552887264914901, + "learning_rate": 8.538818534295075e-07, + "loss": 0.0432, + "step": 4269 + }, + { + "epoch": 0.27, + "grad_norm": 1.4525615232710638, + "learning_rate": 8.538088880984285e-07, + "loss": 0.1468, + "step": 4270 + }, + { + "epoch": 0.27, + "grad_norm": 1.7399268411262727, + "learning_rate": 8.537359076731512e-07, + "loss": 0.1885, + "step": 4271 + }, + { + "epoch": 0.27, + "grad_norm": 0.3202130344918199, + "learning_rate": 8.53662912156789e-07, + "loss": 0.0391, + "step": 4272 + }, + { + "epoch": 0.27, + "grad_norm": 0.5685864471737712, + "learning_rate": 8.535899015524559e-07, + "loss": 0.1587, + "step": 4273 + }, + { + "epoch": 0.27, + "grad_norm": 0.6777539456984496, + "learning_rate": 8.535168758632669e-07, + "loss": 0.1231, + "step": 4274 + }, + { + "epoch": 0.27, + "grad_norm": 0.2755821239711442, + "learning_rate": 8.534438350923373e-07, + "loss": 0.1061, + "step": 4275 + }, + { + "epoch": 0.27, + "grad_norm": 0.5404488903288633, + "learning_rate": 8.533707792427831e-07, + "loss": 0.2662, + "step": 4276 + }, + { + "epoch": 0.27, + "grad_norm": 2.6068717340366714, + "learning_rate": 8.532977083177213e-07, + "loss": 0.2832, + "step": 4277 + }, + { + "epoch": 0.27, + "grad_norm": 0.6639554778520406, + "learning_rate": 8.532246223202688e-07, + "loss": 0.1044, + "step": 4278 + }, + { + "epoch": 0.27, + "grad_norm": 0.1836937541968538, + "learning_rate": 8.53151521253544e-07, + "loss": 0.1085, + "step": 4279 + }, + { + "epoch": 0.27, + "grad_norm": 0.4925808917133856, + "learning_rate": 8.530784051206653e-07, + "loss": 0.1829, + "step": 4280 + }, + { + "epoch": 0.27, + "grad_norm": 0.7013712454806474, + "learning_rate": 8.530052739247521e-07, + "loss": 0.1747, + "step": 4281 + }, + { + "epoch": 0.27, + "grad_norm": 0.36553437198870564, + "learning_rate": 8.529321276689244e-07, + "loss": 0.218, + "step": 4282 + }, + { + "epoch": 0.27, + "grad_norm": 1.1997752663695453, + "learning_rate": 8.528589663563024e-07, + "loss": 0.224, + "step": 4283 + }, + { + "epoch": 0.27, + "grad_norm": 1.7877892821714088, + "learning_rate": 8.527857899900077e-07, + "loss": 0.0509, + "step": 4284 + }, + { + "epoch": 0.27, + "grad_norm": 0.5189093467424057, + "learning_rate": 8.527125985731621e-07, + "loss": 0.1515, + "step": 4285 + }, + { + "epoch": 0.27, + "grad_norm": 0.794490678121487, + "learning_rate": 8.526393921088877e-07, + "loss": 0.0195, + "step": 4286 + }, + { + "epoch": 0.27, + "grad_norm": 0.9463137408634047, + "learning_rate": 8.525661706003082e-07, + "loss": 0.1803, + "step": 4287 + }, + { + "epoch": 0.27, + "grad_norm": 1.733742604782018, + "learning_rate": 8.524929340505471e-07, + "loss": 0.3151, + "step": 4288 + }, + { + "epoch": 0.27, + "grad_norm": 0.888426562772552, + "learning_rate": 8.524196824627288e-07, + "loss": 0.3374, + "step": 4289 + }, + { + "epoch": 0.27, + "grad_norm": 0.5635952352373081, + "learning_rate": 8.523464158399783e-07, + "loss": 0.0777, + "step": 4290 + }, + { + "epoch": 0.27, + "grad_norm": 0.9068369267471855, + "learning_rate": 8.522731341854215e-07, + "loss": 0.1167, + "step": 4291 + }, + { + "epoch": 0.27, + "grad_norm": 0.16553678920009054, + "learning_rate": 8.521998375021846e-07, + "loss": 0.0062, + "step": 4292 + }, + { + "epoch": 0.27, + "grad_norm": 0.751025825975747, + "learning_rate": 8.521265257933947e-07, + "loss": 0.2014, + "step": 4293 + }, + { + "epoch": 0.27, + "grad_norm": 1.5086890057313618, + "learning_rate": 8.520531990621792e-07, + "loss": 0.0529, + "step": 4294 + }, + { + "epoch": 0.27, + "grad_norm": 0.5195701383131921, + "learning_rate": 8.519798573116666e-07, + "loss": 0.1368, + "step": 4295 + }, + { + "epoch": 0.27, + "grad_norm": 0.4886367835918936, + "learning_rate": 8.519065005449857e-07, + "loss": 0.3366, + "step": 4296 + }, + { + "epoch": 0.27, + "grad_norm": 1.345860065984868, + "learning_rate": 8.51833128765266e-07, + "loss": 0.2093, + "step": 4297 + }, + { + "epoch": 0.27, + "grad_norm": 0.6163713218739876, + "learning_rate": 8.51759741975638e-07, + "loss": 0.1095, + "step": 4298 + }, + { + "epoch": 0.27, + "grad_norm": 1.2497052208533497, + "learning_rate": 8.51686340179232e-07, + "loss": 0.1417, + "step": 4299 + }, + { + "epoch": 0.27, + "grad_norm": 2.0159735011566458, + "learning_rate": 8.516129233791799e-07, + "loss": 0.1864, + "step": 4300 + }, + { + "epoch": 0.27, + "grad_norm": 0.48030318363378866, + "learning_rate": 8.515394915786136e-07, + "loss": 0.0638, + "step": 4301 + }, + { + "epoch": 0.27, + "grad_norm": 0.7073627873066558, + "learning_rate": 8.51466044780666e-07, + "loss": 0.5101, + "step": 4302 + }, + { + "epoch": 0.27, + "grad_norm": 0.42053252529749324, + "learning_rate": 8.513925829884703e-07, + "loss": 0.244, + "step": 4303 + }, + { + "epoch": 0.27, + "grad_norm": 0.7435353546687125, + "learning_rate": 8.513191062051607e-07, + "loss": 0.1459, + "step": 4304 + }, + { + "epoch": 0.27, + "grad_norm": 0.47268336912859865, + "learning_rate": 8.512456144338716e-07, + "loss": 0.3455, + "step": 4305 + }, + { + "epoch": 0.27, + "grad_norm": 0.7420844477051433, + "learning_rate": 8.511721076777387e-07, + "loss": 0.169, + "step": 4306 + }, + { + "epoch": 0.27, + "grad_norm": 0.9693912136338695, + "learning_rate": 8.510985859398976e-07, + "loss": 0.4203, + "step": 4307 + }, + { + "epoch": 0.27, + "grad_norm": 2.2386222437024794, + "learning_rate": 8.510250492234852e-07, + "loss": 0.0616, + "step": 4308 + }, + { + "epoch": 0.27, + "grad_norm": 0.9317269982300498, + "learning_rate": 8.509514975316384e-07, + "loss": 0.2988, + "step": 4309 + }, + { + "epoch": 0.27, + "grad_norm": 0.3509798007559648, + "learning_rate": 8.508779308674952e-07, + "loss": 0.1942, + "step": 4310 + }, + { + "epoch": 0.27, + "grad_norm": 0.8425443795757862, + "learning_rate": 8.508043492341943e-07, + "loss": 0.1486, + "step": 4311 + }, + { + "epoch": 0.27, + "grad_norm": 0.3790680364923795, + "learning_rate": 8.507307526348744e-07, + "loss": 0.0867, + "step": 4312 + }, + { + "epoch": 0.28, + "grad_norm": 0.6909365406472099, + "learning_rate": 8.506571410726758e-07, + "loss": 0.1258, + "step": 4313 + }, + { + "epoch": 0.28, + "grad_norm": 1.112657815616951, + "learning_rate": 8.505835145507385e-07, + "loss": 0.1601, + "step": 4314 + }, + { + "epoch": 0.28, + "grad_norm": 0.8192066693388006, + "learning_rate": 8.505098730722038e-07, + "loss": 0.0203, + "step": 4315 + }, + { + "epoch": 0.28, + "grad_norm": 2.023798586627928, + "learning_rate": 8.504362166402131e-07, + "loss": 0.41, + "step": 4316 + }, + { + "epoch": 0.28, + "grad_norm": 1.4184967784167302, + "learning_rate": 8.503625452579091e-07, + "loss": 0.1804, + "step": 4317 + }, + { + "epoch": 0.28, + "grad_norm": 1.2020954289720547, + "learning_rate": 8.502888589284346e-07, + "loss": 0.1918, + "step": 4318 + }, + { + "epoch": 0.28, + "grad_norm": 0.9166679015975862, + "learning_rate": 8.502151576549332e-07, + "loss": 0.0869, + "step": 4319 + }, + { + "epoch": 0.28, + "grad_norm": 0.7368593006208842, + "learning_rate": 8.50141441440549e-07, + "loss": 0.0869, + "step": 4320 + }, + { + "epoch": 0.28, + "grad_norm": 0.9989025002647027, + "learning_rate": 8.500677102884273e-07, + "loss": 0.0395, + "step": 4321 + }, + { + "epoch": 0.28, + "grad_norm": 0.8430282057065523, + "learning_rate": 8.499939642017131e-07, + "loss": 0.3408, + "step": 4322 + }, + { + "epoch": 0.28, + "grad_norm": 0.4278914053106401, + "learning_rate": 8.49920203183553e-07, + "loss": 0.2442, + "step": 4323 + }, + { + "epoch": 0.28, + "grad_norm": 0.8225644744674708, + "learning_rate": 8.498464272370936e-07, + "loss": 0.3567, + "step": 4324 + }, + { + "epoch": 0.28, + "grad_norm": 2.8640436344809874, + "learning_rate": 8.497726363654823e-07, + "loss": 0.1426, + "step": 4325 + }, + { + "epoch": 0.28, + "grad_norm": 1.3736799735416585, + "learning_rate": 8.496988305718671e-07, + "loss": 0.2412, + "step": 4326 + }, + { + "epoch": 0.28, + "grad_norm": 0.7029669234188077, + "learning_rate": 8.496250098593968e-07, + "loss": 0.2283, + "step": 4327 + }, + { + "epoch": 0.28, + "grad_norm": 1.4224616227847302, + "learning_rate": 8.495511742312209e-07, + "loss": 0.439, + "step": 4328 + }, + { + "epoch": 0.28, + "grad_norm": 0.585775683575167, + "learning_rate": 8.494773236904891e-07, + "loss": 0.2986, + "step": 4329 + }, + { + "epoch": 0.28, + "grad_norm": 0.6835802848282123, + "learning_rate": 8.494034582403522e-07, + "loss": 0.2607, + "step": 4330 + }, + { + "epoch": 0.28, + "grad_norm": 0.2957920462701642, + "learning_rate": 8.493295778839614e-07, + "loss": 0.0239, + "step": 4331 + }, + { + "epoch": 0.28, + "grad_norm": 0.5438956127282539, + "learning_rate": 8.492556826244686e-07, + "loss": 0.0947, + "step": 4332 + }, + { + "epoch": 0.28, + "grad_norm": 0.9298790936968806, + "learning_rate": 8.491817724650261e-07, + "loss": 0.2266, + "step": 4333 + }, + { + "epoch": 0.28, + "grad_norm": 0.6943324528089971, + "learning_rate": 8.491078474087873e-07, + "loss": 0.4088, + "step": 4334 + }, + { + "epoch": 0.28, + "grad_norm": 0.32700302844867757, + "learning_rate": 8.49033907458906e-07, + "loss": 0.4245, + "step": 4335 + }, + { + "epoch": 0.28, + "grad_norm": 1.598715524527703, + "learning_rate": 8.489599526185366e-07, + "loss": 0.2194, + "step": 4336 + }, + { + "epoch": 0.28, + "grad_norm": 1.7196597826352433, + "learning_rate": 8.488859828908341e-07, + "loss": 0.291, + "step": 4337 + }, + { + "epoch": 0.28, + "grad_norm": 5.118949806060416, + "learning_rate": 8.488119982789541e-07, + "loss": 0.1923, + "step": 4338 + }, + { + "epoch": 0.28, + "grad_norm": 0.5843986075098508, + "learning_rate": 8.487379987860533e-07, + "loss": 0.0867, + "step": 4339 + }, + { + "epoch": 0.28, + "grad_norm": 0.9454531375694244, + "learning_rate": 8.486639844152881e-07, + "loss": 0.3082, + "step": 4340 + }, + { + "epoch": 0.28, + "grad_norm": 0.9987469523016238, + "learning_rate": 8.485899551698166e-07, + "loss": 0.3858, + "step": 4341 + }, + { + "epoch": 0.28, + "grad_norm": 0.40715029117676954, + "learning_rate": 8.485159110527969e-07, + "loss": 0.1751, + "step": 4342 + }, + { + "epoch": 0.28, + "grad_norm": 0.499792871528072, + "learning_rate": 8.484418520673878e-07, + "loss": 0.0946, + "step": 4343 + }, + { + "epoch": 0.28, + "grad_norm": 2.262308483993313, + "learning_rate": 8.483677782167489e-07, + "loss": 0.1008, + "step": 4344 + }, + { + "epoch": 0.28, + "grad_norm": 0.4171213140551655, + "learning_rate": 8.482936895040402e-07, + "loss": 0.1927, + "step": 4345 + }, + { + "epoch": 0.28, + "grad_norm": 0.5700755835615721, + "learning_rate": 8.482195859324225e-07, + "loss": 0.3416, + "step": 4346 + }, + { + "epoch": 0.28, + "grad_norm": 0.525540193469076, + "learning_rate": 8.481454675050572e-07, + "loss": 0.0943, + "step": 4347 + }, + { + "epoch": 0.28, + "grad_norm": 1.7161781408228356, + "learning_rate": 8.480713342251065e-07, + "loss": 0.1144, + "step": 4348 + }, + { + "epoch": 0.28, + "grad_norm": 0.48178956519723415, + "learning_rate": 8.47997186095733e-07, + "loss": 0.0822, + "step": 4349 + }, + { + "epoch": 0.28, + "grad_norm": 0.4131590219410887, + "learning_rate": 8.479230231201e-07, + "loss": 0.1703, + "step": 4350 + }, + { + "epoch": 0.28, + "grad_norm": 0.4610693527349393, + "learning_rate": 8.478488453013713e-07, + "loss": 0.0302, + "step": 4351 + }, + { + "epoch": 0.28, + "grad_norm": 0.572453851246035, + "learning_rate": 8.477746526427117e-07, + "loss": 0.2856, + "step": 4352 + }, + { + "epoch": 0.28, + "grad_norm": 1.0447256574844934, + "learning_rate": 8.477004451472862e-07, + "loss": 0.1349, + "step": 4353 + }, + { + "epoch": 0.28, + "grad_norm": 1.1982465611298891, + "learning_rate": 8.476262228182608e-07, + "loss": 0.3739, + "step": 4354 + }, + { + "epoch": 0.28, + "grad_norm": 0.524392966559799, + "learning_rate": 8.475519856588019e-07, + "loss": 0.1272, + "step": 4355 + }, + { + "epoch": 0.28, + "grad_norm": 0.40241971349987826, + "learning_rate": 8.474777336720766e-07, + "loss": 0.1078, + "step": 4356 + }, + { + "epoch": 0.28, + "grad_norm": 0.6715665943043577, + "learning_rate": 8.474034668612528e-07, + "loss": 0.3504, + "step": 4357 + }, + { + "epoch": 0.28, + "grad_norm": 1.0836109662832707, + "learning_rate": 8.473291852294986e-07, + "loss": 0.0272, + "step": 4358 + }, + { + "epoch": 0.28, + "grad_norm": 0.2938766956526506, + "learning_rate": 8.472548887799833e-07, + "loss": 0.0779, + "step": 4359 + }, + { + "epoch": 0.28, + "grad_norm": 0.816943845780774, + "learning_rate": 8.471805775158762e-07, + "loss": 0.2372, + "step": 4360 + }, + { + "epoch": 0.28, + "grad_norm": 4.75841659991107, + "learning_rate": 8.471062514403478e-07, + "loss": 0.1443, + "step": 4361 + }, + { + "epoch": 0.28, + "grad_norm": 0.47206538795515435, + "learning_rate": 8.470319105565689e-07, + "loss": 0.2239, + "step": 4362 + }, + { + "epoch": 0.28, + "grad_norm": 1.411996083151186, + "learning_rate": 8.469575548677111e-07, + "loss": 0.2346, + "step": 4363 + }, + { + "epoch": 0.28, + "grad_norm": 1.9391920864355068, + "learning_rate": 8.468831843769466e-07, + "loss": 0.0221, + "step": 4364 + }, + { + "epoch": 0.28, + "grad_norm": 0.49146848990345166, + "learning_rate": 8.468087990874479e-07, + "loss": 0.175, + "step": 4365 + }, + { + "epoch": 0.28, + "grad_norm": 0.5375892507560971, + "learning_rate": 8.46734399002389e-07, + "loss": 0.0121, + "step": 4366 + }, + { + "epoch": 0.28, + "grad_norm": 1.0964068343453515, + "learning_rate": 8.466599841249435e-07, + "loss": 0.3299, + "step": 4367 + }, + { + "epoch": 0.28, + "grad_norm": 0.5268867637463387, + "learning_rate": 8.465855544582861e-07, + "loss": 0.0925, + "step": 4368 + }, + { + "epoch": 0.28, + "grad_norm": 0.5820936578181061, + "learning_rate": 8.465111100055922e-07, + "loss": 0.2211, + "step": 4369 + }, + { + "epoch": 0.28, + "grad_norm": 0.45649807209567056, + "learning_rate": 8.46436650770038e-07, + "loss": 0.1642, + "step": 4370 + }, + { + "epoch": 0.28, + "grad_norm": 1.3095959954200143, + "learning_rate": 8.463621767547997e-07, + "loss": 0.0989, + "step": 4371 + }, + { + "epoch": 0.28, + "grad_norm": 3.032850155819076, + "learning_rate": 8.462876879630547e-07, + "loss": 0.0369, + "step": 4372 + }, + { + "epoch": 0.28, + "grad_norm": 0.7607701314968326, + "learning_rate": 8.462131843979808e-07, + "loss": 0.1293, + "step": 4373 + }, + { + "epoch": 0.28, + "grad_norm": 0.47250294306162127, + "learning_rate": 8.461386660627563e-07, + "loss": 0.0511, + "step": 4374 + }, + { + "epoch": 0.28, + "grad_norm": 0.8788969238131757, + "learning_rate": 8.460641329605607e-07, + "loss": 0.2171, + "step": 4375 + }, + { + "epoch": 0.28, + "grad_norm": 1.72297044819535, + "learning_rate": 8.459895850945735e-07, + "loss": 0.2224, + "step": 4376 + }, + { + "epoch": 0.28, + "grad_norm": 1.4394715100207545, + "learning_rate": 8.45915022467975e-07, + "loss": 0.1797, + "step": 4377 + }, + { + "epoch": 0.28, + "grad_norm": 0.3353120173309568, + "learning_rate": 8.458404450839462e-07, + "loss": 0.1442, + "step": 4378 + }, + { + "epoch": 0.28, + "grad_norm": 0.6724250722787823, + "learning_rate": 8.457658529456689e-07, + "loss": 0.2112, + "step": 4379 + }, + { + "epoch": 0.28, + "grad_norm": 0.8829546336921816, + "learning_rate": 8.456912460563253e-07, + "loss": 0.1709, + "step": 4380 + }, + { + "epoch": 0.28, + "grad_norm": 0.6053766450016911, + "learning_rate": 8.456166244190981e-07, + "loss": 0.2125, + "step": 4381 + }, + { + "epoch": 0.28, + "grad_norm": 1.1657277703372586, + "learning_rate": 8.455419880371709e-07, + "loss": 0.2216, + "step": 4382 + }, + { + "epoch": 0.28, + "grad_norm": 0.5844690456974323, + "learning_rate": 8.45467336913728e-07, + "loss": 0.152, + "step": 4383 + }, + { + "epoch": 0.28, + "grad_norm": 0.6908481820040442, + "learning_rate": 8.453926710519539e-07, + "loss": 0.1361, + "step": 4384 + }, + { + "epoch": 0.28, + "grad_norm": 0.45749695868996737, + "learning_rate": 8.453179904550343e-07, + "loss": 0.0473, + "step": 4385 + }, + { + "epoch": 0.28, + "grad_norm": 1.3220879391832774, + "learning_rate": 8.452432951261548e-07, + "loss": 0.2768, + "step": 4386 + }, + { + "epoch": 0.28, + "grad_norm": 0.7971791292876084, + "learning_rate": 8.451685850685025e-07, + "loss": 0.0975, + "step": 4387 + }, + { + "epoch": 0.28, + "grad_norm": 0.8164272301833383, + "learning_rate": 8.450938602852644e-07, + "loss": 0.1351, + "step": 4388 + }, + { + "epoch": 0.28, + "grad_norm": 5.015208488226374, + "learning_rate": 8.450191207796285e-07, + "loss": 0.2406, + "step": 4389 + }, + { + "epoch": 0.28, + "grad_norm": 1.0417216634065258, + "learning_rate": 8.449443665547833e-07, + "loss": 0.2907, + "step": 4390 + }, + { + "epoch": 0.28, + "grad_norm": 0.38297480315899257, + "learning_rate": 8.44869597613918e-07, + "loss": 0.2583, + "step": 4391 + }, + { + "epoch": 0.28, + "grad_norm": 1.8137261665206976, + "learning_rate": 8.447948139602225e-07, + "loss": 0.1434, + "step": 4392 + }, + { + "epoch": 0.28, + "grad_norm": 0.8724312544785039, + "learning_rate": 8.44720015596887e-07, + "loss": 0.118, + "step": 4393 + }, + { + "epoch": 0.28, + "grad_norm": 1.307410897826849, + "learning_rate": 8.446452025271027e-07, + "loss": 0.1632, + "step": 4394 + }, + { + "epoch": 0.28, + "grad_norm": 0.17111584161678214, + "learning_rate": 8.445703747540613e-07, + "loss": 0.0935, + "step": 4395 + }, + { + "epoch": 0.28, + "grad_norm": 1.286088804021365, + "learning_rate": 8.444955322809548e-07, + "loss": 0.212, + "step": 4396 + }, + { + "epoch": 0.28, + "grad_norm": 0.9218959587526094, + "learning_rate": 8.444206751109766e-07, + "loss": 0.2434, + "step": 4397 + }, + { + "epoch": 0.28, + "grad_norm": 0.608305385898315, + "learning_rate": 8.443458032473201e-07, + "loss": 0.2542, + "step": 4398 + }, + { + "epoch": 0.28, + "grad_norm": 0.7082549088828657, + "learning_rate": 8.442709166931792e-07, + "loss": 0.0726, + "step": 4399 + }, + { + "epoch": 0.28, + "grad_norm": 0.6564970763572002, + "learning_rate": 8.44196015451749e-07, + "loss": 0.0889, + "step": 4400 + }, + { + "epoch": 0.28, + "grad_norm": 1.5232565841103223, + "learning_rate": 8.441210995262249e-07, + "loss": 0.1873, + "step": 4401 + }, + { + "epoch": 0.28, + "grad_norm": 0.6616604078127134, + "learning_rate": 8.440461689198029e-07, + "loss": 0.2932, + "step": 4402 + }, + { + "epoch": 0.28, + "grad_norm": 0.7353454228622197, + "learning_rate": 8.439712236356797e-07, + "loss": 0.224, + "step": 4403 + }, + { + "epoch": 0.28, + "grad_norm": 1.0530861035964485, + "learning_rate": 8.438962636770526e-07, + "loss": 0.2295, + "step": 4404 + }, + { + "epoch": 0.28, + "grad_norm": 1.2168156362266176, + "learning_rate": 8.438212890471197e-07, + "loss": 0.3593, + "step": 4405 + }, + { + "epoch": 0.28, + "grad_norm": 0.48474040718209976, + "learning_rate": 8.437462997490793e-07, + "loss": 0.006, + "step": 4406 + }, + { + "epoch": 0.28, + "grad_norm": 1.7252387637225124, + "learning_rate": 8.436712957861308e-07, + "loss": 0.0512, + "step": 4407 + }, + { + "epoch": 0.28, + "grad_norm": 0.5362488869203919, + "learning_rate": 8.435962771614739e-07, + "loss": 0.1622, + "step": 4408 + }, + { + "epoch": 0.28, + "grad_norm": 0.7438715173511999, + "learning_rate": 8.435212438783091e-07, + "loss": 0.269, + "step": 4409 + }, + { + "epoch": 0.28, + "grad_norm": 1.120037485777825, + "learning_rate": 8.434461959398376e-07, + "loss": 0.1691, + "step": 4410 + }, + { + "epoch": 0.28, + "grad_norm": 0.47257631506788905, + "learning_rate": 8.433711333492608e-07, + "loss": 0.2809, + "step": 4411 + }, + { + "epoch": 0.28, + "grad_norm": 8.824371668720856, + "learning_rate": 8.432960561097814e-07, + "loss": 0.3591, + "step": 4412 + }, + { + "epoch": 0.28, + "grad_norm": 0.6413136399100594, + "learning_rate": 8.43220964224602e-07, + "loss": 0.1596, + "step": 4413 + }, + { + "epoch": 0.28, + "grad_norm": 0.30765493077013994, + "learning_rate": 8.431458576969263e-07, + "loss": 0.1539, + "step": 4414 + }, + { + "epoch": 0.28, + "grad_norm": 1.817059073970843, + "learning_rate": 8.430707365299585e-07, + "loss": 0.3303, + "step": 4415 + }, + { + "epoch": 0.28, + "grad_norm": 1.219533996366765, + "learning_rate": 8.429956007269035e-07, + "loss": 0.0526, + "step": 4416 + }, + { + "epoch": 0.28, + "grad_norm": 0.9711890001749441, + "learning_rate": 8.429204502909666e-07, + "loss": 0.2074, + "step": 4417 + }, + { + "epoch": 0.28, + "grad_norm": 0.5516315541274354, + "learning_rate": 8.428452852253538e-07, + "loss": 0.1186, + "step": 4418 + }, + { + "epoch": 0.28, + "grad_norm": 1.382794531359862, + "learning_rate": 8.427701055332721e-07, + "loss": 0.1941, + "step": 4419 + }, + { + "epoch": 0.28, + "grad_norm": 1.3404852144539467, + "learning_rate": 8.426949112179287e-07, + "loss": 0.1036, + "step": 4420 + }, + { + "epoch": 0.28, + "grad_norm": 0.6916350369653512, + "learning_rate": 8.426197022825313e-07, + "loss": 0.1553, + "step": 4421 + }, + { + "epoch": 0.28, + "grad_norm": 0.8270012871780776, + "learning_rate": 8.425444787302887e-07, + "loss": 0.1212, + "step": 4422 + }, + { + "epoch": 0.28, + "grad_norm": 1.246385309263991, + "learning_rate": 8.4246924056441e-07, + "loss": 0.2484, + "step": 4423 + }, + { + "epoch": 0.28, + "grad_norm": 0.673906575969224, + "learning_rate": 8.423939877881053e-07, + "loss": 0.1861, + "step": 4424 + }, + { + "epoch": 0.28, + "grad_norm": 0.7879070177283408, + "learning_rate": 8.423187204045845e-07, + "loss": 0.339, + "step": 4425 + }, + { + "epoch": 0.28, + "grad_norm": 0.348886836199494, + "learning_rate": 8.422434384170591e-07, + "loss": 0.1119, + "step": 4426 + }, + { + "epoch": 0.28, + "grad_norm": 0.6745275690026145, + "learning_rate": 8.421681418287405e-07, + "loss": 0.3574, + "step": 4427 + }, + { + "epoch": 0.28, + "grad_norm": 0.43383469870148433, + "learning_rate": 8.420928306428412e-07, + "loss": 0.123, + "step": 4428 + }, + { + "epoch": 0.28, + "grad_norm": 0.7763078183179507, + "learning_rate": 8.420175048625742e-07, + "loss": 0.1554, + "step": 4429 + }, + { + "epoch": 0.28, + "grad_norm": 0.7748784863518428, + "learning_rate": 8.419421644911526e-07, + "loss": 0.0832, + "step": 4430 + }, + { + "epoch": 0.28, + "grad_norm": 0.6505815947911184, + "learning_rate": 8.418668095317911e-07, + "loss": 0.248, + "step": 4431 + }, + { + "epoch": 0.28, + "grad_norm": 0.5520398041901158, + "learning_rate": 8.417914399877042e-07, + "loss": 0.1497, + "step": 4432 + }, + { + "epoch": 0.28, + "grad_norm": 1.0835156592478004, + "learning_rate": 8.417160558621073e-07, + "loss": 0.3796, + "step": 4433 + }, + { + "epoch": 0.28, + "grad_norm": 0.5971967168656509, + "learning_rate": 8.416406571582167e-07, + "loss": 0.242, + "step": 4434 + }, + { + "epoch": 0.28, + "grad_norm": 0.916643175209122, + "learning_rate": 8.415652438792486e-07, + "loss": 0.3095, + "step": 4435 + }, + { + "epoch": 0.28, + "grad_norm": 0.775772313594287, + "learning_rate": 8.414898160284207e-07, + "loss": 0.1477, + "step": 4436 + }, + { + "epoch": 0.28, + "grad_norm": 0.9152256704284947, + "learning_rate": 8.414143736089509e-07, + "loss": 0.0795, + "step": 4437 + }, + { + "epoch": 0.28, + "grad_norm": 1.0285415379650333, + "learning_rate": 8.413389166240574e-07, + "loss": 0.3897, + "step": 4438 + }, + { + "epoch": 0.28, + "grad_norm": 0.6570578741704823, + "learning_rate": 8.412634450769598e-07, + "loss": 0.181, + "step": 4439 + }, + { + "epoch": 0.28, + "grad_norm": 1.2676443027375748, + "learning_rate": 8.411879589708775e-07, + "loss": 0.1724, + "step": 4440 + }, + { + "epoch": 0.28, + "grad_norm": 0.7798554241070158, + "learning_rate": 8.411124583090307e-07, + "loss": 0.0496, + "step": 4441 + }, + { + "epoch": 0.28, + "grad_norm": 0.5629140076740263, + "learning_rate": 8.410369430946412e-07, + "loss": 0.2308, + "step": 4442 + }, + { + "epoch": 0.28, + "grad_norm": 0.780273770448044, + "learning_rate": 8.409614133309297e-07, + "loss": 0.2241, + "step": 4443 + }, + { + "epoch": 0.28, + "grad_norm": 0.5670293097966727, + "learning_rate": 8.408858690211191e-07, + "loss": 0.3358, + "step": 4444 + }, + { + "epoch": 0.28, + "grad_norm": 0.5042621290216677, + "learning_rate": 8.408103101684321e-07, + "loss": 0.142, + "step": 4445 + }, + { + "epoch": 0.28, + "grad_norm": 0.2387795036670779, + "learning_rate": 8.40734736776092e-07, + "loss": 0.0405, + "step": 4446 + }, + { + "epoch": 0.28, + "grad_norm": 0.7124827909652498, + "learning_rate": 8.406591488473232e-07, + "loss": 0.2548, + "step": 4447 + }, + { + "epoch": 0.28, + "grad_norm": 0.5194044697111061, + "learning_rate": 8.405835463853502e-07, + "loss": 0.1652, + "step": 4448 + }, + { + "epoch": 0.28, + "grad_norm": 0.5701851869266625, + "learning_rate": 8.405079293933985e-07, + "loss": 0.0565, + "step": 4449 + }, + { + "epoch": 0.28, + "grad_norm": 1.0910496102773226, + "learning_rate": 8.404322978746941e-07, + "loss": 0.4179, + "step": 4450 + }, + { + "epoch": 0.28, + "grad_norm": 0.6895464900298918, + "learning_rate": 8.403566518324634e-07, + "loss": 0.1947, + "step": 4451 + }, + { + "epoch": 0.28, + "grad_norm": 0.6471583129533248, + "learning_rate": 8.402809912699338e-07, + "loss": 0.3407, + "step": 4452 + }, + { + "epoch": 0.28, + "grad_norm": 2.0732438394333603, + "learning_rate": 8.402053161903331e-07, + "loss": 0.2065, + "step": 4453 + }, + { + "epoch": 0.28, + "grad_norm": 0.47199993676472796, + "learning_rate": 8.401296265968896e-07, + "loss": 0.0839, + "step": 4454 + }, + { + "epoch": 0.28, + "grad_norm": 0.6270475259464904, + "learning_rate": 8.400539224928325e-07, + "loss": 0.2564, + "step": 4455 + }, + { + "epoch": 0.28, + "grad_norm": 1.084180740135046, + "learning_rate": 8.399782038813916e-07, + "loss": 0.1497, + "step": 4456 + }, + { + "epoch": 0.28, + "grad_norm": 0.5701749049464911, + "learning_rate": 8.399024707657969e-07, + "loss": 0.1572, + "step": 4457 + }, + { + "epoch": 0.28, + "grad_norm": 0.6142287732929258, + "learning_rate": 8.398267231492797e-07, + "loss": 0.0791, + "step": 4458 + }, + { + "epoch": 0.28, + "grad_norm": 0.42223800371297737, + "learning_rate": 8.397509610350712e-07, + "loss": 0.1058, + "step": 4459 + }, + { + "epoch": 0.28, + "grad_norm": 0.18619950043951156, + "learning_rate": 8.396751844264038e-07, + "loss": 0.002, + "step": 4460 + }, + { + "epoch": 0.28, + "grad_norm": 1.0950956996937822, + "learning_rate": 8.395993933265101e-07, + "loss": 0.295, + "step": 4461 + }, + { + "epoch": 0.28, + "grad_norm": 1.1258189986203382, + "learning_rate": 8.395235877386236e-07, + "loss": 0.1735, + "step": 4462 + }, + { + "epoch": 0.28, + "grad_norm": 0.6911988427258985, + "learning_rate": 8.394477676659784e-07, + "loss": 0.2315, + "step": 4463 + }, + { + "epoch": 0.28, + "grad_norm": 0.8369721166034685, + "learning_rate": 8.39371933111809e-07, + "loss": 0.4423, + "step": 4464 + }, + { + "epoch": 0.28, + "grad_norm": 1.2261142755252696, + "learning_rate": 8.392960840793508e-07, + "loss": 0.346, + "step": 4465 + }, + { + "epoch": 0.28, + "grad_norm": 0.6335384642094622, + "learning_rate": 8.392202205718395e-07, + "loss": 0.1489, + "step": 4466 + }, + { + "epoch": 0.28, + "grad_norm": 0.5759397326677361, + "learning_rate": 8.391443425925118e-07, + "loss": 0.1198, + "step": 4467 + }, + { + "epoch": 0.28, + "grad_norm": 0.8138314939314969, + "learning_rate": 8.390684501446046e-07, + "loss": 0.3187, + "step": 4468 + }, + { + "epoch": 0.28, + "grad_norm": 0.9922259606701328, + "learning_rate": 8.389925432313556e-07, + "loss": 0.2434, + "step": 4469 + }, + { + "epoch": 0.29, + "grad_norm": 1.9439010131970127, + "learning_rate": 8.389166218560033e-07, + "loss": 0.281, + "step": 4470 + }, + { + "epoch": 0.29, + "grad_norm": 0.928976400579361, + "learning_rate": 8.388406860217867e-07, + "loss": 0.1049, + "step": 4471 + }, + { + "epoch": 0.29, + "grad_norm": 3.8344175767333075, + "learning_rate": 8.387647357319452e-07, + "loss": 0.1803, + "step": 4472 + }, + { + "epoch": 0.29, + "grad_norm": 0.3053728221696224, + "learning_rate": 8.38688770989719e-07, + "loss": 0.2961, + "step": 4473 + }, + { + "epoch": 0.29, + "grad_norm": 1.6094112650877064, + "learning_rate": 8.386127917983492e-07, + "loss": 0.2262, + "step": 4474 + }, + { + "epoch": 0.29, + "grad_norm": 0.39069129879624453, + "learning_rate": 8.385367981610769e-07, + "loss": 0.0516, + "step": 4475 + }, + { + "epoch": 0.29, + "grad_norm": 0.33031838581037737, + "learning_rate": 8.384607900811441e-07, + "loss": 0.1315, + "step": 4476 + }, + { + "epoch": 0.29, + "grad_norm": 2.073025608080206, + "learning_rate": 8.383847675617938e-07, + "loss": 0.1077, + "step": 4477 + }, + { + "epoch": 0.29, + "grad_norm": 0.47848041576443184, + "learning_rate": 8.383087306062689e-07, + "loss": 0.1378, + "step": 4478 + }, + { + "epoch": 0.29, + "grad_norm": 3.092192500525181, + "learning_rate": 8.382326792178136e-07, + "loss": 0.1555, + "step": 4479 + }, + { + "epoch": 0.29, + "grad_norm": 0.4073851225514589, + "learning_rate": 8.381566133996722e-07, + "loss": 0.1103, + "step": 4480 + }, + { + "epoch": 0.29, + "grad_norm": 0.5885160317150906, + "learning_rate": 8.3808053315509e-07, + "loss": 0.1013, + "step": 4481 + }, + { + "epoch": 0.29, + "grad_norm": 1.4601262119205156, + "learning_rate": 8.380044384873126e-07, + "loss": 0.0265, + "step": 4482 + }, + { + "epoch": 0.29, + "grad_norm": 0.4609037119067469, + "learning_rate": 8.379283293995862e-07, + "loss": 0.1188, + "step": 4483 + }, + { + "epoch": 0.29, + "grad_norm": 0.7100195224733987, + "learning_rate": 8.378522058951582e-07, + "loss": 0.1741, + "step": 4484 + }, + { + "epoch": 0.29, + "grad_norm": 0.9272890709236896, + "learning_rate": 8.377760679772759e-07, + "loss": 0.2117, + "step": 4485 + }, + { + "epoch": 0.29, + "grad_norm": 4.396616398343421, + "learning_rate": 8.376999156491873e-07, + "loss": 0.2864, + "step": 4486 + }, + { + "epoch": 0.29, + "grad_norm": 0.9904552251003063, + "learning_rate": 8.376237489141416e-07, + "loss": 0.3021, + "step": 4487 + }, + { + "epoch": 0.29, + "grad_norm": 0.668455169482055, + "learning_rate": 8.37547567775388e-07, + "loss": 0.4558, + "step": 4488 + }, + { + "epoch": 0.29, + "grad_norm": 1.3726506649987562, + "learning_rate": 8.374713722361766e-07, + "loss": 0.3314, + "step": 4489 + }, + { + "epoch": 0.29, + "grad_norm": 0.50682511573133, + "learning_rate": 8.373951622997581e-07, + "loss": 0.2216, + "step": 4490 + }, + { + "epoch": 0.29, + "grad_norm": 0.6726497042068822, + "learning_rate": 8.373189379693837e-07, + "loss": 0.2344, + "step": 4491 + }, + { + "epoch": 0.29, + "grad_norm": 1.609016184030008, + "learning_rate": 8.372426992483054e-07, + "loss": 0.3915, + "step": 4492 + }, + { + "epoch": 0.29, + "grad_norm": 1.1723781811917382, + "learning_rate": 8.371664461397756e-07, + "loss": 0.3616, + "step": 4493 + }, + { + "epoch": 0.29, + "grad_norm": 0.5127086883005437, + "learning_rate": 8.370901786470474e-07, + "loss": 0.1553, + "step": 4494 + }, + { + "epoch": 0.29, + "grad_norm": 0.67868702046723, + "learning_rate": 8.370138967733744e-07, + "loss": 0.1686, + "step": 4495 + }, + { + "epoch": 0.29, + "grad_norm": 1.0072948668837383, + "learning_rate": 8.369376005220113e-07, + "loss": 0.3462, + "step": 4496 + }, + { + "epoch": 0.29, + "grad_norm": 1.9595075385103153, + "learning_rate": 8.368612898962126e-07, + "loss": 0.1108, + "step": 4497 + }, + { + "epoch": 0.29, + "grad_norm": 3.602483397325656, + "learning_rate": 8.367849648992342e-07, + "loss": 0.0887, + "step": 4498 + }, + { + "epoch": 0.29, + "grad_norm": 1.374804428447395, + "learning_rate": 8.367086255343322e-07, + "loss": 0.4079, + "step": 4499 + }, + { + "epoch": 0.29, + "grad_norm": 0.874141071120875, + "learning_rate": 8.366322718047634e-07, + "loss": 0.299, + "step": 4500 + }, + { + "epoch": 0.29, + "grad_norm": 1.2232745684819886, + "learning_rate": 8.36555903713785e-07, + "loss": 0.3907, + "step": 4501 + }, + { + "epoch": 0.29, + "grad_norm": 0.9824008189289976, + "learning_rate": 8.364795212646553e-07, + "loss": 0.2109, + "step": 4502 + }, + { + "epoch": 0.29, + "grad_norm": 1.2373988796750734, + "learning_rate": 8.364031244606329e-07, + "loss": 0.0894, + "step": 4503 + }, + { + "epoch": 0.29, + "grad_norm": 1.2971308108811155, + "learning_rate": 8.363267133049768e-07, + "loss": 0.145, + "step": 4504 + }, + { + "epoch": 0.29, + "grad_norm": 0.6923696073736749, + "learning_rate": 8.362502878009472e-07, + "loss": 0.1401, + "step": 4505 + }, + { + "epoch": 0.29, + "grad_norm": 0.8011960752370959, + "learning_rate": 8.361738479518041e-07, + "loss": 0.2743, + "step": 4506 + }, + { + "epoch": 0.29, + "grad_norm": 0.9141753106075413, + "learning_rate": 8.360973937608091e-07, + "loss": 0.2063, + "step": 4507 + }, + { + "epoch": 0.29, + "grad_norm": 0.9111172786342697, + "learning_rate": 8.360209252312236e-07, + "loss": 0.3673, + "step": 4508 + }, + { + "epoch": 0.29, + "grad_norm": 0.21164414406498477, + "learning_rate": 8.359444423663099e-07, + "loss": 0.1103, + "step": 4509 + }, + { + "epoch": 0.29, + "grad_norm": 1.2468528092728297, + "learning_rate": 8.358679451693309e-07, + "loss": 0.1975, + "step": 4510 + }, + { + "epoch": 0.29, + "grad_norm": 0.3175666792175712, + "learning_rate": 8.357914336435503e-07, + "loss": 0.1048, + "step": 4511 + }, + { + "epoch": 0.29, + "grad_norm": 0.41181223836699016, + "learning_rate": 8.357149077922321e-07, + "loss": 0.003, + "step": 4512 + }, + { + "epoch": 0.29, + "grad_norm": 1.0454166567368108, + "learning_rate": 8.356383676186409e-07, + "loss": 0.3953, + "step": 4513 + }, + { + "epoch": 0.29, + "grad_norm": 0.6133492766439936, + "learning_rate": 8.355618131260424e-07, + "loss": 0.1993, + "step": 4514 + }, + { + "epoch": 0.29, + "grad_norm": 0.5328039269781779, + "learning_rate": 8.354852443177023e-07, + "loss": 0.0975, + "step": 4515 + }, + { + "epoch": 0.29, + "grad_norm": 0.5075796705113474, + "learning_rate": 8.354086611968871e-07, + "loss": 0.1081, + "step": 4516 + }, + { + "epoch": 0.29, + "grad_norm": 0.6732698907190383, + "learning_rate": 8.353320637668644e-07, + "loss": 0.1204, + "step": 4517 + }, + { + "epoch": 0.29, + "grad_norm": 0.8895655181978893, + "learning_rate": 8.352554520309017e-07, + "loss": 0.2238, + "step": 4518 + }, + { + "epoch": 0.29, + "grad_norm": 0.36560431686336975, + "learning_rate": 8.351788259922676e-07, + "loss": 0.1056, + "step": 4519 + }, + { + "epoch": 0.29, + "grad_norm": 0.565130468908776, + "learning_rate": 8.351021856542308e-07, + "loss": 0.2859, + "step": 4520 + }, + { + "epoch": 0.29, + "grad_norm": 0.7907019642042231, + "learning_rate": 8.350255310200611e-07, + "loss": 0.1872, + "step": 4521 + }, + { + "epoch": 0.29, + "grad_norm": 0.9747517445554856, + "learning_rate": 8.349488620930289e-07, + "loss": 0.1934, + "step": 4522 + }, + { + "epoch": 0.29, + "grad_norm": 0.9964218880977478, + "learning_rate": 8.348721788764048e-07, + "loss": 0.4043, + "step": 4523 + }, + { + "epoch": 0.29, + "grad_norm": 3.487042549188682, + "learning_rate": 8.347954813734605e-07, + "loss": 0.1708, + "step": 4524 + }, + { + "epoch": 0.29, + "grad_norm": 0.8260511257866386, + "learning_rate": 8.347187695874678e-07, + "loss": 0.0331, + "step": 4525 + }, + { + "epoch": 0.29, + "grad_norm": 2.2344444427781944, + "learning_rate": 8.346420435216996e-07, + "loss": 0.1984, + "step": 4526 + }, + { + "epoch": 0.29, + "grad_norm": 3.441167080806813, + "learning_rate": 8.34565303179429e-07, + "loss": 0.239, + "step": 4527 + }, + { + "epoch": 0.29, + "grad_norm": 1.164332167445458, + "learning_rate": 8.344885485639302e-07, + "loss": 0.1012, + "step": 4528 + }, + { + "epoch": 0.29, + "grad_norm": 0.774718838597696, + "learning_rate": 8.344117796784773e-07, + "loss": 0.1094, + "step": 4529 + }, + { + "epoch": 0.29, + "grad_norm": 0.5371450454458172, + "learning_rate": 8.343349965263457e-07, + "loss": 0.2501, + "step": 4530 + }, + { + "epoch": 0.29, + "grad_norm": 0.5861537448207714, + "learning_rate": 8.342581991108112e-07, + "loss": 0.1648, + "step": 4531 + }, + { + "epoch": 0.29, + "grad_norm": 0.7117029575482539, + "learning_rate": 8.341813874351499e-07, + "loss": 0.1814, + "step": 4532 + }, + { + "epoch": 0.29, + "grad_norm": 0.49056028703580873, + "learning_rate": 8.341045615026388e-07, + "loss": 0.1694, + "step": 4533 + }, + { + "epoch": 0.29, + "grad_norm": 0.48242432652565337, + "learning_rate": 8.340277213165554e-07, + "loss": 0.2004, + "step": 4534 + }, + { + "epoch": 0.29, + "grad_norm": 0.922951045137883, + "learning_rate": 8.33950866880178e-07, + "loss": 0.1574, + "step": 4535 + }, + { + "epoch": 0.29, + "grad_norm": 0.3845890214990481, + "learning_rate": 8.338739981967853e-07, + "loss": 0.2496, + "step": 4536 + }, + { + "epoch": 0.29, + "grad_norm": 1.1293397170787918, + "learning_rate": 8.337971152696565e-07, + "loss": 0.2678, + "step": 4537 + }, + { + "epoch": 0.29, + "grad_norm": 0.9948576701873844, + "learning_rate": 8.33720218102072e-07, + "loss": 0.1849, + "step": 4538 + }, + { + "epoch": 0.29, + "grad_norm": 0.5148305889359297, + "learning_rate": 8.336433066973121e-07, + "loss": 0.1714, + "step": 4539 + }, + { + "epoch": 0.29, + "grad_norm": 1.1373115981841952, + "learning_rate": 8.33566381058658e-07, + "loss": 0.1738, + "step": 4540 + }, + { + "epoch": 0.29, + "grad_norm": 0.6299221472633558, + "learning_rate": 8.334894411893913e-07, + "loss": 0.4132, + "step": 4541 + }, + { + "epoch": 0.29, + "grad_norm": 0.7694773051375428, + "learning_rate": 8.33412487092795e-07, + "loss": 0.189, + "step": 4542 + }, + { + "epoch": 0.29, + "grad_norm": 0.9193780317310255, + "learning_rate": 8.333355187721515e-07, + "loss": 0.5219, + "step": 4543 + }, + { + "epoch": 0.29, + "grad_norm": 0.3733278175440414, + "learning_rate": 8.332585362307447e-07, + "loss": 0.1146, + "step": 4544 + }, + { + "epoch": 0.29, + "grad_norm": 0.39541841523059135, + "learning_rate": 8.331815394718589e-07, + "loss": 0.1744, + "step": 4545 + }, + { + "epoch": 0.29, + "grad_norm": 0.5466051194717116, + "learning_rate": 8.331045284987789e-07, + "loss": 0.1114, + "step": 4546 + }, + { + "epoch": 0.29, + "grad_norm": 0.5900345574222747, + "learning_rate": 8.3302750331479e-07, + "loss": 0.114, + "step": 4547 + }, + { + "epoch": 0.29, + "grad_norm": 0.9652657532302261, + "learning_rate": 8.329504639231783e-07, + "loss": 0.5105, + "step": 4548 + }, + { + "epoch": 0.29, + "grad_norm": 0.9475445901641059, + "learning_rate": 8.328734103272306e-07, + "loss": 0.3432, + "step": 4549 + }, + { + "epoch": 0.29, + "grad_norm": 0.6756970679755269, + "learning_rate": 8.32796342530234e-07, + "loss": 0.3576, + "step": 4550 + }, + { + "epoch": 0.29, + "grad_norm": 0.7712110044567839, + "learning_rate": 8.327192605354765e-07, + "loss": 0.278, + "step": 4551 + }, + { + "epoch": 0.29, + "grad_norm": 0.9991316278556664, + "learning_rate": 8.326421643462465e-07, + "loss": 0.3683, + "step": 4552 + }, + { + "epoch": 0.29, + "grad_norm": 0.4824304657385116, + "learning_rate": 8.325650539658329e-07, + "loss": 0.2923, + "step": 4553 + }, + { + "epoch": 0.29, + "grad_norm": 0.6166678229650034, + "learning_rate": 8.324879293975257e-07, + "loss": 0.2142, + "step": 4554 + }, + { + "epoch": 0.29, + "grad_norm": 2.9273594079410605, + "learning_rate": 8.32410790644615e-07, + "loss": 0.1816, + "step": 4555 + }, + { + "epoch": 0.29, + "grad_norm": 1.2826245959745397, + "learning_rate": 8.323336377103917e-07, + "loss": 0.3449, + "step": 4556 + }, + { + "epoch": 0.29, + "grad_norm": 0.6230622406872659, + "learning_rate": 8.322564705981474e-07, + "loss": 0.6153, + "step": 4557 + }, + { + "epoch": 0.29, + "grad_norm": 0.3643263834950898, + "learning_rate": 8.321792893111741e-07, + "loss": 0.1063, + "step": 4558 + }, + { + "epoch": 0.29, + "grad_norm": 1.0041656352019737, + "learning_rate": 8.321020938527645e-07, + "loss": 0.3541, + "step": 4559 + }, + { + "epoch": 0.29, + "grad_norm": 0.875425035071312, + "learning_rate": 8.320248842262121e-07, + "loss": 0.2437, + "step": 4560 + }, + { + "epoch": 0.29, + "grad_norm": 3.158182413474993, + "learning_rate": 8.319476604348106e-07, + "loss": 0.0853, + "step": 4561 + }, + { + "epoch": 0.29, + "grad_norm": 1.1630444341751074, + "learning_rate": 8.318704224818547e-07, + "loss": 0.2336, + "step": 4562 + }, + { + "epoch": 0.29, + "grad_norm": 0.741685929124548, + "learning_rate": 8.317931703706393e-07, + "loss": 0.2526, + "step": 4563 + }, + { + "epoch": 0.29, + "grad_norm": 0.1489538592135694, + "learning_rate": 8.317159041044604e-07, + "loss": 0.0017, + "step": 4564 + }, + { + "epoch": 0.29, + "grad_norm": 0.5622834596398911, + "learning_rate": 8.316386236866142e-07, + "loss": 0.2137, + "step": 4565 + }, + { + "epoch": 0.29, + "grad_norm": 2.4250045625899355, + "learning_rate": 8.315613291203976e-07, + "loss": 0.1515, + "step": 4566 + }, + { + "epoch": 0.29, + "grad_norm": 0.9054182827221595, + "learning_rate": 8.314840204091082e-07, + "loss": 0.3344, + "step": 4567 + }, + { + "epoch": 0.29, + "grad_norm": 1.0175592120730144, + "learning_rate": 8.314066975560441e-07, + "loss": 0.3002, + "step": 4568 + }, + { + "epoch": 0.29, + "grad_norm": 0.7709846955208957, + "learning_rate": 8.31329360564504e-07, + "loss": 0.2873, + "step": 4569 + }, + { + "epoch": 0.29, + "grad_norm": 1.1544419692367198, + "learning_rate": 8.312520094377872e-07, + "loss": 0.0196, + "step": 4570 + }, + { + "epoch": 0.29, + "grad_norm": 0.860307808512491, + "learning_rate": 8.311746441791941e-07, + "loss": 0.2551, + "step": 4571 + }, + { + "epoch": 0.29, + "grad_norm": 0.36251735774144683, + "learning_rate": 8.310972647920247e-07, + "loss": 0.4366, + "step": 4572 + }, + { + "epoch": 0.29, + "grad_norm": 4.193437641023994, + "learning_rate": 8.310198712795806e-07, + "loss": 0.1652, + "step": 4573 + }, + { + "epoch": 0.29, + "grad_norm": 0.4583817126495332, + "learning_rate": 8.309424636451632e-07, + "loss": 0.0906, + "step": 4574 + }, + { + "epoch": 0.29, + "grad_norm": 0.57855699259455, + "learning_rate": 8.308650418920751e-07, + "loss": 0.3078, + "step": 4575 + }, + { + "epoch": 0.29, + "grad_norm": 0.6839847152348562, + "learning_rate": 8.307876060236191e-07, + "loss": 0.1316, + "step": 4576 + }, + { + "epoch": 0.29, + "grad_norm": 0.7829264233824589, + "learning_rate": 8.307101560430989e-07, + "loss": 0.2309, + "step": 4577 + }, + { + "epoch": 0.29, + "grad_norm": 1.2290653221772467, + "learning_rate": 8.306326919538186e-07, + "loss": 0.1449, + "step": 4578 + }, + { + "epoch": 0.29, + "grad_norm": 0.7162220578243333, + "learning_rate": 8.305552137590831e-07, + "loss": 0.206, + "step": 4579 + }, + { + "epoch": 0.29, + "grad_norm": 0.9666480360558779, + "learning_rate": 8.304777214621976e-07, + "loss": 0.1349, + "step": 4580 + }, + { + "epoch": 0.29, + "grad_norm": 1.1651099365529307, + "learning_rate": 8.304002150664682e-07, + "loss": 0.2618, + "step": 4581 + }, + { + "epoch": 0.29, + "grad_norm": 0.15035581837467768, + "learning_rate": 8.303226945752014e-07, + "loss": 0.0857, + "step": 4582 + }, + { + "epoch": 0.29, + "grad_norm": 0.6239645041727994, + "learning_rate": 8.302451599917044e-07, + "loss": 0.3263, + "step": 4583 + }, + { + "epoch": 0.29, + "grad_norm": 1.3917517143758265, + "learning_rate": 8.301676113192852e-07, + "loss": 0.28, + "step": 4584 + }, + { + "epoch": 0.29, + "grad_norm": 0.3866381229855671, + "learning_rate": 8.300900485612518e-07, + "loss": 0.0685, + "step": 4585 + }, + { + "epoch": 0.29, + "grad_norm": 0.4654222159644312, + "learning_rate": 8.300124717209134e-07, + "loss": 0.2056, + "step": 4586 + }, + { + "epoch": 0.29, + "grad_norm": 0.6494826203489248, + "learning_rate": 8.299348808015795e-07, + "loss": 0.0846, + "step": 4587 + }, + { + "epoch": 0.29, + "grad_norm": 0.8289824743122928, + "learning_rate": 8.298572758065602e-07, + "loss": 0.1841, + "step": 4588 + }, + { + "epoch": 0.29, + "grad_norm": 0.35527394307717897, + "learning_rate": 8.297796567391668e-07, + "loss": 0.1913, + "step": 4589 + }, + { + "epoch": 0.29, + "grad_norm": 0.6139715393711419, + "learning_rate": 8.297020236027101e-07, + "loss": 0.2243, + "step": 4590 + }, + { + "epoch": 0.29, + "grad_norm": 1.037744256135214, + "learning_rate": 8.296243764005022e-07, + "loss": 0.0647, + "step": 4591 + }, + { + "epoch": 0.29, + "grad_norm": 0.40674151977736805, + "learning_rate": 8.295467151358559e-07, + "loss": 0.098, + "step": 4592 + }, + { + "epoch": 0.29, + "grad_norm": 2.3260904276436847, + "learning_rate": 8.294690398120842e-07, + "loss": 0.1931, + "step": 4593 + }, + { + "epoch": 0.29, + "grad_norm": 1.3688959676407841, + "learning_rate": 8.29391350432501e-07, + "loss": 0.1907, + "step": 4594 + }, + { + "epoch": 0.29, + "grad_norm": 0.7784545250374711, + "learning_rate": 8.293136470004206e-07, + "loss": 0.2911, + "step": 4595 + }, + { + "epoch": 0.29, + "grad_norm": 0.7557252652780297, + "learning_rate": 8.29235929519158e-07, + "loss": 0.1921, + "step": 4596 + }, + { + "epoch": 0.29, + "grad_norm": 0.703423784255771, + "learning_rate": 8.291581979920288e-07, + "loss": 0.3141, + "step": 4597 + }, + { + "epoch": 0.29, + "grad_norm": 0.5752349269239349, + "learning_rate": 8.290804524223491e-07, + "loss": 0.0607, + "step": 4598 + }, + { + "epoch": 0.29, + "grad_norm": 0.9029668416968109, + "learning_rate": 8.29002692813436e-07, + "loss": 0.1323, + "step": 4599 + }, + { + "epoch": 0.29, + "grad_norm": 2.2662017446158336, + "learning_rate": 8.289249191686063e-07, + "loss": 0.3032, + "step": 4600 + }, + { + "epoch": 0.29, + "grad_norm": 0.7251310077502929, + "learning_rate": 8.288471314911786e-07, + "loss": 0.1822, + "step": 4601 + }, + { + "epoch": 0.29, + "grad_norm": 0.11923768026772609, + "learning_rate": 8.287693297844711e-07, + "loss": 0.0059, + "step": 4602 + }, + { + "epoch": 0.29, + "grad_norm": 0.6914129724847398, + "learning_rate": 8.286915140518032e-07, + "loss": 0.081, + "step": 4603 + }, + { + "epoch": 0.29, + "grad_norm": 0.5500244242234917, + "learning_rate": 8.286136842964944e-07, + "loss": 0.3442, + "step": 4604 + }, + { + "epoch": 0.29, + "grad_norm": 0.5948898502347938, + "learning_rate": 8.285358405218654e-07, + "loss": 0.204, + "step": 4605 + }, + { + "epoch": 0.29, + "grad_norm": 0.5806936398897938, + "learning_rate": 8.284579827312368e-07, + "loss": 0.1383, + "step": 4606 + }, + { + "epoch": 0.29, + "grad_norm": 0.2924043730778653, + "learning_rate": 8.283801109279305e-07, + "loss": 0.007, + "step": 4607 + }, + { + "epoch": 0.29, + "grad_norm": 0.888021047655419, + "learning_rate": 8.283022251152685e-07, + "loss": 0.2459, + "step": 4608 + }, + { + "epoch": 0.29, + "grad_norm": 0.4635528743666432, + "learning_rate": 8.282243252965737e-07, + "loss": 0.2149, + "step": 4609 + }, + { + "epoch": 0.29, + "grad_norm": 1.7244105226734352, + "learning_rate": 8.281464114751692e-07, + "loss": 0.2014, + "step": 4610 + }, + { + "epoch": 0.29, + "grad_norm": 2.4907907933620943, + "learning_rate": 8.280684836543793e-07, + "loss": 0.1493, + "step": 4611 + }, + { + "epoch": 0.29, + "grad_norm": 1.6492772531144928, + "learning_rate": 8.279905418375283e-07, + "loss": 0.1192, + "step": 4612 + }, + { + "epoch": 0.29, + "grad_norm": 1.1096932116011753, + "learning_rate": 8.279125860279415e-07, + "loss": 0.4534, + "step": 4613 + }, + { + "epoch": 0.29, + "grad_norm": 0.5776216364701081, + "learning_rate": 8.278346162289445e-07, + "loss": 0.1065, + "step": 4614 + }, + { + "epoch": 0.29, + "grad_norm": 0.7320969443695299, + "learning_rate": 8.277566324438639e-07, + "loss": 0.3739, + "step": 4615 + }, + { + "epoch": 0.29, + "grad_norm": 1.5258340744849652, + "learning_rate": 8.276786346760262e-07, + "loss": 0.2819, + "step": 4616 + }, + { + "epoch": 0.29, + "grad_norm": 2.405493453938572, + "learning_rate": 8.276006229287594e-07, + "loss": 0.0787, + "step": 4617 + }, + { + "epoch": 0.29, + "grad_norm": 1.9396627536486883, + "learning_rate": 8.275225972053917e-07, + "loss": 0.0345, + "step": 4618 + }, + { + "epoch": 0.29, + "grad_norm": 0.8877227374545172, + "learning_rate": 8.274445575092512e-07, + "loss": 0.0973, + "step": 4619 + }, + { + "epoch": 0.29, + "grad_norm": 1.189231121656869, + "learning_rate": 8.273665038436679e-07, + "loss": 0.111, + "step": 4620 + }, + { + "epoch": 0.29, + "grad_norm": 2.423798873505761, + "learning_rate": 8.272884362119712e-07, + "loss": 0.2148, + "step": 4621 + }, + { + "epoch": 0.29, + "grad_norm": 2.554313924010514, + "learning_rate": 8.272103546174921e-07, + "loss": 0.0598, + "step": 4622 + }, + { + "epoch": 0.29, + "grad_norm": 0.46607967107758846, + "learning_rate": 8.271322590635615e-07, + "loss": 0.1261, + "step": 4623 + }, + { + "epoch": 0.29, + "grad_norm": 1.0127076897066063, + "learning_rate": 8.27054149553511e-07, + "loss": 0.2251, + "step": 4624 + }, + { + "epoch": 0.29, + "grad_norm": 0.9804034283357304, + "learning_rate": 8.269760260906731e-07, + "loss": 0.1116, + "step": 4625 + }, + { + "epoch": 0.3, + "grad_norm": 0.7108680094980235, + "learning_rate": 8.268978886783805e-07, + "loss": 0.311, + "step": 4626 + }, + { + "epoch": 0.3, + "grad_norm": 0.47837300243419517, + "learning_rate": 8.268197373199669e-07, + "loss": 0.0157, + "step": 4627 + }, + { + "epoch": 0.3, + "grad_norm": 0.8288377237677884, + "learning_rate": 8.267415720187662e-07, + "loss": 0.1729, + "step": 4628 + }, + { + "epoch": 0.3, + "grad_norm": 1.7902554788474534, + "learning_rate": 8.266633927781135e-07, + "loss": 0.3295, + "step": 4629 + }, + { + "epoch": 0.3, + "grad_norm": 1.550425026591602, + "learning_rate": 8.265851996013436e-07, + "loss": 0.0477, + "step": 4630 + }, + { + "epoch": 0.3, + "grad_norm": 0.7396059757718584, + "learning_rate": 8.265069924917924e-07, + "loss": 0.0871, + "step": 4631 + }, + { + "epoch": 0.3, + "grad_norm": 0.6696288100632679, + "learning_rate": 8.264287714527969e-07, + "loss": 0.0493, + "step": 4632 + }, + { + "epoch": 0.3, + "grad_norm": 0.5719594532710218, + "learning_rate": 8.263505364876937e-07, + "loss": 0.1919, + "step": 4633 + }, + { + "epoch": 0.3, + "grad_norm": 0.7864193915426383, + "learning_rate": 8.262722875998204e-07, + "loss": 0.1095, + "step": 4634 + }, + { + "epoch": 0.3, + "grad_norm": 0.9594146892344215, + "learning_rate": 8.261940247925154e-07, + "loss": 0.1684, + "step": 4635 + }, + { + "epoch": 0.3, + "grad_norm": 1.6935617213124876, + "learning_rate": 8.261157480691178e-07, + "loss": 0.3149, + "step": 4636 + }, + { + "epoch": 0.3, + "grad_norm": 1.38494366850966, + "learning_rate": 8.260374574329668e-07, + "loss": 0.1426, + "step": 4637 + }, + { + "epoch": 0.3, + "grad_norm": 0.7821993996292126, + "learning_rate": 8.259591528874022e-07, + "loss": 0.1792, + "step": 4638 + }, + { + "epoch": 0.3, + "grad_norm": 0.5951570960976268, + "learning_rate": 8.25880834435765e-07, + "loss": 0.1402, + "step": 4639 + }, + { + "epoch": 0.3, + "grad_norm": 1.3405780359458321, + "learning_rate": 8.258025020813963e-07, + "loss": 0.1172, + "step": 4640 + }, + { + "epoch": 0.3, + "grad_norm": 0.6995719513746204, + "learning_rate": 8.25724155827638e-07, + "loss": 0.0942, + "step": 4641 + }, + { + "epoch": 0.3, + "grad_norm": 1.2475965460830065, + "learning_rate": 8.256457956778324e-07, + "loss": 0.1576, + "step": 4642 + }, + { + "epoch": 0.3, + "grad_norm": 0.6485662326946411, + "learning_rate": 8.255674216353224e-07, + "loss": 0.0878, + "step": 4643 + }, + { + "epoch": 0.3, + "grad_norm": 0.9023323330171991, + "learning_rate": 8.254890337034519e-07, + "loss": 0.3953, + "step": 4644 + }, + { + "epoch": 0.3, + "grad_norm": 1.1302258505314244, + "learning_rate": 8.254106318855648e-07, + "loss": 0.0984, + "step": 4645 + }, + { + "epoch": 0.3, + "grad_norm": 0.3699190511740946, + "learning_rate": 8.25332216185006e-07, + "loss": 0.0037, + "step": 4646 + }, + { + "epoch": 0.3, + "grad_norm": 0.336241396869838, + "learning_rate": 8.252537866051208e-07, + "loss": 0.3013, + "step": 4647 + }, + { + "epoch": 0.3, + "grad_norm": 1.0047472333264649, + "learning_rate": 8.251753431492553e-07, + "loss": 0.0824, + "step": 4648 + }, + { + "epoch": 0.3, + "grad_norm": 0.7740210959644076, + "learning_rate": 8.250968858207559e-07, + "loss": 0.1275, + "step": 4649 + }, + { + "epoch": 0.3, + "grad_norm": 0.6199947318242983, + "learning_rate": 8.2501841462297e-07, + "loss": 0.1217, + "step": 4650 + }, + { + "epoch": 0.3, + "grad_norm": 0.8882994311764234, + "learning_rate": 8.24939929559245e-07, + "loss": 0.1321, + "step": 4651 + }, + { + "epoch": 0.3, + "grad_norm": 0.889286149368149, + "learning_rate": 8.248614306329295e-07, + "loss": 0.3605, + "step": 4652 + }, + { + "epoch": 0.3, + "grad_norm": 2.649976919422674, + "learning_rate": 8.247829178473722e-07, + "loss": 0.145, + "step": 4653 + }, + { + "epoch": 0.3, + "grad_norm": 0.9265333723529212, + "learning_rate": 8.247043912059228e-07, + "loss": 0.2944, + "step": 4654 + }, + { + "epoch": 0.3, + "grad_norm": 0.2498562724746331, + "learning_rate": 8.246258507119313e-07, + "loss": 0.0102, + "step": 4655 + }, + { + "epoch": 0.3, + "grad_norm": 2.876708751259279, + "learning_rate": 8.245472963687484e-07, + "loss": 0.0302, + "step": 4656 + }, + { + "epoch": 0.3, + "grad_norm": 0.8298423753477335, + "learning_rate": 8.244687281797254e-07, + "loss": 0.4475, + "step": 4657 + }, + { + "epoch": 0.3, + "grad_norm": 0.6697123832860922, + "learning_rate": 8.243901461482143e-07, + "loss": 0.096, + "step": 4658 + }, + { + "epoch": 0.3, + "grad_norm": 0.49985156868772807, + "learning_rate": 8.243115502775675e-07, + "loss": 0.1561, + "step": 4659 + }, + { + "epoch": 0.3, + "grad_norm": 0.8748840466630781, + "learning_rate": 8.242329405711379e-07, + "loss": 0.3036, + "step": 4660 + }, + { + "epoch": 0.3, + "grad_norm": 0.4727218219758068, + "learning_rate": 8.241543170322793e-07, + "loss": 0.1853, + "step": 4661 + }, + { + "epoch": 0.3, + "grad_norm": 0.6569204405804752, + "learning_rate": 8.240756796643459e-07, + "loss": 0.1419, + "step": 4662 + }, + { + "epoch": 0.3, + "grad_norm": 0.7974699904017554, + "learning_rate": 8.239970284706925e-07, + "loss": 0.1973, + "step": 4663 + }, + { + "epoch": 0.3, + "grad_norm": 2.2116589416155774, + "learning_rate": 8.239183634546746e-07, + "loss": 0.0917, + "step": 4664 + }, + { + "epoch": 0.3, + "grad_norm": 1.9486569978203159, + "learning_rate": 8.238396846196481e-07, + "loss": 0.2162, + "step": 4665 + }, + { + "epoch": 0.3, + "grad_norm": 0.585708348381636, + "learning_rate": 8.237609919689696e-07, + "loss": 0.2014, + "step": 4666 + }, + { + "epoch": 0.3, + "grad_norm": 0.7825492050299566, + "learning_rate": 8.236822855059965e-07, + "loss": 0.1459, + "step": 4667 + }, + { + "epoch": 0.3, + "grad_norm": 0.8328841281850551, + "learning_rate": 8.236035652340864e-07, + "loss": 0.4653, + "step": 4668 + }, + { + "epoch": 0.3, + "grad_norm": 0.7004520417155415, + "learning_rate": 8.235248311565977e-07, + "loss": 0.0492, + "step": 4669 + }, + { + "epoch": 0.3, + "grad_norm": 2.5439465978243794, + "learning_rate": 8.234460832768893e-07, + "loss": 0.266, + "step": 4670 + }, + { + "epoch": 0.3, + "grad_norm": 0.40523210034565077, + "learning_rate": 8.233673215983205e-07, + "loss": 0.2039, + "step": 4671 + }, + { + "epoch": 0.3, + "grad_norm": 0.8095650622236058, + "learning_rate": 8.23288546124252e-07, + "loss": 0.0136, + "step": 4672 + }, + { + "epoch": 0.3, + "grad_norm": 0.9191328261543144, + "learning_rate": 8.232097568580443e-07, + "loss": 0.0245, + "step": 4673 + }, + { + "epoch": 0.3, + "grad_norm": 1.0014710037084338, + "learning_rate": 8.231309538030585e-07, + "loss": 0.1874, + "step": 4674 + }, + { + "epoch": 0.3, + "grad_norm": 0.8227465043781447, + "learning_rate": 8.230521369626567e-07, + "loss": 0.1479, + "step": 4675 + }, + { + "epoch": 0.3, + "grad_norm": 0.8263908331466868, + "learning_rate": 8.229733063402012e-07, + "loss": 0.0944, + "step": 4676 + }, + { + "epoch": 0.3, + "grad_norm": 1.2417722896625074, + "learning_rate": 8.228944619390554e-07, + "loss": 0.1235, + "step": 4677 + }, + { + "epoch": 0.3, + "grad_norm": 0.5775048761976556, + "learning_rate": 8.228156037625826e-07, + "loss": 0.2096, + "step": 4678 + }, + { + "epoch": 0.3, + "grad_norm": 0.14880514998075245, + "learning_rate": 8.227367318141471e-07, + "loss": 0.0576, + "step": 4679 + }, + { + "epoch": 0.3, + "grad_norm": 0.8514689445420394, + "learning_rate": 8.226578460971141e-07, + "loss": 0.2681, + "step": 4680 + }, + { + "epoch": 0.3, + "grad_norm": 0.353638450122576, + "learning_rate": 8.225789466148487e-07, + "loss": 0.0365, + "step": 4681 + }, + { + "epoch": 0.3, + "grad_norm": 1.3520576242029907, + "learning_rate": 8.225000333707169e-07, + "loss": 0.2149, + "step": 4682 + }, + { + "epoch": 0.3, + "grad_norm": 1.2241922239463783, + "learning_rate": 8.224211063680852e-07, + "loss": 0.0974, + "step": 4683 + }, + { + "epoch": 0.3, + "grad_norm": 1.0618219535173103, + "learning_rate": 8.223421656103212e-07, + "loss": 0.1163, + "step": 4684 + }, + { + "epoch": 0.3, + "grad_norm": 1.0534455178092248, + "learning_rate": 8.222632111007924e-07, + "loss": 0.1829, + "step": 4685 + }, + { + "epoch": 0.3, + "grad_norm": 0.5360778566064571, + "learning_rate": 8.221842428428672e-07, + "loss": 0.3163, + "step": 4686 + }, + { + "epoch": 0.3, + "grad_norm": 0.407809610791214, + "learning_rate": 8.221052608399144e-07, + "loss": 0.0037, + "step": 4687 + }, + { + "epoch": 0.3, + "grad_norm": 0.5739234045881964, + "learning_rate": 8.220262650953037e-07, + "loss": 0.1646, + "step": 4688 + }, + { + "epoch": 0.3, + "grad_norm": 0.4669324054169145, + "learning_rate": 8.219472556124052e-07, + "loss": 0.4065, + "step": 4689 + }, + { + "epoch": 0.3, + "grad_norm": 0.5825268662027375, + "learning_rate": 8.218682323945895e-07, + "loss": 0.2036, + "step": 4690 + }, + { + "epoch": 0.3, + "grad_norm": 4.4307318733168914, + "learning_rate": 8.217891954452281e-07, + "loss": 0.3523, + "step": 4691 + }, + { + "epoch": 0.3, + "grad_norm": 1.0414104009912146, + "learning_rate": 8.217101447676928e-07, + "loss": 0.4357, + "step": 4692 + }, + { + "epoch": 0.3, + "grad_norm": 0.35544067754905573, + "learning_rate": 8.216310803653559e-07, + "loss": 0.2828, + "step": 4693 + }, + { + "epoch": 0.3, + "grad_norm": 0.760208699033786, + "learning_rate": 8.215520022415905e-07, + "loss": 0.2042, + "step": 4694 + }, + { + "epoch": 0.3, + "grad_norm": 0.5296508099772066, + "learning_rate": 8.214729103997704e-07, + "loss": 0.2238, + "step": 4695 + }, + { + "epoch": 0.3, + "grad_norm": 1.5131090269687375, + "learning_rate": 8.213938048432696e-07, + "loss": 0.152, + "step": 4696 + }, + { + "epoch": 0.3, + "grad_norm": 2.266830854764602, + "learning_rate": 8.213146855754632e-07, + "loss": 0.1553, + "step": 4697 + }, + { + "epoch": 0.3, + "grad_norm": 0.7225666498136933, + "learning_rate": 8.212355525997261e-07, + "loss": 0.2243, + "step": 4698 + }, + { + "epoch": 0.3, + "grad_norm": 1.1794197600511782, + "learning_rate": 8.211564059194347e-07, + "loss": 0.0153, + "step": 4699 + }, + { + "epoch": 0.3, + "grad_norm": 0.8186364889420014, + "learning_rate": 8.210772455379656e-07, + "loss": 0.1994, + "step": 4700 + }, + { + "epoch": 0.3, + "grad_norm": 0.8741105299284755, + "learning_rate": 8.209980714586955e-07, + "loss": 0.3851, + "step": 4701 + }, + { + "epoch": 0.3, + "grad_norm": 0.9295748485479789, + "learning_rate": 8.209188836850024e-07, + "loss": 0.145, + "step": 4702 + }, + { + "epoch": 0.3, + "grad_norm": 0.8603070684898152, + "learning_rate": 8.208396822202647e-07, + "loss": 0.1078, + "step": 4703 + }, + { + "epoch": 0.3, + "grad_norm": 2.1274263718944066, + "learning_rate": 8.207604670678612e-07, + "loss": 0.0527, + "step": 4704 + }, + { + "epoch": 0.3, + "grad_norm": 0.4563964387411023, + "learning_rate": 8.206812382311712e-07, + "loss": 0.2161, + "step": 4705 + }, + { + "epoch": 0.3, + "grad_norm": 0.7493729818410559, + "learning_rate": 8.20601995713575e-07, + "loss": 0.4807, + "step": 4706 + }, + { + "epoch": 0.3, + "grad_norm": 0.8077790117145862, + "learning_rate": 8.205227395184533e-07, + "loss": 0.0051, + "step": 4707 + }, + { + "epoch": 0.3, + "grad_norm": 2.3994268722811354, + "learning_rate": 8.204434696491871e-07, + "loss": 0.0114, + "step": 4708 + }, + { + "epoch": 0.3, + "grad_norm": 1.9107950272394667, + "learning_rate": 8.203641861091582e-07, + "loss": 0.0118, + "step": 4709 + }, + { + "epoch": 0.3, + "grad_norm": 0.7385816562045615, + "learning_rate": 8.202848889017493e-07, + "loss": 0.2511, + "step": 4710 + }, + { + "epoch": 0.3, + "grad_norm": 0.14026937237341053, + "learning_rate": 8.202055780303431e-07, + "loss": 0.0637, + "step": 4711 + }, + { + "epoch": 0.3, + "grad_norm": 1.0653851814224091, + "learning_rate": 8.201262534983232e-07, + "loss": 0.156, + "step": 4712 + }, + { + "epoch": 0.3, + "grad_norm": 0.8707820745859838, + "learning_rate": 8.200469153090739e-07, + "loss": 0.202, + "step": 4713 + }, + { + "epoch": 0.3, + "grad_norm": 5.559567352341435, + "learning_rate": 8.199675634659798e-07, + "loss": 0.2786, + "step": 4714 + }, + { + "epoch": 0.3, + "grad_norm": 0.8831184985543606, + "learning_rate": 8.198881979724262e-07, + "loss": 0.023, + "step": 4715 + }, + { + "epoch": 0.3, + "grad_norm": 0.7523378082412079, + "learning_rate": 8.19808818831799e-07, + "loss": 0.2363, + "step": 4716 + }, + { + "epoch": 0.3, + "grad_norm": 1.2402894662560693, + "learning_rate": 8.197294260474848e-07, + "loss": 0.2434, + "step": 4717 + }, + { + "epoch": 0.3, + "grad_norm": 1.9662177624727837, + "learning_rate": 8.196500196228704e-07, + "loss": 0.2969, + "step": 4718 + }, + { + "epoch": 0.3, + "grad_norm": 1.8447923730615003, + "learning_rate": 8.195705995613436e-07, + "loss": 0.0784, + "step": 4719 + }, + { + "epoch": 0.3, + "grad_norm": 1.1200294678811002, + "learning_rate": 8.194911658662927e-07, + "loss": 0.356, + "step": 4720 + }, + { + "epoch": 0.3, + "grad_norm": 0.4122942864613039, + "learning_rate": 8.194117185411062e-07, + "loss": 0.1679, + "step": 4721 + }, + { + "epoch": 0.3, + "grad_norm": 0.5700926088637928, + "learning_rate": 8.193322575891739e-07, + "loss": 0.3432, + "step": 4722 + }, + { + "epoch": 0.3, + "grad_norm": 0.5156157924268693, + "learning_rate": 8.192527830138856e-07, + "loss": 0.0841, + "step": 4723 + }, + { + "epoch": 0.3, + "grad_norm": 0.4370874859474639, + "learning_rate": 8.191732948186316e-07, + "loss": 0.239, + "step": 4724 + }, + { + "epoch": 0.3, + "grad_norm": 1.093057648590071, + "learning_rate": 8.190937930068033e-07, + "loss": 0.2754, + "step": 4725 + }, + { + "epoch": 0.3, + "grad_norm": 1.2555123969253632, + "learning_rate": 8.190142775817923e-07, + "loss": 0.3539, + "step": 4726 + }, + { + "epoch": 0.3, + "grad_norm": 0.41472778104884717, + "learning_rate": 8.189347485469911e-07, + "loss": 0.1164, + "step": 4727 + }, + { + "epoch": 0.3, + "grad_norm": 1.8913508137970223, + "learning_rate": 8.188552059057923e-07, + "loss": 0.071, + "step": 4728 + }, + { + "epoch": 0.3, + "grad_norm": 1.278132125183237, + "learning_rate": 8.187756496615895e-07, + "loss": 0.17, + "step": 4729 + }, + { + "epoch": 0.3, + "grad_norm": 0.8443359027948496, + "learning_rate": 8.186960798177765e-07, + "loss": 0.2811, + "step": 4730 + }, + { + "epoch": 0.3, + "grad_norm": 0.5137199003709494, + "learning_rate": 8.18616496377748e-07, + "loss": 0.0148, + "step": 4731 + }, + { + "epoch": 0.3, + "grad_norm": 0.848923482668164, + "learning_rate": 8.185368993448993e-07, + "loss": 0.0536, + "step": 4732 + }, + { + "epoch": 0.3, + "grad_norm": 0.6488707373978129, + "learning_rate": 8.184572887226263e-07, + "loss": 0.0111, + "step": 4733 + }, + { + "epoch": 0.3, + "grad_norm": 0.27362005094595204, + "learning_rate": 8.183776645143252e-07, + "loss": 0.2206, + "step": 4734 + }, + { + "epoch": 0.3, + "grad_norm": 0.2135317880173803, + "learning_rate": 8.182980267233927e-07, + "loss": 0.016, + "step": 4735 + }, + { + "epoch": 0.3, + "grad_norm": 3.5562540739822275, + "learning_rate": 8.182183753532268e-07, + "loss": 0.3809, + "step": 4736 + }, + { + "epoch": 0.3, + "grad_norm": 0.4851006141667179, + "learning_rate": 8.18138710407225e-07, + "loss": 0.1858, + "step": 4737 + }, + { + "epoch": 0.3, + "grad_norm": 0.8018425586726883, + "learning_rate": 8.180590318887866e-07, + "loss": 0.1847, + "step": 4738 + }, + { + "epoch": 0.3, + "grad_norm": 0.3074670578761211, + "learning_rate": 8.179793398013102e-07, + "loss": 0.0349, + "step": 4739 + }, + { + "epoch": 0.3, + "grad_norm": 1.998361311194355, + "learning_rate": 8.178996341481961e-07, + "loss": 0.1196, + "step": 4740 + }, + { + "epoch": 0.3, + "grad_norm": 0.668081158103692, + "learning_rate": 8.178199149328446e-07, + "loss": 0.2594, + "step": 4741 + }, + { + "epoch": 0.3, + "grad_norm": 0.5350107573420417, + "learning_rate": 8.177401821586567e-07, + "loss": 0.2036, + "step": 4742 + }, + { + "epoch": 0.3, + "grad_norm": 0.3575336661143542, + "learning_rate": 8.176604358290338e-07, + "loss": 0.1067, + "step": 4743 + }, + { + "epoch": 0.3, + "grad_norm": 0.34226669859146513, + "learning_rate": 8.17580675947378e-07, + "loss": 0.1925, + "step": 4744 + }, + { + "epoch": 0.3, + "grad_norm": 0.5353285593550388, + "learning_rate": 8.175009025170922e-07, + "loss": 0.2514, + "step": 4745 + }, + { + "epoch": 0.3, + "grad_norm": 2.4845438326924945, + "learning_rate": 8.174211155415798e-07, + "loss": 0.2897, + "step": 4746 + }, + { + "epoch": 0.3, + "grad_norm": 1.6661578403739732, + "learning_rate": 8.173413150242444e-07, + "loss": 0.2717, + "step": 4747 + }, + { + "epoch": 0.3, + "grad_norm": 0.46315836179789466, + "learning_rate": 8.172615009684905e-07, + "loss": 0.2408, + "step": 4748 + }, + { + "epoch": 0.3, + "grad_norm": 0.8786415967748011, + "learning_rate": 8.171816733777232e-07, + "loss": 0.1703, + "step": 4749 + }, + { + "epoch": 0.3, + "grad_norm": 0.3677112592552782, + "learning_rate": 8.171018322553482e-07, + "loss": 0.1777, + "step": 4750 + }, + { + "epoch": 0.3, + "grad_norm": 2.531046619906957, + "learning_rate": 8.170219776047715e-07, + "loss": 0.3927, + "step": 4751 + }, + { + "epoch": 0.3, + "grad_norm": 0.6553025686291939, + "learning_rate": 8.169421094294e-07, + "loss": 0.28, + "step": 4752 + }, + { + "epoch": 0.3, + "grad_norm": 0.7449111866321586, + "learning_rate": 8.168622277326409e-07, + "loss": 0.308, + "step": 4753 + }, + { + "epoch": 0.3, + "grad_norm": 0.49963409292098165, + "learning_rate": 8.167823325179023e-07, + "loss": 0.1949, + "step": 4754 + }, + { + "epoch": 0.3, + "grad_norm": 1.4718151113525169, + "learning_rate": 8.167024237885927e-07, + "loss": 0.2263, + "step": 4755 + }, + { + "epoch": 0.3, + "grad_norm": 0.45236383778448064, + "learning_rate": 8.166225015481208e-07, + "loss": 0.267, + "step": 4756 + }, + { + "epoch": 0.3, + "grad_norm": 0.6146614210267652, + "learning_rate": 8.165425657998966e-07, + "loss": 0.1569, + "step": 4757 + }, + { + "epoch": 0.3, + "grad_norm": 0.6358796438298233, + "learning_rate": 8.164626165473302e-07, + "loss": 0.243, + "step": 4758 + }, + { + "epoch": 0.3, + "grad_norm": 0.720794840933295, + "learning_rate": 8.163826537938323e-07, + "loss": 0.3877, + "step": 4759 + }, + { + "epoch": 0.3, + "grad_norm": 0.6339632423895353, + "learning_rate": 8.163026775428146e-07, + "loss": 0.1155, + "step": 4760 + }, + { + "epoch": 0.3, + "grad_norm": 0.7540032251864254, + "learning_rate": 8.162226877976886e-07, + "loss": 0.1105, + "step": 4761 + }, + { + "epoch": 0.3, + "grad_norm": 0.7409483817849162, + "learning_rate": 8.161426845618671e-07, + "loss": 0.1984, + "step": 4762 + }, + { + "epoch": 0.3, + "grad_norm": 0.8535988151051185, + "learning_rate": 8.160626678387632e-07, + "loss": 0.2088, + "step": 4763 + }, + { + "epoch": 0.3, + "grad_norm": 1.0317250470698331, + "learning_rate": 8.159826376317906e-07, + "loss": 0.2576, + "step": 4764 + }, + { + "epoch": 0.3, + "grad_norm": 0.7235580007157351, + "learning_rate": 8.159025939443634e-07, + "loss": 0.2651, + "step": 4765 + }, + { + "epoch": 0.3, + "grad_norm": 0.42120452784390594, + "learning_rate": 8.158225367798966e-07, + "loss": 0.2216, + "step": 4766 + }, + { + "epoch": 0.3, + "grad_norm": 0.2914330147067489, + "learning_rate": 8.157424661418054e-07, + "loss": 0.0033, + "step": 4767 + }, + { + "epoch": 0.3, + "grad_norm": 3.3571703923846106, + "learning_rate": 8.156623820335058e-07, + "loss": 0.3403, + "step": 4768 + }, + { + "epoch": 0.3, + "grad_norm": 1.9458826621621397, + "learning_rate": 8.155822844584145e-07, + "loss": 0.2017, + "step": 4769 + }, + { + "epoch": 0.3, + "grad_norm": 0.8301991987674295, + "learning_rate": 8.155021734199486e-07, + "loss": 0.0587, + "step": 4770 + }, + { + "epoch": 0.3, + "grad_norm": 0.13018554076094138, + "learning_rate": 8.154220489215256e-07, + "loss": 0.0026, + "step": 4771 + }, + { + "epoch": 0.3, + "grad_norm": 0.4448594031418288, + "learning_rate": 8.153419109665641e-07, + "loss": 0.1517, + "step": 4772 + }, + { + "epoch": 0.3, + "grad_norm": 0.5194526639357139, + "learning_rate": 8.152617595584825e-07, + "loss": 0.1278, + "step": 4773 + }, + { + "epoch": 0.3, + "grad_norm": 1.1607749301962398, + "learning_rate": 8.151815947007007e-07, + "loss": 0.4137, + "step": 4774 + }, + { + "epoch": 0.3, + "grad_norm": 3.8357605391741285, + "learning_rate": 8.151014163966384e-07, + "loss": 0.0445, + "step": 4775 + }, + { + "epoch": 0.3, + "grad_norm": 0.5136069215013145, + "learning_rate": 8.150212246497164e-07, + "loss": 0.099, + "step": 4776 + }, + { + "epoch": 0.3, + "grad_norm": 1.9389764063044246, + "learning_rate": 8.149410194633554e-07, + "loss": 0.0236, + "step": 4777 + }, + { + "epoch": 0.3, + "grad_norm": 0.23778314579438078, + "learning_rate": 8.148608008409775e-07, + "loss": 0.0924, + "step": 4778 + }, + { + "epoch": 0.3, + "grad_norm": 0.7102426735803246, + "learning_rate": 8.14780568786005e-07, + "loss": 0.0872, + "step": 4779 + }, + { + "epoch": 0.3, + "grad_norm": 1.6122280657389112, + "learning_rate": 8.147003233018605e-07, + "loss": 0.1284, + "step": 4780 + }, + { + "epoch": 0.3, + "grad_norm": 0.5522734828408397, + "learning_rate": 8.146200643919676e-07, + "loss": 0.162, + "step": 4781 + }, + { + "epoch": 0.3, + "grad_norm": 0.3747575208807832, + "learning_rate": 8.145397920597505e-07, + "loss": 0.0953, + "step": 4782 + }, + { + "epoch": 0.31, + "grad_norm": 0.7088931277898864, + "learning_rate": 8.144595063086335e-07, + "loss": 0.0691, + "step": 4783 + }, + { + "epoch": 0.31, + "grad_norm": 0.6193736346032409, + "learning_rate": 8.143792071420417e-07, + "loss": 0.0099, + "step": 4784 + }, + { + "epoch": 0.31, + "grad_norm": 1.0455965018983127, + "learning_rate": 8.142988945634009e-07, + "loss": 0.1422, + "step": 4785 + }, + { + "epoch": 0.31, + "grad_norm": 0.6153189687685371, + "learning_rate": 8.142185685761375e-07, + "loss": 0.1013, + "step": 4786 + }, + { + "epoch": 0.31, + "grad_norm": 0.5786634886782436, + "learning_rate": 8.141382291836783e-07, + "loss": 0.1876, + "step": 4787 + }, + { + "epoch": 0.31, + "grad_norm": 0.33442578786895083, + "learning_rate": 8.140578763894508e-07, + "loss": 0.0659, + "step": 4788 + }, + { + "epoch": 0.31, + "grad_norm": 0.7147668461843721, + "learning_rate": 8.139775101968829e-07, + "loss": 0.164, + "step": 4789 + }, + { + "epoch": 0.31, + "grad_norm": 4.151225999367275, + "learning_rate": 8.138971306094033e-07, + "loss": 0.3153, + "step": 4790 + }, + { + "epoch": 0.31, + "grad_norm": 1.0706416998260024, + "learning_rate": 8.13816737630441e-07, + "loss": 0.0871, + "step": 4791 + }, + { + "epoch": 0.31, + "grad_norm": 0.584298758890585, + "learning_rate": 8.137363312634258e-07, + "loss": 0.1895, + "step": 4792 + }, + { + "epoch": 0.31, + "grad_norm": 0.6246688718089787, + "learning_rate": 8.136559115117881e-07, + "loss": 0.2332, + "step": 4793 + }, + { + "epoch": 0.31, + "grad_norm": 1.2684598818504136, + "learning_rate": 8.135754783789587e-07, + "loss": 0.0355, + "step": 4794 + }, + { + "epoch": 0.31, + "grad_norm": 1.5161168130043927, + "learning_rate": 8.134950318683691e-07, + "loss": 0.1363, + "step": 4795 + }, + { + "epoch": 0.31, + "grad_norm": 0.645203944916089, + "learning_rate": 8.134145719834511e-07, + "loss": 0.1471, + "step": 4796 + }, + { + "epoch": 0.31, + "grad_norm": 0.7895951644051908, + "learning_rate": 8.133340987276375e-07, + "loss": 0.2596, + "step": 4797 + }, + { + "epoch": 0.31, + "grad_norm": 0.6105455471126823, + "learning_rate": 8.132536121043613e-07, + "loss": 0.1016, + "step": 4798 + }, + { + "epoch": 0.31, + "grad_norm": 0.7463146149380439, + "learning_rate": 8.131731121170563e-07, + "loss": 0.4144, + "step": 4799 + }, + { + "epoch": 0.31, + "grad_norm": 0.703424109933384, + "learning_rate": 8.130925987691568e-07, + "loss": 0.215, + "step": 4800 + }, + { + "epoch": 0.31, + "grad_norm": 1.1396604059930846, + "learning_rate": 8.130120720640976e-07, + "loss": 0.3308, + "step": 4801 + }, + { + "epoch": 0.31, + "grad_norm": 1.1795720050065013, + "learning_rate": 8.129315320053143e-07, + "loss": 0.1339, + "step": 4802 + }, + { + "epoch": 0.31, + "grad_norm": 0.8063374908340881, + "learning_rate": 8.128509785962427e-07, + "loss": 0.2179, + "step": 4803 + }, + { + "epoch": 0.31, + "grad_norm": 0.38986296951702754, + "learning_rate": 8.127704118403194e-07, + "loss": 0.2449, + "step": 4804 + }, + { + "epoch": 0.31, + "grad_norm": 1.3440784561137122, + "learning_rate": 8.126898317409816e-07, + "loss": 0.1668, + "step": 4805 + }, + { + "epoch": 0.31, + "grad_norm": 0.27625082109998883, + "learning_rate": 8.12609238301667e-07, + "loss": 0.1278, + "step": 4806 + }, + { + "epoch": 0.31, + "grad_norm": 0.5367519695412856, + "learning_rate": 8.125286315258139e-07, + "loss": 0.1201, + "step": 4807 + }, + { + "epoch": 0.31, + "grad_norm": 0.3586161860975536, + "learning_rate": 8.124480114168611e-07, + "loss": 0.1583, + "step": 4808 + }, + { + "epoch": 0.31, + "grad_norm": 0.5083697358079993, + "learning_rate": 8.12367377978248e-07, + "loss": 0.1727, + "step": 4809 + }, + { + "epoch": 0.31, + "grad_norm": 0.37507640697416395, + "learning_rate": 8.122867312134147e-07, + "loss": 0.2108, + "step": 4810 + }, + { + "epoch": 0.31, + "grad_norm": 0.9980083401077641, + "learning_rate": 8.122060711258017e-07, + "loss": 0.2243, + "step": 4811 + }, + { + "epoch": 0.31, + "grad_norm": 0.8726441713134545, + "learning_rate": 8.121253977188499e-07, + "loss": 0.266, + "step": 4812 + }, + { + "epoch": 0.31, + "grad_norm": 0.8992933782380833, + "learning_rate": 8.120447109960014e-07, + "loss": 0.3619, + "step": 4813 + }, + { + "epoch": 0.31, + "grad_norm": 0.26986011613380834, + "learning_rate": 8.11964010960698e-07, + "loss": 0.0722, + "step": 4814 + }, + { + "epoch": 0.31, + "grad_norm": 0.8749899382429204, + "learning_rate": 8.11883297616383e-07, + "loss": 0.2008, + "step": 4815 + }, + { + "epoch": 0.31, + "grad_norm": 0.41960032165463595, + "learning_rate": 8.118025709664994e-07, + "loss": 0.1805, + "step": 4816 + }, + { + "epoch": 0.31, + "grad_norm": 0.5619040980184855, + "learning_rate": 8.117218310144913e-07, + "loss": 0.3494, + "step": 4817 + }, + { + "epoch": 0.31, + "grad_norm": 0.3700513371571906, + "learning_rate": 8.116410777638034e-07, + "loss": 0.06, + "step": 4818 + }, + { + "epoch": 0.31, + "grad_norm": 0.669604798969257, + "learning_rate": 8.115603112178806e-07, + "loss": 0.2491, + "step": 4819 + }, + { + "epoch": 0.31, + "grad_norm": 1.265929445107414, + "learning_rate": 8.114795313801686e-07, + "loss": 0.1283, + "step": 4820 + }, + { + "epoch": 0.31, + "grad_norm": 2.125882682580364, + "learning_rate": 8.113987382541136e-07, + "loss": 0.0843, + "step": 4821 + }, + { + "epoch": 0.31, + "grad_norm": 0.5303452032060689, + "learning_rate": 8.113179318431624e-07, + "loss": 0.3097, + "step": 4822 + }, + { + "epoch": 0.31, + "grad_norm": 0.6740244575310982, + "learning_rate": 8.112371121507625e-07, + "loss": 0.1511, + "step": 4823 + }, + { + "epoch": 0.31, + "grad_norm": 1.8220889534531641, + "learning_rate": 8.111562791803617e-07, + "loss": 0.132, + "step": 4824 + }, + { + "epoch": 0.31, + "grad_norm": 5.358675217109917, + "learning_rate": 8.110754329354086e-07, + "loss": 0.3406, + "step": 4825 + }, + { + "epoch": 0.31, + "grad_norm": 4.891636297348018, + "learning_rate": 8.10994573419352e-07, + "loss": 0.2714, + "step": 4826 + }, + { + "epoch": 0.31, + "grad_norm": 0.2897380692260539, + "learning_rate": 8.109137006356419e-07, + "loss": 0.1168, + "step": 4827 + }, + { + "epoch": 0.31, + "grad_norm": 1.487431741184058, + "learning_rate": 8.108328145877282e-07, + "loss": 0.3414, + "step": 4828 + }, + { + "epoch": 0.31, + "grad_norm": 0.661817392387723, + "learning_rate": 8.107519152790619e-07, + "loss": 0.3947, + "step": 4829 + }, + { + "epoch": 0.31, + "grad_norm": 1.0208401780032037, + "learning_rate": 8.10671002713094e-07, + "loss": 0.2575, + "step": 4830 + }, + { + "epoch": 0.31, + "grad_norm": 1.2116693190295287, + "learning_rate": 8.105900768932767e-07, + "loss": 0.3512, + "step": 4831 + }, + { + "epoch": 0.31, + "grad_norm": 1.4987501829098024, + "learning_rate": 8.105091378230624e-07, + "loss": 0.0436, + "step": 4832 + }, + { + "epoch": 0.31, + "grad_norm": 0.8342103669152526, + "learning_rate": 8.10428185505904e-07, + "loss": 0.5072, + "step": 4833 + }, + { + "epoch": 0.31, + "grad_norm": 0.8757327035704235, + "learning_rate": 8.103472199452553e-07, + "loss": 0.4112, + "step": 4834 + }, + { + "epoch": 0.31, + "grad_norm": 0.5415438431193039, + "learning_rate": 8.102662411445702e-07, + "loss": 0.2089, + "step": 4835 + }, + { + "epoch": 0.31, + "grad_norm": 0.4508114582543965, + "learning_rate": 8.101852491073036e-07, + "loss": 0.262, + "step": 4836 + }, + { + "epoch": 0.31, + "grad_norm": 1.5815245402928737, + "learning_rate": 8.101042438369108e-07, + "loss": 0.1776, + "step": 4837 + }, + { + "epoch": 0.31, + "grad_norm": 0.8965872238094595, + "learning_rate": 8.100232253368474e-07, + "loss": 0.33, + "step": 4838 + }, + { + "epoch": 0.31, + "grad_norm": 0.35410971342035186, + "learning_rate": 8.099421936105702e-07, + "loss": 0.0168, + "step": 4839 + }, + { + "epoch": 0.31, + "grad_norm": 2.2150666514128075, + "learning_rate": 8.098611486615357e-07, + "loss": 0.0392, + "step": 4840 + }, + { + "epoch": 0.31, + "grad_norm": 1.033711989512499, + "learning_rate": 8.097800904932018e-07, + "loss": 0.2871, + "step": 4841 + }, + { + "epoch": 0.31, + "grad_norm": 1.0445909649421055, + "learning_rate": 8.096990191090265e-07, + "loss": 0.211, + "step": 4842 + }, + { + "epoch": 0.31, + "grad_norm": 3.588543664066317, + "learning_rate": 8.096179345124685e-07, + "loss": 0.1082, + "step": 4843 + }, + { + "epoch": 0.31, + "grad_norm": 0.914117310489967, + "learning_rate": 8.09536836706987e-07, + "loss": 0.0876, + "step": 4844 + }, + { + "epoch": 0.31, + "grad_norm": 0.36171155517429354, + "learning_rate": 8.094557256960419e-07, + "loss": 0.1091, + "step": 4845 + }, + { + "epoch": 0.31, + "grad_norm": 1.9676226298196176, + "learning_rate": 8.093746014830933e-07, + "loss": 0.0858, + "step": 4846 + }, + { + "epoch": 0.31, + "grad_norm": 0.6714200761593854, + "learning_rate": 8.092934640716023e-07, + "loss": 0.2552, + "step": 4847 + }, + { + "epoch": 0.31, + "grad_norm": 0.7545950416077731, + "learning_rate": 8.092123134650304e-07, + "loss": 0.3534, + "step": 4848 + }, + { + "epoch": 0.31, + "grad_norm": 0.4545882543203872, + "learning_rate": 8.091311496668396e-07, + "loss": 0.0389, + "step": 4849 + }, + { + "epoch": 0.31, + "grad_norm": 1.4086507364432175, + "learning_rate": 8.090499726804924e-07, + "loss": 0.3278, + "step": 4850 + }, + { + "epoch": 0.31, + "grad_norm": 0.6553200148490134, + "learning_rate": 8.089687825094524e-07, + "loss": 0.1967, + "step": 4851 + }, + { + "epoch": 0.31, + "grad_norm": 0.9493328856123927, + "learning_rate": 8.088875791571829e-07, + "loss": 0.0726, + "step": 4852 + }, + { + "epoch": 0.31, + "grad_norm": 0.7316791064460415, + "learning_rate": 8.088063626271482e-07, + "loss": 0.3481, + "step": 4853 + }, + { + "epoch": 0.31, + "grad_norm": 0.8532338216162878, + "learning_rate": 8.087251329228135e-07, + "loss": 0.4165, + "step": 4854 + }, + { + "epoch": 0.31, + "grad_norm": 1.182696331300221, + "learning_rate": 8.086438900476437e-07, + "loss": 0.0125, + "step": 4855 + }, + { + "epoch": 0.31, + "grad_norm": 1.2246967929530859, + "learning_rate": 8.085626340051054e-07, + "loss": 0.0899, + "step": 4856 + }, + { + "epoch": 0.31, + "grad_norm": 1.5421316152513138, + "learning_rate": 8.084813647986648e-07, + "loss": 0.1918, + "step": 4857 + }, + { + "epoch": 0.31, + "grad_norm": 0.5652342853883778, + "learning_rate": 8.08400082431789e-07, + "loss": 0.1306, + "step": 4858 + }, + { + "epoch": 0.31, + "grad_norm": 0.585916229816176, + "learning_rate": 8.083187869079458e-07, + "loss": 0.1555, + "step": 4859 + }, + { + "epoch": 0.31, + "grad_norm": 0.4553496465574186, + "learning_rate": 8.082374782306032e-07, + "loss": 0.1156, + "step": 4860 + }, + { + "epoch": 0.31, + "grad_norm": 0.2827301344247135, + "learning_rate": 8.081561564032302e-07, + "loss": 0.2535, + "step": 4861 + }, + { + "epoch": 0.31, + "grad_norm": 0.4708930856967913, + "learning_rate": 8.080748214292961e-07, + "loss": 0.1788, + "step": 4862 + }, + { + "epoch": 0.31, + "grad_norm": 0.895992974866067, + "learning_rate": 8.079934733122707e-07, + "loss": 0.2384, + "step": 4863 + }, + { + "epoch": 0.31, + "grad_norm": 0.9405193118592018, + "learning_rate": 8.079121120556247e-07, + "loss": 0.0869, + "step": 4864 + }, + { + "epoch": 0.31, + "grad_norm": 0.7232272466958303, + "learning_rate": 8.07830737662829e-07, + "loss": 0.1209, + "step": 4865 + }, + { + "epoch": 0.31, + "grad_norm": 2.066643556053722, + "learning_rate": 8.077493501373554e-07, + "loss": 0.1455, + "step": 4866 + }, + { + "epoch": 0.31, + "grad_norm": 0.47089877034493316, + "learning_rate": 8.076679494826757e-07, + "loss": 0.141, + "step": 4867 + }, + { + "epoch": 0.31, + "grad_norm": 1.121471486827043, + "learning_rate": 8.075865357022628e-07, + "loss": 0.046, + "step": 4868 + }, + { + "epoch": 0.31, + "grad_norm": 2.158549015956193, + "learning_rate": 8.075051087995899e-07, + "loss": 0.1862, + "step": 4869 + }, + { + "epoch": 0.31, + "grad_norm": 0.9730069506178508, + "learning_rate": 8.074236687781309e-07, + "loss": 0.4972, + "step": 4870 + }, + { + "epoch": 0.31, + "grad_norm": 1.0793488888012408, + "learning_rate": 8.073422156413603e-07, + "loss": 0.373, + "step": 4871 + }, + { + "epoch": 0.31, + "grad_norm": 1.1993090734989529, + "learning_rate": 8.072607493927528e-07, + "loss": 0.0676, + "step": 4872 + }, + { + "epoch": 0.31, + "grad_norm": 0.5660302550557087, + "learning_rate": 8.071792700357842e-07, + "loss": 0.2421, + "step": 4873 + }, + { + "epoch": 0.31, + "grad_norm": 0.7597048493635756, + "learning_rate": 8.070977775739304e-07, + "loss": 0.2999, + "step": 4874 + }, + { + "epoch": 0.31, + "grad_norm": 0.38808110568411536, + "learning_rate": 8.070162720106679e-07, + "loss": 0.2452, + "step": 4875 + }, + { + "epoch": 0.31, + "grad_norm": 0.8209025724927607, + "learning_rate": 8.069347533494744e-07, + "loss": 0.1285, + "step": 4876 + }, + { + "epoch": 0.31, + "grad_norm": 0.8209839213021729, + "learning_rate": 8.068532215938269e-07, + "loss": 0.2401, + "step": 4877 + }, + { + "epoch": 0.31, + "grad_norm": 0.9038983135784068, + "learning_rate": 8.067716767472044e-07, + "loss": 0.1235, + "step": 4878 + }, + { + "epoch": 0.31, + "grad_norm": 0.7128631175040728, + "learning_rate": 8.066901188130854e-07, + "loss": 0.3511, + "step": 4879 + }, + { + "epoch": 0.31, + "grad_norm": 0.7170598170129175, + "learning_rate": 8.066085477949494e-07, + "loss": 0.0227, + "step": 4880 + }, + { + "epoch": 0.31, + "grad_norm": 0.62173391419443, + "learning_rate": 8.065269636962763e-07, + "loss": 0.4253, + "step": 4881 + }, + { + "epoch": 0.31, + "grad_norm": 6.947325453129527, + "learning_rate": 8.06445366520547e-07, + "loss": 0.1273, + "step": 4882 + }, + { + "epoch": 0.31, + "grad_norm": 1.0286287147679303, + "learning_rate": 8.063637562712421e-07, + "loss": 0.1125, + "step": 4883 + }, + { + "epoch": 0.31, + "grad_norm": 1.187943991890367, + "learning_rate": 8.062821329518435e-07, + "loss": 0.1838, + "step": 4884 + }, + { + "epoch": 0.31, + "grad_norm": 0.2909373566917518, + "learning_rate": 8.062004965658336e-07, + "loss": 0.0796, + "step": 4885 + }, + { + "epoch": 0.31, + "grad_norm": 0.11145047618391998, + "learning_rate": 8.061188471166947e-07, + "loss": 0.0973, + "step": 4886 + }, + { + "epoch": 0.31, + "grad_norm": 0.5975929567822871, + "learning_rate": 8.060371846079106e-07, + "loss": 0.2333, + "step": 4887 + }, + { + "epoch": 0.31, + "grad_norm": 0.8888907966553493, + "learning_rate": 8.059555090429649e-07, + "loss": 0.3266, + "step": 4888 + }, + { + "epoch": 0.31, + "grad_norm": 0.6178344911289139, + "learning_rate": 8.058738204253421e-07, + "loss": 0.4046, + "step": 4889 + }, + { + "epoch": 0.31, + "grad_norm": 0.3801275160196431, + "learning_rate": 8.057921187585273e-07, + "loss": 0.2519, + "step": 4890 + }, + { + "epoch": 0.31, + "grad_norm": 1.8939016166809852, + "learning_rate": 8.057104040460061e-07, + "loss": 0.2632, + "step": 4891 + }, + { + "epoch": 0.31, + "grad_norm": 0.6356544303041343, + "learning_rate": 8.056286762912643e-07, + "loss": 0.1726, + "step": 4892 + }, + { + "epoch": 0.31, + "grad_norm": 0.3755835281239818, + "learning_rate": 8.055469354977889e-07, + "loss": 0.1538, + "step": 4893 + }, + { + "epoch": 0.31, + "grad_norm": 0.45209701749726516, + "learning_rate": 8.054651816690669e-07, + "loss": 0.0788, + "step": 4894 + }, + { + "epoch": 0.31, + "grad_norm": 0.584728209799264, + "learning_rate": 8.053834148085864e-07, + "loss": 0.1629, + "step": 4895 + }, + { + "epoch": 0.31, + "grad_norm": 0.6760059551912412, + "learning_rate": 8.053016349198354e-07, + "loss": 0.1113, + "step": 4896 + }, + { + "epoch": 0.31, + "grad_norm": 0.36028504095486713, + "learning_rate": 8.052198420063029e-07, + "loss": 0.0753, + "step": 4897 + }, + { + "epoch": 0.31, + "grad_norm": 0.3252513191684513, + "learning_rate": 8.051380360714783e-07, + "loss": 0.1073, + "step": 4898 + }, + { + "epoch": 0.31, + "grad_norm": 0.9650826262026042, + "learning_rate": 8.050562171188519e-07, + "loss": 0.3438, + "step": 4899 + }, + { + "epoch": 0.31, + "grad_norm": 0.6943397455835096, + "learning_rate": 8.049743851519139e-07, + "loss": 0.0816, + "step": 4900 + }, + { + "epoch": 0.31, + "grad_norm": 0.6936148521736064, + "learning_rate": 8.048925401741555e-07, + "loss": 0.2057, + "step": 4901 + }, + { + "epoch": 0.31, + "grad_norm": 1.5132755435461887, + "learning_rate": 8.048106821890686e-07, + "loss": 0.1608, + "step": 4902 + }, + { + "epoch": 0.31, + "grad_norm": 0.6639518584758515, + "learning_rate": 8.04728811200145e-07, + "loss": 0.37, + "step": 4903 + }, + { + "epoch": 0.31, + "grad_norm": 0.7177421838414897, + "learning_rate": 8.046469272108779e-07, + "loss": 0.2359, + "step": 4904 + }, + { + "epoch": 0.31, + "grad_norm": 1.5432046176468095, + "learning_rate": 8.045650302247604e-07, + "loss": 0.2579, + "step": 4905 + }, + { + "epoch": 0.31, + "grad_norm": 1.2688296023595715, + "learning_rate": 8.044831202452864e-07, + "loss": 0.111, + "step": 4906 + }, + { + "epoch": 0.31, + "grad_norm": 0.1354848906039784, + "learning_rate": 8.044011972759507e-07, + "loss": 0.0047, + "step": 4907 + }, + { + "epoch": 0.31, + "grad_norm": 0.3150780133923562, + "learning_rate": 8.043192613202479e-07, + "loss": 0.0636, + "step": 4908 + }, + { + "epoch": 0.31, + "grad_norm": 0.43616701414274484, + "learning_rate": 8.042373123816735e-07, + "loss": 0.1428, + "step": 4909 + }, + { + "epoch": 0.31, + "grad_norm": 2.4719856867245222, + "learning_rate": 8.041553504637237e-07, + "loss": 0.1362, + "step": 4910 + }, + { + "epoch": 0.31, + "grad_norm": 2.52111897260746, + "learning_rate": 8.040733755698955e-07, + "loss": 0.1843, + "step": 4911 + }, + { + "epoch": 0.31, + "grad_norm": 0.8024011424709545, + "learning_rate": 8.039913877036855e-07, + "loss": 0.3262, + "step": 4912 + }, + { + "epoch": 0.31, + "grad_norm": 0.9411942662342757, + "learning_rate": 8.03909386868592e-07, + "loss": 0.1619, + "step": 4913 + }, + { + "epoch": 0.31, + "grad_norm": 4.379404535635887, + "learning_rate": 8.038273730681131e-07, + "loss": 0.1127, + "step": 4914 + }, + { + "epoch": 0.31, + "grad_norm": 0.3482281884152491, + "learning_rate": 8.037453463057476e-07, + "loss": 0.2595, + "step": 4915 + }, + { + "epoch": 0.31, + "grad_norm": 1.5181406813260363, + "learning_rate": 8.036633065849952e-07, + "loss": 0.1973, + "step": 4916 + }, + { + "epoch": 0.31, + "grad_norm": 2.9929727899045164, + "learning_rate": 8.035812539093556e-07, + "loss": 0.2721, + "step": 4917 + }, + { + "epoch": 0.31, + "grad_norm": 0.41468132224453363, + "learning_rate": 8.034991882823295e-07, + "loss": 0.1691, + "step": 4918 + }, + { + "epoch": 0.31, + "grad_norm": 0.7564116362954902, + "learning_rate": 8.034171097074178e-07, + "loss": 0.2941, + "step": 4919 + }, + { + "epoch": 0.31, + "grad_norm": 1.0419087637830673, + "learning_rate": 8.033350181881223e-07, + "loss": 0.3469, + "step": 4920 + }, + { + "epoch": 0.31, + "grad_norm": 0.9600973468805437, + "learning_rate": 8.032529137279452e-07, + "loss": 0.3099, + "step": 4921 + }, + { + "epoch": 0.31, + "grad_norm": 0.8859863115949071, + "learning_rate": 8.03170796330389e-07, + "loss": 0.2808, + "step": 4922 + }, + { + "epoch": 0.31, + "grad_norm": 0.3729753295058565, + "learning_rate": 8.030886659989575e-07, + "loss": 0.1751, + "step": 4923 + }, + { + "epoch": 0.31, + "grad_norm": 1.7550129699030261, + "learning_rate": 8.03006522737154e-07, + "loss": 0.294, + "step": 4924 + }, + { + "epoch": 0.31, + "grad_norm": 1.1835190904054413, + "learning_rate": 8.029243665484832e-07, + "loss": 0.2282, + "step": 4925 + }, + { + "epoch": 0.31, + "grad_norm": 0.4322986351748098, + "learning_rate": 8.028421974364499e-07, + "loss": 0.3293, + "step": 4926 + }, + { + "epoch": 0.31, + "grad_norm": 0.6129394757652573, + "learning_rate": 8.027600154045597e-07, + "loss": 0.2775, + "step": 4927 + }, + { + "epoch": 0.31, + "grad_norm": 0.6999076153066456, + "learning_rate": 8.026778204563186e-07, + "loss": 0.3713, + "step": 4928 + }, + { + "epoch": 0.31, + "grad_norm": 0.528830069210588, + "learning_rate": 8.025956125952333e-07, + "loss": 0.1196, + "step": 4929 + }, + { + "epoch": 0.31, + "grad_norm": 1.061599656854315, + "learning_rate": 8.025133918248108e-07, + "loss": 0.2902, + "step": 4930 + }, + { + "epoch": 0.31, + "grad_norm": 0.5694961102214035, + "learning_rate": 8.024311581485588e-07, + "loss": 0.0822, + "step": 4931 + }, + { + "epoch": 0.31, + "grad_norm": 0.5561115290492764, + "learning_rate": 8.023489115699857e-07, + "loss": 0.1893, + "step": 4932 + }, + { + "epoch": 0.31, + "grad_norm": 0.6327645403669329, + "learning_rate": 8.022666520926003e-07, + "loss": 0.3066, + "step": 4933 + }, + { + "epoch": 0.31, + "grad_norm": 0.589508861362452, + "learning_rate": 8.021843797199119e-07, + "loss": 0.0339, + "step": 4934 + }, + { + "epoch": 0.31, + "grad_norm": 0.8778632804172244, + "learning_rate": 8.021020944554304e-07, + "loss": 0.0416, + "step": 4935 + }, + { + "epoch": 0.31, + "grad_norm": 0.7568867874491414, + "learning_rate": 8.020197963026662e-07, + "loss": 0.333, + "step": 4936 + }, + { + "epoch": 0.31, + "grad_norm": 0.5578812210514541, + "learning_rate": 8.019374852651302e-07, + "loss": 0.1473, + "step": 4937 + }, + { + "epoch": 0.31, + "grad_norm": 0.30723336690661857, + "learning_rate": 8.018551613463344e-07, + "loss": 0.2139, + "step": 4938 + }, + { + "epoch": 0.31, + "grad_norm": 0.2492561977176489, + "learning_rate": 8.017728245497903e-07, + "loss": 0.1933, + "step": 4939 + }, + { + "epoch": 0.32, + "grad_norm": 0.6182145102174467, + "learning_rate": 8.016904748790112e-07, + "loss": 0.1276, + "step": 4940 + }, + { + "epoch": 0.32, + "grad_norm": 0.31288799637361114, + "learning_rate": 8.016081123375097e-07, + "loss": 0.0768, + "step": 4941 + }, + { + "epoch": 0.32, + "grad_norm": 0.578768112587766, + "learning_rate": 8.015257369287999e-07, + "loss": 0.2145, + "step": 4942 + }, + { + "epoch": 0.32, + "grad_norm": 0.6243010586183911, + "learning_rate": 8.014433486563961e-07, + "loss": 0.2143, + "step": 4943 + }, + { + "epoch": 0.32, + "grad_norm": 0.5772522433540507, + "learning_rate": 8.01360947523813e-07, + "loss": 0.1336, + "step": 4944 + }, + { + "epoch": 0.32, + "grad_norm": 3.154009492996312, + "learning_rate": 8.01278533534566e-07, + "loss": 0.1029, + "step": 4945 + }, + { + "epoch": 0.32, + "grad_norm": 0.5637675702015749, + "learning_rate": 8.011961066921712e-07, + "loss": 0.3088, + "step": 4946 + }, + { + "epoch": 0.32, + "grad_norm": 0.8447865031556825, + "learning_rate": 8.01113667000145e-07, + "loss": 0.2675, + "step": 4947 + }, + { + "epoch": 0.32, + "grad_norm": 0.8427574294223372, + "learning_rate": 8.010312144620045e-07, + "loss": 0.0268, + "step": 4948 + }, + { + "epoch": 0.32, + "grad_norm": 0.6887712855980462, + "learning_rate": 8.009487490812671e-07, + "loss": 0.1334, + "step": 4949 + }, + { + "epoch": 0.32, + "grad_norm": 1.150689543029537, + "learning_rate": 8.008662708614513e-07, + "loss": 0.11, + "step": 4950 + }, + { + "epoch": 0.32, + "grad_norm": 0.19335587316152167, + "learning_rate": 8.007837798060754e-07, + "loss": 0.07, + "step": 4951 + }, + { + "epoch": 0.32, + "grad_norm": 5.580050594643492, + "learning_rate": 8.007012759186589e-07, + "loss": 0.2524, + "step": 4952 + }, + { + "epoch": 0.32, + "grad_norm": 0.3589227727435151, + "learning_rate": 8.006187592027213e-07, + "loss": 0.1275, + "step": 4953 + }, + { + "epoch": 0.32, + "grad_norm": 1.0979953933170263, + "learning_rate": 8.005362296617833e-07, + "loss": 0.4041, + "step": 4954 + }, + { + "epoch": 0.32, + "grad_norm": 0.8801438577089142, + "learning_rate": 8.004536872993655e-07, + "loss": 0.0907, + "step": 4955 + }, + { + "epoch": 0.32, + "grad_norm": 0.6296143796044339, + "learning_rate": 8.003711321189895e-07, + "loss": 0.0866, + "step": 4956 + }, + { + "epoch": 0.32, + "grad_norm": 0.7793038701861162, + "learning_rate": 8.00288564124177e-07, + "loss": 0.2095, + "step": 4957 + }, + { + "epoch": 0.32, + "grad_norm": 0.37715338537872434, + "learning_rate": 8.002059833184509e-07, + "loss": 0.0245, + "step": 4958 + }, + { + "epoch": 0.32, + "grad_norm": 0.7644760400658014, + "learning_rate": 8.001233897053339e-07, + "loss": 0.3113, + "step": 4959 + }, + { + "epoch": 0.32, + "grad_norm": 0.6204781145389356, + "learning_rate": 8.000407832883498e-07, + "loss": 0.2237, + "step": 4960 + }, + { + "epoch": 0.32, + "grad_norm": 0.6320265579175591, + "learning_rate": 7.999581640710229e-07, + "loss": 0.1138, + "step": 4961 + }, + { + "epoch": 0.32, + "grad_norm": 0.8650039911746136, + "learning_rate": 7.998755320568777e-07, + "loss": 0.0433, + "step": 4962 + }, + { + "epoch": 0.32, + "grad_norm": 0.4204365329912512, + "learning_rate": 7.997928872494393e-07, + "loss": 0.0049, + "step": 4963 + }, + { + "epoch": 0.32, + "grad_norm": 0.5922103444989494, + "learning_rate": 7.997102296522338e-07, + "loss": 0.3073, + "step": 4964 + }, + { + "epoch": 0.32, + "grad_norm": 0.4635430753988371, + "learning_rate": 7.996275592687873e-07, + "loss": 0.1315, + "step": 4965 + }, + { + "epoch": 0.32, + "grad_norm": 1.4221298468018966, + "learning_rate": 7.995448761026269e-07, + "loss": 0.0955, + "step": 4966 + }, + { + "epoch": 0.32, + "grad_norm": 0.7395847734958292, + "learning_rate": 7.994621801572799e-07, + "loss": 0.0599, + "step": 4967 + }, + { + "epoch": 0.32, + "grad_norm": 2.738676426321658, + "learning_rate": 7.993794714362743e-07, + "loss": 0.2013, + "step": 4968 + }, + { + "epoch": 0.32, + "grad_norm": 1.726919285608865, + "learning_rate": 7.992967499431386e-07, + "loss": 0.3061, + "step": 4969 + }, + { + "epoch": 0.32, + "grad_norm": 1.6451475246976672, + "learning_rate": 7.992140156814018e-07, + "loss": 0.1609, + "step": 4970 + }, + { + "epoch": 0.32, + "grad_norm": 0.7994856861678853, + "learning_rate": 7.991312686545937e-07, + "loss": 0.2259, + "step": 4971 + }, + { + "epoch": 0.32, + "grad_norm": 0.5418123855404976, + "learning_rate": 7.990485088662444e-07, + "loss": 0.0922, + "step": 4972 + }, + { + "epoch": 0.32, + "grad_norm": 1.2629885242231957, + "learning_rate": 7.989657363198844e-07, + "loss": 0.3647, + "step": 4973 + }, + { + "epoch": 0.32, + "grad_norm": 0.7995087081555955, + "learning_rate": 7.988829510190451e-07, + "loss": 0.3445, + "step": 4974 + }, + { + "epoch": 0.32, + "grad_norm": 0.6500852431645989, + "learning_rate": 7.988001529672586e-07, + "loss": 0.0953, + "step": 4975 + }, + { + "epoch": 0.32, + "grad_norm": 0.4354991781289608, + "learning_rate": 7.987173421680566e-07, + "loss": 0.3204, + "step": 4976 + }, + { + "epoch": 0.32, + "grad_norm": 0.45323271759854744, + "learning_rate": 7.986345186249724e-07, + "loss": 0.1694, + "step": 4977 + }, + { + "epoch": 0.32, + "grad_norm": 0.6257471185124001, + "learning_rate": 7.985516823415393e-07, + "loss": 0.2966, + "step": 4978 + }, + { + "epoch": 0.32, + "grad_norm": 1.9748700051738195, + "learning_rate": 7.984688333212911e-07, + "loss": 0.3093, + "step": 4979 + }, + { + "epoch": 0.32, + "grad_norm": 1.766471716127174, + "learning_rate": 7.983859715677626e-07, + "loss": 0.0133, + "step": 4980 + }, + { + "epoch": 0.32, + "grad_norm": 1.780145636884804, + "learning_rate": 7.983030970844886e-07, + "loss": 0.1167, + "step": 4981 + }, + { + "epoch": 0.32, + "grad_norm": 1.514590505070607, + "learning_rate": 7.98220209875005e-07, + "loss": 0.2056, + "step": 4982 + }, + { + "epoch": 0.32, + "grad_norm": 0.7500044643537416, + "learning_rate": 7.981373099428477e-07, + "loss": 0.1141, + "step": 4983 + }, + { + "epoch": 0.32, + "grad_norm": 1.9443011383275393, + "learning_rate": 7.980543972915534e-07, + "loss": 0.0075, + "step": 4984 + }, + { + "epoch": 0.32, + "grad_norm": 0.3823654877624576, + "learning_rate": 7.979714719246594e-07, + "loss": 0.1604, + "step": 4985 + }, + { + "epoch": 0.32, + "grad_norm": 0.5027893630783101, + "learning_rate": 7.978885338457033e-07, + "loss": 0.1724, + "step": 4986 + }, + { + "epoch": 0.32, + "grad_norm": 0.490008379196501, + "learning_rate": 7.978055830582235e-07, + "loss": 0.1139, + "step": 4987 + }, + { + "epoch": 0.32, + "grad_norm": 3.825093526811515, + "learning_rate": 7.97722619565759e-07, + "loss": 0.1198, + "step": 4988 + }, + { + "epoch": 0.32, + "grad_norm": 1.312071666363919, + "learning_rate": 7.976396433718491e-07, + "loss": 0.1773, + "step": 4989 + }, + { + "epoch": 0.32, + "grad_norm": 0.620296038300229, + "learning_rate": 7.975566544800336e-07, + "loss": 0.3288, + "step": 4990 + }, + { + "epoch": 0.32, + "grad_norm": 0.5364438222770149, + "learning_rate": 7.97473652893853e-07, + "loss": 0.229, + "step": 4991 + }, + { + "epoch": 0.32, + "grad_norm": 0.6438409456228207, + "learning_rate": 7.973906386168484e-07, + "loss": 0.3674, + "step": 4992 + }, + { + "epoch": 0.32, + "grad_norm": 1.1231179404140257, + "learning_rate": 7.973076116525613e-07, + "loss": 0.2368, + "step": 4993 + }, + { + "epoch": 0.32, + "grad_norm": 1.486323983764233, + "learning_rate": 7.972245720045339e-07, + "loss": 0.0239, + "step": 4994 + }, + { + "epoch": 0.32, + "grad_norm": 0.5149627780660425, + "learning_rate": 7.971415196763087e-07, + "loss": 0.1882, + "step": 4995 + }, + { + "epoch": 0.32, + "grad_norm": 0.48207292341649965, + "learning_rate": 7.97058454671429e-07, + "loss": 0.2135, + "step": 4996 + }, + { + "epoch": 0.32, + "grad_norm": 0.451393549295626, + "learning_rate": 7.969753769934385e-07, + "loss": 0.1189, + "step": 4997 + }, + { + "epoch": 0.32, + "grad_norm": 1.8268227433301185, + "learning_rate": 7.968922866458812e-07, + "loss": 0.0217, + "step": 4998 + }, + { + "epoch": 0.32, + "grad_norm": 1.8272394906628622, + "learning_rate": 7.968091836323024e-07, + "loss": 0.1134, + "step": 4999 + }, + { + "epoch": 0.32, + "grad_norm": 0.47506817233651355, + "learning_rate": 7.967260679562469e-07, + "loss": 0.2445, + "step": 5000 + }, + { + "epoch": 0.32, + "grad_norm": 0.5423948336077351, + "learning_rate": 7.966429396212609e-07, + "loss": 0.3635, + "step": 5001 + }, + { + "epoch": 0.32, + "grad_norm": 2.06826896756584, + "learning_rate": 7.96559798630891e-07, + "loss": 0.2838, + "step": 5002 + }, + { + "epoch": 0.32, + "grad_norm": 0.416550947442645, + "learning_rate": 7.964766449886837e-07, + "loss": 0.1613, + "step": 5003 + }, + { + "epoch": 0.32, + "grad_norm": 0.4322128171498775, + "learning_rate": 7.963934786981869e-07, + "loss": 0.0897, + "step": 5004 + }, + { + "epoch": 0.32, + "grad_norm": 0.3111004790221444, + "learning_rate": 7.963102997629483e-07, + "loss": 0.0062, + "step": 5005 + }, + { + "epoch": 0.32, + "grad_norm": 0.4275767116908408, + "learning_rate": 7.962271081865168e-07, + "loss": 0.088, + "step": 5006 + }, + { + "epoch": 0.32, + "grad_norm": 0.6975253562041295, + "learning_rate": 7.961439039724413e-07, + "loss": 0.1739, + "step": 5007 + }, + { + "epoch": 0.32, + "grad_norm": 0.4395987336849303, + "learning_rate": 7.960606871242715e-07, + "loss": 0.0743, + "step": 5008 + }, + { + "epoch": 0.32, + "grad_norm": 1.1549968254191867, + "learning_rate": 7.959774576455575e-07, + "loss": 0.3855, + "step": 5009 + }, + { + "epoch": 0.32, + "grad_norm": 0.9447495405402501, + "learning_rate": 7.958942155398505e-07, + "loss": 0.2621, + "step": 5010 + }, + { + "epoch": 0.32, + "grad_norm": 0.9002141884337671, + "learning_rate": 7.958109608107013e-07, + "loss": 0.2056, + "step": 5011 + }, + { + "epoch": 0.32, + "grad_norm": 0.47126443233971976, + "learning_rate": 7.957276934616617e-07, + "loss": 0.1012, + "step": 5012 + }, + { + "epoch": 0.32, + "grad_norm": 0.8064655018044103, + "learning_rate": 7.956444134962845e-07, + "loss": 0.209, + "step": 5013 + }, + { + "epoch": 0.32, + "grad_norm": 1.5132628000304005, + "learning_rate": 7.955611209181221e-07, + "loss": 0.3676, + "step": 5014 + }, + { + "epoch": 0.32, + "grad_norm": 0.7680638476418366, + "learning_rate": 7.954778157307282e-07, + "loss": 0.2247, + "step": 5015 + }, + { + "epoch": 0.32, + "grad_norm": 0.3801852438888622, + "learning_rate": 7.953944979376566e-07, + "loss": 0.1818, + "step": 5016 + }, + { + "epoch": 0.32, + "grad_norm": 1.0267432775653922, + "learning_rate": 7.953111675424621e-07, + "loss": 0.1018, + "step": 5017 + }, + { + "epoch": 0.32, + "grad_norm": 0.3157116719404107, + "learning_rate": 7.952278245486994e-07, + "loss": 0.0912, + "step": 5018 + }, + { + "epoch": 0.32, + "grad_norm": 0.677428114293082, + "learning_rate": 7.951444689599243e-07, + "loss": 0.062, + "step": 5019 + }, + { + "epoch": 0.32, + "grad_norm": 0.538623523501602, + "learning_rate": 7.950611007796926e-07, + "loss": 0.1322, + "step": 5020 + }, + { + "epoch": 0.32, + "grad_norm": 1.734947706556767, + "learning_rate": 7.949777200115614e-07, + "loss": 0.3189, + "step": 5021 + }, + { + "epoch": 0.32, + "grad_norm": 0.5005699296473773, + "learning_rate": 7.948943266590877e-07, + "loss": 0.1052, + "step": 5022 + }, + { + "epoch": 0.32, + "grad_norm": 0.956372170598805, + "learning_rate": 7.948109207258291e-07, + "loss": 0.038, + "step": 5023 + }, + { + "epoch": 0.32, + "grad_norm": 0.29887402833314614, + "learning_rate": 7.947275022153442e-07, + "loss": 0.1527, + "step": 5024 + }, + { + "epoch": 0.32, + "grad_norm": 0.5773722082686125, + "learning_rate": 7.946440711311913e-07, + "loss": 0.0651, + "step": 5025 + }, + { + "epoch": 0.32, + "grad_norm": 0.7473159928478327, + "learning_rate": 7.9456062747693e-07, + "loss": 0.3616, + "step": 5026 + }, + { + "epoch": 0.32, + "grad_norm": 1.1540229629180792, + "learning_rate": 7.944771712561205e-07, + "loss": 0.1379, + "step": 5027 + }, + { + "epoch": 0.32, + "grad_norm": 2.402763511775212, + "learning_rate": 7.943937024723227e-07, + "loss": 0.1207, + "step": 5028 + }, + { + "epoch": 0.32, + "grad_norm": 3.1898707403402686, + "learning_rate": 7.943102211290978e-07, + "loss": 0.157, + "step": 5029 + }, + { + "epoch": 0.32, + "grad_norm": 3.4718896143639624, + "learning_rate": 7.942267272300073e-07, + "loss": 0.3093, + "step": 5030 + }, + { + "epoch": 0.32, + "grad_norm": 0.6827378006526623, + "learning_rate": 7.941432207786129e-07, + "loss": 0.3476, + "step": 5031 + }, + { + "epoch": 0.32, + "grad_norm": 0.838382983533253, + "learning_rate": 7.940597017784777e-07, + "loss": 0.5095, + "step": 5032 + }, + { + "epoch": 0.32, + "grad_norm": 10.25771237914452, + "learning_rate": 7.939761702331643e-07, + "loss": 0.0783, + "step": 5033 + }, + { + "epoch": 0.32, + "grad_norm": 0.6697000274673348, + "learning_rate": 7.938926261462365e-07, + "loss": 0.1997, + "step": 5034 + }, + { + "epoch": 0.32, + "grad_norm": 0.45422559182740535, + "learning_rate": 7.938090695212586e-07, + "loss": 0.1168, + "step": 5035 + }, + { + "epoch": 0.32, + "grad_norm": 0.749265570823233, + "learning_rate": 7.937255003617951e-07, + "loss": 0.177, + "step": 5036 + }, + { + "epoch": 0.32, + "grad_norm": 3.2582617138738326, + "learning_rate": 7.936419186714112e-07, + "loss": 0.2065, + "step": 5037 + }, + { + "epoch": 0.32, + "grad_norm": 2.1733341664774195, + "learning_rate": 7.935583244536729e-07, + "loss": 0.1761, + "step": 5038 + }, + { + "epoch": 0.32, + "grad_norm": 1.290772020618876, + "learning_rate": 7.934747177121462e-07, + "loss": 0.1028, + "step": 5039 + }, + { + "epoch": 0.32, + "grad_norm": 0.6410520876912739, + "learning_rate": 7.933910984503982e-07, + "loss": 0.2144, + "step": 5040 + }, + { + "epoch": 0.32, + "grad_norm": 1.6934250674178204, + "learning_rate": 7.933074666719961e-07, + "loss": 0.0993, + "step": 5041 + }, + { + "epoch": 0.32, + "grad_norm": 1.047608512089884, + "learning_rate": 7.932238223805078e-07, + "loss": 0.2721, + "step": 5042 + }, + { + "epoch": 0.32, + "grad_norm": 0.18622970671123631, + "learning_rate": 7.93140165579502e-07, + "loss": 0.0026, + "step": 5043 + }, + { + "epoch": 0.32, + "grad_norm": 0.4924091168199843, + "learning_rate": 7.930564962725474e-07, + "loss": 0.0448, + "step": 5044 + }, + { + "epoch": 0.32, + "grad_norm": 1.0139061701689729, + "learning_rate": 7.929728144632134e-07, + "loss": 0.2306, + "step": 5045 + }, + { + "epoch": 0.32, + "grad_norm": 0.42514525358423894, + "learning_rate": 7.928891201550702e-07, + "loss": 0.1029, + "step": 5046 + }, + { + "epoch": 0.32, + "grad_norm": 0.6606443876500947, + "learning_rate": 7.928054133516884e-07, + "loss": 0.0526, + "step": 5047 + }, + { + "epoch": 0.32, + "grad_norm": 0.5533387802104182, + "learning_rate": 7.92721694056639e-07, + "loss": 0.2507, + "step": 5048 + }, + { + "epoch": 0.32, + "grad_norm": 4.392260737879013, + "learning_rate": 7.926379622734938e-07, + "loss": 0.0962, + "step": 5049 + }, + { + "epoch": 0.32, + "grad_norm": 0.6639063362367161, + "learning_rate": 7.925542180058248e-07, + "loss": 0.059, + "step": 5050 + }, + { + "epoch": 0.32, + "grad_norm": 0.5527057815532743, + "learning_rate": 7.924704612572048e-07, + "loss": 0.1333, + "step": 5051 + }, + { + "epoch": 0.32, + "grad_norm": 0.8813921275691163, + "learning_rate": 7.923866920312067e-07, + "loss": 0.245, + "step": 5052 + }, + { + "epoch": 0.32, + "grad_norm": 0.6499958517425046, + "learning_rate": 7.923029103314049e-07, + "loss": 0.2011, + "step": 5053 + }, + { + "epoch": 0.32, + "grad_norm": 0.6815565286916628, + "learning_rate": 7.922191161613731e-07, + "loss": 0.0448, + "step": 5054 + }, + { + "epoch": 0.32, + "grad_norm": 1.9708873600600691, + "learning_rate": 7.921353095246866e-07, + "loss": 0.2493, + "step": 5055 + }, + { + "epoch": 0.32, + "grad_norm": 0.3973793893255023, + "learning_rate": 7.920514904249204e-07, + "loss": 0.095, + "step": 5056 + }, + { + "epoch": 0.32, + "grad_norm": 0.4297452153589135, + "learning_rate": 7.919676588656505e-07, + "loss": 0.1796, + "step": 5057 + }, + { + "epoch": 0.32, + "grad_norm": 0.8032506156239824, + "learning_rate": 7.918838148504535e-07, + "loss": 0.1595, + "step": 5058 + }, + { + "epoch": 0.32, + "grad_norm": 0.622115335804525, + "learning_rate": 7.91799958382906e-07, + "loss": 0.1206, + "step": 5059 + }, + { + "epoch": 0.32, + "grad_norm": 0.7769521696973315, + "learning_rate": 7.917160894665858e-07, + "loss": 0.1069, + "step": 5060 + }, + { + "epoch": 0.32, + "grad_norm": 0.2420123982376446, + "learning_rate": 7.916322081050709e-07, + "loss": 0.1345, + "step": 5061 + }, + { + "epoch": 0.32, + "grad_norm": 2.3595717039836064, + "learning_rate": 7.915483143019395e-07, + "loss": 0.1815, + "step": 5062 + }, + { + "epoch": 0.32, + "grad_norm": 0.5631361585386891, + "learning_rate": 7.91464408060771e-07, + "loss": 0.2543, + "step": 5063 + }, + { + "epoch": 0.32, + "grad_norm": 0.5781748938014992, + "learning_rate": 7.91380489385145e-07, + "loss": 0.321, + "step": 5064 + }, + { + "epoch": 0.32, + "grad_norm": 0.4852625379275442, + "learning_rate": 7.912965582786415e-07, + "loss": 0.2118, + "step": 5065 + }, + { + "epoch": 0.32, + "grad_norm": 0.5333212149403556, + "learning_rate": 7.912126147448413e-07, + "loss": 0.1468, + "step": 5066 + }, + { + "epoch": 0.32, + "grad_norm": 0.18167122602406438, + "learning_rate": 7.911286587873256e-07, + "loss": 0.1121, + "step": 5067 + }, + { + "epoch": 0.32, + "grad_norm": 0.3149254689417239, + "learning_rate": 7.910446904096759e-07, + "loss": 0.1077, + "step": 5068 + }, + { + "epoch": 0.32, + "grad_norm": 6.907826203685545, + "learning_rate": 7.909607096154749e-07, + "loss": 0.318, + "step": 5069 + }, + { + "epoch": 0.32, + "grad_norm": 1.2544560723678784, + "learning_rate": 7.908767164083049e-07, + "loss": 0.3798, + "step": 5070 + }, + { + "epoch": 0.32, + "grad_norm": 3.1243881852506736, + "learning_rate": 7.907927107917495e-07, + "loss": 0.1088, + "step": 5071 + }, + { + "epoch": 0.32, + "grad_norm": 1.4039041042046467, + "learning_rate": 7.907086927693925e-07, + "loss": 0.1204, + "step": 5072 + }, + { + "epoch": 0.32, + "grad_norm": 1.337441881657516, + "learning_rate": 7.906246623448183e-07, + "loss": 0.2662, + "step": 5073 + }, + { + "epoch": 0.32, + "grad_norm": 1.3869484615283691, + "learning_rate": 7.905406195216117e-07, + "loss": 0.194, + "step": 5074 + }, + { + "epoch": 0.32, + "grad_norm": 1.3772397922865434, + "learning_rate": 7.904565643033583e-07, + "loss": 0.1906, + "step": 5075 + }, + { + "epoch": 0.32, + "grad_norm": 1.075791385822625, + "learning_rate": 7.90372496693644e-07, + "loss": 0.2967, + "step": 5076 + }, + { + "epoch": 0.32, + "grad_norm": 1.1951627905017583, + "learning_rate": 7.902884166960551e-07, + "loss": 0.0529, + "step": 5077 + }, + { + "epoch": 0.32, + "grad_norm": 0.4299172986861356, + "learning_rate": 7.902043243141789e-07, + "loss": 0.1034, + "step": 5078 + }, + { + "epoch": 0.32, + "grad_norm": 0.27911503588386616, + "learning_rate": 7.901202195516028e-07, + "loss": 0.0044, + "step": 5079 + }, + { + "epoch": 0.32, + "grad_norm": 0.3465537627965502, + "learning_rate": 7.90036102411915e-07, + "loss": 0.0845, + "step": 5080 + }, + { + "epoch": 0.32, + "grad_norm": 1.8558313789097676, + "learning_rate": 7.89951972898704e-07, + "loss": 0.169, + "step": 5081 + }, + { + "epoch": 0.32, + "grad_norm": 2.4629701720174717, + "learning_rate": 7.898678310155589e-07, + "loss": 0.1314, + "step": 5082 + }, + { + "epoch": 0.32, + "grad_norm": 0.5101844974842896, + "learning_rate": 7.897836767660695e-07, + "loss": 0.1962, + "step": 5083 + }, + { + "epoch": 0.32, + "grad_norm": 0.5551973788374136, + "learning_rate": 7.896995101538259e-07, + "loss": 0.1197, + "step": 5084 + }, + { + "epoch": 0.32, + "grad_norm": 0.5410604563063769, + "learning_rate": 7.896153311824188e-07, + "loss": 0.245, + "step": 5085 + }, + { + "epoch": 0.32, + "grad_norm": 1.950281909471845, + "learning_rate": 7.895311398554394e-07, + "loss": 0.2554, + "step": 5086 + }, + { + "epoch": 0.32, + "grad_norm": 0.8591973063541584, + "learning_rate": 7.894469361764798e-07, + "loss": 0.1654, + "step": 5087 + }, + { + "epoch": 0.32, + "grad_norm": 0.522447476544494, + "learning_rate": 7.893627201491318e-07, + "loss": 0.2683, + "step": 5088 + }, + { + "epoch": 0.32, + "grad_norm": 0.5860219876403301, + "learning_rate": 7.892784917769884e-07, + "loss": 0.1917, + "step": 5089 + }, + { + "epoch": 0.32, + "grad_norm": 0.3691056995316721, + "learning_rate": 7.891942510636431e-07, + "loss": 0.1302, + "step": 5090 + }, + { + "epoch": 0.32, + "grad_norm": 1.5598536136114467, + "learning_rate": 7.891099980126899e-07, + "loss": 0.4858, + "step": 5091 + }, + { + "epoch": 0.32, + "grad_norm": 0.6224550751677805, + "learning_rate": 7.890257326277227e-07, + "loss": 0.1352, + "step": 5092 + }, + { + "epoch": 0.32, + "grad_norm": 0.5249354254426418, + "learning_rate": 7.889414549123369e-07, + "loss": 0.0301, + "step": 5093 + }, + { + "epoch": 0.32, + "grad_norm": 0.5549734263590133, + "learning_rate": 7.888571648701277e-07, + "loss": 0.4364, + "step": 5094 + }, + { + "epoch": 0.32, + "grad_norm": 1.252075031879599, + "learning_rate": 7.887728625046912e-07, + "loss": 0.2175, + "step": 5095 + }, + { + "epoch": 0.32, + "grad_norm": 0.577590550688623, + "learning_rate": 7.88688547819624e-07, + "loss": 0.2646, + "step": 5096 + }, + { + "epoch": 0.33, + "grad_norm": 0.9450411854298872, + "learning_rate": 7.886042208185229e-07, + "loss": 0.2347, + "step": 5097 + }, + { + "epoch": 0.33, + "grad_norm": 0.5053901836705134, + "learning_rate": 7.885198815049857e-07, + "loss": 0.2114, + "step": 5098 + }, + { + "epoch": 0.33, + "grad_norm": 2.3266340655180504, + "learning_rate": 7.884355298826102e-07, + "loss": 0.1107, + "step": 5099 + }, + { + "epoch": 0.33, + "grad_norm": 0.9160849313524885, + "learning_rate": 7.883511659549952e-07, + "loss": 0.1369, + "step": 5100 + }, + { + "epoch": 0.33, + "grad_norm": 0.8819874018164324, + "learning_rate": 7.882667897257398e-07, + "loss": 0.3088, + "step": 5101 + }, + { + "epoch": 0.33, + "grad_norm": 0.6339437987921024, + "learning_rate": 7.881824011984437e-07, + "loss": 0.2548, + "step": 5102 + }, + { + "epoch": 0.33, + "grad_norm": 0.18967550245166595, + "learning_rate": 7.880980003767071e-07, + "loss": 0.0126, + "step": 5103 + }, + { + "epoch": 0.33, + "grad_norm": 0.7208254564609355, + "learning_rate": 7.880135872641305e-07, + "loss": 0.2752, + "step": 5104 + }, + { + "epoch": 0.33, + "grad_norm": 0.4640709049335785, + "learning_rate": 7.879291618643154e-07, + "loss": 0.1532, + "step": 5105 + }, + { + "epoch": 0.33, + "grad_norm": 0.8370361202927852, + "learning_rate": 7.878447241808634e-07, + "loss": 0.0573, + "step": 5106 + }, + { + "epoch": 0.33, + "grad_norm": 0.7902582978217804, + "learning_rate": 7.877602742173768e-07, + "loss": 0.1405, + "step": 5107 + }, + { + "epoch": 0.33, + "grad_norm": 0.6673761349895175, + "learning_rate": 7.876758119774585e-07, + "loss": 0.0331, + "step": 5108 + }, + { + "epoch": 0.33, + "grad_norm": 0.9481514783975585, + "learning_rate": 7.875913374647116e-07, + "loss": 0.2157, + "step": 5109 + }, + { + "epoch": 0.33, + "grad_norm": 0.7219554855726629, + "learning_rate": 7.875068506827403e-07, + "loss": 0.1573, + "step": 5110 + }, + { + "epoch": 0.33, + "grad_norm": 0.723677411654189, + "learning_rate": 7.874223516351487e-07, + "loss": 0.1056, + "step": 5111 + }, + { + "epoch": 0.33, + "grad_norm": 2.314401714303733, + "learning_rate": 7.873378403255419e-07, + "loss": 0.209, + "step": 5112 + }, + { + "epoch": 0.33, + "grad_norm": 1.9923260613096916, + "learning_rate": 7.872533167575251e-07, + "loss": 0.1989, + "step": 5113 + }, + { + "epoch": 0.33, + "grad_norm": 1.3756478802285395, + "learning_rate": 7.871687809347045e-07, + "loss": 0.2573, + "step": 5114 + }, + { + "epoch": 0.33, + "grad_norm": 0.960812771431285, + "learning_rate": 7.870842328606862e-07, + "loss": 0.1823, + "step": 5115 + }, + { + "epoch": 0.33, + "grad_norm": 0.7207169323889557, + "learning_rate": 7.869996725390775e-07, + "loss": 0.2595, + "step": 5116 + }, + { + "epoch": 0.33, + "grad_norm": 0.7463231305737968, + "learning_rate": 7.869150999734859e-07, + "loss": 0.0896, + "step": 5117 + }, + { + "epoch": 0.33, + "grad_norm": 0.819749616163352, + "learning_rate": 7.868305151675192e-07, + "loss": 0.2348, + "step": 5118 + }, + { + "epoch": 0.33, + "grad_norm": 1.1100931362103976, + "learning_rate": 7.867459181247863e-07, + "loss": 0.2892, + "step": 5119 + }, + { + "epoch": 0.33, + "grad_norm": 0.511772839504525, + "learning_rate": 7.866613088488961e-07, + "loss": 0.2312, + "step": 5120 + }, + { + "epoch": 0.33, + "grad_norm": 1.3402164407726527, + "learning_rate": 7.86576687343458e-07, + "loss": 0.0916, + "step": 5121 + }, + { + "epoch": 0.33, + "grad_norm": 0.9386732144001584, + "learning_rate": 7.864920536120825e-07, + "loss": 0.5051, + "step": 5122 + }, + { + "epoch": 0.33, + "grad_norm": 0.5137049325530657, + "learning_rate": 7.864074076583799e-07, + "loss": 0.2519, + "step": 5123 + }, + { + "epoch": 0.33, + "grad_norm": 3.1003313269417925, + "learning_rate": 7.863227494859619e-07, + "loss": 0.0859, + "step": 5124 + }, + { + "epoch": 0.33, + "grad_norm": 0.6060528816479874, + "learning_rate": 7.862380790984396e-07, + "loss": 0.2056, + "step": 5125 + }, + { + "epoch": 0.33, + "grad_norm": 0.6734469477166636, + "learning_rate": 7.861533964994254e-07, + "loss": 0.1005, + "step": 5126 + }, + { + "epoch": 0.33, + "grad_norm": 0.9326064729223768, + "learning_rate": 7.860687016925321e-07, + "loss": 0.1176, + "step": 5127 + }, + { + "epoch": 0.33, + "grad_norm": 0.5140312436999763, + "learning_rate": 7.859839946813729e-07, + "loss": 0.1963, + "step": 5128 + }, + { + "epoch": 0.33, + "grad_norm": 1.046058161699342, + "learning_rate": 7.858992754695617e-07, + "loss": 0.1158, + "step": 5129 + }, + { + "epoch": 0.33, + "grad_norm": 0.5555687076704139, + "learning_rate": 7.858145440607124e-07, + "loss": 0.2217, + "step": 5130 + }, + { + "epoch": 0.33, + "grad_norm": 1.0269123056921516, + "learning_rate": 7.857298004584403e-07, + "loss": 0.1093, + "step": 5131 + }, + { + "epoch": 0.33, + "grad_norm": 2.51625517947423, + "learning_rate": 7.856450446663605e-07, + "loss": 0.187, + "step": 5132 + }, + { + "epoch": 0.33, + "grad_norm": 1.529660144764917, + "learning_rate": 7.855602766880889e-07, + "loss": 0.1998, + "step": 5133 + }, + { + "epoch": 0.33, + "grad_norm": 1.7936212130851568, + "learning_rate": 7.854754965272419e-07, + "loss": 0.1397, + "step": 5134 + }, + { + "epoch": 0.33, + "grad_norm": 0.38627722105736007, + "learning_rate": 7.853907041874361e-07, + "loss": 0.2149, + "step": 5135 + }, + { + "epoch": 0.33, + "grad_norm": 0.7981630064468679, + "learning_rate": 7.853058996722895e-07, + "loss": 0.1107, + "step": 5136 + }, + { + "epoch": 0.33, + "grad_norm": 0.4979561136612249, + "learning_rate": 7.852210829854193e-07, + "loss": 0.2579, + "step": 5137 + }, + { + "epoch": 0.33, + "grad_norm": 2.6970440182009874, + "learning_rate": 7.851362541304446e-07, + "loss": 0.2837, + "step": 5138 + }, + { + "epoch": 0.33, + "grad_norm": 0.5434395421011616, + "learning_rate": 7.85051413110984e-07, + "loss": 0.0616, + "step": 5139 + }, + { + "epoch": 0.33, + "grad_norm": 0.9180870135310615, + "learning_rate": 7.849665599306572e-07, + "loss": 0.3237, + "step": 5140 + }, + { + "epoch": 0.33, + "grad_norm": 0.7764686954870078, + "learning_rate": 7.848816945930841e-07, + "loss": 0.1006, + "step": 5141 + }, + { + "epoch": 0.33, + "grad_norm": 2.104607388805604, + "learning_rate": 7.84796817101885e-07, + "loss": 0.2477, + "step": 5142 + }, + { + "epoch": 0.33, + "grad_norm": 1.474006178564293, + "learning_rate": 7.847119274606814e-07, + "loss": 0.2312, + "step": 5143 + }, + { + "epoch": 0.33, + "grad_norm": 0.7354791717268591, + "learning_rate": 7.846270256730945e-07, + "loss": 0.4119, + "step": 5144 + }, + { + "epoch": 0.33, + "grad_norm": 0.8495632459382707, + "learning_rate": 7.845421117427466e-07, + "loss": 0.2803, + "step": 5145 + }, + { + "epoch": 0.33, + "grad_norm": 0.9405873283152801, + "learning_rate": 7.844571856732601e-07, + "loss": 0.275, + "step": 5146 + }, + { + "epoch": 0.33, + "grad_norm": 1.1669588160239925, + "learning_rate": 7.843722474682583e-07, + "loss": 0.258, + "step": 5147 + }, + { + "epoch": 0.33, + "grad_norm": 0.41844449084585966, + "learning_rate": 7.842872971313648e-07, + "loss": 0.0111, + "step": 5148 + }, + { + "epoch": 0.33, + "grad_norm": 0.18090898926874738, + "learning_rate": 7.842023346662036e-07, + "loss": 0.0212, + "step": 5149 + }, + { + "epoch": 0.33, + "grad_norm": 0.9171412220481466, + "learning_rate": 7.841173600763996e-07, + "loss": 0.2245, + "step": 5150 + }, + { + "epoch": 0.33, + "grad_norm": 0.40540167800773685, + "learning_rate": 7.840323733655778e-07, + "loss": 0.1728, + "step": 5151 + }, + { + "epoch": 0.33, + "grad_norm": 0.37619284814137943, + "learning_rate": 7.839473745373642e-07, + "loss": 0.1055, + "step": 5152 + }, + { + "epoch": 0.33, + "grad_norm": 2.264378141577581, + "learning_rate": 7.838623635953845e-07, + "loss": 0.0205, + "step": 5153 + }, + { + "epoch": 0.33, + "grad_norm": 0.866356084279496, + "learning_rate": 7.83777340543266e-07, + "loss": 0.1617, + "step": 5154 + }, + { + "epoch": 0.33, + "grad_norm": 0.7757190835594977, + "learning_rate": 7.836923053846354e-07, + "loss": 0.2728, + "step": 5155 + }, + { + "epoch": 0.33, + "grad_norm": 0.3825698622205671, + "learning_rate": 7.836072581231209e-07, + "loss": 0.1509, + "step": 5156 + }, + { + "epoch": 0.33, + "grad_norm": 0.8802356116921449, + "learning_rate": 7.835221987623506e-07, + "loss": 0.2452, + "step": 5157 + }, + { + "epoch": 0.33, + "grad_norm": 0.3535140613776956, + "learning_rate": 7.834371273059534e-07, + "loss": 0.2698, + "step": 5158 + }, + { + "epoch": 0.33, + "grad_norm": 0.7063955805418285, + "learning_rate": 7.833520437575585e-07, + "loss": 0.4772, + "step": 5159 + }, + { + "epoch": 0.33, + "grad_norm": 0.8677992657140369, + "learning_rate": 7.832669481207958e-07, + "loss": 0.1712, + "step": 5160 + }, + { + "epoch": 0.33, + "grad_norm": 1.0564033872324807, + "learning_rate": 7.831818403992958e-07, + "loss": 0.2168, + "step": 5161 + }, + { + "epoch": 0.33, + "grad_norm": 0.1737715122182201, + "learning_rate": 7.830967205966892e-07, + "loss": 0.0726, + "step": 5162 + }, + { + "epoch": 0.33, + "grad_norm": 0.4813717911308954, + "learning_rate": 7.830115887166072e-07, + "loss": 0.0935, + "step": 5163 + }, + { + "epoch": 0.33, + "grad_norm": 0.21965268421480677, + "learning_rate": 7.82926444762682e-07, + "loss": 0.0828, + "step": 5164 + }, + { + "epoch": 0.33, + "grad_norm": 0.6091583475193905, + "learning_rate": 7.82841288738546e-07, + "loss": 0.1261, + "step": 5165 + }, + { + "epoch": 0.33, + "grad_norm": 0.41979584647571394, + "learning_rate": 7.82756120647832e-07, + "loss": 0.012, + "step": 5166 + }, + { + "epoch": 0.33, + "grad_norm": 0.3935984589136889, + "learning_rate": 7.826709404941735e-07, + "loss": 0.0103, + "step": 5167 + }, + { + "epoch": 0.33, + "grad_norm": 0.822421978783359, + "learning_rate": 7.825857482812044e-07, + "loss": 0.1331, + "step": 5168 + }, + { + "epoch": 0.33, + "grad_norm": 1.1806423913775659, + "learning_rate": 7.825005440125593e-07, + "loss": 0.2662, + "step": 5169 + }, + { + "epoch": 0.33, + "grad_norm": 0.15141360722514163, + "learning_rate": 7.824153276918731e-07, + "loss": 0.0022, + "step": 5170 + }, + { + "epoch": 0.33, + "grad_norm": 2.003885880421857, + "learning_rate": 7.82330099322781e-07, + "loss": 0.3928, + "step": 5171 + }, + { + "epoch": 0.33, + "grad_norm": 0.43733114030772724, + "learning_rate": 7.822448589089197e-07, + "loss": 0.1057, + "step": 5172 + }, + { + "epoch": 0.33, + "grad_norm": 1.6463146289694512, + "learning_rate": 7.821596064539251e-07, + "loss": 0.1843, + "step": 5173 + }, + { + "epoch": 0.33, + "grad_norm": 0.7131295128071986, + "learning_rate": 7.820743419614345e-07, + "loss": 0.2946, + "step": 5174 + }, + { + "epoch": 0.33, + "grad_norm": 0.3734179210841508, + "learning_rate": 7.819890654350855e-07, + "loss": 0.0989, + "step": 5175 + }, + { + "epoch": 0.33, + "grad_norm": 1.0249790109579513, + "learning_rate": 7.819037768785159e-07, + "loss": 0.3991, + "step": 5176 + }, + { + "epoch": 0.33, + "grad_norm": 0.4857390485229206, + "learning_rate": 7.818184762953648e-07, + "loss": 0.0315, + "step": 5177 + }, + { + "epoch": 0.33, + "grad_norm": 1.364107220300501, + "learning_rate": 7.817331636892708e-07, + "loss": 0.0135, + "step": 5178 + }, + { + "epoch": 0.33, + "grad_norm": 1.6163913761357585, + "learning_rate": 7.816478390638737e-07, + "loss": 0.0153, + "step": 5179 + }, + { + "epoch": 0.33, + "grad_norm": 0.7892463179352985, + "learning_rate": 7.815625024228136e-07, + "loss": 0.198, + "step": 5180 + }, + { + "epoch": 0.33, + "grad_norm": 0.4749420067595575, + "learning_rate": 7.814771537697311e-07, + "loss": 0.1432, + "step": 5181 + }, + { + "epoch": 0.33, + "grad_norm": 0.6439197328856424, + "learning_rate": 7.813917931082675e-07, + "loss": 0.1727, + "step": 5182 + }, + { + "epoch": 0.33, + "grad_norm": 0.9286148777239651, + "learning_rate": 7.813064204420643e-07, + "loss": 0.2073, + "step": 5183 + }, + { + "epoch": 0.33, + "grad_norm": 0.5687769427647333, + "learning_rate": 7.812210357747635e-07, + "loss": 0.1604, + "step": 5184 + }, + { + "epoch": 0.33, + "grad_norm": 1.1008413881744683, + "learning_rate": 7.811356391100081e-07, + "loss": 0.2216, + "step": 5185 + }, + { + "epoch": 0.33, + "grad_norm": 1.2699507225473614, + "learning_rate": 7.810502304514413e-07, + "loss": 0.2641, + "step": 5186 + }, + { + "epoch": 0.33, + "grad_norm": 2.1134241983281936, + "learning_rate": 7.809648098027066e-07, + "loss": 0.3247, + "step": 5187 + }, + { + "epoch": 0.33, + "grad_norm": 4.625339701996283, + "learning_rate": 7.808793771674484e-07, + "loss": 0.2065, + "step": 5188 + }, + { + "epoch": 0.33, + "grad_norm": 2.3914872799377487, + "learning_rate": 7.807939325493112e-07, + "loss": 0.2097, + "step": 5189 + }, + { + "epoch": 0.33, + "grad_norm": 7.376255327123015, + "learning_rate": 7.807084759519404e-07, + "loss": 0.1322, + "step": 5190 + }, + { + "epoch": 0.33, + "grad_norm": 0.6763194881524864, + "learning_rate": 7.806230073789818e-07, + "loss": 0.3152, + "step": 5191 + }, + { + "epoch": 0.33, + "grad_norm": 0.863660255909624, + "learning_rate": 7.805375268340814e-07, + "loss": 0.0923, + "step": 5192 + }, + { + "epoch": 0.33, + "grad_norm": 0.8275494096837781, + "learning_rate": 7.804520343208865e-07, + "loss": 0.2021, + "step": 5193 + }, + { + "epoch": 0.33, + "grad_norm": 3.269752146604219, + "learning_rate": 7.803665298430437e-07, + "loss": 0.1851, + "step": 5194 + }, + { + "epoch": 0.33, + "grad_norm": 0.699971489648117, + "learning_rate": 7.802810134042013e-07, + "loss": 0.1585, + "step": 5195 + }, + { + "epoch": 0.33, + "grad_norm": 4.795300966736381, + "learning_rate": 7.801954850080074e-07, + "loss": 0.105, + "step": 5196 + }, + { + "epoch": 0.33, + "grad_norm": 0.5764921562418399, + "learning_rate": 7.801099446581107e-07, + "loss": 0.2153, + "step": 5197 + }, + { + "epoch": 0.33, + "grad_norm": 0.5092916387115642, + "learning_rate": 7.800243923581609e-07, + "loss": 0.0748, + "step": 5198 + }, + { + "epoch": 0.33, + "grad_norm": 0.45864637235860056, + "learning_rate": 7.799388281118076e-07, + "loss": 0.1253, + "step": 5199 + }, + { + "epoch": 0.33, + "grad_norm": 0.2302589845756009, + "learning_rate": 7.79853251922701e-07, + "loss": 0.1033, + "step": 5200 + }, + { + "epoch": 0.33, + "grad_norm": 0.6796973251578036, + "learning_rate": 7.797676637944921e-07, + "loss": 0.2755, + "step": 5201 + }, + { + "epoch": 0.33, + "grad_norm": 0.9209543229656066, + "learning_rate": 7.796820637308323e-07, + "loss": 0.3127, + "step": 5202 + }, + { + "epoch": 0.33, + "grad_norm": 0.5949097161132205, + "learning_rate": 7.795964517353733e-07, + "loss": 0.3117, + "step": 5203 + }, + { + "epoch": 0.33, + "grad_norm": 0.6011857914950647, + "learning_rate": 7.795108278117678e-07, + "loss": 0.0846, + "step": 5204 + }, + { + "epoch": 0.33, + "grad_norm": 1.7723605848731894, + "learning_rate": 7.794251919636685e-07, + "loss": 0.1317, + "step": 5205 + }, + { + "epoch": 0.33, + "grad_norm": 0.42760420529026383, + "learning_rate": 7.793395441947287e-07, + "loss": 0.1835, + "step": 5206 + }, + { + "epoch": 0.33, + "grad_norm": 0.8185202619388978, + "learning_rate": 7.792538845086024e-07, + "loss": 0.1726, + "step": 5207 + }, + { + "epoch": 0.33, + "grad_norm": 1.3600276855074998, + "learning_rate": 7.79168212908944e-07, + "loss": 0.194, + "step": 5208 + }, + { + "epoch": 0.33, + "grad_norm": 1.1362800072731472, + "learning_rate": 7.790825293994086e-07, + "loss": 0.2714, + "step": 5209 + }, + { + "epoch": 0.33, + "grad_norm": 1.039725504216318, + "learning_rate": 7.789968339836514e-07, + "loss": 0.2195, + "step": 5210 + }, + { + "epoch": 0.33, + "grad_norm": 1.2588782562309624, + "learning_rate": 7.789111266653283e-07, + "loss": 0.3284, + "step": 5211 + }, + { + "epoch": 0.33, + "grad_norm": 1.9196694656825979, + "learning_rate": 7.78825407448096e-07, + "loss": 0.0946, + "step": 5212 + }, + { + "epoch": 0.33, + "grad_norm": 0.8374289363909836, + "learning_rate": 7.787396763356111e-07, + "loss": 0.3104, + "step": 5213 + }, + { + "epoch": 0.33, + "grad_norm": 0.7024580538454515, + "learning_rate": 7.786539333315315e-07, + "loss": 0.1579, + "step": 5214 + }, + { + "epoch": 0.33, + "grad_norm": 0.7842770416764756, + "learning_rate": 7.785681784395148e-07, + "loss": 0.2168, + "step": 5215 + }, + { + "epoch": 0.33, + "grad_norm": 0.9262774147299662, + "learning_rate": 7.784824116632196e-07, + "loss": 0.4278, + "step": 5216 + }, + { + "epoch": 0.33, + "grad_norm": 0.8510804981741832, + "learning_rate": 7.78396633006305e-07, + "loss": 0.261, + "step": 5217 + }, + { + "epoch": 0.33, + "grad_norm": 0.5361973129473463, + "learning_rate": 7.783108424724303e-07, + "loss": 0.399, + "step": 5218 + }, + { + "epoch": 0.33, + "grad_norm": 4.024259909503765, + "learning_rate": 7.782250400652556e-07, + "loss": 0.0766, + "step": 5219 + }, + { + "epoch": 0.33, + "grad_norm": 0.6120064942509154, + "learning_rate": 7.781392257884415e-07, + "loss": 0.1121, + "step": 5220 + }, + { + "epoch": 0.33, + "grad_norm": 0.3654686798068923, + "learning_rate": 7.780533996456489e-07, + "loss": 0.0905, + "step": 5221 + }, + { + "epoch": 0.33, + "grad_norm": 0.6877879776820551, + "learning_rate": 7.779675616405391e-07, + "loss": 0.2864, + "step": 5222 + }, + { + "epoch": 0.33, + "grad_norm": 0.8962972809336581, + "learning_rate": 7.778817117767747e-07, + "loss": 0.2471, + "step": 5223 + }, + { + "epoch": 0.33, + "grad_norm": 1.1438333180741294, + "learning_rate": 7.777958500580175e-07, + "loss": 0.2553, + "step": 5224 + }, + { + "epoch": 0.33, + "grad_norm": 6.01063537119916, + "learning_rate": 7.777099764879311e-07, + "loss": 0.1714, + "step": 5225 + }, + { + "epoch": 0.33, + "grad_norm": 0.547708033777952, + "learning_rate": 7.776240910701787e-07, + "loss": 0.2227, + "step": 5226 + }, + { + "epoch": 0.33, + "grad_norm": 0.9399754562836553, + "learning_rate": 7.775381938084245e-07, + "loss": 0.1651, + "step": 5227 + }, + { + "epoch": 0.33, + "grad_norm": 0.573631678102885, + "learning_rate": 7.77452284706333e-07, + "loss": 0.2619, + "step": 5228 + }, + { + "epoch": 0.33, + "grad_norm": 0.8042545184195301, + "learning_rate": 7.773663637675694e-07, + "loss": 0.3167, + "step": 5229 + }, + { + "epoch": 0.33, + "grad_norm": 2.7307787292330183, + "learning_rate": 7.77280430995799e-07, + "loss": 0.357, + "step": 5230 + }, + { + "epoch": 0.33, + "grad_norm": 0.41078368085224987, + "learning_rate": 7.771944863946882e-07, + "loss": 0.1782, + "step": 5231 + }, + { + "epoch": 0.33, + "grad_norm": 0.5626768386257761, + "learning_rate": 7.771085299679033e-07, + "loss": 0.1987, + "step": 5232 + }, + { + "epoch": 0.33, + "grad_norm": 0.30757453219627906, + "learning_rate": 7.770225617191115e-07, + "loss": 0.1097, + "step": 5233 + }, + { + "epoch": 0.33, + "grad_norm": 0.7249081777321619, + "learning_rate": 7.769365816519802e-07, + "loss": 0.0807, + "step": 5234 + }, + { + "epoch": 0.33, + "grad_norm": 1.9128744385671228, + "learning_rate": 7.768505897701777e-07, + "loss": 0.2948, + "step": 5235 + }, + { + "epoch": 0.33, + "grad_norm": 2.5775201829528736, + "learning_rate": 7.767645860773725e-07, + "loss": 0.1744, + "step": 5236 + }, + { + "epoch": 0.33, + "grad_norm": 0.7369713102491516, + "learning_rate": 7.766785705772338e-07, + "loss": 0.1896, + "step": 5237 + }, + { + "epoch": 0.33, + "grad_norm": 0.35607296837526103, + "learning_rate": 7.765925432734309e-07, + "loss": 0.186, + "step": 5238 + }, + { + "epoch": 0.33, + "grad_norm": 0.4936113410080766, + "learning_rate": 7.765065041696341e-07, + "loss": 0.1866, + "step": 5239 + }, + { + "epoch": 0.33, + "grad_norm": 0.6865003619440185, + "learning_rate": 7.764204532695141e-07, + "loss": 0.17, + "step": 5240 + }, + { + "epoch": 0.33, + "grad_norm": 0.3432388715961265, + "learning_rate": 7.763343905767419e-07, + "loss": 0.1504, + "step": 5241 + }, + { + "epoch": 0.33, + "grad_norm": 0.6067028995860944, + "learning_rate": 7.762483160949888e-07, + "loss": 0.1617, + "step": 5242 + }, + { + "epoch": 0.33, + "grad_norm": 0.9284293087567614, + "learning_rate": 7.761622298279276e-07, + "loss": 0.1262, + "step": 5243 + }, + { + "epoch": 0.33, + "grad_norm": 0.6042827755347948, + "learning_rate": 7.760761317792303e-07, + "loss": 0.0869, + "step": 5244 + }, + { + "epoch": 0.33, + "grad_norm": 0.6440464766518622, + "learning_rate": 7.759900219525703e-07, + "loss": 0.3204, + "step": 5245 + }, + { + "epoch": 0.33, + "grad_norm": 0.6139440459427701, + "learning_rate": 7.759039003516211e-07, + "loss": 0.0814, + "step": 5246 + }, + { + "epoch": 0.33, + "grad_norm": 0.3709636354163115, + "learning_rate": 7.758177669800568e-07, + "loss": 0.1332, + "step": 5247 + }, + { + "epoch": 0.33, + "grad_norm": 0.5544713127409657, + "learning_rate": 7.757316218415523e-07, + "loss": 0.2422, + "step": 5248 + }, + { + "epoch": 0.33, + "grad_norm": 1.157099304716162, + "learning_rate": 7.756454649397824e-07, + "loss": 0.052, + "step": 5249 + }, + { + "epoch": 0.33, + "grad_norm": 0.6744716019691676, + "learning_rate": 7.75559296278423e-07, + "loss": 0.1596, + "step": 5250 + }, + { + "epoch": 0.33, + "grad_norm": 0.6174406562037895, + "learning_rate": 7.754731158611498e-07, + "loss": 0.3179, + "step": 5251 + }, + { + "epoch": 0.33, + "grad_norm": 0.5190619052521371, + "learning_rate": 7.753869236916399e-07, + "loss": 0.3108, + "step": 5252 + }, + { + "epoch": 0.33, + "grad_norm": 0.9681623585174963, + "learning_rate": 7.753007197735703e-07, + "loss": 0.0746, + "step": 5253 + }, + { + "epoch": 0.34, + "grad_norm": 0.7248986660787387, + "learning_rate": 7.752145041106184e-07, + "loss": 0.2915, + "step": 5254 + }, + { + "epoch": 0.34, + "grad_norm": 0.8032978124242338, + "learning_rate": 7.751282767064626e-07, + "loss": 0.1974, + "step": 5255 + }, + { + "epoch": 0.34, + "grad_norm": 0.7604321560764785, + "learning_rate": 7.750420375647815e-07, + "loss": 0.3739, + "step": 5256 + }, + { + "epoch": 0.34, + "grad_norm": 3.8691280985179755, + "learning_rate": 7.74955786689254e-07, + "loss": 0.3987, + "step": 5257 + }, + { + "epoch": 0.34, + "grad_norm": 0.7048945497271688, + "learning_rate": 7.7486952408356e-07, + "loss": 0.1943, + "step": 5258 + }, + { + "epoch": 0.34, + "grad_norm": 5.26554239065647, + "learning_rate": 7.747832497513795e-07, + "loss": 0.2652, + "step": 5259 + }, + { + "epoch": 0.34, + "grad_norm": 0.43491317356013665, + "learning_rate": 7.746969636963933e-07, + "loss": 0.2595, + "step": 5260 + }, + { + "epoch": 0.34, + "grad_norm": 1.6478578418172165, + "learning_rate": 7.746106659222823e-07, + "loss": 0.25, + "step": 5261 + }, + { + "epoch": 0.34, + "grad_norm": 2.1504782887633436, + "learning_rate": 7.745243564327283e-07, + "loss": 0.1144, + "step": 5262 + }, + { + "epoch": 0.34, + "grad_norm": 4.101033691732244, + "learning_rate": 7.744380352314134e-07, + "loss": 0.0991, + "step": 5263 + }, + { + "epoch": 0.34, + "grad_norm": 0.8055943990726835, + "learning_rate": 7.743517023220203e-07, + "loss": 0.3228, + "step": 5264 + }, + { + "epoch": 0.34, + "grad_norm": 0.7353106485944705, + "learning_rate": 7.742653577082318e-07, + "loss": 0.1819, + "step": 5265 + }, + { + "epoch": 0.34, + "grad_norm": 1.0442489787901628, + "learning_rate": 7.741790013937321e-07, + "loss": 0.3745, + "step": 5266 + }, + { + "epoch": 0.34, + "grad_norm": 2.4147080737370854, + "learning_rate": 7.740926333822049e-07, + "loss": 0.298, + "step": 5267 + }, + { + "epoch": 0.34, + "grad_norm": 0.4203104667063722, + "learning_rate": 7.74006253677335e-07, + "loss": 0.2799, + "step": 5268 + }, + { + "epoch": 0.34, + "grad_norm": 0.5690448686724064, + "learning_rate": 7.739198622828073e-07, + "loss": 0.0067, + "step": 5269 + }, + { + "epoch": 0.34, + "grad_norm": 0.37927647647532053, + "learning_rate": 7.738334592023079e-07, + "loss": 0.2907, + "step": 5270 + }, + { + "epoch": 0.34, + "grad_norm": 1.9868200960221822, + "learning_rate": 7.737470444395226e-07, + "loss": 0.3532, + "step": 5271 + }, + { + "epoch": 0.34, + "grad_norm": 2.577393204628036, + "learning_rate": 7.73660617998138e-07, + "loss": 0.3059, + "step": 5272 + }, + { + "epoch": 0.34, + "grad_norm": 1.33371205203811, + "learning_rate": 7.735741798818414e-07, + "loss": 0.1558, + "step": 5273 + }, + { + "epoch": 0.34, + "grad_norm": 0.547719657318448, + "learning_rate": 7.734877300943202e-07, + "loss": 0.1946, + "step": 5274 + }, + { + "epoch": 0.34, + "grad_norm": 0.9383937743435579, + "learning_rate": 7.734012686392628e-07, + "loss": 0.1982, + "step": 5275 + }, + { + "epoch": 0.34, + "grad_norm": 0.9772115285261509, + "learning_rate": 7.733147955203576e-07, + "loss": 0.1183, + "step": 5276 + }, + { + "epoch": 0.34, + "grad_norm": 0.6408425941334519, + "learning_rate": 7.732283107412938e-07, + "loss": 0.2941, + "step": 5277 + }, + { + "epoch": 0.34, + "grad_norm": 3.2603429994239863, + "learning_rate": 7.731418143057611e-07, + "loss": 0.2918, + "step": 5278 + }, + { + "epoch": 0.34, + "grad_norm": 1.190932591688707, + "learning_rate": 7.730553062174494e-07, + "loss": 0.4235, + "step": 5279 + }, + { + "epoch": 0.34, + "grad_norm": 0.9899504115425867, + "learning_rate": 7.729687864800494e-07, + "loss": 0.0565, + "step": 5280 + }, + { + "epoch": 0.34, + "grad_norm": 1.0326428444784788, + "learning_rate": 7.728822550972522e-07, + "loss": 0.016, + "step": 5281 + }, + { + "epoch": 0.34, + "grad_norm": 3.7889003525642555, + "learning_rate": 7.727957120727495e-07, + "loss": 0.0446, + "step": 5282 + }, + { + "epoch": 0.34, + "grad_norm": 0.4504325124157612, + "learning_rate": 7.727091574102334e-07, + "loss": 0.1024, + "step": 5283 + }, + { + "epoch": 0.34, + "grad_norm": 0.5993252725635362, + "learning_rate": 7.726225911133965e-07, + "loss": 0.2273, + "step": 5284 + }, + { + "epoch": 0.34, + "grad_norm": 0.6508119301918696, + "learning_rate": 7.725360131859317e-07, + "loss": 0.0937, + "step": 5285 + }, + { + "epoch": 0.34, + "grad_norm": 1.7281171500449641, + "learning_rate": 7.724494236315327e-07, + "loss": 0.3123, + "step": 5286 + }, + { + "epoch": 0.34, + "grad_norm": 0.7887649925558102, + "learning_rate": 7.723628224538937e-07, + "loss": 0.0142, + "step": 5287 + }, + { + "epoch": 0.34, + "grad_norm": 0.3721712379544822, + "learning_rate": 7.722762096567089e-07, + "loss": 0.1807, + "step": 5288 + }, + { + "epoch": 0.34, + "grad_norm": 0.570330342971187, + "learning_rate": 7.721895852436739e-07, + "loss": 0.1508, + "step": 5289 + }, + { + "epoch": 0.34, + "grad_norm": 1.017985400053795, + "learning_rate": 7.72102949218484e-07, + "loss": 0.1733, + "step": 5290 + }, + { + "epoch": 0.34, + "grad_norm": 1.6570959670651932, + "learning_rate": 7.720163015848352e-07, + "loss": 0.1342, + "step": 5291 + }, + { + "epoch": 0.34, + "grad_norm": 0.31935624622912157, + "learning_rate": 7.719296423464243e-07, + "loss": 0.0046, + "step": 5292 + }, + { + "epoch": 0.34, + "grad_norm": 1.02227589469212, + "learning_rate": 7.718429715069481e-07, + "loss": 0.2811, + "step": 5293 + }, + { + "epoch": 0.34, + "grad_norm": 1.0304873467200701, + "learning_rate": 7.717562890701043e-07, + "loss": 0.278, + "step": 5294 + }, + { + "epoch": 0.34, + "grad_norm": 2.0786198227661057, + "learning_rate": 7.716695950395908e-07, + "loss": 0.2424, + "step": 5295 + }, + { + "epoch": 0.34, + "grad_norm": 1.161837307128368, + "learning_rate": 7.715828894191063e-07, + "loss": 0.1475, + "step": 5296 + }, + { + "epoch": 0.34, + "grad_norm": 2.0594379087178116, + "learning_rate": 7.714961722123498e-07, + "loss": 0.0915, + "step": 5297 + }, + { + "epoch": 0.34, + "grad_norm": 1.613491070317713, + "learning_rate": 7.71409443423021e-07, + "loss": 0.1684, + "step": 5298 + }, + { + "epoch": 0.34, + "grad_norm": 0.48638051637562585, + "learning_rate": 7.713227030548195e-07, + "loss": 0.1002, + "step": 5299 + }, + { + "epoch": 0.34, + "grad_norm": 1.9718802165761276, + "learning_rate": 7.712359511114461e-07, + "loss": 0.2009, + "step": 5300 + }, + { + "epoch": 0.34, + "grad_norm": 1.755756688591537, + "learning_rate": 7.711491875966019e-07, + "loss": 0.2435, + "step": 5301 + }, + { + "epoch": 0.34, + "grad_norm": 0.4700821926865363, + "learning_rate": 7.710624125139882e-07, + "loss": 0.1341, + "step": 5302 + }, + { + "epoch": 0.34, + "grad_norm": 0.7815767874893984, + "learning_rate": 7.70975625867307e-07, + "loss": 0.2079, + "step": 5303 + }, + { + "epoch": 0.34, + "grad_norm": 0.41626744975865426, + "learning_rate": 7.708888276602609e-07, + "loss": 0.2388, + "step": 5304 + }, + { + "epoch": 0.34, + "grad_norm": 1.498746829276991, + "learning_rate": 7.70802017896553e-07, + "loss": 0.2425, + "step": 5305 + }, + { + "epoch": 0.34, + "grad_norm": 0.38162158973715743, + "learning_rate": 7.707151965798866e-07, + "loss": 0.2501, + "step": 5306 + }, + { + "epoch": 0.34, + "grad_norm": 1.016607126681126, + "learning_rate": 7.706283637139657e-07, + "loss": 0.2299, + "step": 5307 + }, + { + "epoch": 0.34, + "grad_norm": 0.7391466955217589, + "learning_rate": 7.705415193024947e-07, + "loss": 0.1713, + "step": 5308 + }, + { + "epoch": 0.34, + "grad_norm": 4.020352974328961, + "learning_rate": 7.704546633491787e-07, + "loss": 0.0339, + "step": 5309 + }, + { + "epoch": 0.34, + "grad_norm": 0.6292426340417446, + "learning_rate": 7.703677958577231e-07, + "loss": 0.217, + "step": 5310 + }, + { + "epoch": 0.34, + "grad_norm": 4.829214248599567, + "learning_rate": 7.702809168318337e-07, + "loss": 0.1083, + "step": 5311 + }, + { + "epoch": 0.34, + "grad_norm": 0.5662842095404212, + "learning_rate": 7.701940262752171e-07, + "loss": 0.2041, + "step": 5312 + }, + { + "epoch": 0.34, + "grad_norm": 0.650322089205414, + "learning_rate": 7.701071241915802e-07, + "loss": 0.2903, + "step": 5313 + }, + { + "epoch": 0.34, + "grad_norm": 0.5903015047276998, + "learning_rate": 7.700202105846303e-07, + "loss": 0.3205, + "step": 5314 + }, + { + "epoch": 0.34, + "grad_norm": 2.0010473667544635, + "learning_rate": 7.699332854580756e-07, + "loss": 0.1309, + "step": 5315 + }, + { + "epoch": 0.34, + "grad_norm": 0.267577849455619, + "learning_rate": 7.698463488156241e-07, + "loss": 0.1097, + "step": 5316 + }, + { + "epoch": 0.34, + "grad_norm": 0.8435166915977652, + "learning_rate": 7.69759400660985e-07, + "loss": 0.0634, + "step": 5317 + }, + { + "epoch": 0.34, + "grad_norm": 1.0060092200291906, + "learning_rate": 7.696724409978677e-07, + "loss": 0.1808, + "step": 5318 + }, + { + "epoch": 0.34, + "grad_norm": 0.6870243220663309, + "learning_rate": 7.695854698299819e-07, + "loss": 0.0972, + "step": 5319 + }, + { + "epoch": 0.34, + "grad_norm": 0.5387727882044846, + "learning_rate": 7.694984871610379e-07, + "loss": 0.349, + "step": 5320 + }, + { + "epoch": 0.34, + "grad_norm": 5.325458812570029, + "learning_rate": 7.694114929947469e-07, + "loss": 0.1727, + "step": 5321 + }, + { + "epoch": 0.34, + "grad_norm": 0.5208017490544538, + "learning_rate": 7.693244873348197e-07, + "loss": 0.1576, + "step": 5322 + }, + { + "epoch": 0.34, + "grad_norm": 0.49418080354995575, + "learning_rate": 7.692374701849687e-07, + "loss": 0.0074, + "step": 5323 + }, + { + "epoch": 0.34, + "grad_norm": 0.7438736074259235, + "learning_rate": 7.691504415489058e-07, + "loss": 0.2522, + "step": 5324 + }, + { + "epoch": 0.34, + "grad_norm": 1.271699322155713, + "learning_rate": 7.690634014303441e-07, + "loss": 0.3671, + "step": 5325 + }, + { + "epoch": 0.34, + "grad_norm": 0.8152545212787211, + "learning_rate": 7.689763498329969e-07, + "loss": 0.2574, + "step": 5326 + }, + { + "epoch": 0.34, + "grad_norm": 0.9140083684797955, + "learning_rate": 7.688892867605778e-07, + "loss": 0.1236, + "step": 5327 + }, + { + "epoch": 0.34, + "grad_norm": 0.6380556696314743, + "learning_rate": 7.688022122168012e-07, + "loss": 0.319, + "step": 5328 + }, + { + "epoch": 0.34, + "grad_norm": 0.6314997933841358, + "learning_rate": 7.68715126205382e-07, + "loss": 0.3729, + "step": 5329 + }, + { + "epoch": 0.34, + "grad_norm": 0.8182762182779711, + "learning_rate": 7.686280287300352e-07, + "loss": 0.0136, + "step": 5330 + }, + { + "epoch": 0.34, + "grad_norm": 0.3156776685054188, + "learning_rate": 7.685409197944768e-07, + "loss": 0.0872, + "step": 5331 + }, + { + "epoch": 0.34, + "grad_norm": 0.831457989118222, + "learning_rate": 7.684537994024228e-07, + "loss": 0.1813, + "step": 5332 + }, + { + "epoch": 0.34, + "grad_norm": 0.7341932180942895, + "learning_rate": 7.683666675575901e-07, + "loss": 0.1579, + "step": 5333 + }, + { + "epoch": 0.34, + "grad_norm": 0.6616775004102179, + "learning_rate": 7.682795242636958e-07, + "loss": 0.1285, + "step": 5334 + }, + { + "epoch": 0.34, + "grad_norm": 8.092916168305884, + "learning_rate": 7.681923695244578e-07, + "loss": 0.0534, + "step": 5335 + }, + { + "epoch": 0.34, + "grad_norm": 3.1168417053413657, + "learning_rate": 7.681052033435942e-07, + "loss": 0.2868, + "step": 5336 + }, + { + "epoch": 0.34, + "grad_norm": 10.866520975312506, + "learning_rate": 7.680180257248235e-07, + "loss": 0.2316, + "step": 5337 + }, + { + "epoch": 0.34, + "grad_norm": 1.3393368998848683, + "learning_rate": 7.679308366718652e-07, + "loss": 0.1601, + "step": 5338 + }, + { + "epoch": 0.34, + "grad_norm": 0.427738334861401, + "learning_rate": 7.678436361884388e-07, + "loss": 0.2265, + "step": 5339 + }, + { + "epoch": 0.34, + "grad_norm": 2.454088902427455, + "learning_rate": 7.677564242782644e-07, + "loss": 0.2034, + "step": 5340 + }, + { + "epoch": 0.34, + "grad_norm": 0.4295246616964228, + "learning_rate": 7.676692009450626e-07, + "loss": 0.0805, + "step": 5341 + }, + { + "epoch": 0.34, + "grad_norm": 1.056853686692961, + "learning_rate": 7.675819661925547e-07, + "loss": 0.3044, + "step": 5342 + }, + { + "epoch": 0.34, + "grad_norm": 0.9531499999715173, + "learning_rate": 7.674947200244622e-07, + "loss": 0.1795, + "step": 5343 + }, + { + "epoch": 0.34, + "grad_norm": 4.417559042119717, + "learning_rate": 7.67407462444507e-07, + "loss": 0.1089, + "step": 5344 + }, + { + "epoch": 0.34, + "grad_norm": 1.6925665703216968, + "learning_rate": 7.673201934564122e-07, + "loss": 0.1718, + "step": 5345 + }, + { + "epoch": 0.34, + "grad_norm": 1.2011792930434808, + "learning_rate": 7.672329130639005e-07, + "loss": 0.3044, + "step": 5346 + }, + { + "epoch": 0.34, + "grad_norm": 0.5730655337707138, + "learning_rate": 7.671456212706956e-07, + "loss": 0.1644, + "step": 5347 + }, + { + "epoch": 0.34, + "grad_norm": 1.3752020337562845, + "learning_rate": 7.670583180805213e-07, + "loss": 0.1373, + "step": 5348 + }, + { + "epoch": 0.34, + "grad_norm": 0.6841256839270571, + "learning_rate": 7.669710034971024e-07, + "loss": 0.2897, + "step": 5349 + }, + { + "epoch": 0.34, + "grad_norm": 0.4700650050761052, + "learning_rate": 7.668836775241638e-07, + "loss": 0.0782, + "step": 5350 + }, + { + "epoch": 0.34, + "grad_norm": 3.6904821541206747, + "learning_rate": 7.667963401654308e-07, + "loss": 0.3029, + "step": 5351 + }, + { + "epoch": 0.34, + "grad_norm": 0.25120294168718676, + "learning_rate": 7.667089914246299e-07, + "loss": 0.1555, + "step": 5352 + }, + { + "epoch": 0.34, + "grad_norm": 0.77480443285652, + "learning_rate": 7.666216313054871e-07, + "loss": 0.2512, + "step": 5353 + }, + { + "epoch": 0.34, + "grad_norm": 0.9440982341360855, + "learning_rate": 7.665342598117296e-07, + "loss": 0.4071, + "step": 5354 + }, + { + "epoch": 0.34, + "grad_norm": 0.5085383472991699, + "learning_rate": 7.664468769470847e-07, + "loss": 0.3125, + "step": 5355 + }, + { + "epoch": 0.34, + "grad_norm": 0.48361497281116317, + "learning_rate": 7.663594827152805e-07, + "loss": 0.1925, + "step": 5356 + }, + { + "epoch": 0.34, + "grad_norm": 0.3529085530710126, + "learning_rate": 7.662720771200452e-07, + "loss": 0.2793, + "step": 5357 + }, + { + "epoch": 0.34, + "grad_norm": 0.5916127837381652, + "learning_rate": 7.66184660165108e-07, + "loss": 0.1994, + "step": 5358 + }, + { + "epoch": 0.34, + "grad_norm": 0.4505598947898049, + "learning_rate": 7.660972318541981e-07, + "loss": 0.1177, + "step": 5359 + }, + { + "epoch": 0.34, + "grad_norm": 0.1316654222220077, + "learning_rate": 7.660097921910451e-07, + "loss": 0.002, + "step": 5360 + }, + { + "epoch": 0.34, + "grad_norm": 0.4002509006801685, + "learning_rate": 7.659223411793799e-07, + "loss": 0.1185, + "step": 5361 + }, + { + "epoch": 0.34, + "grad_norm": 0.40040102302262925, + "learning_rate": 7.658348788229329e-07, + "loss": 0.2263, + "step": 5362 + }, + { + "epoch": 0.34, + "grad_norm": 15.19910629247316, + "learning_rate": 7.657474051254356e-07, + "loss": 0.3614, + "step": 5363 + }, + { + "epoch": 0.34, + "grad_norm": 1.2013994840857332, + "learning_rate": 7.656599200906197e-07, + "loss": 0.1998, + "step": 5364 + }, + { + "epoch": 0.34, + "grad_norm": 1.6985400370745665, + "learning_rate": 7.655724237222177e-07, + "loss": 0.1717, + "step": 5365 + }, + { + "epoch": 0.34, + "grad_norm": 0.8095266896357952, + "learning_rate": 7.654849160239623e-07, + "loss": 0.216, + "step": 5366 + }, + { + "epoch": 0.34, + "grad_norm": 0.264126168233839, + "learning_rate": 7.653973969995865e-07, + "loss": 0.195, + "step": 5367 + }, + { + "epoch": 0.34, + "grad_norm": 1.1730880943862239, + "learning_rate": 7.653098666528244e-07, + "loss": 0.2856, + "step": 5368 + }, + { + "epoch": 0.34, + "grad_norm": 6.68317277219934, + "learning_rate": 7.652223249874098e-07, + "loss": 0.2053, + "step": 5369 + }, + { + "epoch": 0.34, + "grad_norm": 0.44636271056922516, + "learning_rate": 7.651347720070777e-07, + "loss": 0.1098, + "step": 5370 + }, + { + "epoch": 0.34, + "grad_norm": 0.35610538276850995, + "learning_rate": 7.650472077155634e-07, + "loss": 0.2497, + "step": 5371 + }, + { + "epoch": 0.34, + "grad_norm": 0.6266563931129285, + "learning_rate": 7.649596321166024e-07, + "loss": 0.3499, + "step": 5372 + }, + { + "epoch": 0.34, + "grad_norm": 0.16752537143108334, + "learning_rate": 7.648720452139308e-07, + "loss": 0.0047, + "step": 5373 + }, + { + "epoch": 0.34, + "grad_norm": 2.633535854388574, + "learning_rate": 7.647844470112854e-07, + "loss": 0.2081, + "step": 5374 + }, + { + "epoch": 0.34, + "grad_norm": 0.6073753413259145, + "learning_rate": 7.646968375124032e-07, + "loss": 0.297, + "step": 5375 + }, + { + "epoch": 0.34, + "grad_norm": 0.899192877962476, + "learning_rate": 7.646092167210216e-07, + "loss": 0.1677, + "step": 5376 + }, + { + "epoch": 0.34, + "grad_norm": 0.4810626287457248, + "learning_rate": 7.64521584640879e-07, + "loss": 0.2114, + "step": 5377 + }, + { + "epoch": 0.34, + "grad_norm": 6.842217120586512, + "learning_rate": 7.644339412757138e-07, + "loss": 0.2486, + "step": 5378 + }, + { + "epoch": 0.34, + "grad_norm": 3.236412666233146, + "learning_rate": 7.643462866292651e-07, + "loss": 0.1284, + "step": 5379 + }, + { + "epoch": 0.34, + "grad_norm": 0.465679153010764, + "learning_rate": 7.642586207052726e-07, + "loss": 0.0819, + "step": 5380 + }, + { + "epoch": 0.34, + "grad_norm": 0.5932907918056111, + "learning_rate": 7.641709435074759e-07, + "loss": 0.025, + "step": 5381 + }, + { + "epoch": 0.34, + "grad_norm": 1.8587739825305312, + "learning_rate": 7.640832550396157e-07, + "loss": 0.1432, + "step": 5382 + }, + { + "epoch": 0.34, + "grad_norm": 0.7548112413610587, + "learning_rate": 7.639955553054331e-07, + "loss": 0.2997, + "step": 5383 + }, + { + "epoch": 0.34, + "grad_norm": 1.3740770436734397, + "learning_rate": 7.639078443086693e-07, + "loss": 0.0577, + "step": 5384 + }, + { + "epoch": 0.34, + "grad_norm": 0.6269091369820854, + "learning_rate": 7.638201220530663e-07, + "loss": 0.219, + "step": 5385 + }, + { + "epoch": 0.34, + "grad_norm": 0.35717292554144603, + "learning_rate": 7.637323885423667e-07, + "loss": 0.1343, + "step": 5386 + }, + { + "epoch": 0.34, + "grad_norm": 2.3400871097740708, + "learning_rate": 7.63644643780313e-07, + "loss": 0.3339, + "step": 5387 + }, + { + "epoch": 0.34, + "grad_norm": 0.8314382332385768, + "learning_rate": 7.635568877706491e-07, + "loss": 0.1248, + "step": 5388 + }, + { + "epoch": 0.34, + "grad_norm": 0.7448008465431624, + "learning_rate": 7.634691205171185e-07, + "loss": 0.2172, + "step": 5389 + }, + { + "epoch": 0.34, + "grad_norm": 1.3429403669183726, + "learning_rate": 7.633813420234654e-07, + "loss": 0.1637, + "step": 5390 + }, + { + "epoch": 0.34, + "grad_norm": 0.5481036834158419, + "learning_rate": 7.632935522934349e-07, + "loss": 0.3444, + "step": 5391 + }, + { + "epoch": 0.34, + "grad_norm": 0.26179972085160513, + "learning_rate": 7.632057513307721e-07, + "loss": 0.0127, + "step": 5392 + }, + { + "epoch": 0.34, + "grad_norm": 0.7934907645401681, + "learning_rate": 7.63117939139223e-07, + "loss": 0.3575, + "step": 5393 + }, + { + "epoch": 0.34, + "grad_norm": 0.4847344941338364, + "learning_rate": 7.630301157225335e-07, + "loss": 0.0863, + "step": 5394 + }, + { + "epoch": 0.34, + "grad_norm": 0.5144340151696197, + "learning_rate": 7.629422810844506e-07, + "loss": 0.058, + "step": 5395 + }, + { + "epoch": 0.34, + "grad_norm": 0.5197423783363038, + "learning_rate": 7.628544352287213e-07, + "loss": 0.1952, + "step": 5396 + }, + { + "epoch": 0.34, + "grad_norm": 0.9357355154274088, + "learning_rate": 7.627665781590936e-07, + "loss": 0.0964, + "step": 5397 + }, + { + "epoch": 0.34, + "grad_norm": 0.7982943321960051, + "learning_rate": 7.626787098793153e-07, + "loss": 0.2457, + "step": 5398 + }, + { + "epoch": 0.34, + "grad_norm": 0.7873596990257173, + "learning_rate": 7.625908303931352e-07, + "loss": 0.1931, + "step": 5399 + }, + { + "epoch": 0.34, + "grad_norm": 0.03574043489678248, + "learning_rate": 7.625029397043024e-07, + "loss": 0.0002, + "step": 5400 + }, + { + "epoch": 0.34, + "grad_norm": 1.3735738747832986, + "learning_rate": 7.624150378165665e-07, + "loss": 0.3125, + "step": 5401 + }, + { + "epoch": 0.34, + "grad_norm": 0.6538545634079106, + "learning_rate": 7.623271247336776e-07, + "loss": 0.1629, + "step": 5402 + }, + { + "epoch": 0.34, + "grad_norm": 1.2481701307765698, + "learning_rate": 7.622392004593861e-07, + "loss": 0.2579, + "step": 5403 + }, + { + "epoch": 0.34, + "grad_norm": 1.1906190333031517, + "learning_rate": 7.621512649974434e-07, + "loss": 0.3404, + "step": 5404 + }, + { + "epoch": 0.34, + "grad_norm": 0.7951479589285008, + "learning_rate": 7.620633183516004e-07, + "loss": 0.2589, + "step": 5405 + }, + { + "epoch": 0.34, + "grad_norm": 0.6350964950267514, + "learning_rate": 7.619753605256096e-07, + "loss": 0.1353, + "step": 5406 + }, + { + "epoch": 0.34, + "grad_norm": 0.14833055215320196, + "learning_rate": 7.618873915232233e-07, + "loss": 0.0045, + "step": 5407 + }, + { + "epoch": 0.34, + "grad_norm": 0.7273594822670096, + "learning_rate": 7.617994113481944e-07, + "loss": 0.2311, + "step": 5408 + }, + { + "epoch": 0.34, + "grad_norm": 0.3214943997490827, + "learning_rate": 7.617114200042764e-07, + "loss": 0.131, + "step": 5409 + }, + { + "epoch": 0.35, + "grad_norm": 0.7704313806013201, + "learning_rate": 7.61623417495223e-07, + "loss": 0.1477, + "step": 5410 + }, + { + "epoch": 0.35, + "grad_norm": 0.48631190898778454, + "learning_rate": 7.615354038247887e-07, + "loss": 0.0818, + "step": 5411 + }, + { + "epoch": 0.35, + "grad_norm": 0.6814791525704665, + "learning_rate": 7.614473789967284e-07, + "loss": 0.3285, + "step": 5412 + }, + { + "epoch": 0.35, + "grad_norm": 0.5000201149916452, + "learning_rate": 7.613593430147973e-07, + "loss": 0.3307, + "step": 5413 + }, + { + "epoch": 0.35, + "grad_norm": 6.086899312102441, + "learning_rate": 7.612712958827511e-07, + "loss": 0.2334, + "step": 5414 + }, + { + "epoch": 0.35, + "grad_norm": 0.43445168035263737, + "learning_rate": 7.611832376043464e-07, + "loss": 0.1203, + "step": 5415 + }, + { + "epoch": 0.35, + "grad_norm": 0.5063496868375928, + "learning_rate": 7.610951681833397e-07, + "loss": 0.1298, + "step": 5416 + }, + { + "epoch": 0.35, + "grad_norm": 2.1891589968534975, + "learning_rate": 7.610070876234882e-07, + "loss": 0.1901, + "step": 5417 + }, + { + "epoch": 0.35, + "grad_norm": 0.5705667873301133, + "learning_rate": 7.609189959285497e-07, + "loss": 0.0862, + "step": 5418 + }, + { + "epoch": 0.35, + "grad_norm": 2.165052832144716, + "learning_rate": 7.608308931022822e-07, + "loss": 0.2605, + "step": 5419 + }, + { + "epoch": 0.35, + "grad_norm": 3.0442779890428175, + "learning_rate": 7.607427791484447e-07, + "loss": 0.1809, + "step": 5420 + }, + { + "epoch": 0.35, + "grad_norm": 0.4496070838672782, + "learning_rate": 7.606546540707959e-07, + "loss": 0.1794, + "step": 5421 + }, + { + "epoch": 0.35, + "grad_norm": 0.6030649034819161, + "learning_rate": 7.605665178730956e-07, + "loss": 0.1341, + "step": 5422 + }, + { + "epoch": 0.35, + "grad_norm": 0.2806532213380455, + "learning_rate": 7.604783705591039e-07, + "loss": 0.1071, + "step": 5423 + }, + { + "epoch": 0.35, + "grad_norm": 0.747730965098597, + "learning_rate": 7.603902121325811e-07, + "loss": 0.262, + "step": 5424 + }, + { + "epoch": 0.35, + "grad_norm": 0.26019060436072894, + "learning_rate": 7.603020425972886e-07, + "loss": 0.119, + "step": 5425 + }, + { + "epoch": 0.35, + "grad_norm": 0.6580863305263909, + "learning_rate": 7.602138619569876e-07, + "loss": 0.0574, + "step": 5426 + }, + { + "epoch": 0.35, + "grad_norm": 0.586572500302497, + "learning_rate": 7.601256702154402e-07, + "loss": 0.1988, + "step": 5427 + }, + { + "epoch": 0.35, + "grad_norm": 0.9429699165236303, + "learning_rate": 7.600374673764087e-07, + "loss": 0.1048, + "step": 5428 + }, + { + "epoch": 0.35, + "grad_norm": 0.6030129263368172, + "learning_rate": 7.599492534436562e-07, + "loss": 0.1095, + "step": 5429 + }, + { + "epoch": 0.35, + "grad_norm": 0.3970639490193903, + "learning_rate": 7.598610284209459e-07, + "loss": 0.123, + "step": 5430 + }, + { + "epoch": 0.35, + "grad_norm": 0.8875642730833075, + "learning_rate": 7.597727923120419e-07, + "loss": 0.3333, + "step": 5431 + }, + { + "epoch": 0.35, + "grad_norm": 0.7935667196936881, + "learning_rate": 7.596845451207081e-07, + "loss": 0.1299, + "step": 5432 + }, + { + "epoch": 0.35, + "grad_norm": 1.8845982663534275, + "learning_rate": 7.595962868507098e-07, + "loss": 0.0206, + "step": 5433 + }, + { + "epoch": 0.35, + "grad_norm": 0.6958167365070135, + "learning_rate": 7.595080175058119e-07, + "loss": 0.4635, + "step": 5434 + }, + { + "epoch": 0.35, + "grad_norm": 0.8434032033922428, + "learning_rate": 7.594197370897806e-07, + "loss": 0.138, + "step": 5435 + }, + { + "epoch": 0.35, + "grad_norm": 0.2340305106813205, + "learning_rate": 7.593314456063815e-07, + "loss": 0.1236, + "step": 5436 + }, + { + "epoch": 0.35, + "grad_norm": 0.3510436566896165, + "learning_rate": 7.592431430593818e-07, + "loss": 0.2035, + "step": 5437 + }, + { + "epoch": 0.35, + "grad_norm": 1.113828575830915, + "learning_rate": 7.591548294525482e-07, + "loss": 0.2367, + "step": 5438 + }, + { + "epoch": 0.35, + "grad_norm": 0.6365103004291444, + "learning_rate": 7.590665047896489e-07, + "loss": 0.0133, + "step": 5439 + }, + { + "epoch": 0.35, + "grad_norm": 3.5961683705584258, + "learning_rate": 7.589781690744515e-07, + "loss": 0.0773, + "step": 5440 + }, + { + "epoch": 0.35, + "grad_norm": 0.7132547038521998, + "learning_rate": 7.588898223107249e-07, + "loss": 0.1689, + "step": 5441 + }, + { + "epoch": 0.35, + "grad_norm": 0.4314675017187079, + "learning_rate": 7.588014645022381e-07, + "loss": 0.109, + "step": 5442 + }, + { + "epoch": 0.35, + "grad_norm": 5.498069862488393, + "learning_rate": 7.587130956527605e-07, + "loss": 0.3384, + "step": 5443 + }, + { + "epoch": 0.35, + "grad_norm": 0.4025239974395892, + "learning_rate": 7.586247157660623e-07, + "loss": 0.0816, + "step": 5444 + }, + { + "epoch": 0.35, + "grad_norm": 2.006154927034557, + "learning_rate": 7.585363248459138e-07, + "loss": 0.0924, + "step": 5445 + }, + { + "epoch": 0.35, + "grad_norm": 0.5946390055624198, + "learning_rate": 7.584479228960858e-07, + "loss": 0.4267, + "step": 5446 + }, + { + "epoch": 0.35, + "grad_norm": 2.1102941873243526, + "learning_rate": 7.583595099203499e-07, + "loss": 0.1461, + "step": 5447 + }, + { + "epoch": 0.35, + "grad_norm": 0.8735929599987753, + "learning_rate": 7.582710859224779e-07, + "loss": 0.1238, + "step": 5448 + }, + { + "epoch": 0.35, + "grad_norm": 2.5119734516248617, + "learning_rate": 7.581826509062422e-07, + "loss": 0.0727, + "step": 5449 + }, + { + "epoch": 0.35, + "grad_norm": 0.5092960882392609, + "learning_rate": 7.580942048754158e-07, + "loss": 0.218, + "step": 5450 + }, + { + "epoch": 0.35, + "grad_norm": 0.48091907921323257, + "learning_rate": 7.580057478337716e-07, + "loss": 0.135, + "step": 5451 + }, + { + "epoch": 0.35, + "grad_norm": 2.6804820882221856, + "learning_rate": 7.579172797850835e-07, + "loss": 0.1032, + "step": 5452 + }, + { + "epoch": 0.35, + "grad_norm": 0.5761016630247517, + "learning_rate": 7.578288007331259e-07, + "loss": 0.2332, + "step": 5453 + }, + { + "epoch": 0.35, + "grad_norm": 2.148788013242087, + "learning_rate": 7.577403106816733e-07, + "loss": 0.3384, + "step": 5454 + }, + { + "epoch": 0.35, + "grad_norm": 1.577660597165397, + "learning_rate": 7.576518096345008e-07, + "loss": 0.2448, + "step": 5455 + }, + { + "epoch": 0.35, + "grad_norm": 1.1541021511978142, + "learning_rate": 7.575632975953844e-07, + "loss": 0.1887, + "step": 5456 + }, + { + "epoch": 0.35, + "grad_norm": 0.9556198088872199, + "learning_rate": 7.574747745680998e-07, + "loss": 0.4103, + "step": 5457 + }, + { + "epoch": 0.35, + "grad_norm": 6.011746950565354, + "learning_rate": 7.573862405564238e-07, + "loss": 0.235, + "step": 5458 + }, + { + "epoch": 0.35, + "grad_norm": 4.867669886562826, + "learning_rate": 7.572976955641333e-07, + "loss": 0.0455, + "step": 5459 + }, + { + "epoch": 0.35, + "grad_norm": 1.1562166733665182, + "learning_rate": 7.57209139595006e-07, + "loss": 0.2442, + "step": 5460 + }, + { + "epoch": 0.35, + "grad_norm": 0.9807165096759748, + "learning_rate": 7.571205726528196e-07, + "loss": 0.2519, + "step": 5461 + }, + { + "epoch": 0.35, + "grad_norm": 0.19813535693980622, + "learning_rate": 7.570319947413528e-07, + "loss": 0.0902, + "step": 5462 + }, + { + "epoch": 0.35, + "grad_norm": 0.33052386627680735, + "learning_rate": 7.569434058643843e-07, + "loss": 0.1445, + "step": 5463 + }, + { + "epoch": 0.35, + "grad_norm": 1.195067167890204, + "learning_rate": 7.568548060256937e-07, + "loss": 0.1691, + "step": 5464 + }, + { + "epoch": 0.35, + "grad_norm": 0.7083560570882403, + "learning_rate": 7.567661952290607e-07, + "loss": 0.1655, + "step": 5465 + }, + { + "epoch": 0.35, + "grad_norm": 0.6414384746879106, + "learning_rate": 7.566775734782656e-07, + "loss": 0.2678, + "step": 5466 + }, + { + "epoch": 0.35, + "grad_norm": 6.237016991388835, + "learning_rate": 7.565889407770891e-07, + "loss": 0.1284, + "step": 5467 + }, + { + "epoch": 0.35, + "grad_norm": 1.1814716207331748, + "learning_rate": 7.565002971293127e-07, + "loss": 0.3381, + "step": 5468 + }, + { + "epoch": 0.35, + "grad_norm": 1.222609216444924, + "learning_rate": 7.564116425387181e-07, + "loss": 0.1326, + "step": 5469 + }, + { + "epoch": 0.35, + "grad_norm": 0.3834778682885506, + "learning_rate": 7.563229770090873e-07, + "loss": 0.4125, + "step": 5470 + }, + { + "epoch": 0.35, + "grad_norm": 0.8188635420752655, + "learning_rate": 7.562343005442031e-07, + "loss": 0.0042, + "step": 5471 + }, + { + "epoch": 0.35, + "grad_norm": 0.3317451274996792, + "learning_rate": 7.561456131478486e-07, + "loss": 0.1304, + "step": 5472 + }, + { + "epoch": 0.35, + "grad_norm": 0.7369737169901256, + "learning_rate": 7.56056914823807e-07, + "loss": 0.2119, + "step": 5473 + }, + { + "epoch": 0.35, + "grad_norm": 1.7048416421697092, + "learning_rate": 7.55968205575863e-07, + "loss": 0.2323, + "step": 5474 + }, + { + "epoch": 0.35, + "grad_norm": 0.8939613133381339, + "learning_rate": 7.558794854078006e-07, + "loss": 0.1165, + "step": 5475 + }, + { + "epoch": 0.35, + "grad_norm": 0.9367642014346794, + "learning_rate": 7.55790754323405e-07, + "loss": 0.2342, + "step": 5476 + }, + { + "epoch": 0.35, + "grad_norm": 0.8430705717483711, + "learning_rate": 7.557020123264615e-07, + "loss": 0.2196, + "step": 5477 + }, + { + "epoch": 0.35, + "grad_norm": 0.5027771057273157, + "learning_rate": 7.556132594207564e-07, + "loss": 0.0873, + "step": 5478 + }, + { + "epoch": 0.35, + "grad_norm": 0.45055304957519526, + "learning_rate": 7.555244956100757e-07, + "loss": 0.1712, + "step": 5479 + }, + { + "epoch": 0.35, + "grad_norm": 0.5384406883428974, + "learning_rate": 7.554357208982063e-07, + "loss": 0.2173, + "step": 5480 + }, + { + "epoch": 0.35, + "grad_norm": 0.5123732613510469, + "learning_rate": 7.553469352889355e-07, + "loss": 0.2544, + "step": 5481 + }, + { + "epoch": 0.35, + "grad_norm": 0.6622802529235681, + "learning_rate": 7.552581387860513e-07, + "loss": 0.172, + "step": 5482 + }, + { + "epoch": 0.35, + "grad_norm": 1.246609448365193, + "learning_rate": 7.551693313933416e-07, + "loss": 0.3406, + "step": 5483 + }, + { + "epoch": 0.35, + "grad_norm": 0.6176914522877467, + "learning_rate": 7.550805131145954e-07, + "loss": 0.2154, + "step": 5484 + }, + { + "epoch": 0.35, + "grad_norm": 0.5857405908457256, + "learning_rate": 7.549916839536017e-07, + "loss": 0.1204, + "step": 5485 + }, + { + "epoch": 0.35, + "grad_norm": 0.9430934698659614, + "learning_rate": 7.549028439141502e-07, + "loss": 0.1439, + "step": 5486 + }, + { + "epoch": 0.35, + "grad_norm": 0.5069939926317235, + "learning_rate": 7.548139930000308e-07, + "loss": 0.132, + "step": 5487 + }, + { + "epoch": 0.35, + "grad_norm": 0.9469093464280597, + "learning_rate": 7.547251312150344e-07, + "loss": 0.1164, + "step": 5488 + }, + { + "epoch": 0.35, + "grad_norm": 0.38428685088550063, + "learning_rate": 7.546362585629517e-07, + "loss": 0.1245, + "step": 5489 + }, + { + "epoch": 0.35, + "grad_norm": 0.4850082210504356, + "learning_rate": 7.545473750475744e-07, + "loss": 0.0876, + "step": 5490 + }, + { + "epoch": 0.35, + "grad_norm": 0.814698988719398, + "learning_rate": 7.544584806726944e-07, + "loss": 0.2527, + "step": 5491 + }, + { + "epoch": 0.35, + "grad_norm": 0.8193787101128907, + "learning_rate": 7.54369575442104e-07, + "loss": 0.0961, + "step": 5492 + }, + { + "epoch": 0.35, + "grad_norm": 0.6859605299798347, + "learning_rate": 7.542806593595961e-07, + "loss": 0.2168, + "step": 5493 + }, + { + "epoch": 0.35, + "grad_norm": 0.46656504276993166, + "learning_rate": 7.541917324289644e-07, + "loss": 0.0417, + "step": 5494 + }, + { + "epoch": 0.35, + "grad_norm": 0.6236552184627319, + "learning_rate": 7.541027946540022e-07, + "loss": 0.1295, + "step": 5495 + }, + { + "epoch": 0.35, + "grad_norm": 0.8537973831497453, + "learning_rate": 7.540138460385039e-07, + "loss": 0.1608, + "step": 5496 + }, + { + "epoch": 0.35, + "grad_norm": 1.2793862954463233, + "learning_rate": 7.539248865862644e-07, + "loss": 0.1617, + "step": 5497 + }, + { + "epoch": 0.35, + "grad_norm": 0.6398392521187805, + "learning_rate": 7.538359163010789e-07, + "loss": 0.1793, + "step": 5498 + }, + { + "epoch": 0.35, + "grad_norm": 0.4287984261789411, + "learning_rate": 7.537469351867429e-07, + "loss": 0.0101, + "step": 5499 + }, + { + "epoch": 0.35, + "grad_norm": 1.6166844168596135, + "learning_rate": 7.536579432470525e-07, + "loss": 0.3597, + "step": 5500 + }, + { + "epoch": 0.35, + "grad_norm": 0.4182742071772365, + "learning_rate": 7.535689404858041e-07, + "loss": 0.2603, + "step": 5501 + }, + { + "epoch": 0.35, + "grad_norm": 0.3854342258052628, + "learning_rate": 7.534799269067951e-07, + "loss": 0.1638, + "step": 5502 + }, + { + "epoch": 0.35, + "grad_norm": 0.8940115750825707, + "learning_rate": 7.53390902513823e-07, + "loss": 0.2861, + "step": 5503 + }, + { + "epoch": 0.35, + "grad_norm": 4.30640763425162, + "learning_rate": 7.533018673106855e-07, + "loss": 0.2263, + "step": 5504 + }, + { + "epoch": 0.35, + "grad_norm": 0.7424977747555532, + "learning_rate": 7.532128213011813e-07, + "loss": 0.0734, + "step": 5505 + }, + { + "epoch": 0.35, + "grad_norm": 1.7383830668521845, + "learning_rate": 7.531237644891089e-07, + "loss": 0.3017, + "step": 5506 + }, + { + "epoch": 0.35, + "grad_norm": 1.6425962515759647, + "learning_rate": 7.530346968782679e-07, + "loss": 0.1868, + "step": 5507 + }, + { + "epoch": 0.35, + "grad_norm": 0.8891034027526099, + "learning_rate": 7.529456184724582e-07, + "loss": 0.2093, + "step": 5508 + }, + { + "epoch": 0.35, + "grad_norm": 0.6308586312027443, + "learning_rate": 7.528565292754798e-07, + "loss": 0.2022, + "step": 5509 + }, + { + "epoch": 0.35, + "grad_norm": 1.2966416101669163, + "learning_rate": 7.527674292911337e-07, + "loss": 0.346, + "step": 5510 + }, + { + "epoch": 0.35, + "grad_norm": 4.565257841276505, + "learning_rate": 7.526783185232207e-07, + "loss": 0.2415, + "step": 5511 + }, + { + "epoch": 0.35, + "grad_norm": 0.6378618019879686, + "learning_rate": 7.525891969755429e-07, + "loss": 0.2266, + "step": 5512 + }, + { + "epoch": 0.35, + "grad_norm": 0.7370565367478511, + "learning_rate": 7.525000646519022e-07, + "loss": 0.0854, + "step": 5513 + }, + { + "epoch": 0.35, + "grad_norm": 0.13803143539431034, + "learning_rate": 7.52410921556101e-07, + "loss": 0.0438, + "step": 5514 + }, + { + "epoch": 0.35, + "grad_norm": 0.8618364879921652, + "learning_rate": 7.523217676919427e-07, + "loss": 0.3591, + "step": 5515 + }, + { + "epoch": 0.35, + "grad_norm": 1.3538662856414543, + "learning_rate": 7.522326030632303e-07, + "loss": 0.3528, + "step": 5516 + }, + { + "epoch": 0.35, + "grad_norm": 0.4175646798417754, + "learning_rate": 7.521434276737682e-07, + "loss": 0.2288, + "step": 5517 + }, + { + "epoch": 0.35, + "grad_norm": 0.7815260591885742, + "learning_rate": 7.520542415273605e-07, + "loss": 0.2603, + "step": 5518 + }, + { + "epoch": 0.35, + "grad_norm": 2.6129913870949957, + "learning_rate": 7.51965044627812e-07, + "loss": 0.079, + "step": 5519 + }, + { + "epoch": 0.35, + "grad_norm": 0.9206101002044337, + "learning_rate": 7.518758369789284e-07, + "loss": 0.0695, + "step": 5520 + }, + { + "epoch": 0.35, + "grad_norm": 0.9850240890306616, + "learning_rate": 7.517866185845152e-07, + "loss": 0.3234, + "step": 5521 + }, + { + "epoch": 0.35, + "grad_norm": 0.4658518357104681, + "learning_rate": 7.516973894483788e-07, + "loss": 0.3014, + "step": 5522 + }, + { + "epoch": 0.35, + "grad_norm": 0.5990441004132712, + "learning_rate": 7.516081495743258e-07, + "loss": 0.0746, + "step": 5523 + }, + { + "epoch": 0.35, + "grad_norm": 0.9125143811157321, + "learning_rate": 7.515188989661631e-07, + "loss": 0.098, + "step": 5524 + }, + { + "epoch": 0.35, + "grad_norm": 1.6742846516022356, + "learning_rate": 7.514296376276988e-07, + "loss": 0.1169, + "step": 5525 + }, + { + "epoch": 0.35, + "grad_norm": 0.9410200416897807, + "learning_rate": 7.513403655627407e-07, + "loss": 0.2208, + "step": 5526 + }, + { + "epoch": 0.35, + "grad_norm": 1.1831979924336038, + "learning_rate": 7.512510827750973e-07, + "loss": 0.336, + "step": 5527 + }, + { + "epoch": 0.35, + "grad_norm": 0.6164531950287837, + "learning_rate": 7.511617892685775e-07, + "loss": 0.2676, + "step": 5528 + }, + { + "epoch": 0.35, + "grad_norm": 1.0073977437619148, + "learning_rate": 7.51072485046991e-07, + "loss": 0.0981, + "step": 5529 + }, + { + "epoch": 0.35, + "grad_norm": 0.5920372762894701, + "learning_rate": 7.509831701141476e-07, + "loss": 0.2343, + "step": 5530 + }, + { + "epoch": 0.35, + "grad_norm": 2.5246433394510883, + "learning_rate": 7.508938444738575e-07, + "loss": 0.2047, + "step": 5531 + }, + { + "epoch": 0.35, + "grad_norm": 0.41019274446527026, + "learning_rate": 7.508045081299317e-07, + "loss": 0.3106, + "step": 5532 + }, + { + "epoch": 0.35, + "grad_norm": 0.9708985114565645, + "learning_rate": 7.507151610861815e-07, + "loss": 0.2222, + "step": 5533 + }, + { + "epoch": 0.35, + "grad_norm": 1.4165937334867051, + "learning_rate": 7.506258033464183e-07, + "loss": 0.3681, + "step": 5534 + }, + { + "epoch": 0.35, + "grad_norm": 3.665154380980617, + "learning_rate": 7.505364349144547e-07, + "loss": 0.0905, + "step": 5535 + }, + { + "epoch": 0.35, + "grad_norm": 0.3393190699864382, + "learning_rate": 7.504470557941032e-07, + "loss": 0.1616, + "step": 5536 + }, + { + "epoch": 0.35, + "grad_norm": 1.3690012066375732, + "learning_rate": 7.503576659891767e-07, + "loss": 0.3203, + "step": 5537 + }, + { + "epoch": 0.35, + "grad_norm": 0.8934533090948807, + "learning_rate": 7.502682655034889e-07, + "loss": 0.2346, + "step": 5538 + }, + { + "epoch": 0.35, + "grad_norm": 0.39530251328778704, + "learning_rate": 7.501788543408538e-07, + "loss": 0.2292, + "step": 5539 + }, + { + "epoch": 0.35, + "grad_norm": 0.7845545391057316, + "learning_rate": 7.50089432505086e-07, + "loss": 0.4175, + "step": 5540 + }, + { + "epoch": 0.35, + "grad_norm": 0.6618787956586863, + "learning_rate": 7.5e-07, + "loss": 0.3286, + "step": 5541 + }, + { + "epoch": 0.35, + "grad_norm": 0.7129523944298292, + "learning_rate": 7.499105568294117e-07, + "loss": 0.3006, + "step": 5542 + }, + { + "epoch": 0.35, + "grad_norm": 0.34117821356147626, + "learning_rate": 7.498211029971364e-07, + "loss": 0.0433, + "step": 5543 + }, + { + "epoch": 0.35, + "grad_norm": 0.31330844388871015, + "learning_rate": 7.497316385069907e-07, + "loss": 0.0605, + "step": 5544 + }, + { + "epoch": 0.35, + "grad_norm": 0.38548290144148534, + "learning_rate": 7.496421633627914e-07, + "loss": 0.2189, + "step": 5545 + }, + { + "epoch": 0.35, + "grad_norm": 1.0065995220738866, + "learning_rate": 7.495526775683555e-07, + "loss": 0.2171, + "step": 5546 + }, + { + "epoch": 0.35, + "grad_norm": 0.5711721107906526, + "learning_rate": 7.494631811275007e-07, + "loss": 0.1296, + "step": 5547 + }, + { + "epoch": 0.35, + "grad_norm": 1.3489269810369782, + "learning_rate": 7.493736740440451e-07, + "loss": 0.2914, + "step": 5548 + }, + { + "epoch": 0.35, + "grad_norm": 0.951365099114088, + "learning_rate": 7.492841563218073e-07, + "loss": 0.3966, + "step": 5549 + }, + { + "epoch": 0.35, + "grad_norm": 3.8814859294325017, + "learning_rate": 7.491946279646063e-07, + "loss": 0.2021, + "step": 5550 + }, + { + "epoch": 0.35, + "grad_norm": 0.7517411318173365, + "learning_rate": 7.491050889762615e-07, + "loss": 0.0111, + "step": 5551 + }, + { + "epoch": 0.35, + "grad_norm": 2.061138678911251, + "learning_rate": 7.490155393605928e-07, + "loss": 0.1306, + "step": 5552 + }, + { + "epoch": 0.35, + "grad_norm": 1.0569228540036513, + "learning_rate": 7.489259791214207e-07, + "loss": 0.1079, + "step": 5553 + }, + { + "epoch": 0.35, + "grad_norm": 0.9278994889987051, + "learning_rate": 7.488364082625658e-07, + "loss": 0.3206, + "step": 5554 + }, + { + "epoch": 0.35, + "grad_norm": 0.8308183209066643, + "learning_rate": 7.487468267878496e-07, + "loss": 0.3183, + "step": 5555 + }, + { + "epoch": 0.35, + "grad_norm": 1.509597860789319, + "learning_rate": 7.486572347010936e-07, + "loss": 0.2289, + "step": 5556 + }, + { + "epoch": 0.35, + "grad_norm": 1.1048010166411033, + "learning_rate": 7.485676320061203e-07, + "loss": 0.214, + "step": 5557 + }, + { + "epoch": 0.35, + "grad_norm": 0.39917487915009037, + "learning_rate": 7.48478018706752e-07, + "loss": 0.1675, + "step": 5558 + }, + { + "epoch": 0.35, + "grad_norm": 0.9550023711294141, + "learning_rate": 7.48388394806812e-07, + "loss": 0.2732, + "step": 5559 + }, + { + "epoch": 0.35, + "grad_norm": 0.29584155540136137, + "learning_rate": 7.482987603101236e-07, + "loss": 0.0789, + "step": 5560 + }, + { + "epoch": 0.35, + "grad_norm": 0.6734566651477544, + "learning_rate": 7.482091152205111e-07, + "loss": 0.1139, + "step": 5561 + }, + { + "epoch": 0.35, + "grad_norm": 0.2411371701448759, + "learning_rate": 7.481194595417987e-07, + "loss": 0.0923, + "step": 5562 + }, + { + "epoch": 0.35, + "grad_norm": 0.5948113824946445, + "learning_rate": 7.480297932778115e-07, + "loss": 0.3879, + "step": 5563 + }, + { + "epoch": 0.35, + "grad_norm": 1.2976684576031656, + "learning_rate": 7.479401164323744e-07, + "loss": 0.0527, + "step": 5564 + }, + { + "epoch": 0.35, + "grad_norm": 0.6931557936213576, + "learning_rate": 7.478504290093137e-07, + "loss": 0.3137, + "step": 5565 + }, + { + "epoch": 0.35, + "grad_norm": 1.93424600996979, + "learning_rate": 7.477607310124556e-07, + "loss": 0.0462, + "step": 5566 + }, + { + "epoch": 0.36, + "grad_norm": 0.7233646757544797, + "learning_rate": 7.476710224456267e-07, + "loss": 0.2318, + "step": 5567 + }, + { + "epoch": 0.36, + "grad_norm": 1.7154052090167797, + "learning_rate": 7.475813033126539e-07, + "loss": 0.2672, + "step": 5568 + }, + { + "epoch": 0.36, + "grad_norm": 0.36258857569078284, + "learning_rate": 7.47491573617365e-07, + "loss": 0.153, + "step": 5569 + }, + { + "epoch": 0.36, + "grad_norm": 1.030209076820536, + "learning_rate": 7.474018333635881e-07, + "loss": 0.2758, + "step": 5570 + }, + { + "epoch": 0.36, + "grad_norm": 0.9478448659022397, + "learning_rate": 7.473120825551516e-07, + "loss": 0.2027, + "step": 5571 + }, + { + "epoch": 0.36, + "grad_norm": 1.7993747337189498, + "learning_rate": 7.472223211958845e-07, + "loss": 0.0878, + "step": 5572 + }, + { + "epoch": 0.36, + "grad_norm": 8.318048620247762, + "learning_rate": 7.471325492896163e-07, + "loss": 0.2963, + "step": 5573 + }, + { + "epoch": 0.36, + "grad_norm": 0.7229115735007899, + "learning_rate": 7.470427668401766e-07, + "loss": 0.1638, + "step": 5574 + }, + { + "epoch": 0.36, + "grad_norm": 0.8616759970056704, + "learning_rate": 7.469529738513959e-07, + "loss": 0.3767, + "step": 5575 + }, + { + "epoch": 0.36, + "grad_norm": 0.2840704025590366, + "learning_rate": 7.468631703271049e-07, + "loss": 0.1306, + "step": 5576 + }, + { + "epoch": 0.36, + "grad_norm": 0.24844192009676036, + "learning_rate": 7.467733562711349e-07, + "loss": 0.0068, + "step": 5577 + }, + { + "epoch": 0.36, + "grad_norm": 0.7622158886912915, + "learning_rate": 7.466835316873173e-07, + "loss": 0.1614, + "step": 5578 + }, + { + "epoch": 0.36, + "grad_norm": 0.5074933879665838, + "learning_rate": 7.465936965794844e-07, + "loss": 0.0773, + "step": 5579 + }, + { + "epoch": 0.36, + "grad_norm": 1.2259056590904156, + "learning_rate": 7.465038509514687e-07, + "loss": 0.3512, + "step": 5580 + }, + { + "epoch": 0.36, + "grad_norm": 1.346760402131009, + "learning_rate": 7.464139948071032e-07, + "loss": 0.1872, + "step": 5581 + }, + { + "epoch": 0.36, + "grad_norm": 0.4691340629334052, + "learning_rate": 7.463241281502213e-07, + "loss": 0.2107, + "step": 5582 + }, + { + "epoch": 0.36, + "grad_norm": 0.627102704587824, + "learning_rate": 7.462342509846569e-07, + "loss": 0.1586, + "step": 5583 + }, + { + "epoch": 0.36, + "grad_norm": 0.2568205072347259, + "learning_rate": 7.461443633142445e-07, + "loss": 0.0881, + "step": 5584 + }, + { + "epoch": 0.36, + "grad_norm": 0.4491214798196706, + "learning_rate": 7.460544651428186e-07, + "loss": 0.3717, + "step": 5585 + }, + { + "epoch": 0.36, + "grad_norm": 1.5255583347139716, + "learning_rate": 7.459645564742147e-07, + "loss": 0.192, + "step": 5586 + }, + { + "epoch": 0.36, + "grad_norm": 0.7139963882199662, + "learning_rate": 7.458746373122682e-07, + "loss": 0.2344, + "step": 5587 + }, + { + "epoch": 0.36, + "grad_norm": 0.8754339092552733, + "learning_rate": 7.457847076608154e-07, + "loss": 0.2091, + "step": 5588 + }, + { + "epoch": 0.36, + "grad_norm": 3.737726411806768, + "learning_rate": 7.456947675236931e-07, + "loss": 0.227, + "step": 5589 + }, + { + "epoch": 0.36, + "grad_norm": 1.943797248690084, + "learning_rate": 7.45604816904738e-07, + "loss": 0.4343, + "step": 5590 + }, + { + "epoch": 0.36, + "grad_norm": 1.5659900317588396, + "learning_rate": 7.455148558077875e-07, + "loss": 0.107, + "step": 5591 + }, + { + "epoch": 0.36, + "grad_norm": 4.007616680055804, + "learning_rate": 7.454248842366799e-07, + "loss": 0.1941, + "step": 5592 + }, + { + "epoch": 0.36, + "grad_norm": 0.41591064194367167, + "learning_rate": 7.453349021952533e-07, + "loss": 0.0516, + "step": 5593 + }, + { + "epoch": 0.36, + "grad_norm": 1.3143605152013824, + "learning_rate": 7.452449096873467e-07, + "loss": 0.0915, + "step": 5594 + }, + { + "epoch": 0.36, + "grad_norm": 0.8047655118185578, + "learning_rate": 7.451549067167993e-07, + "loss": 0.179, + "step": 5595 + }, + { + "epoch": 0.36, + "grad_norm": 1.744229908964181, + "learning_rate": 7.450648932874506e-07, + "loss": 0.0828, + "step": 5596 + }, + { + "epoch": 0.36, + "grad_norm": 0.6446813391299382, + "learning_rate": 7.449748694031411e-07, + "loss": 0.1499, + "step": 5597 + }, + { + "epoch": 0.36, + "grad_norm": 0.9662495016901049, + "learning_rate": 7.44884835067711e-07, + "loss": 0.3574, + "step": 5598 + }, + { + "epoch": 0.36, + "grad_norm": 0.7946516559915121, + "learning_rate": 7.447947902850015e-07, + "loss": 0.3775, + "step": 5599 + }, + { + "epoch": 0.36, + "grad_norm": 0.6918885825520503, + "learning_rate": 7.447047350588542e-07, + "loss": 0.2905, + "step": 5600 + }, + { + "epoch": 0.36, + "grad_norm": 0.9162217735464907, + "learning_rate": 7.44614669393111e-07, + "loss": 0.2766, + "step": 5601 + }, + { + "epoch": 0.36, + "grad_norm": 0.5856221977423952, + "learning_rate": 7.445245932916145e-07, + "loss": 0.1782, + "step": 5602 + }, + { + "epoch": 0.36, + "grad_norm": 1.130971782366612, + "learning_rate": 7.44434506758207e-07, + "loss": 0.3936, + "step": 5603 + }, + { + "epoch": 0.36, + "grad_norm": 0.49956457392869424, + "learning_rate": 7.443444097967322e-07, + "loss": 0.1684, + "step": 5604 + }, + { + "epoch": 0.36, + "grad_norm": 5.442124914605853, + "learning_rate": 7.442543024110336e-07, + "loss": 0.1308, + "step": 5605 + }, + { + "epoch": 0.36, + "grad_norm": 1.1012306541307841, + "learning_rate": 7.441641846049556e-07, + "loss": 0.157, + "step": 5606 + }, + { + "epoch": 0.36, + "grad_norm": 0.8128936184046678, + "learning_rate": 7.440740563823424e-07, + "loss": 0.2805, + "step": 5607 + }, + { + "epoch": 0.36, + "grad_norm": 0.0958500784278856, + "learning_rate": 7.439839177470395e-07, + "loss": 0.0014, + "step": 5608 + }, + { + "epoch": 0.36, + "grad_norm": 1.0622095849821385, + "learning_rate": 7.438937687028922e-07, + "loss": 0.0322, + "step": 5609 + }, + { + "epoch": 0.36, + "grad_norm": 0.82240783570984, + "learning_rate": 7.438036092537464e-07, + "loss": 0.1379, + "step": 5610 + }, + { + "epoch": 0.36, + "grad_norm": 1.0064401223486799, + "learning_rate": 7.437134394034486e-07, + "loss": 0.271, + "step": 5611 + }, + { + "epoch": 0.36, + "grad_norm": 0.5824506377365629, + "learning_rate": 7.436232591558453e-07, + "loss": 0.1038, + "step": 5612 + }, + { + "epoch": 0.36, + "grad_norm": 0.6278024259582801, + "learning_rate": 7.435330685147842e-07, + "loss": 0.3068, + "step": 5613 + }, + { + "epoch": 0.36, + "grad_norm": 7.263689566913914, + "learning_rate": 7.434428674841129e-07, + "loss": 0.4099, + "step": 5614 + }, + { + "epoch": 0.36, + "grad_norm": 0.536778118306459, + "learning_rate": 7.433526560676795e-07, + "loss": 0.2519, + "step": 5615 + }, + { + "epoch": 0.36, + "grad_norm": 0.22056175117818586, + "learning_rate": 7.432624342693325e-07, + "loss": 0.0012, + "step": 5616 + }, + { + "epoch": 0.36, + "grad_norm": 1.0107420371992206, + "learning_rate": 7.431722020929209e-07, + "loss": 0.3657, + "step": 5617 + }, + { + "epoch": 0.36, + "grad_norm": 0.38133028570314137, + "learning_rate": 7.430819595422944e-07, + "loss": 0.0995, + "step": 5618 + }, + { + "epoch": 0.36, + "grad_norm": 0.3274933073862944, + "learning_rate": 7.429917066213029e-07, + "loss": 0.1068, + "step": 5619 + }, + { + "epoch": 0.36, + "grad_norm": 0.7391526671153332, + "learning_rate": 7.429014433337968e-07, + "loss": 0.2051, + "step": 5620 + }, + { + "epoch": 0.36, + "grad_norm": 0.6382017648543542, + "learning_rate": 7.428111696836268e-07, + "loss": 0.0841, + "step": 5621 + }, + { + "epoch": 0.36, + "grad_norm": 0.5479588869795209, + "learning_rate": 7.427208856746443e-07, + "loss": 0.0117, + "step": 5622 + }, + { + "epoch": 0.36, + "grad_norm": 1.4681924472219252, + "learning_rate": 7.426305913107007e-07, + "loss": 0.2606, + "step": 5623 + }, + { + "epoch": 0.36, + "grad_norm": 0.4349478615381459, + "learning_rate": 7.425402865956484e-07, + "loss": 0.2065, + "step": 5624 + }, + { + "epoch": 0.36, + "grad_norm": 0.48133390750990807, + "learning_rate": 7.424499715333398e-07, + "loss": 0.2311, + "step": 5625 + }, + { + "epoch": 0.36, + "grad_norm": 0.43393289337501645, + "learning_rate": 7.42359646127628e-07, + "loss": 0.0131, + "step": 5626 + }, + { + "epoch": 0.36, + "grad_norm": 0.9901231319832863, + "learning_rate": 7.422693103823667e-07, + "loss": 0.3646, + "step": 5627 + }, + { + "epoch": 0.36, + "grad_norm": 0.8897654773571841, + "learning_rate": 7.421789643014095e-07, + "loss": 0.2595, + "step": 5628 + }, + { + "epoch": 0.36, + "grad_norm": 0.08700536088908316, + "learning_rate": 7.420886078886109e-07, + "loss": 0.0035, + "step": 5629 + }, + { + "epoch": 0.36, + "grad_norm": 0.5091640443719467, + "learning_rate": 7.419982411478255e-07, + "loss": 0.2817, + "step": 5630 + }, + { + "epoch": 0.36, + "grad_norm": 0.6308944245010688, + "learning_rate": 7.419078640829087e-07, + "loss": 0.3434, + "step": 5631 + }, + { + "epoch": 0.36, + "grad_norm": 1.325752180559273, + "learning_rate": 7.418174766977161e-07, + "loss": 0.0618, + "step": 5632 + }, + { + "epoch": 0.36, + "grad_norm": 0.7016784882125594, + "learning_rate": 7.417270789961039e-07, + "loss": 0.2734, + "step": 5633 + }, + { + "epoch": 0.36, + "grad_norm": 0.7981139381820834, + "learning_rate": 7.416366709819286e-07, + "loss": 0.2542, + "step": 5634 + }, + { + "epoch": 0.36, + "grad_norm": 0.570908425410927, + "learning_rate": 7.415462526590471e-07, + "loss": 0.315, + "step": 5635 + }, + { + "epoch": 0.36, + "grad_norm": 0.6039784607978905, + "learning_rate": 7.414558240313169e-07, + "loss": 0.0198, + "step": 5636 + }, + { + "epoch": 0.36, + "grad_norm": 0.8918662710400138, + "learning_rate": 7.413653851025958e-07, + "loss": 0.0737, + "step": 5637 + }, + { + "epoch": 0.36, + "grad_norm": 0.6018072937725762, + "learning_rate": 7.412749358767422e-07, + "loss": 0.1989, + "step": 5638 + }, + { + "epoch": 0.36, + "grad_norm": 0.5927248964447291, + "learning_rate": 7.41184476357615e-07, + "loss": 0.1069, + "step": 5639 + }, + { + "epoch": 0.36, + "grad_norm": 0.8822401615052278, + "learning_rate": 7.410940065490731e-07, + "loss": 0.1038, + "step": 5640 + }, + { + "epoch": 0.36, + "grad_norm": 0.9121659841341959, + "learning_rate": 7.410035264549761e-07, + "loss": 0.1719, + "step": 5641 + }, + { + "epoch": 0.36, + "grad_norm": 1.2428128379739538, + "learning_rate": 7.409130360791842e-07, + "loss": 0.2085, + "step": 5642 + }, + { + "epoch": 0.36, + "grad_norm": 1.9114363804896441, + "learning_rate": 7.408225354255579e-07, + "loss": 0.2893, + "step": 5643 + }, + { + "epoch": 0.36, + "grad_norm": 0.56843863463519, + "learning_rate": 7.407320244979581e-07, + "loss": 0.1195, + "step": 5644 + }, + { + "epoch": 0.36, + "grad_norm": 0.5664476519349504, + "learning_rate": 7.406415033002463e-07, + "loss": 0.1656, + "step": 5645 + }, + { + "epoch": 0.36, + "grad_norm": 0.8187165423109634, + "learning_rate": 7.405509718362841e-07, + "loss": 0.0665, + "step": 5646 + }, + { + "epoch": 0.36, + "grad_norm": 0.7366153668689416, + "learning_rate": 7.404604301099339e-07, + "loss": 0.2392, + "step": 5647 + }, + { + "epoch": 0.36, + "grad_norm": 0.8249155852385768, + "learning_rate": 7.403698781250586e-07, + "loss": 0.4827, + "step": 5648 + }, + { + "epoch": 0.36, + "grad_norm": 0.18021192622080656, + "learning_rate": 7.402793158855209e-07, + "loss": 0.0045, + "step": 5649 + }, + { + "epoch": 0.36, + "grad_norm": 0.6927277860411692, + "learning_rate": 7.401887433951847e-07, + "loss": 0.1217, + "step": 5650 + }, + { + "epoch": 0.36, + "grad_norm": 0.3272280206798197, + "learning_rate": 7.400981606579138e-07, + "loss": 0.0934, + "step": 5651 + }, + { + "epoch": 0.36, + "grad_norm": 0.2544410262624939, + "learning_rate": 7.400075676775724e-07, + "loss": 0.1007, + "step": 5652 + }, + { + "epoch": 0.36, + "grad_norm": 2.2623609796207327, + "learning_rate": 7.39916964458026e-07, + "loss": 0.2208, + "step": 5653 + }, + { + "epoch": 0.36, + "grad_norm": 1.3304452078577227, + "learning_rate": 7.398263510031395e-07, + "loss": 0.2133, + "step": 5654 + }, + { + "epoch": 0.36, + "grad_norm": 0.6820556533519123, + "learning_rate": 7.397357273167788e-07, + "loss": 0.2927, + "step": 5655 + }, + { + "epoch": 0.36, + "grad_norm": 3.988366638314936, + "learning_rate": 7.396450934028101e-07, + "loss": 0.2815, + "step": 5656 + }, + { + "epoch": 0.36, + "grad_norm": 2.792737750103525, + "learning_rate": 7.395544492650999e-07, + "loss": 0.077, + "step": 5657 + }, + { + "epoch": 0.36, + "grad_norm": 1.1168716330631443, + "learning_rate": 7.394637949075154e-07, + "loss": 0.3105, + "step": 5658 + }, + { + "epoch": 0.36, + "grad_norm": 0.6715781478947498, + "learning_rate": 7.393731303339239e-07, + "loss": 0.2448, + "step": 5659 + }, + { + "epoch": 0.36, + "grad_norm": 1.4642124289382994, + "learning_rate": 7.392824555481935e-07, + "loss": 0.0388, + "step": 5660 + }, + { + "epoch": 0.36, + "grad_norm": 0.3997446258465748, + "learning_rate": 7.391917705541925e-07, + "loss": 0.1816, + "step": 5661 + }, + { + "epoch": 0.36, + "grad_norm": 3.179714691245469, + "learning_rate": 7.391010753557898e-07, + "loss": 0.1698, + "step": 5662 + }, + { + "epoch": 0.36, + "grad_norm": 5.5078712312561215, + "learning_rate": 7.390103699568546e-07, + "loss": 0.0489, + "step": 5663 + }, + { + "epoch": 0.36, + "grad_norm": 0.8097312463539856, + "learning_rate": 7.389196543612566e-07, + "loss": 0.2355, + "step": 5664 + }, + { + "epoch": 0.36, + "grad_norm": 0.612219173660267, + "learning_rate": 7.388289285728657e-07, + "loss": 0.1088, + "step": 5665 + }, + { + "epoch": 0.36, + "grad_norm": 0.4675509341118895, + "learning_rate": 7.387381925955527e-07, + "loss": 0.1119, + "step": 5666 + }, + { + "epoch": 0.36, + "grad_norm": 0.36521656834890354, + "learning_rate": 7.386474464331884e-07, + "loss": 0.0157, + "step": 5667 + }, + { + "epoch": 0.36, + "grad_norm": 0.6613329380880284, + "learning_rate": 7.385566900896444e-07, + "loss": 0.4098, + "step": 5668 + }, + { + "epoch": 0.36, + "grad_norm": 1.308144804980323, + "learning_rate": 7.384659235687923e-07, + "loss": 0.1153, + "step": 5669 + }, + { + "epoch": 0.36, + "grad_norm": 0.2843589636687059, + "learning_rate": 7.383751468745045e-07, + "loss": 0.2318, + "step": 5670 + }, + { + "epoch": 0.36, + "grad_norm": 1.9875939890431915, + "learning_rate": 7.382843600106539e-07, + "loss": 0.3433, + "step": 5671 + }, + { + "epoch": 0.36, + "grad_norm": 0.4268691867817186, + "learning_rate": 7.381935629811133e-07, + "loss": 0.1946, + "step": 5672 + }, + { + "epoch": 0.36, + "grad_norm": 0.5107892003036639, + "learning_rate": 7.381027557897567e-07, + "loss": 0.0937, + "step": 5673 + }, + { + "epoch": 0.36, + "grad_norm": 3.27515494830063, + "learning_rate": 7.380119384404578e-07, + "loss": 0.0747, + "step": 5674 + }, + { + "epoch": 0.36, + "grad_norm": 0.6790032271875316, + "learning_rate": 7.379211109370911e-07, + "loss": 0.2643, + "step": 5675 + }, + { + "epoch": 0.36, + "grad_norm": 0.26761219533808595, + "learning_rate": 7.378302732835316e-07, + "loss": 0.1076, + "step": 5676 + }, + { + "epoch": 0.36, + "grad_norm": 0.500631776750934, + "learning_rate": 7.377394254836547e-07, + "loss": 0.1016, + "step": 5677 + }, + { + "epoch": 0.36, + "grad_norm": 0.5614007688444749, + "learning_rate": 7.376485675413356e-07, + "loss": 0.1787, + "step": 5678 + }, + { + "epoch": 0.36, + "grad_norm": 0.3439127880289776, + "learning_rate": 7.375576994604511e-07, + "loss": 0.0058, + "step": 5679 + }, + { + "epoch": 0.36, + "grad_norm": 1.158611270169246, + "learning_rate": 7.374668212448776e-07, + "loss": 0.2079, + "step": 5680 + }, + { + "epoch": 0.36, + "grad_norm": 0.9345290844976936, + "learning_rate": 7.373759328984921e-07, + "loss": 0.371, + "step": 5681 + }, + { + "epoch": 0.36, + "grad_norm": 0.6145023831308265, + "learning_rate": 7.37285034425172e-07, + "loss": 0.1397, + "step": 5682 + }, + { + "epoch": 0.36, + "grad_norm": 1.2459527759727258, + "learning_rate": 7.371941258287955e-07, + "loss": 0.0973, + "step": 5683 + }, + { + "epoch": 0.36, + "grad_norm": 0.9089091616003668, + "learning_rate": 7.371032071132408e-07, + "loss": 0.2132, + "step": 5684 + }, + { + "epoch": 0.36, + "grad_norm": 1.1794201041196493, + "learning_rate": 7.370122782823866e-07, + "loss": 0.3221, + "step": 5685 + }, + { + "epoch": 0.36, + "grad_norm": 0.886339245117712, + "learning_rate": 7.36921339340112e-07, + "loss": 0.0274, + "step": 5686 + }, + { + "epoch": 0.36, + "grad_norm": 0.6004655532216744, + "learning_rate": 7.368303902902969e-07, + "loss": 0.2572, + "step": 5687 + }, + { + "epoch": 0.36, + "grad_norm": 1.5554489179893731, + "learning_rate": 7.367394311368212e-07, + "loss": 0.1274, + "step": 5688 + }, + { + "epoch": 0.36, + "grad_norm": 1.8241326705184715, + "learning_rate": 7.366484618835656e-07, + "loss": 0.115, + "step": 5689 + }, + { + "epoch": 0.36, + "grad_norm": 5.236697076427162, + "learning_rate": 7.36557482534411e-07, + "loss": 0.0819, + "step": 5690 + }, + { + "epoch": 0.36, + "grad_norm": 1.0167983250243307, + "learning_rate": 7.364664930932384e-07, + "loss": 0.0316, + "step": 5691 + }, + { + "epoch": 0.36, + "grad_norm": 0.6488208347208061, + "learning_rate": 7.3637549356393e-07, + "loss": 0.3033, + "step": 5692 + }, + { + "epoch": 0.36, + "grad_norm": 0.6161355625411944, + "learning_rate": 7.362844839503677e-07, + "loss": 0.2882, + "step": 5693 + }, + { + "epoch": 0.36, + "grad_norm": 1.2956359600938927, + "learning_rate": 7.361934642564345e-07, + "loss": 0.5045, + "step": 5694 + }, + { + "epoch": 0.36, + "grad_norm": 0.3618528380705309, + "learning_rate": 7.361024344860132e-07, + "loss": 0.332, + "step": 5695 + }, + { + "epoch": 0.36, + "grad_norm": 1.1060362685573237, + "learning_rate": 7.360113946429873e-07, + "loss": 0.1582, + "step": 5696 + }, + { + "epoch": 0.36, + "grad_norm": 0.5879530344047207, + "learning_rate": 7.35920344731241e-07, + "loss": 0.1334, + "step": 5697 + }, + { + "epoch": 0.36, + "grad_norm": 3.851411422399082, + "learning_rate": 7.358292847546585e-07, + "loss": 0.1131, + "step": 5698 + }, + { + "epoch": 0.36, + "grad_norm": 0.5272105328609933, + "learning_rate": 7.357382147171247e-07, + "loss": 0.2389, + "step": 5699 + }, + { + "epoch": 0.36, + "grad_norm": 0.5566793042569607, + "learning_rate": 7.356471346225248e-07, + "loss": 0.0871, + "step": 5700 + }, + { + "epoch": 0.36, + "grad_norm": 0.5260969300371358, + "learning_rate": 7.355560444747444e-07, + "loss": 0.1068, + "step": 5701 + }, + { + "epoch": 0.36, + "grad_norm": 0.43081534978532504, + "learning_rate": 7.354649442776696e-07, + "loss": 0.0796, + "step": 5702 + }, + { + "epoch": 0.36, + "grad_norm": 0.9839624499303066, + "learning_rate": 7.35373834035187e-07, + "loss": 0.1926, + "step": 5703 + }, + { + "epoch": 0.36, + "grad_norm": 0.4249842854795375, + "learning_rate": 7.352827137511835e-07, + "loss": 0.048, + "step": 5704 + }, + { + "epoch": 0.36, + "grad_norm": 0.4826094211081446, + "learning_rate": 7.351915834295462e-07, + "loss": 0.2404, + "step": 5705 + }, + { + "epoch": 0.36, + "grad_norm": 0.46693183919675874, + "learning_rate": 7.351004430741633e-07, + "loss": 0.1472, + "step": 5706 + }, + { + "epoch": 0.36, + "grad_norm": 3.4642519926183546, + "learning_rate": 7.350092926889229e-07, + "loss": 0.2596, + "step": 5707 + }, + { + "epoch": 0.36, + "grad_norm": 0.3382804651693839, + "learning_rate": 7.349181322777137e-07, + "loss": 0.0568, + "step": 5708 + }, + { + "epoch": 0.36, + "grad_norm": 0.2676612799329265, + "learning_rate": 7.348269618444247e-07, + "loss": 0.0702, + "step": 5709 + }, + { + "epoch": 0.36, + "grad_norm": 0.9893205741285498, + "learning_rate": 7.347357813929454e-07, + "loss": 0.3175, + "step": 5710 + }, + { + "epoch": 0.36, + "grad_norm": 0.8570067933630025, + "learning_rate": 7.346445909271658e-07, + "loss": 0.2645, + "step": 5711 + }, + { + "epoch": 0.36, + "grad_norm": 0.9840025305888707, + "learning_rate": 7.345533904509763e-07, + "loss": 0.0598, + "step": 5712 + }, + { + "epoch": 0.36, + "grad_norm": 0.4118979936901152, + "learning_rate": 7.344621799682675e-07, + "loss": 0.1159, + "step": 5713 + }, + { + "epoch": 0.36, + "grad_norm": 3.171307740778579, + "learning_rate": 7.343709594829311e-07, + "loss": 0.0209, + "step": 5714 + }, + { + "epoch": 0.36, + "grad_norm": 0.6712994625698699, + "learning_rate": 7.34279728998858e-07, + "loss": 0.1095, + "step": 5715 + }, + { + "epoch": 0.36, + "grad_norm": 1.0261338381513776, + "learning_rate": 7.34188488519941e-07, + "loss": 0.28, + "step": 5716 + }, + { + "epoch": 0.36, + "grad_norm": 0.4612067449269904, + "learning_rate": 7.340972380500722e-07, + "loss": 0.244, + "step": 5717 + }, + { + "epoch": 0.36, + "grad_norm": 1.6316223035935755, + "learning_rate": 7.340059775931447e-07, + "loss": 0.46, + "step": 5718 + }, + { + "epoch": 0.36, + "grad_norm": 0.3100217606040487, + "learning_rate": 7.339147071530518e-07, + "loss": 0.1125, + "step": 5719 + }, + { + "epoch": 0.36, + "grad_norm": 1.087385815884791, + "learning_rate": 7.338234267336872e-07, + "loss": 0.26, + "step": 5720 + }, + { + "epoch": 0.36, + "grad_norm": 0.678636488255624, + "learning_rate": 7.337321363389452e-07, + "loss": 0.2709, + "step": 5721 + }, + { + "epoch": 0.36, + "grad_norm": 0.62897584509711, + "learning_rate": 7.336408359727203e-07, + "loss": 0.2842, + "step": 5722 + }, + { + "epoch": 0.36, + "grad_norm": 0.18081937509925167, + "learning_rate": 7.335495256389077e-07, + "loss": 0.0797, + "step": 5723 + }, + { + "epoch": 0.37, + "grad_norm": 0.6151101410382191, + "learning_rate": 7.334582053414029e-07, + "loss": 0.1554, + "step": 5724 + }, + { + "epoch": 0.37, + "grad_norm": 0.6673374149633937, + "learning_rate": 7.333668750841016e-07, + "loss": 0.1558, + "step": 5725 + }, + { + "epoch": 0.37, + "grad_norm": 3.1063472558336795, + "learning_rate": 7.332755348709005e-07, + "loss": 0.1048, + "step": 5726 + }, + { + "epoch": 0.37, + "grad_norm": 1.206564300336258, + "learning_rate": 7.331841847056961e-07, + "loss": 0.0903, + "step": 5727 + }, + { + "epoch": 0.37, + "grad_norm": 0.5047740906055621, + "learning_rate": 7.330928245923856e-07, + "loss": 0.0925, + "step": 5728 + }, + { + "epoch": 0.37, + "grad_norm": 1.3432459942569583, + "learning_rate": 7.330014545348665e-07, + "loss": 0.058, + "step": 5729 + }, + { + "epoch": 0.37, + "grad_norm": 0.5192291835658449, + "learning_rate": 7.329100745370371e-07, + "loss": 0.1255, + "step": 5730 + }, + { + "epoch": 0.37, + "grad_norm": 0.37345549224087526, + "learning_rate": 7.328186846027958e-07, + "loss": 0.1841, + "step": 5731 + }, + { + "epoch": 0.37, + "grad_norm": 0.7453052128979122, + "learning_rate": 7.327272847360411e-07, + "loss": 0.3567, + "step": 5732 + }, + { + "epoch": 0.37, + "grad_norm": 1.0458302309578975, + "learning_rate": 7.326358749406729e-07, + "loss": 0.2815, + "step": 5733 + }, + { + "epoch": 0.37, + "grad_norm": 0.9210884382714356, + "learning_rate": 7.325444552205903e-07, + "loss": 0.1767, + "step": 5734 + }, + { + "epoch": 0.37, + "grad_norm": 0.4306498508361315, + "learning_rate": 7.32453025579694e-07, + "loss": 0.2736, + "step": 5735 + }, + { + "epoch": 0.37, + "grad_norm": 0.6874778263152205, + "learning_rate": 7.323615860218842e-07, + "loss": 0.271, + "step": 5736 + }, + { + "epoch": 0.37, + "grad_norm": 4.233931078266332, + "learning_rate": 7.322701365510622e-07, + "loss": 0.3315, + "step": 5737 + }, + { + "epoch": 0.37, + "grad_norm": 0.8323186982992874, + "learning_rate": 7.321786771711291e-07, + "loss": 0.3319, + "step": 5738 + }, + { + "epoch": 0.37, + "grad_norm": 0.7889333812689606, + "learning_rate": 7.32087207885987e-07, + "loss": 0.3679, + "step": 5739 + }, + { + "epoch": 0.37, + "grad_norm": 1.7308759383840684, + "learning_rate": 7.31995728699538e-07, + "loss": 0.3541, + "step": 5740 + }, + { + "epoch": 0.37, + "grad_norm": 1.686361723910737, + "learning_rate": 7.319042396156848e-07, + "loss": 0.2275, + "step": 5741 + }, + { + "epoch": 0.37, + "grad_norm": 0.5245958672103136, + "learning_rate": 7.318127406383307e-07, + "loss": 0.3172, + "step": 5742 + }, + { + "epoch": 0.37, + "grad_norm": 5.226124069878523, + "learning_rate": 7.317212317713789e-07, + "loss": 0.1312, + "step": 5743 + }, + { + "epoch": 0.37, + "grad_norm": 1.120338026666057, + "learning_rate": 7.316297130187336e-07, + "loss": 0.2483, + "step": 5744 + }, + { + "epoch": 0.37, + "grad_norm": 0.689993816657338, + "learning_rate": 7.315381843842994e-07, + "loss": 0.2007, + "step": 5745 + }, + { + "epoch": 0.37, + "grad_norm": 0.22364874179971245, + "learning_rate": 7.314466458719805e-07, + "loss": 0.0673, + "step": 5746 + }, + { + "epoch": 0.37, + "grad_norm": 0.3927315160954293, + "learning_rate": 7.313550974856824e-07, + "loss": 0.1532, + "step": 5747 + }, + { + "epoch": 0.37, + "grad_norm": 1.0515841969074804, + "learning_rate": 7.312635392293108e-07, + "loss": 0.3764, + "step": 5748 + }, + { + "epoch": 0.37, + "grad_norm": 0.3917197689690394, + "learning_rate": 7.311719711067716e-07, + "loss": 0.1184, + "step": 5749 + }, + { + "epoch": 0.37, + "grad_norm": 0.6463061072605664, + "learning_rate": 7.310803931219717e-07, + "loss": 0.0841, + "step": 5750 + }, + { + "epoch": 0.37, + "grad_norm": 0.9927125562299818, + "learning_rate": 7.309888052788174e-07, + "loss": 0.2835, + "step": 5751 + }, + { + "epoch": 0.37, + "grad_norm": 0.1886007504177757, + "learning_rate": 7.308972075812165e-07, + "loss": 0.0942, + "step": 5752 + }, + { + "epoch": 0.37, + "grad_norm": 0.9151289201691571, + "learning_rate": 7.308056000330766e-07, + "loss": 0.1421, + "step": 5753 + }, + { + "epoch": 0.37, + "grad_norm": 0.900235705381781, + "learning_rate": 7.307139826383058e-07, + "loss": 0.1209, + "step": 5754 + }, + { + "epoch": 0.37, + "grad_norm": 0.5837265002658238, + "learning_rate": 7.306223554008126e-07, + "loss": 0.1662, + "step": 5755 + }, + { + "epoch": 0.37, + "grad_norm": 2.064992461090195, + "learning_rate": 7.305307183245062e-07, + "loss": 0.0825, + "step": 5756 + }, + { + "epoch": 0.37, + "grad_norm": 0.42029654362577756, + "learning_rate": 7.304390714132958e-07, + "loss": 0.1253, + "step": 5757 + }, + { + "epoch": 0.37, + "grad_norm": 0.22787478007896503, + "learning_rate": 7.303474146710915e-07, + "loss": 0.0995, + "step": 5758 + }, + { + "epoch": 0.37, + "grad_norm": 0.4476001216086352, + "learning_rate": 7.302557481018034e-07, + "loss": 0.1563, + "step": 5759 + }, + { + "epoch": 0.37, + "grad_norm": 1.20644091302951, + "learning_rate": 7.301640717093423e-07, + "loss": 0.0606, + "step": 5760 + }, + { + "epoch": 0.37, + "grad_norm": 0.932890582717694, + "learning_rate": 7.30072385497619e-07, + "loss": 0.1974, + "step": 5761 + }, + { + "epoch": 0.37, + "grad_norm": 0.775157928985653, + "learning_rate": 7.299806894705455e-07, + "loss": 0.1684, + "step": 5762 + }, + { + "epoch": 0.37, + "grad_norm": 0.5930366332677323, + "learning_rate": 7.298889836320334e-07, + "loss": 0.1152, + "step": 5763 + }, + { + "epoch": 0.37, + "grad_norm": 1.5754587056495242, + "learning_rate": 7.29797267985995e-07, + "loss": 0.1185, + "step": 5764 + }, + { + "epoch": 0.37, + "grad_norm": 0.4145190021337893, + "learning_rate": 7.297055425363432e-07, + "loss": 0.0865, + "step": 5765 + }, + { + "epoch": 0.37, + "grad_norm": 4.459475753926773, + "learning_rate": 7.296138072869913e-07, + "loss": 0.1672, + "step": 5766 + }, + { + "epoch": 0.37, + "grad_norm": 2.327574824334071, + "learning_rate": 7.295220622418527e-07, + "loss": 0.0507, + "step": 5767 + }, + { + "epoch": 0.37, + "grad_norm": 1.345594717162949, + "learning_rate": 7.294303074048415e-07, + "loss": 0.0843, + "step": 5768 + }, + { + "epoch": 0.37, + "grad_norm": 1.0252372310202145, + "learning_rate": 7.293385427798721e-07, + "loss": 0.2572, + "step": 5769 + }, + { + "epoch": 0.37, + "grad_norm": 0.6216556517457695, + "learning_rate": 7.292467683708596e-07, + "loss": 0.2336, + "step": 5770 + }, + { + "epoch": 0.37, + "grad_norm": 0.6782792319158881, + "learning_rate": 7.291549841817192e-07, + "loss": 0.2633, + "step": 5771 + }, + { + "epoch": 0.37, + "grad_norm": 0.5688118833042158, + "learning_rate": 7.290631902163664e-07, + "loss": 0.0075, + "step": 5772 + }, + { + "epoch": 0.37, + "grad_norm": 0.3895485852110559, + "learning_rate": 7.289713864787175e-07, + "loss": 0.4064, + "step": 5773 + }, + { + "epoch": 0.37, + "grad_norm": 2.506150204514353, + "learning_rate": 7.288795729726889e-07, + "loss": 0.0723, + "step": 5774 + }, + { + "epoch": 0.37, + "grad_norm": 1.0517219096754018, + "learning_rate": 7.287877497021977e-07, + "loss": 0.1546, + "step": 5775 + }, + { + "epoch": 0.37, + "grad_norm": 0.8877197643238348, + "learning_rate": 7.28695916671161e-07, + "loss": 0.0975, + "step": 5776 + }, + { + "epoch": 0.37, + "grad_norm": 0.6454200695293008, + "learning_rate": 7.286040738834968e-07, + "loss": 0.1935, + "step": 5777 + }, + { + "epoch": 0.37, + "grad_norm": 0.715357209162237, + "learning_rate": 7.285122213431233e-07, + "loss": 0.0033, + "step": 5778 + }, + { + "epoch": 0.37, + "grad_norm": 0.4877099297085567, + "learning_rate": 7.28420359053959e-07, + "loss": 0.1364, + "step": 5779 + }, + { + "epoch": 0.37, + "grad_norm": 0.888481172298206, + "learning_rate": 7.283284870199231e-07, + "loss": 0.231, + "step": 5780 + }, + { + "epoch": 0.37, + "grad_norm": 0.7195130961315708, + "learning_rate": 7.28236605244935e-07, + "loss": 0.301, + "step": 5781 + }, + { + "epoch": 0.37, + "grad_norm": 0.998004839873859, + "learning_rate": 7.281447137329144e-07, + "loss": 0.2926, + "step": 5782 + }, + { + "epoch": 0.37, + "grad_norm": 1.0589799086919884, + "learning_rate": 7.280528124877817e-07, + "loss": 0.0335, + "step": 5783 + }, + { + "epoch": 0.37, + "grad_norm": 1.246847115705105, + "learning_rate": 7.279609015134577e-07, + "loss": 0.2298, + "step": 5784 + }, + { + "epoch": 0.37, + "grad_norm": 0.5127667301057278, + "learning_rate": 7.278689808138632e-07, + "loss": 0.0162, + "step": 5785 + }, + { + "epoch": 0.37, + "grad_norm": 0.5686118131564267, + "learning_rate": 7.2777705039292e-07, + "loss": 0.3105, + "step": 5786 + }, + { + "epoch": 0.37, + "grad_norm": 1.5504845596655386, + "learning_rate": 7.276851102545499e-07, + "loss": 0.1512, + "step": 5787 + }, + { + "epoch": 0.37, + "grad_norm": 0.3507589777277134, + "learning_rate": 7.275931604026752e-07, + "loss": 0.2006, + "step": 5788 + }, + { + "epoch": 0.37, + "grad_norm": 0.6363302877756208, + "learning_rate": 7.275012008412191e-07, + "loss": 0.2754, + "step": 5789 + }, + { + "epoch": 0.37, + "grad_norm": 0.7886439160855335, + "learning_rate": 7.27409231574104e-07, + "loss": 0.0081, + "step": 5790 + }, + { + "epoch": 0.37, + "grad_norm": 0.9040161559453388, + "learning_rate": 7.273172526052542e-07, + "loss": 0.3219, + "step": 5791 + }, + { + "epoch": 0.37, + "grad_norm": 6.417099807574797, + "learning_rate": 7.272252639385935e-07, + "loss": 0.3511, + "step": 5792 + }, + { + "epoch": 0.37, + "grad_norm": 0.803623343303704, + "learning_rate": 7.27133265578046e-07, + "loss": 0.2845, + "step": 5793 + }, + { + "epoch": 0.37, + "grad_norm": 12.176265351119403, + "learning_rate": 7.270412575275368e-07, + "loss": 0.1648, + "step": 5794 + }, + { + "epoch": 0.37, + "grad_norm": 0.553487019276648, + "learning_rate": 7.269492397909913e-07, + "loss": 0.2341, + "step": 5795 + }, + { + "epoch": 0.37, + "grad_norm": 0.20324779002863755, + "learning_rate": 7.268572123723351e-07, + "loss": 0.0031, + "step": 5796 + }, + { + "epoch": 0.37, + "grad_norm": 0.21900193556265898, + "learning_rate": 7.267651752754939e-07, + "loss": 0.2145, + "step": 5797 + }, + { + "epoch": 0.37, + "grad_norm": 0.8876393913576334, + "learning_rate": 7.266731285043948e-07, + "loss": 0.2838, + "step": 5798 + }, + { + "epoch": 0.37, + "grad_norm": 6.9612015987432265, + "learning_rate": 7.265810720629642e-07, + "loss": 0.0532, + "step": 5799 + }, + { + "epoch": 0.37, + "grad_norm": 1.6036271570510925, + "learning_rate": 7.264890059551295e-07, + "loss": 0.1828, + "step": 5800 + }, + { + "epoch": 0.37, + "grad_norm": 0.8418700313466142, + "learning_rate": 7.263969301848187e-07, + "loss": 0.2084, + "step": 5801 + }, + { + "epoch": 0.37, + "grad_norm": 1.3243730146921566, + "learning_rate": 7.263048447559596e-07, + "loss": 0.1082, + "step": 5802 + }, + { + "epoch": 0.37, + "grad_norm": 0.4295623478554113, + "learning_rate": 7.262127496724809e-07, + "loss": 0.1655, + "step": 5803 + }, + { + "epoch": 0.37, + "grad_norm": 0.42034912858230894, + "learning_rate": 7.261206449383115e-07, + "loss": 0.1166, + "step": 5804 + }, + { + "epoch": 0.37, + "grad_norm": 0.792203867453036, + "learning_rate": 7.260285305573809e-07, + "loss": 0.3044, + "step": 5805 + }, + { + "epoch": 0.37, + "grad_norm": 1.1834953034758922, + "learning_rate": 7.259364065336188e-07, + "loss": 0.2387, + "step": 5806 + }, + { + "epoch": 0.37, + "grad_norm": 1.1441268848969504, + "learning_rate": 7.258442728709554e-07, + "loss": 0.082, + "step": 5807 + }, + { + "epoch": 0.37, + "grad_norm": 7.547858405168101, + "learning_rate": 7.257521295733213e-07, + "loss": 0.203, + "step": 5808 + }, + { + "epoch": 0.37, + "grad_norm": 0.6498259210423912, + "learning_rate": 7.256599766446476e-07, + "loss": 0.1696, + "step": 5809 + }, + { + "epoch": 0.37, + "grad_norm": 0.589797651636558, + "learning_rate": 7.255678140888657e-07, + "loss": 0.203, + "step": 5810 + }, + { + "epoch": 0.37, + "grad_norm": 3.295665441714581, + "learning_rate": 7.254756419099073e-07, + "loss": 0.0987, + "step": 5811 + }, + { + "epoch": 0.37, + "grad_norm": 0.1615737150670548, + "learning_rate": 7.253834601117048e-07, + "loss": 0.0033, + "step": 5812 + }, + { + "epoch": 0.37, + "grad_norm": 0.44912207714477587, + "learning_rate": 7.252912686981907e-07, + "loss": 0.1508, + "step": 5813 + }, + { + "epoch": 0.37, + "grad_norm": 0.963656407308307, + "learning_rate": 7.251990676732984e-07, + "loss": 0.137, + "step": 5814 + }, + { + "epoch": 0.37, + "grad_norm": 0.7939091592666753, + "learning_rate": 7.251068570409611e-07, + "loss": 0.1983, + "step": 5815 + }, + { + "epoch": 0.37, + "grad_norm": 0.24188510757011006, + "learning_rate": 7.250146368051126e-07, + "loss": 0.0978, + "step": 5816 + }, + { + "epoch": 0.37, + "grad_norm": 1.0629619612234744, + "learning_rate": 7.249224069696876e-07, + "loss": 0.1746, + "step": 5817 + }, + { + "epoch": 0.37, + "grad_norm": 0.9571145455150913, + "learning_rate": 7.248301675386204e-07, + "loss": 0.18, + "step": 5818 + }, + { + "epoch": 0.37, + "grad_norm": 1.0775381212771094, + "learning_rate": 7.247379185158463e-07, + "loss": 0.1911, + "step": 5819 + }, + { + "epoch": 0.37, + "grad_norm": 3.4392557445096235, + "learning_rate": 7.246456599053008e-07, + "loss": 0.2503, + "step": 5820 + }, + { + "epoch": 0.37, + "grad_norm": 0.9725169780983117, + "learning_rate": 7.245533917109198e-07, + "loss": 0.3065, + "step": 5821 + }, + { + "epoch": 0.37, + "grad_norm": 5.966288576702168, + "learning_rate": 7.244611139366398e-07, + "loss": 0.2274, + "step": 5822 + }, + { + "epoch": 0.37, + "grad_norm": 0.7998015479631919, + "learning_rate": 7.243688265863974e-07, + "loss": 0.318, + "step": 5823 + }, + { + "epoch": 0.37, + "grad_norm": 1.1673036168895607, + "learning_rate": 7.2427652966413e-07, + "loss": 0.2272, + "step": 5824 + }, + { + "epoch": 0.37, + "grad_norm": 1.2907306020411378, + "learning_rate": 7.241842231737748e-07, + "loss": 0.08, + "step": 5825 + }, + { + "epoch": 0.37, + "grad_norm": 0.9708427215621421, + "learning_rate": 7.2409190711927e-07, + "loss": 0.3598, + "step": 5826 + }, + { + "epoch": 0.37, + "grad_norm": 0.6238604424541833, + "learning_rate": 7.239995815045541e-07, + "loss": 0.5109, + "step": 5827 + }, + { + "epoch": 0.37, + "grad_norm": 3.3065175720140885, + "learning_rate": 7.239072463335657e-07, + "loss": 0.1389, + "step": 5828 + }, + { + "epoch": 0.37, + "grad_norm": 2.0029626689565476, + "learning_rate": 7.238149016102439e-07, + "loss": 0.3771, + "step": 5829 + }, + { + "epoch": 0.37, + "grad_norm": 0.19797498975210448, + "learning_rate": 7.237225473385286e-07, + "loss": 0.0397, + "step": 5830 + }, + { + "epoch": 0.37, + "grad_norm": 1.432444886934499, + "learning_rate": 7.236301835223597e-07, + "loss": 0.179, + "step": 5831 + }, + { + "epoch": 0.37, + "grad_norm": 0.7383596119254882, + "learning_rate": 7.235378101656775e-07, + "loss": 0.2657, + "step": 5832 + }, + { + "epoch": 0.37, + "grad_norm": 3.9716360262510553, + "learning_rate": 7.234454272724231e-07, + "loss": 0.0608, + "step": 5833 + }, + { + "epoch": 0.37, + "grad_norm": 0.8742398751449905, + "learning_rate": 7.233530348465376e-07, + "loss": 0.3217, + "step": 5834 + }, + { + "epoch": 0.37, + "grad_norm": 1.296015653419755, + "learning_rate": 7.232606328919626e-07, + "loss": 0.2541, + "step": 5835 + }, + { + "epoch": 0.37, + "grad_norm": 1.0472261813186758, + "learning_rate": 7.231682214126401e-07, + "loss": 0.2328, + "step": 5836 + }, + { + "epoch": 0.37, + "grad_norm": 0.7530299828248986, + "learning_rate": 7.230758004125127e-07, + "loss": 0.1346, + "step": 5837 + }, + { + "epoch": 0.37, + "grad_norm": 1.4179811070545931, + "learning_rate": 7.229833698955232e-07, + "loss": 0.0977, + "step": 5838 + }, + { + "epoch": 0.37, + "grad_norm": 0.5539109258430226, + "learning_rate": 7.228909298656149e-07, + "loss": 0.1684, + "step": 5839 + }, + { + "epoch": 0.37, + "grad_norm": 0.685320802630804, + "learning_rate": 7.227984803267315e-07, + "loss": 0.1562, + "step": 5840 + }, + { + "epoch": 0.37, + "grad_norm": 0.46337319748195305, + "learning_rate": 7.22706021282817e-07, + "loss": 0.1288, + "step": 5841 + }, + { + "epoch": 0.37, + "grad_norm": 4.22438321986461, + "learning_rate": 7.226135527378161e-07, + "loss": 0.0241, + "step": 5842 + }, + { + "epoch": 0.37, + "grad_norm": 0.6880371677332253, + "learning_rate": 7.225210746956733e-07, + "loss": 0.2242, + "step": 5843 + }, + { + "epoch": 0.37, + "grad_norm": 0.3904455356259318, + "learning_rate": 7.224285871603342e-07, + "loss": 0.1122, + "step": 5844 + }, + { + "epoch": 0.37, + "grad_norm": 2.80368536327891, + "learning_rate": 7.223360901357445e-07, + "loss": 0.0082, + "step": 5845 + }, + { + "epoch": 0.37, + "grad_norm": 0.12360576823331884, + "learning_rate": 7.222435836258503e-07, + "loss": 0.0081, + "step": 5846 + }, + { + "epoch": 0.37, + "grad_norm": 1.072400722055551, + "learning_rate": 7.221510676345979e-07, + "loss": 0.482, + "step": 5847 + }, + { + "epoch": 0.37, + "grad_norm": 2.7497709231459546, + "learning_rate": 7.220585421659344e-07, + "loss": 0.2764, + "step": 5848 + }, + { + "epoch": 0.37, + "grad_norm": 2.1114486730556554, + "learning_rate": 7.21966007223807e-07, + "loss": 0.1383, + "step": 5849 + }, + { + "epoch": 0.37, + "grad_norm": 0.9980044127105586, + "learning_rate": 7.218734628121638e-07, + "loss": 0.1708, + "step": 5850 + }, + { + "epoch": 0.37, + "grad_norm": 1.0969380583317159, + "learning_rate": 7.217809089349524e-07, + "loss": 0.2902, + "step": 5851 + }, + { + "epoch": 0.37, + "grad_norm": 0.6849231119905266, + "learning_rate": 7.216883455961218e-07, + "loss": 0.0865, + "step": 5852 + }, + { + "epoch": 0.37, + "grad_norm": 1.243019603258892, + "learning_rate": 7.215957727996207e-07, + "loss": 0.1, + "step": 5853 + }, + { + "epoch": 0.37, + "grad_norm": 0.385221654833512, + "learning_rate": 7.215031905493983e-07, + "loss": 0.1145, + "step": 5854 + }, + { + "epoch": 0.37, + "grad_norm": 0.5605233878714039, + "learning_rate": 7.214105988494045e-07, + "loss": 0.1167, + "step": 5855 + }, + { + "epoch": 0.37, + "grad_norm": 0.6201025446143363, + "learning_rate": 7.213179977035897e-07, + "loss": 0.0043, + "step": 5856 + }, + { + "epoch": 0.37, + "grad_norm": 0.6588082719258256, + "learning_rate": 7.212253871159041e-07, + "loss": 0.3062, + "step": 5857 + }, + { + "epoch": 0.37, + "grad_norm": 6.448959617441987, + "learning_rate": 7.211327670902988e-07, + "loss": 0.2483, + "step": 5858 + }, + { + "epoch": 0.37, + "grad_norm": 0.7645063728979504, + "learning_rate": 7.210401376307252e-07, + "loss": 0.3142, + "step": 5859 + }, + { + "epoch": 0.37, + "grad_norm": 0.3454183970448466, + "learning_rate": 7.209474987411346e-07, + "loss": 0.0788, + "step": 5860 + }, + { + "epoch": 0.37, + "grad_norm": 0.489328950945489, + "learning_rate": 7.208548504254799e-07, + "loss": 0.0732, + "step": 5861 + }, + { + "epoch": 0.37, + "grad_norm": 1.0839647157281016, + "learning_rate": 7.207621926877133e-07, + "loss": 0.2387, + "step": 5862 + }, + { + "epoch": 0.37, + "grad_norm": 0.6501698188666848, + "learning_rate": 7.206695255317876e-07, + "loss": 0.456, + "step": 5863 + }, + { + "epoch": 0.37, + "grad_norm": 0.7720724698021013, + "learning_rate": 7.205768489616565e-07, + "loss": 0.3358, + "step": 5864 + }, + { + "epoch": 0.37, + "grad_norm": 0.7459872200113052, + "learning_rate": 7.204841629812734e-07, + "loss": 0.2244, + "step": 5865 + }, + { + "epoch": 0.37, + "grad_norm": 0.6919461914751762, + "learning_rate": 7.203914675945928e-07, + "loss": 0.2421, + "step": 5866 + }, + { + "epoch": 0.37, + "grad_norm": 0.528182902739734, + "learning_rate": 7.202987628055693e-07, + "loss": 0.2035, + "step": 5867 + }, + { + "epoch": 0.37, + "grad_norm": 0.46861676408355124, + "learning_rate": 7.202060486181575e-07, + "loss": 0.0723, + "step": 5868 + }, + { + "epoch": 0.37, + "grad_norm": 2.879367632376332, + "learning_rate": 7.201133250363132e-07, + "loss": 0.1566, + "step": 5869 + }, + { + "epoch": 0.37, + "grad_norm": 0.38307353665079696, + "learning_rate": 7.200205920639918e-07, + "loss": 0.0125, + "step": 5870 + }, + { + "epoch": 0.37, + "grad_norm": 0.473801322198328, + "learning_rate": 7.199278497051497e-07, + "loss": 0.3076, + "step": 5871 + }, + { + "epoch": 0.37, + "grad_norm": 0.39376928629463165, + "learning_rate": 7.198350979637434e-07, + "loss": 0.1229, + "step": 5872 + }, + { + "epoch": 0.37, + "grad_norm": 4.502550421033535, + "learning_rate": 7.197423368437299e-07, + "loss": 0.3354, + "step": 5873 + }, + { + "epoch": 0.37, + "grad_norm": 0.5897333491847417, + "learning_rate": 7.196495663490665e-07, + "loss": 0.1523, + "step": 5874 + }, + { + "epoch": 0.37, + "grad_norm": 1.159565031612828, + "learning_rate": 7.195567864837111e-07, + "loss": 0.2187, + "step": 5875 + }, + { + "epoch": 0.37, + "grad_norm": 2.502572097228924, + "learning_rate": 7.194639972516218e-07, + "loss": 0.2899, + "step": 5876 + }, + { + "epoch": 0.37, + "grad_norm": 0.5176055018510678, + "learning_rate": 7.193711986567573e-07, + "loss": 0.0791, + "step": 5877 + }, + { + "epoch": 0.37, + "grad_norm": 1.3059301873472071, + "learning_rate": 7.192783907030765e-07, + "loss": 0.1164, + "step": 5878 + }, + { + "epoch": 0.37, + "grad_norm": 0.7189273682628163, + "learning_rate": 7.191855733945386e-07, + "loss": 0.1507, + "step": 5879 + }, + { + "epoch": 0.37, + "grad_norm": 0.6557773111881142, + "learning_rate": 7.190927467351037e-07, + "loss": 0.3814, + "step": 5880 + }, + { + "epoch": 0.38, + "grad_norm": 1.1147535581280337, + "learning_rate": 7.189999107287317e-07, + "loss": 0.1315, + "step": 5881 + }, + { + "epoch": 0.38, + "grad_norm": 0.9928493129046877, + "learning_rate": 7.189070653793833e-07, + "loss": 0.0101, + "step": 5882 + }, + { + "epoch": 0.38, + "grad_norm": 0.37211700710997386, + "learning_rate": 7.188142106910193e-07, + "loss": 0.1829, + "step": 5883 + }, + { + "epoch": 0.38, + "grad_norm": 6.165946089905001, + "learning_rate": 7.187213466676013e-07, + "loss": 0.3081, + "step": 5884 + }, + { + "epoch": 0.38, + "grad_norm": 0.4124492140865974, + "learning_rate": 7.18628473313091e-07, + "loss": 0.0811, + "step": 5885 + }, + { + "epoch": 0.38, + "grad_norm": 0.8954038009244101, + "learning_rate": 7.185355906314505e-07, + "loss": 0.1387, + "step": 5886 + }, + { + "epoch": 0.38, + "grad_norm": 1.3092639590667032, + "learning_rate": 7.184426986266423e-07, + "loss": 0.2749, + "step": 5887 + }, + { + "epoch": 0.38, + "grad_norm": 0.6949233311868814, + "learning_rate": 7.183497973026296e-07, + "loss": 0.0865, + "step": 5888 + }, + { + "epoch": 0.38, + "grad_norm": 3.2710940156925137, + "learning_rate": 7.182568866633756e-07, + "loss": 0.1948, + "step": 5889 + }, + { + "epoch": 0.38, + "grad_norm": 1.1978573352474637, + "learning_rate": 7.18163966712844e-07, + "loss": 0.2208, + "step": 5890 + }, + { + "epoch": 0.38, + "grad_norm": 0.6818154892923289, + "learning_rate": 7.18071037454999e-07, + "loss": 0.2935, + "step": 5891 + }, + { + "epoch": 0.38, + "grad_norm": 1.4738591744705973, + "learning_rate": 7.179780988938051e-07, + "loss": 0.1539, + "step": 5892 + }, + { + "epoch": 0.38, + "grad_norm": 1.1680365976824745, + "learning_rate": 7.178851510332274e-07, + "loss": 0.2514, + "step": 5893 + }, + { + "epoch": 0.38, + "grad_norm": 0.5345444115984965, + "learning_rate": 7.177921938772311e-07, + "loss": 0.3865, + "step": 5894 + }, + { + "epoch": 0.38, + "grad_norm": 0.4549782245854918, + "learning_rate": 7.17699227429782e-07, + "loss": 0.2943, + "step": 5895 + }, + { + "epoch": 0.38, + "grad_norm": 0.5654763469609636, + "learning_rate": 7.176062516948463e-07, + "loss": 0.0977, + "step": 5896 + }, + { + "epoch": 0.38, + "grad_norm": 0.46045765447375575, + "learning_rate": 7.175132666763905e-07, + "loss": 0.1541, + "step": 5897 + }, + { + "epoch": 0.38, + "grad_norm": 0.5610590384086495, + "learning_rate": 7.174202723783814e-07, + "loss": 0.0925, + "step": 5898 + }, + { + "epoch": 0.38, + "grad_norm": 0.2928975090718757, + "learning_rate": 7.173272688047865e-07, + "loss": 0.121, + "step": 5899 + }, + { + "epoch": 0.38, + "grad_norm": 0.9139770998882433, + "learning_rate": 7.172342559595732e-07, + "loss": 0.066, + "step": 5900 + }, + { + "epoch": 0.38, + "grad_norm": 1.485855768140534, + "learning_rate": 7.1714123384671e-07, + "loss": 0.1278, + "step": 5901 + }, + { + "epoch": 0.38, + "grad_norm": 2.444138561462059, + "learning_rate": 7.170482024701651e-07, + "loss": 0.0571, + "step": 5902 + }, + { + "epoch": 0.38, + "grad_norm": 2.6205883371938983, + "learning_rate": 7.169551618339078e-07, + "loss": 0.243, + "step": 5903 + }, + { + "epoch": 0.38, + "grad_norm": 0.6260682375338533, + "learning_rate": 7.168621119419072e-07, + "loss": 0.1723, + "step": 5904 + }, + { + "epoch": 0.38, + "grad_norm": 2.1867454917890057, + "learning_rate": 7.167690527981327e-07, + "loss": 0.3286, + "step": 5905 + }, + { + "epoch": 0.38, + "grad_norm": 0.407792610972802, + "learning_rate": 7.166759844065548e-07, + "loss": 0.1464, + "step": 5906 + }, + { + "epoch": 0.38, + "grad_norm": 1.7570667036092302, + "learning_rate": 7.165829067711439e-07, + "loss": 0.1031, + "step": 5907 + }, + { + "epoch": 0.38, + "grad_norm": 7.360062034053246, + "learning_rate": 7.164898198958706e-07, + "loss": 0.1911, + "step": 5908 + }, + { + "epoch": 0.38, + "grad_norm": 0.6199722405916185, + "learning_rate": 7.163967237847066e-07, + "loss": 0.159, + "step": 5909 + }, + { + "epoch": 0.38, + "grad_norm": 2.3338296173886413, + "learning_rate": 7.163036184416231e-07, + "loss": 0.366, + "step": 5910 + }, + { + "epoch": 0.38, + "grad_norm": 1.213067992527338, + "learning_rate": 7.162105038705926e-07, + "loss": 0.1768, + "step": 5911 + }, + { + "epoch": 0.38, + "grad_norm": 1.480109946037491, + "learning_rate": 7.161173800755874e-07, + "loss": 0.1274, + "step": 5912 + }, + { + "epoch": 0.38, + "grad_norm": 0.6505464364524993, + "learning_rate": 7.160242470605803e-07, + "loss": 0.0872, + "step": 5913 + }, + { + "epoch": 0.38, + "grad_norm": 1.6077104443243688, + "learning_rate": 7.159311048295444e-07, + "loss": 0.2574, + "step": 5914 + }, + { + "epoch": 0.38, + "grad_norm": 0.7781919438857764, + "learning_rate": 7.158379533864537e-07, + "loss": 0.167, + "step": 5915 + }, + { + "epoch": 0.38, + "grad_norm": 1.0321930163705826, + "learning_rate": 7.15744792735282e-07, + "loss": 0.1722, + "step": 5916 + }, + { + "epoch": 0.38, + "grad_norm": 1.5674156331427114, + "learning_rate": 7.156516228800035e-07, + "loss": 0.1605, + "step": 5917 + }, + { + "epoch": 0.38, + "grad_norm": 0.7950157651962791, + "learning_rate": 7.155584438245935e-07, + "loss": 0.1945, + "step": 5918 + }, + { + "epoch": 0.38, + "grad_norm": 2.091492675135481, + "learning_rate": 7.154652555730267e-07, + "loss": 0.0534, + "step": 5919 + }, + { + "epoch": 0.38, + "grad_norm": 0.6350082291984965, + "learning_rate": 7.153720581292793e-07, + "loss": 0.2207, + "step": 5920 + }, + { + "epoch": 0.38, + "grad_norm": 0.6751077283045518, + "learning_rate": 7.152788514973267e-07, + "loss": 0.1156, + "step": 5921 + }, + { + "epoch": 0.38, + "grad_norm": 0.7838490625511402, + "learning_rate": 7.151856356811456e-07, + "loss": 0.2122, + "step": 5922 + }, + { + "epoch": 0.38, + "grad_norm": 0.6838821417234526, + "learning_rate": 7.150924106847127e-07, + "loss": 0.2095, + "step": 5923 + }, + { + "epoch": 0.38, + "grad_norm": 0.6116515680506596, + "learning_rate": 7.149991765120054e-07, + "loss": 0.4388, + "step": 5924 + }, + { + "epoch": 0.38, + "grad_norm": 1.0175319634749764, + "learning_rate": 7.149059331670008e-07, + "loss": 0.1162, + "step": 5925 + }, + { + "epoch": 0.38, + "grad_norm": 0.4889298794243602, + "learning_rate": 7.14812680653677e-07, + "loss": 0.0855, + "step": 5926 + }, + { + "epoch": 0.38, + "grad_norm": 1.1329174315251007, + "learning_rate": 7.147194189760124e-07, + "loss": 0.2025, + "step": 5927 + }, + { + "epoch": 0.38, + "grad_norm": 0.38708904205903855, + "learning_rate": 7.146261481379858e-07, + "loss": 0.0041, + "step": 5928 + }, + { + "epoch": 0.38, + "grad_norm": 0.4835130500903364, + "learning_rate": 7.145328681435764e-07, + "loss": 0.1677, + "step": 5929 + }, + { + "epoch": 0.38, + "grad_norm": 1.7288205730665984, + "learning_rate": 7.144395789967635e-07, + "loss": 0.3148, + "step": 5930 + }, + { + "epoch": 0.38, + "grad_norm": 0.9432187522235403, + "learning_rate": 7.14346280701527e-07, + "loss": 0.0094, + "step": 5931 + }, + { + "epoch": 0.38, + "grad_norm": 1.0728331646521974, + "learning_rate": 7.142529732618474e-07, + "loss": 0.2973, + "step": 5932 + }, + { + "epoch": 0.38, + "grad_norm": 5.564085645081182, + "learning_rate": 7.141596566817052e-07, + "loss": 0.365, + "step": 5933 + }, + { + "epoch": 0.38, + "grad_norm": 0.912222774090835, + "learning_rate": 7.140663309650816e-07, + "loss": 0.326, + "step": 5934 + }, + { + "epoch": 0.38, + "grad_norm": 1.7604164267103952, + "learning_rate": 7.13972996115958e-07, + "loss": 0.2514, + "step": 5935 + }, + { + "epoch": 0.38, + "grad_norm": 0.436229042738186, + "learning_rate": 7.138796521383162e-07, + "loss": 0.112, + "step": 5936 + }, + { + "epoch": 0.38, + "grad_norm": 4.998368663674957, + "learning_rate": 7.137862990361382e-07, + "loss": 0.0441, + "step": 5937 + }, + { + "epoch": 0.38, + "grad_norm": 0.4927003810543219, + "learning_rate": 7.136929368134074e-07, + "loss": 0.0082, + "step": 5938 + }, + { + "epoch": 0.38, + "grad_norm": 0.7354966554476302, + "learning_rate": 7.135995654741062e-07, + "loss": 0.4271, + "step": 5939 + }, + { + "epoch": 0.38, + "grad_norm": 0.74617529396601, + "learning_rate": 7.135061850222179e-07, + "loss": 0.1556, + "step": 5940 + }, + { + "epoch": 0.38, + "grad_norm": 1.435093383695401, + "learning_rate": 7.134127954617268e-07, + "loss": 0.178, + "step": 5941 + }, + { + "epoch": 0.38, + "grad_norm": 0.477204625878189, + "learning_rate": 7.133193967966168e-07, + "loss": 0.2543, + "step": 5942 + }, + { + "epoch": 0.38, + "grad_norm": 0.43863566913845464, + "learning_rate": 7.132259890308725e-07, + "loss": 0.183, + "step": 5943 + }, + { + "epoch": 0.38, + "grad_norm": 0.44991637776734, + "learning_rate": 7.131325721684788e-07, + "loss": 0.378, + "step": 5944 + }, + { + "epoch": 0.38, + "grad_norm": 0.5281526357455267, + "learning_rate": 7.130391462134211e-07, + "loss": 0.2169, + "step": 5945 + }, + { + "epoch": 0.38, + "grad_norm": 0.8521604255758248, + "learning_rate": 7.129457111696852e-07, + "loss": 0.174, + "step": 5946 + }, + { + "epoch": 0.38, + "grad_norm": 0.68207597659719, + "learning_rate": 7.128522670412571e-07, + "loss": 0.0522, + "step": 5947 + }, + { + "epoch": 0.38, + "grad_norm": 0.2533660514279892, + "learning_rate": 7.127588138321235e-07, + "loss": 0.0686, + "step": 5948 + }, + { + "epoch": 0.38, + "grad_norm": 0.2090779232570901, + "learning_rate": 7.126653515462713e-07, + "loss": 0.0796, + "step": 5949 + }, + { + "epoch": 0.38, + "grad_norm": 1.2050745694824858, + "learning_rate": 7.125718801876876e-07, + "loss": 0.2794, + "step": 5950 + }, + { + "epoch": 0.38, + "grad_norm": 1.6509178274863041, + "learning_rate": 7.1247839976036e-07, + "loss": 0.0936, + "step": 5951 + }, + { + "epoch": 0.38, + "grad_norm": 0.8250706157425587, + "learning_rate": 7.123849102682771e-07, + "loss": 0.265, + "step": 5952 + }, + { + "epoch": 0.38, + "grad_norm": 0.8781829729621038, + "learning_rate": 7.122914117154267e-07, + "loss": 0.3591, + "step": 5953 + }, + { + "epoch": 0.38, + "grad_norm": 10.066038436900582, + "learning_rate": 7.12197904105798e-07, + "loss": 0.1235, + "step": 5954 + }, + { + "epoch": 0.38, + "grad_norm": 2.3810299623815854, + "learning_rate": 7.121043874433801e-07, + "loss": 0.1342, + "step": 5955 + }, + { + "epoch": 0.38, + "grad_norm": 1.1710195322965773, + "learning_rate": 7.120108617321627e-07, + "loss": 0.1984, + "step": 5956 + }, + { + "epoch": 0.38, + "grad_norm": 0.49842423271635744, + "learning_rate": 7.119173269761357e-07, + "loss": 0.2331, + "step": 5957 + }, + { + "epoch": 0.38, + "grad_norm": 4.023827614331166, + "learning_rate": 7.118237831792895e-07, + "loss": 0.3535, + "step": 5958 + }, + { + "epoch": 0.38, + "grad_norm": 0.25458583003373897, + "learning_rate": 7.117302303456149e-07, + "loss": 0.0158, + "step": 5959 + }, + { + "epoch": 0.38, + "grad_norm": 0.6364437863317047, + "learning_rate": 7.116366684791032e-07, + "loss": 0.0611, + "step": 5960 + }, + { + "epoch": 0.38, + "grad_norm": 1.823260968022451, + "learning_rate": 7.115430975837456e-07, + "loss": 0.1022, + "step": 5961 + }, + { + "epoch": 0.38, + "grad_norm": 4.751828298169778, + "learning_rate": 7.114495176635343e-07, + "loss": 0.0226, + "step": 5962 + }, + { + "epoch": 0.38, + "grad_norm": 0.630816834039371, + "learning_rate": 7.113559287224614e-07, + "loss": 0.1887, + "step": 5963 + }, + { + "epoch": 0.38, + "grad_norm": 0.45860696416686153, + "learning_rate": 7.112623307645198e-07, + "loss": 0.2074, + "step": 5964 + }, + { + "epoch": 0.38, + "grad_norm": 0.6064617076623829, + "learning_rate": 7.111687237937024e-07, + "loss": 0.2096, + "step": 5965 + }, + { + "epoch": 0.38, + "grad_norm": 0.9400640168041525, + "learning_rate": 7.11075107814003e-07, + "loss": 0.276, + "step": 5966 + }, + { + "epoch": 0.38, + "grad_norm": 0.9060897632718069, + "learning_rate": 7.10981482829415e-07, + "loss": 0.0865, + "step": 5967 + }, + { + "epoch": 0.38, + "grad_norm": 0.7413702713438625, + "learning_rate": 7.108878488439327e-07, + "loss": 0.2822, + "step": 5968 + }, + { + "epoch": 0.38, + "grad_norm": 1.7268002955728479, + "learning_rate": 7.10794205861551e-07, + "loss": 0.1332, + "step": 5969 + }, + { + "epoch": 0.38, + "grad_norm": 0.5774531281108808, + "learning_rate": 7.107005538862646e-07, + "loss": 0.0733, + "step": 5970 + }, + { + "epoch": 0.38, + "grad_norm": 3.129077254375828, + "learning_rate": 7.10606892922069e-07, + "loss": 0.0939, + "step": 5971 + }, + { + "epoch": 0.38, + "grad_norm": 1.471993788958997, + "learning_rate": 7.1051322297296e-07, + "loss": 0.2458, + "step": 5972 + }, + { + "epoch": 0.38, + "grad_norm": 0.21795629225975438, + "learning_rate": 7.104195440429338e-07, + "loss": 0.0039, + "step": 5973 + }, + { + "epoch": 0.38, + "grad_norm": 1.0986424896095943, + "learning_rate": 7.103258561359868e-07, + "loss": 0.3232, + "step": 5974 + }, + { + "epoch": 0.38, + "grad_norm": 0.5780048420019451, + "learning_rate": 7.102321592561161e-07, + "loss": 0.2073, + "step": 5975 + }, + { + "epoch": 0.38, + "grad_norm": 0.2990008380893296, + "learning_rate": 7.101384534073186e-07, + "loss": 0.1052, + "step": 5976 + }, + { + "epoch": 0.38, + "grad_norm": 1.3446679506575092, + "learning_rate": 7.100447385935924e-07, + "loss": 0.3446, + "step": 5977 + }, + { + "epoch": 0.38, + "grad_norm": 0.7417729866746887, + "learning_rate": 7.099510148189353e-07, + "loss": 0.3005, + "step": 5978 + }, + { + "epoch": 0.38, + "grad_norm": 4.380731448893932, + "learning_rate": 7.098572820873461e-07, + "loss": 0.0407, + "step": 5979 + }, + { + "epoch": 0.38, + "grad_norm": 0.6480951396834793, + "learning_rate": 7.097635404028233e-07, + "loss": 0.2887, + "step": 5980 + }, + { + "epoch": 0.38, + "grad_norm": 4.659078677383878, + "learning_rate": 7.096697897693661e-07, + "loss": 0.3541, + "step": 5981 + }, + { + "epoch": 0.38, + "grad_norm": 1.1218545597779377, + "learning_rate": 7.095760301909742e-07, + "loss": 0.3356, + "step": 5982 + }, + { + "epoch": 0.38, + "grad_norm": 0.9847545022936296, + "learning_rate": 7.094822616716476e-07, + "loss": 0.1656, + "step": 5983 + }, + { + "epoch": 0.38, + "grad_norm": 1.1206415516885497, + "learning_rate": 7.093884842153866e-07, + "loss": 0.5928, + "step": 5984 + }, + { + "epoch": 0.38, + "grad_norm": 5.9162260514925755, + "learning_rate": 7.092946978261918e-07, + "loss": 0.1928, + "step": 5985 + }, + { + "epoch": 0.38, + "grad_norm": 0.7143376489897715, + "learning_rate": 7.092009025080647e-07, + "loss": 0.1116, + "step": 5986 + }, + { + "epoch": 0.38, + "grad_norm": 1.5437738671263492, + "learning_rate": 7.091070982650063e-07, + "loss": 0.2653, + "step": 5987 + }, + { + "epoch": 0.38, + "grad_norm": 1.2406481074654894, + "learning_rate": 7.090132851010189e-07, + "loss": 0.3299, + "step": 5988 + }, + { + "epoch": 0.38, + "grad_norm": 0.9467414831579638, + "learning_rate": 7.089194630201045e-07, + "loss": 0.2875, + "step": 5989 + }, + { + "epoch": 0.38, + "grad_norm": 1.9279443760306558, + "learning_rate": 7.088256320262658e-07, + "loss": 0.0794, + "step": 5990 + }, + { + "epoch": 0.38, + "grad_norm": 0.5185733427488309, + "learning_rate": 7.087317921235059e-07, + "loss": 0.1306, + "step": 5991 + }, + { + "epoch": 0.38, + "grad_norm": 0.7948092221675529, + "learning_rate": 7.086379433158282e-07, + "loss": 0.0462, + "step": 5992 + }, + { + "epoch": 0.38, + "grad_norm": 0.6098031442702017, + "learning_rate": 7.085440856072364e-07, + "loss": 0.1068, + "step": 5993 + }, + { + "epoch": 0.38, + "grad_norm": 2.106216605920332, + "learning_rate": 7.084502190017346e-07, + "loss": 0.0095, + "step": 5994 + }, + { + "epoch": 0.38, + "grad_norm": 0.2519067909669138, + "learning_rate": 7.083563435033275e-07, + "loss": 0.1246, + "step": 5995 + }, + { + "epoch": 0.38, + "grad_norm": 1.1356303064312276, + "learning_rate": 7.0826245911602e-07, + "loss": 0.2158, + "step": 5996 + }, + { + "epoch": 0.38, + "grad_norm": 0.5893033155310382, + "learning_rate": 7.081685658438172e-07, + "loss": 0.236, + "step": 5997 + }, + { + "epoch": 0.38, + "grad_norm": 0.5284526499896604, + "learning_rate": 7.080746636907249e-07, + "loss": 0.1444, + "step": 5998 + }, + { + "epoch": 0.38, + "grad_norm": 0.62541020470911, + "learning_rate": 7.079807526607492e-07, + "loss": 0.1926, + "step": 5999 + }, + { + "epoch": 0.38, + "grad_norm": 0.6633854058116887, + "learning_rate": 7.078868327578965e-07, + "loss": 0.3817, + "step": 6000 + }, + { + "epoch": 0.38, + "grad_norm": 2.5390019204594863, + "learning_rate": 7.077929039861737e-07, + "loss": 0.1851, + "step": 6001 + }, + { + "epoch": 0.38, + "grad_norm": 2.1361359268069435, + "learning_rate": 7.076989663495877e-07, + "loss": 0.1746, + "step": 6002 + }, + { + "epoch": 0.38, + "grad_norm": 0.10970779392084333, + "learning_rate": 7.076050198521464e-07, + "loss": 0.0664, + "step": 6003 + }, + { + "epoch": 0.38, + "grad_norm": 1.0860967601909939, + "learning_rate": 7.075110644978577e-07, + "loss": 0.4086, + "step": 6004 + }, + { + "epoch": 0.38, + "grad_norm": 0.5685774998213641, + "learning_rate": 7.074171002907296e-07, + "loss": 0.3996, + "step": 6005 + }, + { + "epoch": 0.38, + "grad_norm": 0.6201711996331208, + "learning_rate": 7.073231272347713e-07, + "loss": 0.3049, + "step": 6006 + }, + { + "epoch": 0.38, + "grad_norm": 0.6000797086416605, + "learning_rate": 7.072291453339915e-07, + "loss": 0.3039, + "step": 6007 + }, + { + "epoch": 0.38, + "grad_norm": 0.45780747046117626, + "learning_rate": 7.071351545923998e-07, + "loss": 0.2359, + "step": 6008 + }, + { + "epoch": 0.38, + "grad_norm": 0.3257811079413989, + "learning_rate": 7.07041155014006e-07, + "loss": 0.1005, + "step": 6009 + }, + { + "epoch": 0.38, + "grad_norm": 1.0872698006568964, + "learning_rate": 7.069471466028203e-07, + "loss": 0.1744, + "step": 6010 + }, + { + "epoch": 0.38, + "grad_norm": 0.280298811781577, + "learning_rate": 7.068531293628533e-07, + "loss": 0.0576, + "step": 6011 + }, + { + "epoch": 0.38, + "grad_norm": 3.0526762193027266, + "learning_rate": 7.06759103298116e-07, + "loss": 0.1753, + "step": 6012 + }, + { + "epoch": 0.38, + "grad_norm": 1.288961530840641, + "learning_rate": 7.066650684126198e-07, + "loss": 0.3042, + "step": 6013 + }, + { + "epoch": 0.38, + "grad_norm": 0.37518201346987406, + "learning_rate": 7.065710247103762e-07, + "loss": 0.1116, + "step": 6014 + }, + { + "epoch": 0.38, + "grad_norm": 3.2338419063587356, + "learning_rate": 7.064769721953975e-07, + "loss": 0.0568, + "step": 6015 + }, + { + "epoch": 0.38, + "grad_norm": 1.3789297878621667, + "learning_rate": 7.06382910871696e-07, + "loss": 0.1485, + "step": 6016 + }, + { + "epoch": 0.38, + "grad_norm": 2.220911917100636, + "learning_rate": 7.062888407432847e-07, + "loss": 0.145, + "step": 6017 + }, + { + "epoch": 0.38, + "grad_norm": 0.44760179866017075, + "learning_rate": 7.061947618141768e-07, + "loss": 0.2021, + "step": 6018 + }, + { + "epoch": 0.38, + "grad_norm": 0.3631122735822044, + "learning_rate": 7.061006740883858e-07, + "loss": 0.0854, + "step": 6019 + }, + { + "epoch": 0.38, + "grad_norm": 0.6197074357619311, + "learning_rate": 7.060065775699257e-07, + "loss": 0.1229, + "step": 6020 + }, + { + "epoch": 0.38, + "grad_norm": 7.303806414473023, + "learning_rate": 7.059124722628112e-07, + "loss": 0.1706, + "step": 6021 + }, + { + "epoch": 0.38, + "grad_norm": 0.5110289241404611, + "learning_rate": 7.058183581710564e-07, + "loss": 0.0506, + "step": 6022 + }, + { + "epoch": 0.38, + "grad_norm": 1.2294421418777262, + "learning_rate": 7.057242352986767e-07, + "loss": 0.0219, + "step": 6023 + }, + { + "epoch": 0.38, + "grad_norm": 2.587852148027603, + "learning_rate": 7.056301036496874e-07, + "loss": 0.1163, + "step": 6024 + }, + { + "epoch": 0.38, + "grad_norm": 0.6857314604523593, + "learning_rate": 7.055359632281048e-07, + "loss": 0.5359, + "step": 6025 + }, + { + "epoch": 0.38, + "grad_norm": 0.923768138429377, + "learning_rate": 7.054418140379448e-07, + "loss": 0.2454, + "step": 6026 + }, + { + "epoch": 0.38, + "grad_norm": 0.47052950296915724, + "learning_rate": 7.053476560832239e-07, + "loss": 0.1317, + "step": 6027 + }, + { + "epoch": 0.38, + "grad_norm": 0.4121696819836364, + "learning_rate": 7.052534893679593e-07, + "loss": 0.2421, + "step": 6028 + }, + { + "epoch": 0.38, + "grad_norm": 0.44023866777409837, + "learning_rate": 7.051593138961681e-07, + "loss": 0.2292, + "step": 6029 + }, + { + "epoch": 0.38, + "grad_norm": 0.696574520204921, + "learning_rate": 7.050651296718683e-07, + "loss": 0.0751, + "step": 6030 + }, + { + "epoch": 0.38, + "grad_norm": 0.7402758210906683, + "learning_rate": 7.049709366990777e-07, + "loss": 0.2056, + "step": 6031 + }, + { + "epoch": 0.38, + "grad_norm": 0.4702207701856258, + "learning_rate": 7.04876734981815e-07, + "loss": 0.0254, + "step": 6032 + }, + { + "epoch": 0.38, + "grad_norm": 5.999792419930243, + "learning_rate": 7.047825245240988e-07, + "loss": 0.1965, + "step": 6033 + }, + { + "epoch": 0.38, + "grad_norm": 0.4877844417113965, + "learning_rate": 7.046883053299486e-07, + "loss": 0.116, + "step": 6034 + }, + { + "epoch": 0.38, + "grad_norm": 1.479980954195937, + "learning_rate": 7.045940774033838e-07, + "loss": 0.0313, + "step": 6035 + }, + { + "epoch": 0.38, + "grad_norm": 0.6086161206512105, + "learning_rate": 7.044998407484243e-07, + "loss": 0.1707, + "step": 6036 + }, + { + "epoch": 0.38, + "grad_norm": 1.187171812083103, + "learning_rate": 7.044055953690905e-07, + "loss": 0.2214, + "step": 6037 + }, + { + "epoch": 0.39, + "grad_norm": 1.0984291500373335, + "learning_rate": 7.043113412694031e-07, + "loss": 0.3816, + "step": 6038 + }, + { + "epoch": 0.39, + "grad_norm": 3.3096972089826218, + "learning_rate": 7.042170784533832e-07, + "loss": 0.1404, + "step": 6039 + }, + { + "epoch": 0.39, + "grad_norm": 0.4580931602231195, + "learning_rate": 7.041228069250522e-07, + "loss": 0.1525, + "step": 6040 + }, + { + "epoch": 0.39, + "grad_norm": 0.8939244281693258, + "learning_rate": 7.040285266884318e-07, + "loss": 0.1209, + "step": 6041 + }, + { + "epoch": 0.39, + "grad_norm": 0.39145821050631974, + "learning_rate": 7.039342377475443e-07, + "loss": 0.0901, + "step": 6042 + }, + { + "epoch": 0.39, + "grad_norm": 1.4035249113873034, + "learning_rate": 7.038399401064124e-07, + "loss": 0.0596, + "step": 6043 + }, + { + "epoch": 0.39, + "grad_norm": 0.4781807350668425, + "learning_rate": 7.037456337690588e-07, + "loss": 0.1057, + "step": 6044 + }, + { + "epoch": 0.39, + "grad_norm": 0.5610597985770469, + "learning_rate": 7.03651318739507e-07, + "loss": 0.0974, + "step": 6045 + }, + { + "epoch": 0.39, + "grad_norm": 0.77816916366953, + "learning_rate": 7.035569950217806e-07, + "loss": 0.1531, + "step": 6046 + }, + { + "epoch": 0.39, + "grad_norm": 0.5232881218806097, + "learning_rate": 7.034626626199034e-07, + "loss": 0.322, + "step": 6047 + }, + { + "epoch": 0.39, + "grad_norm": 0.38390568318509016, + "learning_rate": 7.033683215379002e-07, + "loss": 0.0462, + "step": 6048 + }, + { + "epoch": 0.39, + "grad_norm": 0.47140231927303317, + "learning_rate": 7.032739717797954e-07, + "loss": 0.2785, + "step": 6049 + }, + { + "epoch": 0.39, + "grad_norm": 0.22127341481379223, + "learning_rate": 7.031796133496144e-07, + "loss": 0.1747, + "step": 6050 + }, + { + "epoch": 0.39, + "grad_norm": 0.5056716221063638, + "learning_rate": 7.030852462513826e-07, + "loss": 0.3887, + "step": 6051 + }, + { + "epoch": 0.39, + "grad_norm": 0.6106568240598306, + "learning_rate": 7.02990870489126e-07, + "loss": 0.4211, + "step": 6052 + }, + { + "epoch": 0.39, + "grad_norm": 0.6996628918092257, + "learning_rate": 7.028964860668706e-07, + "loss": 0.3672, + "step": 6053 + }, + { + "epoch": 0.39, + "grad_norm": 0.8179165141596455, + "learning_rate": 7.028020929886436e-07, + "loss": 0.2225, + "step": 6054 + }, + { + "epoch": 0.39, + "grad_norm": 0.6077679931879819, + "learning_rate": 7.027076912584714e-07, + "loss": 0.1728, + "step": 6055 + }, + { + "epoch": 0.39, + "grad_norm": 1.433169476401399, + "learning_rate": 7.026132808803816e-07, + "loss": 0.0204, + "step": 6056 + }, + { + "epoch": 0.39, + "grad_norm": 0.7197853818572455, + "learning_rate": 7.02518861858402e-07, + "loss": 0.2011, + "step": 6057 + }, + { + "epoch": 0.39, + "grad_norm": 0.6432525605362424, + "learning_rate": 7.024244341965606e-07, + "loss": 0.3238, + "step": 6058 + }, + { + "epoch": 0.39, + "grad_norm": 0.6331576072836341, + "learning_rate": 7.023299978988859e-07, + "loss": 0.2177, + "step": 6059 + }, + { + "epoch": 0.39, + "grad_norm": 6.527315811383846, + "learning_rate": 7.022355529694068e-07, + "loss": 0.0996, + "step": 6060 + }, + { + "epoch": 0.39, + "grad_norm": 2.5933556118618397, + "learning_rate": 7.021410994121524e-07, + "loss": 0.0674, + "step": 6061 + }, + { + "epoch": 0.39, + "grad_norm": 1.0699772808004508, + "learning_rate": 7.020466372311525e-07, + "loss": 0.0994, + "step": 6062 + }, + { + "epoch": 0.39, + "grad_norm": 1.8203860558455764, + "learning_rate": 7.019521664304369e-07, + "loss": 0.1917, + "step": 6063 + }, + { + "epoch": 0.39, + "grad_norm": 0.7259338690054316, + "learning_rate": 7.018576870140357e-07, + "loss": 0.1829, + "step": 6064 + }, + { + "epoch": 0.39, + "grad_norm": 0.7431332243039687, + "learning_rate": 7.017631989859799e-07, + "loss": 0.194, + "step": 6065 + }, + { + "epoch": 0.39, + "grad_norm": 0.9385818688452157, + "learning_rate": 7.016687023503004e-07, + "loss": 0.1326, + "step": 6066 + }, + { + "epoch": 0.39, + "grad_norm": 0.6791666522907975, + "learning_rate": 7.015741971110287e-07, + "loss": 0.1821, + "step": 6067 + }, + { + "epoch": 0.39, + "grad_norm": 0.7781131213115616, + "learning_rate": 7.014796832721965e-07, + "loss": 0.1181, + "step": 6068 + }, + { + "epoch": 0.39, + "grad_norm": 0.5342813351196428, + "learning_rate": 7.013851608378358e-07, + "loss": 0.1441, + "step": 6069 + }, + { + "epoch": 0.39, + "grad_norm": 0.4942220602028391, + "learning_rate": 7.012906298119796e-07, + "loss": 0.0107, + "step": 6070 + }, + { + "epoch": 0.39, + "grad_norm": 3.515678796282028, + "learning_rate": 7.011960901986603e-07, + "loss": 0.1908, + "step": 6071 + }, + { + "epoch": 0.39, + "grad_norm": 0.5427986787855831, + "learning_rate": 7.011015420019115e-07, + "loss": 0.1059, + "step": 6072 + }, + { + "epoch": 0.39, + "grad_norm": 0.3379644858261285, + "learning_rate": 7.010069852257665e-07, + "loss": 0.013, + "step": 6073 + }, + { + "epoch": 0.39, + "grad_norm": 0.744299123554008, + "learning_rate": 7.009124198742595e-07, + "loss": 0.0254, + "step": 6074 + }, + { + "epoch": 0.39, + "grad_norm": 0.9154454539648509, + "learning_rate": 7.008178459514249e-07, + "loss": 0.2579, + "step": 6075 + }, + { + "epoch": 0.39, + "grad_norm": 1.0282065266041691, + "learning_rate": 7.007232634612972e-07, + "loss": 0.1421, + "step": 6076 + }, + { + "epoch": 0.39, + "grad_norm": 1.5030692551752733, + "learning_rate": 7.006286724079115e-07, + "loss": 0.1216, + "step": 6077 + }, + { + "epoch": 0.39, + "grad_norm": 0.7340609318070733, + "learning_rate": 7.005340727953034e-07, + "loss": 0.3215, + "step": 6078 + }, + { + "epoch": 0.39, + "grad_norm": 0.7045726026405857, + "learning_rate": 7.004394646275086e-07, + "loss": 0.3236, + "step": 6079 + }, + { + "epoch": 0.39, + "grad_norm": 0.2988262294681189, + "learning_rate": 7.003448479085634e-07, + "loss": 0.096, + "step": 6080 + }, + { + "epoch": 0.39, + "grad_norm": 7.053010225464449, + "learning_rate": 7.002502226425041e-07, + "loss": 0.116, + "step": 6081 + }, + { + "epoch": 0.39, + "grad_norm": 0.8059010644650134, + "learning_rate": 7.001555888333679e-07, + "loss": 0.2569, + "step": 6082 + }, + { + "epoch": 0.39, + "grad_norm": 0.5653028761079574, + "learning_rate": 7.000609464851918e-07, + "loss": 0.2474, + "step": 6083 + }, + { + "epoch": 0.39, + "grad_norm": 1.2069537818478193, + "learning_rate": 6.999662956020136e-07, + "loss": 0.3237, + "step": 6084 + }, + { + "epoch": 0.39, + "grad_norm": 0.44003547281224015, + "learning_rate": 6.998716361878712e-07, + "loss": 0.4035, + "step": 6085 + }, + { + "epoch": 0.39, + "grad_norm": 0.9190976304451878, + "learning_rate": 6.99776968246803e-07, + "loss": 0.1739, + "step": 6086 + }, + { + "epoch": 0.39, + "grad_norm": 1.1500311324465582, + "learning_rate": 6.996822917828476e-07, + "loss": 0.2147, + "step": 6087 + }, + { + "epoch": 0.39, + "grad_norm": 0.03477222308533221, + "learning_rate": 6.995876068000445e-07, + "loss": 0.0008, + "step": 6088 + }, + { + "epoch": 0.39, + "grad_norm": 0.5832092753911268, + "learning_rate": 6.994929133024329e-07, + "loss": 0.3132, + "step": 6089 + }, + { + "epoch": 0.39, + "grad_norm": 8.524669557833631, + "learning_rate": 6.993982112940525e-07, + "loss": 0.1944, + "step": 6090 + }, + { + "epoch": 0.39, + "grad_norm": 1.5149018659822298, + "learning_rate": 6.993035007789434e-07, + "loss": 0.3219, + "step": 6091 + }, + { + "epoch": 0.39, + "grad_norm": 0.1683874690062729, + "learning_rate": 6.992087817611466e-07, + "loss": 0.0672, + "step": 6092 + }, + { + "epoch": 0.39, + "grad_norm": 2.6868417573536876, + "learning_rate": 6.991140542447024e-07, + "loss": 0.1113, + "step": 6093 + }, + { + "epoch": 0.39, + "grad_norm": 1.306435480036509, + "learning_rate": 6.990193182336524e-07, + "loss": 0.1352, + "step": 6094 + }, + { + "epoch": 0.39, + "grad_norm": 3.981389326213064, + "learning_rate": 6.989245737320383e-07, + "loss": 0.1822, + "step": 6095 + }, + { + "epoch": 0.39, + "grad_norm": 1.5881073089154503, + "learning_rate": 6.988298207439021e-07, + "loss": 0.4433, + "step": 6096 + }, + { + "epoch": 0.39, + "grad_norm": 0.862365536486719, + "learning_rate": 6.98735059273286e-07, + "loss": 0.2503, + "step": 6097 + }, + { + "epoch": 0.39, + "grad_norm": 2.137412049406784, + "learning_rate": 6.986402893242326e-07, + "loss": 0.0186, + "step": 6098 + }, + { + "epoch": 0.39, + "grad_norm": 0.629167955438788, + "learning_rate": 6.985455109007853e-07, + "loss": 0.0771, + "step": 6099 + }, + { + "epoch": 0.39, + "grad_norm": 0.7568199747826508, + "learning_rate": 6.984507240069873e-07, + "loss": 0.0125, + "step": 6100 + }, + { + "epoch": 0.39, + "grad_norm": 0.4294833206324395, + "learning_rate": 6.983559286468825e-07, + "loss": 0.1637, + "step": 6101 + }, + { + "epoch": 0.39, + "grad_norm": 1.0130177508255256, + "learning_rate": 6.982611248245152e-07, + "loss": 0.4066, + "step": 6102 + }, + { + "epoch": 0.39, + "grad_norm": 0.6991349913435341, + "learning_rate": 6.981663125439295e-07, + "loss": 0.3924, + "step": 6103 + }, + { + "epoch": 0.39, + "grad_norm": 0.8832818984152153, + "learning_rate": 6.980714918091706e-07, + "loss": 0.3424, + "step": 6104 + }, + { + "epoch": 0.39, + "grad_norm": 1.3879801519145525, + "learning_rate": 6.979766626242838e-07, + "loss": 0.0624, + "step": 6105 + }, + { + "epoch": 0.39, + "grad_norm": 1.195726083938898, + "learning_rate": 6.978818249933145e-07, + "loss": 0.1041, + "step": 6106 + }, + { + "epoch": 0.39, + "grad_norm": 1.0888581955067818, + "learning_rate": 6.977869789203088e-07, + "loss": 0.247, + "step": 6107 + }, + { + "epoch": 0.39, + "grad_norm": 3.1743268342712296, + "learning_rate": 6.97692124409313e-07, + "loss": 0.0383, + "step": 6108 + }, + { + "epoch": 0.39, + "grad_norm": 1.0650161679738679, + "learning_rate": 6.975972614643737e-07, + "loss": 0.1798, + "step": 6109 + }, + { + "epoch": 0.39, + "grad_norm": 1.9407940742659258, + "learning_rate": 6.97502390089538e-07, + "loss": 0.3623, + "step": 6110 + }, + { + "epoch": 0.39, + "grad_norm": 0.9933576601280183, + "learning_rate": 6.974075102888535e-07, + "loss": 0.2004, + "step": 6111 + }, + { + "epoch": 0.39, + "grad_norm": 0.9204704042847702, + "learning_rate": 6.973126220663675e-07, + "loss": 0.3363, + "step": 6112 + }, + { + "epoch": 0.39, + "grad_norm": 1.229552263715744, + "learning_rate": 6.972177254261285e-07, + "loss": 0.3976, + "step": 6113 + }, + { + "epoch": 0.39, + "grad_norm": 0.7396302309870308, + "learning_rate": 6.971228203721848e-07, + "loss": 0.1982, + "step": 6114 + }, + { + "epoch": 0.39, + "grad_norm": 0.5209664358944699, + "learning_rate": 6.970279069085855e-07, + "loss": 0.144, + "step": 6115 + }, + { + "epoch": 0.39, + "grad_norm": 1.4425491786711748, + "learning_rate": 6.969329850393795e-07, + "loss": 0.1312, + "step": 6116 + }, + { + "epoch": 0.39, + "grad_norm": 0.5263695534515982, + "learning_rate": 6.968380547686166e-07, + "loss": 0.3222, + "step": 6117 + }, + { + "epoch": 0.39, + "grad_norm": 1.575979932582315, + "learning_rate": 6.967431161003465e-07, + "loss": 0.0941, + "step": 6118 + }, + { + "epoch": 0.39, + "grad_norm": 4.222430021897893, + "learning_rate": 6.966481690386195e-07, + "loss": 0.0992, + "step": 6119 + }, + { + "epoch": 0.39, + "grad_norm": 1.2576809504348785, + "learning_rate": 6.965532135874863e-07, + "loss": 0.1708, + "step": 6120 + }, + { + "epoch": 0.39, + "grad_norm": 6.854626612892088, + "learning_rate": 6.96458249750998e-07, + "loss": 0.0214, + "step": 6121 + }, + { + "epoch": 0.39, + "grad_norm": 1.2937559918030428, + "learning_rate": 6.963632775332055e-07, + "loss": 0.3294, + "step": 6122 + }, + { + "epoch": 0.39, + "grad_norm": 0.6462685025279785, + "learning_rate": 6.962682969381613e-07, + "loss": 0.1446, + "step": 6123 + }, + { + "epoch": 0.39, + "grad_norm": 2.144011777287466, + "learning_rate": 6.961733079699168e-07, + "loss": 0.3612, + "step": 6124 + }, + { + "epoch": 0.39, + "grad_norm": 1.762259905785336, + "learning_rate": 6.960783106325246e-07, + "loss": 0.4583, + "step": 6125 + }, + { + "epoch": 0.39, + "grad_norm": 0.5472923641710189, + "learning_rate": 6.959833049300375e-07, + "loss": 0.1341, + "step": 6126 + }, + { + "epoch": 0.39, + "grad_norm": 0.618147425453013, + "learning_rate": 6.958882908665087e-07, + "loss": 0.2312, + "step": 6127 + }, + { + "epoch": 0.39, + "grad_norm": 10.039500895902682, + "learning_rate": 6.957932684459915e-07, + "loss": 0.2508, + "step": 6128 + }, + { + "epoch": 0.39, + "grad_norm": 0.7293971604524371, + "learning_rate": 6.9569823767254e-07, + "loss": 0.2275, + "step": 6129 + }, + { + "epoch": 0.39, + "grad_norm": 0.8612548679002621, + "learning_rate": 6.956031985502084e-07, + "loss": 0.0322, + "step": 6130 + }, + { + "epoch": 0.39, + "grad_norm": 0.8993388064077994, + "learning_rate": 6.955081510830509e-07, + "loss": 0.1982, + "step": 6131 + }, + { + "epoch": 0.39, + "grad_norm": 0.8921071696384599, + "learning_rate": 6.954130952751227e-07, + "loss": 0.1631, + "step": 6132 + }, + { + "epoch": 0.39, + "grad_norm": 0.44758189450209807, + "learning_rate": 6.953180311304792e-07, + "loss": 0.2095, + "step": 6133 + }, + { + "epoch": 0.39, + "grad_norm": 1.0596157588948227, + "learning_rate": 6.952229586531756e-07, + "loss": 0.2035, + "step": 6134 + }, + { + "epoch": 0.39, + "grad_norm": 0.43274357682038095, + "learning_rate": 6.951278778472682e-07, + "loss": 0.0727, + "step": 6135 + }, + { + "epoch": 0.39, + "grad_norm": 0.32998110733984454, + "learning_rate": 6.950327887168133e-07, + "loss": 0.2334, + "step": 6136 + }, + { + "epoch": 0.39, + "grad_norm": 0.6791760685309348, + "learning_rate": 6.949376912658678e-07, + "loss": 0.2407, + "step": 6137 + }, + { + "epoch": 0.39, + "grad_norm": 0.4853614325615889, + "learning_rate": 6.948425854984883e-07, + "loss": 0.1118, + "step": 6138 + }, + { + "epoch": 0.39, + "grad_norm": 1.2479386879988221, + "learning_rate": 6.947474714187324e-07, + "loss": 0.1549, + "step": 6139 + }, + { + "epoch": 0.39, + "grad_norm": 1.3361662390743687, + "learning_rate": 6.946523490306578e-07, + "loss": 0.0888, + "step": 6140 + }, + { + "epoch": 0.39, + "grad_norm": 0.6868660305286954, + "learning_rate": 6.945572183383229e-07, + "loss": 0.1573, + "step": 6141 + }, + { + "epoch": 0.39, + "grad_norm": 1.2057914628517277, + "learning_rate": 6.944620793457857e-07, + "loss": 0.3429, + "step": 6142 + }, + { + "epoch": 0.39, + "grad_norm": 0.7712588457012892, + "learning_rate": 6.943669320571055e-07, + "loss": 0.0063, + "step": 6143 + }, + { + "epoch": 0.39, + "grad_norm": 1.214776392904596, + "learning_rate": 6.942717764763412e-07, + "loss": 0.2139, + "step": 6144 + }, + { + "epoch": 0.39, + "grad_norm": 10.824250979215668, + "learning_rate": 6.941766126075524e-07, + "loss": 0.1055, + "step": 6145 + }, + { + "epoch": 0.39, + "grad_norm": 2.429547489762958, + "learning_rate": 6.94081440454799e-07, + "loss": 0.1091, + "step": 6146 + }, + { + "epoch": 0.39, + "grad_norm": 0.9529996714874591, + "learning_rate": 6.939862600221411e-07, + "loss": 0.2029, + "step": 6147 + }, + { + "epoch": 0.39, + "grad_norm": 1.5255961179150546, + "learning_rate": 6.938910713136393e-07, + "loss": 0.405, + "step": 6148 + }, + { + "epoch": 0.39, + "grad_norm": 0.6362585419101171, + "learning_rate": 6.937958743333548e-07, + "loss": 0.1283, + "step": 6149 + }, + { + "epoch": 0.39, + "grad_norm": 2.465940031687446, + "learning_rate": 6.937006690853486e-07, + "loss": 0.0199, + "step": 6150 + }, + { + "epoch": 0.39, + "grad_norm": 0.7828230822884971, + "learning_rate": 6.936054555736825e-07, + "loss": 0.2407, + "step": 6151 + }, + { + "epoch": 0.39, + "grad_norm": 0.25168434736024076, + "learning_rate": 6.935102338024185e-07, + "loss": 0.174, + "step": 6152 + }, + { + "epoch": 0.39, + "grad_norm": 1.8794224204941419, + "learning_rate": 6.93415003775619e-07, + "loss": 0.3587, + "step": 6153 + }, + { + "epoch": 0.39, + "grad_norm": 0.9803000681134058, + "learning_rate": 6.933197654973466e-07, + "loss": 0.1463, + "step": 6154 + }, + { + "epoch": 0.39, + "grad_norm": 0.20458817097033785, + "learning_rate": 6.932245189716643e-07, + "loss": 0.0215, + "step": 6155 + }, + { + "epoch": 0.39, + "grad_norm": 1.1878195279602228, + "learning_rate": 6.931292642026356e-07, + "loss": 0.1178, + "step": 6156 + }, + { + "epoch": 0.39, + "grad_norm": 0.9650560839732295, + "learning_rate": 6.930340011943244e-07, + "loss": 0.3217, + "step": 6157 + }, + { + "epoch": 0.39, + "grad_norm": 0.6340945096257533, + "learning_rate": 6.929387299507944e-07, + "loss": 0.4892, + "step": 6158 + }, + { + "epoch": 0.39, + "grad_norm": 1.655075799508818, + "learning_rate": 6.928434504761105e-07, + "loss": 0.0638, + "step": 6159 + }, + { + "epoch": 0.39, + "grad_norm": 4.61875768161717, + "learning_rate": 6.927481627743373e-07, + "loss": 0.1523, + "step": 6160 + }, + { + "epoch": 0.39, + "grad_norm": 1.033864607713213, + "learning_rate": 6.926528668495399e-07, + "loss": 0.2278, + "step": 6161 + }, + { + "epoch": 0.39, + "grad_norm": 0.7826141802236259, + "learning_rate": 6.925575627057841e-07, + "loss": 0.0551, + "step": 6162 + }, + { + "epoch": 0.39, + "grad_norm": 0.28373260478851425, + "learning_rate": 6.924622503471356e-07, + "loss": 0.0812, + "step": 6163 + }, + { + "epoch": 0.39, + "grad_norm": 2.8340090495261956, + "learning_rate": 6.923669297776604e-07, + "loss": 0.1287, + "step": 6164 + }, + { + "epoch": 0.39, + "grad_norm": 1.672714393111128, + "learning_rate": 6.922716010014255e-07, + "loss": 0.1098, + "step": 6165 + }, + { + "epoch": 0.39, + "grad_norm": 0.871746698251231, + "learning_rate": 6.921762640224974e-07, + "loss": 0.0985, + "step": 6166 + }, + { + "epoch": 0.39, + "grad_norm": 1.7736099022362137, + "learning_rate": 6.920809188449435e-07, + "loss": 0.0043, + "step": 6167 + }, + { + "epoch": 0.39, + "grad_norm": 0.8113946228779848, + "learning_rate": 6.919855654728316e-07, + "loss": 0.1173, + "step": 6168 + }, + { + "epoch": 0.39, + "grad_norm": 0.9167850203021984, + "learning_rate": 6.918902039102296e-07, + "loss": 0.2546, + "step": 6169 + }, + { + "epoch": 0.39, + "grad_norm": 0.7498944438835116, + "learning_rate": 6.917948341612056e-07, + "loss": 0.2329, + "step": 6170 + }, + { + "epoch": 0.39, + "grad_norm": 8.063631914034943, + "learning_rate": 6.916994562298285e-07, + "loss": 0.1669, + "step": 6171 + }, + { + "epoch": 0.39, + "grad_norm": 0.7192687269786702, + "learning_rate": 6.916040701201674e-07, + "loss": 0.2198, + "step": 6172 + }, + { + "epoch": 0.39, + "grad_norm": 0.7899638685431615, + "learning_rate": 6.915086758362914e-07, + "loss": 0.2831, + "step": 6173 + }, + { + "epoch": 0.39, + "grad_norm": 0.9522419273413391, + "learning_rate": 6.914132733822701e-07, + "loss": 0.0687, + "step": 6174 + }, + { + "epoch": 0.39, + "grad_norm": 0.3403160937259742, + "learning_rate": 6.913178627621739e-07, + "loss": 0.0447, + "step": 6175 + }, + { + "epoch": 0.39, + "grad_norm": 0.8785572537418758, + "learning_rate": 6.912224439800731e-07, + "loss": 0.08, + "step": 6176 + }, + { + "epoch": 0.39, + "grad_norm": 1.0105327342004704, + "learning_rate": 6.911270170400384e-07, + "loss": 0.1012, + "step": 6177 + }, + { + "epoch": 0.39, + "grad_norm": 2.7582801548032934, + "learning_rate": 6.91031581946141e-07, + "loss": 0.2048, + "step": 6178 + }, + { + "epoch": 0.39, + "grad_norm": 0.4176320377241375, + "learning_rate": 6.909361387024522e-07, + "loss": 0.0817, + "step": 6179 + }, + { + "epoch": 0.39, + "grad_norm": 6.719948092491281, + "learning_rate": 6.908406873130439e-07, + "loss": 0.1972, + "step": 6180 + }, + { + "epoch": 0.39, + "grad_norm": 1.4633957144557999, + "learning_rate": 6.907452277819883e-07, + "loss": 0.2768, + "step": 6181 + }, + { + "epoch": 0.39, + "grad_norm": 0.384205034374505, + "learning_rate": 6.906497601133579e-07, + "loss": 0.1005, + "step": 6182 + }, + { + "epoch": 0.39, + "grad_norm": 0.5906550470369817, + "learning_rate": 6.905542843112253e-07, + "loss": 0.1545, + "step": 6183 + }, + { + "epoch": 0.39, + "grad_norm": 0.5678397989120448, + "learning_rate": 6.90458800379664e-07, + "loss": 0.2816, + "step": 6184 + }, + { + "epoch": 0.39, + "grad_norm": 1.384474132225101, + "learning_rate": 6.903633083227474e-07, + "loss": 0.1302, + "step": 6185 + }, + { + "epoch": 0.39, + "grad_norm": 1.982603229099718, + "learning_rate": 6.902678081445494e-07, + "loss": 0.0161, + "step": 6186 + }, + { + "epoch": 0.39, + "grad_norm": 1.2659266276101535, + "learning_rate": 6.901722998491441e-07, + "loss": 0.0176, + "step": 6187 + }, + { + "epoch": 0.39, + "grad_norm": 1.2677240536185697, + "learning_rate": 6.900767834406063e-07, + "loss": 0.1869, + "step": 6188 + }, + { + "epoch": 0.39, + "grad_norm": 2.5057187867373214, + "learning_rate": 6.899812589230108e-07, + "loss": 0.1465, + "step": 6189 + }, + { + "epoch": 0.39, + "grad_norm": 2.6191932612231814, + "learning_rate": 6.89885726300433e-07, + "loss": 0.3231, + "step": 6190 + }, + { + "epoch": 0.39, + "grad_norm": 1.2446369380745481, + "learning_rate": 6.897901855769483e-07, + "loss": 0.0954, + "step": 6191 + }, + { + "epoch": 0.39, + "grad_norm": 0.8173892522714576, + "learning_rate": 6.896946367566327e-07, + "loss": 0.1784, + "step": 6192 + }, + { + "epoch": 0.39, + "grad_norm": 0.5691305531822278, + "learning_rate": 6.895990798435625e-07, + "loss": 0.2155, + "step": 6193 + }, + { + "epoch": 0.4, + "grad_norm": 1.0434859744339169, + "learning_rate": 6.895035148418144e-07, + "loss": 0.5934, + "step": 6194 + }, + { + "epoch": 0.4, + "grad_norm": 0.35781916837890215, + "learning_rate": 6.894079417554655e-07, + "loss": 0.0663, + "step": 6195 + }, + { + "epoch": 0.4, + "grad_norm": 0.24015978873153349, + "learning_rate": 6.893123605885931e-07, + "loss": 0.0623, + "step": 6196 + }, + { + "epoch": 0.4, + "grad_norm": 1.321693853670986, + "learning_rate": 6.892167713452748e-07, + "loss": 0.3533, + "step": 6197 + }, + { + "epoch": 0.4, + "grad_norm": 0.48782737986281427, + "learning_rate": 6.891211740295887e-07, + "loss": 0.1524, + "step": 6198 + }, + { + "epoch": 0.4, + "grad_norm": 0.11333829615908207, + "learning_rate": 6.890255686456133e-07, + "loss": 0.0053, + "step": 6199 + }, + { + "epoch": 0.4, + "grad_norm": 0.9010655888481797, + "learning_rate": 6.889299551974268e-07, + "loss": 0.3647, + "step": 6200 + }, + { + "epoch": 0.4, + "grad_norm": 0.5065096572806629, + "learning_rate": 6.888343336891087e-07, + "loss": 0.2117, + "step": 6201 + }, + { + "epoch": 0.4, + "grad_norm": 5.14211318909038, + "learning_rate": 6.887387041247384e-07, + "loss": 0.1118, + "step": 6202 + }, + { + "epoch": 0.4, + "grad_norm": 0.3927341176803692, + "learning_rate": 6.886430665083955e-07, + "loss": 0.0819, + "step": 6203 + }, + { + "epoch": 0.4, + "grad_norm": 1.3924321143043787, + "learning_rate": 6.885474208441601e-07, + "loss": 0.127, + "step": 6204 + }, + { + "epoch": 0.4, + "grad_norm": 0.4337249428796377, + "learning_rate": 6.88451767136113e-07, + "loss": 0.2451, + "step": 6205 + }, + { + "epoch": 0.4, + "grad_norm": 0.6311639888403874, + "learning_rate": 6.883561053883344e-07, + "loss": 0.401, + "step": 6206 + }, + { + "epoch": 0.4, + "grad_norm": 0.5139037001185284, + "learning_rate": 6.882604356049058e-07, + "loss": 0.1032, + "step": 6207 + }, + { + "epoch": 0.4, + "grad_norm": 1.137946904279486, + "learning_rate": 6.881647577899086e-07, + "loss": 0.2727, + "step": 6208 + }, + { + "epoch": 0.4, + "grad_norm": 1.146831100302851, + "learning_rate": 6.880690719474245e-07, + "loss": 0.0998, + "step": 6209 + }, + { + "epoch": 0.4, + "grad_norm": 0.8310608478545772, + "learning_rate": 6.879733780815357e-07, + "loss": 0.1059, + "step": 6210 + }, + { + "epoch": 0.4, + "grad_norm": 0.4979772168898703, + "learning_rate": 6.878776761963248e-07, + "loss": 0.0443, + "step": 6211 + }, + { + "epoch": 0.4, + "grad_norm": 0.8899238443742646, + "learning_rate": 6.877819662958743e-07, + "loss": 0.292, + "step": 6212 + }, + { + "epoch": 0.4, + "grad_norm": 0.4890042180484366, + "learning_rate": 6.87686248384268e-07, + "loss": 0.3342, + "step": 6213 + }, + { + "epoch": 0.4, + "grad_norm": 0.6739477447958372, + "learning_rate": 6.875905224655889e-07, + "loss": 0.2706, + "step": 6214 + }, + { + "epoch": 0.4, + "grad_norm": 0.5369417685652146, + "learning_rate": 6.874947885439211e-07, + "loss": 0.0872, + "step": 6215 + }, + { + "epoch": 0.4, + "grad_norm": 0.12216211068175181, + "learning_rate": 6.873990466233486e-07, + "loss": 0.0638, + "step": 6216 + }, + { + "epoch": 0.4, + "grad_norm": 1.9266919734418062, + "learning_rate": 6.87303296707956e-07, + "loss": 0.0298, + "step": 6217 + }, + { + "epoch": 0.4, + "grad_norm": 0.9563162868912487, + "learning_rate": 6.872075388018284e-07, + "loss": 0.072, + "step": 6218 + }, + { + "epoch": 0.4, + "grad_norm": 0.86366765529282, + "learning_rate": 6.871117729090508e-07, + "loss": 0.2834, + "step": 6219 + }, + { + "epoch": 0.4, + "grad_norm": 1.183708277852859, + "learning_rate": 6.870159990337086e-07, + "loss": 0.0882, + "step": 6220 + }, + { + "epoch": 0.4, + "grad_norm": 7.285448526287848, + "learning_rate": 6.869202171798881e-07, + "loss": 0.2821, + "step": 6221 + }, + { + "epoch": 0.4, + "grad_norm": 1.9273322831659865, + "learning_rate": 6.868244273516755e-07, + "loss": 0.2233, + "step": 6222 + }, + { + "epoch": 0.4, + "grad_norm": 8.179773481144634, + "learning_rate": 6.867286295531571e-07, + "loss": 0.2397, + "step": 6223 + }, + { + "epoch": 0.4, + "grad_norm": 0.8681641867231622, + "learning_rate": 6.8663282378842e-07, + "loss": 0.1859, + "step": 6224 + }, + { + "epoch": 0.4, + "grad_norm": 0.5727729885534764, + "learning_rate": 6.865370100615515e-07, + "loss": 0.265, + "step": 6225 + }, + { + "epoch": 0.4, + "grad_norm": 0.30902591923894296, + "learning_rate": 6.864411883766393e-07, + "loss": 0.1712, + "step": 6226 + }, + { + "epoch": 0.4, + "grad_norm": 1.1296518840080736, + "learning_rate": 6.863453587377711e-07, + "loss": 0.3299, + "step": 6227 + }, + { + "epoch": 0.4, + "grad_norm": 3.940590276393374, + "learning_rate": 6.862495211490352e-07, + "loss": 0.1227, + "step": 6228 + }, + { + "epoch": 0.4, + "grad_norm": 0.4717924375144069, + "learning_rate": 6.861536756145205e-07, + "loss": 0.183, + "step": 6229 + }, + { + "epoch": 0.4, + "grad_norm": 2.908760307610912, + "learning_rate": 6.860578221383155e-07, + "loss": 0.0143, + "step": 6230 + }, + { + "epoch": 0.4, + "grad_norm": 0.9651197902918841, + "learning_rate": 6.859619607245101e-07, + "loss": 0.1343, + "step": 6231 + }, + { + "epoch": 0.4, + "grad_norm": 0.9555572567907259, + "learning_rate": 6.858660913771934e-07, + "loss": 0.0517, + "step": 6232 + }, + { + "epoch": 0.4, + "grad_norm": 1.0019228992172688, + "learning_rate": 6.857702141004558e-07, + "loss": 0.1124, + "step": 6233 + }, + { + "epoch": 0.4, + "grad_norm": 1.375353658408546, + "learning_rate": 6.856743288983873e-07, + "loss": 0.1988, + "step": 6234 + }, + { + "epoch": 0.4, + "grad_norm": 1.5454114092696687, + "learning_rate": 6.855784357750786e-07, + "loss": 0.4377, + "step": 6235 + }, + { + "epoch": 0.4, + "grad_norm": 0.2153189426097419, + "learning_rate": 6.854825347346209e-07, + "loss": 0.0625, + "step": 6236 + }, + { + "epoch": 0.4, + "grad_norm": 5.931577506079022, + "learning_rate": 6.853866257811054e-07, + "loss": 0.0245, + "step": 6237 + }, + { + "epoch": 0.4, + "grad_norm": 7.686939894574234, + "learning_rate": 6.852907089186236e-07, + "loss": 0.0939, + "step": 6238 + }, + { + "epoch": 0.4, + "grad_norm": 0.8131809197901665, + "learning_rate": 6.851947841512679e-07, + "loss": 0.2382, + "step": 6239 + }, + { + "epoch": 0.4, + "grad_norm": 2.497959138251734, + "learning_rate": 6.850988514831303e-07, + "loss": 0.0311, + "step": 6240 + }, + { + "epoch": 0.4, + "grad_norm": 4.275365788480474, + "learning_rate": 6.850029109183038e-07, + "loss": 0.1505, + "step": 6241 + }, + { + "epoch": 0.4, + "grad_norm": 0.2547152339384144, + "learning_rate": 6.84906962460881e-07, + "loss": 0.1546, + "step": 6242 + }, + { + "epoch": 0.4, + "grad_norm": 0.3994564711131799, + "learning_rate": 6.848110061149555e-07, + "loss": 0.0257, + "step": 6243 + }, + { + "epoch": 0.4, + "grad_norm": 1.0345026380540454, + "learning_rate": 6.84715041884621e-07, + "loss": 0.3793, + "step": 6244 + }, + { + "epoch": 0.4, + "grad_norm": 0.43390028267813124, + "learning_rate": 6.846190697739714e-07, + "loss": 0.0355, + "step": 6245 + }, + { + "epoch": 0.4, + "grad_norm": 0.4501212776602618, + "learning_rate": 6.845230897871012e-07, + "loss": 0.1599, + "step": 6246 + }, + { + "epoch": 0.4, + "grad_norm": 2.580929022002219, + "learning_rate": 6.84427101928105e-07, + "loss": 0.15, + "step": 6247 + }, + { + "epoch": 0.4, + "grad_norm": 0.7918955130027382, + "learning_rate": 6.84331106201078e-07, + "loss": 0.1475, + "step": 6248 + }, + { + "epoch": 0.4, + "grad_norm": 0.7569238348928278, + "learning_rate": 6.842351026101154e-07, + "loss": 0.107, + "step": 6249 + }, + { + "epoch": 0.4, + "grad_norm": 0.4995717604473063, + "learning_rate": 6.84139091159313e-07, + "loss": 0.2009, + "step": 6250 + }, + { + "epoch": 0.4, + "grad_norm": 0.9157557633717965, + "learning_rate": 6.840430718527667e-07, + "loss": 0.003, + "step": 6251 + }, + { + "epoch": 0.4, + "grad_norm": 0.7124468562585222, + "learning_rate": 6.839470446945732e-07, + "loss": 0.2808, + "step": 6252 + }, + { + "epoch": 0.4, + "grad_norm": 1.2051956401988941, + "learning_rate": 6.838510096888288e-07, + "loss": 0.2728, + "step": 6253 + }, + { + "epoch": 0.4, + "grad_norm": 0.5212681126804175, + "learning_rate": 6.837549668396309e-07, + "loss": 0.0165, + "step": 6254 + }, + { + "epoch": 0.4, + "grad_norm": 0.7271535467615731, + "learning_rate": 6.836589161510766e-07, + "loss": 0.1922, + "step": 6255 + }, + { + "epoch": 0.4, + "grad_norm": 1.9817213786470782, + "learning_rate": 6.835628576272637e-07, + "loss": 0.244, + "step": 6256 + }, + { + "epoch": 0.4, + "grad_norm": 1.0020861976946314, + "learning_rate": 6.834667912722904e-07, + "loss": 0.2997, + "step": 6257 + }, + { + "epoch": 0.4, + "grad_norm": 0.44966370145242784, + "learning_rate": 6.833707170902549e-07, + "loss": 0.1575, + "step": 6258 + }, + { + "epoch": 0.4, + "grad_norm": 0.5248286096091686, + "learning_rate": 6.832746350852561e-07, + "loss": 0.2467, + "step": 6259 + }, + { + "epoch": 0.4, + "grad_norm": 0.6208019392122317, + "learning_rate": 6.831785452613927e-07, + "loss": 0.2825, + "step": 6260 + }, + { + "epoch": 0.4, + "grad_norm": 1.057503488458636, + "learning_rate": 6.830824476227646e-07, + "loss": 0.4394, + "step": 6261 + }, + { + "epoch": 0.4, + "grad_norm": 0.4542964719792616, + "learning_rate": 6.82986342173471e-07, + "loss": 0.3187, + "step": 6262 + }, + { + "epoch": 0.4, + "grad_norm": 1.0216584528054076, + "learning_rate": 6.828902289176124e-07, + "loss": 0.0623, + "step": 6263 + }, + { + "epoch": 0.4, + "grad_norm": 0.3721115803693167, + "learning_rate": 6.827941078592888e-07, + "loss": 0.2275, + "step": 6264 + }, + { + "epoch": 0.4, + "grad_norm": 0.6876065045926866, + "learning_rate": 6.826979790026012e-07, + "loss": 0.2629, + "step": 6265 + }, + { + "epoch": 0.4, + "grad_norm": 0.5959189359618354, + "learning_rate": 6.826018423516505e-07, + "loss": 0.0692, + "step": 6266 + }, + { + "epoch": 0.4, + "grad_norm": 1.0987326793714454, + "learning_rate": 6.825056979105381e-07, + "loss": 0.3505, + "step": 6267 + }, + { + "epoch": 0.4, + "grad_norm": 2.5305220700049564, + "learning_rate": 6.82409545683366e-07, + "loss": 0.2905, + "step": 6268 + }, + { + "epoch": 0.4, + "grad_norm": 0.9076133219416157, + "learning_rate": 6.823133856742358e-07, + "loss": 0.3421, + "step": 6269 + }, + { + "epoch": 0.4, + "grad_norm": 1.0344516297215987, + "learning_rate": 6.822172178872501e-07, + "loss": 0.1861, + "step": 6270 + }, + { + "epoch": 0.4, + "grad_norm": 0.6141548408873324, + "learning_rate": 6.821210423265115e-07, + "loss": 0.1325, + "step": 6271 + }, + { + "epoch": 0.4, + "grad_norm": 12.259281383914091, + "learning_rate": 6.820248589961231e-07, + "loss": 0.0573, + "step": 6272 + }, + { + "epoch": 0.4, + "grad_norm": 0.1936161211338814, + "learning_rate": 6.819286679001883e-07, + "loss": 0.0052, + "step": 6273 + }, + { + "epoch": 0.4, + "grad_norm": 0.4761750848766932, + "learning_rate": 6.818324690428109e-07, + "loss": 0.0708, + "step": 6274 + }, + { + "epoch": 0.4, + "grad_norm": 0.30141760885628427, + "learning_rate": 6.817362624280948e-07, + "loss": 0.1181, + "step": 6275 + }, + { + "epoch": 0.4, + "grad_norm": 1.27387131059069, + "learning_rate": 6.816400480601444e-07, + "loss": 0.111, + "step": 6276 + }, + { + "epoch": 0.4, + "grad_norm": 5.78459022870707, + "learning_rate": 6.815438259430645e-07, + "loss": 0.2472, + "step": 6277 + }, + { + "epoch": 0.4, + "grad_norm": 0.9573281104683642, + "learning_rate": 6.8144759608096e-07, + "loss": 0.1301, + "step": 6278 + }, + { + "epoch": 0.4, + "grad_norm": 5.028878427849481, + "learning_rate": 6.813513584779362e-07, + "loss": 0.2653, + "step": 6279 + }, + { + "epoch": 0.4, + "grad_norm": 0.381367328172131, + "learning_rate": 6.81255113138099e-07, + "loss": 0.0072, + "step": 6280 + }, + { + "epoch": 0.4, + "grad_norm": 1.4528935142791577, + "learning_rate": 6.811588600655542e-07, + "loss": 0.1284, + "step": 6281 + }, + { + "epoch": 0.4, + "grad_norm": 4.827426492833447, + "learning_rate": 6.810625992644083e-07, + "loss": 0.156, + "step": 6282 + }, + { + "epoch": 0.4, + "grad_norm": 0.5734812805451183, + "learning_rate": 6.80966330738768e-07, + "loss": 0.0737, + "step": 6283 + }, + { + "epoch": 0.4, + "grad_norm": 0.4426379072669722, + "learning_rate": 6.808700544927402e-07, + "loss": 0.1289, + "step": 6284 + }, + { + "epoch": 0.4, + "grad_norm": 0.5192769835580945, + "learning_rate": 6.807737705304323e-07, + "loss": 0.2355, + "step": 6285 + }, + { + "epoch": 0.4, + "grad_norm": 0.6008560869470537, + "learning_rate": 6.806774788559519e-07, + "loss": 0.2676, + "step": 6286 + }, + { + "epoch": 0.4, + "grad_norm": 0.9184380307491457, + "learning_rate": 6.80581179473407e-07, + "loss": 0.4086, + "step": 6287 + }, + { + "epoch": 0.4, + "grad_norm": 0.24984547522423572, + "learning_rate": 6.804848723869061e-07, + "loss": 0.2237, + "step": 6288 + }, + { + "epoch": 0.4, + "grad_norm": 0.5476870152107928, + "learning_rate": 6.803885576005577e-07, + "loss": 0.1659, + "step": 6289 + }, + { + "epoch": 0.4, + "grad_norm": 1.5843547615793472, + "learning_rate": 6.802922351184707e-07, + "loss": 0.1208, + "step": 6290 + }, + { + "epoch": 0.4, + "grad_norm": 0.36539251437592996, + "learning_rate": 6.801959049447545e-07, + "loss": 0.1999, + "step": 6291 + }, + { + "epoch": 0.4, + "grad_norm": 8.163480671211163, + "learning_rate": 6.800995670835188e-07, + "loss": 0.1565, + "step": 6292 + }, + { + "epoch": 0.4, + "grad_norm": 15.468156813312886, + "learning_rate": 6.800032215388736e-07, + "loss": 0.1801, + "step": 6293 + }, + { + "epoch": 0.4, + "grad_norm": 0.5391643643765122, + "learning_rate": 6.79906868314929e-07, + "loss": 0.3423, + "step": 6294 + }, + { + "epoch": 0.4, + "grad_norm": 1.1876241664389382, + "learning_rate": 6.798105074157958e-07, + "loss": 0.1654, + "step": 6295 + }, + { + "epoch": 0.4, + "grad_norm": 0.8193904871120007, + "learning_rate": 6.797141388455851e-07, + "loss": 0.2495, + "step": 6296 + }, + { + "epoch": 0.4, + "grad_norm": 0.3866432513975203, + "learning_rate": 6.796177626084078e-07, + "loss": 0.164, + "step": 6297 + }, + { + "epoch": 0.4, + "grad_norm": 1.0882381860432704, + "learning_rate": 6.795213787083755e-07, + "loss": 0.1788, + "step": 6298 + }, + { + "epoch": 0.4, + "grad_norm": 0.6830235621905923, + "learning_rate": 6.794249871496005e-07, + "loss": 0.3599, + "step": 6299 + }, + { + "epoch": 0.4, + "grad_norm": 4.695270607090052, + "learning_rate": 6.793285879361948e-07, + "loss": 0.0877, + "step": 6300 + }, + { + "epoch": 0.4, + "grad_norm": 0.7194837379626305, + "learning_rate": 6.79232181072271e-07, + "loss": 0.3207, + "step": 6301 + }, + { + "epoch": 0.4, + "grad_norm": 0.9357192862350031, + "learning_rate": 6.79135766561942e-07, + "loss": 0.3821, + "step": 6302 + }, + { + "epoch": 0.4, + "grad_norm": 0.5450760651607603, + "learning_rate": 6.790393444093213e-07, + "loss": 0.1295, + "step": 6303 + }, + { + "epoch": 0.4, + "grad_norm": 0.5605722159660916, + "learning_rate": 6.789429146185222e-07, + "loss": 0.0503, + "step": 6304 + }, + { + "epoch": 0.4, + "grad_norm": 1.223088798425785, + "learning_rate": 6.788464771936585e-07, + "loss": 0.5387, + "step": 6305 + }, + { + "epoch": 0.4, + "grad_norm": 0.428314502941871, + "learning_rate": 6.787500321388447e-07, + "loss": 0.004, + "step": 6306 + }, + { + "epoch": 0.4, + "grad_norm": 1.0015165823477454, + "learning_rate": 6.786535794581951e-07, + "loss": 0.1085, + "step": 6307 + }, + { + "epoch": 0.4, + "grad_norm": 1.178132416984318, + "learning_rate": 6.785571191558247e-07, + "loss": 0.1638, + "step": 6308 + }, + { + "epoch": 0.4, + "grad_norm": 0.7832864868245555, + "learning_rate": 6.784606512358486e-07, + "loss": 0.1841, + "step": 6309 + }, + { + "epoch": 0.4, + "grad_norm": 1.2342575567020215, + "learning_rate": 6.783641757023825e-07, + "loss": 0.3512, + "step": 6310 + }, + { + "epoch": 0.4, + "grad_norm": 0.02782269276322806, + "learning_rate": 6.78267692559542e-07, + "loss": 0.0007, + "step": 6311 + }, + { + "epoch": 0.4, + "grad_norm": 0.9632128514314175, + "learning_rate": 6.781712018114434e-07, + "loss": 0.2452, + "step": 6312 + }, + { + "epoch": 0.4, + "grad_norm": 0.9387459585892254, + "learning_rate": 6.780747034622032e-07, + "loss": 0.1231, + "step": 6313 + }, + { + "epoch": 0.4, + "grad_norm": 0.6726956733244531, + "learning_rate": 6.77978197515938e-07, + "loss": 0.1672, + "step": 6314 + }, + { + "epoch": 0.4, + "grad_norm": 0.8011892718369154, + "learning_rate": 6.778816839767654e-07, + "loss": 0.2529, + "step": 6315 + }, + { + "epoch": 0.4, + "grad_norm": 1.654576615629492, + "learning_rate": 6.777851628488022e-07, + "loss": 0.1318, + "step": 6316 + }, + { + "epoch": 0.4, + "grad_norm": 0.468742702068239, + "learning_rate": 6.776886341361668e-07, + "loss": 0.1593, + "step": 6317 + }, + { + "epoch": 0.4, + "grad_norm": 0.8942879143321072, + "learning_rate": 6.77592097842977e-07, + "loss": 0.3876, + "step": 6318 + }, + { + "epoch": 0.4, + "grad_norm": 0.6368684574518247, + "learning_rate": 6.774955539733514e-07, + "loss": 0.0998, + "step": 6319 + }, + { + "epoch": 0.4, + "grad_norm": 0.7566350153543336, + "learning_rate": 6.773990025314086e-07, + "loss": 0.1656, + "step": 6320 + }, + { + "epoch": 0.4, + "grad_norm": 0.731691102948197, + "learning_rate": 6.773024435212677e-07, + "loss": 0.0195, + "step": 6321 + }, + { + "epoch": 0.4, + "grad_norm": 0.6343300683315546, + "learning_rate": 6.772058769470483e-07, + "loss": 0.2368, + "step": 6322 + }, + { + "epoch": 0.4, + "grad_norm": 3.180929949863677, + "learning_rate": 6.771093028128699e-07, + "loss": 0.0249, + "step": 6323 + }, + { + "epoch": 0.4, + "grad_norm": 2.20453988497507, + "learning_rate": 6.770127211228526e-07, + "loss": 0.1609, + "step": 6324 + }, + { + "epoch": 0.4, + "grad_norm": 5.817803980307327, + "learning_rate": 6.769161318811165e-07, + "loss": 0.1692, + "step": 6325 + }, + { + "epoch": 0.4, + "grad_norm": 0.6893446552490462, + "learning_rate": 6.76819535091783e-07, + "loss": 0.2116, + "step": 6326 + }, + { + "epoch": 0.4, + "grad_norm": 7.248398235457054, + "learning_rate": 6.767229307589723e-07, + "loss": 0.1265, + "step": 6327 + }, + { + "epoch": 0.4, + "grad_norm": 1.1731612265180096, + "learning_rate": 6.766263188868063e-07, + "loss": 0.4075, + "step": 6328 + }, + { + "epoch": 0.4, + "grad_norm": 2.1095975619987564, + "learning_rate": 6.765296994794065e-07, + "loss": 0.341, + "step": 6329 + }, + { + "epoch": 0.4, + "grad_norm": 1.232819800035905, + "learning_rate": 6.764330725408948e-07, + "loss": 0.4152, + "step": 6330 + }, + { + "epoch": 0.4, + "grad_norm": 0.40808648334532943, + "learning_rate": 6.763364380753936e-07, + "loss": 0.0156, + "step": 6331 + }, + { + "epoch": 0.4, + "grad_norm": 0.3680753344624959, + "learning_rate": 6.762397960870255e-07, + "loss": 0.1849, + "step": 6332 + }, + { + "epoch": 0.4, + "grad_norm": 0.9161980014580507, + "learning_rate": 6.761431465799133e-07, + "loss": 0.2478, + "step": 6333 + }, + { + "epoch": 0.4, + "grad_norm": 0.7829457137275232, + "learning_rate": 6.760464895581804e-07, + "loss": 0.3173, + "step": 6334 + }, + { + "epoch": 0.4, + "grad_norm": 4.025450958864549, + "learning_rate": 6.759498250259503e-07, + "loss": 0.2282, + "step": 6335 + }, + { + "epoch": 0.4, + "grad_norm": 0.622090011675234, + "learning_rate": 6.75853152987347e-07, + "loss": 0.2927, + "step": 6336 + }, + { + "epoch": 0.4, + "grad_norm": 0.5802494290831592, + "learning_rate": 6.757564734464948e-07, + "loss": 0.1875, + "step": 6337 + }, + { + "epoch": 0.4, + "grad_norm": 0.4361417755347666, + "learning_rate": 6.756597864075181e-07, + "loss": 0.05, + "step": 6338 + }, + { + "epoch": 0.4, + "grad_norm": 2.166029941431076, + "learning_rate": 6.755630918745417e-07, + "loss": 0.166, + "step": 6339 + }, + { + "epoch": 0.4, + "grad_norm": 0.7587754413586166, + "learning_rate": 6.754663898516909e-07, + "loss": 0.2583, + "step": 6340 + }, + { + "epoch": 0.4, + "grad_norm": 0.35510815215948655, + "learning_rate": 6.753696803430913e-07, + "loss": 0.1415, + "step": 6341 + }, + { + "epoch": 0.4, + "grad_norm": 0.942727015461539, + "learning_rate": 6.752729633528684e-07, + "loss": 0.296, + "step": 6342 + }, + { + "epoch": 0.4, + "grad_norm": 0.9292487618931184, + "learning_rate": 6.751762388851486e-07, + "loss": 0.1719, + "step": 6343 + }, + { + "epoch": 0.4, + "grad_norm": 0.5921404678657156, + "learning_rate": 6.750795069440582e-07, + "loss": 0.0233, + "step": 6344 + }, + { + "epoch": 0.4, + "grad_norm": 1.063019208120071, + "learning_rate": 6.749827675337243e-07, + "loss": 0.1282, + "step": 6345 + }, + { + "epoch": 0.4, + "grad_norm": 0.6059948271698828, + "learning_rate": 6.748860206582738e-07, + "loss": 0.3899, + "step": 6346 + }, + { + "epoch": 0.4, + "grad_norm": 2.819963819050505, + "learning_rate": 6.747892663218339e-07, + "loss": 0.0624, + "step": 6347 + }, + { + "epoch": 0.4, + "grad_norm": 1.087721441077157, + "learning_rate": 6.746925045285327e-07, + "loss": 0.0145, + "step": 6348 + }, + { + "epoch": 0.4, + "grad_norm": 0.1544841294131863, + "learning_rate": 6.745957352824979e-07, + "loss": 0.0223, + "step": 6349 + }, + { + "epoch": 0.4, + "grad_norm": 0.9026185574747579, + "learning_rate": 6.744989585878582e-07, + "loss": 0.0029, + "step": 6350 + }, + { + "epoch": 0.41, + "grad_norm": 2.920157172626653, + "learning_rate": 6.744021744487422e-07, + "loss": 0.3463, + "step": 6351 + }, + { + "epoch": 0.41, + "grad_norm": 1.8754340127993847, + "learning_rate": 6.743053828692787e-07, + "loss": 0.2398, + "step": 6352 + }, + { + "epoch": 0.41, + "grad_norm": 1.125188320794302, + "learning_rate": 6.742085838535972e-07, + "loss": 0.2402, + "step": 6353 + }, + { + "epoch": 0.41, + "grad_norm": 0.09075343192105742, + "learning_rate": 6.741117774058271e-07, + "loss": 0.0033, + "step": 6354 + }, + { + "epoch": 0.41, + "grad_norm": 0.591763171934437, + "learning_rate": 6.740149635300989e-07, + "loss": 0.3425, + "step": 6355 + }, + { + "epoch": 0.41, + "grad_norm": 0.9332673410017857, + "learning_rate": 6.739181422305424e-07, + "loss": 0.3002, + "step": 6356 + }, + { + "epoch": 0.41, + "grad_norm": 0.6561116635671101, + "learning_rate": 6.738213135112884e-07, + "loss": 0.1401, + "step": 6357 + }, + { + "epoch": 0.41, + "grad_norm": 0.5671755029146652, + "learning_rate": 6.737244773764677e-07, + "loss": 0.2599, + "step": 6358 + }, + { + "epoch": 0.41, + "grad_norm": 0.9630947590414323, + "learning_rate": 6.736276338302115e-07, + "loss": 0.2172, + "step": 6359 + }, + { + "epoch": 0.41, + "grad_norm": 1.1197194309821634, + "learning_rate": 6.735307828766514e-07, + "loss": 0.1828, + "step": 6360 + }, + { + "epoch": 0.41, + "grad_norm": 0.4267255933308179, + "learning_rate": 6.734339245199194e-07, + "loss": 0.2289, + "step": 6361 + }, + { + "epoch": 0.41, + "grad_norm": 0.061760439965799295, + "learning_rate": 6.733370587641473e-07, + "loss": 0.0005, + "step": 6362 + }, + { + "epoch": 0.41, + "grad_norm": 1.0401649801124897, + "learning_rate": 6.732401856134681e-07, + "loss": 0.2416, + "step": 6363 + }, + { + "epoch": 0.41, + "grad_norm": 1.1427021284904402, + "learning_rate": 6.731433050720143e-07, + "loss": 0.1625, + "step": 6364 + }, + { + "epoch": 0.41, + "grad_norm": 0.7584669024460573, + "learning_rate": 6.730464171439189e-07, + "loss": 0.5968, + "step": 6365 + }, + { + "epoch": 0.41, + "grad_norm": 0.947389546604894, + "learning_rate": 6.729495218333156e-07, + "loss": 0.3292, + "step": 6366 + }, + { + "epoch": 0.41, + "grad_norm": 1.2671454166673617, + "learning_rate": 6.728526191443379e-07, + "loss": 0.3977, + "step": 6367 + }, + { + "epoch": 0.41, + "grad_norm": 1.2807435233821538, + "learning_rate": 6.7275570908112e-07, + "loss": 0.1983, + "step": 6368 + }, + { + "epoch": 0.41, + "grad_norm": 1.1223775443224318, + "learning_rate": 6.726587916477963e-07, + "loss": 0.2572, + "step": 6369 + }, + { + "epoch": 0.41, + "grad_norm": 0.7796094429732707, + "learning_rate": 6.725618668485015e-07, + "loss": 0.2362, + "step": 6370 + }, + { + "epoch": 0.41, + "grad_norm": 0.8569312131457432, + "learning_rate": 6.724649346873705e-07, + "loss": 0.0895, + "step": 6371 + }, + { + "epoch": 0.41, + "grad_norm": 0.3162810885051341, + "learning_rate": 6.723679951685387e-07, + "loss": 0.1953, + "step": 6372 + }, + { + "epoch": 0.41, + "grad_norm": 1.0160207518359947, + "learning_rate": 6.722710482961418e-07, + "loss": 0.1238, + "step": 6373 + }, + { + "epoch": 0.41, + "grad_norm": 4.239643357815333, + "learning_rate": 6.721740940743155e-07, + "loss": 0.101, + "step": 6374 + }, + { + "epoch": 0.41, + "grad_norm": 0.7511605584213781, + "learning_rate": 6.720771325071965e-07, + "loss": 0.2265, + "step": 6375 + }, + { + "epoch": 0.41, + "grad_norm": 1.875460261507989, + "learning_rate": 6.719801635989209e-07, + "loss": 0.1086, + "step": 6376 + }, + { + "epoch": 0.41, + "grad_norm": 2.21785125615534, + "learning_rate": 6.71883187353626e-07, + "loss": 0.0764, + "step": 6377 + }, + { + "epoch": 0.41, + "grad_norm": 6.68719826755472, + "learning_rate": 6.717862037754486e-07, + "loss": 0.2404, + "step": 6378 + }, + { + "epoch": 0.41, + "grad_norm": 6.571688675166393, + "learning_rate": 6.716892128685264e-07, + "loss": 0.1843, + "step": 6379 + }, + { + "epoch": 0.41, + "grad_norm": 0.6185389766513376, + "learning_rate": 6.715922146369973e-07, + "loss": 0.3114, + "step": 6380 + }, + { + "epoch": 0.41, + "grad_norm": 0.3319873843100709, + "learning_rate": 6.714952090849995e-07, + "loss": 0.1966, + "step": 6381 + }, + { + "epoch": 0.41, + "grad_norm": 0.2561438784387966, + "learning_rate": 6.713981962166712e-07, + "loss": 0.1524, + "step": 6382 + }, + { + "epoch": 0.41, + "grad_norm": 0.28233751339411856, + "learning_rate": 6.713011760361513e-07, + "loss": 0.1737, + "step": 6383 + }, + { + "epoch": 0.41, + "grad_norm": 1.182331344622677, + "learning_rate": 6.712041485475789e-07, + "loss": 0.1968, + "step": 6384 + }, + { + "epoch": 0.41, + "grad_norm": 2.972702489962563, + "learning_rate": 6.711071137550934e-07, + "loss": 0.2662, + "step": 6385 + }, + { + "epoch": 0.41, + "grad_norm": 0.7964588870798529, + "learning_rate": 6.710100716628344e-07, + "loss": 0.0531, + "step": 6386 + }, + { + "epoch": 0.41, + "grad_norm": 1.4359035223098955, + "learning_rate": 6.709130222749419e-07, + "loss": 0.2233, + "step": 6387 + }, + { + "epoch": 0.41, + "grad_norm": 0.8217549713020542, + "learning_rate": 6.708159655955563e-07, + "loss": 0.4644, + "step": 6388 + }, + { + "epoch": 0.41, + "grad_norm": 1.3280261803432758, + "learning_rate": 6.707189016288184e-07, + "loss": 0.4257, + "step": 6389 + }, + { + "epoch": 0.41, + "grad_norm": 1.3041206893016233, + "learning_rate": 6.706218303788688e-07, + "loss": 0.1558, + "step": 6390 + }, + { + "epoch": 0.41, + "grad_norm": 0.7141351038072162, + "learning_rate": 6.70524751849849e-07, + "loss": 0.4258, + "step": 6391 + }, + { + "epoch": 0.41, + "grad_norm": 1.1079707205512248, + "learning_rate": 6.704276660459007e-07, + "loss": 0.3083, + "step": 6392 + }, + { + "epoch": 0.41, + "grad_norm": 1.2512500793703196, + "learning_rate": 6.703305729711652e-07, + "loss": 0.3654, + "step": 6393 + }, + { + "epoch": 0.41, + "grad_norm": 0.8928412086560439, + "learning_rate": 6.702334726297852e-07, + "loss": 0.0603, + "step": 6394 + }, + { + "epoch": 0.41, + "grad_norm": 0.5169301511702493, + "learning_rate": 6.70136365025903e-07, + "loss": 0.1779, + "step": 6395 + }, + { + "epoch": 0.41, + "grad_norm": 0.9494093079091375, + "learning_rate": 6.700392501636616e-07, + "loss": 0.2459, + "step": 6396 + }, + { + "epoch": 0.41, + "grad_norm": 0.3858153397375629, + "learning_rate": 6.699421280472037e-07, + "loss": 0.1034, + "step": 6397 + }, + { + "epoch": 0.41, + "grad_norm": 0.8402655369890025, + "learning_rate": 6.698449986806731e-07, + "loss": 0.0212, + "step": 6398 + }, + { + "epoch": 0.41, + "grad_norm": 0.4053758742325924, + "learning_rate": 6.697478620682136e-07, + "loss": 0.0908, + "step": 6399 + }, + { + "epoch": 0.41, + "grad_norm": 0.6037583497943138, + "learning_rate": 6.696507182139689e-07, + "loss": 0.0792, + "step": 6400 + }, + { + "epoch": 0.41, + "grad_norm": 1.0995171185001902, + "learning_rate": 6.695535671220835e-07, + "loss": 0.2685, + "step": 6401 + }, + { + "epoch": 0.41, + "grad_norm": 0.7153908915965723, + "learning_rate": 6.694564087967022e-07, + "loss": 0.1978, + "step": 6402 + }, + { + "epoch": 0.41, + "grad_norm": 5.480691566042909, + "learning_rate": 6.693592432419697e-07, + "loss": 0.0978, + "step": 6403 + }, + { + "epoch": 0.41, + "grad_norm": 0.3212146592714469, + "learning_rate": 6.692620704620315e-07, + "loss": 0.1111, + "step": 6404 + }, + { + "epoch": 0.41, + "grad_norm": 0.9104411249066503, + "learning_rate": 6.691648904610331e-07, + "loss": 0.223, + "step": 6405 + }, + { + "epoch": 0.41, + "grad_norm": 0.725987599836831, + "learning_rate": 6.690677032431206e-07, + "loss": 0.1841, + "step": 6406 + }, + { + "epoch": 0.41, + "grad_norm": 0.167513301779837, + "learning_rate": 6.689705088124397e-07, + "loss": 0.0675, + "step": 6407 + }, + { + "epoch": 0.41, + "grad_norm": 0.6746871248555427, + "learning_rate": 6.688733071731375e-07, + "loss": 0.2058, + "step": 6408 + }, + { + "epoch": 0.41, + "grad_norm": 3.4429761523137734, + "learning_rate": 6.687760983293605e-07, + "loss": 0.3075, + "step": 6409 + }, + { + "epoch": 0.41, + "grad_norm": 0.8222056307600253, + "learning_rate": 6.686788822852557e-07, + "loss": 0.2678, + "step": 6410 + }, + { + "epoch": 0.41, + "grad_norm": 0.6403125674895418, + "learning_rate": 6.685816590449708e-07, + "loss": 0.1277, + "step": 6411 + }, + { + "epoch": 0.41, + "grad_norm": 6.387080088376487, + "learning_rate": 6.684844286126534e-07, + "loss": 0.175, + "step": 6412 + }, + { + "epoch": 0.41, + "grad_norm": 0.8035832009531326, + "learning_rate": 6.683871909924516e-07, + "loss": 0.164, + "step": 6413 + }, + { + "epoch": 0.41, + "grad_norm": 0.7419355566774356, + "learning_rate": 6.682899461885136e-07, + "loss": 0.3492, + "step": 6414 + }, + { + "epoch": 0.41, + "grad_norm": 0.7583044093642305, + "learning_rate": 6.681926942049882e-07, + "loss": 0.1684, + "step": 6415 + }, + { + "epoch": 0.41, + "grad_norm": 1.540524158474532, + "learning_rate": 6.680954350460247e-07, + "loss": 0.2001, + "step": 6416 + }, + { + "epoch": 0.41, + "grad_norm": 1.3589512671052704, + "learning_rate": 6.679981687157717e-07, + "loss": 0.1495, + "step": 6417 + }, + { + "epoch": 0.41, + "grad_norm": 0.6716513360157987, + "learning_rate": 6.679008952183791e-07, + "loss": 0.2796, + "step": 6418 + }, + { + "epoch": 0.41, + "grad_norm": 0.2655350732043996, + "learning_rate": 6.67803614557997e-07, + "loss": 0.1029, + "step": 6419 + }, + { + "epoch": 0.41, + "grad_norm": 0.7685597814453371, + "learning_rate": 6.677063267387752e-07, + "loss": 0.078, + "step": 6420 + }, + { + "epoch": 0.41, + "grad_norm": 0.37207951166087705, + "learning_rate": 6.676090317648645e-07, + "loss": 0.1047, + "step": 6421 + }, + { + "epoch": 0.41, + "grad_norm": 0.6874716827952146, + "learning_rate": 6.675117296404155e-07, + "loss": 0.2572, + "step": 6422 + }, + { + "epoch": 0.41, + "grad_norm": 2.1641264249149006, + "learning_rate": 6.674144203695793e-07, + "loss": 0.0321, + "step": 6423 + }, + { + "epoch": 0.41, + "grad_norm": 0.783936055476026, + "learning_rate": 6.673171039565075e-07, + "loss": 0.3694, + "step": 6424 + }, + { + "epoch": 0.41, + "grad_norm": 8.06002758520689, + "learning_rate": 6.672197804053515e-07, + "loss": 0.3867, + "step": 6425 + }, + { + "epoch": 0.41, + "grad_norm": 0.1685342360811017, + "learning_rate": 6.671224497202636e-07, + "loss": 0.0897, + "step": 6426 + }, + { + "epoch": 0.41, + "grad_norm": 3.2502365801792226, + "learning_rate": 6.670251119053962e-07, + "loss": 0.2772, + "step": 6427 + }, + { + "epoch": 0.41, + "grad_norm": 0.8456925694631241, + "learning_rate": 6.669277669649017e-07, + "loss": 0.2723, + "step": 6428 + }, + { + "epoch": 0.41, + "grad_norm": 0.2186130984426744, + "learning_rate": 6.66830414902933e-07, + "loss": 0.1497, + "step": 6429 + }, + { + "epoch": 0.41, + "grad_norm": 1.672158032240156, + "learning_rate": 6.667330557236435e-07, + "loss": 0.2074, + "step": 6430 + }, + { + "epoch": 0.41, + "grad_norm": 0.8898479913339835, + "learning_rate": 6.666356894311866e-07, + "loss": 0.0757, + "step": 6431 + }, + { + "epoch": 0.41, + "grad_norm": 0.6753965207327566, + "learning_rate": 6.665383160297162e-07, + "loss": 0.2126, + "step": 6432 + }, + { + "epoch": 0.41, + "grad_norm": 0.6426171782885012, + "learning_rate": 6.664409355233867e-07, + "loss": 0.2228, + "step": 6433 + }, + { + "epoch": 0.41, + "grad_norm": 0.9163429384997365, + "learning_rate": 6.66343547916352e-07, + "loss": 0.2849, + "step": 6434 + }, + { + "epoch": 0.41, + "grad_norm": 0.39900204553834123, + "learning_rate": 6.662461532127673e-07, + "loss": 0.133, + "step": 6435 + }, + { + "epoch": 0.41, + "grad_norm": 0.5645169244933328, + "learning_rate": 6.661487514167874e-07, + "loss": 0.2357, + "step": 6436 + }, + { + "epoch": 0.41, + "grad_norm": 1.0206422537663962, + "learning_rate": 6.660513425325679e-07, + "loss": 0.116, + "step": 6437 + }, + { + "epoch": 0.41, + "grad_norm": 3.2466906121473285, + "learning_rate": 6.659539265642642e-07, + "loss": 0.0162, + "step": 6438 + }, + { + "epoch": 0.41, + "grad_norm": 0.30870560313402473, + "learning_rate": 6.658565035160325e-07, + "loss": 0.1962, + "step": 6439 + }, + { + "epoch": 0.41, + "grad_norm": 2.20218703221726, + "learning_rate": 6.657590733920289e-07, + "loss": 0.1525, + "step": 6440 + }, + { + "epoch": 0.41, + "grad_norm": 0.23113188580448643, + "learning_rate": 6.656616361964099e-07, + "loss": 0.0881, + "step": 6441 + }, + { + "epoch": 0.41, + "grad_norm": 0.49620995616366453, + "learning_rate": 6.655641919333325e-07, + "loss": 0.3741, + "step": 6442 + }, + { + "epoch": 0.41, + "grad_norm": 0.5141069284573254, + "learning_rate": 6.654667406069539e-07, + "loss": 0.0724, + "step": 6443 + }, + { + "epoch": 0.41, + "grad_norm": 0.7985520950661176, + "learning_rate": 6.653692822214316e-07, + "loss": 0.3261, + "step": 6444 + }, + { + "epoch": 0.41, + "grad_norm": 0.8930237310932487, + "learning_rate": 6.652718167809232e-07, + "loss": 0.0418, + "step": 6445 + }, + { + "epoch": 0.41, + "grad_norm": 0.8823951379759444, + "learning_rate": 6.65174344289587e-07, + "loss": 0.2082, + "step": 6446 + }, + { + "epoch": 0.41, + "grad_norm": 1.3241973155351798, + "learning_rate": 6.650768647515812e-07, + "loss": 0.0664, + "step": 6447 + }, + { + "epoch": 0.41, + "grad_norm": 0.6709408431913578, + "learning_rate": 6.649793781710644e-07, + "loss": 0.2816, + "step": 6448 + }, + { + "epoch": 0.41, + "grad_norm": 0.7087149931517718, + "learning_rate": 6.648818845521955e-07, + "loss": 0.2196, + "step": 6449 + }, + { + "epoch": 0.41, + "grad_norm": 0.7660689037529811, + "learning_rate": 6.647843838991342e-07, + "loss": 0.115, + "step": 6450 + }, + { + "epoch": 0.41, + "grad_norm": 1.586727923499886, + "learning_rate": 6.646868762160398e-07, + "loss": 0.255, + "step": 6451 + }, + { + "epoch": 0.41, + "grad_norm": 0.2765990842381686, + "learning_rate": 6.645893615070722e-07, + "loss": 0.0083, + "step": 6452 + }, + { + "epoch": 0.41, + "grad_norm": 0.7482934174593467, + "learning_rate": 6.644918397763914e-07, + "loss": 0.2262, + "step": 6453 + }, + { + "epoch": 0.41, + "grad_norm": 1.3505831849518155, + "learning_rate": 6.643943110281583e-07, + "loss": 0.1289, + "step": 6454 + }, + { + "epoch": 0.41, + "grad_norm": 0.5051971188853868, + "learning_rate": 6.642967752665333e-07, + "loss": 0.1454, + "step": 6455 + }, + { + "epoch": 0.41, + "grad_norm": 0.45116601841098775, + "learning_rate": 6.641992324956775e-07, + "loss": 0.1051, + "step": 6456 + }, + { + "epoch": 0.41, + "grad_norm": 9.6467710453751, + "learning_rate": 6.641016827197526e-07, + "loss": 0.1847, + "step": 6457 + }, + { + "epoch": 0.41, + "grad_norm": 0.8112876594939268, + "learning_rate": 6.640041259429199e-07, + "loss": 0.5393, + "step": 6458 + }, + { + "epoch": 0.41, + "grad_norm": 0.8930198854204855, + "learning_rate": 6.639065621693414e-07, + "loss": 0.325, + "step": 6459 + }, + { + "epoch": 0.41, + "grad_norm": 1.7377636794694982, + "learning_rate": 6.638089914031794e-07, + "loss": 0.3848, + "step": 6460 + }, + { + "epoch": 0.41, + "grad_norm": 1.1674386532537577, + "learning_rate": 6.637114136485968e-07, + "loss": 0.298, + "step": 6461 + }, + { + "epoch": 0.41, + "grad_norm": 0.598930227126228, + "learning_rate": 6.636138289097561e-07, + "loss": 0.1786, + "step": 6462 + }, + { + "epoch": 0.41, + "grad_norm": 2.377681268256108, + "learning_rate": 6.635162371908205e-07, + "loss": 0.1953, + "step": 6463 + }, + { + "epoch": 0.41, + "grad_norm": 12.51374160520077, + "learning_rate": 6.634186384959536e-07, + "loss": 0.1956, + "step": 6464 + }, + { + "epoch": 0.41, + "grad_norm": 1.5770264650314494, + "learning_rate": 6.63321032829319e-07, + "loss": 0.1165, + "step": 6465 + }, + { + "epoch": 0.41, + "grad_norm": 1.4165259930628278, + "learning_rate": 6.632234201950808e-07, + "loss": 0.0081, + "step": 6466 + }, + { + "epoch": 0.41, + "grad_norm": 1.5479311939948879, + "learning_rate": 6.631258005974034e-07, + "loss": 0.1396, + "step": 6467 + }, + { + "epoch": 0.41, + "grad_norm": 0.7595715487242493, + "learning_rate": 6.630281740404513e-07, + "loss": 0.3016, + "step": 6468 + }, + { + "epoch": 0.41, + "grad_norm": 0.39108477600863556, + "learning_rate": 6.629305405283897e-07, + "loss": 0.3183, + "step": 6469 + }, + { + "epoch": 0.41, + "grad_norm": 5.949277306145099, + "learning_rate": 6.628329000653837e-07, + "loss": 0.1225, + "step": 6470 + }, + { + "epoch": 0.41, + "grad_norm": 0.7967134536685371, + "learning_rate": 6.62735252655599e-07, + "loss": 0.2556, + "step": 6471 + }, + { + "epoch": 0.41, + "grad_norm": 1.3221524500716266, + "learning_rate": 6.62637598303201e-07, + "loss": 0.4512, + "step": 6472 + }, + { + "epoch": 0.41, + "grad_norm": 5.538877138746729, + "learning_rate": 6.625399370123562e-07, + "loss": 0.3627, + "step": 6473 + }, + { + "epoch": 0.41, + "grad_norm": 2.466976161261532, + "learning_rate": 6.624422687872311e-07, + "loss": 0.0839, + "step": 6474 + }, + { + "epoch": 0.41, + "grad_norm": 1.0441176222594615, + "learning_rate": 6.623445936319922e-07, + "loss": 0.3385, + "step": 6475 + }, + { + "epoch": 0.41, + "grad_norm": 0.9286328959903735, + "learning_rate": 6.622469115508065e-07, + "loss": 0.3544, + "step": 6476 + }, + { + "epoch": 0.41, + "grad_norm": 0.8921096068236626, + "learning_rate": 6.621492225478413e-07, + "loss": 0.217, + "step": 6477 + }, + { + "epoch": 0.41, + "grad_norm": 0.527601776872973, + "learning_rate": 6.620515266272645e-07, + "loss": 0.2838, + "step": 6478 + }, + { + "epoch": 0.41, + "grad_norm": 0.5407391774707727, + "learning_rate": 6.619538237932437e-07, + "loss": 0.0094, + "step": 6479 + }, + { + "epoch": 0.41, + "grad_norm": 3.0682816568887366, + "learning_rate": 6.618561140499472e-07, + "loss": 0.1509, + "step": 6480 + }, + { + "epoch": 0.41, + "grad_norm": 0.767621032382869, + "learning_rate": 6.617583974015436e-07, + "loss": 0.4076, + "step": 6481 + }, + { + "epoch": 0.41, + "grad_norm": 0.8183856655656446, + "learning_rate": 6.616606738522016e-07, + "loss": 0.2044, + "step": 6482 + }, + { + "epoch": 0.41, + "grad_norm": 0.29348998718732544, + "learning_rate": 6.615629434060902e-07, + "loss": 0.0747, + "step": 6483 + }, + { + "epoch": 0.41, + "grad_norm": 0.316376771796467, + "learning_rate": 6.614652060673789e-07, + "loss": 0.2296, + "step": 6484 + }, + { + "epoch": 0.41, + "grad_norm": 0.49404427248597543, + "learning_rate": 6.613674618402373e-07, + "loss": 0.3097, + "step": 6485 + }, + { + "epoch": 0.41, + "grad_norm": 0.635776322474302, + "learning_rate": 6.612697107288352e-07, + "loss": 0.1519, + "step": 6486 + }, + { + "epoch": 0.41, + "grad_norm": 0.4708852073170756, + "learning_rate": 6.611719527373433e-07, + "loss": 0.0105, + "step": 6487 + }, + { + "epoch": 0.41, + "grad_norm": 1.6560381143197433, + "learning_rate": 6.610741878699319e-07, + "loss": 0.2706, + "step": 6488 + }, + { + "epoch": 0.41, + "grad_norm": 0.683823648475233, + "learning_rate": 6.609764161307718e-07, + "loss": 0.345, + "step": 6489 + }, + { + "epoch": 0.41, + "grad_norm": 0.5789976564007079, + "learning_rate": 6.608786375240342e-07, + "loss": 0.2959, + "step": 6490 + }, + { + "epoch": 0.41, + "grad_norm": 0.47281055515844767, + "learning_rate": 6.607808520538904e-07, + "loss": 0.0035, + "step": 6491 + }, + { + "epoch": 0.41, + "grad_norm": 1.0032031279807376, + "learning_rate": 6.606830597245123e-07, + "loss": 0.1048, + "step": 6492 + }, + { + "epoch": 0.41, + "grad_norm": 0.6261579156018745, + "learning_rate": 6.605852605400719e-07, + "loss": 0.1656, + "step": 6493 + }, + { + "epoch": 0.41, + "grad_norm": 0.732854958715936, + "learning_rate": 6.604874545047414e-07, + "loss": 0.2638, + "step": 6494 + }, + { + "epoch": 0.41, + "grad_norm": 3.780507289556038, + "learning_rate": 6.603896416226935e-07, + "loss": 0.4101, + "step": 6495 + }, + { + "epoch": 0.41, + "grad_norm": 1.3972146105870629, + "learning_rate": 6.60291821898101e-07, + "loss": 0.2275, + "step": 6496 + }, + { + "epoch": 0.41, + "grad_norm": 0.9831244538642729, + "learning_rate": 6.601939953351373e-07, + "loss": 0.3621, + "step": 6497 + }, + { + "epoch": 0.41, + "grad_norm": 0.9525721240066384, + "learning_rate": 6.600961619379757e-07, + "loss": 0.1266, + "step": 6498 + }, + { + "epoch": 0.41, + "grad_norm": 2.801474755548807, + "learning_rate": 6.599983217107899e-07, + "loss": 0.0536, + "step": 6499 + }, + { + "epoch": 0.41, + "grad_norm": 0.9934987509011964, + "learning_rate": 6.599004746577541e-07, + "loss": 0.1469, + "step": 6500 + }, + { + "epoch": 0.41, + "grad_norm": 4.473780673185841, + "learning_rate": 6.598026207830427e-07, + "loss": 0.206, + "step": 6501 + }, + { + "epoch": 0.41, + "grad_norm": 0.5678875570797841, + "learning_rate": 6.597047600908301e-07, + "loss": 0.1189, + "step": 6502 + }, + { + "epoch": 0.41, + "grad_norm": 0.981066779593897, + "learning_rate": 6.596068925852915e-07, + "loss": 0.2041, + "step": 6503 + }, + { + "epoch": 0.41, + "grad_norm": 1.9243978928505816, + "learning_rate": 6.59509018270602e-07, + "loss": 0.303, + "step": 6504 + }, + { + "epoch": 0.41, + "grad_norm": 0.6919267581567077, + "learning_rate": 6.594111371509371e-07, + "loss": 0.2919, + "step": 6505 + }, + { + "epoch": 0.41, + "grad_norm": 1.7933123534219113, + "learning_rate": 6.593132492304727e-07, + "loss": 0.3159, + "step": 6506 + }, + { + "epoch": 0.41, + "grad_norm": 2.729456866562622, + "learning_rate": 6.592153545133847e-07, + "loss": 0.0478, + "step": 6507 + }, + { + "epoch": 0.42, + "grad_norm": 1.3389561586035512, + "learning_rate": 6.591174530038497e-07, + "loss": 0.163, + "step": 6508 + }, + { + "epoch": 0.42, + "grad_norm": 0.3966142880288903, + "learning_rate": 6.590195447060442e-07, + "loss": 0.1249, + "step": 6509 + }, + { + "epoch": 0.42, + "grad_norm": 0.47325938802164785, + "learning_rate": 6.589216296241454e-07, + "loss": 0.1079, + "step": 6510 + }, + { + "epoch": 0.42, + "grad_norm": 0.38768715086358946, + "learning_rate": 6.588237077623305e-07, + "loss": 0.1644, + "step": 6511 + }, + { + "epoch": 0.42, + "grad_norm": 0.9449971313198207, + "learning_rate": 6.587257791247767e-07, + "loss": 0.3849, + "step": 6512 + }, + { + "epoch": 0.42, + "grad_norm": 0.4173316413548671, + "learning_rate": 6.586278437156621e-07, + "loss": 0.1504, + "step": 6513 + }, + { + "epoch": 0.42, + "grad_norm": 0.7588325020050893, + "learning_rate": 6.585299015391648e-07, + "loss": 0.3681, + "step": 6514 + }, + { + "epoch": 0.42, + "grad_norm": 7.643512989101004, + "learning_rate": 6.584319525994633e-07, + "loss": 0.1995, + "step": 6515 + }, + { + "epoch": 0.42, + "grad_norm": 6.448581609866346, + "learning_rate": 6.583339969007363e-07, + "loss": 0.1167, + "step": 6516 + }, + { + "epoch": 0.42, + "grad_norm": 7.179579918401583, + "learning_rate": 6.582360344471626e-07, + "loss": 0.03, + "step": 6517 + }, + { + "epoch": 0.42, + "grad_norm": 0.9476712906991313, + "learning_rate": 6.581380652429215e-07, + "loss": 0.0128, + "step": 6518 + }, + { + "epoch": 0.42, + "grad_norm": 0.5618076951985692, + "learning_rate": 6.580400892921928e-07, + "loss": 0.0394, + "step": 6519 + }, + { + "epoch": 0.42, + "grad_norm": 0.7208144385368301, + "learning_rate": 6.579421065991562e-07, + "loss": 0.1988, + "step": 6520 + }, + { + "epoch": 0.42, + "grad_norm": 0.6700607999321094, + "learning_rate": 6.578441171679916e-07, + "loss": 0.1638, + "step": 6521 + }, + { + "epoch": 0.42, + "grad_norm": 0.6036084279517975, + "learning_rate": 6.577461210028798e-07, + "loss": 0.104, + "step": 6522 + }, + { + "epoch": 0.42, + "grad_norm": 1.2579511343596985, + "learning_rate": 6.576481181080014e-07, + "loss": 0.2006, + "step": 6523 + }, + { + "epoch": 0.42, + "grad_norm": 5.995596720419675, + "learning_rate": 6.575501084875373e-07, + "loss": 0.0842, + "step": 6524 + }, + { + "epoch": 0.42, + "grad_norm": 0.7921197772652121, + "learning_rate": 6.574520921456687e-07, + "loss": 0.1853, + "step": 6525 + }, + { + "epoch": 0.42, + "grad_norm": 0.8754186202166979, + "learning_rate": 6.573540690865777e-07, + "loss": 0.2886, + "step": 6526 + }, + { + "epoch": 0.42, + "grad_norm": 0.6026451594831287, + "learning_rate": 6.572560393144456e-07, + "loss": 0.1946, + "step": 6527 + }, + { + "epoch": 0.42, + "grad_norm": 0.9409002386965092, + "learning_rate": 6.571580028334546e-07, + "loss": 0.298, + "step": 6528 + }, + { + "epoch": 0.42, + "grad_norm": 0.6088540029934685, + "learning_rate": 6.570599596477874e-07, + "loss": 0.0904, + "step": 6529 + }, + { + "epoch": 0.42, + "grad_norm": 0.4002230965927323, + "learning_rate": 6.569619097616268e-07, + "loss": 0.0855, + "step": 6530 + }, + { + "epoch": 0.42, + "grad_norm": 0.3362088504980511, + "learning_rate": 6.568638531791554e-07, + "loss": 0.0082, + "step": 6531 + }, + { + "epoch": 0.42, + "grad_norm": 1.3475426728193038, + "learning_rate": 6.567657899045566e-07, + "loss": 0.0606, + "step": 6532 + }, + { + "epoch": 0.42, + "grad_norm": 5.105330272258604, + "learning_rate": 6.566677199420142e-07, + "loss": 0.3037, + "step": 6533 + }, + { + "epoch": 0.42, + "grad_norm": 1.7076940168194243, + "learning_rate": 6.565696432957119e-07, + "loss": 0.2003, + "step": 6534 + }, + { + "epoch": 0.42, + "grad_norm": 0.9592712351857866, + "learning_rate": 6.564715599698338e-07, + "loss": 0.154, + "step": 6535 + }, + { + "epoch": 0.42, + "grad_norm": 1.0719706917957044, + "learning_rate": 6.563734699685646e-07, + "loss": 0.3982, + "step": 6536 + }, + { + "epoch": 0.42, + "grad_norm": 0.5285085284835996, + "learning_rate": 6.562753732960886e-07, + "loss": 0.2912, + "step": 6537 + }, + { + "epoch": 0.42, + "grad_norm": 0.709800823003546, + "learning_rate": 6.56177269956591e-07, + "loss": 0.3394, + "step": 6538 + }, + { + "epoch": 0.42, + "grad_norm": 0.8931223092415999, + "learning_rate": 6.560791599542572e-07, + "loss": 0.1268, + "step": 6539 + }, + { + "epoch": 0.42, + "grad_norm": 9.20830405696897, + "learning_rate": 6.559810432932727e-07, + "loss": 0.1853, + "step": 6540 + }, + { + "epoch": 0.42, + "grad_norm": 0.6305902483585847, + "learning_rate": 6.558829199778233e-07, + "loss": 0.2657, + "step": 6541 + }, + { + "epoch": 0.42, + "grad_norm": 0.7724616563658449, + "learning_rate": 6.557847900120952e-07, + "loss": 0.2717, + "step": 6542 + }, + { + "epoch": 0.42, + "grad_norm": 0.4791742121706651, + "learning_rate": 6.55686653400275e-07, + "loss": 0.134, + "step": 6543 + }, + { + "epoch": 0.42, + "grad_norm": 0.9442269367318035, + "learning_rate": 6.555885101465489e-07, + "loss": 0.2108, + "step": 6544 + }, + { + "epoch": 0.42, + "grad_norm": 0.4039040723449807, + "learning_rate": 6.554903602551043e-07, + "loss": 0.1719, + "step": 6545 + }, + { + "epoch": 0.42, + "grad_norm": 1.3915708401672107, + "learning_rate": 6.553922037301283e-07, + "loss": 0.3847, + "step": 6546 + }, + { + "epoch": 0.42, + "grad_norm": 0.38248881133172413, + "learning_rate": 6.552940405758084e-07, + "loss": 0.0838, + "step": 6547 + }, + { + "epoch": 0.42, + "grad_norm": 2.5619254010366976, + "learning_rate": 6.551958707963328e-07, + "loss": 0.226, + "step": 6548 + }, + { + "epoch": 0.42, + "grad_norm": 1.2534025099978723, + "learning_rate": 6.550976943958891e-07, + "loss": 0.1677, + "step": 6549 + }, + { + "epoch": 0.42, + "grad_norm": 0.831115820788772, + "learning_rate": 6.549995113786662e-07, + "loss": 0.1906, + "step": 6550 + }, + { + "epoch": 0.42, + "grad_norm": 7.0313492870443195, + "learning_rate": 6.549013217488525e-07, + "loss": 0.2437, + "step": 6551 + }, + { + "epoch": 0.42, + "grad_norm": 2.123844449278096, + "learning_rate": 6.54803125510637e-07, + "loss": 0.1735, + "step": 6552 + }, + { + "epoch": 0.42, + "grad_norm": 0.9805224328146542, + "learning_rate": 6.547049226682089e-07, + "loss": 0.2336, + "step": 6553 + }, + { + "epoch": 0.42, + "grad_norm": 0.6122253035320749, + "learning_rate": 6.546067132257579e-07, + "loss": 0.1658, + "step": 6554 + }, + { + "epoch": 0.42, + "grad_norm": 0.9216797554993842, + "learning_rate": 6.545084971874736e-07, + "loss": 0.2078, + "step": 6555 + }, + { + "epoch": 0.42, + "grad_norm": 0.6379449880822752, + "learning_rate": 6.544102745575463e-07, + "loss": 0.2793, + "step": 6556 + }, + { + "epoch": 0.42, + "grad_norm": 2.3316942329819206, + "learning_rate": 6.543120453401664e-07, + "loss": 0.0257, + "step": 6557 + }, + { + "epoch": 0.42, + "grad_norm": 0.3965400725147025, + "learning_rate": 6.542138095395243e-07, + "loss": 0.0984, + "step": 6558 + }, + { + "epoch": 0.42, + "grad_norm": 1.2872385639535027, + "learning_rate": 6.541155671598111e-07, + "loss": 0.248, + "step": 6559 + }, + { + "epoch": 0.42, + "grad_norm": 4.246088391524185, + "learning_rate": 6.54017318205218e-07, + "loss": 0.1558, + "step": 6560 + }, + { + "epoch": 0.42, + "grad_norm": 0.6877516861733453, + "learning_rate": 6.539190626799364e-07, + "loss": 0.0134, + "step": 6561 + }, + { + "epoch": 0.42, + "grad_norm": 0.443626408272202, + "learning_rate": 6.538208005881583e-07, + "loss": 0.1173, + "step": 6562 + }, + { + "epoch": 0.42, + "grad_norm": 1.273995148271526, + "learning_rate": 6.537225319340757e-07, + "loss": 0.197, + "step": 6563 + }, + { + "epoch": 0.42, + "grad_norm": 0.5114725362639235, + "learning_rate": 6.536242567218807e-07, + "loss": 0.2292, + "step": 6564 + }, + { + "epoch": 0.42, + "grad_norm": 2.243953362163476, + "learning_rate": 6.53525974955766e-07, + "loss": 0.0133, + "step": 6565 + }, + { + "epoch": 0.42, + "grad_norm": 0.5488081363315258, + "learning_rate": 6.534276866399247e-07, + "loss": 0.0538, + "step": 6566 + }, + { + "epoch": 0.42, + "grad_norm": 0.9886178392048715, + "learning_rate": 6.5332939177855e-07, + "loss": 0.2797, + "step": 6567 + }, + { + "epoch": 0.42, + "grad_norm": 0.8601829829661732, + "learning_rate": 6.53231090375835e-07, + "loss": 0.3236, + "step": 6568 + }, + { + "epoch": 0.42, + "grad_norm": 0.5067175979204344, + "learning_rate": 6.531327824359738e-07, + "loss": 0.2217, + "step": 6569 + }, + { + "epoch": 0.42, + "grad_norm": 0.75280071450143, + "learning_rate": 6.530344679631602e-07, + "loss": 0.2105, + "step": 6570 + }, + { + "epoch": 0.42, + "grad_norm": 0.9963648393536968, + "learning_rate": 6.529361469615887e-07, + "loss": 0.2076, + "step": 6571 + }, + { + "epoch": 0.42, + "grad_norm": 1.120861879069957, + "learning_rate": 6.528378194354536e-07, + "loss": 0.1174, + "step": 6572 + }, + { + "epoch": 0.42, + "grad_norm": 0.4431707215135499, + "learning_rate": 6.527394853889499e-07, + "loss": 0.0998, + "step": 6573 + }, + { + "epoch": 0.42, + "grad_norm": 0.09461529127731924, + "learning_rate": 6.526411448262726e-07, + "loss": 0.005, + "step": 6574 + }, + { + "epoch": 0.42, + "grad_norm": 0.8224445636242497, + "learning_rate": 6.525427977516173e-07, + "loss": 0.4131, + "step": 6575 + }, + { + "epoch": 0.42, + "grad_norm": 0.6901181559297348, + "learning_rate": 6.524444441691795e-07, + "loss": 0.3649, + "step": 6576 + }, + { + "epoch": 0.42, + "grad_norm": 0.35761199002053246, + "learning_rate": 6.523460840831554e-07, + "loss": 0.2469, + "step": 6577 + }, + { + "epoch": 0.42, + "grad_norm": 1.5334737367261122, + "learning_rate": 6.522477174977411e-07, + "loss": 0.0843, + "step": 6578 + }, + { + "epoch": 0.42, + "grad_norm": 0.5179009731488441, + "learning_rate": 6.52149344417133e-07, + "loss": 0.0151, + "step": 6579 + }, + { + "epoch": 0.42, + "grad_norm": 0.6488619418859829, + "learning_rate": 6.520509648455282e-07, + "loss": 0.2846, + "step": 6580 + }, + { + "epoch": 0.42, + "grad_norm": 0.3682507192636819, + "learning_rate": 6.519525787871234e-07, + "loss": 0.095, + "step": 6581 + }, + { + "epoch": 0.42, + "grad_norm": 1.1212866152391388, + "learning_rate": 6.518541862461162e-07, + "loss": 0.0726, + "step": 6582 + }, + { + "epoch": 0.42, + "grad_norm": 8.54516622159933, + "learning_rate": 6.517557872267041e-07, + "loss": 0.1133, + "step": 6583 + }, + { + "epoch": 0.42, + "grad_norm": 0.6169215072686566, + "learning_rate": 6.516573817330851e-07, + "loss": 0.1888, + "step": 6584 + }, + { + "epoch": 0.42, + "grad_norm": 0.6833676158879121, + "learning_rate": 6.515589697694574e-07, + "loss": 0.1507, + "step": 6585 + }, + { + "epoch": 0.42, + "grad_norm": 0.5188710826658951, + "learning_rate": 6.514605513400193e-07, + "loss": 0.4211, + "step": 6586 + }, + { + "epoch": 0.42, + "grad_norm": 0.3199304249283735, + "learning_rate": 6.513621264489696e-07, + "loss": 0.2161, + "step": 6587 + }, + { + "epoch": 0.42, + "grad_norm": 0.9024744484290665, + "learning_rate": 6.512636951005073e-07, + "loss": 0.2223, + "step": 6588 + }, + { + "epoch": 0.42, + "grad_norm": 1.2358497207831372, + "learning_rate": 6.511652572988316e-07, + "loss": 0.2116, + "step": 6589 + }, + { + "epoch": 0.42, + "grad_norm": 0.790714556211648, + "learning_rate": 6.510668130481423e-07, + "loss": 0.2035, + "step": 6590 + }, + { + "epoch": 0.42, + "grad_norm": 0.5837610043799196, + "learning_rate": 6.50968362352639e-07, + "loss": 0.2612, + "step": 6591 + }, + { + "epoch": 0.42, + "grad_norm": 2.1322110048786116, + "learning_rate": 6.508699052165218e-07, + "loss": 0.0731, + "step": 6592 + }, + { + "epoch": 0.42, + "grad_norm": 0.4869642528488591, + "learning_rate": 6.507714416439914e-07, + "loss": 0.1773, + "step": 6593 + }, + { + "epoch": 0.42, + "grad_norm": 0.46931346360923487, + "learning_rate": 6.506729716392479e-07, + "loss": 0.1304, + "step": 6594 + }, + { + "epoch": 0.42, + "grad_norm": 0.6495464461000598, + "learning_rate": 6.505744952064927e-07, + "loss": 0.211, + "step": 6595 + }, + { + "epoch": 0.42, + "grad_norm": 0.6816189436042995, + "learning_rate": 6.504760123499269e-07, + "loss": 0.2026, + "step": 6596 + }, + { + "epoch": 0.42, + "grad_norm": 0.5359782743913805, + "learning_rate": 6.503775230737518e-07, + "loss": 0.1804, + "step": 6597 + }, + { + "epoch": 0.42, + "grad_norm": 0.6466741031098954, + "learning_rate": 6.502790273821694e-07, + "loss": 0.2872, + "step": 6598 + }, + { + "epoch": 0.42, + "grad_norm": 3.059231198735174, + "learning_rate": 6.501805252793817e-07, + "loss": 0.2318, + "step": 6599 + }, + { + "epoch": 0.42, + "grad_norm": 0.1495449829824636, + "learning_rate": 6.500820167695905e-07, + "loss": 0.0118, + "step": 6600 + }, + { + "epoch": 0.42, + "grad_norm": 0.6556357119339661, + "learning_rate": 6.49983501856999e-07, + "loss": 0.2049, + "step": 6601 + }, + { + "epoch": 0.42, + "grad_norm": 0.19084012033555592, + "learning_rate": 6.498849805458098e-07, + "loss": 0.0892, + "step": 6602 + }, + { + "epoch": 0.42, + "grad_norm": 0.7542708776617518, + "learning_rate": 6.49786452840226e-07, + "loss": 0.0684, + "step": 6603 + }, + { + "epoch": 0.42, + "grad_norm": 1.130488531154757, + "learning_rate": 6.496879187444509e-07, + "loss": 0.0361, + "step": 6604 + }, + { + "epoch": 0.42, + "grad_norm": 0.12030908783750106, + "learning_rate": 6.495893782626884e-07, + "loss": 0.0038, + "step": 6605 + }, + { + "epoch": 0.42, + "grad_norm": 4.472643673466841, + "learning_rate": 6.494908313991424e-07, + "loss": 0.027, + "step": 6606 + }, + { + "epoch": 0.42, + "grad_norm": 6.756578264745721, + "learning_rate": 6.493922781580169e-07, + "loss": 0.1752, + "step": 6607 + }, + { + "epoch": 0.42, + "grad_norm": 0.47065007556631094, + "learning_rate": 6.492937185435165e-07, + "loss": 0.0365, + "step": 6608 + }, + { + "epoch": 0.42, + "grad_norm": 0.37773434203172523, + "learning_rate": 6.491951525598461e-07, + "loss": 0.1043, + "step": 6609 + }, + { + "epoch": 0.42, + "grad_norm": 16.170231038508277, + "learning_rate": 6.490965802112103e-07, + "loss": 0.0348, + "step": 6610 + }, + { + "epoch": 0.42, + "grad_norm": 3.2153154323505864, + "learning_rate": 6.489980015018147e-07, + "loss": 0.2384, + "step": 6611 + }, + { + "epoch": 0.42, + "grad_norm": 1.1887446464012499, + "learning_rate": 6.488994164358651e-07, + "loss": 0.4168, + "step": 6612 + }, + { + "epoch": 0.42, + "grad_norm": 0.5097010072238237, + "learning_rate": 6.488008250175669e-07, + "loss": 0.0474, + "step": 6613 + }, + { + "epoch": 0.42, + "grad_norm": 1.7804920096956791, + "learning_rate": 6.487022272511264e-07, + "loss": 0.1254, + "step": 6614 + }, + { + "epoch": 0.42, + "grad_norm": 0.5381319677601095, + "learning_rate": 6.486036231407499e-07, + "loss": 0.0628, + "step": 6615 + }, + { + "epoch": 0.42, + "grad_norm": 0.3956450165002046, + "learning_rate": 6.485050126906442e-07, + "loss": 0.1665, + "step": 6616 + }, + { + "epoch": 0.42, + "grad_norm": 0.6006757979680589, + "learning_rate": 6.48406395905016e-07, + "loss": 0.1678, + "step": 6617 + }, + { + "epoch": 0.42, + "grad_norm": 0.5749528585225581, + "learning_rate": 6.483077727880726e-07, + "loss": 0.2657, + "step": 6618 + }, + { + "epoch": 0.42, + "grad_norm": 2.546013603761911, + "learning_rate": 6.482091433440215e-07, + "loss": 0.1231, + "step": 6619 + }, + { + "epoch": 0.42, + "grad_norm": 0.6563493718514495, + "learning_rate": 6.481105075770705e-07, + "loss": 0.3289, + "step": 6620 + }, + { + "epoch": 0.42, + "grad_norm": 0.3557252399707117, + "learning_rate": 6.480118654914275e-07, + "loss": 0.2553, + "step": 6621 + }, + { + "epoch": 0.42, + "grad_norm": 1.3836554168728923, + "learning_rate": 6.479132170913009e-07, + "loss": 0.2631, + "step": 6622 + }, + { + "epoch": 0.42, + "grad_norm": 0.49305934072685575, + "learning_rate": 6.478145623808988e-07, + "loss": 0.2529, + "step": 6623 + }, + { + "epoch": 0.42, + "grad_norm": 0.9065870549860774, + "learning_rate": 6.477159013644306e-07, + "loss": 0.2566, + "step": 6624 + }, + { + "epoch": 0.42, + "grad_norm": 1.220369398802296, + "learning_rate": 6.476172340461051e-07, + "loss": 0.3382, + "step": 6625 + }, + { + "epoch": 0.42, + "grad_norm": 0.5111977616318292, + "learning_rate": 6.475185604301314e-07, + "loss": 0.1724, + "step": 6626 + }, + { + "epoch": 0.42, + "grad_norm": 0.4111482409555583, + "learning_rate": 6.474198805207196e-07, + "loss": 0.1109, + "step": 6627 + }, + { + "epoch": 0.42, + "grad_norm": 15.147696513130137, + "learning_rate": 6.473211943220792e-07, + "loss": 0.1227, + "step": 6628 + }, + { + "epoch": 0.42, + "grad_norm": 0.9306742720330724, + "learning_rate": 6.472225018384205e-07, + "loss": 0.2259, + "step": 6629 + }, + { + "epoch": 0.42, + "grad_norm": 0.26695730537907403, + "learning_rate": 6.471238030739541e-07, + "loss": 0.191, + "step": 6630 + }, + { + "epoch": 0.42, + "grad_norm": 1.1497587790002861, + "learning_rate": 6.470250980328903e-07, + "loss": 0.3335, + "step": 6631 + }, + { + "epoch": 0.42, + "grad_norm": 0.5381242770023748, + "learning_rate": 6.469263867194404e-07, + "loss": 0.0837, + "step": 6632 + }, + { + "epoch": 0.42, + "grad_norm": 1.1718839718459892, + "learning_rate": 6.468276691378154e-07, + "loss": 0.2356, + "step": 6633 + }, + { + "epoch": 0.42, + "grad_norm": 1.0734400553710504, + "learning_rate": 6.467289452922268e-07, + "loss": 0.229, + "step": 6634 + }, + { + "epoch": 0.42, + "grad_norm": 1.4721460541689104, + "learning_rate": 6.466302151868865e-07, + "loss": 0.3775, + "step": 6635 + }, + { + "epoch": 0.42, + "grad_norm": 0.47354152135182404, + "learning_rate": 6.465314788260065e-07, + "loss": 0.3041, + "step": 6636 + }, + { + "epoch": 0.42, + "grad_norm": 1.213183190688514, + "learning_rate": 6.46432736213799e-07, + "loss": 0.0726, + "step": 6637 + }, + { + "epoch": 0.42, + "grad_norm": 4.691683528170075, + "learning_rate": 6.463339873544766e-07, + "loss": 0.0671, + "step": 6638 + }, + { + "epoch": 0.42, + "grad_norm": 0.7382855304101535, + "learning_rate": 6.462352322522523e-07, + "loss": 0.3539, + "step": 6639 + }, + { + "epoch": 0.42, + "grad_norm": 2.0412966731957, + "learning_rate": 6.461364709113389e-07, + "loss": 0.1656, + "step": 6640 + }, + { + "epoch": 0.42, + "grad_norm": 0.9203636723708595, + "learning_rate": 6.460377033359499e-07, + "loss": 0.2078, + "step": 6641 + }, + { + "epoch": 0.42, + "grad_norm": 0.5885890438419307, + "learning_rate": 6.459389295302989e-07, + "loss": 0.0815, + "step": 6642 + }, + { + "epoch": 0.42, + "grad_norm": 0.7677984394111165, + "learning_rate": 6.458401494985997e-07, + "loss": 0.3011, + "step": 6643 + }, + { + "epoch": 0.42, + "grad_norm": 0.2514408018419238, + "learning_rate": 6.457413632450666e-07, + "loss": 0.0802, + "step": 6644 + }, + { + "epoch": 0.42, + "grad_norm": 1.9141602880990805, + "learning_rate": 6.45642570773914e-07, + "loss": 0.2407, + "step": 6645 + }, + { + "epoch": 0.42, + "grad_norm": 1.0567396478833757, + "learning_rate": 6.455437720893564e-07, + "loss": 0.3146, + "step": 6646 + }, + { + "epoch": 0.42, + "grad_norm": 0.7719336200227167, + "learning_rate": 6.454449671956091e-07, + "loss": 0.1949, + "step": 6647 + }, + { + "epoch": 0.42, + "grad_norm": 0.799251195275632, + "learning_rate": 6.45346156096887e-07, + "loss": 0.296, + "step": 6648 + }, + { + "epoch": 0.42, + "grad_norm": 5.86904514165989, + "learning_rate": 6.452473387974058e-07, + "loss": 0.368, + "step": 6649 + }, + { + "epoch": 0.42, + "grad_norm": 0.5941697259217461, + "learning_rate": 6.45148515301381e-07, + "loss": 0.2574, + "step": 6650 + }, + { + "epoch": 0.42, + "grad_norm": 1.471621454658878, + "learning_rate": 6.45049685613029e-07, + "loss": 0.1181, + "step": 6651 + }, + { + "epoch": 0.42, + "grad_norm": 0.5072800655176259, + "learning_rate": 6.449508497365656e-07, + "loss": 0.3983, + "step": 6652 + }, + { + "epoch": 0.42, + "grad_norm": 0.44085371993939076, + "learning_rate": 6.448520076762076e-07, + "loss": 0.0789, + "step": 6653 + }, + { + "epoch": 0.42, + "grad_norm": 12.275489586224218, + "learning_rate": 6.447531594361719e-07, + "loss": 0.0508, + "step": 6654 + }, + { + "epoch": 0.42, + "grad_norm": 1.630060685326117, + "learning_rate": 6.446543050206752e-07, + "loss": 0.1589, + "step": 6655 + }, + { + "epoch": 0.42, + "grad_norm": 0.2328741430616827, + "learning_rate": 6.445554444339352e-07, + "loss": 0.0978, + "step": 6656 + }, + { + "epoch": 0.42, + "grad_norm": 0.5372530982254144, + "learning_rate": 6.444565776801693e-07, + "loss": 0.1315, + "step": 6657 + }, + { + "epoch": 0.42, + "grad_norm": 0.7599789551403485, + "learning_rate": 6.443577047635956e-07, + "loss": 0.1224, + "step": 6658 + }, + { + "epoch": 0.42, + "grad_norm": 4.0296851642204246, + "learning_rate": 6.442588256884318e-07, + "loss": 0.2358, + "step": 6659 + }, + { + "epoch": 0.42, + "grad_norm": 2.108297544744153, + "learning_rate": 6.441599404588966e-07, + "loss": 0.12, + "step": 6660 + }, + { + "epoch": 0.42, + "grad_norm": 0.501292539605557, + "learning_rate": 6.440610490792084e-07, + "loss": 0.2431, + "step": 6661 + }, + { + "epoch": 0.42, + "grad_norm": 0.34171415167564934, + "learning_rate": 6.439621515535863e-07, + "loss": 0.1894, + "step": 6662 + }, + { + "epoch": 0.42, + "grad_norm": 0.682733308349673, + "learning_rate": 6.438632478862494e-07, + "loss": 0.3229, + "step": 6663 + }, + { + "epoch": 0.42, + "grad_norm": 12.390017699026032, + "learning_rate": 6.437643380814171e-07, + "loss": 0.3022, + "step": 6664 + }, + { + "epoch": 0.43, + "grad_norm": 1.8681554634213695, + "learning_rate": 6.436654221433093e-07, + "loss": 0.0442, + "step": 6665 + }, + { + "epoch": 0.43, + "grad_norm": 0.541158370693776, + "learning_rate": 6.435665000761458e-07, + "loss": 0.2862, + "step": 6666 + }, + { + "epoch": 0.43, + "grad_norm": 0.390095187041583, + "learning_rate": 6.434675718841468e-07, + "loss": 0.2874, + "step": 6667 + }, + { + "epoch": 0.43, + "grad_norm": 1.4536828965465176, + "learning_rate": 6.433686375715327e-07, + "loss": 0.2102, + "step": 6668 + }, + { + "epoch": 0.43, + "grad_norm": 1.1446732248591567, + "learning_rate": 6.432696971425243e-07, + "loss": 0.2566, + "step": 6669 + }, + { + "epoch": 0.43, + "grad_norm": 0.757703886679042, + "learning_rate": 6.431707506013426e-07, + "loss": 0.2267, + "step": 6670 + }, + { + "epoch": 0.43, + "grad_norm": 1.4140671835415346, + "learning_rate": 6.430717979522088e-07, + "loss": 0.2333, + "step": 6671 + }, + { + "epoch": 0.43, + "grad_norm": 0.6067960560052531, + "learning_rate": 6.429728391993445e-07, + "loss": 0.2523, + "step": 6672 + }, + { + "epoch": 0.43, + "grad_norm": 0.7960205313213627, + "learning_rate": 6.428738743469717e-07, + "loss": 0.2754, + "step": 6673 + }, + { + "epoch": 0.43, + "grad_norm": 0.7451427062897477, + "learning_rate": 6.427749033993119e-07, + "loss": 0.228, + "step": 6674 + }, + { + "epoch": 0.43, + "grad_norm": 1.0022757694742077, + "learning_rate": 6.42675926360588e-07, + "loss": 0.4864, + "step": 6675 + }, + { + "epoch": 0.43, + "grad_norm": 3.1426768082927397, + "learning_rate": 6.425769432350221e-07, + "loss": 0.1139, + "step": 6676 + }, + { + "epoch": 0.43, + "grad_norm": 2.124319062949054, + "learning_rate": 6.424779540268372e-07, + "loss": 0.1503, + "step": 6677 + }, + { + "epoch": 0.43, + "grad_norm": 1.2308310821855686, + "learning_rate": 6.423789587402564e-07, + "loss": 0.1021, + "step": 6678 + }, + { + "epoch": 0.43, + "grad_norm": 0.5066897111623873, + "learning_rate": 6.422799573795031e-07, + "loss": 0.3278, + "step": 6679 + }, + { + "epoch": 0.43, + "grad_norm": 0.5059648558799121, + "learning_rate": 6.421809499488006e-07, + "loss": 0.1839, + "step": 6680 + }, + { + "epoch": 0.43, + "grad_norm": 0.27275453426585833, + "learning_rate": 6.420819364523731e-07, + "loss": 0.0828, + "step": 6681 + }, + { + "epoch": 0.43, + "grad_norm": 0.5761906999379414, + "learning_rate": 6.419829168944444e-07, + "loss": 0.1035, + "step": 6682 + }, + { + "epoch": 0.43, + "grad_norm": 0.7912710360378307, + "learning_rate": 6.418838912792393e-07, + "loss": 0.0861, + "step": 6683 + }, + { + "epoch": 0.43, + "grad_norm": 1.572790723941205, + "learning_rate": 6.41784859610982e-07, + "loss": 0.1856, + "step": 6684 + }, + { + "epoch": 0.43, + "grad_norm": 1.2006324218480453, + "learning_rate": 6.416858218938975e-07, + "loss": 0.199, + "step": 6685 + }, + { + "epoch": 0.43, + "grad_norm": 1.181230130596287, + "learning_rate": 6.415867781322112e-07, + "loss": 0.3226, + "step": 6686 + }, + { + "epoch": 0.43, + "grad_norm": 0.9367056371259241, + "learning_rate": 6.414877283301482e-07, + "loss": 0.3043, + "step": 6687 + }, + { + "epoch": 0.43, + "grad_norm": 0.780971038545777, + "learning_rate": 6.413886724919343e-07, + "loss": 0.3924, + "step": 6688 + }, + { + "epoch": 0.43, + "grad_norm": 0.1464454703347422, + "learning_rate": 6.412896106217955e-07, + "loss": 0.0669, + "step": 6689 + }, + { + "epoch": 0.43, + "grad_norm": 0.6590059178969544, + "learning_rate": 6.411905427239577e-07, + "loss": 0.3126, + "step": 6690 + }, + { + "epoch": 0.43, + "grad_norm": 0.5853783605081505, + "learning_rate": 6.410914688026475e-07, + "loss": 0.2109, + "step": 6691 + }, + { + "epoch": 0.43, + "grad_norm": 1.1179016763262029, + "learning_rate": 6.409923888620918e-07, + "loss": 0.2518, + "step": 6692 + }, + { + "epoch": 0.43, + "grad_norm": 13.581686640631824, + "learning_rate": 6.408933029065173e-07, + "loss": 0.2171, + "step": 6693 + }, + { + "epoch": 0.43, + "grad_norm": 0.8759997525338321, + "learning_rate": 6.407942109401514e-07, + "loss": 0.141, + "step": 6694 + }, + { + "epoch": 0.43, + "grad_norm": 0.5080766936981634, + "learning_rate": 6.406951129672212e-07, + "loss": 0.2806, + "step": 6695 + }, + { + "epoch": 0.43, + "grad_norm": 1.0399523992915547, + "learning_rate": 6.405960089919548e-07, + "loss": 0.0772, + "step": 6696 + }, + { + "epoch": 0.43, + "grad_norm": 0.6478695764754306, + "learning_rate": 6.404968990185799e-07, + "loss": 0.0619, + "step": 6697 + }, + { + "epoch": 0.43, + "grad_norm": 7.273136086090483, + "learning_rate": 6.403977830513248e-07, + "loss": 0.2038, + "step": 6698 + }, + { + "epoch": 0.43, + "grad_norm": 0.7704261948601646, + "learning_rate": 6.402986610944182e-07, + "loss": 0.2672, + "step": 6699 + }, + { + "epoch": 0.43, + "grad_norm": 1.0132380244586805, + "learning_rate": 6.401995331520886e-07, + "loss": 0.1967, + "step": 6700 + }, + { + "epoch": 0.43, + "grad_norm": 2.9905749152200145, + "learning_rate": 6.401003992285652e-07, + "loss": 0.0433, + "step": 6701 + }, + { + "epoch": 0.43, + "grad_norm": 1.4633532929933657, + "learning_rate": 6.400012593280771e-07, + "loss": 0.1597, + "step": 6702 + }, + { + "epoch": 0.43, + "grad_norm": 1.9126576229653443, + "learning_rate": 6.399021134548537e-07, + "loss": 0.0904, + "step": 6703 + }, + { + "epoch": 0.43, + "grad_norm": 0.7200646969220672, + "learning_rate": 6.39802961613125e-07, + "loss": 0.3084, + "step": 6704 + }, + { + "epoch": 0.43, + "grad_norm": 1.1758572937507459, + "learning_rate": 6.39703803807121e-07, + "loss": 0.0533, + "step": 6705 + }, + { + "epoch": 0.43, + "grad_norm": 1.4498625526282167, + "learning_rate": 6.396046400410718e-07, + "loss": 0.0553, + "step": 6706 + }, + { + "epoch": 0.43, + "grad_norm": 0.6364404529115247, + "learning_rate": 6.39505470319208e-07, + "loss": 0.226, + "step": 6707 + }, + { + "epoch": 0.43, + "grad_norm": 1.7772069739561727, + "learning_rate": 6.394062946457604e-07, + "loss": 0.4006, + "step": 6708 + }, + { + "epoch": 0.43, + "grad_norm": 0.6028916535151102, + "learning_rate": 6.3930711302496e-07, + "loss": 0.3125, + "step": 6709 + }, + { + "epoch": 0.43, + "grad_norm": 0.6758249587863409, + "learning_rate": 6.39207925461038e-07, + "loss": 0.4262, + "step": 6710 + }, + { + "epoch": 0.43, + "grad_norm": 7.127001295949069, + "learning_rate": 6.391087319582263e-07, + "loss": 0.2229, + "step": 6711 + }, + { + "epoch": 0.43, + "grad_norm": 0.3262722172903861, + "learning_rate": 6.390095325207564e-07, + "loss": 0.0041, + "step": 6712 + }, + { + "epoch": 0.43, + "grad_norm": 0.7507862581580848, + "learning_rate": 6.389103271528605e-07, + "loss": 0.145, + "step": 6713 + }, + { + "epoch": 0.43, + "grad_norm": 0.6605094691567882, + "learning_rate": 6.388111158587706e-07, + "loss": 0.2998, + "step": 6714 + }, + { + "epoch": 0.43, + "grad_norm": 3.5563965705038187, + "learning_rate": 6.387118986427195e-07, + "loss": 0.0762, + "step": 6715 + }, + { + "epoch": 0.43, + "grad_norm": 0.8925033905398138, + "learning_rate": 6.386126755089398e-07, + "loss": 0.2854, + "step": 6716 + }, + { + "epoch": 0.43, + "grad_norm": 5.237973136924914, + "learning_rate": 6.385134464616648e-07, + "loss": 0.0446, + "step": 6717 + }, + { + "epoch": 0.43, + "grad_norm": 0.5150703661256893, + "learning_rate": 6.384142115051279e-07, + "loss": 0.1314, + "step": 6718 + }, + { + "epoch": 0.43, + "grad_norm": 0.4835942761407444, + "learning_rate": 6.383149706435625e-07, + "loss": 0.1088, + "step": 6719 + }, + { + "epoch": 0.43, + "grad_norm": 1.6085024289437382, + "learning_rate": 6.382157238812023e-07, + "loss": 0.1894, + "step": 6720 + }, + { + "epoch": 0.43, + "grad_norm": 0.6893796463969709, + "learning_rate": 6.381164712222814e-07, + "loss": 0.1025, + "step": 6721 + }, + { + "epoch": 0.43, + "grad_norm": 0.3454877105562411, + "learning_rate": 6.380172126710344e-07, + "loss": 0.0908, + "step": 6722 + }, + { + "epoch": 0.43, + "grad_norm": 0.4616723698580447, + "learning_rate": 6.379179482316954e-07, + "loss": 0.2442, + "step": 6723 + }, + { + "epoch": 0.43, + "grad_norm": 0.9968537970426457, + "learning_rate": 6.378186779084995e-07, + "loss": 0.3062, + "step": 6724 + }, + { + "epoch": 0.43, + "grad_norm": 1.6244526051281594, + "learning_rate": 6.377194017056819e-07, + "loss": 0.2975, + "step": 6725 + }, + { + "epoch": 0.43, + "grad_norm": 0.7509501104222818, + "learning_rate": 6.376201196274777e-07, + "loss": 0.1483, + "step": 6726 + }, + { + "epoch": 0.43, + "grad_norm": 0.3151371133817729, + "learning_rate": 6.375208316781226e-07, + "loss": 0.1096, + "step": 6727 + }, + { + "epoch": 0.43, + "grad_norm": 0.3383792185360287, + "learning_rate": 6.374215378618523e-07, + "loss": 0.0132, + "step": 6728 + }, + { + "epoch": 0.43, + "grad_norm": 0.6491694219196131, + "learning_rate": 6.373222381829031e-07, + "loss": 0.1896, + "step": 6729 + }, + { + "epoch": 0.43, + "grad_norm": 1.3156566306106663, + "learning_rate": 6.37222932645511e-07, + "loss": 0.2652, + "step": 6730 + }, + { + "epoch": 0.43, + "grad_norm": 0.8979187862266332, + "learning_rate": 6.371236212539129e-07, + "loss": 0.2058, + "step": 6731 + }, + { + "epoch": 0.43, + "grad_norm": 1.278954544093297, + "learning_rate": 6.370243040123452e-07, + "loss": 0.3163, + "step": 6732 + }, + { + "epoch": 0.43, + "grad_norm": 0.6705358804955086, + "learning_rate": 6.369249809250454e-07, + "loss": 0.2729, + "step": 6733 + }, + { + "epoch": 0.43, + "grad_norm": 0.706961289716489, + "learning_rate": 6.368256519962506e-07, + "loss": 0.0283, + "step": 6734 + }, + { + "epoch": 0.43, + "grad_norm": 1.1270196563346673, + "learning_rate": 6.367263172301984e-07, + "loss": 0.043, + "step": 6735 + }, + { + "epoch": 0.43, + "grad_norm": 0.27520589895700176, + "learning_rate": 6.366269766311269e-07, + "loss": 0.1172, + "step": 6736 + }, + { + "epoch": 0.43, + "grad_norm": 1.6130876314593725, + "learning_rate": 6.365276302032737e-07, + "loss": 0.4631, + "step": 6737 + }, + { + "epoch": 0.43, + "grad_norm": 1.9158684740988248, + "learning_rate": 6.364282779508774e-07, + "loss": 0.0629, + "step": 6738 + }, + { + "epoch": 0.43, + "grad_norm": 1.5385877544612692, + "learning_rate": 6.363289198781765e-07, + "loss": 0.1979, + "step": 6739 + }, + { + "epoch": 0.43, + "grad_norm": 2.7091496753358872, + "learning_rate": 6.362295559894099e-07, + "loss": 0.1839, + "step": 6740 + }, + { + "epoch": 0.43, + "grad_norm": 1.9930991041229553, + "learning_rate": 6.361301862888164e-07, + "loss": 0.3691, + "step": 6741 + }, + { + "epoch": 0.43, + "grad_norm": 0.5466540366244256, + "learning_rate": 6.360308107806357e-07, + "loss": 0.1623, + "step": 6742 + }, + { + "epoch": 0.43, + "grad_norm": 1.13058829593568, + "learning_rate": 6.35931429469107e-07, + "loss": 0.266, + "step": 6743 + }, + { + "epoch": 0.43, + "grad_norm": 0.8362409781450588, + "learning_rate": 6.358320423584704e-07, + "loss": 0.3643, + "step": 6744 + }, + { + "epoch": 0.43, + "grad_norm": 0.8712235975431387, + "learning_rate": 6.357326494529657e-07, + "loss": 0.2947, + "step": 6745 + }, + { + "epoch": 0.43, + "grad_norm": 5.790006196204553, + "learning_rate": 6.356332507568333e-07, + "loss": 0.1578, + "step": 6746 + }, + { + "epoch": 0.43, + "grad_norm": 0.5730066080214747, + "learning_rate": 6.355338462743138e-07, + "loss": 0.1548, + "step": 6747 + }, + { + "epoch": 0.43, + "grad_norm": 2.454384903442159, + "learning_rate": 6.35434436009648e-07, + "loss": 0.3031, + "step": 6748 + }, + { + "epoch": 0.43, + "grad_norm": 0.6514055731437256, + "learning_rate": 6.353350199670771e-07, + "loss": 0.1687, + "step": 6749 + }, + { + "epoch": 0.43, + "grad_norm": 0.18309743262848088, + "learning_rate": 6.352355981508419e-07, + "loss": 0.1015, + "step": 6750 + }, + { + "epoch": 0.43, + "grad_norm": 0.8656871711227753, + "learning_rate": 6.351361705651842e-07, + "loss": 0.3523, + "step": 6751 + }, + { + "epoch": 0.43, + "grad_norm": 0.8071333822294513, + "learning_rate": 6.350367372143459e-07, + "loss": 0.261, + "step": 6752 + }, + { + "epoch": 0.43, + "grad_norm": 0.6646761374076501, + "learning_rate": 6.34937298102569e-07, + "loss": 0.2523, + "step": 6753 + }, + { + "epoch": 0.43, + "grad_norm": 0.5172036440857891, + "learning_rate": 6.348378532340957e-07, + "loss": 0.4282, + "step": 6754 + }, + { + "epoch": 0.43, + "grad_norm": 0.8127899949717996, + "learning_rate": 6.347384026131683e-07, + "loss": 0.2521, + "step": 6755 + }, + { + "epoch": 0.43, + "grad_norm": 1.1329997645487928, + "learning_rate": 6.3463894624403e-07, + "loss": 0.2012, + "step": 6756 + }, + { + "epoch": 0.43, + "grad_norm": 4.088470560050196, + "learning_rate": 6.345394841309237e-07, + "loss": 0.0251, + "step": 6757 + }, + { + "epoch": 0.43, + "grad_norm": 0.8617685245678508, + "learning_rate": 6.344400162780923e-07, + "loss": 0.0105, + "step": 6758 + }, + { + "epoch": 0.43, + "grad_norm": 0.5893830867155143, + "learning_rate": 6.343405426897797e-07, + "loss": 0.2246, + "step": 6759 + }, + { + "epoch": 0.43, + "grad_norm": 1.7666749117852034, + "learning_rate": 6.342410633702294e-07, + "loss": 0.1976, + "step": 6760 + }, + { + "epoch": 0.43, + "grad_norm": 0.43331517845402123, + "learning_rate": 6.341415783236854e-07, + "loss": 0.1695, + "step": 6761 + }, + { + "epoch": 0.43, + "grad_norm": 0.9868818836290725, + "learning_rate": 6.340420875543921e-07, + "loss": 0.0934, + "step": 6762 + }, + { + "epoch": 0.43, + "grad_norm": 0.6897306726627255, + "learning_rate": 6.33942591066594e-07, + "loss": 0.2653, + "step": 6763 + }, + { + "epoch": 0.43, + "grad_norm": 4.0389408405008735, + "learning_rate": 6.338430888645356e-07, + "loss": 0.0828, + "step": 6764 + }, + { + "epoch": 0.43, + "grad_norm": 0.5074205963952, + "learning_rate": 6.33743580952462e-07, + "loss": 0.2088, + "step": 6765 + }, + { + "epoch": 0.43, + "grad_norm": 0.47124719780596425, + "learning_rate": 6.336440673346184e-07, + "loss": 0.1126, + "step": 6766 + }, + { + "epoch": 0.43, + "grad_norm": 0.4583817460931665, + "learning_rate": 6.335445480152503e-07, + "loss": 0.1736, + "step": 6767 + }, + { + "epoch": 0.43, + "grad_norm": 0.3462864839528916, + "learning_rate": 6.334450229986031e-07, + "loss": 0.1249, + "step": 6768 + }, + { + "epoch": 0.43, + "grad_norm": 0.36479249709676476, + "learning_rate": 6.33345492288923e-07, + "loss": 0.0074, + "step": 6769 + }, + { + "epoch": 0.43, + "grad_norm": 0.2588922594358886, + "learning_rate": 6.332459558904563e-07, + "loss": 0.1877, + "step": 6770 + }, + { + "epoch": 0.43, + "grad_norm": 0.899914125571852, + "learning_rate": 6.331464138074491e-07, + "loss": 0.3181, + "step": 6771 + }, + { + "epoch": 0.43, + "grad_norm": 0.934832204196358, + "learning_rate": 6.330468660441484e-07, + "loss": 0.2004, + "step": 6772 + }, + { + "epoch": 0.43, + "grad_norm": 0.9039667094154356, + "learning_rate": 6.329473126048008e-07, + "loss": 0.0988, + "step": 6773 + }, + { + "epoch": 0.43, + "grad_norm": 1.1220408080787108, + "learning_rate": 6.328477534936537e-07, + "loss": 0.2913, + "step": 6774 + }, + { + "epoch": 0.43, + "grad_norm": 0.7439464280070911, + "learning_rate": 6.327481887149542e-07, + "loss": 0.2032, + "step": 6775 + }, + { + "epoch": 0.43, + "grad_norm": 2.2175894949275574, + "learning_rate": 6.326486182729504e-07, + "loss": 0.217, + "step": 6776 + }, + { + "epoch": 0.43, + "grad_norm": 0.6617656036990464, + "learning_rate": 6.325490421718897e-07, + "loss": 0.0769, + "step": 6777 + }, + { + "epoch": 0.43, + "grad_norm": 0.6548727496560991, + "learning_rate": 6.324494604160205e-07, + "loss": 0.1635, + "step": 6778 + }, + { + "epoch": 0.43, + "grad_norm": 1.1961538380748058, + "learning_rate": 6.323498730095909e-07, + "loss": 0.34, + "step": 6779 + }, + { + "epoch": 0.43, + "grad_norm": 0.7573309400627748, + "learning_rate": 6.322502799568496e-07, + "loss": 0.1469, + "step": 6780 + }, + { + "epoch": 0.43, + "grad_norm": 0.4620618967612187, + "learning_rate": 6.321506812620457e-07, + "loss": 0.1377, + "step": 6781 + }, + { + "epoch": 0.43, + "grad_norm": 0.9670347054280445, + "learning_rate": 6.32051076929428e-07, + "loss": 0.2661, + "step": 6782 + }, + { + "epoch": 0.43, + "grad_norm": 2.6153038264053357, + "learning_rate": 6.319514669632459e-07, + "loss": 0.1095, + "step": 6783 + }, + { + "epoch": 0.43, + "grad_norm": 1.1576094055504325, + "learning_rate": 6.318518513677491e-07, + "loss": 0.2552, + "step": 6784 + }, + { + "epoch": 0.43, + "grad_norm": 2.3510418092236915, + "learning_rate": 6.317522301471872e-07, + "loss": 0.2902, + "step": 6785 + }, + { + "epoch": 0.43, + "grad_norm": 0.609354472764255, + "learning_rate": 6.316526033058103e-07, + "loss": 0.2841, + "step": 6786 + }, + { + "epoch": 0.43, + "grad_norm": 1.3021652967977673, + "learning_rate": 6.315529708478685e-07, + "loss": 0.1702, + "step": 6787 + }, + { + "epoch": 0.43, + "grad_norm": 0.9888446225038058, + "learning_rate": 6.314533327776126e-07, + "loss": 0.2604, + "step": 6788 + }, + { + "epoch": 0.43, + "grad_norm": 3.0346951448584885, + "learning_rate": 6.313536890992935e-07, + "loss": 0.4024, + "step": 6789 + }, + { + "epoch": 0.43, + "grad_norm": 0.5675104587584527, + "learning_rate": 6.312540398171617e-07, + "loss": 0.0111, + "step": 6790 + }, + { + "epoch": 0.43, + "grad_norm": 0.5672563634103571, + "learning_rate": 6.311543849354689e-07, + "loss": 0.327, + "step": 6791 + }, + { + "epoch": 0.43, + "grad_norm": 3.443557818405537, + "learning_rate": 6.310547244584663e-07, + "loss": 0.0626, + "step": 6792 + }, + { + "epoch": 0.43, + "grad_norm": 0.3011797489014841, + "learning_rate": 6.309550583904057e-07, + "loss": 0.1006, + "step": 6793 + }, + { + "epoch": 0.43, + "grad_norm": 1.246214236196623, + "learning_rate": 6.308553867355391e-07, + "loss": 0.1736, + "step": 6794 + }, + { + "epoch": 0.43, + "grad_norm": 1.446954270409426, + "learning_rate": 6.307557094981184e-07, + "loss": 0.1546, + "step": 6795 + }, + { + "epoch": 0.43, + "grad_norm": 1.7072287569490263, + "learning_rate": 6.306560266823966e-07, + "loss": 0.1176, + "step": 6796 + }, + { + "epoch": 0.43, + "grad_norm": 0.9312694871096633, + "learning_rate": 6.305563382926259e-07, + "loss": 0.1041, + "step": 6797 + }, + { + "epoch": 0.43, + "grad_norm": 1.1974276895069347, + "learning_rate": 6.304566443330594e-07, + "loss": 0.1647, + "step": 6798 + }, + { + "epoch": 0.43, + "grad_norm": 0.719215210065318, + "learning_rate": 6.303569448079502e-07, + "loss": 0.1296, + "step": 6799 + }, + { + "epoch": 0.43, + "grad_norm": 8.842410190704364, + "learning_rate": 6.302572397215516e-07, + "loss": 0.0253, + "step": 6800 + }, + { + "epoch": 0.43, + "grad_norm": 0.30814150918820393, + "learning_rate": 6.301575290781174e-07, + "loss": 0.0688, + "step": 6801 + }, + { + "epoch": 0.43, + "grad_norm": 0.6158718607868237, + "learning_rate": 6.300578128819015e-07, + "loss": 0.2403, + "step": 6802 + }, + { + "epoch": 0.43, + "grad_norm": 1.4780560018664006, + "learning_rate": 6.299580911371576e-07, + "loss": 0.1857, + "step": 6803 + }, + { + "epoch": 0.43, + "grad_norm": 0.9397332241649011, + "learning_rate": 6.298583638481403e-07, + "loss": 0.2389, + "step": 6804 + }, + { + "epoch": 0.43, + "grad_norm": 4.923574127674357, + "learning_rate": 6.297586310191043e-07, + "loss": 0.0653, + "step": 6805 + }, + { + "epoch": 0.43, + "grad_norm": 0.5311185850515917, + "learning_rate": 6.296588926543042e-07, + "loss": 0.295, + "step": 6806 + }, + { + "epoch": 0.43, + "grad_norm": 1.6618138251404375, + "learning_rate": 6.29559148757995e-07, + "loss": 0.2583, + "step": 6807 + }, + { + "epoch": 0.43, + "grad_norm": 0.7704545834260043, + "learning_rate": 6.294593993344322e-07, + "loss": 0.2924, + "step": 6808 + }, + { + "epoch": 0.43, + "grad_norm": 3.225263236600781, + "learning_rate": 6.29359644387871e-07, + "loss": 0.218, + "step": 6809 + }, + { + "epoch": 0.43, + "grad_norm": 1.5587407216636802, + "learning_rate": 6.292598839225674e-07, + "loss": 0.3032, + "step": 6810 + }, + { + "epoch": 0.43, + "grad_norm": 1.0127580991201515, + "learning_rate": 6.291601179427774e-07, + "loss": 0.3185, + "step": 6811 + }, + { + "epoch": 0.43, + "grad_norm": 0.511789852653221, + "learning_rate": 6.290603464527571e-07, + "loss": 0.0501, + "step": 6812 + }, + { + "epoch": 0.43, + "grad_norm": 7.280016632547844, + "learning_rate": 6.28960569456763e-07, + "loss": 0.2233, + "step": 6813 + }, + { + "epoch": 0.43, + "grad_norm": 0.7443266284461287, + "learning_rate": 6.288607869590517e-07, + "loss": 0.2224, + "step": 6814 + }, + { + "epoch": 0.43, + "grad_norm": 1.1274665943383784, + "learning_rate": 6.2876099896388e-07, + "loss": 0.3323, + "step": 6815 + }, + { + "epoch": 0.43, + "grad_norm": 0.5260091674169767, + "learning_rate": 6.286612054755054e-07, + "loss": 0.1671, + "step": 6816 + }, + { + "epoch": 0.43, + "grad_norm": 0.6868699193481335, + "learning_rate": 6.285614064981853e-07, + "loss": 0.02, + "step": 6817 + }, + { + "epoch": 0.43, + "grad_norm": 0.732733327052659, + "learning_rate": 6.284616020361771e-07, + "loss": 0.0463, + "step": 6818 + }, + { + "epoch": 0.43, + "grad_norm": 0.21229347455748196, + "learning_rate": 6.283617920937388e-07, + "loss": 0.0889, + "step": 6819 + }, + { + "epoch": 0.43, + "grad_norm": 1.7182442012876338, + "learning_rate": 6.282619766751282e-07, + "loss": 0.2928, + "step": 6820 + }, + { + "epoch": 0.43, + "grad_norm": 1.2240411603638657, + "learning_rate": 6.281621557846039e-07, + "loss": 0.1031, + "step": 6821 + }, + { + "epoch": 0.44, + "grad_norm": 0.49941767733144304, + "learning_rate": 6.280623294264242e-07, + "loss": 0.2916, + "step": 6822 + }, + { + "epoch": 0.44, + "grad_norm": 2.4504738802027255, + "learning_rate": 6.279624976048483e-07, + "loss": 0.3901, + "step": 6823 + }, + { + "epoch": 0.44, + "grad_norm": 1.8574063668748844, + "learning_rate": 6.278626603241351e-07, + "loss": 0.1969, + "step": 6824 + }, + { + "epoch": 0.44, + "grad_norm": 0.6650372944583962, + "learning_rate": 6.277628175885435e-07, + "loss": 0.1371, + "step": 6825 + }, + { + "epoch": 0.44, + "grad_norm": 1.8725049317109095, + "learning_rate": 6.276629694023335e-07, + "loss": 0.0056, + "step": 6826 + }, + { + "epoch": 0.44, + "grad_norm": 1.7852613308837884, + "learning_rate": 6.275631157697646e-07, + "loss": 0.1149, + "step": 6827 + }, + { + "epoch": 0.44, + "grad_norm": 0.5960671203452759, + "learning_rate": 6.274632566950966e-07, + "loss": 0.2283, + "step": 6828 + }, + { + "epoch": 0.44, + "grad_norm": 1.3019755436344658, + "learning_rate": 6.273633921825899e-07, + "loss": 0.3012, + "step": 6829 + }, + { + "epoch": 0.44, + "grad_norm": 10.425201121885749, + "learning_rate": 6.272635222365049e-07, + "loss": 0.2481, + "step": 6830 + }, + { + "epoch": 0.44, + "grad_norm": 0.8176663616259935, + "learning_rate": 6.271636468611021e-07, + "loss": 0.1553, + "step": 6831 + }, + { + "epoch": 0.44, + "grad_norm": 0.6115331284522859, + "learning_rate": 6.270637660606426e-07, + "loss": 0.0967, + "step": 6832 + }, + { + "epoch": 0.44, + "grad_norm": 0.9737249811992744, + "learning_rate": 6.269638798393875e-07, + "loss": 0.281, + "step": 6833 + }, + { + "epoch": 0.44, + "grad_norm": 18.043001773579363, + "learning_rate": 6.268639882015978e-07, + "loss": 0.1687, + "step": 6834 + }, + { + "epoch": 0.44, + "grad_norm": 0.6132946727237054, + "learning_rate": 6.267640911515355e-07, + "loss": 0.2331, + "step": 6835 + }, + { + "epoch": 0.44, + "grad_norm": 0.5091497400318866, + "learning_rate": 6.266641886934622e-07, + "loss": 0.1694, + "step": 6836 + }, + { + "epoch": 0.44, + "grad_norm": 2.3970003631326264, + "learning_rate": 6.2656428083164e-07, + "loss": 0.3293, + "step": 6837 + }, + { + "epoch": 0.44, + "grad_norm": 1.505052794904146, + "learning_rate": 6.264643675703312e-07, + "loss": 0.226, + "step": 6838 + }, + { + "epoch": 0.44, + "grad_norm": 5.9167611183008, + "learning_rate": 6.263644489137982e-07, + "loss": 0.1118, + "step": 6839 + }, + { + "epoch": 0.44, + "grad_norm": 1.0811616980231809, + "learning_rate": 6.262645248663037e-07, + "loss": 0.2247, + "step": 6840 + }, + { + "epoch": 0.44, + "grad_norm": 1.6414905323992222, + "learning_rate": 6.261645954321108e-07, + "loss": 0.1955, + "step": 6841 + }, + { + "epoch": 0.44, + "grad_norm": 1.3250585746642969, + "learning_rate": 6.260646606154827e-07, + "loss": 0.1739, + "step": 6842 + }, + { + "epoch": 0.44, + "grad_norm": 0.7866168030369379, + "learning_rate": 6.259647204206827e-07, + "loss": 0.3972, + "step": 6843 + }, + { + "epoch": 0.44, + "grad_norm": 0.5581515988682273, + "learning_rate": 6.258647748519746e-07, + "loss": 0.2474, + "step": 6844 + }, + { + "epoch": 0.44, + "grad_norm": 0.7993384137029356, + "learning_rate": 6.257648239136221e-07, + "loss": 0.3265, + "step": 6845 + }, + { + "epoch": 0.44, + "grad_norm": 0.8276376844992642, + "learning_rate": 6.256648676098895e-07, + "loss": 0.2602, + "step": 6846 + }, + { + "epoch": 0.44, + "grad_norm": 1.3657207400181275, + "learning_rate": 6.25564905945041e-07, + "loss": 0.3829, + "step": 6847 + }, + { + "epoch": 0.44, + "grad_norm": 0.4485079253704959, + "learning_rate": 6.254649389233409e-07, + "loss": 0.176, + "step": 6848 + }, + { + "epoch": 0.44, + "grad_norm": 0.43576652917857184, + "learning_rate": 6.253649665490545e-07, + "loss": 0.0893, + "step": 6849 + }, + { + "epoch": 0.44, + "grad_norm": 0.5289305109863717, + "learning_rate": 6.252649888264468e-07, + "loss": 0.1441, + "step": 6850 + }, + { + "epoch": 0.44, + "grad_norm": 1.469843352456335, + "learning_rate": 6.251650057597826e-07, + "loss": 0.1063, + "step": 6851 + }, + { + "epoch": 0.44, + "grad_norm": 1.1768701925903495, + "learning_rate": 6.250650173533278e-07, + "loss": 0.2213, + "step": 6852 + }, + { + "epoch": 0.44, + "grad_norm": 0.9943437390577592, + "learning_rate": 6.24965023611348e-07, + "loss": 0.3129, + "step": 6853 + }, + { + "epoch": 0.44, + "grad_norm": 1.057901605441251, + "learning_rate": 6.248650245381088e-07, + "loss": 0.0195, + "step": 6854 + }, + { + "epoch": 0.44, + "grad_norm": 0.7806514200502783, + "learning_rate": 6.247650201378769e-07, + "loss": 0.2214, + "step": 6855 + }, + { + "epoch": 0.44, + "grad_norm": 0.5671469742361622, + "learning_rate": 6.246650104149184e-07, + "loss": 0.2613, + "step": 6856 + }, + { + "epoch": 0.44, + "grad_norm": 5.373765136142397, + "learning_rate": 6.245649953734998e-07, + "loss": 0.1473, + "step": 6857 + }, + { + "epoch": 0.44, + "grad_norm": 0.4461208500038242, + "learning_rate": 6.244649750178882e-07, + "loss": 0.2095, + "step": 6858 + }, + { + "epoch": 0.44, + "grad_norm": 0.724439562233869, + "learning_rate": 6.243649493523505e-07, + "loss": 0.127, + "step": 6859 + }, + { + "epoch": 0.44, + "grad_norm": 12.835199059229101, + "learning_rate": 6.242649183811541e-07, + "loss": 0.0996, + "step": 6860 + }, + { + "epoch": 0.44, + "grad_norm": 0.7136121296431818, + "learning_rate": 6.241648821085665e-07, + "loss": 0.1202, + "step": 6861 + }, + { + "epoch": 0.44, + "grad_norm": 1.1481135542597556, + "learning_rate": 6.240648405388555e-07, + "loss": 0.1732, + "step": 6862 + }, + { + "epoch": 0.44, + "grad_norm": 14.84286234223769, + "learning_rate": 6.239647936762888e-07, + "loss": 0.21, + "step": 6863 + }, + { + "epoch": 0.44, + "grad_norm": 7.559212080866238, + "learning_rate": 6.238647415251349e-07, + "loss": 0.1487, + "step": 6864 + }, + { + "epoch": 0.44, + "grad_norm": 0.8333486084871736, + "learning_rate": 6.237646840896622e-07, + "loss": 0.0493, + "step": 6865 + }, + { + "epoch": 0.44, + "grad_norm": 0.9564629195000102, + "learning_rate": 6.236646213741393e-07, + "loss": 0.2068, + "step": 6866 + }, + { + "epoch": 0.44, + "grad_norm": 0.6018376526384829, + "learning_rate": 6.235645533828348e-07, + "loss": 0.2356, + "step": 6867 + }, + { + "epoch": 0.44, + "grad_norm": 1.6077695408432908, + "learning_rate": 6.234644801200182e-07, + "loss": 0.378, + "step": 6868 + }, + { + "epoch": 0.44, + "grad_norm": 0.25863719910906324, + "learning_rate": 6.233644015899587e-07, + "loss": 0.1129, + "step": 6869 + }, + { + "epoch": 0.44, + "grad_norm": 13.29980844732213, + "learning_rate": 6.232643177969258e-07, + "loss": 0.092, + "step": 6870 + }, + { + "epoch": 0.44, + "grad_norm": 0.4036243777816879, + "learning_rate": 6.231642287451894e-07, + "loss": 0.137, + "step": 6871 + }, + { + "epoch": 0.44, + "grad_norm": 1.531938766838832, + "learning_rate": 6.230641344390193e-07, + "loss": 0.1709, + "step": 6872 + }, + { + "epoch": 0.44, + "grad_norm": 1.626332250713213, + "learning_rate": 6.22964034882686e-07, + "loss": 0.259, + "step": 6873 + }, + { + "epoch": 0.44, + "grad_norm": 1.3131201119663751, + "learning_rate": 6.228639300804597e-07, + "loss": 0.1008, + "step": 6874 + }, + { + "epoch": 0.44, + "grad_norm": 0.948858701528825, + "learning_rate": 6.227638200366111e-07, + "loss": 0.0435, + "step": 6875 + }, + { + "epoch": 0.44, + "grad_norm": 0.34621796552462913, + "learning_rate": 6.226637047554112e-07, + "loss": 0.1809, + "step": 6876 + }, + { + "epoch": 0.44, + "grad_norm": 0.6545483903692884, + "learning_rate": 6.22563584241131e-07, + "loss": 0.2285, + "step": 6877 + }, + { + "epoch": 0.44, + "grad_norm": 1.4983342353946147, + "learning_rate": 6.224634584980419e-07, + "loss": 0.2559, + "step": 6878 + }, + { + "epoch": 0.44, + "grad_norm": 0.6262553504638929, + "learning_rate": 6.223633275304157e-07, + "loss": 0.2534, + "step": 6879 + }, + { + "epoch": 0.44, + "grad_norm": 3.0476521647089876, + "learning_rate": 6.222631913425237e-07, + "loss": 0.1286, + "step": 6880 + }, + { + "epoch": 0.44, + "grad_norm": 1.7532988373024432, + "learning_rate": 6.221630499386383e-07, + "loss": 0.1562, + "step": 6881 + }, + { + "epoch": 0.44, + "grad_norm": 0.2812288963283702, + "learning_rate": 6.220629033230317e-07, + "loss": 0.0915, + "step": 6882 + }, + { + "epoch": 0.44, + "grad_norm": 0.5943660575348206, + "learning_rate": 6.219627514999761e-07, + "loss": 0.1219, + "step": 6883 + }, + { + "epoch": 0.44, + "grad_norm": 3.5110541196172584, + "learning_rate": 6.218625944737444e-07, + "loss": 0.0744, + "step": 6884 + }, + { + "epoch": 0.44, + "grad_norm": 1.1145811824476093, + "learning_rate": 6.217624322486094e-07, + "loss": 0.361, + "step": 6885 + }, + { + "epoch": 0.44, + "grad_norm": 1.0813975123631303, + "learning_rate": 6.216622648288443e-07, + "loss": 0.1015, + "step": 6886 + }, + { + "epoch": 0.44, + "grad_norm": 0.5614615087155922, + "learning_rate": 6.215620922187226e-07, + "loss": 0.2179, + "step": 6887 + }, + { + "epoch": 0.44, + "grad_norm": 0.666469936246563, + "learning_rate": 6.214619144225175e-07, + "loss": 0.1969, + "step": 6888 + }, + { + "epoch": 0.44, + "grad_norm": 1.8435504813603971, + "learning_rate": 6.21361731444503e-07, + "loss": 0.1713, + "step": 6889 + }, + { + "epoch": 0.44, + "grad_norm": 9.12098429969003, + "learning_rate": 6.212615432889529e-07, + "loss": 0.1041, + "step": 6890 + }, + { + "epoch": 0.44, + "grad_norm": 0.6345391001430074, + "learning_rate": 6.211613499601418e-07, + "loss": 0.327, + "step": 6891 + }, + { + "epoch": 0.44, + "grad_norm": 1.1628990424915013, + "learning_rate": 6.210611514623439e-07, + "loss": 0.1649, + "step": 6892 + }, + { + "epoch": 0.44, + "grad_norm": 0.8765090102836677, + "learning_rate": 6.209609477998338e-07, + "loss": 0.2759, + "step": 6893 + }, + { + "epoch": 0.44, + "grad_norm": 1.2374649525278218, + "learning_rate": 6.208607389768866e-07, + "loss": 0.3338, + "step": 6894 + }, + { + "epoch": 0.44, + "grad_norm": 5.199880028380958, + "learning_rate": 6.207605249977773e-07, + "loss": 0.3526, + "step": 6895 + }, + { + "epoch": 0.44, + "grad_norm": 2.5493822708776075, + "learning_rate": 6.206603058667814e-07, + "loss": 0.3173, + "step": 6896 + }, + { + "epoch": 0.44, + "grad_norm": 1.2408383608057978, + "learning_rate": 6.20560081588174e-07, + "loss": 0.2257, + "step": 6897 + }, + { + "epoch": 0.44, + "grad_norm": 0.28977942430802, + "learning_rate": 6.204598521662315e-07, + "loss": 0.0809, + "step": 6898 + }, + { + "epoch": 0.44, + "grad_norm": 0.5550949076424141, + "learning_rate": 6.203596176052293e-07, + "loss": 0.0761, + "step": 6899 + }, + { + "epoch": 0.44, + "grad_norm": 0.2417739785051891, + "learning_rate": 6.20259377909444e-07, + "loss": 0.0807, + "step": 6900 + }, + { + "epoch": 0.44, + "grad_norm": 0.39630038415248925, + "learning_rate": 6.201591330831517e-07, + "loss": 0.0536, + "step": 6901 + }, + { + "epoch": 0.44, + "grad_norm": 0.9842101392684679, + "learning_rate": 6.200588831306293e-07, + "loss": 0.3377, + "step": 6902 + }, + { + "epoch": 0.44, + "grad_norm": 0.5111251463979756, + "learning_rate": 6.199586280561538e-07, + "loss": 0.1592, + "step": 6903 + }, + { + "epoch": 0.44, + "grad_norm": 0.3409038537179123, + "learning_rate": 6.198583678640019e-07, + "loss": 0.1111, + "step": 6904 + }, + { + "epoch": 0.44, + "grad_norm": 0.3043629387571401, + "learning_rate": 6.197581025584511e-07, + "loss": 0.0168, + "step": 6905 + }, + { + "epoch": 0.44, + "grad_norm": 0.7300653946375809, + "learning_rate": 6.196578321437789e-07, + "loss": 0.0399, + "step": 6906 + }, + { + "epoch": 0.44, + "grad_norm": 0.8141902535050116, + "learning_rate": 6.19557556624263e-07, + "loss": 0.1366, + "step": 6907 + }, + { + "epoch": 0.44, + "grad_norm": 0.07619674392620761, + "learning_rate": 6.194572760041815e-07, + "loss": 0.0012, + "step": 6908 + }, + { + "epoch": 0.44, + "grad_norm": 1.8270895485880623, + "learning_rate": 6.193569902878124e-07, + "loss": 0.2732, + "step": 6909 + }, + { + "epoch": 0.44, + "grad_norm": 0.8055134513779834, + "learning_rate": 6.192566994794342e-07, + "loss": 0.219, + "step": 6910 + }, + { + "epoch": 0.44, + "grad_norm": 1.3787247546125327, + "learning_rate": 6.191564035833253e-07, + "loss": 0.4333, + "step": 6911 + }, + { + "epoch": 0.44, + "grad_norm": 0.9173091222520319, + "learning_rate": 6.190561026037648e-07, + "loss": 0.2145, + "step": 6912 + }, + { + "epoch": 0.44, + "grad_norm": 0.8747517670691473, + "learning_rate": 6.189557965450316e-07, + "loss": 0.1481, + "step": 6913 + }, + { + "epoch": 0.44, + "grad_norm": 0.5975429868199008, + "learning_rate": 6.188554854114052e-07, + "loss": 0.137, + "step": 6914 + }, + { + "epoch": 0.44, + "grad_norm": 0.7264176118036235, + "learning_rate": 6.187551692071648e-07, + "loss": 0.1171, + "step": 6915 + }, + { + "epoch": 0.44, + "grad_norm": 1.098857129560535, + "learning_rate": 6.1865484793659e-07, + "loss": 0.487, + "step": 6916 + }, + { + "epoch": 0.44, + "grad_norm": 0.5252791025312935, + "learning_rate": 6.185545216039609e-07, + "loss": 0.2031, + "step": 6917 + }, + { + "epoch": 0.44, + "grad_norm": 0.6138002838529008, + "learning_rate": 6.184541902135576e-07, + "loss": 0.1736, + "step": 6918 + }, + { + "epoch": 0.44, + "grad_norm": 0.5698825485798151, + "learning_rate": 6.183538537696604e-07, + "loss": 0.2127, + "step": 6919 + }, + { + "epoch": 0.44, + "grad_norm": 2.5815916642846686, + "learning_rate": 6.182535122765498e-07, + "loss": 0.1407, + "step": 6920 + }, + { + "epoch": 0.44, + "grad_norm": 3.9493237980265676, + "learning_rate": 6.181531657385068e-07, + "loss": 0.1887, + "step": 6921 + }, + { + "epoch": 0.44, + "grad_norm": 1.7150951635851865, + "learning_rate": 6.180528141598121e-07, + "loss": 0.1532, + "step": 6922 + }, + { + "epoch": 0.44, + "grad_norm": 0.39603092612381635, + "learning_rate": 6.179524575447471e-07, + "loss": 0.0771, + "step": 6923 + }, + { + "epoch": 0.44, + "grad_norm": 1.0938817803981258, + "learning_rate": 6.178520958975932e-07, + "loss": 0.2842, + "step": 6924 + }, + { + "epoch": 0.44, + "grad_norm": 0.8009649953234502, + "learning_rate": 6.17751729222632e-07, + "loss": 0.0132, + "step": 6925 + }, + { + "epoch": 0.44, + "grad_norm": 0.33108503279882795, + "learning_rate": 6.176513575241452e-07, + "loss": 0.0082, + "step": 6926 + }, + { + "epoch": 0.44, + "grad_norm": 0.31729671294644823, + "learning_rate": 6.175509808064149e-07, + "loss": 0.1053, + "step": 6927 + }, + { + "epoch": 0.44, + "grad_norm": 0.5492720653003984, + "learning_rate": 6.174505990737238e-07, + "loss": 0.1087, + "step": 6928 + }, + { + "epoch": 0.44, + "grad_norm": 0.41360439446294106, + "learning_rate": 6.173502123303538e-07, + "loss": 0.2309, + "step": 6929 + }, + { + "epoch": 0.44, + "grad_norm": 0.5401488246862701, + "learning_rate": 6.172498205805878e-07, + "loss": 0.2762, + "step": 6930 + }, + { + "epoch": 0.44, + "grad_norm": 1.1357691343779526, + "learning_rate": 6.171494238287088e-07, + "loss": 0.26, + "step": 6931 + }, + { + "epoch": 0.44, + "grad_norm": 0.34405876677592406, + "learning_rate": 6.170490220789999e-07, + "loss": 0.2515, + "step": 6932 + }, + { + "epoch": 0.44, + "grad_norm": 1.0478404241618138, + "learning_rate": 6.169486153357444e-07, + "loss": 0.2009, + "step": 6933 + }, + { + "epoch": 0.44, + "grad_norm": 0.40073446946851377, + "learning_rate": 6.168482036032259e-07, + "loss": 0.0367, + "step": 6934 + }, + { + "epoch": 0.44, + "grad_norm": 0.6573471808166629, + "learning_rate": 6.167477868857281e-07, + "loss": 0.2296, + "step": 6935 + }, + { + "epoch": 0.44, + "grad_norm": 0.43918478126204147, + "learning_rate": 6.166473651875351e-07, + "loss": 0.0706, + "step": 6936 + }, + { + "epoch": 0.44, + "grad_norm": 1.002656753065787, + "learning_rate": 6.165469385129309e-07, + "loss": 0.1135, + "step": 6937 + }, + { + "epoch": 0.44, + "grad_norm": 1.494979637287609, + "learning_rate": 6.164465068662e-07, + "loss": 0.4066, + "step": 6938 + }, + { + "epoch": 0.44, + "grad_norm": 0.47883964075679697, + "learning_rate": 6.163460702516271e-07, + "loss": 0.1891, + "step": 6939 + }, + { + "epoch": 0.44, + "grad_norm": 1.1409255996678267, + "learning_rate": 6.162456286734969e-07, + "loss": 0.3263, + "step": 6940 + }, + { + "epoch": 0.44, + "grad_norm": 0.3621007983991172, + "learning_rate": 6.161451821360947e-07, + "loss": 0.1045, + "step": 6941 + }, + { + "epoch": 0.44, + "grad_norm": 1.0862254194163052, + "learning_rate": 6.160447306437054e-07, + "loss": 0.2759, + "step": 6942 + }, + { + "epoch": 0.44, + "grad_norm": 0.5946015766783351, + "learning_rate": 6.159442742006147e-07, + "loss": 0.0527, + "step": 6943 + }, + { + "epoch": 0.44, + "grad_norm": 0.5582335389228525, + "learning_rate": 6.158438128111081e-07, + "loss": 0.0515, + "step": 6944 + }, + { + "epoch": 0.44, + "grad_norm": 0.6744063246468665, + "learning_rate": 6.157433464794716e-07, + "loss": 0.0112, + "step": 6945 + }, + { + "epoch": 0.44, + "grad_norm": 0.7471634695884373, + "learning_rate": 6.156428752099912e-07, + "loss": 0.1808, + "step": 6946 + }, + { + "epoch": 0.44, + "grad_norm": 0.383769180786806, + "learning_rate": 6.155423990069532e-07, + "loss": 0.1272, + "step": 6947 + }, + { + "epoch": 0.44, + "grad_norm": 1.464658661457626, + "learning_rate": 6.154419178746443e-07, + "loss": 0.2879, + "step": 6948 + }, + { + "epoch": 0.44, + "grad_norm": 4.586826095802796, + "learning_rate": 6.153414318173511e-07, + "loss": 0.1035, + "step": 6949 + }, + { + "epoch": 0.44, + "grad_norm": 0.42072790409513783, + "learning_rate": 6.152409408393606e-07, + "loss": 0.0064, + "step": 6950 + }, + { + "epoch": 0.44, + "grad_norm": 0.3812237351074588, + "learning_rate": 6.151404449449599e-07, + "loss": 0.1688, + "step": 6951 + }, + { + "epoch": 0.44, + "grad_norm": 1.0321606069137594, + "learning_rate": 6.150399441384363e-07, + "loss": 0.1237, + "step": 6952 + }, + { + "epoch": 0.44, + "grad_norm": 1.606937099338738, + "learning_rate": 6.149394384240775e-07, + "loss": 0.205, + "step": 6953 + }, + { + "epoch": 0.44, + "grad_norm": 5.982863396210204, + "learning_rate": 6.148389278061711e-07, + "loss": 0.2688, + "step": 6954 + }, + { + "epoch": 0.44, + "grad_norm": 0.5434452744793399, + "learning_rate": 6.147384122890052e-07, + "loss": 0.281, + "step": 6955 + }, + { + "epoch": 0.44, + "grad_norm": 2.0847305758437145, + "learning_rate": 6.146378918768681e-07, + "loss": 0.2153, + "step": 6956 + }, + { + "epoch": 0.44, + "grad_norm": 0.8470400376818763, + "learning_rate": 6.14537366574048e-07, + "loss": 0.3103, + "step": 6957 + }, + { + "epoch": 0.44, + "grad_norm": 0.7175605445892274, + "learning_rate": 6.144368363848335e-07, + "loss": 0.2604, + "step": 6958 + }, + { + "epoch": 0.44, + "grad_norm": 0.565787560280558, + "learning_rate": 6.143363013135136e-07, + "loss": 0.2501, + "step": 6959 + }, + { + "epoch": 0.44, + "grad_norm": 0.3774440405473363, + "learning_rate": 6.142357613643773e-07, + "loss": 0.2021, + "step": 6960 + }, + { + "epoch": 0.44, + "grad_norm": 1.2299796206338582, + "learning_rate": 6.141352165417137e-07, + "loss": 0.1713, + "step": 6961 + }, + { + "epoch": 0.44, + "grad_norm": 0.7901467738433083, + "learning_rate": 6.140346668498124e-07, + "loss": 0.1122, + "step": 6962 + }, + { + "epoch": 0.44, + "grad_norm": 0.6819889165777708, + "learning_rate": 6.139341122929629e-07, + "loss": 0.1528, + "step": 6963 + }, + { + "epoch": 0.44, + "grad_norm": 0.8651701321719657, + "learning_rate": 6.13833552875455e-07, + "loss": 0.2012, + "step": 6964 + }, + { + "epoch": 0.44, + "grad_norm": 0.17737635653643177, + "learning_rate": 6.137329886015791e-07, + "loss": 0.019, + "step": 6965 + }, + { + "epoch": 0.44, + "grad_norm": 1.2778729695799618, + "learning_rate": 6.136324194756252e-07, + "loss": 0.3287, + "step": 6966 + }, + { + "epoch": 0.44, + "grad_norm": 0.5028356491136653, + "learning_rate": 6.135318455018839e-07, + "loss": 0.185, + "step": 6967 + }, + { + "epoch": 0.44, + "grad_norm": 0.49877890473115016, + "learning_rate": 6.134312666846459e-07, + "loss": 0.1714, + "step": 6968 + }, + { + "epoch": 0.44, + "grad_norm": 0.7981573156656027, + "learning_rate": 6.13330683028202e-07, + "loss": 0.0811, + "step": 6969 + }, + { + "epoch": 0.44, + "grad_norm": 0.6005895831721726, + "learning_rate": 6.132300945368433e-07, + "loss": 0.3987, + "step": 6970 + }, + { + "epoch": 0.44, + "grad_norm": 0.31192909204643715, + "learning_rate": 6.131295012148612e-07, + "loss": 0.2181, + "step": 6971 + }, + { + "epoch": 0.44, + "grad_norm": 0.3545310906924588, + "learning_rate": 6.130289030665468e-07, + "loss": 0.12, + "step": 6972 + }, + { + "epoch": 0.44, + "grad_norm": 1.025570772503253, + "learning_rate": 6.129283000961926e-07, + "loss": 0.3388, + "step": 6973 + }, + { + "epoch": 0.44, + "grad_norm": 0.5429665518394462, + "learning_rate": 6.1282769230809e-07, + "loss": 0.2221, + "step": 6974 + }, + { + "epoch": 0.44, + "grad_norm": 1.0744699445752983, + "learning_rate": 6.127270797065312e-07, + "loss": 0.0902, + "step": 6975 + }, + { + "epoch": 0.44, + "grad_norm": 1.2929872500332653, + "learning_rate": 6.126264622958086e-07, + "loss": 0.038, + "step": 6976 + }, + { + "epoch": 0.44, + "grad_norm": 1.7455266878558893, + "learning_rate": 6.125258400802146e-07, + "loss": 0.3655, + "step": 6977 + }, + { + "epoch": 0.44, + "grad_norm": 2.627917352512731, + "learning_rate": 6.124252130640423e-07, + "loss": 0.0604, + "step": 6978 + }, + { + "epoch": 0.45, + "grad_norm": 0.8291769579982922, + "learning_rate": 6.123245812515843e-07, + "loss": 0.2673, + "step": 6979 + }, + { + "epoch": 0.45, + "grad_norm": 0.4631818470052978, + "learning_rate": 6.122239446471338e-07, + "loss": 0.1917, + "step": 6980 + }, + { + "epoch": 0.45, + "grad_norm": 1.0010906980464314, + "learning_rate": 6.121233032549842e-07, + "loss": 0.0943, + "step": 6981 + }, + { + "epoch": 0.45, + "grad_norm": 1.113879147230086, + "learning_rate": 6.120226570794291e-07, + "loss": 0.2011, + "step": 6982 + }, + { + "epoch": 0.45, + "grad_norm": 0.1995621808630295, + "learning_rate": 6.119220061247622e-07, + "loss": 0.0945, + "step": 6983 + }, + { + "epoch": 0.45, + "grad_norm": 0.8629359079521771, + "learning_rate": 6.118213503952778e-07, + "loss": 0.1741, + "step": 6984 + }, + { + "epoch": 0.45, + "grad_norm": 0.7448026705895965, + "learning_rate": 6.117206898952696e-07, + "loss": 0.3602, + "step": 6985 + }, + { + "epoch": 0.45, + "grad_norm": 1.197204476150808, + "learning_rate": 6.116200246290322e-07, + "loss": 0.3732, + "step": 6986 + }, + { + "epoch": 0.45, + "grad_norm": 0.7836219455196711, + "learning_rate": 6.115193546008601e-07, + "loss": 0.2002, + "step": 6987 + }, + { + "epoch": 0.45, + "grad_norm": 0.5205671280907647, + "learning_rate": 6.114186798150482e-07, + "loss": 0.2355, + "step": 6988 + }, + { + "epoch": 0.45, + "grad_norm": 2.7519950190590827, + "learning_rate": 6.113180002758915e-07, + "loss": 0.2139, + "step": 6989 + }, + { + "epoch": 0.45, + "grad_norm": 0.3892107135968623, + "learning_rate": 6.112173159876851e-07, + "loss": 0.0879, + "step": 6990 + }, + { + "epoch": 0.45, + "grad_norm": 0.31456452934603335, + "learning_rate": 6.111166269547243e-07, + "loss": 0.0036, + "step": 6991 + }, + { + "epoch": 0.45, + "grad_norm": 0.795319291289239, + "learning_rate": 6.110159331813049e-07, + "loss": 0.1771, + "step": 6992 + }, + { + "epoch": 0.45, + "grad_norm": 3.76803851220041, + "learning_rate": 6.109152346717228e-07, + "loss": 0.2949, + "step": 6993 + }, + { + "epoch": 0.45, + "grad_norm": 1.126107545010468, + "learning_rate": 6.108145314302736e-07, + "loss": 0.3448, + "step": 6994 + }, + { + "epoch": 0.45, + "grad_norm": 0.7010751213890889, + "learning_rate": 6.107138234612539e-07, + "loss": 0.342, + "step": 6995 + }, + { + "epoch": 0.45, + "grad_norm": 2.1487274465706263, + "learning_rate": 6.106131107689598e-07, + "loss": 0.1234, + "step": 6996 + }, + { + "epoch": 0.45, + "grad_norm": 1.0047599593340455, + "learning_rate": 6.105123933576881e-07, + "loss": 0.2634, + "step": 6997 + }, + { + "epoch": 0.45, + "grad_norm": 0.6502696156537707, + "learning_rate": 6.104116712317355e-07, + "loss": 0.468, + "step": 6998 + }, + { + "epoch": 0.45, + "grad_norm": 0.27978610515785746, + "learning_rate": 6.10310944395399e-07, + "loss": 0.1076, + "step": 6999 + }, + { + "epoch": 0.45, + "grad_norm": 0.5266859976788861, + "learning_rate": 6.102102128529759e-07, + "loss": 0.0514, + "step": 7000 + }, + { + "epoch": 0.45, + "grad_norm": 0.49816105419925955, + "learning_rate": 6.101094766087637e-07, + "loss": 0.1085, + "step": 7001 + }, + { + "epoch": 0.45, + "grad_norm": 0.8781912021438211, + "learning_rate": 6.100087356670596e-07, + "loss": 0.302, + "step": 7002 + }, + { + "epoch": 0.45, + "grad_norm": 4.143626452489102, + "learning_rate": 6.09907990032162e-07, + "loss": 0.2635, + "step": 7003 + }, + { + "epoch": 0.45, + "grad_norm": 0.9082295169453426, + "learning_rate": 6.098072397083684e-07, + "loss": 0.1063, + "step": 7004 + }, + { + "epoch": 0.45, + "grad_norm": 0.3790899514979397, + "learning_rate": 6.097064846999773e-07, + "loss": 0.1614, + "step": 7005 + }, + { + "epoch": 0.45, + "grad_norm": 0.5109056251055746, + "learning_rate": 6.096057250112869e-07, + "loss": 0.1268, + "step": 7006 + }, + { + "epoch": 0.45, + "grad_norm": 0.3445931844183288, + "learning_rate": 6.095049606465962e-07, + "loss": 0.1876, + "step": 7007 + }, + { + "epoch": 0.45, + "grad_norm": 1.6897933713277216, + "learning_rate": 6.094041916102035e-07, + "loss": 0.0207, + "step": 7008 + }, + { + "epoch": 0.45, + "grad_norm": 0.701035327618761, + "learning_rate": 6.093034179064081e-07, + "loss": 0.1681, + "step": 7009 + }, + { + "epoch": 0.45, + "grad_norm": 0.5862399618767468, + "learning_rate": 6.092026395395091e-07, + "loss": 0.282, + "step": 7010 + }, + { + "epoch": 0.45, + "grad_norm": 0.9519031633352236, + "learning_rate": 6.091018565138061e-07, + "loss": 0.0982, + "step": 7011 + }, + { + "epoch": 0.45, + "grad_norm": 7.167690752353747, + "learning_rate": 6.090010688335987e-07, + "loss": 0.1385, + "step": 7012 + }, + { + "epoch": 0.45, + "grad_norm": 0.6973553591500395, + "learning_rate": 6.089002765031864e-07, + "loss": 0.22, + "step": 7013 + }, + { + "epoch": 0.45, + "grad_norm": 0.19367020577117855, + "learning_rate": 6.087994795268695e-07, + "loss": 0.0833, + "step": 7014 + }, + { + "epoch": 0.45, + "grad_norm": 1.5990229607046382, + "learning_rate": 6.08698677908948e-07, + "loss": 0.2395, + "step": 7015 + }, + { + "epoch": 0.45, + "grad_norm": 0.8620603415141842, + "learning_rate": 6.085978716537223e-07, + "loss": 0.3999, + "step": 7016 + }, + { + "epoch": 0.45, + "grad_norm": 1.049386554922625, + "learning_rate": 6.084970607654931e-07, + "loss": 0.1278, + "step": 7017 + }, + { + "epoch": 0.45, + "grad_norm": 1.6789723515233934, + "learning_rate": 6.083962452485614e-07, + "loss": 0.0992, + "step": 7018 + }, + { + "epoch": 0.45, + "grad_norm": 0.6013876531051031, + "learning_rate": 6.082954251072278e-07, + "loss": 0.068, + "step": 7019 + }, + { + "epoch": 0.45, + "grad_norm": 0.7665096500380996, + "learning_rate": 6.081946003457936e-07, + "loss": 0.1732, + "step": 7020 + }, + { + "epoch": 0.45, + "grad_norm": 0.6786080414216922, + "learning_rate": 6.080937709685604e-07, + "loss": 0.1515, + "step": 7021 + }, + { + "epoch": 0.45, + "grad_norm": 0.6232705392964524, + "learning_rate": 6.079929369798297e-07, + "loss": 0.3126, + "step": 7022 + }, + { + "epoch": 0.45, + "grad_norm": 0.9174738349237397, + "learning_rate": 6.078920983839031e-07, + "loss": 0.0919, + "step": 7023 + }, + { + "epoch": 0.45, + "grad_norm": 0.3351450552038322, + "learning_rate": 6.077912551850828e-07, + "loss": 0.1825, + "step": 7024 + }, + { + "epoch": 0.45, + "grad_norm": 0.23047285273482546, + "learning_rate": 6.076904073876706e-07, + "loss": 0.1486, + "step": 7025 + }, + { + "epoch": 0.45, + "grad_norm": 9.8596867267407, + "learning_rate": 6.075895549959693e-07, + "loss": 0.0656, + "step": 7026 + }, + { + "epoch": 0.45, + "grad_norm": 0.5862264008202591, + "learning_rate": 6.074886980142813e-07, + "loss": 0.2388, + "step": 7027 + }, + { + "epoch": 0.45, + "grad_norm": 0.6502582586265484, + "learning_rate": 6.073878364469094e-07, + "loss": 0.3046, + "step": 7028 + }, + { + "epoch": 0.45, + "grad_norm": 0.7072089060085425, + "learning_rate": 6.072869702981565e-07, + "loss": 0.016, + "step": 7029 + }, + { + "epoch": 0.45, + "grad_norm": 1.9694642254886912, + "learning_rate": 6.071860995723257e-07, + "loss": 0.6971, + "step": 7030 + }, + { + "epoch": 0.45, + "grad_norm": 0.6990240558170092, + "learning_rate": 6.070852242737206e-07, + "loss": 0.16, + "step": 7031 + }, + { + "epoch": 0.45, + "grad_norm": 0.936223825646583, + "learning_rate": 6.069843444066444e-07, + "loss": 0.295, + "step": 7032 + }, + { + "epoch": 0.45, + "grad_norm": 0.40226374749511673, + "learning_rate": 6.06883459975401e-07, + "loss": 0.1209, + "step": 7033 + }, + { + "epoch": 0.45, + "grad_norm": 1.935690253055944, + "learning_rate": 6.067825709842945e-07, + "loss": 0.1124, + "step": 7034 + }, + { + "epoch": 0.45, + "grad_norm": 1.2207454136067766, + "learning_rate": 6.066816774376287e-07, + "loss": 0.1075, + "step": 7035 + }, + { + "epoch": 0.45, + "grad_norm": 0.5618052578226107, + "learning_rate": 6.06580779339708e-07, + "loss": 0.0529, + "step": 7036 + }, + { + "epoch": 0.45, + "grad_norm": 0.7092968831146726, + "learning_rate": 6.064798766948371e-07, + "loss": 0.2881, + "step": 7037 + }, + { + "epoch": 0.45, + "grad_norm": 4.862719814670639, + "learning_rate": 6.063789695073208e-07, + "loss": 0.1436, + "step": 7038 + }, + { + "epoch": 0.45, + "grad_norm": 0.6966503669595614, + "learning_rate": 6.062780577814636e-07, + "loss": 0.2753, + "step": 7039 + }, + { + "epoch": 0.45, + "grad_norm": 0.15349847192339625, + "learning_rate": 6.061771415215708e-07, + "loss": 0.0889, + "step": 7040 + }, + { + "epoch": 0.45, + "grad_norm": 0.8643961482115704, + "learning_rate": 6.060762207319479e-07, + "loss": 0.1374, + "step": 7041 + }, + { + "epoch": 0.45, + "grad_norm": 0.2582579644147097, + "learning_rate": 6.059752954168999e-07, + "loss": 0.0937, + "step": 7042 + }, + { + "epoch": 0.45, + "grad_norm": 0.8432267169130003, + "learning_rate": 6.058743655807331e-07, + "loss": 0.2547, + "step": 7043 + }, + { + "epoch": 0.45, + "grad_norm": 0.2976526130842025, + "learning_rate": 6.057734312277526e-07, + "loss": 0.0898, + "step": 7044 + }, + { + "epoch": 0.45, + "grad_norm": 3.102839488835296, + "learning_rate": 6.056724923622651e-07, + "loss": 0.1107, + "step": 7045 + }, + { + "epoch": 0.45, + "grad_norm": 0.550269931757731, + "learning_rate": 6.055715489885768e-07, + "loss": 0.0329, + "step": 7046 + }, + { + "epoch": 0.45, + "grad_norm": 0.4707632456794082, + "learning_rate": 6.054706011109938e-07, + "loss": 0.0565, + "step": 7047 + }, + { + "epoch": 0.45, + "grad_norm": 0.46532627867999804, + "learning_rate": 6.05369648733823e-07, + "loss": 0.0121, + "step": 7048 + }, + { + "epoch": 0.45, + "grad_norm": 5.047816250916057, + "learning_rate": 6.052686918613712e-07, + "loss": 0.4253, + "step": 7049 + }, + { + "epoch": 0.45, + "grad_norm": 1.0216356209529873, + "learning_rate": 6.051677304979453e-07, + "loss": 0.1661, + "step": 7050 + }, + { + "epoch": 0.45, + "grad_norm": 0.7515596358866237, + "learning_rate": 6.050667646478527e-07, + "loss": 0.3816, + "step": 7051 + }, + { + "epoch": 0.45, + "grad_norm": 0.5555317867772822, + "learning_rate": 6.049657943154006e-07, + "loss": 0.2741, + "step": 7052 + }, + { + "epoch": 0.45, + "grad_norm": 0.45035019037585133, + "learning_rate": 6.048648195048968e-07, + "loss": 0.0255, + "step": 7053 + }, + { + "epoch": 0.45, + "grad_norm": 1.3586966690743851, + "learning_rate": 6.047638402206489e-07, + "loss": 0.1486, + "step": 7054 + }, + { + "epoch": 0.45, + "grad_norm": 0.7127962571969929, + "learning_rate": 6.046628564669651e-07, + "loss": 0.0519, + "step": 7055 + }, + { + "epoch": 0.45, + "grad_norm": 1.1781048738549795, + "learning_rate": 6.045618682481535e-07, + "loss": 0.3443, + "step": 7056 + }, + { + "epoch": 0.45, + "grad_norm": 2.4974924732471555, + "learning_rate": 6.044608755685222e-07, + "loss": 0.1785, + "step": 7057 + }, + { + "epoch": 0.45, + "grad_norm": 0.9627407303530922, + "learning_rate": 6.043598784323802e-07, + "loss": 0.1179, + "step": 7058 + }, + { + "epoch": 0.45, + "grad_norm": 0.8824047630303993, + "learning_rate": 6.042588768440357e-07, + "loss": 0.4079, + "step": 7059 + }, + { + "epoch": 0.45, + "grad_norm": 0.7897017631778379, + "learning_rate": 6.041578708077981e-07, + "loss": 0.2406, + "step": 7060 + }, + { + "epoch": 0.45, + "grad_norm": 0.4281677492674576, + "learning_rate": 6.040568603279763e-07, + "loss": 0.1699, + "step": 7061 + }, + { + "epoch": 0.45, + "grad_norm": 0.3898733033420859, + "learning_rate": 6.039558454088795e-07, + "loss": 0.1781, + "step": 7062 + }, + { + "epoch": 0.45, + "grad_norm": 0.5578960512714476, + "learning_rate": 6.038548260548176e-07, + "loss": 0.2769, + "step": 7063 + }, + { + "epoch": 0.45, + "grad_norm": 1.7492669585363871, + "learning_rate": 6.037538022700999e-07, + "loss": 0.1844, + "step": 7064 + }, + { + "epoch": 0.45, + "grad_norm": 1.6116049102028214, + "learning_rate": 6.036527740590365e-07, + "loss": 0.2459, + "step": 7065 + }, + { + "epoch": 0.45, + "grad_norm": 1.2831282294351607, + "learning_rate": 6.035517414259377e-07, + "loss": 0.1085, + "step": 7066 + }, + { + "epoch": 0.45, + "grad_norm": 1.0692156727939737, + "learning_rate": 6.03450704375113e-07, + "loss": 0.4134, + "step": 7067 + }, + { + "epoch": 0.45, + "grad_norm": 0.9839753343140111, + "learning_rate": 6.033496629108736e-07, + "loss": 0.2069, + "step": 7068 + }, + { + "epoch": 0.45, + "grad_norm": 0.9041922192505167, + "learning_rate": 6.032486170375296e-07, + "loss": 0.0116, + "step": 7069 + }, + { + "epoch": 0.45, + "grad_norm": 0.4681185734971004, + "learning_rate": 6.031475667593919e-07, + "loss": 0.2982, + "step": 7070 + }, + { + "epoch": 0.45, + "grad_norm": 0.7860744215186456, + "learning_rate": 6.030465120807719e-07, + "loss": 0.0097, + "step": 7071 + }, + { + "epoch": 0.45, + "grad_norm": 0.9383937659706048, + "learning_rate": 6.029454530059806e-07, + "loss": 0.1902, + "step": 7072 + }, + { + "epoch": 0.45, + "grad_norm": 1.718398069911204, + "learning_rate": 6.028443895393291e-07, + "loss": 0.3911, + "step": 7073 + }, + { + "epoch": 0.45, + "grad_norm": 0.06919293034300165, + "learning_rate": 6.027433216851294e-07, + "loss": 0.0005, + "step": 7074 + }, + { + "epoch": 0.45, + "grad_norm": 1.0320483799463922, + "learning_rate": 6.026422494476929e-07, + "loss": 0.1796, + "step": 7075 + }, + { + "epoch": 0.45, + "grad_norm": 0.9449293296619714, + "learning_rate": 6.025411728313317e-07, + "loss": 0.1266, + "step": 7076 + }, + { + "epoch": 0.45, + "grad_norm": 0.2993277087210431, + "learning_rate": 6.02440091840358e-07, + "loss": 0.1716, + "step": 7077 + }, + { + "epoch": 0.45, + "grad_norm": 0.945913960885128, + "learning_rate": 6.023390064790841e-07, + "loss": 0.298, + "step": 7078 + }, + { + "epoch": 0.45, + "grad_norm": 0.8939332327962871, + "learning_rate": 6.022379167518225e-07, + "loss": 0.0611, + "step": 7079 + }, + { + "epoch": 0.45, + "grad_norm": 0.9663232671040488, + "learning_rate": 6.021368226628857e-07, + "loss": 0.4879, + "step": 7080 + }, + { + "epoch": 0.45, + "grad_norm": 0.3242663789682058, + "learning_rate": 6.020357242165868e-07, + "loss": 0.1704, + "step": 7081 + }, + { + "epoch": 0.45, + "grad_norm": 0.9455796030817295, + "learning_rate": 6.019346214172388e-07, + "loss": 0.1807, + "step": 7082 + }, + { + "epoch": 0.45, + "grad_norm": 1.4402759090689443, + "learning_rate": 6.018335142691548e-07, + "loss": 0.0593, + "step": 7083 + }, + { + "epoch": 0.45, + "grad_norm": 1.2060208908726957, + "learning_rate": 6.017324027766486e-07, + "loss": 0.2904, + "step": 7084 + }, + { + "epoch": 0.45, + "grad_norm": 0.7129259771847446, + "learning_rate": 6.016312869440334e-07, + "loss": 0.4456, + "step": 7085 + }, + { + "epoch": 0.45, + "grad_norm": 0.7854997394334815, + "learning_rate": 6.015301667756233e-07, + "loss": 0.1428, + "step": 7086 + }, + { + "epoch": 0.45, + "grad_norm": 0.48994875507244606, + "learning_rate": 6.014290422757322e-07, + "loss": 0.2895, + "step": 7087 + }, + { + "epoch": 0.45, + "grad_norm": 0.7294957042734611, + "learning_rate": 6.013279134486742e-07, + "loss": 0.0426, + "step": 7088 + }, + { + "epoch": 0.45, + "grad_norm": 0.7434832182245137, + "learning_rate": 6.012267802987636e-07, + "loss": 0.1383, + "step": 7089 + }, + { + "epoch": 0.45, + "grad_norm": 0.4636452503714737, + "learning_rate": 6.011256428303152e-07, + "loss": 0.1793, + "step": 7090 + }, + { + "epoch": 0.45, + "grad_norm": 0.7988272816439137, + "learning_rate": 6.010245010476436e-07, + "loss": 0.3583, + "step": 7091 + }, + { + "epoch": 0.45, + "grad_norm": 0.9552552379750733, + "learning_rate": 6.009233549550636e-07, + "loss": 0.3605, + "step": 7092 + }, + { + "epoch": 0.45, + "grad_norm": 9.43371975585381, + "learning_rate": 6.008222045568907e-07, + "loss": 0.1882, + "step": 7093 + }, + { + "epoch": 0.45, + "grad_norm": 1.3571109562706452, + "learning_rate": 6.007210498574395e-07, + "loss": 0.2746, + "step": 7094 + }, + { + "epoch": 0.45, + "grad_norm": 0.4418232209930374, + "learning_rate": 6.00619890861026e-07, + "loss": 0.1772, + "step": 7095 + }, + { + "epoch": 0.45, + "grad_norm": 5.088758666672563, + "learning_rate": 6.005187275719657e-07, + "loss": 0.0755, + "step": 7096 + }, + { + "epoch": 0.45, + "grad_norm": 0.5687223310030136, + "learning_rate": 6.004175599945743e-07, + "loss": 0.1623, + "step": 7097 + }, + { + "epoch": 0.45, + "grad_norm": 1.01592613710664, + "learning_rate": 6.003163881331681e-07, + "loss": 0.2278, + "step": 7098 + }, + { + "epoch": 0.45, + "grad_norm": 0.3370256316854185, + "learning_rate": 6.002152119920629e-07, + "loss": 0.1968, + "step": 7099 + }, + { + "epoch": 0.45, + "grad_norm": 0.4808284341035867, + "learning_rate": 6.001140315755755e-07, + "loss": 0.2689, + "step": 7100 + }, + { + "epoch": 0.45, + "grad_norm": 8.011765000959112, + "learning_rate": 6.000128468880222e-07, + "loss": 0.3323, + "step": 7101 + }, + { + "epoch": 0.45, + "grad_norm": 0.8513305845378742, + "learning_rate": 5.999116579337198e-07, + "loss": 0.2085, + "step": 7102 + }, + { + "epoch": 0.45, + "grad_norm": 0.6186146638521367, + "learning_rate": 5.998104647169852e-07, + "loss": 0.0298, + "step": 7103 + }, + { + "epoch": 0.45, + "grad_norm": 0.7770787457397188, + "learning_rate": 5.997092672421356e-07, + "loss": 0.2867, + "step": 7104 + }, + { + "epoch": 0.45, + "grad_norm": 0.6759549912096738, + "learning_rate": 5.996080655134881e-07, + "loss": 0.3032, + "step": 7105 + }, + { + "epoch": 0.45, + "grad_norm": 1.375055395363622, + "learning_rate": 5.995068595353604e-07, + "loss": 0.0204, + "step": 7106 + }, + { + "epoch": 0.45, + "grad_norm": 0.8979471716423942, + "learning_rate": 5.994056493120699e-07, + "loss": 0.2618, + "step": 7107 + }, + { + "epoch": 0.45, + "grad_norm": 0.5738377713029338, + "learning_rate": 5.993044348479347e-07, + "loss": 0.1037, + "step": 7108 + }, + { + "epoch": 0.45, + "grad_norm": 0.5764327562948564, + "learning_rate": 5.992032161472726e-07, + "loss": 0.1666, + "step": 7109 + }, + { + "epoch": 0.45, + "grad_norm": 1.5461270918707353, + "learning_rate": 5.99101993214402e-07, + "loss": 0.3417, + "step": 7110 + }, + { + "epoch": 0.45, + "grad_norm": 1.4657147448614762, + "learning_rate": 5.99000766053641e-07, + "loss": 0.1103, + "step": 7111 + }, + { + "epoch": 0.45, + "grad_norm": 0.8157136799194386, + "learning_rate": 5.988995346693084e-07, + "loss": 0.1337, + "step": 7112 + }, + { + "epoch": 0.45, + "grad_norm": 0.46844818672747407, + "learning_rate": 5.987982990657228e-07, + "loss": 0.1807, + "step": 7113 + }, + { + "epoch": 0.45, + "grad_norm": 1.8863593073604794, + "learning_rate": 5.986970592472033e-07, + "loss": 0.3098, + "step": 7114 + }, + { + "epoch": 0.45, + "grad_norm": 0.5580882911711244, + "learning_rate": 5.985958152180686e-07, + "loss": 0.2203, + "step": 7115 + }, + { + "epoch": 0.45, + "grad_norm": 0.9407780401334193, + "learning_rate": 5.984945669826382e-07, + "loss": 0.2465, + "step": 7116 + }, + { + "epoch": 0.45, + "grad_norm": 0.699661212881519, + "learning_rate": 5.983933145452318e-07, + "loss": 0.2772, + "step": 7117 + }, + { + "epoch": 0.45, + "grad_norm": 0.24115919400100697, + "learning_rate": 5.982920579101687e-07, + "loss": 0.0113, + "step": 7118 + }, + { + "epoch": 0.45, + "grad_norm": 0.5354065445651078, + "learning_rate": 5.981907970817688e-07, + "loss": 0.1612, + "step": 7119 + }, + { + "epoch": 0.45, + "grad_norm": 0.6437441543164536, + "learning_rate": 5.980895320643521e-07, + "loss": 0.2031, + "step": 7120 + }, + { + "epoch": 0.45, + "grad_norm": 0.7059838005150271, + "learning_rate": 5.979882628622389e-07, + "loss": 0.4494, + "step": 7121 + }, + { + "epoch": 0.45, + "grad_norm": 0.31758025093563497, + "learning_rate": 5.978869894797493e-07, + "loss": 0.1973, + "step": 7122 + }, + { + "epoch": 0.45, + "grad_norm": 0.6120633224019312, + "learning_rate": 5.97785711921204e-07, + "loss": 0.3558, + "step": 7123 + }, + { + "epoch": 0.45, + "grad_norm": 1.312367372031094, + "learning_rate": 5.976844301909236e-07, + "loss": 0.2304, + "step": 7124 + }, + { + "epoch": 0.45, + "grad_norm": 0.3313741831082385, + "learning_rate": 5.975831442932291e-07, + "loss": 0.1006, + "step": 7125 + }, + { + "epoch": 0.45, + "grad_norm": 0.5129312842561959, + "learning_rate": 5.974818542324414e-07, + "loss": 0.22, + "step": 7126 + }, + { + "epoch": 0.45, + "grad_norm": 0.32401724292769385, + "learning_rate": 5.97380560012882e-07, + "loss": 0.2122, + "step": 7127 + }, + { + "epoch": 0.45, + "grad_norm": 0.4625502269175985, + "learning_rate": 5.972792616388721e-07, + "loss": 0.055, + "step": 7128 + }, + { + "epoch": 0.45, + "grad_norm": 0.9942322616535594, + "learning_rate": 5.971779591147332e-07, + "loss": 0.1874, + "step": 7129 + }, + { + "epoch": 0.45, + "grad_norm": 0.4563264253733514, + "learning_rate": 5.970766524447875e-07, + "loss": 0.087, + "step": 7130 + }, + { + "epoch": 0.45, + "grad_norm": 1.2157699901808448, + "learning_rate": 5.969753416333564e-07, + "loss": 0.0866, + "step": 7131 + }, + { + "epoch": 0.45, + "grad_norm": 0.8444079709273857, + "learning_rate": 5.968740266847623e-07, + "loss": 0.1228, + "step": 7132 + }, + { + "epoch": 0.45, + "grad_norm": 0.34704186053735525, + "learning_rate": 5.967727076033274e-07, + "loss": 0.0757, + "step": 7133 + }, + { + "epoch": 0.45, + "grad_norm": 1.3237010936906624, + "learning_rate": 5.966713843933746e-07, + "loss": 0.0512, + "step": 7134 + }, + { + "epoch": 0.46, + "grad_norm": 5.932547110155241, + "learning_rate": 5.965700570592261e-07, + "loss": 0.245, + "step": 7135 + }, + { + "epoch": 0.46, + "grad_norm": 0.7496287466544687, + "learning_rate": 5.964687256052046e-07, + "loss": 0.2027, + "step": 7136 + }, + { + "epoch": 0.46, + "grad_norm": 0.30193107065623703, + "learning_rate": 5.963673900356335e-07, + "loss": 0.092, + "step": 7137 + }, + { + "epoch": 0.46, + "grad_norm": 2.332263676956981, + "learning_rate": 5.962660503548358e-07, + "loss": 0.1935, + "step": 7138 + }, + { + "epoch": 0.46, + "grad_norm": 0.6921697636905982, + "learning_rate": 5.961647065671349e-07, + "loss": 0.2038, + "step": 7139 + }, + { + "epoch": 0.46, + "grad_norm": 1.7691815576656338, + "learning_rate": 5.960633586768542e-07, + "loss": 0.1293, + "step": 7140 + }, + { + "epoch": 0.46, + "grad_norm": 0.5950655910490383, + "learning_rate": 5.959620066883175e-07, + "loss": 0.163, + "step": 7141 + }, + { + "epoch": 0.46, + "grad_norm": 0.6732396003329623, + "learning_rate": 5.958606506058488e-07, + "loss": 0.1545, + "step": 7142 + }, + { + "epoch": 0.46, + "grad_norm": 0.5869733926350481, + "learning_rate": 5.95759290433772e-07, + "loss": 0.0886, + "step": 7143 + }, + { + "epoch": 0.46, + "grad_norm": 0.8168037275744525, + "learning_rate": 5.956579261764115e-07, + "loss": 0.3139, + "step": 7144 + }, + { + "epoch": 0.46, + "grad_norm": 0.32485987063036015, + "learning_rate": 5.955565578380914e-07, + "loss": 0.1552, + "step": 7145 + }, + { + "epoch": 0.46, + "grad_norm": 0.31954013098482065, + "learning_rate": 5.954551854231365e-07, + "loss": 0.086, + "step": 7146 + }, + { + "epoch": 0.46, + "grad_norm": 0.2954219369681322, + "learning_rate": 5.953538089358713e-07, + "loss": 0.0162, + "step": 7147 + }, + { + "epoch": 0.46, + "grad_norm": 0.66192513564281, + "learning_rate": 5.952524283806214e-07, + "loss": 0.2002, + "step": 7148 + }, + { + "epoch": 0.46, + "grad_norm": 0.6250885992829237, + "learning_rate": 5.95151043761711e-07, + "loss": 0.2374, + "step": 7149 + }, + { + "epoch": 0.46, + "grad_norm": 0.30212655217801865, + "learning_rate": 5.950496550834659e-07, + "loss": 0.1873, + "step": 7150 + }, + { + "epoch": 0.46, + "grad_norm": 1.091021040170508, + "learning_rate": 5.949482623502116e-07, + "loss": 0.3885, + "step": 7151 + }, + { + "epoch": 0.46, + "grad_norm": 0.6016881690483314, + "learning_rate": 5.948468655662734e-07, + "loss": 0.311, + "step": 7152 + }, + { + "epoch": 0.46, + "grad_norm": 0.8520165271748031, + "learning_rate": 5.947454647359774e-07, + "loss": 0.4149, + "step": 7153 + }, + { + "epoch": 0.46, + "grad_norm": 0.6822457209307743, + "learning_rate": 5.946440598636492e-07, + "loss": 0.1621, + "step": 7154 + }, + { + "epoch": 0.46, + "grad_norm": 0.9286637757220354, + "learning_rate": 5.945426509536152e-07, + "loss": 0.1993, + "step": 7155 + }, + { + "epoch": 0.46, + "grad_norm": 0.4551563553112895, + "learning_rate": 5.944412380102017e-07, + "loss": 0.1555, + "step": 7156 + }, + { + "epoch": 0.46, + "grad_norm": 1.549513819768842, + "learning_rate": 5.943398210377352e-07, + "loss": 0.317, + "step": 7157 + }, + { + "epoch": 0.46, + "grad_norm": 0.8022045638220008, + "learning_rate": 5.942384000405423e-07, + "loss": 0.2718, + "step": 7158 + }, + { + "epoch": 0.46, + "grad_norm": 0.5936247476974245, + "learning_rate": 5.941369750229497e-07, + "loss": 0.2676, + "step": 7159 + }, + { + "epoch": 0.46, + "grad_norm": 0.43999162216149557, + "learning_rate": 5.940355459892844e-07, + "loss": 0.3624, + "step": 7160 + }, + { + "epoch": 0.46, + "grad_norm": 2.844207234831347, + "learning_rate": 5.939341129438738e-07, + "loss": 0.054, + "step": 7161 + }, + { + "epoch": 0.46, + "grad_norm": 0.5082391569111769, + "learning_rate": 5.938326758910453e-07, + "loss": 0.2208, + "step": 7162 + }, + { + "epoch": 0.46, + "grad_norm": 1.172219731786886, + "learning_rate": 5.93731234835126e-07, + "loss": 0.3387, + "step": 7163 + }, + { + "epoch": 0.46, + "grad_norm": 0.4864323174163239, + "learning_rate": 5.936297897804439e-07, + "loss": 0.2884, + "step": 7164 + }, + { + "epoch": 0.46, + "grad_norm": 0.4171003748458937, + "learning_rate": 5.935283407313268e-07, + "loss": 0.1463, + "step": 7165 + }, + { + "epoch": 0.46, + "grad_norm": 1.2970815946203582, + "learning_rate": 5.934268876921025e-07, + "loss": 0.3739, + "step": 7166 + }, + { + "epoch": 0.46, + "grad_norm": 0.5236303770620295, + "learning_rate": 5.933254306670994e-07, + "loss": 0.1736, + "step": 7167 + }, + { + "epoch": 0.46, + "grad_norm": 0.5700171082654275, + "learning_rate": 5.932239696606457e-07, + "loss": 0.2731, + "step": 7168 + }, + { + "epoch": 0.46, + "grad_norm": 0.43685907066280566, + "learning_rate": 5.931225046770703e-07, + "loss": 0.2653, + "step": 7169 + }, + { + "epoch": 0.46, + "grad_norm": 2.8688550234332038, + "learning_rate": 5.930210357207015e-07, + "loss": 0.2695, + "step": 7170 + }, + { + "epoch": 0.46, + "grad_norm": 1.0139315492143588, + "learning_rate": 5.929195627958683e-07, + "loss": 0.1514, + "step": 7171 + }, + { + "epoch": 0.46, + "grad_norm": 1.563002708663235, + "learning_rate": 5.928180859068999e-07, + "loss": 0.1547, + "step": 7172 + }, + { + "epoch": 0.46, + "grad_norm": 1.5001619767205903, + "learning_rate": 5.927166050581252e-07, + "loss": 0.033, + "step": 7173 + }, + { + "epoch": 0.46, + "grad_norm": 0.6917706009406938, + "learning_rate": 5.926151202538739e-07, + "loss": 0.1704, + "step": 7174 + }, + { + "epoch": 0.46, + "grad_norm": 0.9353287183536301, + "learning_rate": 5.925136314984753e-07, + "loss": 0.2011, + "step": 7175 + }, + { + "epoch": 0.46, + "grad_norm": 0.6112538646341721, + "learning_rate": 5.924121387962593e-07, + "loss": 0.0681, + "step": 7176 + }, + { + "epoch": 0.46, + "grad_norm": 0.858619915778985, + "learning_rate": 5.923106421515556e-07, + "loss": 0.3036, + "step": 7177 + }, + { + "epoch": 0.46, + "grad_norm": 5.354676634271242, + "learning_rate": 5.922091415686944e-07, + "loss": 0.0096, + "step": 7178 + }, + { + "epoch": 0.46, + "grad_norm": 1.2571455296787604, + "learning_rate": 5.921076370520057e-07, + "loss": 0.1193, + "step": 7179 + }, + { + "epoch": 0.46, + "grad_norm": 0.736715975373125, + "learning_rate": 5.920061286058202e-07, + "loss": 0.1809, + "step": 7180 + }, + { + "epoch": 0.46, + "grad_norm": 7.068782573052699, + "learning_rate": 5.919046162344683e-07, + "loss": 0.0239, + "step": 7181 + }, + { + "epoch": 0.46, + "grad_norm": 1.3297467235529592, + "learning_rate": 5.918030999422808e-07, + "loss": 0.0886, + "step": 7182 + }, + { + "epoch": 0.46, + "grad_norm": 0.7417717017004508, + "learning_rate": 5.917015797335882e-07, + "loss": 0.1744, + "step": 7183 + }, + { + "epoch": 0.46, + "grad_norm": 1.0933639592682791, + "learning_rate": 5.916000556127221e-07, + "loss": 0.1144, + "step": 7184 + }, + { + "epoch": 0.46, + "grad_norm": 0.17164216430584833, + "learning_rate": 5.914985275840135e-07, + "loss": 0.0051, + "step": 7185 + }, + { + "epoch": 0.46, + "grad_norm": 0.6620853225118942, + "learning_rate": 5.913969956517936e-07, + "loss": 0.1639, + "step": 7186 + }, + { + "epoch": 0.46, + "grad_norm": 1.0864099238793041, + "learning_rate": 5.912954598203943e-07, + "loss": 0.3266, + "step": 7187 + }, + { + "epoch": 0.46, + "grad_norm": 0.6563817289774813, + "learning_rate": 5.91193920094147e-07, + "loss": 0.3287, + "step": 7188 + }, + { + "epoch": 0.46, + "grad_norm": 0.7480545118849424, + "learning_rate": 5.910923764773841e-07, + "loss": 0.0214, + "step": 7189 + }, + { + "epoch": 0.46, + "grad_norm": 0.7998208232146227, + "learning_rate": 5.90990828974437e-07, + "loss": 0.3755, + "step": 7190 + }, + { + "epoch": 0.46, + "grad_norm": 0.85748334831464, + "learning_rate": 5.908892775896383e-07, + "loss": 0.0507, + "step": 7191 + }, + { + "epoch": 0.46, + "grad_norm": 1.1970218213760373, + "learning_rate": 5.907877223273202e-07, + "loss": 0.2678, + "step": 7192 + }, + { + "epoch": 0.46, + "grad_norm": 1.2924320625986172, + "learning_rate": 5.906861631918155e-07, + "loss": 0.2574, + "step": 7193 + }, + { + "epoch": 0.46, + "grad_norm": 0.7987424260681844, + "learning_rate": 5.905846001874566e-07, + "loss": 0.1193, + "step": 7194 + }, + { + "epoch": 0.46, + "grad_norm": 0.6319274213415333, + "learning_rate": 5.904830333185768e-07, + "loss": 0.1224, + "step": 7195 + }, + { + "epoch": 0.46, + "grad_norm": 0.3711639795844755, + "learning_rate": 5.903814625895088e-07, + "loss": 0.0397, + "step": 7196 + }, + { + "epoch": 0.46, + "grad_norm": 0.3745274574843632, + "learning_rate": 5.902798880045858e-07, + "loss": 0.0753, + "step": 7197 + }, + { + "epoch": 0.46, + "grad_norm": 4.56204350914102, + "learning_rate": 5.901783095681414e-07, + "loss": 0.0599, + "step": 7198 + }, + { + "epoch": 0.46, + "grad_norm": 1.1506934707785388, + "learning_rate": 5.900767272845091e-07, + "loss": 0.212, + "step": 7199 + }, + { + "epoch": 0.46, + "grad_norm": 0.5970370856193812, + "learning_rate": 5.899751411580224e-07, + "loss": 0.2976, + "step": 7200 + }, + { + "epoch": 0.46, + "grad_norm": 0.9259827639952576, + "learning_rate": 5.898735511930155e-07, + "loss": 0.2, + "step": 7201 + }, + { + "epoch": 0.46, + "grad_norm": 0.46751645919753676, + "learning_rate": 5.89771957393822e-07, + "loss": 0.2628, + "step": 7202 + }, + { + "epoch": 0.46, + "grad_norm": 0.8725741363156909, + "learning_rate": 5.896703597647764e-07, + "loss": 0.2463, + "step": 7203 + }, + { + "epoch": 0.46, + "grad_norm": 1.4982264543897335, + "learning_rate": 5.89568758310213e-07, + "loss": 0.1472, + "step": 7204 + }, + { + "epoch": 0.46, + "grad_norm": 1.1028605339442303, + "learning_rate": 5.894671530344664e-07, + "loss": 0.193, + "step": 7205 + }, + { + "epoch": 0.46, + "grad_norm": 0.5001892897512012, + "learning_rate": 5.893655439418711e-07, + "loss": 0.088, + "step": 7206 + }, + { + "epoch": 0.46, + "grad_norm": 9.022140920096918, + "learning_rate": 5.892639310367622e-07, + "loss": 0.1458, + "step": 7207 + }, + { + "epoch": 0.46, + "grad_norm": 14.497112180372172, + "learning_rate": 5.891623143234744e-07, + "loss": 0.189, + "step": 7208 + }, + { + "epoch": 0.46, + "grad_norm": 0.30387165766925095, + "learning_rate": 5.89060693806343e-07, + "loss": 0.0739, + "step": 7209 + }, + { + "epoch": 0.46, + "grad_norm": 1.0157909781768688, + "learning_rate": 5.889590694897035e-07, + "loss": 0.5586, + "step": 7210 + }, + { + "epoch": 0.46, + "grad_norm": 1.1583784876385135, + "learning_rate": 5.888574413778913e-07, + "loss": 0.3272, + "step": 7211 + }, + { + "epoch": 0.46, + "grad_norm": 0.7133214714640085, + "learning_rate": 5.88755809475242e-07, + "loss": 0.1663, + "step": 7212 + }, + { + "epoch": 0.46, + "grad_norm": 0.5133567316724253, + "learning_rate": 5.886541737860912e-07, + "loss": 0.1052, + "step": 7213 + }, + { + "epoch": 0.46, + "grad_norm": 17.285625438249102, + "learning_rate": 5.885525343147754e-07, + "loss": 0.2791, + "step": 7214 + }, + { + "epoch": 0.46, + "grad_norm": 3.1601010633523465, + "learning_rate": 5.884508910656302e-07, + "loss": 0.1034, + "step": 7215 + }, + { + "epoch": 0.46, + "grad_norm": 1.7205234624779004, + "learning_rate": 5.883492440429925e-07, + "loss": 0.4425, + "step": 7216 + }, + { + "epoch": 0.46, + "grad_norm": 0.9092036765999852, + "learning_rate": 5.882475932511984e-07, + "loss": 0.2444, + "step": 7217 + }, + { + "epoch": 0.46, + "grad_norm": 1.966331431955261, + "learning_rate": 5.881459386945845e-07, + "loss": 0.1008, + "step": 7218 + }, + { + "epoch": 0.46, + "grad_norm": 0.7241359638013197, + "learning_rate": 5.880442803774877e-07, + "loss": 0.0397, + "step": 7219 + }, + { + "epoch": 0.46, + "grad_norm": 0.7865950024944246, + "learning_rate": 5.879426183042448e-07, + "loss": 0.3198, + "step": 7220 + }, + { + "epoch": 0.46, + "grad_norm": 3.2668959117213308, + "learning_rate": 5.878409524791929e-07, + "loss": 0.3804, + "step": 7221 + }, + { + "epoch": 0.46, + "grad_norm": 0.6940116579350895, + "learning_rate": 5.877392829066697e-07, + "loss": 0.1352, + "step": 7222 + }, + { + "epoch": 0.46, + "grad_norm": 0.7889425403151112, + "learning_rate": 5.876376095910122e-07, + "loss": 0.1604, + "step": 7223 + }, + { + "epoch": 0.46, + "grad_norm": 0.13743757225120593, + "learning_rate": 5.87535932536558e-07, + "loss": 0.0081, + "step": 7224 + }, + { + "epoch": 0.46, + "grad_norm": 0.3539409891951253, + "learning_rate": 5.874342517476451e-07, + "loss": 0.1702, + "step": 7225 + }, + { + "epoch": 0.46, + "grad_norm": 1.3253394447168811, + "learning_rate": 5.873325672286112e-07, + "loss": 0.1449, + "step": 7226 + }, + { + "epoch": 0.46, + "grad_norm": 0.1508727438993049, + "learning_rate": 5.872308789837943e-07, + "loss": 0.0957, + "step": 7227 + }, + { + "epoch": 0.46, + "grad_norm": 0.37757411481511055, + "learning_rate": 5.871291870175328e-07, + "loss": 0.1251, + "step": 7228 + }, + { + "epoch": 0.46, + "grad_norm": 0.16487198167342196, + "learning_rate": 5.87027491334165e-07, + "loss": 0.0046, + "step": 7229 + }, + { + "epoch": 0.46, + "grad_norm": 0.6371992172879106, + "learning_rate": 5.869257919380297e-07, + "loss": 0.124, + "step": 7230 + }, + { + "epoch": 0.46, + "grad_norm": 0.9435011963184733, + "learning_rate": 5.868240888334652e-07, + "loss": 0.2358, + "step": 7231 + }, + { + "epoch": 0.46, + "grad_norm": 0.6291321543008145, + "learning_rate": 5.867223820248105e-07, + "loss": 0.2749, + "step": 7232 + }, + { + "epoch": 0.46, + "grad_norm": 0.6028154079885149, + "learning_rate": 5.866206715164047e-07, + "loss": 0.2447, + "step": 7233 + }, + { + "epoch": 0.46, + "grad_norm": 0.243144110217354, + "learning_rate": 5.86518957312587e-07, + "loss": 0.0919, + "step": 7234 + }, + { + "epoch": 0.46, + "grad_norm": 1.7366644013030765, + "learning_rate": 5.864172394176965e-07, + "loss": 0.1285, + "step": 7235 + }, + { + "epoch": 0.46, + "grad_norm": 1.3381590499033993, + "learning_rate": 5.86315517836073e-07, + "loss": 0.1884, + "step": 7236 + }, + { + "epoch": 0.46, + "grad_norm": 0.9461233417057059, + "learning_rate": 5.862137925720559e-07, + "loss": 0.1757, + "step": 7237 + }, + { + "epoch": 0.46, + "grad_norm": 0.3710231970881075, + "learning_rate": 5.861120636299851e-07, + "loss": 0.2281, + "step": 7238 + }, + { + "epoch": 0.46, + "grad_norm": 0.6179070684772522, + "learning_rate": 5.860103310142005e-07, + "loss": 0.1681, + "step": 7239 + }, + { + "epoch": 0.46, + "grad_norm": 1.0201386018154825, + "learning_rate": 5.859085947290423e-07, + "loss": 0.3028, + "step": 7240 + }, + { + "epoch": 0.46, + "grad_norm": 0.8091902911533021, + "learning_rate": 5.858068547788509e-07, + "loss": 0.0124, + "step": 7241 + }, + { + "epoch": 0.46, + "grad_norm": 1.3980271825191142, + "learning_rate": 5.857051111679664e-07, + "loss": 0.3732, + "step": 7242 + }, + { + "epoch": 0.46, + "grad_norm": 0.8013402975837172, + "learning_rate": 5.856033639007297e-07, + "loss": 0.2799, + "step": 7243 + }, + { + "epoch": 0.46, + "grad_norm": 0.1687425807823621, + "learning_rate": 5.855016129814815e-07, + "loss": 0.0311, + "step": 7244 + }, + { + "epoch": 0.46, + "grad_norm": 1.3294547064513318, + "learning_rate": 5.853998584145624e-07, + "loss": 0.2819, + "step": 7245 + }, + { + "epoch": 0.46, + "grad_norm": 1.3755901930805092, + "learning_rate": 5.852981002043138e-07, + "loss": 0.3118, + "step": 7246 + }, + { + "epoch": 0.46, + "grad_norm": 1.289201689858387, + "learning_rate": 5.851963383550766e-07, + "loss": 0.106, + "step": 7247 + }, + { + "epoch": 0.46, + "grad_norm": 0.5300436654021281, + "learning_rate": 5.850945728711925e-07, + "loss": 0.1291, + "step": 7248 + }, + { + "epoch": 0.46, + "grad_norm": 0.6562759244648393, + "learning_rate": 5.849928037570028e-07, + "loss": 0.1758, + "step": 7249 + }, + { + "epoch": 0.46, + "grad_norm": 4.098462470269909, + "learning_rate": 5.848910310168493e-07, + "loss": 0.1566, + "step": 7250 + }, + { + "epoch": 0.46, + "grad_norm": 0.8302393051115002, + "learning_rate": 5.847892546550737e-07, + "loss": 0.128, + "step": 7251 + }, + { + "epoch": 0.46, + "grad_norm": 0.8359880903191275, + "learning_rate": 5.84687474676018e-07, + "loss": 0.3108, + "step": 7252 + }, + { + "epoch": 0.46, + "grad_norm": 1.4437891175800763, + "learning_rate": 5.845856910840245e-07, + "loss": 0.2676, + "step": 7253 + }, + { + "epoch": 0.46, + "grad_norm": 0.9646235215281507, + "learning_rate": 5.844839038834353e-07, + "loss": 0.0868, + "step": 7254 + }, + { + "epoch": 0.46, + "grad_norm": 0.6620742954436339, + "learning_rate": 5.84382113078593e-07, + "loss": 0.2169, + "step": 7255 + }, + { + "epoch": 0.46, + "grad_norm": 0.4860674138148537, + "learning_rate": 5.8428031867384e-07, + "loss": 0.1432, + "step": 7256 + }, + { + "epoch": 0.46, + "grad_norm": 0.5606129692529753, + "learning_rate": 5.841785206735192e-07, + "loss": 0.1234, + "step": 7257 + }, + { + "epoch": 0.46, + "grad_norm": 1.094353071114604, + "learning_rate": 5.840767190819736e-07, + "loss": 0.2795, + "step": 7258 + }, + { + "epoch": 0.46, + "grad_norm": 0.9150744126339434, + "learning_rate": 5.839749139035461e-07, + "loss": 0.191, + "step": 7259 + }, + { + "epoch": 0.46, + "grad_norm": 1.3703948391791727, + "learning_rate": 5.8387310514258e-07, + "loss": 0.0749, + "step": 7260 + }, + { + "epoch": 0.46, + "grad_norm": 0.5517259013873832, + "learning_rate": 5.837712928034187e-07, + "loss": 0.2893, + "step": 7261 + }, + { + "epoch": 0.46, + "grad_norm": 1.0500803493405015, + "learning_rate": 5.836694768904054e-07, + "loss": 0.3375, + "step": 7262 + }, + { + "epoch": 0.46, + "grad_norm": 2.9423254492124644, + "learning_rate": 5.835676574078842e-07, + "loss": 0.075, + "step": 7263 + }, + { + "epoch": 0.46, + "grad_norm": 0.7692318263987405, + "learning_rate": 5.834658343601987e-07, + "loss": 0.4155, + "step": 7264 + }, + { + "epoch": 0.46, + "grad_norm": 0.9579505383096613, + "learning_rate": 5.833640077516929e-07, + "loss": 0.1745, + "step": 7265 + }, + { + "epoch": 0.46, + "grad_norm": 1.2862268054084534, + "learning_rate": 5.832621775867109e-07, + "loss": 0.1967, + "step": 7266 + }, + { + "epoch": 0.46, + "grad_norm": 0.6713444781147704, + "learning_rate": 5.831603438695971e-07, + "loss": 0.3673, + "step": 7267 + }, + { + "epoch": 0.46, + "grad_norm": 1.3249779682100575, + "learning_rate": 5.830585066046958e-07, + "loss": 0.4114, + "step": 7268 + }, + { + "epoch": 0.46, + "grad_norm": 0.7909573625344419, + "learning_rate": 5.829566657963517e-07, + "loss": 0.0779, + "step": 7269 + }, + { + "epoch": 0.46, + "grad_norm": 1.3061708633493678, + "learning_rate": 5.828548214489095e-07, + "loss": 0.1772, + "step": 7270 + }, + { + "epoch": 0.46, + "grad_norm": 3.2316863525602373, + "learning_rate": 5.82752973566714e-07, + "loss": 0.1857, + "step": 7271 + }, + { + "epoch": 0.46, + "grad_norm": 0.3739109090293889, + "learning_rate": 5.826511221541104e-07, + "loss": 0.0953, + "step": 7272 + }, + { + "epoch": 0.46, + "grad_norm": 2.657124797840211, + "learning_rate": 5.825492672154437e-07, + "loss": 0.2445, + "step": 7273 + }, + { + "epoch": 0.46, + "grad_norm": 0.568751310419939, + "learning_rate": 5.824474087550593e-07, + "loss": 0.1064, + "step": 7274 + }, + { + "epoch": 0.46, + "grad_norm": 0.5527706509889663, + "learning_rate": 5.823455467773026e-07, + "loss": 0.3305, + "step": 7275 + }, + { + "epoch": 0.46, + "grad_norm": 0.7951595167062507, + "learning_rate": 5.822436812865194e-07, + "loss": 0.0214, + "step": 7276 + }, + { + "epoch": 0.46, + "grad_norm": 4.654831699863176, + "learning_rate": 5.821418122870556e-07, + "loss": 0.0929, + "step": 7277 + }, + { + "epoch": 0.46, + "grad_norm": 0.8192238158820631, + "learning_rate": 5.820399397832568e-07, + "loss": 0.399, + "step": 7278 + }, + { + "epoch": 0.46, + "grad_norm": 0.6710596774381883, + "learning_rate": 5.819380637794693e-07, + "loss": 0.2271, + "step": 7279 + }, + { + "epoch": 0.46, + "grad_norm": 1.0295387864219177, + "learning_rate": 5.818361842800392e-07, + "loss": 0.2058, + "step": 7280 + }, + { + "epoch": 0.46, + "grad_norm": 0.4595757345400026, + "learning_rate": 5.817343012893131e-07, + "loss": 0.0978, + "step": 7281 + }, + { + "epoch": 0.46, + "grad_norm": 1.1208460280131833, + "learning_rate": 5.816324148116374e-07, + "loss": 0.2242, + "step": 7282 + }, + { + "epoch": 0.46, + "grad_norm": 0.7370933209663031, + "learning_rate": 5.815305248513587e-07, + "loss": 0.209, + "step": 7283 + }, + { + "epoch": 0.46, + "grad_norm": 1.1254594372749078, + "learning_rate": 5.814286314128238e-07, + "loss": 0.1657, + "step": 7284 + }, + { + "epoch": 0.46, + "grad_norm": 1.5784525671144114, + "learning_rate": 5.8132673450038e-07, + "loss": 0.0118, + "step": 7285 + }, + { + "epoch": 0.46, + "grad_norm": 1.0887737723803113, + "learning_rate": 5.812248341183741e-07, + "loss": 0.1122, + "step": 7286 + }, + { + "epoch": 0.46, + "grad_norm": 0.3398571182371665, + "learning_rate": 5.811229302711536e-07, + "loss": 0.0988, + "step": 7287 + }, + { + "epoch": 0.46, + "grad_norm": 0.29411231775771174, + "learning_rate": 5.810210229630657e-07, + "loss": 0.029, + "step": 7288 + }, + { + "epoch": 0.46, + "grad_norm": 0.6714221210862924, + "learning_rate": 5.809191121984582e-07, + "loss": 0.0151, + "step": 7289 + }, + { + "epoch": 0.46, + "grad_norm": 0.7738093846839272, + "learning_rate": 5.808171979816786e-07, + "loss": 0.4841, + "step": 7290 + }, + { + "epoch": 0.46, + "grad_norm": 1.2247291459538128, + "learning_rate": 5.80715280317075e-07, + "loss": 0.1818, + "step": 7291 + }, + { + "epoch": 0.47, + "grad_norm": 0.17260494263047638, + "learning_rate": 5.80613359208995e-07, + "loss": 0.1391, + "step": 7292 + }, + { + "epoch": 0.47, + "grad_norm": 14.147421161271533, + "learning_rate": 5.805114346617873e-07, + "loss": 0.0911, + "step": 7293 + }, + { + "epoch": 0.47, + "grad_norm": 0.350897262005696, + "learning_rate": 5.804095066797999e-07, + "loss": 0.1607, + "step": 7294 + }, + { + "epoch": 0.47, + "grad_norm": 3.4318251929458587, + "learning_rate": 5.803075752673812e-07, + "loss": 0.025, + "step": 7295 + }, + { + "epoch": 0.47, + "grad_norm": 0.6625797293507669, + "learning_rate": 5.802056404288801e-07, + "loss": 0.009, + "step": 7296 + }, + { + "epoch": 0.47, + "grad_norm": 1.2150155533447429, + "learning_rate": 5.80103702168645e-07, + "loss": 0.348, + "step": 7297 + }, + { + "epoch": 0.47, + "grad_norm": 0.7820606372820846, + "learning_rate": 5.80001760491025e-07, + "loss": 0.3305, + "step": 7298 + }, + { + "epoch": 0.47, + "grad_norm": 0.3395156167890347, + "learning_rate": 5.798998154003691e-07, + "loss": 0.1866, + "step": 7299 + }, + { + "epoch": 0.47, + "grad_norm": 1.583684948082188, + "learning_rate": 5.797978669010264e-07, + "loss": 0.1602, + "step": 7300 + }, + { + "epoch": 0.47, + "grad_norm": 0.7808833631802821, + "learning_rate": 5.796959149973463e-07, + "loss": 0.2628, + "step": 7301 + }, + { + "epoch": 0.47, + "grad_norm": 0.4931297365788696, + "learning_rate": 5.795939596936782e-07, + "loss": 0.0382, + "step": 7302 + }, + { + "epoch": 0.47, + "grad_norm": 0.46770785079811705, + "learning_rate": 5.794920009943719e-07, + "loss": 0.1639, + "step": 7303 + }, + { + "epoch": 0.47, + "grad_norm": 0.5700771055707824, + "learning_rate": 5.793900389037769e-07, + "loss": 0.2143, + "step": 7304 + }, + { + "epoch": 0.47, + "grad_norm": 0.563425993375374, + "learning_rate": 5.792880734262433e-07, + "loss": 0.2519, + "step": 7305 + }, + { + "epoch": 0.47, + "grad_norm": 4.458020840421295, + "learning_rate": 5.791861045661211e-07, + "loss": 0.3534, + "step": 7306 + }, + { + "epoch": 0.47, + "grad_norm": 0.7477225624836562, + "learning_rate": 5.790841323277606e-07, + "loss": 0.1822, + "step": 7307 + }, + { + "epoch": 0.47, + "grad_norm": 1.1290463820403616, + "learning_rate": 5.789821567155119e-07, + "loss": 0.2205, + "step": 7308 + }, + { + "epoch": 0.47, + "grad_norm": 0.7053548395714695, + "learning_rate": 5.788801777337256e-07, + "loss": 0.1834, + "step": 7309 + }, + { + "epoch": 0.47, + "grad_norm": 3.024969451501184, + "learning_rate": 5.787781953867523e-07, + "loss": 0.247, + "step": 7310 + }, + { + "epoch": 0.47, + "grad_norm": 1.7774324720546493, + "learning_rate": 5.78676209678943e-07, + "loss": 0.2281, + "step": 7311 + }, + { + "epoch": 0.47, + "grad_norm": 0.5249247474929776, + "learning_rate": 5.785742206146483e-07, + "loss": 0.2471, + "step": 7312 + }, + { + "epoch": 0.47, + "grad_norm": 0.47158116919704907, + "learning_rate": 5.784722281982196e-07, + "loss": 0.1751, + "step": 7313 + }, + { + "epoch": 0.47, + "grad_norm": 1.7358734719182072, + "learning_rate": 5.783702324340078e-07, + "loss": 0.1933, + "step": 7314 + }, + { + "epoch": 0.47, + "grad_norm": 0.3645172137192145, + "learning_rate": 5.782682333263643e-07, + "loss": 0.1133, + "step": 7315 + }, + { + "epoch": 0.47, + "grad_norm": 10.867536241972235, + "learning_rate": 5.781662308796406e-07, + "loss": 0.0963, + "step": 7316 + }, + { + "epoch": 0.47, + "grad_norm": 0.6811480977103188, + "learning_rate": 5.780642250981884e-07, + "loss": 0.4057, + "step": 7317 + }, + { + "epoch": 0.47, + "grad_norm": 0.681771414756409, + "learning_rate": 5.779622159863593e-07, + "loss": 0.0947, + "step": 7318 + }, + { + "epoch": 0.47, + "grad_norm": 0.5311426383320308, + "learning_rate": 5.778602035485054e-07, + "loss": 0.0439, + "step": 7319 + }, + { + "epoch": 0.47, + "grad_norm": 1.0074352489639262, + "learning_rate": 5.777581877889787e-07, + "loss": 0.3747, + "step": 7320 + }, + { + "epoch": 0.47, + "grad_norm": 0.7402048501721525, + "learning_rate": 5.776561687121315e-07, + "loss": 0.3483, + "step": 7321 + }, + { + "epoch": 0.47, + "grad_norm": 0.7648789389158936, + "learning_rate": 5.77554146322316e-07, + "loss": 0.331, + "step": 7322 + }, + { + "epoch": 0.47, + "grad_norm": 2.650351293264006, + "learning_rate": 5.774521206238847e-07, + "loss": 0.1978, + "step": 7323 + }, + { + "epoch": 0.47, + "grad_norm": 8.773258554706342, + "learning_rate": 5.773500916211902e-07, + "loss": 0.2602, + "step": 7324 + }, + { + "epoch": 0.47, + "grad_norm": 1.148778741358844, + "learning_rate": 5.772480593185853e-07, + "loss": 0.0301, + "step": 7325 + }, + { + "epoch": 0.47, + "grad_norm": 1.766184658234635, + "learning_rate": 5.771460237204229e-07, + "loss": 0.2557, + "step": 7326 + }, + { + "epoch": 0.47, + "grad_norm": 0.39052958858279657, + "learning_rate": 5.770439848310562e-07, + "loss": 0.1391, + "step": 7327 + }, + { + "epoch": 0.47, + "grad_norm": 1.5991400704799148, + "learning_rate": 5.769419426548381e-07, + "loss": 0.1637, + "step": 7328 + }, + { + "epoch": 0.47, + "grad_norm": 0.5392510406057364, + "learning_rate": 5.76839897196122e-07, + "loss": 0.1154, + "step": 7329 + }, + { + "epoch": 0.47, + "grad_norm": 0.5869935013530925, + "learning_rate": 5.767378484592616e-07, + "loss": 0.2897, + "step": 7330 + }, + { + "epoch": 0.47, + "grad_norm": 0.40047283983111004, + "learning_rate": 5.766357964486102e-07, + "loss": 0.2059, + "step": 7331 + }, + { + "epoch": 0.47, + "grad_norm": 0.2511239699126518, + "learning_rate": 5.765337411685216e-07, + "loss": 0.1408, + "step": 7332 + }, + { + "epoch": 0.47, + "grad_norm": 0.6251798292657806, + "learning_rate": 5.764316826233498e-07, + "loss": 0.2621, + "step": 7333 + }, + { + "epoch": 0.47, + "grad_norm": 0.5509022837755441, + "learning_rate": 5.763296208174488e-07, + "loss": 0.2851, + "step": 7334 + }, + { + "epoch": 0.47, + "grad_norm": 0.8798314347754198, + "learning_rate": 5.762275557551726e-07, + "loss": 0.3619, + "step": 7335 + }, + { + "epoch": 0.47, + "grad_norm": 0.1918609967717406, + "learning_rate": 5.761254874408759e-07, + "loss": 0.0782, + "step": 7336 + }, + { + "epoch": 0.47, + "grad_norm": 0.9119931217971832, + "learning_rate": 5.760234158789126e-07, + "loss": 0.4166, + "step": 7337 + }, + { + "epoch": 0.47, + "grad_norm": 0.8541241720669797, + "learning_rate": 5.759213410736376e-07, + "loss": 0.0546, + "step": 7338 + }, + { + "epoch": 0.47, + "grad_norm": 0.6403395795517418, + "learning_rate": 5.758192630294058e-07, + "loss": 0.2085, + "step": 7339 + }, + { + "epoch": 0.47, + "grad_norm": 0.6724951175395882, + "learning_rate": 5.757171817505716e-07, + "loss": 0.2122, + "step": 7340 + }, + { + "epoch": 0.47, + "grad_norm": 1.7958559112389705, + "learning_rate": 5.756150972414903e-07, + "loss": 0.0467, + "step": 7341 + }, + { + "epoch": 0.47, + "grad_norm": 0.7387496108044898, + "learning_rate": 5.75513009506517e-07, + "loss": 0.1968, + "step": 7342 + }, + { + "epoch": 0.47, + "grad_norm": 0.9963018733725529, + "learning_rate": 5.754109185500069e-07, + "loss": 0.3962, + "step": 7343 + }, + { + "epoch": 0.47, + "grad_norm": 1.5125945984411493, + "learning_rate": 5.753088243763153e-07, + "loss": 0.061, + "step": 7344 + }, + { + "epoch": 0.47, + "grad_norm": 7.129892852925207, + "learning_rate": 5.752067269897979e-07, + "loss": 0.0721, + "step": 7345 + }, + { + "epoch": 0.47, + "grad_norm": 0.7931316184757881, + "learning_rate": 5.751046263948104e-07, + "loss": 0.0084, + "step": 7346 + }, + { + "epoch": 0.47, + "grad_norm": 0.9499235591276186, + "learning_rate": 5.750025225957085e-07, + "loss": 0.1203, + "step": 7347 + }, + { + "epoch": 0.47, + "grad_norm": 0.563436624583075, + "learning_rate": 5.749004155968482e-07, + "loss": 0.1158, + "step": 7348 + }, + { + "epoch": 0.47, + "grad_norm": 0.8241276446340091, + "learning_rate": 5.747983054025856e-07, + "loss": 0.2044, + "step": 7349 + }, + { + "epoch": 0.47, + "grad_norm": 3.101218515866801, + "learning_rate": 5.746961920172771e-07, + "loss": 0.1827, + "step": 7350 + }, + { + "epoch": 0.47, + "grad_norm": 1.3685416761720732, + "learning_rate": 5.745940754452787e-07, + "loss": 0.0704, + "step": 7351 + }, + { + "epoch": 0.47, + "grad_norm": 1.328030911984748, + "learning_rate": 5.744919556909472e-07, + "loss": 0.2337, + "step": 7352 + }, + { + "epoch": 0.47, + "grad_norm": 2.5497695070730573, + "learning_rate": 5.74389832758639e-07, + "loss": 0.1309, + "step": 7353 + }, + { + "epoch": 0.47, + "grad_norm": 0.7591838272308836, + "learning_rate": 5.742877066527112e-07, + "loss": 0.0285, + "step": 7354 + }, + { + "epoch": 0.47, + "grad_norm": 0.8302343064697865, + "learning_rate": 5.741855773775204e-07, + "loss": 0.2295, + "step": 7355 + }, + { + "epoch": 0.47, + "grad_norm": 0.5090856157851922, + "learning_rate": 5.740834449374237e-07, + "loss": 0.2969, + "step": 7356 + }, + { + "epoch": 0.47, + "grad_norm": 0.2033450691961722, + "learning_rate": 5.739813093367783e-07, + "loss": 0.0484, + "step": 7357 + }, + { + "epoch": 0.47, + "grad_norm": 0.584509287221558, + "learning_rate": 5.738791705799415e-07, + "loss": 0.1798, + "step": 7358 + }, + { + "epoch": 0.47, + "grad_norm": 0.4247587619689443, + "learning_rate": 5.737770286712708e-07, + "loss": 0.285, + "step": 7359 + }, + { + "epoch": 0.47, + "grad_norm": 3.6729317035023294, + "learning_rate": 5.736748836151237e-07, + "loss": 0.1208, + "step": 7360 + }, + { + "epoch": 0.47, + "grad_norm": 0.6018869856424918, + "learning_rate": 5.73572735415858e-07, + "loss": 0.1735, + "step": 7361 + }, + { + "epoch": 0.47, + "grad_norm": 1.0309512263073584, + "learning_rate": 5.734705840778315e-07, + "loss": 0.5402, + "step": 7362 + }, + { + "epoch": 0.47, + "grad_norm": 0.3507469001297476, + "learning_rate": 5.733684296054022e-07, + "loss": 0.1199, + "step": 7363 + }, + { + "epoch": 0.47, + "grad_norm": 3.592307700239587, + "learning_rate": 5.732662720029282e-07, + "loss": 0.1362, + "step": 7364 + }, + { + "epoch": 0.47, + "grad_norm": 2.3931602978053745, + "learning_rate": 5.731641112747679e-07, + "loss": 0.1187, + "step": 7365 + }, + { + "epoch": 0.47, + "grad_norm": 1.1124017828310615, + "learning_rate": 5.730619474252792e-07, + "loss": 0.1946, + "step": 7366 + }, + { + "epoch": 0.47, + "grad_norm": 0.5105519902196064, + "learning_rate": 5.729597804588212e-07, + "loss": 0.2302, + "step": 7367 + }, + { + "epoch": 0.47, + "grad_norm": 1.2018239662291923, + "learning_rate": 5.728576103797524e-07, + "loss": 0.4881, + "step": 7368 + }, + { + "epoch": 0.47, + "grad_norm": 1.1376805720728558, + "learning_rate": 5.727554371924313e-07, + "loss": 0.3844, + "step": 7369 + }, + { + "epoch": 0.47, + "grad_norm": 0.6324332393685804, + "learning_rate": 5.72653260901217e-07, + "loss": 0.4303, + "step": 7370 + }, + { + "epoch": 0.47, + "grad_norm": 0.65881572345634, + "learning_rate": 5.725510815104685e-07, + "loss": 0.0819, + "step": 7371 + }, + { + "epoch": 0.47, + "grad_norm": 1.7395975743305332, + "learning_rate": 5.724488990245451e-07, + "loss": 0.1954, + "step": 7372 + }, + { + "epoch": 0.47, + "grad_norm": 0.9277443571557993, + "learning_rate": 5.723467134478059e-07, + "loss": 0.2163, + "step": 7373 + }, + { + "epoch": 0.47, + "grad_norm": 0.9069014862888083, + "learning_rate": 5.722445247846106e-07, + "loss": 0.2253, + "step": 7374 + }, + { + "epoch": 0.47, + "grad_norm": 0.11844499542924179, + "learning_rate": 5.721423330393187e-07, + "loss": 0.0032, + "step": 7375 + }, + { + "epoch": 0.47, + "grad_norm": 0.8113056982316724, + "learning_rate": 5.720401382162898e-07, + "loss": 0.1503, + "step": 7376 + }, + { + "epoch": 0.47, + "grad_norm": 0.9212818922283537, + "learning_rate": 5.719379403198837e-07, + "loss": 0.1709, + "step": 7377 + }, + { + "epoch": 0.47, + "grad_norm": 0.9404735582256118, + "learning_rate": 5.718357393544605e-07, + "loss": 0.1065, + "step": 7378 + }, + { + "epoch": 0.47, + "grad_norm": 0.4801437221205867, + "learning_rate": 5.717335353243802e-07, + "loss": 0.1478, + "step": 7379 + }, + { + "epoch": 0.47, + "grad_norm": 1.4510494955548074, + "learning_rate": 5.716313282340032e-07, + "loss": 0.159, + "step": 7380 + }, + { + "epoch": 0.47, + "grad_norm": 1.5956475286412855, + "learning_rate": 5.715291180876896e-07, + "loss": 0.1442, + "step": 7381 + }, + { + "epoch": 0.47, + "grad_norm": 5.172564143546517, + "learning_rate": 5.714269048898002e-07, + "loss": 0.0202, + "step": 7382 + }, + { + "epoch": 0.47, + "grad_norm": 1.574979745638005, + "learning_rate": 5.713246886446953e-07, + "loss": 0.0843, + "step": 7383 + }, + { + "epoch": 0.47, + "grad_norm": 0.9970643512039582, + "learning_rate": 5.712224693567358e-07, + "loss": 0.1385, + "step": 7384 + }, + { + "epoch": 0.47, + "grad_norm": 0.6911268598909803, + "learning_rate": 5.711202470302827e-07, + "loss": 0.2269, + "step": 7385 + }, + { + "epoch": 0.47, + "grad_norm": 4.80939766757664, + "learning_rate": 5.710180216696968e-07, + "loss": 0.3549, + "step": 7386 + }, + { + "epoch": 0.47, + "grad_norm": 6.610968088004343, + "learning_rate": 5.709157932793394e-07, + "loss": 0.0185, + "step": 7387 + }, + { + "epoch": 0.47, + "grad_norm": 0.7329459952435532, + "learning_rate": 5.708135618635717e-07, + "loss": 0.1785, + "step": 7388 + }, + { + "epoch": 0.47, + "grad_norm": 1.8214124310681195, + "learning_rate": 5.70711327426755e-07, + "loss": 0.2332, + "step": 7389 + }, + { + "epoch": 0.47, + "grad_norm": 3.653476522592435, + "learning_rate": 5.706090899732508e-07, + "loss": 0.1851, + "step": 7390 + }, + { + "epoch": 0.47, + "grad_norm": 1.8452045293615673, + "learning_rate": 5.705068495074211e-07, + "loss": 0.1723, + "step": 7391 + }, + { + "epoch": 0.47, + "grad_norm": 0.8629686321621239, + "learning_rate": 5.704046060336275e-07, + "loss": 0.2495, + "step": 7392 + }, + { + "epoch": 0.47, + "grad_norm": 6.380436473286159, + "learning_rate": 5.703023595562318e-07, + "loss": 0.2289, + "step": 7393 + }, + { + "epoch": 0.47, + "grad_norm": 3.4993895908190598, + "learning_rate": 5.702001100795961e-07, + "loss": 0.2069, + "step": 7394 + }, + { + "epoch": 0.47, + "grad_norm": 1.507355922001305, + "learning_rate": 5.700978576080826e-07, + "loss": 0.2583, + "step": 7395 + }, + { + "epoch": 0.47, + "grad_norm": 0.514544843370766, + "learning_rate": 5.699956021460537e-07, + "loss": 0.0457, + "step": 7396 + }, + { + "epoch": 0.47, + "grad_norm": 1.0435372857076322, + "learning_rate": 5.698933436978715e-07, + "loss": 0.4678, + "step": 7397 + }, + { + "epoch": 0.47, + "grad_norm": 1.2240897383019091, + "learning_rate": 5.697910822678988e-07, + "loss": 0.2138, + "step": 7398 + }, + { + "epoch": 0.47, + "grad_norm": 0.75916899659083, + "learning_rate": 5.696888178604982e-07, + "loss": 0.2188, + "step": 7399 + }, + { + "epoch": 0.47, + "grad_norm": 0.7958821348665706, + "learning_rate": 5.695865504800327e-07, + "loss": 0.2863, + "step": 7400 + }, + { + "epoch": 0.47, + "grad_norm": 0.9350666028729014, + "learning_rate": 5.694842801308651e-07, + "loss": 0.1918, + "step": 7401 + }, + { + "epoch": 0.47, + "grad_norm": 1.0548114986016022, + "learning_rate": 5.693820068173583e-07, + "loss": 0.2893, + "step": 7402 + }, + { + "epoch": 0.47, + "grad_norm": 0.5524496773427505, + "learning_rate": 5.692797305438756e-07, + "loss": 0.3367, + "step": 7403 + }, + { + "epoch": 0.47, + "grad_norm": 0.6890498424681148, + "learning_rate": 5.691774513147802e-07, + "loss": 0.2766, + "step": 7404 + }, + { + "epoch": 0.47, + "grad_norm": 1.2570451277405057, + "learning_rate": 5.690751691344359e-07, + "loss": 0.3281, + "step": 7405 + }, + { + "epoch": 0.47, + "grad_norm": 0.7342886843729234, + "learning_rate": 5.689728840072059e-07, + "loss": 0.1861, + "step": 7406 + }, + { + "epoch": 0.47, + "grad_norm": 2.2393823786284757, + "learning_rate": 5.688705959374542e-07, + "loss": 0.1758, + "step": 7407 + }, + { + "epoch": 0.47, + "grad_norm": 0.6368756048863157, + "learning_rate": 5.687683049295441e-07, + "loss": 0.1287, + "step": 7408 + }, + { + "epoch": 0.47, + "grad_norm": 1.3698230996427294, + "learning_rate": 5.686660109878401e-07, + "loss": 0.1459, + "step": 7409 + }, + { + "epoch": 0.47, + "grad_norm": 1.0428440778245631, + "learning_rate": 5.68563714116706e-07, + "loss": 0.1315, + "step": 7410 + }, + { + "epoch": 0.47, + "grad_norm": 0.4462005025358093, + "learning_rate": 5.68461414320506e-07, + "loss": 0.2594, + "step": 7411 + }, + { + "epoch": 0.47, + "grad_norm": 0.4810305048733831, + "learning_rate": 5.683591116036045e-07, + "loss": 0.2054, + "step": 7412 + }, + { + "epoch": 0.47, + "grad_norm": 1.091110397138657, + "learning_rate": 5.682568059703659e-07, + "loss": 0.1844, + "step": 7413 + }, + { + "epoch": 0.47, + "grad_norm": 1.111523344208612, + "learning_rate": 5.681544974251547e-07, + "loss": 0.1186, + "step": 7414 + }, + { + "epoch": 0.47, + "grad_norm": 0.5531265967104229, + "learning_rate": 5.680521859723355e-07, + "loss": 0.201, + "step": 7415 + }, + { + "epoch": 0.47, + "grad_norm": 0.36396224093760593, + "learning_rate": 5.679498716162733e-07, + "loss": 0.1059, + "step": 7416 + }, + { + "epoch": 0.47, + "grad_norm": 9.232194012764126, + "learning_rate": 5.67847554361333e-07, + "loss": 0.2394, + "step": 7417 + }, + { + "epoch": 0.47, + "grad_norm": 0.33948903903857913, + "learning_rate": 5.677452342118797e-07, + "loss": 0.0658, + "step": 7418 + }, + { + "epoch": 0.47, + "grad_norm": 0.9176724911364541, + "learning_rate": 5.676429111722784e-07, + "loss": 0.0953, + "step": 7419 + }, + { + "epoch": 0.47, + "grad_norm": 1.0441066460826038, + "learning_rate": 5.675405852468948e-07, + "loss": 0.1832, + "step": 7420 + }, + { + "epoch": 0.47, + "grad_norm": 0.7920029983208265, + "learning_rate": 5.674382564400938e-07, + "loss": 0.2583, + "step": 7421 + }, + { + "epoch": 0.47, + "grad_norm": 3.048901779663038, + "learning_rate": 5.673359247562412e-07, + "loss": 0.257, + "step": 7422 + }, + { + "epoch": 0.47, + "grad_norm": 0.615553286073353, + "learning_rate": 5.67233590199703e-07, + "loss": 0.0901, + "step": 7423 + }, + { + "epoch": 0.47, + "grad_norm": 0.7062176481074847, + "learning_rate": 5.671312527748444e-07, + "loss": 0.0728, + "step": 7424 + }, + { + "epoch": 0.47, + "grad_norm": 5.232755665082155, + "learning_rate": 5.670289124860317e-07, + "loss": 0.2293, + "step": 7425 + }, + { + "epoch": 0.47, + "grad_norm": 2.405574665183514, + "learning_rate": 5.669265693376309e-07, + "loss": 0.0083, + "step": 7426 + }, + { + "epoch": 0.47, + "grad_norm": 1.1098301197267593, + "learning_rate": 5.66824223334008e-07, + "loss": 0.3068, + "step": 7427 + }, + { + "epoch": 0.47, + "grad_norm": 0.9236546297958498, + "learning_rate": 5.667218744795293e-07, + "loss": 0.2577, + "step": 7428 + }, + { + "epoch": 0.47, + "grad_norm": 0.8847137481375129, + "learning_rate": 5.666195227785615e-07, + "loss": 0.1334, + "step": 7429 + }, + { + "epoch": 0.47, + "grad_norm": 0.5823681045348512, + "learning_rate": 5.665171682354709e-07, + "loss": 0.1977, + "step": 7430 + }, + { + "epoch": 0.47, + "grad_norm": 2.3705110138490335, + "learning_rate": 5.664148108546242e-07, + "loss": 0.0945, + "step": 7431 + }, + { + "epoch": 0.47, + "grad_norm": 1.1590817549719374, + "learning_rate": 5.663124506403881e-07, + "loss": 0.3621, + "step": 7432 + }, + { + "epoch": 0.47, + "grad_norm": 8.905325163363612, + "learning_rate": 5.662100875971297e-07, + "loss": 0.1864, + "step": 7433 + }, + { + "epoch": 0.47, + "grad_norm": 1.0797349867376271, + "learning_rate": 5.661077217292155e-07, + "loss": 0.2406, + "step": 7434 + }, + { + "epoch": 0.47, + "grad_norm": 7.195393183468775, + "learning_rate": 5.660053530410132e-07, + "loss": 0.2057, + "step": 7435 + }, + { + "epoch": 0.47, + "grad_norm": 0.40924590451085185, + "learning_rate": 5.6590298153689e-07, + "loss": 0.2426, + "step": 7436 + }, + { + "epoch": 0.47, + "grad_norm": 0.6132333443663023, + "learning_rate": 5.658006072212132e-07, + "loss": 0.1041, + "step": 7437 + }, + { + "epoch": 0.47, + "grad_norm": 1.378726814524991, + "learning_rate": 5.656982300983499e-07, + "loss": 0.2694, + "step": 7438 + }, + { + "epoch": 0.47, + "grad_norm": 5.546295418977469, + "learning_rate": 5.655958501726682e-07, + "loss": 0.2877, + "step": 7439 + }, + { + "epoch": 0.47, + "grad_norm": 0.9890241239705159, + "learning_rate": 5.654934674485356e-07, + "loss": 0.1202, + "step": 7440 + }, + { + "epoch": 0.47, + "grad_norm": 0.5518453900124864, + "learning_rate": 5.653910819303202e-07, + "loss": 0.1582, + "step": 7441 + }, + { + "epoch": 0.47, + "grad_norm": 0.31820669561369536, + "learning_rate": 5.652886936223896e-07, + "loss": 0.1387, + "step": 7442 + }, + { + "epoch": 0.47, + "grad_norm": 1.194544907467919, + "learning_rate": 5.65186302529112e-07, + "loss": 0.5461, + "step": 7443 + }, + { + "epoch": 0.47, + "grad_norm": 0.3864319810046312, + "learning_rate": 5.650839086548559e-07, + "loss": 0.2458, + "step": 7444 + }, + { + "epoch": 0.47, + "grad_norm": 1.2756607149434038, + "learning_rate": 5.649815120039894e-07, + "loss": 0.0146, + "step": 7445 + }, + { + "epoch": 0.47, + "grad_norm": 1.374367894987499, + "learning_rate": 5.648791125808808e-07, + "loss": 0.1254, + "step": 7446 + }, + { + "epoch": 0.47, + "grad_norm": 0.6856197673962292, + "learning_rate": 5.647767103898989e-07, + "loss": 0.093, + "step": 7447 + }, + { + "epoch": 0.47, + "grad_norm": 0.9346600309424645, + "learning_rate": 5.646743054354123e-07, + "loss": 0.2221, + "step": 7448 + }, + { + "epoch": 0.48, + "grad_norm": 1.312934572514989, + "learning_rate": 5.6457189772179e-07, + "loss": 0.1119, + "step": 7449 + }, + { + "epoch": 0.48, + "grad_norm": 0.8698171801457183, + "learning_rate": 5.644694872534007e-07, + "loss": 0.3173, + "step": 7450 + }, + { + "epoch": 0.48, + "grad_norm": 0.5585476449319329, + "learning_rate": 5.643670740346134e-07, + "loss": 0.3569, + "step": 7451 + }, + { + "epoch": 0.48, + "grad_norm": 0.1521584245809629, + "learning_rate": 5.642646580697973e-07, + "loss": 0.0535, + "step": 7452 + }, + { + "epoch": 0.48, + "grad_norm": 0.824701361709204, + "learning_rate": 5.641622393633217e-07, + "loss": 0.251, + "step": 7453 + }, + { + "epoch": 0.48, + "grad_norm": 0.4205844762570796, + "learning_rate": 5.640598179195561e-07, + "loss": 0.1754, + "step": 7454 + }, + { + "epoch": 0.48, + "grad_norm": 0.8573693617300268, + "learning_rate": 5.639573937428698e-07, + "loss": 0.1083, + "step": 7455 + }, + { + "epoch": 0.48, + "grad_norm": 1.1005043004659185, + "learning_rate": 5.638549668376325e-07, + "loss": 0.1672, + "step": 7456 + }, + { + "epoch": 0.48, + "grad_norm": 0.6114767191442775, + "learning_rate": 5.637525372082139e-07, + "loss": 0.1807, + "step": 7457 + }, + { + "epoch": 0.48, + "grad_norm": 0.8702120397028751, + "learning_rate": 5.63650104858984e-07, + "loss": 0.1615, + "step": 7458 + }, + { + "epoch": 0.48, + "grad_norm": 1.750906079323879, + "learning_rate": 5.635476697943127e-07, + "loss": 0.1844, + "step": 7459 + }, + { + "epoch": 0.48, + "grad_norm": 0.4672155006839803, + "learning_rate": 5.6344523201857e-07, + "loss": 0.0849, + "step": 7460 + }, + { + "epoch": 0.48, + "grad_norm": 0.31078634128353105, + "learning_rate": 5.633427915361261e-07, + "loss": 0.115, + "step": 7461 + }, + { + "epoch": 0.48, + "grad_norm": 6.36883160702436, + "learning_rate": 5.632403483513514e-07, + "loss": 0.0598, + "step": 7462 + }, + { + "epoch": 0.48, + "grad_norm": 0.7199565022113346, + "learning_rate": 5.631379024686163e-07, + "loss": 0.0862, + "step": 7463 + }, + { + "epoch": 0.48, + "grad_norm": 0.9325619587341416, + "learning_rate": 5.630354538922914e-07, + "loss": 0.3025, + "step": 7464 + }, + { + "epoch": 0.48, + "grad_norm": 0.7220012421406382, + "learning_rate": 5.629330026267474e-07, + "loss": 0.1827, + "step": 7465 + }, + { + "epoch": 0.48, + "grad_norm": 0.4894598119503782, + "learning_rate": 5.628305486763551e-07, + "loss": 0.1976, + "step": 7466 + }, + { + "epoch": 0.48, + "grad_norm": 0.740669590038806, + "learning_rate": 5.627280920454851e-07, + "loss": 0.2961, + "step": 7467 + }, + { + "epoch": 0.48, + "grad_norm": 1.1176548194685672, + "learning_rate": 5.626256327385086e-07, + "loss": 0.128, + "step": 7468 + }, + { + "epoch": 0.48, + "grad_norm": 0.31588475411744266, + "learning_rate": 5.625231707597966e-07, + "loss": 0.0828, + "step": 7469 + }, + { + "epoch": 0.48, + "grad_norm": 0.7131728843382178, + "learning_rate": 5.624207061137205e-07, + "loss": 0.158, + "step": 7470 + }, + { + "epoch": 0.48, + "grad_norm": 4.46312935719627, + "learning_rate": 5.623182388046516e-07, + "loss": 0.1385, + "step": 7471 + }, + { + "epoch": 0.48, + "grad_norm": 1.1777465574904347, + "learning_rate": 5.622157688369615e-07, + "loss": 0.2871, + "step": 7472 + }, + { + "epoch": 0.48, + "grad_norm": 2.688964848925245, + "learning_rate": 5.621132962150216e-07, + "loss": 0.0781, + "step": 7473 + }, + { + "epoch": 0.48, + "grad_norm": 0.5968462726565935, + "learning_rate": 5.620108209432036e-07, + "loss": 0.1112, + "step": 7474 + }, + { + "epoch": 0.48, + "grad_norm": 1.3971323826497248, + "learning_rate": 5.619083430258793e-07, + "loss": 0.1836, + "step": 7475 + }, + { + "epoch": 0.48, + "grad_norm": 1.0694918118281551, + "learning_rate": 5.618058624674207e-07, + "loss": 0.1519, + "step": 7476 + }, + { + "epoch": 0.48, + "grad_norm": 1.0335790561850045, + "learning_rate": 5.617033792721997e-07, + "loss": 0.2359, + "step": 7477 + }, + { + "epoch": 0.48, + "grad_norm": 0.4831809868341667, + "learning_rate": 5.616008934445883e-07, + "loss": 0.2011, + "step": 7478 + }, + { + "epoch": 0.48, + "grad_norm": 0.4898245992729707, + "learning_rate": 5.614984049889593e-07, + "loss": 0.1118, + "step": 7479 + }, + { + "epoch": 0.48, + "grad_norm": 4.1437450886365275, + "learning_rate": 5.613959139096845e-07, + "loss": 0.3033, + "step": 7480 + }, + { + "epoch": 0.48, + "grad_norm": 0.7385254767518705, + "learning_rate": 5.612934202111367e-07, + "loss": 0.2655, + "step": 7481 + }, + { + "epoch": 0.48, + "grad_norm": 0.3418980351521246, + "learning_rate": 5.611909238976884e-07, + "loss": 0.2096, + "step": 7482 + }, + { + "epoch": 0.48, + "grad_norm": 0.2870817067320607, + "learning_rate": 5.610884249737121e-07, + "loss": 0.1744, + "step": 7483 + }, + { + "epoch": 0.48, + "grad_norm": 2.0469729269311743, + "learning_rate": 5.60985923443581e-07, + "loss": 0.1054, + "step": 7484 + }, + { + "epoch": 0.48, + "grad_norm": 0.2029980528324711, + "learning_rate": 5.608834193116677e-07, + "loss": 0.1731, + "step": 7485 + }, + { + "epoch": 0.48, + "grad_norm": 0.7293884920226757, + "learning_rate": 5.607809125823453e-07, + "loss": 0.0062, + "step": 7486 + }, + { + "epoch": 0.48, + "grad_norm": 20.011926147180226, + "learning_rate": 5.606784032599869e-07, + "loss": 0.1207, + "step": 7487 + }, + { + "epoch": 0.48, + "grad_norm": 0.6523455597712159, + "learning_rate": 5.60575891348966e-07, + "loss": 0.2639, + "step": 7488 + }, + { + "epoch": 0.48, + "grad_norm": 0.5988232100390637, + "learning_rate": 5.604733768536559e-07, + "loss": 0.2087, + "step": 7489 + }, + { + "epoch": 0.48, + "grad_norm": 1.1281720086035099, + "learning_rate": 5.603708597784298e-07, + "loss": 0.0091, + "step": 7490 + }, + { + "epoch": 0.48, + "grad_norm": 0.5289742693658647, + "learning_rate": 5.602683401276614e-07, + "loss": 0.1171, + "step": 7491 + }, + { + "epoch": 0.48, + "grad_norm": 1.1895943799066078, + "learning_rate": 5.601658179057247e-07, + "loss": 0.1158, + "step": 7492 + }, + { + "epoch": 0.48, + "grad_norm": 0.8388780051991538, + "learning_rate": 5.600632931169932e-07, + "loss": 0.3189, + "step": 7493 + }, + { + "epoch": 0.48, + "grad_norm": 4.686891110765236, + "learning_rate": 5.599607657658408e-07, + "loss": 0.0356, + "step": 7494 + }, + { + "epoch": 0.48, + "grad_norm": 0.6788510211169609, + "learning_rate": 5.598582358566415e-07, + "loss": 0.2257, + "step": 7495 + }, + { + "epoch": 0.48, + "grad_norm": 9.364153192213852, + "learning_rate": 5.597557033937697e-07, + "loss": 0.2356, + "step": 7496 + }, + { + "epoch": 0.48, + "grad_norm": 0.37181014590039935, + "learning_rate": 5.596531683815992e-07, + "loss": 0.1165, + "step": 7497 + }, + { + "epoch": 0.48, + "grad_norm": 0.7648144265397897, + "learning_rate": 5.595506308245049e-07, + "loss": 0.3077, + "step": 7498 + }, + { + "epoch": 0.48, + "grad_norm": 2.4327140745052023, + "learning_rate": 5.594480907268609e-07, + "loss": 0.2935, + "step": 7499 + }, + { + "epoch": 0.48, + "grad_norm": 3.2451480230515064, + "learning_rate": 5.593455480930418e-07, + "loss": 0.3365, + "step": 7500 + }, + { + "epoch": 0.48, + "grad_norm": 5.915083767056461, + "learning_rate": 5.592430029274224e-07, + "loss": 0.0809, + "step": 7501 + }, + { + "epoch": 0.48, + "grad_norm": 0.9222138475209727, + "learning_rate": 5.591404552343774e-07, + "loss": 0.3594, + "step": 7502 + }, + { + "epoch": 0.48, + "grad_norm": 0.32992894311582377, + "learning_rate": 5.590379050182817e-07, + "loss": 0.0914, + "step": 7503 + }, + { + "epoch": 0.48, + "grad_norm": 5.206815532507948, + "learning_rate": 5.589353522835102e-07, + "loss": 0.112, + "step": 7504 + }, + { + "epoch": 0.48, + "grad_norm": 0.8363646515703453, + "learning_rate": 5.58832797034438e-07, + "loss": 0.2132, + "step": 7505 + }, + { + "epoch": 0.48, + "grad_norm": 0.8144334905459782, + "learning_rate": 5.587302392754407e-07, + "loss": 0.3436, + "step": 7506 + }, + { + "epoch": 0.48, + "grad_norm": 0.8848522820728025, + "learning_rate": 5.586276790108931e-07, + "loss": 0.1702, + "step": 7507 + }, + { + "epoch": 0.48, + "grad_norm": 0.25257579505733574, + "learning_rate": 5.585251162451709e-07, + "loss": 0.0853, + "step": 7508 + }, + { + "epoch": 0.48, + "grad_norm": 1.3327463292421773, + "learning_rate": 5.584225509826497e-07, + "loss": 0.0967, + "step": 7509 + }, + { + "epoch": 0.48, + "grad_norm": 1.3346992988258828, + "learning_rate": 5.583199832277049e-07, + "loss": 0.1898, + "step": 7510 + }, + { + "epoch": 0.48, + "grad_norm": 1.4112276192280022, + "learning_rate": 5.582174129847125e-07, + "loss": 0.1245, + "step": 7511 + }, + { + "epoch": 0.48, + "grad_norm": 0.8455868299535237, + "learning_rate": 5.581148402580481e-07, + "loss": 0.0945, + "step": 7512 + }, + { + "epoch": 0.48, + "grad_norm": 0.7961139849131209, + "learning_rate": 5.580122650520879e-07, + "loss": 0.2118, + "step": 7513 + }, + { + "epoch": 0.48, + "grad_norm": 0.9728331830756659, + "learning_rate": 5.579096873712077e-07, + "loss": 0.2882, + "step": 7514 + }, + { + "epoch": 0.48, + "grad_norm": 3.145700284257323, + "learning_rate": 5.578071072197839e-07, + "loss": 0.1685, + "step": 7515 + }, + { + "epoch": 0.48, + "grad_norm": 0.4719600960107473, + "learning_rate": 5.577045246021928e-07, + "loss": 0.1886, + "step": 7516 + }, + { + "epoch": 0.48, + "grad_norm": 0.9118244024819626, + "learning_rate": 5.576019395228106e-07, + "loss": 0.0788, + "step": 7517 + }, + { + "epoch": 0.48, + "grad_norm": 0.9239190246386999, + "learning_rate": 5.574993519860138e-07, + "loss": 0.3041, + "step": 7518 + }, + { + "epoch": 0.48, + "grad_norm": 0.6822214387634311, + "learning_rate": 5.573967619961791e-07, + "loss": 0.1335, + "step": 7519 + }, + { + "epoch": 0.48, + "grad_norm": 1.0513784452022978, + "learning_rate": 5.572941695576834e-07, + "loss": 0.0111, + "step": 7520 + }, + { + "epoch": 0.48, + "grad_norm": 0.7666833761008804, + "learning_rate": 5.57191574674903e-07, + "loss": 0.1138, + "step": 7521 + }, + { + "epoch": 0.48, + "grad_norm": 0.6404449871594167, + "learning_rate": 5.570889773522149e-07, + "loss": 0.1827, + "step": 7522 + }, + { + "epoch": 0.48, + "grad_norm": 0.8274110168994253, + "learning_rate": 5.569863775939965e-07, + "loss": 0.2472, + "step": 7523 + }, + { + "epoch": 0.48, + "grad_norm": 1.0695101580292428, + "learning_rate": 5.568837754046246e-07, + "loss": 0.2303, + "step": 7524 + }, + { + "epoch": 0.48, + "grad_norm": 1.1625372819514135, + "learning_rate": 5.567811707884765e-07, + "loss": 0.2299, + "step": 7525 + }, + { + "epoch": 0.48, + "grad_norm": 2.239994514135691, + "learning_rate": 5.566785637499296e-07, + "loss": 0.3032, + "step": 7526 + }, + { + "epoch": 0.48, + "grad_norm": 1.845062834876523, + "learning_rate": 5.565759542933611e-07, + "loss": 0.2958, + "step": 7527 + }, + { + "epoch": 0.48, + "grad_norm": 0.90402531981841, + "learning_rate": 5.564733424231487e-07, + "loss": 0.4111, + "step": 7528 + }, + { + "epoch": 0.48, + "grad_norm": 2.1355457669657243, + "learning_rate": 5.5637072814367e-07, + "loss": 0.379, + "step": 7529 + }, + { + "epoch": 0.48, + "grad_norm": 1.1256920054797275, + "learning_rate": 5.562681114593028e-07, + "loss": 0.1953, + "step": 7530 + }, + { + "epoch": 0.48, + "grad_norm": 0.9255408113191933, + "learning_rate": 5.561654923744248e-07, + "loss": 0.306, + "step": 7531 + }, + { + "epoch": 0.48, + "grad_norm": 0.6651820663004722, + "learning_rate": 5.560628708934138e-07, + "loss": 0.2565, + "step": 7532 + }, + { + "epoch": 0.48, + "grad_norm": 1.2996596651136072, + "learning_rate": 5.559602470206483e-07, + "loss": 0.4174, + "step": 7533 + }, + { + "epoch": 0.48, + "grad_norm": 0.48131214773171593, + "learning_rate": 5.55857620760506e-07, + "loss": 0.2003, + "step": 7534 + }, + { + "epoch": 0.48, + "grad_norm": 2.0320814499631323, + "learning_rate": 5.557549921173655e-07, + "loss": 0.5093, + "step": 7535 + }, + { + "epoch": 0.48, + "grad_norm": 1.0990309741661553, + "learning_rate": 5.556523610956047e-07, + "loss": 0.2306, + "step": 7536 + }, + { + "epoch": 0.48, + "grad_norm": 1.8260995074990696, + "learning_rate": 5.555497276996024e-07, + "loss": 0.1701, + "step": 7537 + }, + { + "epoch": 0.48, + "grad_norm": 1.1976736039695113, + "learning_rate": 5.554470919337372e-07, + "loss": 0.2351, + "step": 7538 + }, + { + "epoch": 0.48, + "grad_norm": 0.40345372153506504, + "learning_rate": 5.553444538023873e-07, + "loss": 0.0877, + "step": 7539 + }, + { + "epoch": 0.48, + "grad_norm": 0.5977917822238236, + "learning_rate": 5.55241813309932e-07, + "loss": 0.0766, + "step": 7540 + }, + { + "epoch": 0.48, + "grad_norm": 0.7745418848440369, + "learning_rate": 5.551391704607497e-07, + "loss": 0.2889, + "step": 7541 + }, + { + "epoch": 0.48, + "grad_norm": 0.42312695308428366, + "learning_rate": 5.550365252592196e-07, + "loss": 0.1046, + "step": 7542 + }, + { + "epoch": 0.48, + "grad_norm": 0.6641934247142979, + "learning_rate": 5.549338777097208e-07, + "loss": 0.3636, + "step": 7543 + }, + { + "epoch": 0.48, + "grad_norm": 0.952245733703652, + "learning_rate": 5.548312278166322e-07, + "loss": 0.2157, + "step": 7544 + }, + { + "epoch": 0.48, + "grad_norm": 0.7019529118628163, + "learning_rate": 5.547285755843334e-07, + "loss": 0.2253, + "step": 7545 + }, + { + "epoch": 0.48, + "grad_norm": 0.9408301507018305, + "learning_rate": 5.546259210172034e-07, + "loss": 0.2198, + "step": 7546 + }, + { + "epoch": 0.48, + "grad_norm": 1.9697559515545209, + "learning_rate": 5.54523264119622e-07, + "loss": 0.1452, + "step": 7547 + }, + { + "epoch": 0.48, + "grad_norm": 4.314768569281151, + "learning_rate": 5.544206048959682e-07, + "loss": 0.2431, + "step": 7548 + }, + { + "epoch": 0.48, + "grad_norm": 0.7991822471163085, + "learning_rate": 5.543179433506222e-07, + "loss": 0.0965, + "step": 7549 + }, + { + "epoch": 0.48, + "grad_norm": 0.7061729152233741, + "learning_rate": 5.542152794879636e-07, + "loss": 0.2957, + "step": 7550 + }, + { + "epoch": 0.48, + "grad_norm": 0.5756950985548269, + "learning_rate": 5.54112613312372e-07, + "loss": 0.256, + "step": 7551 + }, + { + "epoch": 0.48, + "grad_norm": 0.2867756764185683, + "learning_rate": 5.540099448282276e-07, + "loss": 0.0989, + "step": 7552 + }, + { + "epoch": 0.48, + "grad_norm": 0.6086105040269305, + "learning_rate": 5.539072740399104e-07, + "loss": 0.112, + "step": 7553 + }, + { + "epoch": 0.48, + "grad_norm": 1.138626460754883, + "learning_rate": 5.538046009518006e-07, + "loss": 0.059, + "step": 7554 + }, + { + "epoch": 0.48, + "grad_norm": 0.6693955455920618, + "learning_rate": 5.537019255682783e-07, + "loss": 0.3693, + "step": 7555 + }, + { + "epoch": 0.48, + "grad_norm": 6.216398338530876, + "learning_rate": 5.535992478937239e-07, + "loss": 0.2422, + "step": 7556 + }, + { + "epoch": 0.48, + "grad_norm": 0.6205476139449395, + "learning_rate": 5.534965679325179e-07, + "loss": 0.1238, + "step": 7557 + }, + { + "epoch": 0.48, + "grad_norm": 0.2505535639024559, + "learning_rate": 5.533938856890407e-07, + "loss": 0.0778, + "step": 7558 + }, + { + "epoch": 0.48, + "grad_norm": 0.7440686785079905, + "learning_rate": 5.532912011676729e-07, + "loss": 0.0899, + "step": 7559 + }, + { + "epoch": 0.48, + "grad_norm": 5.782080365904402, + "learning_rate": 5.531885143727955e-07, + "loss": 0.09, + "step": 7560 + }, + { + "epoch": 0.48, + "grad_norm": 1.9205299052777522, + "learning_rate": 5.530858253087891e-07, + "loss": 0.1384, + "step": 7561 + }, + { + "epoch": 0.48, + "grad_norm": 2.0070649464553347, + "learning_rate": 5.529831339800348e-07, + "loss": 0.2301, + "step": 7562 + }, + { + "epoch": 0.48, + "grad_norm": 4.642698626302945, + "learning_rate": 5.528804403909133e-07, + "loss": 0.1444, + "step": 7563 + }, + { + "epoch": 0.48, + "grad_norm": 3.5385271444594335, + "learning_rate": 5.52777744545806e-07, + "loss": 0.1689, + "step": 7564 + }, + { + "epoch": 0.48, + "grad_norm": 0.6910624278217642, + "learning_rate": 5.52675046449094e-07, + "loss": 0.1171, + "step": 7565 + }, + { + "epoch": 0.48, + "grad_norm": 0.5188507620744114, + "learning_rate": 5.525723461051587e-07, + "loss": 0.118, + "step": 7566 + }, + { + "epoch": 0.48, + "grad_norm": 0.7215030641675393, + "learning_rate": 5.524696435183812e-07, + "loss": 0.1521, + "step": 7567 + }, + { + "epoch": 0.48, + "grad_norm": 0.27550035434511455, + "learning_rate": 5.523669386931433e-07, + "loss": 0.0337, + "step": 7568 + }, + { + "epoch": 0.48, + "grad_norm": 0.45155692176302054, + "learning_rate": 5.522642316338268e-07, + "loss": 0.0892, + "step": 7569 + }, + { + "epoch": 0.48, + "grad_norm": 0.5821565862132517, + "learning_rate": 5.521615223448129e-07, + "loss": 0.2217, + "step": 7570 + }, + { + "epoch": 0.48, + "grad_norm": 4.732416575891742, + "learning_rate": 5.520588108304836e-07, + "loss": 0.2106, + "step": 7571 + }, + { + "epoch": 0.48, + "grad_norm": 2.594161838262372, + "learning_rate": 5.519560970952207e-07, + "loss": 0.0865, + "step": 7572 + }, + { + "epoch": 0.48, + "grad_norm": 0.4948336791965397, + "learning_rate": 5.518533811434064e-07, + "loss": 0.2018, + "step": 7573 + }, + { + "epoch": 0.48, + "grad_norm": 1.0229297133671198, + "learning_rate": 5.517506629794226e-07, + "loss": 0.2161, + "step": 7574 + }, + { + "epoch": 0.48, + "grad_norm": 0.6807350704565484, + "learning_rate": 5.516479426076515e-07, + "loss": 0.1555, + "step": 7575 + }, + { + "epoch": 0.48, + "grad_norm": 5.0254761901459135, + "learning_rate": 5.515452200324753e-07, + "loss": 0.1612, + "step": 7576 + }, + { + "epoch": 0.48, + "grad_norm": 13.57679777406315, + "learning_rate": 5.514424952582765e-07, + "loss": 0.2426, + "step": 7577 + }, + { + "epoch": 0.48, + "grad_norm": 0.8858005241840776, + "learning_rate": 5.513397682894373e-07, + "loss": 0.2008, + "step": 7578 + }, + { + "epoch": 0.48, + "grad_norm": 0.6883412880363127, + "learning_rate": 5.512370391303404e-07, + "loss": 0.2956, + "step": 7579 + }, + { + "epoch": 0.48, + "grad_norm": 1.642081543611526, + "learning_rate": 5.511343077853684e-07, + "loss": 0.1615, + "step": 7580 + }, + { + "epoch": 0.48, + "grad_norm": 0.7327974006760869, + "learning_rate": 5.510315742589042e-07, + "loss": 0.3597, + "step": 7581 + }, + { + "epoch": 0.48, + "grad_norm": 0.5961600903985111, + "learning_rate": 5.509288385553302e-07, + "loss": 0.1655, + "step": 7582 + }, + { + "epoch": 0.48, + "grad_norm": 0.5370388498415422, + "learning_rate": 5.508261006790298e-07, + "loss": 0.3869, + "step": 7583 + }, + { + "epoch": 0.48, + "grad_norm": 1.8474899756471022, + "learning_rate": 5.507233606343857e-07, + "loss": 0.3243, + "step": 7584 + }, + { + "epoch": 0.48, + "grad_norm": 1.2173954578916704, + "learning_rate": 5.506206184257809e-07, + "loss": 0.3129, + "step": 7585 + }, + { + "epoch": 0.48, + "grad_norm": 0.5548149607465687, + "learning_rate": 5.505178740575989e-07, + "loss": 0.2358, + "step": 7586 + }, + { + "epoch": 0.48, + "grad_norm": 2.2944959997760943, + "learning_rate": 5.504151275342228e-07, + "loss": 0.1349, + "step": 7587 + }, + { + "epoch": 0.48, + "grad_norm": 24.841566155442354, + "learning_rate": 5.503123788600361e-07, + "loss": 0.0623, + "step": 7588 + }, + { + "epoch": 0.48, + "grad_norm": 0.8311751932502293, + "learning_rate": 5.502096280394222e-07, + "loss": 0.1832, + "step": 7589 + }, + { + "epoch": 0.48, + "grad_norm": 0.7656308631754906, + "learning_rate": 5.501068750767646e-07, + "loss": 0.2056, + "step": 7590 + }, + { + "epoch": 0.48, + "grad_norm": 0.8605056054119362, + "learning_rate": 5.500041199764469e-07, + "loss": 0.1156, + "step": 7591 + }, + { + "epoch": 0.48, + "grad_norm": 0.5339431055225383, + "learning_rate": 5.49901362742853e-07, + "loss": 0.0966, + "step": 7592 + }, + { + "epoch": 0.48, + "grad_norm": 0.7953356023979216, + "learning_rate": 5.497986033803664e-07, + "loss": 0.2662, + "step": 7593 + }, + { + "epoch": 0.48, + "grad_norm": 0.5622668629224475, + "learning_rate": 5.496958418933715e-07, + "loss": 0.0963, + "step": 7594 + }, + { + "epoch": 0.48, + "grad_norm": 0.6164005777732087, + "learning_rate": 5.495930782862521e-07, + "loss": 0.3443, + "step": 7595 + }, + { + "epoch": 0.48, + "grad_norm": 0.9408372696437483, + "learning_rate": 5.494903125633923e-07, + "loss": 0.2415, + "step": 7596 + }, + { + "epoch": 0.48, + "grad_norm": 0.6880557385671987, + "learning_rate": 5.493875447291762e-07, + "loss": 0.1342, + "step": 7597 + }, + { + "epoch": 0.48, + "grad_norm": 0.44870256035805056, + "learning_rate": 5.492847747879882e-07, + "loss": 0.1061, + "step": 7598 + }, + { + "epoch": 0.48, + "grad_norm": 0.8199473668297109, + "learning_rate": 5.491820027442126e-07, + "loss": 0.3366, + "step": 7599 + }, + { + "epoch": 0.48, + "grad_norm": 0.7325136084105575, + "learning_rate": 5.490792286022339e-07, + "loss": 0.2507, + "step": 7600 + }, + { + "epoch": 0.48, + "grad_norm": 0.728868102794531, + "learning_rate": 5.489764523664366e-07, + "loss": 0.2105, + "step": 7601 + }, + { + "epoch": 0.48, + "grad_norm": 3.3552811345763844, + "learning_rate": 5.488736740412056e-07, + "loss": 0.3901, + "step": 7602 + }, + { + "epoch": 0.48, + "grad_norm": 0.9693907483553831, + "learning_rate": 5.487708936309252e-07, + "loss": 0.2329, + "step": 7603 + }, + { + "epoch": 0.48, + "grad_norm": 0.6543900507727194, + "learning_rate": 5.486681111399804e-07, + "loss": 0.3631, + "step": 7604 + }, + { + "epoch": 0.48, + "grad_norm": 0.3797306778892442, + "learning_rate": 5.485653265727563e-07, + "loss": 0.0862, + "step": 7605 + }, + { + "epoch": 0.49, + "grad_norm": 1.347410830998166, + "learning_rate": 5.484625399336378e-07, + "loss": 0.1362, + "step": 7606 + }, + { + "epoch": 0.49, + "grad_norm": 1.1322417870599837, + "learning_rate": 5.483597512270097e-07, + "loss": 0.3191, + "step": 7607 + }, + { + "epoch": 0.49, + "grad_norm": 2.3830461419814357, + "learning_rate": 5.482569604572576e-07, + "loss": 0.1132, + "step": 7608 + }, + { + "epoch": 0.49, + "grad_norm": 0.4995163021122291, + "learning_rate": 5.481541676287664e-07, + "loss": 0.3372, + "step": 7609 + }, + { + "epoch": 0.49, + "grad_norm": 0.6449794252728354, + "learning_rate": 5.480513727459218e-07, + "loss": 0.2552, + "step": 7610 + }, + { + "epoch": 0.49, + "grad_norm": 0.3200613387108409, + "learning_rate": 5.479485758131089e-07, + "loss": 0.1286, + "step": 7611 + }, + { + "epoch": 0.49, + "grad_norm": 0.4558450421203875, + "learning_rate": 5.478457768347132e-07, + "loss": 0.1366, + "step": 7612 + }, + { + "epoch": 0.49, + "grad_norm": 8.501176775713478, + "learning_rate": 5.477429758151208e-07, + "loss": 0.2014, + "step": 7613 + }, + { + "epoch": 0.49, + "grad_norm": 1.168999490477346, + "learning_rate": 5.476401727587168e-07, + "loss": 0.2244, + "step": 7614 + }, + { + "epoch": 0.49, + "grad_norm": 0.5360430642494545, + "learning_rate": 5.475373676698874e-07, + "loss": 0.2527, + "step": 7615 + }, + { + "epoch": 0.49, + "grad_norm": 1.3431825819123024, + "learning_rate": 5.474345605530185e-07, + "loss": 0.2389, + "step": 7616 + }, + { + "epoch": 0.49, + "grad_norm": 0.755240154250283, + "learning_rate": 5.473317514124957e-07, + "loss": 0.2358, + "step": 7617 + }, + { + "epoch": 0.49, + "grad_norm": 0.5671909924975638, + "learning_rate": 5.472289402527053e-07, + "loss": 0.2044, + "step": 7618 + }, + { + "epoch": 0.49, + "grad_norm": 1.0461198665266376, + "learning_rate": 5.471261270780333e-07, + "loss": 0.3341, + "step": 7619 + }, + { + "epoch": 0.49, + "grad_norm": 2.7318666519389283, + "learning_rate": 5.470233118928659e-07, + "loss": 0.098, + "step": 7620 + }, + { + "epoch": 0.49, + "grad_norm": 6.196377638093798, + "learning_rate": 5.469204947015897e-07, + "loss": 0.1907, + "step": 7621 + }, + { + "epoch": 0.49, + "grad_norm": 8.052346199280715, + "learning_rate": 5.468176755085907e-07, + "loss": 0.3102, + "step": 7622 + }, + { + "epoch": 0.49, + "grad_norm": 0.4156714251904758, + "learning_rate": 5.467148543182556e-07, + "loss": 0.0306, + "step": 7623 + }, + { + "epoch": 0.49, + "grad_norm": 1.2150847494755244, + "learning_rate": 5.466120311349709e-07, + "loss": 0.0505, + "step": 7624 + }, + { + "epoch": 0.49, + "grad_norm": 0.09483021303767877, + "learning_rate": 5.465092059631234e-07, + "loss": 0.0652, + "step": 7625 + }, + { + "epoch": 0.49, + "grad_norm": 0.5774180036422483, + "learning_rate": 5.464063788070995e-07, + "loss": 0.0294, + "step": 7626 + }, + { + "epoch": 0.49, + "grad_norm": 1.1717246689561696, + "learning_rate": 5.463035496712862e-07, + "loss": 0.2189, + "step": 7627 + }, + { + "epoch": 0.49, + "grad_norm": 0.15920007005961984, + "learning_rate": 5.462007185600705e-07, + "loss": 0.0866, + "step": 7628 + }, + { + "epoch": 0.49, + "grad_norm": 5.562923039175392, + "learning_rate": 5.460978854778392e-07, + "loss": 0.1522, + "step": 7629 + }, + { + "epoch": 0.49, + "grad_norm": 3.784044452622361, + "learning_rate": 5.459950504289794e-07, + "loss": 0.1205, + "step": 7630 + }, + { + "epoch": 0.49, + "grad_norm": 0.5308654101621021, + "learning_rate": 5.458922134178784e-07, + "loss": 0.0208, + "step": 7631 + }, + { + "epoch": 0.49, + "grad_norm": 7.900879184419944, + "learning_rate": 5.457893744489233e-07, + "loss": 0.1425, + "step": 7632 + }, + { + "epoch": 0.49, + "grad_norm": 0.22269806232356187, + "learning_rate": 5.456865335265013e-07, + "loss": 0.007, + "step": 7633 + }, + { + "epoch": 0.49, + "grad_norm": 0.9320836456837528, + "learning_rate": 5.45583690655e-07, + "loss": 0.2754, + "step": 7634 + }, + { + "epoch": 0.49, + "grad_norm": 0.8461662886483006, + "learning_rate": 5.454808458388069e-07, + "loss": 0.1706, + "step": 7635 + }, + { + "epoch": 0.49, + "grad_norm": 0.7396967270574977, + "learning_rate": 5.453779990823094e-07, + "loss": 0.1853, + "step": 7636 + }, + { + "epoch": 0.49, + "grad_norm": 1.1345758146153235, + "learning_rate": 5.452751503898953e-07, + "loss": 0.2223, + "step": 7637 + }, + { + "epoch": 0.49, + "grad_norm": 0.09067743686668761, + "learning_rate": 5.451722997659522e-07, + "loss": 0.0006, + "step": 7638 + }, + { + "epoch": 0.49, + "grad_norm": 0.4105353757140416, + "learning_rate": 5.450694472148679e-07, + "loss": 0.3065, + "step": 7639 + }, + { + "epoch": 0.49, + "grad_norm": 3.2832544663749155, + "learning_rate": 5.449665927410305e-07, + "loss": 0.0066, + "step": 7640 + }, + { + "epoch": 0.49, + "grad_norm": 0.6266049999684713, + "learning_rate": 5.44863736348828e-07, + "loss": 0.1949, + "step": 7641 + }, + { + "epoch": 0.49, + "grad_norm": 1.7392414920433288, + "learning_rate": 5.447608780426481e-07, + "loss": 0.0945, + "step": 7642 + }, + { + "epoch": 0.49, + "grad_norm": 1.4139969306125162, + "learning_rate": 5.446580178268794e-07, + "loss": 0.1347, + "step": 7643 + }, + { + "epoch": 0.49, + "grad_norm": 5.381408684843561, + "learning_rate": 5.445551557059097e-07, + "loss": 0.1924, + "step": 7644 + }, + { + "epoch": 0.49, + "grad_norm": 0.5822479549167303, + "learning_rate": 5.444522916841275e-07, + "loss": 0.4079, + "step": 7645 + }, + { + "epoch": 0.49, + "grad_norm": 5.772179732324928, + "learning_rate": 5.443494257659211e-07, + "loss": 0.1675, + "step": 7646 + }, + { + "epoch": 0.49, + "grad_norm": 0.6034245235777258, + "learning_rate": 5.442465579556792e-07, + "loss": 0.1451, + "step": 7647 + }, + { + "epoch": 0.49, + "grad_norm": 1.0490176186344715, + "learning_rate": 5.441436882577901e-07, + "loss": 0.4069, + "step": 7648 + }, + { + "epoch": 0.49, + "grad_norm": 4.994696639605889, + "learning_rate": 5.440408166766426e-07, + "loss": 0.2387, + "step": 7649 + }, + { + "epoch": 0.49, + "grad_norm": 0.6424124901515734, + "learning_rate": 5.439379432166254e-07, + "loss": 0.3026, + "step": 7650 + }, + { + "epoch": 0.49, + "grad_norm": 12.852007642684793, + "learning_rate": 5.438350678821269e-07, + "loss": 0.2627, + "step": 7651 + }, + { + "epoch": 0.49, + "grad_norm": 2.395700268164772, + "learning_rate": 5.437321906775366e-07, + "loss": 0.2675, + "step": 7652 + }, + { + "epoch": 0.49, + "grad_norm": 0.27957112006074175, + "learning_rate": 5.43629311607243e-07, + "loss": 0.1985, + "step": 7653 + }, + { + "epoch": 0.49, + "grad_norm": 0.7109060020565691, + "learning_rate": 5.435264306756353e-07, + "loss": 0.2381, + "step": 7654 + }, + { + "epoch": 0.49, + "grad_norm": 0.7992833436780332, + "learning_rate": 5.434235478871025e-07, + "loss": 0.267, + "step": 7655 + }, + { + "epoch": 0.49, + "grad_norm": 0.8420379115632857, + "learning_rate": 5.433206632460339e-07, + "loss": 0.1712, + "step": 7656 + }, + { + "epoch": 0.49, + "grad_norm": 1.0141722606768333, + "learning_rate": 5.432177767568188e-07, + "loss": 0.2647, + "step": 7657 + }, + { + "epoch": 0.49, + "grad_norm": 0.9163221936286211, + "learning_rate": 5.431148884238463e-07, + "loss": 0.2378, + "step": 7658 + }, + { + "epoch": 0.49, + "grad_norm": 1.11536793358381, + "learning_rate": 5.430119982515061e-07, + "loss": 0.3102, + "step": 7659 + }, + { + "epoch": 0.49, + "grad_norm": 0.6877843730171554, + "learning_rate": 5.429091062441877e-07, + "loss": 0.1391, + "step": 7660 + }, + { + "epoch": 0.49, + "grad_norm": 0.8357158283203828, + "learning_rate": 5.428062124062803e-07, + "loss": 0.1612, + "step": 7661 + }, + { + "epoch": 0.49, + "grad_norm": 0.3087892038315234, + "learning_rate": 5.427033167421739e-07, + "loss": 0.1355, + "step": 7662 + }, + { + "epoch": 0.49, + "grad_norm": 0.6497803644720554, + "learning_rate": 5.426004192562583e-07, + "loss": 0.1091, + "step": 7663 + }, + { + "epoch": 0.49, + "grad_norm": 1.1518661309368448, + "learning_rate": 5.424975199529231e-07, + "loss": 0.0588, + "step": 7664 + }, + { + "epoch": 0.49, + "grad_norm": 0.7817363220829404, + "learning_rate": 5.423946188365583e-07, + "loss": 0.2561, + "step": 7665 + }, + { + "epoch": 0.49, + "grad_norm": 5.0568310816484825, + "learning_rate": 5.422917159115538e-07, + "loss": 0.0106, + "step": 7666 + }, + { + "epoch": 0.49, + "grad_norm": 1.0509192402800769, + "learning_rate": 5.421888111822996e-07, + "loss": 0.1834, + "step": 7667 + }, + { + "epoch": 0.49, + "grad_norm": 2.638077809428933, + "learning_rate": 5.42085904653186e-07, + "loss": 0.0882, + "step": 7668 + }, + { + "epoch": 0.49, + "grad_norm": 1.1955041141489864, + "learning_rate": 5.419829963286032e-07, + "loss": 0.0549, + "step": 7669 + }, + { + "epoch": 0.49, + "grad_norm": 0.5011330821533714, + "learning_rate": 5.418800862129411e-07, + "loss": 0.129, + "step": 7670 + }, + { + "epoch": 0.49, + "grad_norm": 0.6610469479758729, + "learning_rate": 5.417771743105907e-07, + "loss": 0.2157, + "step": 7671 + }, + { + "epoch": 0.49, + "grad_norm": 0.32319978083462303, + "learning_rate": 5.416742606259418e-07, + "loss": 0.0035, + "step": 7672 + }, + { + "epoch": 0.49, + "grad_norm": 0.25474548508449196, + "learning_rate": 5.415713451633852e-07, + "loss": 0.1364, + "step": 7673 + }, + { + "epoch": 0.49, + "grad_norm": 0.5000377958750638, + "learning_rate": 5.414684279273115e-07, + "loss": 0.1953, + "step": 7674 + }, + { + "epoch": 0.49, + "grad_norm": 0.9522464291738733, + "learning_rate": 5.413655089221113e-07, + "loss": 0.3496, + "step": 7675 + }, + { + "epoch": 0.49, + "grad_norm": 0.3830960507033039, + "learning_rate": 5.412625881521753e-07, + "loss": 0.1597, + "step": 7676 + }, + { + "epoch": 0.49, + "grad_norm": 4.8450301876350155, + "learning_rate": 5.411596656218945e-07, + "loss": 0.1851, + "step": 7677 + }, + { + "epoch": 0.49, + "grad_norm": 0.6179371119718056, + "learning_rate": 5.410567413356593e-07, + "loss": 0.1354, + "step": 7678 + }, + { + "epoch": 0.49, + "grad_norm": 0.8340167567328702, + "learning_rate": 5.409538152978612e-07, + "loss": 0.2141, + "step": 7679 + }, + { + "epoch": 0.49, + "grad_norm": 2.8976651429360185, + "learning_rate": 5.40850887512891e-07, + "loss": 0.2676, + "step": 7680 + }, + { + "epoch": 0.49, + "grad_norm": 1.1350004580546227, + "learning_rate": 5.407479579851398e-07, + "loss": 0.3251, + "step": 7681 + }, + { + "epoch": 0.49, + "grad_norm": 0.8125019697000947, + "learning_rate": 5.406450267189989e-07, + "loss": 0.1211, + "step": 7682 + }, + { + "epoch": 0.49, + "grad_norm": 0.87036871741752, + "learning_rate": 5.405420937188591e-07, + "loss": 0.0286, + "step": 7683 + }, + { + "epoch": 0.49, + "grad_norm": 1.2302845580684012, + "learning_rate": 5.404391589891125e-07, + "loss": 0.3608, + "step": 7684 + }, + { + "epoch": 0.49, + "grad_norm": 2.849230930705265, + "learning_rate": 5.403362225341499e-07, + "loss": 0.205, + "step": 7685 + }, + { + "epoch": 0.49, + "grad_norm": 1.8302906844553808, + "learning_rate": 5.40233284358363e-07, + "loss": 0.221, + "step": 7686 + }, + { + "epoch": 0.49, + "grad_norm": 0.663861848975396, + "learning_rate": 5.401303444661433e-07, + "loss": 0.2397, + "step": 7687 + }, + { + "epoch": 0.49, + "grad_norm": 1.9918275103532312, + "learning_rate": 5.400274028618824e-07, + "loss": 0.1558, + "step": 7688 + }, + { + "epoch": 0.49, + "grad_norm": 0.7139400232361313, + "learning_rate": 5.39924459549972e-07, + "loss": 0.3042, + "step": 7689 + }, + { + "epoch": 0.49, + "grad_norm": 1.6811553655827545, + "learning_rate": 5.398215145348039e-07, + "loss": 0.2626, + "step": 7690 + }, + { + "epoch": 0.49, + "grad_norm": 0.6472167918658924, + "learning_rate": 5.3971856782077e-07, + "loss": 0.2076, + "step": 7691 + }, + { + "epoch": 0.49, + "grad_norm": 0.7166800549540868, + "learning_rate": 5.396156194122621e-07, + "loss": 0.2499, + "step": 7692 + }, + { + "epoch": 0.49, + "grad_norm": 0.7760320156479528, + "learning_rate": 5.395126693136723e-07, + "loss": 0.4403, + "step": 7693 + }, + { + "epoch": 0.49, + "grad_norm": 0.44126949815312816, + "learning_rate": 5.394097175293926e-07, + "loss": 0.2611, + "step": 7694 + }, + { + "epoch": 0.49, + "grad_norm": 1.2981542533191037, + "learning_rate": 5.393067640638151e-07, + "loss": 0.1411, + "step": 7695 + }, + { + "epoch": 0.49, + "grad_norm": 3.3773437959150394, + "learning_rate": 5.39203808921332e-07, + "loss": 0.2844, + "step": 7696 + }, + { + "epoch": 0.49, + "grad_norm": 1.9578332391244648, + "learning_rate": 5.391008521063356e-07, + "loss": 0.3256, + "step": 7697 + }, + { + "epoch": 0.49, + "grad_norm": 0.6184362630267441, + "learning_rate": 5.389978936232185e-07, + "loss": 0.0678, + "step": 7698 + }, + { + "epoch": 0.49, + "grad_norm": 5.015152660420145, + "learning_rate": 5.388949334763724e-07, + "loss": 0.3061, + "step": 7699 + }, + { + "epoch": 0.49, + "grad_norm": 0.14821951159348457, + "learning_rate": 5.387919716701905e-07, + "loss": 0.077, + "step": 7700 + }, + { + "epoch": 0.49, + "grad_norm": 0.8196056775202845, + "learning_rate": 5.386890082090652e-07, + "loss": 0.0755, + "step": 7701 + }, + { + "epoch": 0.49, + "grad_norm": 0.27443719106242326, + "learning_rate": 5.385860430973889e-07, + "loss": 0.168, + "step": 7702 + }, + { + "epoch": 0.49, + "grad_norm": 0.39796304383492087, + "learning_rate": 5.384830763395544e-07, + "loss": 0.2335, + "step": 7703 + }, + { + "epoch": 0.49, + "grad_norm": 0.2585751688268829, + "learning_rate": 5.383801079399546e-07, + "loss": 0.1074, + "step": 7704 + }, + { + "epoch": 0.49, + "grad_norm": 0.2769851886238829, + "learning_rate": 5.382771379029822e-07, + "loss": 0.1108, + "step": 7705 + }, + { + "epoch": 0.49, + "grad_norm": 0.638441502383699, + "learning_rate": 5.381741662330302e-07, + "loss": 0.1724, + "step": 7706 + }, + { + "epoch": 0.49, + "grad_norm": 1.681914166537708, + "learning_rate": 5.380711929344914e-07, + "loss": 0.2414, + "step": 7707 + }, + { + "epoch": 0.49, + "grad_norm": 0.42036962401488187, + "learning_rate": 5.37968218011759e-07, + "loss": 0.1828, + "step": 7708 + }, + { + "epoch": 0.49, + "grad_norm": 0.6681332202550524, + "learning_rate": 5.378652414692262e-07, + "loss": 0.1142, + "step": 7709 + }, + { + "epoch": 0.49, + "grad_norm": 0.6668070210396156, + "learning_rate": 5.37762263311286e-07, + "loss": 0.2297, + "step": 7710 + }, + { + "epoch": 0.49, + "grad_norm": 0.17273657349610713, + "learning_rate": 5.376592835423319e-07, + "loss": 0.0043, + "step": 7711 + }, + { + "epoch": 0.49, + "grad_norm": 0.6394614784215694, + "learning_rate": 5.37556302166757e-07, + "loss": 0.2194, + "step": 7712 + }, + { + "epoch": 0.49, + "grad_norm": 0.9990973771703578, + "learning_rate": 5.374533191889546e-07, + "loss": 0.3003, + "step": 7713 + }, + { + "epoch": 0.49, + "grad_norm": 2.852651630229182, + "learning_rate": 5.373503346133183e-07, + "loss": 0.0856, + "step": 7714 + }, + { + "epoch": 0.49, + "grad_norm": 0.7313290835938698, + "learning_rate": 5.372473484442418e-07, + "loss": 0.0975, + "step": 7715 + }, + { + "epoch": 0.49, + "grad_norm": 0.7619842201842432, + "learning_rate": 5.371443606861186e-07, + "loss": 0.1442, + "step": 7716 + }, + { + "epoch": 0.49, + "grad_norm": 0.48601837270091935, + "learning_rate": 5.370413713433419e-07, + "loss": 0.2027, + "step": 7717 + }, + { + "epoch": 0.49, + "grad_norm": 0.9943991060640079, + "learning_rate": 5.369383804203062e-07, + "loss": 0.1661, + "step": 7718 + }, + { + "epoch": 0.49, + "grad_norm": 0.7764327557555127, + "learning_rate": 5.368353879214048e-07, + "loss": 0.1986, + "step": 7719 + }, + { + "epoch": 0.49, + "grad_norm": 0.9571235265037613, + "learning_rate": 5.367323938510319e-07, + "loss": 0.0151, + "step": 7720 + }, + { + "epoch": 0.49, + "grad_norm": 0.1331415179899828, + "learning_rate": 5.36629398213581e-07, + "loss": 0.0049, + "step": 7721 + }, + { + "epoch": 0.49, + "grad_norm": 0.9564062075025476, + "learning_rate": 5.365264010134465e-07, + "loss": 0.2393, + "step": 7722 + }, + { + "epoch": 0.49, + "grad_norm": 3.9988487704387783, + "learning_rate": 5.364234022550222e-07, + "loss": 0.0996, + "step": 7723 + }, + { + "epoch": 0.49, + "grad_norm": 0.6578332170961195, + "learning_rate": 5.363204019427023e-07, + "loss": 0.4422, + "step": 7724 + }, + { + "epoch": 0.49, + "grad_norm": 1.0935508995397545, + "learning_rate": 5.362174000808812e-07, + "loss": 0.3207, + "step": 7725 + }, + { + "epoch": 0.49, + "grad_norm": 0.6625098167845284, + "learning_rate": 5.361143966739528e-07, + "loss": 0.2386, + "step": 7726 + }, + { + "epoch": 0.49, + "grad_norm": 0.6384002449405118, + "learning_rate": 5.360113917263119e-07, + "loss": 0.2626, + "step": 7727 + }, + { + "epoch": 0.49, + "grad_norm": 0.86185255036157, + "learning_rate": 5.359083852423525e-07, + "loss": 0.5264, + "step": 7728 + }, + { + "epoch": 0.49, + "grad_norm": 0.5290037514723575, + "learning_rate": 5.358053772264691e-07, + "loss": 0.1684, + "step": 7729 + }, + { + "epoch": 0.49, + "grad_norm": 0.22971659840906652, + "learning_rate": 5.357023676830565e-07, + "loss": 0.0068, + "step": 7730 + }, + { + "epoch": 0.49, + "grad_norm": 1.0084694477299414, + "learning_rate": 5.35599356616509e-07, + "loss": 0.3455, + "step": 7731 + }, + { + "epoch": 0.49, + "grad_norm": 1.5719038292544176, + "learning_rate": 5.354963440312215e-07, + "loss": 0.1823, + "step": 7732 + }, + { + "epoch": 0.49, + "grad_norm": 0.8107698385400323, + "learning_rate": 5.353933299315885e-07, + "loss": 0.0623, + "step": 7733 + }, + { + "epoch": 0.49, + "grad_norm": 1.5501289802182807, + "learning_rate": 5.35290314322005e-07, + "loss": 0.3329, + "step": 7734 + }, + { + "epoch": 0.49, + "grad_norm": 1.3193147033692225, + "learning_rate": 5.351872972068656e-07, + "loss": 0.214, + "step": 7735 + }, + { + "epoch": 0.49, + "grad_norm": 0.8932772123066981, + "learning_rate": 5.350842785905654e-07, + "loss": 0.5223, + "step": 7736 + }, + { + "epoch": 0.49, + "grad_norm": 1.3815366076253965, + "learning_rate": 5.349812584774994e-07, + "loss": 0.1413, + "step": 7737 + }, + { + "epoch": 0.49, + "grad_norm": 1.0877090864882917, + "learning_rate": 5.348782368720625e-07, + "loss": 0.3399, + "step": 7738 + }, + { + "epoch": 0.49, + "grad_norm": 0.649411908566595, + "learning_rate": 5.347752137786501e-07, + "loss": 0.0373, + "step": 7739 + }, + { + "epoch": 0.49, + "grad_norm": 1.168303644676484, + "learning_rate": 5.346721892016571e-07, + "loss": 0.2095, + "step": 7740 + }, + { + "epoch": 0.49, + "grad_norm": 1.6210928324996892, + "learning_rate": 5.345691631454788e-07, + "loss": 0.1635, + "step": 7741 + }, + { + "epoch": 0.49, + "grad_norm": 1.4875449275210424, + "learning_rate": 5.344661356145104e-07, + "loss": 0.1373, + "step": 7742 + }, + { + "epoch": 0.49, + "grad_norm": 0.35111158500776496, + "learning_rate": 5.343631066131476e-07, + "loss": 0.2429, + "step": 7743 + }, + { + "epoch": 0.49, + "grad_norm": 5.130496377326105, + "learning_rate": 5.342600761457853e-07, + "loss": 0.286, + "step": 7744 + }, + { + "epoch": 0.49, + "grad_norm": 0.6032419223302801, + "learning_rate": 5.341570442168194e-07, + "loss": 0.0359, + "step": 7745 + }, + { + "epoch": 0.49, + "grad_norm": 1.0336253550312409, + "learning_rate": 5.340540108306454e-07, + "loss": 0.1477, + "step": 7746 + }, + { + "epoch": 0.49, + "grad_norm": 0.4896560824713781, + "learning_rate": 5.339509759916589e-07, + "loss": 0.1968, + "step": 7747 + }, + { + "epoch": 0.49, + "grad_norm": 0.624759526829821, + "learning_rate": 5.338479397042553e-07, + "loss": 0.3045, + "step": 7748 + }, + { + "epoch": 0.49, + "grad_norm": 0.2795796516751618, + "learning_rate": 5.337449019728306e-07, + "loss": 0.1251, + "step": 7749 + }, + { + "epoch": 0.49, + "grad_norm": 0.8266157280587519, + "learning_rate": 5.336418628017807e-07, + "loss": 0.3139, + "step": 7750 + }, + { + "epoch": 0.49, + "grad_norm": 0.6785908836524304, + "learning_rate": 5.335388221955012e-07, + "loss": 0.2336, + "step": 7751 + }, + { + "epoch": 0.49, + "grad_norm": 2.3237713765769747, + "learning_rate": 5.334357801583881e-07, + "loss": 0.2462, + "step": 7752 + }, + { + "epoch": 0.49, + "grad_norm": 0.5192653336984875, + "learning_rate": 5.333327366948374e-07, + "loss": 0.1865, + "step": 7753 + }, + { + "epoch": 0.49, + "grad_norm": 0.24939429534706634, + "learning_rate": 5.332296918092453e-07, + "loss": 0.2587, + "step": 7754 + }, + { + "epoch": 0.49, + "grad_norm": 0.38511437198997533, + "learning_rate": 5.331266455060077e-07, + "loss": 0.011, + "step": 7755 + }, + { + "epoch": 0.49, + "grad_norm": 0.2836501620836957, + "learning_rate": 5.330235977895205e-07, + "loss": 0.1769, + "step": 7756 + }, + { + "epoch": 0.49, + "grad_norm": 0.995630535414285, + "learning_rate": 5.329205486641806e-07, + "loss": 0.385, + "step": 7757 + }, + { + "epoch": 0.49, + "grad_norm": 0.7148093968339438, + "learning_rate": 5.328174981343838e-07, + "loss": 0.3027, + "step": 7758 + }, + { + "epoch": 0.49, + "grad_norm": 0.8158328911905972, + "learning_rate": 5.327144462045266e-07, + "loss": 0.3766, + "step": 7759 + }, + { + "epoch": 0.49, + "grad_norm": 0.58806648258431, + "learning_rate": 5.326113928790053e-07, + "loss": 0.2653, + "step": 7760 + }, + { + "epoch": 0.49, + "grad_norm": 0.9748352938096041, + "learning_rate": 5.325083381622164e-07, + "loss": 0.178, + "step": 7761 + }, + { + "epoch": 0.49, + "grad_norm": 0.9685051298089195, + "learning_rate": 5.324052820585563e-07, + "loss": 0.2109, + "step": 7762 + }, + { + "epoch": 0.5, + "grad_norm": 1.344605254927458, + "learning_rate": 5.323022245724219e-07, + "loss": 0.1343, + "step": 7763 + }, + { + "epoch": 0.5, + "grad_norm": 2.313227171798812, + "learning_rate": 5.321991657082096e-07, + "loss": 0.0295, + "step": 7764 + }, + { + "epoch": 0.5, + "grad_norm": 0.5575780887903956, + "learning_rate": 5.320961054703163e-07, + "loss": 0.305, + "step": 7765 + }, + { + "epoch": 0.5, + "grad_norm": 0.36333360036625995, + "learning_rate": 5.319930438631386e-07, + "loss": 0.002, + "step": 7766 + }, + { + "epoch": 0.5, + "grad_norm": 0.4898434219005641, + "learning_rate": 5.318899808910733e-07, + "loss": 0.1105, + "step": 7767 + }, + { + "epoch": 0.5, + "grad_norm": 0.24221747318964093, + "learning_rate": 5.317869165585173e-07, + "loss": 0.0364, + "step": 7768 + }, + { + "epoch": 0.5, + "grad_norm": 1.3506909444437065, + "learning_rate": 5.316838508698676e-07, + "loss": 0.2376, + "step": 7769 + }, + { + "epoch": 0.5, + "grad_norm": 1.7355945954046756, + "learning_rate": 5.315807838295208e-07, + "loss": 0.2195, + "step": 7770 + }, + { + "epoch": 0.5, + "grad_norm": 0.6566585438574053, + "learning_rate": 5.314777154418746e-07, + "loss": 0.1648, + "step": 7771 + }, + { + "epoch": 0.5, + "grad_norm": 0.41189169152852995, + "learning_rate": 5.313746457113257e-07, + "loss": 0.1249, + "step": 7772 + }, + { + "epoch": 0.5, + "grad_norm": 0.6918205197923989, + "learning_rate": 5.312715746422713e-07, + "loss": 0.3339, + "step": 7773 + }, + { + "epoch": 0.5, + "grad_norm": 2.2473419047145127, + "learning_rate": 5.311685022391088e-07, + "loss": 0.1643, + "step": 7774 + }, + { + "epoch": 0.5, + "grad_norm": 0.5809377724992406, + "learning_rate": 5.310654285062352e-07, + "loss": 0.2165, + "step": 7775 + }, + { + "epoch": 0.5, + "grad_norm": 2.015666218305165, + "learning_rate": 5.30962353448048e-07, + "loss": 0.1307, + "step": 7776 + }, + { + "epoch": 0.5, + "grad_norm": 0.38949374229009076, + "learning_rate": 5.308592770689447e-07, + "loss": 0.1978, + "step": 7777 + }, + { + "epoch": 0.5, + "grad_norm": 5.157184007758185, + "learning_rate": 5.307561993733225e-07, + "loss": 0.2324, + "step": 7778 + }, + { + "epoch": 0.5, + "grad_norm": 0.22596655073587732, + "learning_rate": 5.306531203655789e-07, + "loss": 0.0781, + "step": 7779 + }, + { + "epoch": 0.5, + "grad_norm": 0.7766956549889035, + "learning_rate": 5.305500400501116e-07, + "loss": 0.2524, + "step": 7780 + }, + { + "epoch": 0.5, + "grad_norm": 0.533611785353364, + "learning_rate": 5.304469584313184e-07, + "loss": 0.1424, + "step": 7781 + }, + { + "epoch": 0.5, + "grad_norm": 0.6913264970543666, + "learning_rate": 5.303438755135966e-07, + "loss": 0.4107, + "step": 7782 + }, + { + "epoch": 0.5, + "grad_norm": 1.9672104309768748, + "learning_rate": 5.30240791301344e-07, + "loss": 0.1589, + "step": 7783 + }, + { + "epoch": 0.5, + "grad_norm": 0.6988466547244304, + "learning_rate": 5.301377057989585e-07, + "loss": 0.1725, + "step": 7784 + }, + { + "epoch": 0.5, + "grad_norm": 0.2910088842559605, + "learning_rate": 5.30034619010838e-07, + "loss": 0.1654, + "step": 7785 + }, + { + "epoch": 0.5, + "grad_norm": 4.355324140429315, + "learning_rate": 5.299315309413801e-07, + "loss": 0.3068, + "step": 7786 + }, + { + "epoch": 0.5, + "grad_norm": 0.8952307282853894, + "learning_rate": 5.298284415949831e-07, + "loss": 0.2814, + "step": 7787 + }, + { + "epoch": 0.5, + "grad_norm": 0.9455958195092917, + "learning_rate": 5.297253509760447e-07, + "loss": 0.1514, + "step": 7788 + }, + { + "epoch": 0.5, + "grad_norm": 1.0228700995109932, + "learning_rate": 5.29622259088963e-07, + "loss": 0.0032, + "step": 7789 + }, + { + "epoch": 0.5, + "grad_norm": 0.914851414661287, + "learning_rate": 5.295191659381361e-07, + "loss": 0.1251, + "step": 7790 + }, + { + "epoch": 0.5, + "grad_norm": 1.1990458833847757, + "learning_rate": 5.294160715279625e-07, + "loss": 0.3261, + "step": 7791 + }, + { + "epoch": 0.5, + "grad_norm": 1.189237769919022, + "learning_rate": 5.293129758628401e-07, + "loss": 0.1638, + "step": 7792 + }, + { + "epoch": 0.5, + "grad_norm": 1.4307245719668564, + "learning_rate": 5.292098789471672e-07, + "loss": 0.1495, + "step": 7793 + }, + { + "epoch": 0.5, + "grad_norm": 0.9181822888819925, + "learning_rate": 5.291067807853421e-07, + "loss": 0.0756, + "step": 7794 + }, + { + "epoch": 0.5, + "grad_norm": 1.0261945761045828, + "learning_rate": 5.290036813817633e-07, + "loss": 0.1123, + "step": 7795 + }, + { + "epoch": 0.5, + "grad_norm": 1.8237465208574568, + "learning_rate": 5.289005807408291e-07, + "loss": 0.1983, + "step": 7796 + }, + { + "epoch": 0.5, + "grad_norm": 0.9719494841336705, + "learning_rate": 5.287974788669379e-07, + "loss": 0.5802, + "step": 7797 + }, + { + "epoch": 0.5, + "grad_norm": 0.7977473228851141, + "learning_rate": 5.286943757644885e-07, + "loss": 0.3637, + "step": 7798 + }, + { + "epoch": 0.5, + "grad_norm": 0.5200201574343315, + "learning_rate": 5.285912714378794e-07, + "loss": 0.1093, + "step": 7799 + }, + { + "epoch": 0.5, + "grad_norm": 0.46932306713004374, + "learning_rate": 5.28488165891509e-07, + "loss": 0.0479, + "step": 7800 + }, + { + "epoch": 0.5, + "grad_norm": 1.44605212307224, + "learning_rate": 5.283850591297764e-07, + "loss": 0.2126, + "step": 7801 + }, + { + "epoch": 0.5, + "grad_norm": 9.7413993988323, + "learning_rate": 5.2828195115708e-07, + "loss": 0.3058, + "step": 7802 + }, + { + "epoch": 0.5, + "grad_norm": 2.4379561362648015, + "learning_rate": 5.281788419778187e-07, + "loss": 0.0603, + "step": 7803 + }, + { + "epoch": 0.5, + "grad_norm": 0.8758741891476285, + "learning_rate": 5.280757315963914e-07, + "loss": 0.1908, + "step": 7804 + }, + { + "epoch": 0.5, + "grad_norm": 0.4412617542815155, + "learning_rate": 5.279726200171968e-07, + "loss": 0.2986, + "step": 7805 + }, + { + "epoch": 0.5, + "grad_norm": 1.5032658183060224, + "learning_rate": 5.278695072446342e-07, + "loss": 0.0798, + "step": 7806 + }, + { + "epoch": 0.5, + "grad_norm": 0.4376415470066588, + "learning_rate": 5.277663932831022e-07, + "loss": 0.0424, + "step": 7807 + }, + { + "epoch": 0.5, + "grad_norm": 0.9769449621105338, + "learning_rate": 5.276632781370003e-07, + "loss": 0.1485, + "step": 7808 + }, + { + "epoch": 0.5, + "grad_norm": 0.7515751597096842, + "learning_rate": 5.275601618107272e-07, + "loss": 0.431, + "step": 7809 + }, + { + "epoch": 0.5, + "grad_norm": 0.7794421637018186, + "learning_rate": 5.274570443086822e-07, + "loss": 0.2394, + "step": 7810 + }, + { + "epoch": 0.5, + "grad_norm": 0.5447644869777392, + "learning_rate": 5.273539256352645e-07, + "loss": 0.168, + "step": 7811 + }, + { + "epoch": 0.5, + "grad_norm": 0.8876176330913443, + "learning_rate": 5.272508057948734e-07, + "loss": 0.1879, + "step": 7812 + }, + { + "epoch": 0.5, + "grad_norm": 0.9865574689471549, + "learning_rate": 5.27147684791908e-07, + "loss": 0.2503, + "step": 7813 + }, + { + "epoch": 0.5, + "grad_norm": 0.9871320823600839, + "learning_rate": 5.270445626307679e-07, + "loss": 0.3219, + "step": 7814 + }, + { + "epoch": 0.5, + "grad_norm": 0.7105275225394994, + "learning_rate": 5.269414393158522e-07, + "loss": 0.1235, + "step": 7815 + }, + { + "epoch": 0.5, + "grad_norm": 1.1802947614800248, + "learning_rate": 5.268383148515607e-07, + "loss": 0.0591, + "step": 7816 + }, + { + "epoch": 0.5, + "grad_norm": 6.3279453807796, + "learning_rate": 5.267351892422928e-07, + "loss": 0.1419, + "step": 7817 + }, + { + "epoch": 0.5, + "grad_norm": 1.4359935887534894, + "learning_rate": 5.266320624924479e-07, + "loss": 0.4444, + "step": 7818 + }, + { + "epoch": 0.5, + "grad_norm": 1.335113781847231, + "learning_rate": 5.265289346064258e-07, + "loss": 0.2569, + "step": 7819 + }, + { + "epoch": 0.5, + "grad_norm": 5.056877467538794, + "learning_rate": 5.264258055886258e-07, + "loss": 0.2682, + "step": 7820 + }, + { + "epoch": 0.5, + "grad_norm": 0.6868373893259, + "learning_rate": 5.26322675443448e-07, + "loss": 0.1618, + "step": 7821 + }, + { + "epoch": 0.5, + "grad_norm": 0.5173004207683524, + "learning_rate": 5.26219544175292e-07, + "loss": 0.2116, + "step": 7822 + }, + { + "epoch": 0.5, + "grad_norm": 0.5744844435517102, + "learning_rate": 5.261164117885572e-07, + "loss": 0.2363, + "step": 7823 + }, + { + "epoch": 0.5, + "grad_norm": 3.0058124276324367, + "learning_rate": 5.26013278287644e-07, + "loss": 0.2604, + "step": 7824 + }, + { + "epoch": 0.5, + "grad_norm": 0.5659951737869565, + "learning_rate": 5.259101436769522e-07, + "loss": 0.1254, + "step": 7825 + }, + { + "epoch": 0.5, + "grad_norm": 1.8246655049506346, + "learning_rate": 5.258070079608814e-07, + "loss": 0.0466, + "step": 7826 + }, + { + "epoch": 0.5, + "grad_norm": 2.354953605724272, + "learning_rate": 5.257038711438318e-07, + "loss": 0.1545, + "step": 7827 + }, + { + "epoch": 0.5, + "grad_norm": 1.035775037723114, + "learning_rate": 5.256007332302033e-07, + "loss": 0.2671, + "step": 7828 + }, + { + "epoch": 0.5, + "grad_norm": 4.089958319818489, + "learning_rate": 5.254975942243962e-07, + "loss": 0.3936, + "step": 7829 + }, + { + "epoch": 0.5, + "grad_norm": 9.762537594903154, + "learning_rate": 5.253944541308105e-07, + "loss": 0.1401, + "step": 7830 + }, + { + "epoch": 0.5, + "grad_norm": 2.746543211746795, + "learning_rate": 5.252913129538462e-07, + "loss": 0.2737, + "step": 7831 + }, + { + "epoch": 0.5, + "grad_norm": 0.28309018431784894, + "learning_rate": 5.251881706979036e-07, + "loss": 0.12, + "step": 7832 + }, + { + "epoch": 0.5, + "grad_norm": 0.8489990124044013, + "learning_rate": 5.25085027367383e-07, + "loss": 0.2752, + "step": 7833 + }, + { + "epoch": 0.5, + "grad_norm": 0.3177622837747427, + "learning_rate": 5.249818829666849e-07, + "loss": 0.1412, + "step": 7834 + }, + { + "epoch": 0.5, + "grad_norm": 0.8600800060063669, + "learning_rate": 5.248787375002093e-07, + "loss": 0.3269, + "step": 7835 + }, + { + "epoch": 0.5, + "grad_norm": 2.9871049459997794, + "learning_rate": 5.247755909723569e-07, + "loss": 0.0822, + "step": 7836 + }, + { + "epoch": 0.5, + "grad_norm": 0.824574174471444, + "learning_rate": 5.24672443387528e-07, + "loss": 0.2035, + "step": 7837 + }, + { + "epoch": 0.5, + "grad_norm": 0.7726990955671356, + "learning_rate": 5.245692947501229e-07, + "loss": 0.0721, + "step": 7838 + }, + { + "epoch": 0.5, + "grad_norm": 0.7629832334784106, + "learning_rate": 5.244661450645424e-07, + "loss": 0.0428, + "step": 7839 + }, + { + "epoch": 0.5, + "grad_norm": 1.025522337254423, + "learning_rate": 5.243629943351868e-07, + "loss": 0.4119, + "step": 7840 + }, + { + "epoch": 0.5, + "grad_norm": 1.0978245388548087, + "learning_rate": 5.242598425664569e-07, + "loss": 0.3479, + "step": 7841 + }, + { + "epoch": 0.5, + "grad_norm": 0.4144828682749771, + "learning_rate": 5.241566897627535e-07, + "loss": 0.35, + "step": 7842 + }, + { + "epoch": 0.5, + "grad_norm": 0.348071760526536, + "learning_rate": 5.240535359284771e-07, + "loss": 0.0055, + "step": 7843 + }, + { + "epoch": 0.5, + "grad_norm": 14.121454223171519, + "learning_rate": 5.239503810680285e-07, + "loss": 0.1295, + "step": 7844 + }, + { + "epoch": 0.5, + "grad_norm": 10.88673366259795, + "learning_rate": 5.238472251858085e-07, + "loss": 0.02, + "step": 7845 + }, + { + "epoch": 0.5, + "grad_norm": 0.562742651472133, + "learning_rate": 5.23744068286218e-07, + "loss": 0.2023, + "step": 7846 + }, + { + "epoch": 0.5, + "grad_norm": 0.845274488769913, + "learning_rate": 5.236409103736578e-07, + "loss": 0.0736, + "step": 7847 + }, + { + "epoch": 0.5, + "grad_norm": 0.6935526470572801, + "learning_rate": 5.235377514525287e-07, + "loss": 0.2186, + "step": 7848 + }, + { + "epoch": 0.5, + "grad_norm": 1.491441170439203, + "learning_rate": 5.234345915272319e-07, + "loss": 0.1227, + "step": 7849 + }, + { + "epoch": 0.5, + "grad_norm": 1.0239583355372224, + "learning_rate": 5.233314306021683e-07, + "loss": 0.1545, + "step": 7850 + }, + { + "epoch": 0.5, + "grad_norm": 2.8937449633580066, + "learning_rate": 5.232282686817391e-07, + "loss": 0.2691, + "step": 7851 + }, + { + "epoch": 0.5, + "grad_norm": 0.48963595647131725, + "learning_rate": 5.23125105770345e-07, + "loss": 0.0804, + "step": 7852 + }, + { + "epoch": 0.5, + "grad_norm": 0.800896218318243, + "learning_rate": 5.230219418723877e-07, + "loss": 0.1848, + "step": 7853 + }, + { + "epoch": 0.5, + "grad_norm": 1.6642178462979964, + "learning_rate": 5.229187769922678e-07, + "loss": 0.3277, + "step": 7854 + }, + { + "epoch": 0.5, + "grad_norm": 0.6979156812551884, + "learning_rate": 5.228156111343869e-07, + "loss": 0.0602, + "step": 7855 + }, + { + "epoch": 0.5, + "grad_norm": 0.7067320927297459, + "learning_rate": 5.227124443031463e-07, + "loss": 0.1629, + "step": 7856 + }, + { + "epoch": 0.5, + "grad_norm": 1.7787555692149855, + "learning_rate": 5.226092765029471e-07, + "loss": 0.1465, + "step": 7857 + }, + { + "epoch": 0.5, + "grad_norm": 0.9791138546302344, + "learning_rate": 5.225061077381906e-07, + "loss": 0.1425, + "step": 7858 + }, + { + "epoch": 0.5, + "grad_norm": 1.396157475943847, + "learning_rate": 5.224029380132784e-07, + "loss": 0.0152, + "step": 7859 + }, + { + "epoch": 0.5, + "grad_norm": 1.5747056921221125, + "learning_rate": 5.222997673326117e-07, + "loss": 0.0786, + "step": 7860 + }, + { + "epoch": 0.5, + "grad_norm": 1.4217302165875072, + "learning_rate": 5.221965957005923e-07, + "loss": 0.0959, + "step": 7861 + }, + { + "epoch": 0.5, + "grad_norm": 6.778564964888106, + "learning_rate": 5.220934231216213e-07, + "loss": 0.1542, + "step": 7862 + }, + { + "epoch": 0.5, + "grad_norm": 0.471767682061118, + "learning_rate": 5.219902496001007e-07, + "loss": 0.4055, + "step": 7863 + }, + { + "epoch": 0.5, + "grad_norm": 0.5294869636570287, + "learning_rate": 5.218870751404318e-07, + "loss": 0.0591, + "step": 7864 + }, + { + "epoch": 0.5, + "grad_norm": 0.7789689219152222, + "learning_rate": 5.217838997470161e-07, + "loss": 0.2538, + "step": 7865 + }, + { + "epoch": 0.5, + "grad_norm": 7.266033380340139, + "learning_rate": 5.216807234242556e-07, + "loss": 0.1506, + "step": 7866 + }, + { + "epoch": 0.5, + "grad_norm": 0.6533613723205102, + "learning_rate": 5.215775461765518e-07, + "loss": 0.1102, + "step": 7867 + }, + { + "epoch": 0.5, + "grad_norm": 0.7032095492239564, + "learning_rate": 5.214743680083063e-07, + "loss": 0.325, + "step": 7868 + }, + { + "epoch": 0.5, + "grad_norm": 3.2152376585789004, + "learning_rate": 5.213711889239213e-07, + "loss": 0.2158, + "step": 7869 + }, + { + "epoch": 0.5, + "grad_norm": 2.031521999729483, + "learning_rate": 5.212680089277985e-07, + "loss": 0.023, + "step": 7870 + }, + { + "epoch": 0.5, + "grad_norm": 1.4208887784432154, + "learning_rate": 5.211648280243395e-07, + "loss": 0.0118, + "step": 7871 + }, + { + "epoch": 0.5, + "grad_norm": 0.47270387623683613, + "learning_rate": 5.210616462179464e-07, + "loss": 0.2021, + "step": 7872 + }, + { + "epoch": 0.5, + "grad_norm": 4.21269858601264, + "learning_rate": 5.209584635130213e-07, + "loss": 0.2684, + "step": 7873 + }, + { + "epoch": 0.5, + "grad_norm": 0.7791515887512775, + "learning_rate": 5.20855279913966e-07, + "loss": 0.2264, + "step": 7874 + }, + { + "epoch": 0.5, + "grad_norm": 2.987061945698786, + "learning_rate": 5.207520954251824e-07, + "loss": 0.2461, + "step": 7875 + }, + { + "epoch": 0.5, + "grad_norm": 1.1747753057153916, + "learning_rate": 5.206489100510728e-07, + "loss": 0.2769, + "step": 7876 + }, + { + "epoch": 0.5, + "grad_norm": 0.6665906890843033, + "learning_rate": 5.205457237960391e-07, + "loss": 0.2738, + "step": 7877 + }, + { + "epoch": 0.5, + "grad_norm": 0.369531591363996, + "learning_rate": 5.204425366644835e-07, + "loss": 0.1376, + "step": 7878 + }, + { + "epoch": 0.5, + "grad_norm": 1.8279046066898146, + "learning_rate": 5.203393486608083e-07, + "loss": 0.1184, + "step": 7879 + }, + { + "epoch": 0.5, + "grad_norm": 0.3569836166745123, + "learning_rate": 5.202361597894156e-07, + "loss": 0.0574, + "step": 7880 + }, + { + "epoch": 0.5, + "grad_norm": 0.7107200054599689, + "learning_rate": 5.201329700547076e-07, + "loss": 0.2351, + "step": 7881 + }, + { + "epoch": 0.5, + "grad_norm": 0.19613903012530348, + "learning_rate": 5.200297794610866e-07, + "loss": 0.0029, + "step": 7882 + }, + { + "epoch": 0.5, + "grad_norm": 1.3353815282918122, + "learning_rate": 5.199265880129549e-07, + "loss": 0.2413, + "step": 7883 + }, + { + "epoch": 0.5, + "grad_norm": 0.24590316881461446, + "learning_rate": 5.19823395714715e-07, + "loss": 0.0928, + "step": 7884 + }, + { + "epoch": 0.5, + "grad_norm": 0.6912026268662488, + "learning_rate": 5.197202025707692e-07, + "loss": 0.0113, + "step": 7885 + }, + { + "epoch": 0.5, + "grad_norm": 1.3384877353144549, + "learning_rate": 5.196170085855197e-07, + "loss": 0.2684, + "step": 7886 + }, + { + "epoch": 0.5, + "grad_norm": 1.6400111663334485, + "learning_rate": 5.195138137633695e-07, + "loss": 0.2284, + "step": 7887 + }, + { + "epoch": 0.5, + "grad_norm": 0.7398213844163112, + "learning_rate": 5.194106181087205e-07, + "loss": 0.2143, + "step": 7888 + }, + { + "epoch": 0.5, + "grad_norm": 2.2830114388100147, + "learning_rate": 5.193074216259756e-07, + "loss": 0.2561, + "step": 7889 + }, + { + "epoch": 0.5, + "grad_norm": 0.9472489104447508, + "learning_rate": 5.192042243195374e-07, + "loss": 0.2109, + "step": 7890 + }, + { + "epoch": 0.5, + "grad_norm": 5.367648599858341, + "learning_rate": 5.191010261938084e-07, + "loss": 0.4713, + "step": 7891 + }, + { + "epoch": 0.5, + "grad_norm": 1.155433735597053, + "learning_rate": 5.18997827253191e-07, + "loss": 0.1569, + "step": 7892 + }, + { + "epoch": 0.5, + "grad_norm": 0.406164477273476, + "learning_rate": 5.188946275020883e-07, + "loss": 0.0912, + "step": 7893 + }, + { + "epoch": 0.5, + "grad_norm": 0.9541769863246171, + "learning_rate": 5.187914269449027e-07, + "loss": 0.1282, + "step": 7894 + }, + { + "epoch": 0.5, + "grad_norm": 1.12016431612114, + "learning_rate": 5.186882255860371e-07, + "loss": 0.2475, + "step": 7895 + }, + { + "epoch": 0.5, + "grad_norm": 1.077277458584877, + "learning_rate": 5.185850234298942e-07, + "loss": 0.0833, + "step": 7896 + }, + { + "epoch": 0.5, + "grad_norm": 1.1287624437146637, + "learning_rate": 5.184818204808768e-07, + "loss": 0.4477, + "step": 7897 + }, + { + "epoch": 0.5, + "grad_norm": 4.067688187277838, + "learning_rate": 5.183786167433879e-07, + "loss": 0.1426, + "step": 7898 + }, + { + "epoch": 0.5, + "grad_norm": 0.7476661301573576, + "learning_rate": 5.182754122218301e-07, + "loss": 0.0813, + "step": 7899 + }, + { + "epoch": 0.5, + "grad_norm": 0.2548690992889109, + "learning_rate": 5.181722069206067e-07, + "loss": 0.1678, + "step": 7900 + }, + { + "epoch": 0.5, + "grad_norm": 2.4858376999577905, + "learning_rate": 5.180690008441202e-07, + "loss": 0.0102, + "step": 7901 + }, + { + "epoch": 0.5, + "grad_norm": 1.0979010552464186, + "learning_rate": 5.179657939967739e-07, + "loss": 0.2151, + "step": 7902 + }, + { + "epoch": 0.5, + "grad_norm": 3.7467724528418, + "learning_rate": 5.178625863829708e-07, + "loss": 0.1008, + "step": 7903 + }, + { + "epoch": 0.5, + "grad_norm": 1.0258098759250416, + "learning_rate": 5.177593780071138e-07, + "loss": 0.216, + "step": 7904 + }, + { + "epoch": 0.5, + "grad_norm": 0.8528197600824754, + "learning_rate": 5.176561688736059e-07, + "loss": 0.2002, + "step": 7905 + }, + { + "epoch": 0.5, + "grad_norm": 3.4726365861437474, + "learning_rate": 5.175529589868505e-07, + "loss": 0.1345, + "step": 7906 + }, + { + "epoch": 0.5, + "grad_norm": 6.778146508546821, + "learning_rate": 5.174497483512505e-07, + "loss": 0.1732, + "step": 7907 + }, + { + "epoch": 0.5, + "grad_norm": 2.175122001822574, + "learning_rate": 5.173465369712092e-07, + "loss": 0.0348, + "step": 7908 + }, + { + "epoch": 0.5, + "grad_norm": 1.3902211936326443, + "learning_rate": 5.172433248511298e-07, + "loss": 0.2418, + "step": 7909 + }, + { + "epoch": 0.5, + "grad_norm": 0.6413738610015453, + "learning_rate": 5.171401119954155e-07, + "loss": 0.2419, + "step": 7910 + }, + { + "epoch": 0.5, + "grad_norm": 0.9434512738195908, + "learning_rate": 5.170368984084695e-07, + "loss": 0.0773, + "step": 7911 + }, + { + "epoch": 0.5, + "grad_norm": 0.49654956072595885, + "learning_rate": 5.169336840946951e-07, + "loss": 0.2155, + "step": 7912 + }, + { + "epoch": 0.5, + "grad_norm": 0.6266402889062209, + "learning_rate": 5.168304690584957e-07, + "loss": 0.2152, + "step": 7913 + }, + { + "epoch": 0.5, + "grad_norm": 1.0088775659474432, + "learning_rate": 5.167272533042747e-07, + "loss": 0.1774, + "step": 7914 + }, + { + "epoch": 0.5, + "grad_norm": 1.413291250801521, + "learning_rate": 5.166240368364355e-07, + "loss": 0.3786, + "step": 7915 + }, + { + "epoch": 0.5, + "grad_norm": 1.4757878077723017, + "learning_rate": 5.165208196593815e-07, + "loss": 0.1337, + "step": 7916 + }, + { + "epoch": 0.5, + "grad_norm": 0.5831083215292089, + "learning_rate": 5.16417601777516e-07, + "loss": 0.2393, + "step": 7917 + }, + { + "epoch": 0.5, + "grad_norm": 0.2954726822787284, + "learning_rate": 5.163143831952428e-07, + "loss": 0.015, + "step": 7918 + }, + { + "epoch": 0.51, + "grad_norm": 3.0815712859292015, + "learning_rate": 5.16211163916965e-07, + "loss": 0.2246, + "step": 7919 + }, + { + "epoch": 0.51, + "grad_norm": 0.8915610110045376, + "learning_rate": 5.161079439470865e-07, + "loss": 0.3047, + "step": 7920 + }, + { + "epoch": 0.51, + "grad_norm": 2.616067334512679, + "learning_rate": 5.160047232900105e-07, + "loss": 0.0679, + "step": 7921 + }, + { + "epoch": 0.51, + "grad_norm": 2.6201283893735887, + "learning_rate": 5.159015019501412e-07, + "loss": 0.1053, + "step": 7922 + }, + { + "epoch": 0.51, + "grad_norm": 0.5590432125548226, + "learning_rate": 5.157982799318816e-07, + "loss": 0.0049, + "step": 7923 + }, + { + "epoch": 0.51, + "grad_norm": 1.5662754624081587, + "learning_rate": 5.156950572396357e-07, + "loss": 0.002, + "step": 7924 + }, + { + "epoch": 0.51, + "grad_norm": 0.3489688627740372, + "learning_rate": 5.15591833877807e-07, + "loss": 0.1944, + "step": 7925 + }, + { + "epoch": 0.51, + "grad_norm": 0.9277712734648788, + "learning_rate": 5.154886098507994e-07, + "loss": 0.2823, + "step": 7926 + }, + { + "epoch": 0.51, + "grad_norm": 0.3126595205279847, + "learning_rate": 5.153853851630167e-07, + "loss": 0.2276, + "step": 7927 + }, + { + "epoch": 0.51, + "grad_norm": 0.5437867303477449, + "learning_rate": 5.152821598188624e-07, + "loss": 0.2645, + "step": 7928 + }, + { + "epoch": 0.51, + "grad_norm": 9.298265152104033, + "learning_rate": 5.151789338227404e-07, + "loss": 0.2065, + "step": 7929 + }, + { + "epoch": 0.51, + "grad_norm": 1.3523082298627538, + "learning_rate": 5.150757071790546e-07, + "loss": 0.2012, + "step": 7930 + }, + { + "epoch": 0.51, + "grad_norm": 0.6762209296542762, + "learning_rate": 5.14972479892209e-07, + "loss": 0.1619, + "step": 7931 + }, + { + "epoch": 0.51, + "grad_norm": 0.8012859942811726, + "learning_rate": 5.148692519666071e-07, + "loss": 0.1933, + "step": 7932 + }, + { + "epoch": 0.51, + "grad_norm": 1.3174116071937148, + "learning_rate": 5.147660234066532e-07, + "loss": 0.1909, + "step": 7933 + }, + { + "epoch": 0.51, + "grad_norm": 0.46627943344389033, + "learning_rate": 5.146627942167509e-07, + "loss": 0.0972, + "step": 7934 + }, + { + "epoch": 0.51, + "grad_norm": 0.42565532044733345, + "learning_rate": 5.145595644013044e-07, + "loss": 0.1496, + "step": 7935 + }, + { + "epoch": 0.51, + "grad_norm": 5.854961639675671, + "learning_rate": 5.144563339647177e-07, + "loss": 0.1301, + "step": 7936 + }, + { + "epoch": 0.51, + "grad_norm": 1.828730714431461, + "learning_rate": 5.143531029113946e-07, + "loss": 0.3431, + "step": 7937 + }, + { + "epoch": 0.51, + "grad_norm": 2.8837225539057707, + "learning_rate": 5.142498712457392e-07, + "loss": 0.1427, + "step": 7938 + }, + { + "epoch": 0.51, + "grad_norm": 0.3271662302507762, + "learning_rate": 5.141466389721557e-07, + "loss": 0.1194, + "step": 7939 + }, + { + "epoch": 0.51, + "grad_norm": 0.29422800625131323, + "learning_rate": 5.140434060950482e-07, + "loss": 0.1945, + "step": 7940 + }, + { + "epoch": 0.51, + "grad_norm": 1.8082749986629059, + "learning_rate": 5.139401726188208e-07, + "loss": 0.2746, + "step": 7941 + }, + { + "epoch": 0.51, + "grad_norm": 4.66322914498958, + "learning_rate": 5.138369385478774e-07, + "loss": 0.3334, + "step": 7942 + }, + { + "epoch": 0.51, + "grad_norm": 0.5024296011165815, + "learning_rate": 5.137337038866227e-07, + "loss": 0.0649, + "step": 7943 + }, + { + "epoch": 0.51, + "grad_norm": 0.5067471540265036, + "learning_rate": 5.136304686394604e-07, + "loss": 0.0759, + "step": 7944 + }, + { + "epoch": 0.51, + "grad_norm": 0.6354339781473719, + "learning_rate": 5.135272328107949e-07, + "loss": 0.1112, + "step": 7945 + }, + { + "epoch": 0.51, + "grad_norm": 1.129014749509955, + "learning_rate": 5.134239964050307e-07, + "loss": 0.1003, + "step": 7946 + }, + { + "epoch": 0.51, + "grad_norm": 0.5008406881251112, + "learning_rate": 5.133207594265715e-07, + "loss": 0.1838, + "step": 7947 + }, + { + "epoch": 0.51, + "grad_norm": 1.1048411268653404, + "learning_rate": 5.132175218798221e-07, + "loss": 0.1741, + "step": 7948 + }, + { + "epoch": 0.51, + "grad_norm": 1.0134837594097874, + "learning_rate": 5.131142837691865e-07, + "loss": 0.1174, + "step": 7949 + }, + { + "epoch": 0.51, + "grad_norm": 11.01198196378554, + "learning_rate": 5.130110450990693e-07, + "loss": 0.1294, + "step": 7950 + }, + { + "epoch": 0.51, + "grad_norm": 3.56648443745084, + "learning_rate": 5.129078058738747e-07, + "loss": 0.0392, + "step": 7951 + }, + { + "epoch": 0.51, + "grad_norm": 1.2297248998717523, + "learning_rate": 5.128045660980072e-07, + "loss": 0.0891, + "step": 7952 + }, + { + "epoch": 0.51, + "grad_norm": 1.4581243116909357, + "learning_rate": 5.127013257758712e-07, + "loss": 0.1297, + "step": 7953 + }, + { + "epoch": 0.51, + "grad_norm": 0.9345380450389321, + "learning_rate": 5.125980849118712e-07, + "loss": 0.189, + "step": 7954 + }, + { + "epoch": 0.51, + "grad_norm": 0.7370901183156554, + "learning_rate": 5.124948435104114e-07, + "loss": 0.1185, + "step": 7955 + }, + { + "epoch": 0.51, + "grad_norm": 0.7193480222061377, + "learning_rate": 5.123916015758964e-07, + "loss": 0.2652, + "step": 7956 + }, + { + "epoch": 0.51, + "grad_norm": 0.6466032299607144, + "learning_rate": 5.122883591127309e-07, + "loss": 0.1024, + "step": 7957 + }, + { + "epoch": 0.51, + "grad_norm": 0.7860832056788873, + "learning_rate": 5.121851161253192e-07, + "loss": 0.226, + "step": 7958 + }, + { + "epoch": 0.51, + "grad_norm": 0.6013005123668494, + "learning_rate": 5.120818726180661e-07, + "loss": 0.0711, + "step": 7959 + }, + { + "epoch": 0.51, + "grad_norm": 6.064753175786098, + "learning_rate": 5.11978628595376e-07, + "loss": 0.1304, + "step": 7960 + }, + { + "epoch": 0.51, + "grad_norm": 0.9750951139813014, + "learning_rate": 5.118753840616535e-07, + "loss": 0.0555, + "step": 7961 + }, + { + "epoch": 0.51, + "grad_norm": 0.9487612432620487, + "learning_rate": 5.117721390213033e-07, + "loss": 0.4431, + "step": 7962 + }, + { + "epoch": 0.51, + "grad_norm": 1.3780561210469224, + "learning_rate": 5.116688934787299e-07, + "loss": 0.227, + "step": 7963 + }, + { + "epoch": 0.51, + "grad_norm": 0.9622208866925224, + "learning_rate": 5.11565647438338e-07, + "loss": 0.2718, + "step": 7964 + }, + { + "epoch": 0.51, + "grad_norm": 1.3846857939877564, + "learning_rate": 5.114624009045324e-07, + "loss": 0.4056, + "step": 7965 + }, + { + "epoch": 0.51, + "grad_norm": 0.6896302473879041, + "learning_rate": 5.113591538817176e-07, + "loss": 0.0749, + "step": 7966 + }, + { + "epoch": 0.51, + "grad_norm": 4.1611748449305255, + "learning_rate": 5.112559063742986e-07, + "loss": 0.3602, + "step": 7967 + }, + { + "epoch": 0.51, + "grad_norm": 0.9255598049382037, + "learning_rate": 5.111526583866799e-07, + "loss": 0.2475, + "step": 7968 + }, + { + "epoch": 0.51, + "grad_norm": 0.5252074153445964, + "learning_rate": 5.110494099232665e-07, + "loss": 0.1219, + "step": 7969 + }, + { + "epoch": 0.51, + "grad_norm": 11.55610246662506, + "learning_rate": 5.109461609884631e-07, + "loss": 0.193, + "step": 7970 + }, + { + "epoch": 0.51, + "grad_norm": 0.7640356996064016, + "learning_rate": 5.108429115866744e-07, + "loss": 0.2268, + "step": 7971 + }, + { + "epoch": 0.51, + "grad_norm": 0.4296766191799689, + "learning_rate": 5.107396617223052e-07, + "loss": 0.2183, + "step": 7972 + }, + { + "epoch": 0.51, + "grad_norm": 6.612663793624046, + "learning_rate": 5.106364113997607e-07, + "loss": 0.3109, + "step": 7973 + }, + { + "epoch": 0.51, + "grad_norm": 1.2406635949629878, + "learning_rate": 5.105331606234452e-07, + "loss": 0.2663, + "step": 7974 + }, + { + "epoch": 0.51, + "grad_norm": 0.3077453661861215, + "learning_rate": 5.10429909397764e-07, + "loss": 0.0636, + "step": 7975 + }, + { + "epoch": 0.51, + "grad_norm": 0.7591910911749146, + "learning_rate": 5.103266577271219e-07, + "loss": 0.0717, + "step": 7976 + }, + { + "epoch": 0.51, + "grad_norm": 1.3426273054482245, + "learning_rate": 5.102234056159239e-07, + "loss": 0.3577, + "step": 7977 + }, + { + "epoch": 0.51, + "grad_norm": 0.4971718407893129, + "learning_rate": 5.101201530685748e-07, + "loss": 0.127, + "step": 7978 + }, + { + "epoch": 0.51, + "grad_norm": 1.0861819445275214, + "learning_rate": 5.100169000894796e-07, + "loss": 0.1992, + "step": 7979 + }, + { + "epoch": 0.51, + "grad_norm": 1.9030532265442714, + "learning_rate": 5.099136466830434e-07, + "loss": 0.226, + "step": 7980 + }, + { + "epoch": 0.51, + "grad_norm": 1.1638092337308699, + "learning_rate": 5.09810392853671e-07, + "loss": 0.2711, + "step": 7981 + }, + { + "epoch": 0.51, + "grad_norm": 0.5815643900116362, + "learning_rate": 5.097071386057676e-07, + "loss": 0.2693, + "step": 7982 + }, + { + "epoch": 0.51, + "grad_norm": 0.3478150074781302, + "learning_rate": 5.096038839437381e-07, + "loss": 0.2425, + "step": 7983 + }, + { + "epoch": 0.51, + "grad_norm": 0.34556668354772546, + "learning_rate": 5.095006288719875e-07, + "loss": 0.1279, + "step": 7984 + }, + { + "epoch": 0.51, + "grad_norm": 0.696040851455979, + "learning_rate": 5.093973733949212e-07, + "loss": 0.2189, + "step": 7985 + }, + { + "epoch": 0.51, + "grad_norm": 0.8684467332166635, + "learning_rate": 5.09294117516944e-07, + "loss": 0.4641, + "step": 7986 + }, + { + "epoch": 0.51, + "grad_norm": 2.106608551218204, + "learning_rate": 5.09190861242461e-07, + "loss": 0.283, + "step": 7987 + }, + { + "epoch": 0.51, + "grad_norm": 0.6784539215400707, + "learning_rate": 5.090876045758774e-07, + "loss": 0.3893, + "step": 7988 + }, + { + "epoch": 0.51, + "grad_norm": 0.562374800966257, + "learning_rate": 5.089843475215983e-07, + "loss": 0.1535, + "step": 7989 + }, + { + "epoch": 0.51, + "grad_norm": 3.6739943047222257, + "learning_rate": 5.08881090084029e-07, + "loss": 0.2647, + "step": 7990 + }, + { + "epoch": 0.51, + "grad_norm": 0.4446705559212927, + "learning_rate": 5.087778322675744e-07, + "loss": 0.3003, + "step": 7991 + }, + { + "epoch": 0.51, + "grad_norm": 1.1347843340063282, + "learning_rate": 5.086745740766398e-07, + "loss": 0.3486, + "step": 7992 + }, + { + "epoch": 0.51, + "grad_norm": 3.6966441014613145, + "learning_rate": 5.085713155156305e-07, + "loss": 0.0802, + "step": 7993 + }, + { + "epoch": 0.51, + "grad_norm": 0.6872586725807186, + "learning_rate": 5.084680565889517e-07, + "loss": 0.209, + "step": 7994 + }, + { + "epoch": 0.51, + "grad_norm": 0.4954523590391869, + "learning_rate": 5.083647973010085e-07, + "loss": 0.0734, + "step": 7995 + }, + { + "epoch": 0.51, + "grad_norm": 0.630303025252287, + "learning_rate": 5.082615376562063e-07, + "loss": 0.1912, + "step": 7996 + }, + { + "epoch": 0.51, + "grad_norm": 0.5332288454112831, + "learning_rate": 5.081582776589502e-07, + "loss": 0.2678, + "step": 7997 + }, + { + "epoch": 0.51, + "grad_norm": 0.9638121437248867, + "learning_rate": 5.080550173136456e-07, + "loss": 0.1524, + "step": 7998 + }, + { + "epoch": 0.51, + "grad_norm": 0.5817877016507166, + "learning_rate": 5.079517566246979e-07, + "loss": 0.2753, + "step": 7999 + }, + { + "epoch": 0.51, + "grad_norm": 0.7366627544026684, + "learning_rate": 5.078484955965121e-07, + "loss": 0.2207, + "step": 8000 + }, + { + "epoch": 0.51, + "grad_norm": 0.6617489013541173, + "learning_rate": 5.077452342334938e-07, + "loss": 0.1174, + "step": 8001 + }, + { + "epoch": 0.51, + "grad_norm": 4.537758947180974, + "learning_rate": 5.076419725400482e-07, + "loss": 0.1951, + "step": 8002 + }, + { + "epoch": 0.51, + "grad_norm": 0.894337567628483, + "learning_rate": 5.075387105205809e-07, + "loss": 0.0404, + "step": 8003 + }, + { + "epoch": 0.51, + "grad_norm": 0.408447184632947, + "learning_rate": 5.074354481794968e-07, + "loss": 0.0025, + "step": 8004 + }, + { + "epoch": 0.51, + "grad_norm": 1.2473253288384847, + "learning_rate": 5.073321855212016e-07, + "loss": 0.0523, + "step": 8005 + }, + { + "epoch": 0.51, + "grad_norm": 0.38759903538036455, + "learning_rate": 5.072289225501007e-07, + "loss": 0.0103, + "step": 8006 + }, + { + "epoch": 0.51, + "grad_norm": 1.7517533132809295, + "learning_rate": 5.071256592705993e-07, + "loss": 0.082, + "step": 8007 + }, + { + "epoch": 0.51, + "grad_norm": 0.21340932328299736, + "learning_rate": 5.07022395687103e-07, + "loss": 0.0664, + "step": 8008 + }, + { + "epoch": 0.51, + "grad_norm": 0.6888549592613122, + "learning_rate": 5.069191318040171e-07, + "loss": 0.1864, + "step": 8009 + }, + { + "epoch": 0.51, + "grad_norm": 0.6952146654491679, + "learning_rate": 5.068158676257471e-07, + "loss": 0.2177, + "step": 8010 + }, + { + "epoch": 0.51, + "grad_norm": 0.21022522516618827, + "learning_rate": 5.067126031566987e-07, + "loss": 0.009, + "step": 8011 + }, + { + "epoch": 0.51, + "grad_norm": 0.3295249984265424, + "learning_rate": 5.06609338401277e-07, + "loss": 0.2187, + "step": 8012 + }, + { + "epoch": 0.51, + "grad_norm": 0.3381397979980587, + "learning_rate": 5.065060733638877e-07, + "loss": 0.2721, + "step": 8013 + }, + { + "epoch": 0.51, + "grad_norm": 0.7951994549328867, + "learning_rate": 5.064028080489363e-07, + "loss": 0.0808, + "step": 8014 + }, + { + "epoch": 0.51, + "grad_norm": 1.208335598045493, + "learning_rate": 5.062995424608283e-07, + "loss": 0.3566, + "step": 8015 + }, + { + "epoch": 0.51, + "grad_norm": 0.9711718417727587, + "learning_rate": 5.061962766039691e-07, + "loss": 0.1127, + "step": 8016 + }, + { + "epoch": 0.51, + "grad_norm": 0.3681723380290377, + "learning_rate": 5.060930104827641e-07, + "loss": 0.0907, + "step": 8017 + }, + { + "epoch": 0.51, + "grad_norm": 0.7362546339027762, + "learning_rate": 5.05989744101619e-07, + "loss": 0.1672, + "step": 8018 + }, + { + "epoch": 0.51, + "grad_norm": 0.690255265327017, + "learning_rate": 5.058864774649395e-07, + "loss": 0.1658, + "step": 8019 + }, + { + "epoch": 0.51, + "grad_norm": 0.873125487517271, + "learning_rate": 5.05783210577131e-07, + "loss": 0.2203, + "step": 8020 + }, + { + "epoch": 0.51, + "grad_norm": 0.5923813617712705, + "learning_rate": 5.056799434425992e-07, + "loss": 0.2328, + "step": 8021 + }, + { + "epoch": 0.51, + "grad_norm": 1.8319052137680274, + "learning_rate": 5.055766760657496e-07, + "loss": 0.1973, + "step": 8022 + }, + { + "epoch": 0.51, + "grad_norm": 1.2575692636570914, + "learning_rate": 5.054734084509877e-07, + "loss": 0.1985, + "step": 8023 + }, + { + "epoch": 0.51, + "grad_norm": 1.0313095356288016, + "learning_rate": 5.053701406027192e-07, + "loss": 0.1867, + "step": 8024 + }, + { + "epoch": 0.51, + "grad_norm": 0.44176030278131667, + "learning_rate": 5.052668725253498e-07, + "loss": 0.109, + "step": 8025 + }, + { + "epoch": 0.51, + "grad_norm": 0.6980213760967009, + "learning_rate": 5.051636042232849e-07, + "loss": 0.1877, + "step": 8026 + }, + { + "epoch": 0.51, + "grad_norm": 0.5090893499613691, + "learning_rate": 5.050603357009304e-07, + "loss": 0.1255, + "step": 8027 + }, + { + "epoch": 0.51, + "grad_norm": 2.744190893066737, + "learning_rate": 5.049570669626917e-07, + "loss": 0.1226, + "step": 8028 + }, + { + "epoch": 0.51, + "grad_norm": 1.417775319956826, + "learning_rate": 5.048537980129747e-07, + "loss": 0.0854, + "step": 8029 + }, + { + "epoch": 0.51, + "grad_norm": 0.750635235017525, + "learning_rate": 5.047505288561847e-07, + "loss": 0.2062, + "step": 8030 + }, + { + "epoch": 0.51, + "grad_norm": 7.683348780517243, + "learning_rate": 5.046472594967278e-07, + "loss": 0.1323, + "step": 8031 + }, + { + "epoch": 0.51, + "grad_norm": 1.108180759399494, + "learning_rate": 5.045439899390094e-07, + "loss": 0.0988, + "step": 8032 + }, + { + "epoch": 0.51, + "grad_norm": 0.4179128445023675, + "learning_rate": 5.044407201874353e-07, + "loss": 0.0593, + "step": 8033 + }, + { + "epoch": 0.51, + "grad_norm": 3.550250873218855, + "learning_rate": 5.04337450246411e-07, + "loss": 0.1507, + "step": 8034 + }, + { + "epoch": 0.51, + "grad_norm": 0.48309367073510967, + "learning_rate": 5.042341801203424e-07, + "loss": 0.0078, + "step": 8035 + }, + { + "epoch": 0.51, + "grad_norm": 0.8426078244778038, + "learning_rate": 5.041309098136351e-07, + "loss": 0.1387, + "step": 8036 + }, + { + "epoch": 0.51, + "grad_norm": 0.48805343814886276, + "learning_rate": 5.04027639330695e-07, + "loss": 0.2846, + "step": 8037 + }, + { + "epoch": 0.51, + "grad_norm": 14.76702358007144, + "learning_rate": 5.039243686759277e-07, + "loss": 0.1401, + "step": 8038 + }, + { + "epoch": 0.51, + "grad_norm": 1.3855713237176372, + "learning_rate": 5.038210978537388e-07, + "loss": 0.0997, + "step": 8039 + }, + { + "epoch": 0.51, + "grad_norm": 0.4823045750901206, + "learning_rate": 5.037178268685344e-07, + "loss": 0.1423, + "step": 8040 + }, + { + "epoch": 0.51, + "grad_norm": 1.6589906452343282, + "learning_rate": 5.036145557247199e-07, + "loss": 0.0878, + "step": 8041 + }, + { + "epoch": 0.51, + "grad_norm": 2.1415394052155747, + "learning_rate": 5.035112844267014e-07, + "loss": 0.0197, + "step": 8042 + }, + { + "epoch": 0.51, + "grad_norm": 0.6135851235172674, + "learning_rate": 5.034080129788843e-07, + "loss": 0.1854, + "step": 8043 + }, + { + "epoch": 0.51, + "grad_norm": 0.6905787597383316, + "learning_rate": 5.033047413856745e-07, + "loss": 0.2306, + "step": 8044 + }, + { + "epoch": 0.51, + "grad_norm": 0.7618880486185982, + "learning_rate": 5.032014696514776e-07, + "loss": 0.2863, + "step": 8045 + }, + { + "epoch": 0.51, + "grad_norm": 1.1490263618140528, + "learning_rate": 5.030981977806998e-07, + "loss": 0.2538, + "step": 8046 + }, + { + "epoch": 0.51, + "grad_norm": 0.41906248444879696, + "learning_rate": 5.029949257777466e-07, + "loss": 0.1505, + "step": 8047 + }, + { + "epoch": 0.51, + "grad_norm": 0.6151812296385359, + "learning_rate": 5.028916536470239e-07, + "loss": 0.1525, + "step": 8048 + }, + { + "epoch": 0.51, + "grad_norm": 1.321947098669094, + "learning_rate": 5.027883813929373e-07, + "loss": 0.255, + "step": 8049 + }, + { + "epoch": 0.51, + "grad_norm": 1.0067335337780323, + "learning_rate": 5.026851090198929e-07, + "loss": 0.5148, + "step": 8050 + }, + { + "epoch": 0.51, + "grad_norm": 1.2197256165289168, + "learning_rate": 5.025818365322964e-07, + "loss": 0.2783, + "step": 8051 + }, + { + "epoch": 0.51, + "grad_norm": 0.4318952023099847, + "learning_rate": 5.024785639345534e-07, + "loss": 0.233, + "step": 8052 + }, + { + "epoch": 0.51, + "grad_norm": 0.467107979145924, + "learning_rate": 5.023752912310699e-07, + "loss": 0.2264, + "step": 8053 + }, + { + "epoch": 0.51, + "grad_norm": 0.5960508151237881, + "learning_rate": 5.022720184262517e-07, + "loss": 0.0043, + "step": 8054 + }, + { + "epoch": 0.51, + "grad_norm": 0.7491023884257654, + "learning_rate": 5.021687455245046e-07, + "loss": 0.1087, + "step": 8055 + }, + { + "epoch": 0.51, + "grad_norm": 0.8026981893353763, + "learning_rate": 5.020654725302347e-07, + "loss": 0.1684, + "step": 8056 + }, + { + "epoch": 0.51, + "grad_norm": 1.6393626698686294, + "learning_rate": 5.019621994478473e-07, + "loss": 0.2286, + "step": 8057 + }, + { + "epoch": 0.51, + "grad_norm": 0.5835826189171106, + "learning_rate": 5.018589262817488e-07, + "loss": 0.2519, + "step": 8058 + }, + { + "epoch": 0.51, + "grad_norm": 7.71906783880137, + "learning_rate": 5.017556530363445e-07, + "loss": 0.1609, + "step": 8059 + }, + { + "epoch": 0.51, + "grad_norm": 0.6192050745601048, + "learning_rate": 5.016523797160406e-07, + "loss": 0.0912, + "step": 8060 + }, + { + "epoch": 0.51, + "grad_norm": 1.5135148904147495, + "learning_rate": 5.015491063252429e-07, + "loss": 0.1984, + "step": 8061 + }, + { + "epoch": 0.51, + "grad_norm": 0.7676244659370985, + "learning_rate": 5.014458328683572e-07, + "loss": 0.0042, + "step": 8062 + }, + { + "epoch": 0.51, + "grad_norm": 0.676370431133513, + "learning_rate": 5.013425593497892e-07, + "loss": 0.3381, + "step": 8063 + }, + { + "epoch": 0.51, + "grad_norm": 0.5266436808843096, + "learning_rate": 5.012392857739452e-07, + "loss": 0.1857, + "step": 8064 + }, + { + "epoch": 0.51, + "grad_norm": 0.9299334948981965, + "learning_rate": 5.011360121452306e-07, + "loss": 0.5212, + "step": 8065 + }, + { + "epoch": 0.51, + "grad_norm": 1.4985207563224874, + "learning_rate": 5.010327384680515e-07, + "loss": 0.0652, + "step": 8066 + }, + { + "epoch": 0.51, + "grad_norm": 0.2844133747117827, + "learning_rate": 5.009294647468136e-07, + "loss": 0.109, + "step": 8067 + }, + { + "epoch": 0.51, + "grad_norm": 2.378840373785456, + "learning_rate": 5.00826190985923e-07, + "loss": 0.091, + "step": 8068 + }, + { + "epoch": 0.51, + "grad_norm": 1.2423092356654564, + "learning_rate": 5.007229171897854e-07, + "loss": 0.262, + "step": 8069 + }, + { + "epoch": 0.51, + "grad_norm": 0.6809100805891322, + "learning_rate": 5.006196433628068e-07, + "loss": 0.133, + "step": 8070 + }, + { + "epoch": 0.51, + "grad_norm": 1.807786554075676, + "learning_rate": 5.005163695093927e-07, + "loss": 0.2683, + "step": 8071 + }, + { + "epoch": 0.51, + "grad_norm": 0.6403892779241127, + "learning_rate": 5.004130956339494e-07, + "loss": 0.0859, + "step": 8072 + }, + { + "epoch": 0.51, + "grad_norm": 0.7590827412452761, + "learning_rate": 5.003098217408826e-07, + "loss": 0.0805, + "step": 8073 + }, + { + "epoch": 0.51, + "grad_norm": 6.672439477342041, + "learning_rate": 5.002065478345982e-07, + "loss": 0.2191, + "step": 8074 + }, + { + "epoch": 0.51, + "grad_norm": 0.9034156078321618, + "learning_rate": 5.00103273919502e-07, + "loss": 0.0136, + "step": 8075 + }, + { + "epoch": 0.52, + "grad_norm": 2.9520891091385693, + "learning_rate": 5e-07, + "loss": 0.1591, + "step": 8076 + }, + { + "epoch": 0.52, + "grad_norm": 1.0735962217333608, + "learning_rate": 4.998967260804982e-07, + "loss": 0.363, + "step": 8077 + }, + { + "epoch": 0.52, + "grad_norm": 1.0739018029784582, + "learning_rate": 4.997934521654018e-07, + "loss": 0.2883, + "step": 8078 + }, + { + "epoch": 0.52, + "grad_norm": 0.7644867479130036, + "learning_rate": 4.996901782591174e-07, + "loss": 0.3885, + "step": 8079 + }, + { + "epoch": 0.52, + "grad_norm": 0.7211748433681862, + "learning_rate": 4.995869043660506e-07, + "loss": 0.079, + "step": 8080 + }, + { + "epoch": 0.52, + "grad_norm": 0.3702103745019236, + "learning_rate": 4.994836304906073e-07, + "loss": 0.0758, + "step": 8081 + }, + { + "epoch": 0.52, + "grad_norm": 2.0644432870665215, + "learning_rate": 4.993803566371933e-07, + "loss": 0.0639, + "step": 8082 + }, + { + "epoch": 0.52, + "grad_norm": 0.8060213093348012, + "learning_rate": 4.992770828102147e-07, + "loss": 0.3468, + "step": 8083 + }, + { + "epoch": 0.52, + "grad_norm": 0.5310872523741449, + "learning_rate": 4.991738090140769e-07, + "loss": 0.3825, + "step": 8084 + }, + { + "epoch": 0.52, + "grad_norm": 0.6983589634138904, + "learning_rate": 4.990705352531864e-07, + "loss": 0.1557, + "step": 8085 + }, + { + "epoch": 0.52, + "grad_norm": 0.5453860264192577, + "learning_rate": 4.989672615319485e-07, + "loss": 0.2753, + "step": 8086 + }, + { + "epoch": 0.52, + "grad_norm": 0.588703415308047, + "learning_rate": 4.988639878547694e-07, + "loss": 0.1344, + "step": 8087 + }, + { + "epoch": 0.52, + "grad_norm": 0.8166596433992229, + "learning_rate": 4.987607142260548e-07, + "loss": 0.2103, + "step": 8088 + }, + { + "epoch": 0.52, + "grad_norm": 1.1569791097214266, + "learning_rate": 4.986574406502107e-07, + "loss": 0.1128, + "step": 8089 + }, + { + "epoch": 0.52, + "grad_norm": 4.8820478279134125, + "learning_rate": 4.98554167131643e-07, + "loss": 0.2281, + "step": 8090 + }, + { + "epoch": 0.52, + "grad_norm": 4.1672236181499045, + "learning_rate": 4.984508936747572e-07, + "loss": 0.1638, + "step": 8091 + }, + { + "epoch": 0.52, + "grad_norm": 1.7543266365961565, + "learning_rate": 4.983476202839594e-07, + "loss": 0.1106, + "step": 8092 + }, + { + "epoch": 0.52, + "grad_norm": 1.8841857722847184, + "learning_rate": 4.982443469636555e-07, + "loss": 0.1923, + "step": 8093 + }, + { + "epoch": 0.52, + "grad_norm": 3.9459934202711238, + "learning_rate": 4.981410737182514e-07, + "loss": 0.0412, + "step": 8094 + }, + { + "epoch": 0.52, + "grad_norm": 1.3946510977912714, + "learning_rate": 4.980378005521527e-07, + "loss": 0.1793, + "step": 8095 + }, + { + "epoch": 0.52, + "grad_norm": 1.1675751686893905, + "learning_rate": 4.979345274697654e-07, + "loss": 0.0289, + "step": 8096 + }, + { + "epoch": 0.52, + "grad_norm": 0.47801455840366996, + "learning_rate": 4.978312544754953e-07, + "loss": 0.3539, + "step": 8097 + }, + { + "epoch": 0.52, + "grad_norm": 0.5212935629372658, + "learning_rate": 4.977279815737482e-07, + "loss": 0.0023, + "step": 8098 + }, + { + "epoch": 0.52, + "grad_norm": 0.8551706395574106, + "learning_rate": 4.976247087689301e-07, + "loss": 0.0983, + "step": 8099 + }, + { + "epoch": 0.52, + "grad_norm": 2.0236467719199167, + "learning_rate": 4.975214360654467e-07, + "loss": 0.2156, + "step": 8100 + }, + { + "epoch": 0.52, + "grad_norm": 0.770254602796096, + "learning_rate": 4.974181634677036e-07, + "loss": 0.1489, + "step": 8101 + }, + { + "epoch": 0.52, + "grad_norm": 0.3393677786302405, + "learning_rate": 4.973148909801071e-07, + "loss": 0.1005, + "step": 8102 + }, + { + "epoch": 0.52, + "grad_norm": 1.2674565230397858, + "learning_rate": 4.972116186070625e-07, + "loss": 0.2204, + "step": 8103 + }, + { + "epoch": 0.52, + "grad_norm": 0.5216410523777196, + "learning_rate": 4.971083463529762e-07, + "loss": 0.2565, + "step": 8104 + }, + { + "epoch": 0.52, + "grad_norm": 1.5289063925044168, + "learning_rate": 4.970050742222536e-07, + "loss": 0.2912, + "step": 8105 + }, + { + "epoch": 0.52, + "grad_norm": 0.3860747416830018, + "learning_rate": 4.969018022193003e-07, + "loss": 0.1317, + "step": 8106 + }, + { + "epoch": 0.52, + "grad_norm": 3.4942623652535483, + "learning_rate": 4.967985303485224e-07, + "loss": 0.1161, + "step": 8107 + }, + { + "epoch": 0.52, + "grad_norm": 0.8004291099596728, + "learning_rate": 4.966952586143256e-07, + "loss": 0.2606, + "step": 8108 + }, + { + "epoch": 0.52, + "grad_norm": 1.7968568147246406, + "learning_rate": 4.965919870211159e-07, + "loss": 0.2674, + "step": 8109 + }, + { + "epoch": 0.52, + "grad_norm": 0.6414534403017824, + "learning_rate": 4.964887155732987e-07, + "loss": 0.1584, + "step": 8110 + }, + { + "epoch": 0.52, + "grad_norm": 2.9856321369312853, + "learning_rate": 4.9638544427528e-07, + "loss": 0.2173, + "step": 8111 + }, + { + "epoch": 0.52, + "grad_norm": 0.5929900285271503, + "learning_rate": 4.962821731314655e-07, + "loss": 0.3186, + "step": 8112 + }, + { + "epoch": 0.52, + "grad_norm": 1.1246579842009279, + "learning_rate": 4.961789021462611e-07, + "loss": 0.1268, + "step": 8113 + }, + { + "epoch": 0.52, + "grad_norm": 0.6573588297530798, + "learning_rate": 4.960756313240723e-07, + "loss": 0.1884, + "step": 8114 + }, + { + "epoch": 0.52, + "grad_norm": 1.0487922556717102, + "learning_rate": 4.959723606693051e-07, + "loss": 0.1636, + "step": 8115 + }, + { + "epoch": 0.52, + "grad_norm": 1.0779077508305832, + "learning_rate": 4.958690901863648e-07, + "loss": 0.1861, + "step": 8116 + }, + { + "epoch": 0.52, + "grad_norm": 0.667903773938721, + "learning_rate": 4.957658198796577e-07, + "loss": 0.2523, + "step": 8117 + }, + { + "epoch": 0.52, + "grad_norm": 0.47649954407524425, + "learning_rate": 4.956625497535892e-07, + "loss": 0.3787, + "step": 8118 + }, + { + "epoch": 0.52, + "grad_norm": 0.8589198922433372, + "learning_rate": 4.955592798125648e-07, + "loss": 0.4041, + "step": 8119 + }, + { + "epoch": 0.52, + "grad_norm": 0.6860354350736951, + "learning_rate": 4.954560100609908e-07, + "loss": 0.2068, + "step": 8120 + }, + { + "epoch": 0.52, + "grad_norm": 11.332317558630018, + "learning_rate": 4.953527405032723e-07, + "loss": 0.3692, + "step": 8121 + }, + { + "epoch": 0.52, + "grad_norm": 1.933375547824541, + "learning_rate": 4.952494711438154e-07, + "loss": 0.2179, + "step": 8122 + }, + { + "epoch": 0.52, + "grad_norm": 1.0123585012208458, + "learning_rate": 4.951462019870254e-07, + "loss": 0.1651, + "step": 8123 + }, + { + "epoch": 0.52, + "grad_norm": 1.6589280995115026, + "learning_rate": 4.950429330373082e-07, + "loss": 0.2217, + "step": 8124 + }, + { + "epoch": 0.52, + "grad_norm": 0.8875536056595668, + "learning_rate": 4.949396642990697e-07, + "loss": 0.1683, + "step": 8125 + }, + { + "epoch": 0.52, + "grad_norm": 1.0292126746650092, + "learning_rate": 4.94836395776715e-07, + "loss": 0.3058, + "step": 8126 + }, + { + "epoch": 0.52, + "grad_norm": 0.6532360720076447, + "learning_rate": 4.947331274746502e-07, + "loss": 0.0899, + "step": 8127 + }, + { + "epoch": 0.52, + "grad_norm": 0.44401196696336753, + "learning_rate": 4.946298593972808e-07, + "loss": 0.0918, + "step": 8128 + }, + { + "epoch": 0.52, + "grad_norm": 0.8374962262196353, + "learning_rate": 4.945265915490121e-07, + "loss": 0.3585, + "step": 8129 + }, + { + "epoch": 0.52, + "grad_norm": 11.34241429195748, + "learning_rate": 4.944233239342504e-07, + "loss": 0.0184, + "step": 8130 + }, + { + "epoch": 0.52, + "grad_norm": 0.6473484301622712, + "learning_rate": 4.943200565574007e-07, + "loss": 0.1428, + "step": 8131 + }, + { + "epoch": 0.52, + "grad_norm": 12.825841666555736, + "learning_rate": 4.942167894228689e-07, + "loss": 0.1661, + "step": 8132 + }, + { + "epoch": 0.52, + "grad_norm": 0.4350198260722425, + "learning_rate": 4.941135225350605e-07, + "loss": 0.304, + "step": 8133 + }, + { + "epoch": 0.52, + "grad_norm": 0.22369393361009066, + "learning_rate": 4.94010255898381e-07, + "loss": 0.0747, + "step": 8134 + }, + { + "epoch": 0.52, + "grad_norm": 0.16485481347532424, + "learning_rate": 4.93906989517236e-07, + "loss": 0.089, + "step": 8135 + }, + { + "epoch": 0.52, + "grad_norm": 1.6101082522220975, + "learning_rate": 4.938037233960311e-07, + "loss": 0.1237, + "step": 8136 + }, + { + "epoch": 0.52, + "grad_norm": 0.3702393562442652, + "learning_rate": 4.937004575391719e-07, + "loss": 0.1818, + "step": 8137 + }, + { + "epoch": 0.52, + "grad_norm": 7.872984812179066, + "learning_rate": 4.935971919510636e-07, + "loss": 0.0108, + "step": 8138 + }, + { + "epoch": 0.52, + "grad_norm": 1.944398182559204, + "learning_rate": 4.934939266361123e-07, + "loss": 0.0112, + "step": 8139 + }, + { + "epoch": 0.52, + "grad_norm": 1.0587341436362527, + "learning_rate": 4.933906615987229e-07, + "loss": 0.2331, + "step": 8140 + }, + { + "epoch": 0.52, + "grad_norm": 1.0097210622684705, + "learning_rate": 4.932873968433014e-07, + "loss": 0.0713, + "step": 8141 + }, + { + "epoch": 0.52, + "grad_norm": 2.9011634437214178, + "learning_rate": 4.931841323742528e-07, + "loss": 0.0172, + "step": 8142 + }, + { + "epoch": 0.52, + "grad_norm": 1.0085254744974845, + "learning_rate": 4.930808681959829e-07, + "loss": 0.209, + "step": 8143 + }, + { + "epoch": 0.52, + "grad_norm": 0.6101017080712451, + "learning_rate": 4.92977604312897e-07, + "loss": 0.0775, + "step": 8144 + }, + { + "epoch": 0.52, + "grad_norm": 1.0447089952474162, + "learning_rate": 4.928743407294008e-07, + "loss": 0.1055, + "step": 8145 + }, + { + "epoch": 0.52, + "grad_norm": 1.546086501221723, + "learning_rate": 4.927710774498995e-07, + "loss": 0.2003, + "step": 8146 + }, + { + "epoch": 0.52, + "grad_norm": 1.4433914650160102, + "learning_rate": 4.926678144787985e-07, + "loss": 0.1689, + "step": 8147 + }, + { + "epoch": 0.52, + "grad_norm": 0.7219197014496946, + "learning_rate": 4.925645518205033e-07, + "loss": 0.2977, + "step": 8148 + }, + { + "epoch": 0.52, + "grad_norm": 0.9241352289020095, + "learning_rate": 4.924612894794192e-07, + "loss": 0.2784, + "step": 8149 + }, + { + "epoch": 0.52, + "grad_norm": 0.6593509286864717, + "learning_rate": 4.923580274599518e-07, + "loss": 0.2903, + "step": 8150 + }, + { + "epoch": 0.52, + "grad_norm": 0.9204912285373574, + "learning_rate": 4.922547657665061e-07, + "loss": 0.1956, + "step": 8151 + }, + { + "epoch": 0.52, + "grad_norm": 1.264522924498254, + "learning_rate": 4.921515044034878e-07, + "loss": 0.3374, + "step": 8152 + }, + { + "epoch": 0.52, + "grad_norm": 0.5032700619783951, + "learning_rate": 4.920482433753021e-07, + "loss": 0.2798, + "step": 8153 + }, + { + "epoch": 0.52, + "grad_norm": 0.956876684470016, + "learning_rate": 4.919449826863544e-07, + "loss": 0.2016, + "step": 8154 + }, + { + "epoch": 0.52, + "grad_norm": 3.7909706573922373, + "learning_rate": 4.918417223410497e-07, + "loss": 0.213, + "step": 8155 + }, + { + "epoch": 0.52, + "grad_norm": 1.7344354376992372, + "learning_rate": 4.917384623437937e-07, + "loss": 0.2235, + "step": 8156 + }, + { + "epoch": 0.52, + "grad_norm": 1.4713305981333957, + "learning_rate": 4.916352026989914e-07, + "loss": 0.0422, + "step": 8157 + }, + { + "epoch": 0.52, + "grad_norm": 0.5609733558667588, + "learning_rate": 4.915319434110484e-07, + "loss": 0.1526, + "step": 8158 + }, + { + "epoch": 0.52, + "grad_norm": 0.303771899028892, + "learning_rate": 4.914286844843695e-07, + "loss": 0.158, + "step": 8159 + }, + { + "epoch": 0.52, + "grad_norm": 1.3765189718166573, + "learning_rate": 4.913254259233602e-07, + "loss": 0.3024, + "step": 8160 + }, + { + "epoch": 0.52, + "grad_norm": 2.322163771166202, + "learning_rate": 4.912221677324257e-07, + "loss": 0.1188, + "step": 8161 + }, + { + "epoch": 0.52, + "grad_norm": 0.397630194609182, + "learning_rate": 4.911189099159711e-07, + "loss": 0.0038, + "step": 8162 + }, + { + "epoch": 0.52, + "grad_norm": 0.7918383108872507, + "learning_rate": 4.910156524784017e-07, + "loss": 0.1372, + "step": 8163 + }, + { + "epoch": 0.52, + "grad_norm": 1.5311296124532012, + "learning_rate": 4.909123954241225e-07, + "loss": 0.1052, + "step": 8164 + }, + { + "epoch": 0.52, + "grad_norm": 0.707926056032602, + "learning_rate": 4.908091387575391e-07, + "loss": 0.175, + "step": 8165 + }, + { + "epoch": 0.52, + "grad_norm": 1.5344325190448418, + "learning_rate": 4.90705882483056e-07, + "loss": 0.1296, + "step": 8166 + }, + { + "epoch": 0.52, + "grad_norm": 1.700488779754395, + "learning_rate": 4.906026266050788e-07, + "loss": 0.4026, + "step": 8167 + }, + { + "epoch": 0.52, + "grad_norm": 0.32097578211959354, + "learning_rate": 4.904993711280124e-07, + "loss": 0.1006, + "step": 8168 + }, + { + "epoch": 0.52, + "grad_norm": 0.8326151176037822, + "learning_rate": 4.903961160562618e-07, + "loss": 0.0026, + "step": 8169 + }, + { + "epoch": 0.52, + "grad_norm": 0.3794740402403776, + "learning_rate": 4.902928613942325e-07, + "loss": 0.1105, + "step": 8170 + }, + { + "epoch": 0.52, + "grad_norm": 0.33940217049646326, + "learning_rate": 4.901896071463289e-07, + "loss": 0.2032, + "step": 8171 + }, + { + "epoch": 0.52, + "grad_norm": 1.3758703619299881, + "learning_rate": 4.900863533169566e-07, + "loss": 0.3885, + "step": 8172 + }, + { + "epoch": 0.52, + "grad_norm": 0.611254928262428, + "learning_rate": 4.899830999105204e-07, + "loss": 0.1637, + "step": 8173 + }, + { + "epoch": 0.52, + "grad_norm": 9.332136382986466, + "learning_rate": 4.898798469314253e-07, + "loss": 0.1619, + "step": 8174 + }, + { + "epoch": 0.52, + "grad_norm": 1.6545439993081186, + "learning_rate": 4.89776594384076e-07, + "loss": 0.1125, + "step": 8175 + }, + { + "epoch": 0.52, + "grad_norm": 0.6373618442771709, + "learning_rate": 4.896733422728781e-07, + "loss": 0.2989, + "step": 8176 + }, + { + "epoch": 0.52, + "grad_norm": 1.0231394374024527, + "learning_rate": 4.895700906022359e-07, + "loss": 0.315, + "step": 8177 + }, + { + "epoch": 0.52, + "grad_norm": 0.6125522670776106, + "learning_rate": 4.894668393765548e-07, + "loss": 0.2293, + "step": 8178 + }, + { + "epoch": 0.52, + "grad_norm": 0.6645183435609646, + "learning_rate": 4.893635886002394e-07, + "loss": 0.2517, + "step": 8179 + }, + { + "epoch": 0.52, + "grad_norm": 0.8889959015194648, + "learning_rate": 4.892603382776947e-07, + "loss": 0.165, + "step": 8180 + }, + { + "epoch": 0.52, + "grad_norm": 0.960100897863591, + "learning_rate": 4.891570884133255e-07, + "loss": 0.1212, + "step": 8181 + }, + { + "epoch": 0.52, + "grad_norm": 0.4169013265792246, + "learning_rate": 4.89053839011537e-07, + "loss": 0.1354, + "step": 8182 + }, + { + "epoch": 0.52, + "grad_norm": 15.974883256278313, + "learning_rate": 4.889505900767334e-07, + "loss": 0.4403, + "step": 8183 + }, + { + "epoch": 0.52, + "grad_norm": 0.4604167259060572, + "learning_rate": 4.8884734161332e-07, + "loss": 0.2273, + "step": 8184 + }, + { + "epoch": 0.52, + "grad_norm": 9.206047113363974, + "learning_rate": 4.887440936257013e-07, + "loss": 0.2439, + "step": 8185 + }, + { + "epoch": 0.52, + "grad_norm": 0.4879592225809976, + "learning_rate": 4.886408461182824e-07, + "loss": 0.1288, + "step": 8186 + }, + { + "epoch": 0.52, + "grad_norm": 0.8666582105075445, + "learning_rate": 4.885375990954677e-07, + "loss": 0.0935, + "step": 8187 + }, + { + "epoch": 0.52, + "grad_norm": 0.5234728481060329, + "learning_rate": 4.88434352561662e-07, + "loss": 0.1426, + "step": 8188 + }, + { + "epoch": 0.52, + "grad_norm": 0.5205276731923292, + "learning_rate": 4.883311065212703e-07, + "loss": 0.1332, + "step": 8189 + }, + { + "epoch": 0.52, + "grad_norm": 0.6284872415583754, + "learning_rate": 4.882278609786968e-07, + "loss": 0.1104, + "step": 8190 + }, + { + "epoch": 0.52, + "grad_norm": 0.5106344655459321, + "learning_rate": 4.881246159383466e-07, + "loss": 0.0892, + "step": 8191 + }, + { + "epoch": 0.52, + "grad_norm": 1.7396302828353256, + "learning_rate": 4.88021371404624e-07, + "loss": 0.2086, + "step": 8192 + }, + { + "epoch": 0.52, + "grad_norm": 1.6956739818468076, + "learning_rate": 4.87918127381934e-07, + "loss": 0.3581, + "step": 8193 + }, + { + "epoch": 0.52, + "grad_norm": 1.0877921362422047, + "learning_rate": 4.878148838746806e-07, + "loss": 0.0218, + "step": 8194 + }, + { + "epoch": 0.52, + "grad_norm": 7.673432447629786, + "learning_rate": 4.87711640887269e-07, + "loss": 0.1537, + "step": 8195 + }, + { + "epoch": 0.52, + "grad_norm": 0.948180460074467, + "learning_rate": 4.876083984241035e-07, + "loss": 0.0797, + "step": 8196 + }, + { + "epoch": 0.52, + "grad_norm": 0.9047276678021269, + "learning_rate": 4.875051564895886e-07, + "loss": 0.2154, + "step": 8197 + }, + { + "epoch": 0.52, + "grad_norm": 0.46054497364789626, + "learning_rate": 4.874019150881287e-07, + "loss": 0.0631, + "step": 8198 + }, + { + "epoch": 0.52, + "grad_norm": 0.9628242709982492, + "learning_rate": 4.872986742241288e-07, + "loss": 0.4154, + "step": 8199 + }, + { + "epoch": 0.52, + "grad_norm": 0.8853121666096614, + "learning_rate": 4.871954339019928e-07, + "loss": 0.0893, + "step": 8200 + }, + { + "epoch": 0.52, + "grad_norm": 0.8700554086125147, + "learning_rate": 4.870921941261252e-07, + "loss": 0.007, + "step": 8201 + }, + { + "epoch": 0.52, + "grad_norm": 0.710502323168251, + "learning_rate": 4.869889549009308e-07, + "loss": 0.3938, + "step": 8202 + }, + { + "epoch": 0.52, + "grad_norm": 1.3829422967772256, + "learning_rate": 4.868857162308135e-07, + "loss": 0.1249, + "step": 8203 + }, + { + "epoch": 0.52, + "grad_norm": 5.791836083047208, + "learning_rate": 4.86782478120178e-07, + "loss": 0.293, + "step": 8204 + }, + { + "epoch": 0.52, + "grad_norm": 1.055559361123662, + "learning_rate": 4.866792405734286e-07, + "loss": 0.1588, + "step": 8205 + }, + { + "epoch": 0.52, + "grad_norm": 0.12669878074895569, + "learning_rate": 4.865760035949695e-07, + "loss": 0.0149, + "step": 8206 + }, + { + "epoch": 0.52, + "grad_norm": 1.0831513869789322, + "learning_rate": 4.864727671892049e-07, + "loss": 0.1228, + "step": 8207 + }, + { + "epoch": 0.52, + "grad_norm": 2.674063684024899, + "learning_rate": 4.863695313605397e-07, + "loss": 0.1532, + "step": 8208 + }, + { + "epoch": 0.52, + "grad_norm": 6.784761189667426, + "learning_rate": 4.862662961133773e-07, + "loss": 0.2814, + "step": 8209 + }, + { + "epoch": 0.52, + "grad_norm": 1.3034063368760405, + "learning_rate": 4.861630614521225e-07, + "loss": 0.272, + "step": 8210 + }, + { + "epoch": 0.52, + "grad_norm": 1.6401069113083235, + "learning_rate": 4.860598273811792e-07, + "loss": 0.1154, + "step": 8211 + }, + { + "epoch": 0.52, + "grad_norm": 0.6507259723895588, + "learning_rate": 4.859565939049519e-07, + "loss": 0.3146, + "step": 8212 + }, + { + "epoch": 0.52, + "grad_norm": 0.7603477601240486, + "learning_rate": 4.858533610278443e-07, + "loss": 0.2414, + "step": 8213 + }, + { + "epoch": 0.52, + "grad_norm": 0.6837864369543493, + "learning_rate": 4.857501287542608e-07, + "loss": 0.1865, + "step": 8214 + }, + { + "epoch": 0.52, + "grad_norm": 0.6469380514526999, + "learning_rate": 4.856468970886056e-07, + "loss": 0.0477, + "step": 8215 + }, + { + "epoch": 0.52, + "grad_norm": 0.6632697507099637, + "learning_rate": 4.855436660352824e-07, + "loss": 0.1616, + "step": 8216 + }, + { + "epoch": 0.52, + "grad_norm": 1.1212084672513034, + "learning_rate": 4.854404355986958e-07, + "loss": 0.0772, + "step": 8217 + }, + { + "epoch": 0.52, + "grad_norm": 1.8867303347652737, + "learning_rate": 4.853372057832491e-07, + "loss": 0.1253, + "step": 8218 + }, + { + "epoch": 0.52, + "grad_norm": 0.42147078461829746, + "learning_rate": 4.852339765933469e-07, + "loss": 0.2253, + "step": 8219 + }, + { + "epoch": 0.52, + "grad_norm": 1.0304215517377768, + "learning_rate": 4.851307480333929e-07, + "loss": 0.2961, + "step": 8220 + }, + { + "epoch": 0.52, + "grad_norm": 1.0333451791890018, + "learning_rate": 4.850275201077911e-07, + "loss": 0.232, + "step": 8221 + }, + { + "epoch": 0.52, + "grad_norm": 0.4349674322535713, + "learning_rate": 4.849242928209453e-07, + "loss": 0.2093, + "step": 8222 + }, + { + "epoch": 0.52, + "grad_norm": 2.2486247488705318, + "learning_rate": 4.848210661772595e-07, + "loss": 0.2181, + "step": 8223 + }, + { + "epoch": 0.52, + "grad_norm": 2.5097889743442834, + "learning_rate": 4.847178401811375e-07, + "loss": 0.357, + "step": 8224 + }, + { + "epoch": 0.52, + "grad_norm": 1.3235845555502364, + "learning_rate": 4.846146148369834e-07, + "loss": 0.075, + "step": 8225 + }, + { + "epoch": 0.52, + "grad_norm": 0.9198231270911172, + "learning_rate": 4.845113901492004e-07, + "loss": 0.1831, + "step": 8226 + }, + { + "epoch": 0.52, + "grad_norm": 1.8586642988102762, + "learning_rate": 4.844081661221929e-07, + "loss": 0.0167, + "step": 8227 + }, + { + "epoch": 0.52, + "grad_norm": 0.6726872701153576, + "learning_rate": 4.843049427603644e-07, + "loss": 0.211, + "step": 8228 + }, + { + "epoch": 0.52, + "grad_norm": 0.8395887871025307, + "learning_rate": 4.842017200681184e-07, + "loss": 0.0067, + "step": 8229 + }, + { + "epoch": 0.52, + "grad_norm": 0.2945067975585641, + "learning_rate": 4.840984980498589e-07, + "loss": 0.0833, + "step": 8230 + }, + { + "epoch": 0.52, + "grad_norm": 0.9712970287543637, + "learning_rate": 4.839952767099894e-07, + "loss": 0.1664, + "step": 8231 + }, + { + "epoch": 0.52, + "grad_norm": 1.2145687671104668, + "learning_rate": 4.838920560529136e-07, + "loss": 0.3168, + "step": 8232 + }, + { + "epoch": 0.53, + "grad_norm": 0.9919768412091605, + "learning_rate": 4.837888360830349e-07, + "loss": 0.1024, + "step": 8233 + }, + { + "epoch": 0.53, + "grad_norm": 1.1152289836489446, + "learning_rate": 4.836856168047574e-07, + "loss": 0.2503, + "step": 8234 + }, + { + "epoch": 0.53, + "grad_norm": 0.43487629793650373, + "learning_rate": 4.835823982224839e-07, + "loss": 0.1746, + "step": 8235 + }, + { + "epoch": 0.53, + "grad_norm": 0.7776933543405603, + "learning_rate": 4.834791803406186e-07, + "loss": 0.4996, + "step": 8236 + }, + { + "epoch": 0.53, + "grad_norm": 0.7138515491082422, + "learning_rate": 4.833759631635644e-07, + "loss": 0.1812, + "step": 8237 + }, + { + "epoch": 0.53, + "grad_norm": 0.7899423270112802, + "learning_rate": 4.832727466957254e-07, + "loss": 0.2266, + "step": 8238 + }, + { + "epoch": 0.53, + "grad_norm": 0.9439042794776742, + "learning_rate": 4.831695309415042e-07, + "loss": 0.1149, + "step": 8239 + }, + { + "epoch": 0.53, + "grad_norm": 0.5698927914817434, + "learning_rate": 4.83066315905305e-07, + "loss": 0.354, + "step": 8240 + }, + { + "epoch": 0.53, + "grad_norm": 1.0048873877797317, + "learning_rate": 4.829631015915306e-07, + "loss": 0.035, + "step": 8241 + }, + { + "epoch": 0.53, + "grad_norm": 1.0012648658704233, + "learning_rate": 4.828598880045846e-07, + "loss": 0.134, + "step": 8242 + }, + { + "epoch": 0.53, + "grad_norm": 4.793594013577755, + "learning_rate": 4.827566751488704e-07, + "loss": 0.1977, + "step": 8243 + }, + { + "epoch": 0.53, + "grad_norm": 1.171597253648042, + "learning_rate": 4.826534630287908e-07, + "loss": 0.1717, + "step": 8244 + }, + { + "epoch": 0.53, + "grad_norm": 1.2448125237617151, + "learning_rate": 4.825502516487496e-07, + "loss": 0.2479, + "step": 8245 + }, + { + "epoch": 0.53, + "grad_norm": 0.377931561316245, + "learning_rate": 4.824470410131495e-07, + "loss": 0.1566, + "step": 8246 + }, + { + "epoch": 0.53, + "grad_norm": 0.982739061442296, + "learning_rate": 4.823438311263942e-07, + "loss": 0.2143, + "step": 8247 + }, + { + "epoch": 0.53, + "grad_norm": 0.45563477761264554, + "learning_rate": 4.822406219928863e-07, + "loss": 0.2273, + "step": 8248 + }, + { + "epoch": 0.53, + "grad_norm": 3.088298522228207, + "learning_rate": 4.821374136170293e-07, + "loss": 0.1257, + "step": 8249 + }, + { + "epoch": 0.53, + "grad_norm": 0.375275171763511, + "learning_rate": 4.820342060032261e-07, + "loss": 0.0603, + "step": 8250 + }, + { + "epoch": 0.53, + "grad_norm": 1.949710767535515, + "learning_rate": 4.819309991558798e-07, + "loss": 0.359, + "step": 8251 + }, + { + "epoch": 0.53, + "grad_norm": 6.590046422935963, + "learning_rate": 4.818277930793933e-07, + "loss": 0.4137, + "step": 8252 + }, + { + "epoch": 0.53, + "grad_norm": 0.485660221582689, + "learning_rate": 4.817245877781698e-07, + "loss": 0.1023, + "step": 8253 + }, + { + "epoch": 0.53, + "grad_norm": 1.2500216822562424, + "learning_rate": 4.816213832566121e-07, + "loss": 0.0481, + "step": 8254 + }, + { + "epoch": 0.53, + "grad_norm": 1.501707110922901, + "learning_rate": 4.815181795191233e-07, + "loss": 0.1328, + "step": 8255 + }, + { + "epoch": 0.53, + "grad_norm": 0.6027320587153924, + "learning_rate": 4.814149765701059e-07, + "loss": 0.1465, + "step": 8256 + }, + { + "epoch": 0.53, + "grad_norm": 3.0408515438402133, + "learning_rate": 4.81311774413963e-07, + "loss": 0.1136, + "step": 8257 + }, + { + "epoch": 0.53, + "grad_norm": 1.1432097162698354, + "learning_rate": 4.812085730550973e-07, + "loss": 0.1676, + "step": 8258 + }, + { + "epoch": 0.53, + "grad_norm": 1.0042730486274967, + "learning_rate": 4.811053724979116e-07, + "loss": 0.1835, + "step": 8259 + }, + { + "epoch": 0.53, + "grad_norm": 0.7901281894729675, + "learning_rate": 4.81002172746809e-07, + "loss": 0.1528, + "step": 8260 + }, + { + "epoch": 0.53, + "grad_norm": 0.40423764933552675, + "learning_rate": 4.808989738061916e-07, + "loss": 0.0702, + "step": 8261 + }, + { + "epoch": 0.53, + "grad_norm": 0.34243011998723405, + "learning_rate": 4.807957756804627e-07, + "loss": 0.053, + "step": 8262 + }, + { + "epoch": 0.53, + "grad_norm": 0.4038619697821567, + "learning_rate": 4.806925783740242e-07, + "loss": 0.1311, + "step": 8263 + }, + { + "epoch": 0.53, + "grad_norm": 1.6184916906991575, + "learning_rate": 4.805893818912794e-07, + "loss": 0.1294, + "step": 8264 + }, + { + "epoch": 0.53, + "grad_norm": 12.62261932255158, + "learning_rate": 4.804861862366305e-07, + "loss": 0.1913, + "step": 8265 + }, + { + "epoch": 0.53, + "grad_norm": 0.6886609271115808, + "learning_rate": 4.803829914144802e-07, + "loss": 0.4007, + "step": 8266 + }, + { + "epoch": 0.53, + "grad_norm": 1.7613069744678602, + "learning_rate": 4.802797974292308e-07, + "loss": 0.1288, + "step": 8267 + }, + { + "epoch": 0.53, + "grad_norm": 1.3144447465818885, + "learning_rate": 4.80176604285285e-07, + "loss": 0.2315, + "step": 8268 + }, + { + "epoch": 0.53, + "grad_norm": 0.47126022938693496, + "learning_rate": 4.800734119870451e-07, + "loss": 0.1008, + "step": 8269 + }, + { + "epoch": 0.53, + "grad_norm": 0.4993473224307254, + "learning_rate": 4.799702205389134e-07, + "loss": 0.0805, + "step": 8270 + }, + { + "epoch": 0.53, + "grad_norm": 0.826096688904793, + "learning_rate": 4.798670299452926e-07, + "loss": 0.2461, + "step": 8271 + }, + { + "epoch": 0.53, + "grad_norm": 0.1566726732922138, + "learning_rate": 4.797638402105844e-07, + "loss": 0.0678, + "step": 8272 + }, + { + "epoch": 0.53, + "grad_norm": 17.1333157525917, + "learning_rate": 4.796606513391917e-07, + "loss": 0.1987, + "step": 8273 + }, + { + "epoch": 0.53, + "grad_norm": 1.9522791582660433, + "learning_rate": 4.795574633355164e-07, + "loss": 0.0395, + "step": 8274 + }, + { + "epoch": 0.53, + "grad_norm": 0.8561925291307796, + "learning_rate": 4.794542762039608e-07, + "loss": 0.2843, + "step": 8275 + }, + { + "epoch": 0.53, + "grad_norm": 11.071059948220585, + "learning_rate": 4.793510899489272e-07, + "loss": 0.0931, + "step": 8276 + }, + { + "epoch": 0.53, + "grad_norm": 4.473368344647713, + "learning_rate": 4.792479045748177e-07, + "loss": 0.2107, + "step": 8277 + }, + { + "epoch": 0.53, + "grad_norm": 1.9678500358403068, + "learning_rate": 4.79144720086034e-07, + "loss": 0.2189, + "step": 8278 + }, + { + "epoch": 0.53, + "grad_norm": 1.70270008822198, + "learning_rate": 4.790415364869787e-07, + "loss": 0.2395, + "step": 8279 + }, + { + "epoch": 0.53, + "grad_norm": 0.7649746568862957, + "learning_rate": 4.789383537820534e-07, + "loss": 0.1648, + "step": 8280 + }, + { + "epoch": 0.53, + "grad_norm": 0.9893349122689812, + "learning_rate": 4.788351719756605e-07, + "loss": 0.2831, + "step": 8281 + }, + { + "epoch": 0.53, + "grad_norm": 0.6704188958555899, + "learning_rate": 4.787319910722015e-07, + "loss": 0.1827, + "step": 8282 + }, + { + "epoch": 0.53, + "grad_norm": 0.7719067889338662, + "learning_rate": 4.786288110760787e-07, + "loss": 0.0498, + "step": 8283 + }, + { + "epoch": 0.53, + "grad_norm": 0.9135942110973932, + "learning_rate": 4.785256319916936e-07, + "loss": 0.1649, + "step": 8284 + }, + { + "epoch": 0.53, + "grad_norm": 0.5359750663694142, + "learning_rate": 4.784224538234482e-07, + "loss": 0.1537, + "step": 8285 + }, + { + "epoch": 0.53, + "grad_norm": 1.112424574766902, + "learning_rate": 4.783192765757446e-07, + "loss": 0.2091, + "step": 8286 + }, + { + "epoch": 0.53, + "grad_norm": 0.5248860382975025, + "learning_rate": 4.782161002529839e-07, + "loss": 0.3016, + "step": 8287 + }, + { + "epoch": 0.53, + "grad_norm": 1.3642197574348445, + "learning_rate": 4.781129248595684e-07, + "loss": 0.3299, + "step": 8288 + }, + { + "epoch": 0.53, + "grad_norm": 0.9654919145150089, + "learning_rate": 4.780097503998993e-07, + "loss": 0.3039, + "step": 8289 + }, + { + "epoch": 0.53, + "grad_norm": 0.7129230089890654, + "learning_rate": 4.779065768783786e-07, + "loss": 0.2663, + "step": 8290 + }, + { + "epoch": 0.53, + "grad_norm": 3.492018950625692, + "learning_rate": 4.778034042994077e-07, + "loss": 0.0489, + "step": 8291 + }, + { + "epoch": 0.53, + "grad_norm": 1.016346917646898, + "learning_rate": 4.777002326673883e-07, + "loss": 0.2143, + "step": 8292 + }, + { + "epoch": 0.53, + "grad_norm": 1.7027076661134657, + "learning_rate": 4.775970619867215e-07, + "loss": 0.1086, + "step": 8293 + }, + { + "epoch": 0.53, + "grad_norm": 0.7552783355285612, + "learning_rate": 4.774938922618094e-07, + "loss": 0.1274, + "step": 8294 + }, + { + "epoch": 0.53, + "grad_norm": 17.187041279410437, + "learning_rate": 4.773907234970528e-07, + "loss": 0.3003, + "step": 8295 + }, + { + "epoch": 0.53, + "grad_norm": 0.561537872363602, + "learning_rate": 4.772875556968537e-07, + "loss": 0.2272, + "step": 8296 + }, + { + "epoch": 0.53, + "grad_norm": 0.8076328473818086, + "learning_rate": 4.771843888656131e-07, + "loss": 0.2778, + "step": 8297 + }, + { + "epoch": 0.53, + "grad_norm": 0.30795960532586475, + "learning_rate": 4.770812230077322e-07, + "loss": 0.0026, + "step": 8298 + }, + { + "epoch": 0.53, + "grad_norm": 0.3285228663365966, + "learning_rate": 4.769780581276125e-07, + "loss": 0.1642, + "step": 8299 + }, + { + "epoch": 0.53, + "grad_norm": 0.649480294227551, + "learning_rate": 4.768748942296549e-07, + "loss": 0.1039, + "step": 8300 + }, + { + "epoch": 0.53, + "grad_norm": 0.5681854671972938, + "learning_rate": 4.7677173131826096e-07, + "loss": 0.1768, + "step": 8301 + }, + { + "epoch": 0.53, + "grad_norm": 0.8524605023286805, + "learning_rate": 4.7666856939783167e-07, + "loss": 0.163, + "step": 8302 + }, + { + "epoch": 0.53, + "grad_norm": 0.9369709721737559, + "learning_rate": 4.765654084727681e-07, + "loss": 0.1572, + "step": 8303 + }, + { + "epoch": 0.53, + "grad_norm": 5.7409964722526485, + "learning_rate": 4.764622485474712e-07, + "loss": 0.3333, + "step": 8304 + }, + { + "epoch": 0.53, + "grad_norm": 0.7505081269417608, + "learning_rate": 4.763590896263423e-07, + "loss": 0.2993, + "step": 8305 + }, + { + "epoch": 0.53, + "grad_norm": 0.7082784780927405, + "learning_rate": 4.762559317137819e-07, + "loss": 0.4248, + "step": 8306 + }, + { + "epoch": 0.53, + "grad_norm": 0.7729870754292155, + "learning_rate": 4.7615277481419146e-07, + "loss": 0.1719, + "step": 8307 + }, + { + "epoch": 0.53, + "grad_norm": 0.8689005251881637, + "learning_rate": 4.7604961893197137e-07, + "loss": 0.3522, + "step": 8308 + }, + { + "epoch": 0.53, + "grad_norm": 2.343606459450366, + "learning_rate": 4.759464640715229e-07, + "loss": 0.114, + "step": 8309 + }, + { + "epoch": 0.53, + "grad_norm": 5.34035926840061, + "learning_rate": 4.758433102372465e-07, + "loss": 0.2122, + "step": 8310 + }, + { + "epoch": 0.53, + "grad_norm": 1.7962579966207999, + "learning_rate": 4.7574015743354305e-07, + "loss": 0.1081, + "step": 8311 + }, + { + "epoch": 0.53, + "grad_norm": 1.7334214270279522, + "learning_rate": 4.756370056648133e-07, + "loss": 0.0126, + "step": 8312 + }, + { + "epoch": 0.53, + "grad_norm": 0.4473313375010857, + "learning_rate": 4.7553385493545766e-07, + "loss": 0.2959, + "step": 8313 + }, + { + "epoch": 0.53, + "grad_norm": 0.6052795437915548, + "learning_rate": 4.754307052498773e-07, + "loss": 0.2558, + "step": 8314 + }, + { + "epoch": 0.53, + "grad_norm": 0.52268626158944, + "learning_rate": 4.753275566124721e-07, + "loss": 0.2504, + "step": 8315 + }, + { + "epoch": 0.53, + "grad_norm": 0.6730348211603852, + "learning_rate": 4.7522440902764323e-07, + "loss": 0.0809, + "step": 8316 + }, + { + "epoch": 0.53, + "grad_norm": 0.8276657829322512, + "learning_rate": 4.751212624997906e-07, + "loss": 0.3571, + "step": 8317 + }, + { + "epoch": 0.53, + "grad_norm": 1.0171656654102184, + "learning_rate": 4.7501811703331516e-07, + "loss": 0.3255, + "step": 8318 + }, + { + "epoch": 0.53, + "grad_norm": 1.3961201876532447, + "learning_rate": 4.7491497263261687e-07, + "loss": 0.314, + "step": 8319 + }, + { + "epoch": 0.53, + "grad_norm": 0.6164111459767151, + "learning_rate": 4.7481182930209633e-07, + "loss": 0.3303, + "step": 8320 + }, + { + "epoch": 0.53, + "grad_norm": 3.327110020906333, + "learning_rate": 4.7470868704615387e-07, + "loss": 0.0943, + "step": 8321 + }, + { + "epoch": 0.53, + "grad_norm": 0.27534488706768706, + "learning_rate": 4.746055458691896e-07, + "loss": 0.0504, + "step": 8322 + }, + { + "epoch": 0.53, + "grad_norm": 0.7571062987207461, + "learning_rate": 4.745024057756037e-07, + "loss": 0.1668, + "step": 8323 + }, + { + "epoch": 0.53, + "grad_norm": 0.7103832911043411, + "learning_rate": 4.743992667697966e-07, + "loss": 0.2673, + "step": 8324 + }, + { + "epoch": 0.53, + "grad_norm": 0.40543707517658106, + "learning_rate": 4.742961288561684e-07, + "loss": 0.067, + "step": 8325 + }, + { + "epoch": 0.53, + "grad_norm": 13.155291203497054, + "learning_rate": 4.7419299203911867e-07, + "loss": 0.1394, + "step": 8326 + }, + { + "epoch": 0.53, + "grad_norm": 5.633389690731605, + "learning_rate": 4.74089856323048e-07, + "loss": 0.3341, + "step": 8327 + }, + { + "epoch": 0.53, + "grad_norm": 0.7601287600239229, + "learning_rate": 4.739867217123559e-07, + "loss": 0.1004, + "step": 8328 + }, + { + "epoch": 0.53, + "grad_norm": 0.659000570859291, + "learning_rate": 4.738835882114427e-07, + "loss": 0.0744, + "step": 8329 + }, + { + "epoch": 0.53, + "grad_norm": 5.727445611828362, + "learning_rate": 4.73780455824708e-07, + "loss": 0.2002, + "step": 8330 + }, + { + "epoch": 0.53, + "grad_norm": 0.7044838640912626, + "learning_rate": 4.73677324556552e-07, + "loss": 0.0962, + "step": 8331 + }, + { + "epoch": 0.53, + "grad_norm": 1.6210387091077625, + "learning_rate": 4.7357419441137404e-07, + "loss": 0.2306, + "step": 8332 + }, + { + "epoch": 0.53, + "grad_norm": 3.1456649589567296, + "learning_rate": 4.734710653935743e-07, + "loss": 0.0845, + "step": 8333 + }, + { + "epoch": 0.53, + "grad_norm": 0.8166196626153188, + "learning_rate": 4.73367937507552e-07, + "loss": 0.3285, + "step": 8334 + }, + { + "epoch": 0.53, + "grad_norm": 0.6904261910706524, + "learning_rate": 4.732648107577072e-07, + "loss": 0.1853, + "step": 8335 + }, + { + "epoch": 0.53, + "grad_norm": 0.4770212767922006, + "learning_rate": 4.7316168514843914e-07, + "loss": 0.1259, + "step": 8336 + }, + { + "epoch": 0.53, + "grad_norm": 0.6304307296852425, + "learning_rate": 4.7305856068414776e-07, + "loss": 0.0056, + "step": 8337 + }, + { + "epoch": 0.53, + "grad_norm": 0.526324203251784, + "learning_rate": 4.729554373692323e-07, + "loss": 0.1671, + "step": 8338 + }, + { + "epoch": 0.53, + "grad_norm": 1.1042738785690747, + "learning_rate": 4.72852315208092e-07, + "loss": 0.124, + "step": 8339 + }, + { + "epoch": 0.53, + "grad_norm": 0.6621693609820087, + "learning_rate": 4.727491942051268e-07, + "loss": 0.1009, + "step": 8340 + }, + { + "epoch": 0.53, + "grad_norm": 0.6882980695951783, + "learning_rate": 4.7264607436473555e-07, + "loss": 0.2576, + "step": 8341 + }, + { + "epoch": 0.53, + "grad_norm": 15.985279478692943, + "learning_rate": 4.725429556913179e-07, + "loss": 0.173, + "step": 8342 + }, + { + "epoch": 0.53, + "grad_norm": 1.9648885535488356, + "learning_rate": 4.724398381892728e-07, + "loss": 0.0932, + "step": 8343 + }, + { + "epoch": 0.53, + "grad_norm": 0.839036168880391, + "learning_rate": 4.723367218629998e-07, + "loss": 0.0475, + "step": 8344 + }, + { + "epoch": 0.53, + "grad_norm": 1.0665152967812122, + "learning_rate": 4.722336067168977e-07, + "loss": 0.1767, + "step": 8345 + }, + { + "epoch": 0.53, + "grad_norm": 0.8784662023791162, + "learning_rate": 4.7213049275536584e-07, + "loss": 0.337, + "step": 8346 + }, + { + "epoch": 0.53, + "grad_norm": 0.988119990288558, + "learning_rate": 4.720273799828031e-07, + "loss": 0.0906, + "step": 8347 + }, + { + "epoch": 0.53, + "grad_norm": 0.33164467170179124, + "learning_rate": 4.7192426840360864e-07, + "loss": 0.2812, + "step": 8348 + }, + { + "epoch": 0.53, + "grad_norm": 0.5021048271459208, + "learning_rate": 4.718211580221812e-07, + "loss": 0.1779, + "step": 8349 + }, + { + "epoch": 0.53, + "grad_norm": 1.1088301088217718, + "learning_rate": 4.7171804884292e-07, + "loss": 0.2481, + "step": 8350 + }, + { + "epoch": 0.53, + "grad_norm": 1.5711181883692436, + "learning_rate": 4.7161494087022354e-07, + "loss": 0.2516, + "step": 8351 + }, + { + "epoch": 0.53, + "grad_norm": 5.866843021786571, + "learning_rate": 4.7151183410849094e-07, + "loss": 0.3975, + "step": 8352 + }, + { + "epoch": 0.53, + "grad_norm": 0.32527701729108605, + "learning_rate": 4.7140872856212077e-07, + "loss": 0.1093, + "step": 8353 + }, + { + "epoch": 0.53, + "grad_norm": 0.6621450184134362, + "learning_rate": 4.713056242355115e-07, + "loss": 0.0823, + "step": 8354 + }, + { + "epoch": 0.53, + "grad_norm": 0.3107572932525853, + "learning_rate": 4.7120252113306207e-07, + "loss": 0.1065, + "step": 8355 + }, + { + "epoch": 0.53, + "grad_norm": 0.05815418833054292, + "learning_rate": 4.710994192591709e-07, + "loss": 0.0006, + "step": 8356 + }, + { + "epoch": 0.53, + "grad_norm": 0.3691203708536689, + "learning_rate": 4.709963186182368e-07, + "loss": 0.0926, + "step": 8357 + }, + { + "epoch": 0.53, + "grad_norm": 1.0321675826769945, + "learning_rate": 4.708932192146579e-07, + "loss": 0.3407, + "step": 8358 + }, + { + "epoch": 0.53, + "grad_norm": 1.2388699622096775, + "learning_rate": 4.707901210528329e-07, + "loss": 0.3973, + "step": 8359 + }, + { + "epoch": 0.53, + "grad_norm": 0.6445735646207422, + "learning_rate": 4.706870241371598e-07, + "loss": 0.1492, + "step": 8360 + }, + { + "epoch": 0.53, + "grad_norm": 0.6405927916498171, + "learning_rate": 4.705839284720375e-07, + "loss": 0.546, + "step": 8361 + }, + { + "epoch": 0.53, + "grad_norm": 1.217280316421169, + "learning_rate": 4.704808340618637e-07, + "loss": 0.3646, + "step": 8362 + }, + { + "epoch": 0.53, + "grad_norm": 0.32295731170163944, + "learning_rate": 4.703777409110371e-07, + "loss": 0.2454, + "step": 8363 + }, + { + "epoch": 0.53, + "grad_norm": 1.2450631581739244, + "learning_rate": 4.702746490239554e-07, + "loss": 0.0479, + "step": 8364 + }, + { + "epoch": 0.53, + "grad_norm": 1.6994199215400096, + "learning_rate": 4.70171558405017e-07, + "loss": 0.2139, + "step": 8365 + }, + { + "epoch": 0.53, + "grad_norm": 0.9752995049467115, + "learning_rate": 4.7006846905861995e-07, + "loss": 0.3142, + "step": 8366 + }, + { + "epoch": 0.53, + "grad_norm": 0.627713157969607, + "learning_rate": 4.6996538098916205e-07, + "loss": 0.294, + "step": 8367 + }, + { + "epoch": 0.53, + "grad_norm": 0.41530161662137777, + "learning_rate": 4.698622942010416e-07, + "loss": 0.0899, + "step": 8368 + }, + { + "epoch": 0.53, + "grad_norm": 0.5522485878580587, + "learning_rate": 4.69759208698656e-07, + "loss": 0.1956, + "step": 8369 + }, + { + "epoch": 0.53, + "grad_norm": 1.5530429781071238, + "learning_rate": 4.6965612448640354e-07, + "loss": 0.1169, + "step": 8370 + }, + { + "epoch": 0.53, + "grad_norm": 0.636913940183296, + "learning_rate": 4.6955304156868165e-07, + "loss": 0.0016, + "step": 8371 + }, + { + "epoch": 0.53, + "grad_norm": 1.949578232853312, + "learning_rate": 4.694499599498884e-07, + "loss": 0.1755, + "step": 8372 + }, + { + "epoch": 0.53, + "grad_norm": 0.6750312131157368, + "learning_rate": 4.69346879634421e-07, + "loss": 0.2282, + "step": 8373 + }, + { + "epoch": 0.53, + "grad_norm": 4.723795459386764, + "learning_rate": 4.692438006266775e-07, + "loss": 0.0811, + "step": 8374 + }, + { + "epoch": 0.53, + "grad_norm": 0.39597401424435147, + "learning_rate": 4.6914072293105523e-07, + "loss": 0.0521, + "step": 8375 + }, + { + "epoch": 0.53, + "grad_norm": 0.5125918304620144, + "learning_rate": 4.690376465519519e-07, + "loss": 0.1816, + "step": 8376 + }, + { + "epoch": 0.53, + "grad_norm": 0.15254802116096794, + "learning_rate": 4.6893457149376464e-07, + "loss": 0.0027, + "step": 8377 + }, + { + "epoch": 0.53, + "grad_norm": 1.1792409719099959, + "learning_rate": 4.688314977608912e-07, + "loss": 0.2804, + "step": 8378 + }, + { + "epoch": 0.53, + "grad_norm": 1.00172599190676, + "learning_rate": 4.6872842535772874e-07, + "loss": 0.2656, + "step": 8379 + }, + { + "epoch": 0.53, + "grad_norm": 2.8731598878152553, + "learning_rate": 4.686253542886743e-07, + "loss": 0.132, + "step": 8380 + }, + { + "epoch": 0.53, + "grad_norm": 1.0209030059932318, + "learning_rate": 4.6852228455812535e-07, + "loss": 0.0621, + "step": 8381 + }, + { + "epoch": 0.53, + "grad_norm": 0.5910051250160278, + "learning_rate": 4.684192161704791e-07, + "loss": 0.3391, + "step": 8382 + }, + { + "epoch": 0.53, + "grad_norm": 0.5875092722322819, + "learning_rate": 4.683161491301326e-07, + "loss": 0.3366, + "step": 8383 + }, + { + "epoch": 0.53, + "grad_norm": 0.5832849105154081, + "learning_rate": 4.6821308344148267e-07, + "loss": 0.1486, + "step": 8384 + }, + { + "epoch": 0.53, + "grad_norm": 0.2557154629724138, + "learning_rate": 4.681100191089268e-07, + "loss": 0.185, + "step": 8385 + }, + { + "epoch": 0.53, + "grad_norm": 1.8808627732116225, + "learning_rate": 4.6800695613686135e-07, + "loss": 0.257, + "step": 8386 + }, + { + "epoch": 0.53, + "grad_norm": 0.27462422795780117, + "learning_rate": 4.679038945296837e-07, + "loss": 0.1196, + "step": 8387 + }, + { + "epoch": 0.53, + "grad_norm": 0.6125359784602092, + "learning_rate": 4.6780083429179025e-07, + "loss": 0.2423, + "step": 8388 + }, + { + "epoch": 0.53, + "grad_norm": 0.38922478139869265, + "learning_rate": 4.676977754275781e-07, + "loss": 0.0041, + "step": 8389 + }, + { + "epoch": 0.54, + "grad_norm": 0.8147472227342312, + "learning_rate": 4.6759471794144354e-07, + "loss": 0.2865, + "step": 8390 + }, + { + "epoch": 0.54, + "grad_norm": 0.8615916354838382, + "learning_rate": 4.6749166183778367e-07, + "loss": 0.281, + "step": 8391 + }, + { + "epoch": 0.54, + "grad_norm": 0.6019648112880962, + "learning_rate": 4.673886071209948e-07, + "loss": 0.0172, + "step": 8392 + }, + { + "epoch": 0.54, + "grad_norm": 0.7827091748181172, + "learning_rate": 4.6728555379547353e-07, + "loss": 0.1966, + "step": 8393 + }, + { + "epoch": 0.54, + "grad_norm": 0.40517344795530413, + "learning_rate": 4.6718250186561637e-07, + "loss": 0.1911, + "step": 8394 + }, + { + "epoch": 0.54, + "grad_norm": 0.37609331752727443, + "learning_rate": 4.6707945133581944e-07, + "loss": 0.0497, + "step": 8395 + }, + { + "epoch": 0.54, + "grad_norm": 0.8233073465419235, + "learning_rate": 4.669764022104795e-07, + "loss": 0.1986, + "step": 8396 + }, + { + "epoch": 0.54, + "grad_norm": 0.7806807328458075, + "learning_rate": 4.6687335449399246e-07, + "loss": 0.223, + "step": 8397 + }, + { + "epoch": 0.54, + "grad_norm": 0.9744470360724538, + "learning_rate": 4.6677030819075494e-07, + "loss": 0.1456, + "step": 8398 + }, + { + "epoch": 0.54, + "grad_norm": 0.9128355771847734, + "learning_rate": 4.666672633051625e-07, + "loss": 0.3131, + "step": 8399 + }, + { + "epoch": 0.54, + "grad_norm": 1.1988994555552688, + "learning_rate": 4.6656421984161185e-07, + "loss": 0.2688, + "step": 8400 + }, + { + "epoch": 0.54, + "grad_norm": 0.7766678877976095, + "learning_rate": 4.6646117780449875e-07, + "loss": 0.2256, + "step": 8401 + }, + { + "epoch": 0.54, + "grad_norm": 0.7061747127141389, + "learning_rate": 4.663581371982194e-07, + "loss": 0.1532, + "step": 8402 + }, + { + "epoch": 0.54, + "grad_norm": 1.173840406685293, + "learning_rate": 4.662550980271692e-07, + "loss": 0.2169, + "step": 8403 + }, + { + "epoch": 0.54, + "grad_norm": 0.6232501211624247, + "learning_rate": 4.661520602957447e-07, + "loss": 0.0222, + "step": 8404 + }, + { + "epoch": 0.54, + "grad_norm": 1.51348777533686, + "learning_rate": 4.6604902400834113e-07, + "loss": 0.3304, + "step": 8405 + }, + { + "epoch": 0.54, + "grad_norm": 0.5824220708737986, + "learning_rate": 4.659459891693546e-07, + "loss": 0.3077, + "step": 8406 + }, + { + "epoch": 0.54, + "grad_norm": 0.5160647659160633, + "learning_rate": 4.6584295578318055e-07, + "loss": 0.1435, + "step": 8407 + }, + { + "epoch": 0.54, + "grad_norm": 1.3249734204797299, + "learning_rate": 4.657399238542147e-07, + "loss": 0.1361, + "step": 8408 + }, + { + "epoch": 0.54, + "grad_norm": 1.601539827843137, + "learning_rate": 4.6563689338685253e-07, + "loss": 0.2454, + "step": 8409 + }, + { + "epoch": 0.54, + "grad_norm": 0.4928277164362475, + "learning_rate": 4.6553386438548946e-07, + "loss": 0.0144, + "step": 8410 + }, + { + "epoch": 0.54, + "grad_norm": 1.1178534249758558, + "learning_rate": 4.6543083685452127e-07, + "loss": 0.4038, + "step": 8411 + }, + { + "epoch": 0.54, + "grad_norm": 11.69181954222864, + "learning_rate": 4.6532781079834284e-07, + "loss": 0.1806, + "step": 8412 + }, + { + "epoch": 0.54, + "grad_norm": 0.023451811323160213, + "learning_rate": 4.652247862213499e-07, + "loss": 0.0003, + "step": 8413 + }, + { + "epoch": 0.54, + "grad_norm": 0.7853279716942542, + "learning_rate": 4.6512176312793735e-07, + "loss": 0.0061, + "step": 8414 + }, + { + "epoch": 0.54, + "grad_norm": 0.6417682889599597, + "learning_rate": 4.6501874152250064e-07, + "loss": 0.1982, + "step": 8415 + }, + { + "epoch": 0.54, + "grad_norm": 0.5193905532521556, + "learning_rate": 4.6491572140943455e-07, + "loss": 0.1116, + "step": 8416 + }, + { + "epoch": 0.54, + "grad_norm": 2.7000523213032452, + "learning_rate": 4.6481270279313433e-07, + "loss": 0.304, + "step": 8417 + }, + { + "epoch": 0.54, + "grad_norm": 1.8775827363867574, + "learning_rate": 4.647096856779951e-07, + "loss": 0.1525, + "step": 8418 + }, + { + "epoch": 0.54, + "grad_norm": 1.257623169315283, + "learning_rate": 4.646066700684115e-07, + "loss": 0.0711, + "step": 8419 + }, + { + "epoch": 0.54, + "grad_norm": 0.3113897164922417, + "learning_rate": 4.6450365596877867e-07, + "loss": 0.059, + "step": 8420 + }, + { + "epoch": 0.54, + "grad_norm": 0.7721374090663957, + "learning_rate": 4.6440064338349094e-07, + "loss": 0.1587, + "step": 8421 + }, + { + "epoch": 0.54, + "grad_norm": 1.2158504599664803, + "learning_rate": 4.642976323169436e-07, + "loss": 0.1179, + "step": 8422 + }, + { + "epoch": 0.54, + "grad_norm": 1.3499208462900647, + "learning_rate": 4.6419462277353083e-07, + "loss": 0.2684, + "step": 8423 + }, + { + "epoch": 0.54, + "grad_norm": 2.922887618861537, + "learning_rate": 4.640916147576477e-07, + "loss": 0.0658, + "step": 8424 + }, + { + "epoch": 0.54, + "grad_norm": 0.5540538560114585, + "learning_rate": 4.6398860827368814e-07, + "loss": 0.1489, + "step": 8425 + }, + { + "epoch": 0.54, + "grad_norm": 1.0657525259003775, + "learning_rate": 4.638856033260471e-07, + "loss": 0.2786, + "step": 8426 + }, + { + "epoch": 0.54, + "grad_norm": 0.38324265983943734, + "learning_rate": 4.6378259991911883e-07, + "loss": 0.0047, + "step": 8427 + }, + { + "epoch": 0.54, + "grad_norm": 2.9656010549995666, + "learning_rate": 4.6367959805729765e-07, + "loss": 0.226, + "step": 8428 + }, + { + "epoch": 0.54, + "grad_norm": 9.998603730798484, + "learning_rate": 4.635765977449777e-07, + "loss": 0.094, + "step": 8429 + }, + { + "epoch": 0.54, + "grad_norm": 0.548560762199273, + "learning_rate": 4.634735989865536e-07, + "loss": 0.0562, + "step": 8430 + }, + { + "epoch": 0.54, + "grad_norm": 0.4566069762684165, + "learning_rate": 4.633706017864189e-07, + "loss": 0.114, + "step": 8431 + }, + { + "epoch": 0.54, + "grad_norm": 0.8559033881437175, + "learning_rate": 4.632676061489682e-07, + "loss": 0.1003, + "step": 8432 + }, + { + "epoch": 0.54, + "grad_norm": 0.8142044444980603, + "learning_rate": 4.6316461207859505e-07, + "loss": 0.1296, + "step": 8433 + }, + { + "epoch": 0.54, + "grad_norm": 0.5067751983888892, + "learning_rate": 4.630616195796938e-07, + "loss": 0.0145, + "step": 8434 + }, + { + "epoch": 0.54, + "grad_norm": 1.1498546862355845, + "learning_rate": 4.62958628656658e-07, + "loss": 0.3408, + "step": 8435 + }, + { + "epoch": 0.54, + "grad_norm": 0.59984117843711, + "learning_rate": 4.628556393138815e-07, + "loss": 0.1739, + "step": 8436 + }, + { + "epoch": 0.54, + "grad_norm": 1.4816842370205916, + "learning_rate": 4.6275265155575824e-07, + "loss": 0.0509, + "step": 8437 + }, + { + "epoch": 0.54, + "grad_norm": 0.8461256816270033, + "learning_rate": 4.626496653866816e-07, + "loss": 0.3231, + "step": 8438 + }, + { + "epoch": 0.54, + "grad_norm": 2.0689898407931318, + "learning_rate": 4.6254668081104553e-07, + "loss": 0.3156, + "step": 8439 + }, + { + "epoch": 0.54, + "grad_norm": 1.154742449093417, + "learning_rate": 4.624436978332431e-07, + "loss": 0.2319, + "step": 8440 + }, + { + "epoch": 0.54, + "grad_norm": 0.7100348437553234, + "learning_rate": 4.6234071645766826e-07, + "loss": 0.2873, + "step": 8441 + }, + { + "epoch": 0.54, + "grad_norm": 0.5191890453423041, + "learning_rate": 4.622377366887139e-07, + "loss": 0.0466, + "step": 8442 + }, + { + "epoch": 0.54, + "grad_norm": 0.8436845098887154, + "learning_rate": 4.621347585307739e-07, + "loss": 0.2187, + "step": 8443 + }, + { + "epoch": 0.54, + "grad_norm": 0.4120815992148151, + "learning_rate": 4.620317819882409e-07, + "loss": 0.0053, + "step": 8444 + }, + { + "epoch": 0.54, + "grad_norm": 0.656772350978441, + "learning_rate": 4.619288070655086e-07, + "loss": 0.1531, + "step": 8445 + }, + { + "epoch": 0.54, + "grad_norm": 0.6586417685936933, + "learning_rate": 4.618258337669698e-07, + "loss": 0.239, + "step": 8446 + }, + { + "epoch": 0.54, + "grad_norm": 1.5419891264394243, + "learning_rate": 4.617228620970178e-07, + "loss": 0.1489, + "step": 8447 + }, + { + "epoch": 0.54, + "grad_norm": 0.8317930629719872, + "learning_rate": 4.6161989206004554e-07, + "loss": 0.2021, + "step": 8448 + }, + { + "epoch": 0.54, + "grad_norm": 0.38792223643785884, + "learning_rate": 4.615169236604456e-07, + "loss": 0.0915, + "step": 8449 + }, + { + "epoch": 0.54, + "grad_norm": 0.9317987377358307, + "learning_rate": 4.614139569026113e-07, + "loss": 0.1291, + "step": 8450 + }, + { + "epoch": 0.54, + "grad_norm": 1.964238468234055, + "learning_rate": 4.613109917909349e-07, + "loss": 0.1983, + "step": 8451 + }, + { + "epoch": 0.54, + "grad_norm": 10.504604358323638, + "learning_rate": 4.6120802832980946e-07, + "loss": 0.1325, + "step": 8452 + }, + { + "epoch": 0.54, + "grad_norm": 1.193223609694503, + "learning_rate": 4.611050665236276e-07, + "loss": 0.5047, + "step": 8453 + }, + { + "epoch": 0.54, + "grad_norm": 2.584229022981351, + "learning_rate": 4.610021063767817e-07, + "loss": 0.2107, + "step": 8454 + }, + { + "epoch": 0.54, + "grad_norm": 0.7681234642862357, + "learning_rate": 4.608991478936643e-07, + "loss": 0.1975, + "step": 8455 + }, + { + "epoch": 0.54, + "grad_norm": 0.9747994156063038, + "learning_rate": 4.60796191078668e-07, + "loss": 0.2907, + "step": 8456 + }, + { + "epoch": 0.54, + "grad_norm": 0.6442174283300625, + "learning_rate": 4.6069323593618487e-07, + "loss": 0.1747, + "step": 8457 + }, + { + "epoch": 0.54, + "grad_norm": 1.0394665216571475, + "learning_rate": 4.605902824706074e-07, + "loss": 0.2331, + "step": 8458 + }, + { + "epoch": 0.54, + "grad_norm": 0.7345764613771602, + "learning_rate": 4.6048733068632763e-07, + "loss": 0.0262, + "step": 8459 + }, + { + "epoch": 0.54, + "grad_norm": 1.364427066048286, + "learning_rate": 4.6038438058773795e-07, + "loss": 0.0444, + "step": 8460 + }, + { + "epoch": 0.54, + "grad_norm": 0.7958588620663453, + "learning_rate": 4.6028143217922994e-07, + "loss": 0.1346, + "step": 8461 + }, + { + "epoch": 0.54, + "grad_norm": 1.57771632222782, + "learning_rate": 4.60178485465196e-07, + "loss": 0.1572, + "step": 8462 + }, + { + "epoch": 0.54, + "grad_norm": 0.4540274020577097, + "learning_rate": 4.6007554045002807e-07, + "loss": 0.2118, + "step": 8463 + }, + { + "epoch": 0.54, + "grad_norm": 0.4574318972870368, + "learning_rate": 4.599725971381176e-07, + "loss": 0.0756, + "step": 8464 + }, + { + "epoch": 0.54, + "grad_norm": 1.2494177439012157, + "learning_rate": 4.598696555338569e-07, + "loss": 0.0699, + "step": 8465 + }, + { + "epoch": 0.54, + "grad_norm": 4.324517809174406, + "learning_rate": 4.5976671564163703e-07, + "loss": 0.1505, + "step": 8466 + }, + { + "epoch": 0.54, + "grad_norm": 0.8563665290211504, + "learning_rate": 4.5966377746585024e-07, + "loss": 0.0799, + "step": 8467 + }, + { + "epoch": 0.54, + "grad_norm": 1.307458669269294, + "learning_rate": 4.5956084101088755e-07, + "loss": 0.3482, + "step": 8468 + }, + { + "epoch": 0.54, + "grad_norm": 1.8117454115871394, + "learning_rate": 4.594579062811409e-07, + "loss": 0.2052, + "step": 8469 + }, + { + "epoch": 0.54, + "grad_norm": 22.86468592982254, + "learning_rate": 4.593549732810012e-07, + "loss": 0.2059, + "step": 8470 + }, + { + "epoch": 0.54, + "grad_norm": 0.9233915420970702, + "learning_rate": 4.592520420148602e-07, + "loss": 0.1638, + "step": 8471 + }, + { + "epoch": 0.54, + "grad_norm": 1.9841653192358117, + "learning_rate": 4.591491124871089e-07, + "loss": 0.2057, + "step": 8472 + }, + { + "epoch": 0.54, + "grad_norm": 0.6514008796100588, + "learning_rate": 4.590461847021388e-07, + "loss": 0.1183, + "step": 8473 + }, + { + "epoch": 0.54, + "grad_norm": 1.115142321159001, + "learning_rate": 4.589432586643405e-07, + "loss": 0.263, + "step": 8474 + }, + { + "epoch": 0.54, + "grad_norm": 0.6404165934906061, + "learning_rate": 4.5884033437810565e-07, + "loss": 0.388, + "step": 8475 + }, + { + "epoch": 0.54, + "grad_norm": 0.8905150331337415, + "learning_rate": 4.5873741184782484e-07, + "loss": 0.1492, + "step": 8476 + }, + { + "epoch": 0.54, + "grad_norm": 0.3374940481431025, + "learning_rate": 4.5863449107788877e-07, + "loss": 0.0189, + "step": 8477 + }, + { + "epoch": 0.54, + "grad_norm": 0.837287076567836, + "learning_rate": 4.585315720726885e-07, + "loss": 0.5004, + "step": 8478 + }, + { + "epoch": 0.54, + "grad_norm": 4.45436173566265, + "learning_rate": 4.584286548366148e-07, + "loss": 0.0123, + "step": 8479 + }, + { + "epoch": 0.54, + "grad_norm": 0.07608838150686013, + "learning_rate": 4.583257393740583e-07, + "loss": 0.0126, + "step": 8480 + }, + { + "epoch": 0.54, + "grad_norm": 1.298241936440913, + "learning_rate": 4.582228256894093e-07, + "loss": 0.1864, + "step": 8481 + }, + { + "epoch": 0.54, + "grad_norm": 1.9557106428726254, + "learning_rate": 4.581199137870588e-07, + "loss": 0.1747, + "step": 8482 + }, + { + "epoch": 0.54, + "grad_norm": 0.4943259769523899, + "learning_rate": 4.580170036713968e-07, + "loss": 0.2299, + "step": 8483 + }, + { + "epoch": 0.54, + "grad_norm": 8.991030265560546, + "learning_rate": 4.57914095346814e-07, + "loss": 0.1538, + "step": 8484 + }, + { + "epoch": 0.54, + "grad_norm": 1.0107885142531865, + "learning_rate": 4.578111888177003e-07, + "loss": 0.3783, + "step": 8485 + }, + { + "epoch": 0.54, + "grad_norm": 1.0740041697591793, + "learning_rate": 4.577082840884463e-07, + "loss": 0.211, + "step": 8486 + }, + { + "epoch": 0.54, + "grad_norm": 0.6396555243855557, + "learning_rate": 4.5760538116344174e-07, + "loss": 0.2172, + "step": 8487 + }, + { + "epoch": 0.54, + "grad_norm": 0.6631907456910964, + "learning_rate": 4.57502480047077e-07, + "loss": 0.264, + "step": 8488 + }, + { + "epoch": 0.54, + "grad_norm": 1.1861297116872045, + "learning_rate": 4.573995807437418e-07, + "loss": 0.0859, + "step": 8489 + }, + { + "epoch": 0.54, + "grad_norm": 12.233467100768586, + "learning_rate": 4.57296683257826e-07, + "loss": 0.2024, + "step": 8490 + }, + { + "epoch": 0.54, + "grad_norm": 0.562863703698288, + "learning_rate": 4.571937875937198e-07, + "loss": 0.1103, + "step": 8491 + }, + { + "epoch": 0.54, + "grad_norm": 1.3014767160948295, + "learning_rate": 4.570908937558124e-07, + "loss": 0.2501, + "step": 8492 + }, + { + "epoch": 0.54, + "grad_norm": 1.3948229887777503, + "learning_rate": 4.56988001748494e-07, + "loss": 0.275, + "step": 8493 + }, + { + "epoch": 0.54, + "grad_norm": 0.9564443585401297, + "learning_rate": 4.568851115761537e-07, + "loss": 0.368, + "step": 8494 + }, + { + "epoch": 0.54, + "grad_norm": 0.29839740125089426, + "learning_rate": 4.5678222324318137e-07, + "loss": 0.1443, + "step": 8495 + }, + { + "epoch": 0.54, + "grad_norm": 0.6743428581848108, + "learning_rate": 4.5667933675396606e-07, + "loss": 0.2141, + "step": 8496 + }, + { + "epoch": 0.54, + "grad_norm": 0.7459111072513056, + "learning_rate": 4.5657645211289744e-07, + "loss": 0.1226, + "step": 8497 + }, + { + "epoch": 0.54, + "grad_norm": 0.7824845827003406, + "learning_rate": 4.564735693243647e-07, + "loss": 0.1287, + "step": 8498 + }, + { + "epoch": 0.54, + "grad_norm": 1.4834984827118423, + "learning_rate": 4.56370688392757e-07, + "loss": 0.1547, + "step": 8499 + }, + { + "epoch": 0.54, + "grad_norm": 1.0938361165736732, + "learning_rate": 4.562678093224633e-07, + "loss": 0.3256, + "step": 8500 + }, + { + "epoch": 0.54, + "grad_norm": 29.55789695253735, + "learning_rate": 4.5616493211787297e-07, + "loss": 0.2399, + "step": 8501 + }, + { + "epoch": 0.54, + "grad_norm": 0.6838882822874129, + "learning_rate": 4.560620567833746e-07, + "loss": 0.076, + "step": 8502 + }, + { + "epoch": 0.54, + "grad_norm": 0.24447589420637153, + "learning_rate": 4.5595918332335744e-07, + "loss": 0.069, + "step": 8503 + }, + { + "epoch": 0.54, + "grad_norm": 0.7864297919528537, + "learning_rate": 4.5585631174220994e-07, + "loss": 0.2658, + "step": 8504 + }, + { + "epoch": 0.54, + "grad_norm": 2.3257562561475966, + "learning_rate": 4.557534420443208e-07, + "loss": 0.0941, + "step": 8505 + }, + { + "epoch": 0.54, + "grad_norm": 1.9571794965312268, + "learning_rate": 4.556505742340789e-07, + "loss": 0.0663, + "step": 8506 + }, + { + "epoch": 0.54, + "grad_norm": 0.7530598028222795, + "learning_rate": 4.555477083158725e-07, + "loss": 0.3032, + "step": 8507 + }, + { + "epoch": 0.54, + "grad_norm": 5.675894191460296, + "learning_rate": 4.554448442940904e-07, + "loss": 0.124, + "step": 8508 + }, + { + "epoch": 0.54, + "grad_norm": 0.5352661522361948, + "learning_rate": 4.5534198217312066e-07, + "loss": 0.2388, + "step": 8509 + }, + { + "epoch": 0.54, + "grad_norm": 1.1325929422909422, + "learning_rate": 4.552391219573519e-07, + "loss": 0.2196, + "step": 8510 + }, + { + "epoch": 0.54, + "grad_norm": 1.1595632651429615, + "learning_rate": 4.5513626365117196e-07, + "loss": 0.1096, + "step": 8511 + }, + { + "epoch": 0.54, + "grad_norm": 0.935187183481488, + "learning_rate": 4.5503340725896946e-07, + "loss": 0.2327, + "step": 8512 + }, + { + "epoch": 0.54, + "grad_norm": 0.6270097308649183, + "learning_rate": 4.5493055278513194e-07, + "loss": 0.1127, + "step": 8513 + }, + { + "epoch": 0.54, + "grad_norm": 0.2333554916025773, + "learning_rate": 4.548277002340479e-07, + "loss": 0.0802, + "step": 8514 + }, + { + "epoch": 0.54, + "grad_norm": 0.3078384653044468, + "learning_rate": 4.547248496101047e-07, + "loss": 0.0836, + "step": 8515 + }, + { + "epoch": 0.54, + "grad_norm": 0.8529922203258207, + "learning_rate": 4.546220009176906e-07, + "loss": 0.2505, + "step": 8516 + }, + { + "epoch": 0.54, + "grad_norm": 0.5147821592586467, + "learning_rate": 4.5451915416119325e-07, + "loss": 0.1138, + "step": 8517 + }, + { + "epoch": 0.54, + "grad_norm": 0.7878131179164628, + "learning_rate": 4.54416309345e-07, + "loss": 0.0269, + "step": 8518 + }, + { + "epoch": 0.54, + "grad_norm": 14.107587381449914, + "learning_rate": 4.5431346647349885e-07, + "loss": 0.0914, + "step": 8519 + }, + { + "epoch": 0.54, + "grad_norm": 3.3688615720946276, + "learning_rate": 4.542106255510768e-07, + "loss": 0.3, + "step": 8520 + }, + { + "epoch": 0.54, + "grad_norm": 0.9529514095880248, + "learning_rate": 4.541077865821218e-07, + "loss": 0.3298, + "step": 8521 + }, + { + "epoch": 0.54, + "grad_norm": 0.51656489930236, + "learning_rate": 4.540049495710206e-07, + "loss": 0.3581, + "step": 8522 + }, + { + "epoch": 0.54, + "grad_norm": 1.0633316442621703, + "learning_rate": 4.5390211452216087e-07, + "loss": 0.2921, + "step": 8523 + }, + { + "epoch": 0.54, + "grad_norm": 0.6073632241863104, + "learning_rate": 4.537992814399296e-07, + "loss": 0.0691, + "step": 8524 + }, + { + "epoch": 0.54, + "grad_norm": 1.0659607195223204, + "learning_rate": 4.5369645032871377e-07, + "loss": 0.0703, + "step": 8525 + }, + { + "epoch": 0.54, + "grad_norm": 0.9100748578867685, + "learning_rate": 4.5359362119290046e-07, + "loss": 0.2182, + "step": 8526 + }, + { + "epoch": 0.54, + "grad_norm": 0.5451664741831638, + "learning_rate": 4.534907940368767e-07, + "loss": 0.1903, + "step": 8527 + }, + { + "epoch": 0.54, + "grad_norm": 1.2983460906523192, + "learning_rate": 4.5338796886502896e-07, + "loss": 0.2098, + "step": 8528 + }, + { + "epoch": 0.54, + "grad_norm": 0.9268620866095381, + "learning_rate": 4.5328514568174444e-07, + "loss": 0.1282, + "step": 8529 + }, + { + "epoch": 0.54, + "grad_norm": 0.6468324519500694, + "learning_rate": 4.531823244914094e-07, + "loss": 0.2481, + "step": 8530 + }, + { + "epoch": 0.54, + "grad_norm": 5.875258197524188, + "learning_rate": 4.530795052984104e-07, + "loss": 0.1138, + "step": 8531 + }, + { + "epoch": 0.54, + "grad_norm": 0.6116136267177559, + "learning_rate": 4.529766881071341e-07, + "loss": 0.0849, + "step": 8532 + }, + { + "epoch": 0.54, + "grad_norm": 0.8706325022627655, + "learning_rate": 4.528738729219667e-07, + "loss": 0.3316, + "step": 8533 + }, + { + "epoch": 0.54, + "grad_norm": 0.477585535082431, + "learning_rate": 4.5277105974729484e-07, + "loss": 0.1506, + "step": 8534 + }, + { + "epoch": 0.54, + "grad_norm": 1.119045564193954, + "learning_rate": 4.526682485875043e-07, + "loss": 0.469, + "step": 8535 + }, + { + "epoch": 0.54, + "grad_norm": 2.147142349762046, + "learning_rate": 4.525654394469816e-07, + "loss": 0.135, + "step": 8536 + }, + { + "epoch": 0.54, + "grad_norm": 0.8146832885122889, + "learning_rate": 4.5246263233011244e-07, + "loss": 0.1504, + "step": 8537 + }, + { + "epoch": 0.54, + "grad_norm": 0.4546010503383642, + "learning_rate": 4.523598272412832e-07, + "loss": 0.1102, + "step": 8538 + }, + { + "epoch": 0.54, + "grad_norm": 1.6218698456156957, + "learning_rate": 4.522570241848792e-07, + "loss": 0.1513, + "step": 8539 + }, + { + "epoch": 0.54, + "grad_norm": 1.5445552367923363, + "learning_rate": 4.521542231652868e-07, + "loss": 0.2466, + "step": 8540 + }, + { + "epoch": 0.54, + "grad_norm": 0.5778055370867833, + "learning_rate": 4.520514241868912e-07, + "loss": 0.1345, + "step": 8541 + }, + { + "epoch": 0.54, + "grad_norm": 0.7511853910031592, + "learning_rate": 4.519486272540783e-07, + "loss": 0.1383, + "step": 8542 + }, + { + "epoch": 0.54, + "grad_norm": 0.3388616325480788, + "learning_rate": 4.5184583237123356e-07, + "loss": 0.1855, + "step": 8543 + }, + { + "epoch": 0.54, + "grad_norm": 0.7383229291815947, + "learning_rate": 4.517430395427424e-07, + "loss": 0.2364, + "step": 8544 + }, + { + "epoch": 0.54, + "grad_norm": 0.8153944375237804, + "learning_rate": 4.5164024877299034e-07, + "loss": 0.2166, + "step": 8545 + }, + { + "epoch": 0.54, + "grad_norm": 1.2759299075411263, + "learning_rate": 4.515374600663623e-07, + "loss": 0.3461, + "step": 8546 + }, + { + "epoch": 0.55, + "grad_norm": 1.6106474779064373, + "learning_rate": 4.514346734272437e-07, + "loss": 0.2649, + "step": 8547 + }, + { + "epoch": 0.55, + "grad_norm": 0.8904849809586077, + "learning_rate": 4.5133188886001947e-07, + "loss": 0.2382, + "step": 8548 + }, + { + "epoch": 0.55, + "grad_norm": 1.6980647809570883, + "learning_rate": 4.512291063690749e-07, + "loss": 0.2302, + "step": 8549 + }, + { + "epoch": 0.55, + "grad_norm": 1.1361973622331778, + "learning_rate": 4.5112632595879444e-07, + "loss": 0.2255, + "step": 8550 + }, + { + "epoch": 0.55, + "grad_norm": 1.380638977705818, + "learning_rate": 4.510235476335633e-07, + "loss": 0.127, + "step": 8551 + }, + { + "epoch": 0.55, + "grad_norm": 14.98530149233771, + "learning_rate": 4.5092077139776594e-07, + "loss": 0.1807, + "step": 8552 + }, + { + "epoch": 0.55, + "grad_norm": 0.8875602497822686, + "learning_rate": 4.508179972557874e-07, + "loss": 0.3445, + "step": 8553 + }, + { + "epoch": 0.55, + "grad_norm": 4.0839262924190445, + "learning_rate": 4.507152252120117e-07, + "loss": 0.1327, + "step": 8554 + }, + { + "epoch": 0.55, + "grad_norm": 0.6907390655670341, + "learning_rate": 4.506124552708238e-07, + "loss": 0.1258, + "step": 8555 + }, + { + "epoch": 0.55, + "grad_norm": 0.7974425763261175, + "learning_rate": 4.505096874366077e-07, + "loss": 0.0369, + "step": 8556 + }, + { + "epoch": 0.55, + "grad_norm": 0.45529218864764853, + "learning_rate": 4.5040692171374794e-07, + "loss": 0.1797, + "step": 8557 + }, + { + "epoch": 0.55, + "grad_norm": 0.794674730828558, + "learning_rate": 4.503041581066285e-07, + "loss": 0.1448, + "step": 8558 + }, + { + "epoch": 0.55, + "grad_norm": 1.3580680448302769, + "learning_rate": 4.502013966196336e-07, + "loss": 0.1913, + "step": 8559 + }, + { + "epoch": 0.55, + "grad_norm": 2.8710498541525413, + "learning_rate": 4.500986372571472e-07, + "loss": 0.3488, + "step": 8560 + }, + { + "epoch": 0.55, + "grad_norm": 2.1796187287502056, + "learning_rate": 4.4999588002355314e-07, + "loss": 0.4305, + "step": 8561 + }, + { + "epoch": 0.55, + "grad_norm": 0.5076527385948689, + "learning_rate": 4.498931249232356e-07, + "loss": 0.1952, + "step": 8562 + }, + { + "epoch": 0.55, + "grad_norm": 1.1039826500375853, + "learning_rate": 4.4979037196057785e-07, + "loss": 0.2341, + "step": 8563 + }, + { + "epoch": 0.55, + "grad_norm": 0.3838415985195842, + "learning_rate": 4.49687621139964e-07, + "loss": 0.1882, + "step": 8564 + }, + { + "epoch": 0.55, + "grad_norm": 0.7075439609216445, + "learning_rate": 4.4958487246577714e-07, + "loss": 0.2253, + "step": 8565 + }, + { + "epoch": 0.55, + "grad_norm": 1.8971257055841362, + "learning_rate": 4.4948212594240113e-07, + "loss": 0.0992, + "step": 8566 + }, + { + "epoch": 0.55, + "grad_norm": 1.3838648711265849, + "learning_rate": 4.493793815742191e-07, + "loss": 0.0273, + "step": 8567 + }, + { + "epoch": 0.55, + "grad_norm": 0.6225953300300522, + "learning_rate": 4.492766393656144e-07, + "loss": 0.3109, + "step": 8568 + }, + { + "epoch": 0.55, + "grad_norm": 0.7097647397292058, + "learning_rate": 4.4917389932097027e-07, + "loss": 0.1853, + "step": 8569 + }, + { + "epoch": 0.55, + "grad_norm": 0.5155582573603247, + "learning_rate": 4.4907116144466976e-07, + "loss": 0.0226, + "step": 8570 + }, + { + "epoch": 0.55, + "grad_norm": 1.1948258985476277, + "learning_rate": 4.489684257410958e-07, + "loss": 0.0056, + "step": 8571 + }, + { + "epoch": 0.55, + "grad_norm": 0.28535405179376255, + "learning_rate": 4.4886569221463154e-07, + "loss": 0.1475, + "step": 8572 + }, + { + "epoch": 0.55, + "grad_norm": 5.021488884601238, + "learning_rate": 4.487629608696597e-07, + "loss": 0.273, + "step": 8573 + }, + { + "epoch": 0.55, + "grad_norm": 0.9495205928322551, + "learning_rate": 4.4866023171056274e-07, + "loss": 0.1342, + "step": 8574 + }, + { + "epoch": 0.55, + "grad_norm": 0.4904094764027969, + "learning_rate": 4.485575047417237e-07, + "loss": 0.1998, + "step": 8575 + }, + { + "epoch": 0.55, + "grad_norm": 2.02893706203771, + "learning_rate": 4.484547799675247e-07, + "loss": 0.1373, + "step": 8576 + }, + { + "epoch": 0.55, + "grad_norm": 0.26415931682304167, + "learning_rate": 4.483520573923485e-07, + "loss": 0.165, + "step": 8577 + }, + { + "epoch": 0.55, + "grad_norm": 0.11699404437693192, + "learning_rate": 4.4824933702057727e-07, + "loss": 0.0034, + "step": 8578 + }, + { + "epoch": 0.55, + "grad_norm": 0.9777991146970927, + "learning_rate": 4.4814661885659353e-07, + "loss": 0.2992, + "step": 8579 + }, + { + "epoch": 0.55, + "grad_norm": 0.6497567332125923, + "learning_rate": 4.480439029047791e-07, + "loss": 0.3313, + "step": 8580 + }, + { + "epoch": 0.55, + "grad_norm": 0.7355528914886902, + "learning_rate": 4.4794118916951644e-07, + "loss": 0.3799, + "step": 8581 + }, + { + "epoch": 0.55, + "grad_norm": 0.8520051342515318, + "learning_rate": 4.478384776551871e-07, + "loss": 0.1371, + "step": 8582 + }, + { + "epoch": 0.55, + "grad_norm": 1.2076017304051176, + "learning_rate": 4.477357683661733e-07, + "loss": 0.2908, + "step": 8583 + }, + { + "epoch": 0.55, + "grad_norm": 0.7971270527375104, + "learning_rate": 4.476330613068565e-07, + "loss": 0.1446, + "step": 8584 + }, + { + "epoch": 0.55, + "grad_norm": 0.8498952281979963, + "learning_rate": 4.4753035648161886e-07, + "loss": 0.3229, + "step": 8585 + }, + { + "epoch": 0.55, + "grad_norm": 1.084276383353148, + "learning_rate": 4.474276538948415e-07, + "loss": 0.128, + "step": 8586 + }, + { + "epoch": 0.55, + "grad_norm": 0.779160114721411, + "learning_rate": 4.4732495355090597e-07, + "loss": 0.228, + "step": 8587 + }, + { + "epoch": 0.55, + "grad_norm": 0.4506411749383888, + "learning_rate": 4.4722225545419413e-07, + "loss": 0.0505, + "step": 8588 + }, + { + "epoch": 0.55, + "grad_norm": 1.1525618354432954, + "learning_rate": 4.471195596090867e-07, + "loss": 0.2949, + "step": 8589 + }, + { + "epoch": 0.55, + "grad_norm": 1.0802060735415184, + "learning_rate": 4.470168660199654e-07, + "loss": 0.1355, + "step": 8590 + }, + { + "epoch": 0.55, + "grad_norm": 0.5992702188904998, + "learning_rate": 4.4691417469121083e-07, + "loss": 0.2294, + "step": 8591 + }, + { + "epoch": 0.55, + "grad_norm": 0.5559914605425803, + "learning_rate": 4.4681148562720455e-07, + "loss": 0.3393, + "step": 8592 + }, + { + "epoch": 0.55, + "grad_norm": 11.390465442380409, + "learning_rate": 4.4670879883232696e-07, + "loss": 0.2481, + "step": 8593 + }, + { + "epoch": 0.55, + "grad_norm": 0.8133634865613211, + "learning_rate": 4.466061143109593e-07, + "loss": 0.174, + "step": 8594 + }, + { + "epoch": 0.55, + "grad_norm": 1.3427987143616134, + "learning_rate": 4.4650343206748215e-07, + "loss": 0.1598, + "step": 8595 + }, + { + "epoch": 0.55, + "grad_norm": 0.5185805257579341, + "learning_rate": 4.464007521062761e-07, + "loss": 0.0469, + "step": 8596 + }, + { + "epoch": 0.55, + "grad_norm": 0.6905742917442251, + "learning_rate": 4.462980744317216e-07, + "loss": 0.1262, + "step": 8597 + }, + { + "epoch": 0.55, + "grad_norm": 1.0559210717523206, + "learning_rate": 4.461953990481994e-07, + "loss": 0.2988, + "step": 8598 + }, + { + "epoch": 0.55, + "grad_norm": 1.0591882872068195, + "learning_rate": 4.460927259600897e-07, + "loss": 0.3202, + "step": 8599 + }, + { + "epoch": 0.55, + "grad_norm": 0.8848992748142672, + "learning_rate": 4.459900551717723e-07, + "loss": 0.4407, + "step": 8600 + }, + { + "epoch": 0.55, + "grad_norm": 2.730913811847409, + "learning_rate": 4.4588738668762815e-07, + "loss": 0.1821, + "step": 8601 + }, + { + "epoch": 0.55, + "grad_norm": 4.541553882484758, + "learning_rate": 4.4578472051203653e-07, + "loss": 0.2061, + "step": 8602 + }, + { + "epoch": 0.55, + "grad_norm": 1.8020017672337625, + "learning_rate": 4.4568205664937786e-07, + "loss": 0.3005, + "step": 8603 + }, + { + "epoch": 0.55, + "grad_norm": 1.2069163581798041, + "learning_rate": 4.455793951040318e-07, + "loss": 0.2515, + "step": 8604 + }, + { + "epoch": 0.55, + "grad_norm": 13.947038719264075, + "learning_rate": 4.454767358803782e-07, + "loss": 0.1206, + "step": 8605 + }, + { + "epoch": 0.55, + "grad_norm": 0.2260616838684422, + "learning_rate": 4.453740789827965e-07, + "loss": 0.1138, + "step": 8606 + }, + { + "epoch": 0.55, + "grad_norm": 0.9046499230171707, + "learning_rate": 4.452714244156667e-07, + "loss": 0.097, + "step": 8607 + }, + { + "epoch": 0.55, + "grad_norm": 4.356366811719842, + "learning_rate": 4.451687721833676e-07, + "loss": 0.2497, + "step": 8608 + }, + { + "epoch": 0.55, + "grad_norm": 0.8806676060896952, + "learning_rate": 4.450661222902792e-07, + "loss": 0.2198, + "step": 8609 + }, + { + "epoch": 0.55, + "grad_norm": 8.340212205460144, + "learning_rate": 4.4496347474078027e-07, + "loss": 0.2238, + "step": 8610 + }, + { + "epoch": 0.55, + "grad_norm": 1.5907989412480348, + "learning_rate": 4.448608295392503e-07, + "loss": 0.3913, + "step": 8611 + }, + { + "epoch": 0.55, + "grad_norm": 0.33591879836840965, + "learning_rate": 4.4475818669006806e-07, + "loss": 0.1539, + "step": 8612 + }, + { + "epoch": 0.55, + "grad_norm": 1.481818672318781, + "learning_rate": 4.4465554619761263e-07, + "loss": 0.1277, + "step": 8613 + }, + { + "epoch": 0.55, + "grad_norm": 3.0167631925411578, + "learning_rate": 4.44552908066263e-07, + "loss": 0.1213, + "step": 8614 + }, + { + "epoch": 0.55, + "grad_norm": 0.6183022367609539, + "learning_rate": 4.4445027230039755e-07, + "loss": 0.1888, + "step": 8615 + }, + { + "epoch": 0.55, + "grad_norm": 0.6780775402626844, + "learning_rate": 4.4434763890439543e-07, + "loss": 0.2843, + "step": 8616 + }, + { + "epoch": 0.55, + "grad_norm": 16.540078901841717, + "learning_rate": 4.442450078826346e-07, + "loss": 0.1059, + "step": 8617 + }, + { + "epoch": 0.55, + "grad_norm": 3.022785206750274, + "learning_rate": 4.4414237923949407e-07, + "loss": 0.3227, + "step": 8618 + }, + { + "epoch": 0.55, + "grad_norm": 2.721530692035682, + "learning_rate": 4.440397529793517e-07, + "loss": 0.0032, + "step": 8619 + }, + { + "epoch": 0.55, + "grad_norm": 0.7044104800798273, + "learning_rate": 4.439371291065862e-07, + "loss": 0.3409, + "step": 8620 + }, + { + "epoch": 0.55, + "grad_norm": 0.5552772627363022, + "learning_rate": 4.438345076255753e-07, + "loss": 0.1829, + "step": 8621 + }, + { + "epoch": 0.55, + "grad_norm": 0.17930898073871132, + "learning_rate": 4.4373188854069727e-07, + "loss": 0.0717, + "step": 8622 + }, + { + "epoch": 0.55, + "grad_norm": 0.5092543984358266, + "learning_rate": 4.436292718563299e-07, + "loss": 0.0758, + "step": 8623 + }, + { + "epoch": 0.55, + "grad_norm": 1.8913321612001055, + "learning_rate": 4.4352665757685125e-07, + "loss": 0.3083, + "step": 8624 + }, + { + "epoch": 0.55, + "grad_norm": 1.119974980136262, + "learning_rate": 4.4342404570663876e-07, + "loss": 0.2503, + "step": 8625 + }, + { + "epoch": 0.55, + "grad_norm": 0.439870508423816, + "learning_rate": 4.4332143625007043e-07, + "loss": 0.1707, + "step": 8626 + }, + { + "epoch": 0.55, + "grad_norm": 11.48170564663966, + "learning_rate": 4.432188292115236e-07, + "loss": 0.1542, + "step": 8627 + }, + { + "epoch": 0.55, + "grad_norm": 1.449310572381116, + "learning_rate": 4.431162245953754e-07, + "loss": 0.0982, + "step": 8628 + }, + { + "epoch": 0.55, + "grad_norm": 1.2178805021997257, + "learning_rate": 4.430136224060035e-07, + "loss": 0.1398, + "step": 8629 + }, + { + "epoch": 0.55, + "grad_norm": 1.7279167315476547, + "learning_rate": 4.4291102264778506e-07, + "loss": 0.157, + "step": 8630 + }, + { + "epoch": 0.55, + "grad_norm": 1.093104599073745, + "learning_rate": 4.4280842532509716e-07, + "loss": 0.2506, + "step": 8631 + }, + { + "epoch": 0.55, + "grad_norm": 0.7498838727386716, + "learning_rate": 4.4270583044231667e-07, + "loss": 0.1681, + "step": 8632 + }, + { + "epoch": 0.55, + "grad_norm": 1.916312673192367, + "learning_rate": 4.4260323800382086e-07, + "loss": 0.0876, + "step": 8633 + }, + { + "epoch": 0.55, + "grad_norm": 1.73006049574467, + "learning_rate": 4.4250064801398607e-07, + "loss": 0.0755, + "step": 8634 + }, + { + "epoch": 0.55, + "grad_norm": 0.47755745843624, + "learning_rate": 4.4239806047718945e-07, + "loss": 0.1912, + "step": 8635 + }, + { + "epoch": 0.55, + "grad_norm": 0.3343599388855688, + "learning_rate": 4.4229547539780715e-07, + "loss": 0.1178, + "step": 8636 + }, + { + "epoch": 0.55, + "grad_norm": 0.40355819689314826, + "learning_rate": 4.421928927802161e-07, + "loss": 0.1157, + "step": 8637 + }, + { + "epoch": 0.55, + "grad_norm": 0.17095910480109017, + "learning_rate": 4.420903126287922e-07, + "loss": 0.0202, + "step": 8638 + }, + { + "epoch": 0.55, + "grad_norm": 1.5340901959281243, + "learning_rate": 4.419877349479121e-07, + "loss": 0.3271, + "step": 8639 + }, + { + "epoch": 0.55, + "grad_norm": 1.0797816216778933, + "learning_rate": 4.41885159741952e-07, + "loss": 0.2782, + "step": 8640 + }, + { + "epoch": 0.55, + "grad_norm": 1.5183601064212755, + "learning_rate": 4.4178258701528756e-07, + "loss": 0.1911, + "step": 8641 + }, + { + "epoch": 0.55, + "grad_norm": 4.65818756265992, + "learning_rate": 4.4168001677229523e-07, + "loss": 0.1366, + "step": 8642 + }, + { + "epoch": 0.55, + "grad_norm": 0.9687773782908738, + "learning_rate": 4.4157744901735033e-07, + "loss": 0.1931, + "step": 8643 + }, + { + "epoch": 0.55, + "grad_norm": 0.6543510595751106, + "learning_rate": 4.4147488375482915e-07, + "loss": 0.1133, + "step": 8644 + }, + { + "epoch": 0.55, + "grad_norm": 1.1118765521249256, + "learning_rate": 4.4137232098910693e-07, + "loss": 0.3299, + "step": 8645 + }, + { + "epoch": 0.55, + "grad_norm": 0.24685852565012192, + "learning_rate": 4.4126976072455946e-07, + "loss": 0.1877, + "step": 8646 + }, + { + "epoch": 0.55, + "grad_norm": 0.2573644076145768, + "learning_rate": 4.411672029655619e-07, + "loss": 0.1783, + "step": 8647 + }, + { + "epoch": 0.55, + "grad_norm": 12.250444386789134, + "learning_rate": 4.410646477164899e-07, + "loss": 0.3069, + "step": 8648 + }, + { + "epoch": 0.55, + "grad_norm": 1.1789709245803754, + "learning_rate": 4.4096209498171825e-07, + "loss": 0.4047, + "step": 8649 + }, + { + "epoch": 0.55, + "grad_norm": 1.1773938457912942, + "learning_rate": 4.4085954476562263e-07, + "loss": 0.3411, + "step": 8650 + }, + { + "epoch": 0.55, + "grad_norm": 15.609434498904355, + "learning_rate": 4.4075699707257754e-07, + "loss": 0.2038, + "step": 8651 + }, + { + "epoch": 0.55, + "grad_norm": 0.7033537492918164, + "learning_rate": 4.4065445190695816e-07, + "loss": 0.2254, + "step": 8652 + }, + { + "epoch": 0.55, + "grad_norm": 8.534189575717024, + "learning_rate": 4.40551909273139e-07, + "loss": 0.36, + "step": 8653 + }, + { + "epoch": 0.55, + "grad_norm": 0.42367573297646766, + "learning_rate": 4.4044936917549505e-07, + "loss": 0.1442, + "step": 8654 + }, + { + "epoch": 0.55, + "grad_norm": 0.1640803618953181, + "learning_rate": 4.403468316184007e-07, + "loss": 0.0206, + "step": 8655 + }, + { + "epoch": 0.55, + "grad_norm": 0.35458751937588334, + "learning_rate": 4.402442966062304e-07, + "loss": 0.0076, + "step": 8656 + }, + { + "epoch": 0.55, + "grad_norm": 1.5175948670631023, + "learning_rate": 4.4014176414335854e-07, + "loss": 0.1836, + "step": 8657 + }, + { + "epoch": 0.55, + "grad_norm": 0.6565120566952919, + "learning_rate": 4.4003923423415923e-07, + "loss": 0.1064, + "step": 8658 + }, + { + "epoch": 0.55, + "grad_norm": 1.5703494333846566, + "learning_rate": 4.3993670688300694e-07, + "loss": 0.3563, + "step": 8659 + }, + { + "epoch": 0.55, + "grad_norm": 1.451116982674173, + "learning_rate": 4.3983418209427526e-07, + "loss": 0.1283, + "step": 8660 + }, + { + "epoch": 0.55, + "grad_norm": 0.7066977888382018, + "learning_rate": 4.397316598723385e-07, + "loss": 0.1661, + "step": 8661 + }, + { + "epoch": 0.55, + "grad_norm": 0.861787167085259, + "learning_rate": 4.3962914022157014e-07, + "loss": 0.222, + "step": 8662 + }, + { + "epoch": 0.55, + "grad_norm": 0.5970506627413962, + "learning_rate": 4.395266231463442e-07, + "loss": 0.2229, + "step": 8663 + }, + { + "epoch": 0.55, + "grad_norm": 3.1996669243248315, + "learning_rate": 4.3942410865103384e-07, + "loss": 0.1183, + "step": 8664 + }, + { + "epoch": 0.55, + "grad_norm": 0.6991684169332326, + "learning_rate": 4.3932159674001305e-07, + "loss": 0.0926, + "step": 8665 + }, + { + "epoch": 0.55, + "grad_norm": 0.8945252113205767, + "learning_rate": 4.392190874176547e-07, + "loss": 0.1876, + "step": 8666 + }, + { + "epoch": 0.55, + "grad_norm": 12.472981795237423, + "learning_rate": 4.3911658068833234e-07, + "loss": 0.2331, + "step": 8667 + }, + { + "epoch": 0.55, + "grad_norm": 0.5292525294377366, + "learning_rate": 4.3901407655641915e-07, + "loss": 0.3686, + "step": 8668 + }, + { + "epoch": 0.55, + "grad_norm": 0.6746245898447345, + "learning_rate": 4.389115750262878e-07, + "loss": 0.0777, + "step": 8669 + }, + { + "epoch": 0.55, + "grad_norm": 1.9496036799051835, + "learning_rate": 4.3880907610231177e-07, + "loss": 0.0932, + "step": 8670 + }, + { + "epoch": 0.55, + "grad_norm": 1.1429990801985839, + "learning_rate": 4.387065797888633e-07, + "loss": 0.3681, + "step": 8671 + }, + { + "epoch": 0.55, + "grad_norm": 0.5704570566856747, + "learning_rate": 4.386040860903156e-07, + "loss": 0.2151, + "step": 8672 + }, + { + "epoch": 0.55, + "grad_norm": 40.21932685715568, + "learning_rate": 4.3850159501104073e-07, + "loss": 0.2563, + "step": 8673 + }, + { + "epoch": 0.55, + "grad_norm": 8.817384533150433, + "learning_rate": 4.3839910655541157e-07, + "loss": 0.0577, + "step": 8674 + }, + { + "epoch": 0.55, + "grad_norm": 0.46772570645501416, + "learning_rate": 4.382966207278004e-07, + "loss": 0.1736, + "step": 8675 + }, + { + "epoch": 0.55, + "grad_norm": 0.24529039893710539, + "learning_rate": 4.381941375325794e-07, + "loss": 0.1944, + "step": 8676 + }, + { + "epoch": 0.55, + "grad_norm": 0.8053000301213352, + "learning_rate": 4.380916569741206e-07, + "loss": 0.2553, + "step": 8677 + }, + { + "epoch": 0.55, + "grad_norm": 1.2459487746508058, + "learning_rate": 4.379891790567964e-07, + "loss": 0.2528, + "step": 8678 + }, + { + "epoch": 0.55, + "grad_norm": 0.5786673703935769, + "learning_rate": 4.3788670378497827e-07, + "loss": 0.0103, + "step": 8679 + }, + { + "epoch": 0.55, + "grad_norm": 0.6131249040544321, + "learning_rate": 4.377842311630385e-07, + "loss": 0.1538, + "step": 8680 + }, + { + "epoch": 0.55, + "grad_norm": 0.43975674968747863, + "learning_rate": 4.3768176119534816e-07, + "loss": 0.216, + "step": 8681 + }, + { + "epoch": 0.55, + "grad_norm": 0.9238939584516679, + "learning_rate": 4.3757929388627943e-07, + "loss": 0.1618, + "step": 8682 + }, + { + "epoch": 0.55, + "grad_norm": 0.5606365831651372, + "learning_rate": 4.3747682924020346e-07, + "loss": 0.275, + "step": 8683 + }, + { + "epoch": 0.55, + "grad_norm": 0.2871099484383887, + "learning_rate": 4.373743672614915e-07, + "loss": 0.2205, + "step": 8684 + }, + { + "epoch": 0.55, + "grad_norm": 1.2847660107718668, + "learning_rate": 4.3727190795451506e-07, + "loss": 0.2713, + "step": 8685 + }, + { + "epoch": 0.55, + "grad_norm": 1.6266697475905256, + "learning_rate": 4.3716945132364495e-07, + "loss": 0.0137, + "step": 8686 + }, + { + "epoch": 0.55, + "grad_norm": 0.09043999399511815, + "learning_rate": 4.3706699737325263e-07, + "loss": 0.0024, + "step": 8687 + }, + { + "epoch": 0.55, + "grad_norm": 0.773539277771456, + "learning_rate": 4.3696454610770843e-07, + "loss": 0.1742, + "step": 8688 + }, + { + "epoch": 0.55, + "grad_norm": 0.3623365393024069, + "learning_rate": 4.368620975313837e-07, + "loss": 0.0477, + "step": 8689 + }, + { + "epoch": 0.55, + "grad_norm": 0.7092943270773006, + "learning_rate": 4.367596516486485e-07, + "loss": 0.2816, + "step": 8690 + }, + { + "epoch": 0.55, + "grad_norm": 0.42632053701819134, + "learning_rate": 4.3665720846387397e-07, + "loss": 0.0572, + "step": 8691 + }, + { + "epoch": 0.55, + "grad_norm": 1.2135248255203528, + "learning_rate": 4.3655476798143003e-07, + "loss": 0.419, + "step": 8692 + }, + { + "epoch": 0.55, + "grad_norm": 0.8067930471012168, + "learning_rate": 4.3645233020568734e-07, + "loss": 0.1214, + "step": 8693 + }, + { + "epoch": 0.55, + "grad_norm": 0.5990983382871302, + "learning_rate": 4.363498951410159e-07, + "loss": 0.1371, + "step": 8694 + }, + { + "epoch": 0.55, + "grad_norm": 0.8936630579336159, + "learning_rate": 4.3624746279178603e-07, + "loss": 0.1692, + "step": 8695 + }, + { + "epoch": 0.55, + "grad_norm": 0.5441647656795372, + "learning_rate": 4.361450331623676e-07, + "loss": 0.2733, + "step": 8696 + }, + { + "epoch": 0.55, + "grad_norm": 0.6118546178471441, + "learning_rate": 4.3604260625713025e-07, + "loss": 0.3282, + "step": 8697 + }, + { + "epoch": 0.55, + "grad_norm": 0.39873167931542874, + "learning_rate": 4.359401820804441e-07, + "loss": 0.3149, + "step": 8698 + }, + { + "epoch": 0.55, + "grad_norm": 0.37182408010870244, + "learning_rate": 4.358377606366783e-07, + "loss": 0.0987, + "step": 8699 + }, + { + "epoch": 0.55, + "grad_norm": 0.543440377640531, + "learning_rate": 4.3573534193020274e-07, + "loss": 0.0189, + "step": 8700 + }, + { + "epoch": 0.55, + "grad_norm": 0.7683463471750335, + "learning_rate": 4.3563292596538665e-07, + "loss": 0.1435, + "step": 8701 + }, + { + "epoch": 0.55, + "grad_norm": 1.063278332037879, + "learning_rate": 4.3553051274659934e-07, + "loss": 0.1295, + "step": 8702 + }, + { + "epoch": 0.56, + "grad_norm": 1.254262210336867, + "learning_rate": 4.3542810227820985e-07, + "loss": 0.3357, + "step": 8703 + }, + { + "epoch": 0.56, + "grad_norm": 0.481368459596217, + "learning_rate": 4.3532569456458755e-07, + "loss": 0.3109, + "step": 8704 + }, + { + "epoch": 0.56, + "grad_norm": 1.2236216102368147, + "learning_rate": 4.352232896101009e-07, + "loss": 0.1365, + "step": 8705 + }, + { + "epoch": 0.56, + "grad_norm": 14.940685031864735, + "learning_rate": 4.351208874191192e-07, + "loss": 0.2594, + "step": 8706 + }, + { + "epoch": 0.56, + "grad_norm": 0.5902382366757902, + "learning_rate": 4.350184879960106e-07, + "loss": 0.0927, + "step": 8707 + }, + { + "epoch": 0.56, + "grad_norm": 4.597275120931913, + "learning_rate": 4.3491609134514416e-07, + "loss": 0.0874, + "step": 8708 + }, + { + "epoch": 0.56, + "grad_norm": 0.5006277786000783, + "learning_rate": 4.3481369747088804e-07, + "loss": 0.2512, + "step": 8709 + }, + { + "epoch": 0.56, + "grad_norm": 1.914182112445155, + "learning_rate": 4.3471130637761044e-07, + "loss": 0.0853, + "step": 8710 + }, + { + "epoch": 0.56, + "grad_norm": 0.2741050102783117, + "learning_rate": 4.3460891806968004e-07, + "loss": 0.2112, + "step": 8711 + }, + { + "epoch": 0.56, + "grad_norm": 0.6388195877724325, + "learning_rate": 4.3450653255146436e-07, + "loss": 0.2596, + "step": 8712 + }, + { + "epoch": 0.56, + "grad_norm": 0.3621628629849335, + "learning_rate": 4.344041498273319e-07, + "loss": 0.069, + "step": 8713 + }, + { + "epoch": 0.56, + "grad_norm": 2.377470267897786, + "learning_rate": 4.343017699016501e-07, + "loss": 0.1486, + "step": 8714 + }, + { + "epoch": 0.56, + "grad_norm": 0.27535344211405466, + "learning_rate": 4.3419939277878705e-07, + "loss": 0.0665, + "step": 8715 + }, + { + "epoch": 0.56, + "grad_norm": 0.40089579548440857, + "learning_rate": 4.340970184631099e-07, + "loss": 0.3926, + "step": 8716 + }, + { + "epoch": 0.56, + "grad_norm": 0.6938331545369534, + "learning_rate": 4.339946469589867e-07, + "loss": 0.1051, + "step": 8717 + }, + { + "epoch": 0.56, + "grad_norm": 2.462107263236189, + "learning_rate": 4.3389227827078434e-07, + "loss": 0.1912, + "step": 8718 + }, + { + "epoch": 0.56, + "grad_norm": 1.242599660882346, + "learning_rate": 4.3378991240287044e-07, + "loss": 0.2046, + "step": 8719 + }, + { + "epoch": 0.56, + "grad_norm": 1.803931785433432, + "learning_rate": 4.336875493596119e-07, + "loss": 0.3035, + "step": 8720 + }, + { + "epoch": 0.56, + "grad_norm": 0.34555717096346095, + "learning_rate": 4.3358518914537587e-07, + "loss": 0.0487, + "step": 8721 + }, + { + "epoch": 0.56, + "grad_norm": 0.2831189267608031, + "learning_rate": 4.33482831764529e-07, + "loss": 0.1161, + "step": 8722 + }, + { + "epoch": 0.56, + "grad_norm": 0.785665893365899, + "learning_rate": 4.333804772214385e-07, + "loss": 0.2001, + "step": 8723 + }, + { + "epoch": 0.56, + "grad_norm": 1.1015902546268215, + "learning_rate": 4.332781255204707e-07, + "loss": 0.0564, + "step": 8724 + }, + { + "epoch": 0.56, + "grad_norm": 0.897124187273214, + "learning_rate": 4.331757766659921e-07, + "loss": 0.2349, + "step": 8725 + }, + { + "epoch": 0.56, + "grad_norm": 2.7622424133876797, + "learning_rate": 4.3307343066236934e-07, + "loss": 0.0969, + "step": 8726 + }, + { + "epoch": 0.56, + "grad_norm": 1.0066861931099265, + "learning_rate": 4.329710875139684e-07, + "loss": 0.2512, + "step": 8727 + }, + { + "epoch": 0.56, + "grad_norm": 1.171090718565011, + "learning_rate": 4.3286874722515564e-07, + "loss": 0.2247, + "step": 8728 + }, + { + "epoch": 0.56, + "grad_norm": 0.2886861799314343, + "learning_rate": 4.32766409800297e-07, + "loss": 0.1011, + "step": 8729 + }, + { + "epoch": 0.56, + "grad_norm": 1.7881433383426815, + "learning_rate": 4.326640752437587e-07, + "loss": 0.156, + "step": 8730 + }, + { + "epoch": 0.56, + "grad_norm": 0.36662181002272237, + "learning_rate": 4.325617435599061e-07, + "loss": 0.1765, + "step": 8731 + }, + { + "epoch": 0.56, + "grad_norm": 0.8300046605081782, + "learning_rate": 4.3245941475310535e-07, + "loss": 0.1485, + "step": 8732 + }, + { + "epoch": 0.56, + "grad_norm": 2.08197081239682, + "learning_rate": 4.323570888277214e-07, + "loss": 0.1513, + "step": 8733 + }, + { + "epoch": 0.56, + "grad_norm": 0.6685893549406288, + "learning_rate": 4.322547657881203e-07, + "loss": 0.1808, + "step": 8734 + }, + { + "epoch": 0.56, + "grad_norm": 0.7791728144281073, + "learning_rate": 4.321524456386669e-07, + "loss": 0.2814, + "step": 8735 + }, + { + "epoch": 0.56, + "grad_norm": 0.5645565736405153, + "learning_rate": 4.320501283837267e-07, + "loss": 0.2956, + "step": 8736 + }, + { + "epoch": 0.56, + "grad_norm": 4.139665768483349, + "learning_rate": 4.3194781402766464e-07, + "loss": 0.0679, + "step": 8737 + }, + { + "epoch": 0.56, + "grad_norm": 0.7585261044644834, + "learning_rate": 4.318455025748454e-07, + "loss": 0.4393, + "step": 8738 + }, + { + "epoch": 0.56, + "grad_norm": 0.6420015229006227, + "learning_rate": 4.3174319402963426e-07, + "loss": 0.3076, + "step": 8739 + }, + { + "epoch": 0.56, + "grad_norm": 8.053093193924328, + "learning_rate": 4.316408883963955e-07, + "loss": 0.0895, + "step": 8740 + }, + { + "epoch": 0.56, + "grad_norm": 2.1766155095962807, + "learning_rate": 4.315385856794941e-07, + "loss": 0.3237, + "step": 8741 + }, + { + "epoch": 0.56, + "grad_norm": 0.28378767122573545, + "learning_rate": 4.31436285883294e-07, + "loss": 0.0871, + "step": 8742 + }, + { + "epoch": 0.56, + "grad_norm": 1.8100632778598453, + "learning_rate": 4.3133398901216e-07, + "loss": 0.0596, + "step": 8743 + }, + { + "epoch": 0.56, + "grad_norm": 0.917157902373197, + "learning_rate": 4.3123169507045587e-07, + "loss": 0.2018, + "step": 8744 + }, + { + "epoch": 0.56, + "grad_norm": 0.9164757278327148, + "learning_rate": 4.3112940406254595e-07, + "loss": 0.1257, + "step": 8745 + }, + { + "epoch": 0.56, + "grad_norm": 0.3863658506835133, + "learning_rate": 4.3102711599279406e-07, + "loss": 0.1335, + "step": 8746 + }, + { + "epoch": 0.56, + "grad_norm": 0.5603958703346787, + "learning_rate": 4.309248308655641e-07, + "loss": 0.2087, + "step": 8747 + }, + { + "epoch": 0.56, + "grad_norm": 0.8364777548053638, + "learning_rate": 4.308225486852196e-07, + "loss": 0.1572, + "step": 8748 + }, + { + "epoch": 0.56, + "grad_norm": 0.6659128864748425, + "learning_rate": 4.307202694561245e-07, + "loss": 0.2157, + "step": 8749 + }, + { + "epoch": 0.56, + "grad_norm": 1.4506071015548738, + "learning_rate": 4.306179931826419e-07, + "loss": 0.2602, + "step": 8750 + }, + { + "epoch": 0.56, + "grad_norm": 2.109939051352687, + "learning_rate": 4.3051571986913506e-07, + "loss": 0.316, + "step": 8751 + }, + { + "epoch": 0.56, + "grad_norm": 0.43624671986174895, + "learning_rate": 4.304134495199674e-07, + "loss": 0.0085, + "step": 8752 + }, + { + "epoch": 0.56, + "grad_norm": 2.181416783377148, + "learning_rate": 4.3031118213950166e-07, + "loss": 0.1573, + "step": 8753 + }, + { + "epoch": 0.56, + "grad_norm": 19.120870356589716, + "learning_rate": 4.302089177321012e-07, + "loss": 0.2051, + "step": 8754 + }, + { + "epoch": 0.56, + "grad_norm": 0.8544548483466076, + "learning_rate": 4.3010665630212836e-07, + "loss": 0.1524, + "step": 8755 + }, + { + "epoch": 0.56, + "grad_norm": 0.7709835200098785, + "learning_rate": 4.3000439785394646e-07, + "loss": 0.2253, + "step": 8756 + }, + { + "epoch": 0.56, + "grad_norm": 3.236281598019691, + "learning_rate": 4.299021423919173e-07, + "loss": 0.2094, + "step": 8757 + }, + { + "epoch": 0.56, + "grad_norm": 0.2272571124262694, + "learning_rate": 4.297998899204039e-07, + "loss": 0.0809, + "step": 8758 + }, + { + "epoch": 0.56, + "grad_norm": 0.7594296087296805, + "learning_rate": 4.296976404437681e-07, + "loss": 0.1135, + "step": 8759 + }, + { + "epoch": 0.56, + "grad_norm": 1.0453715609503011, + "learning_rate": 4.295953939663726e-07, + "loss": 0.1734, + "step": 8760 + }, + { + "epoch": 0.56, + "grad_norm": 0.8404093890406891, + "learning_rate": 4.2949315049257873e-07, + "loss": 0.3088, + "step": 8761 + }, + { + "epoch": 0.56, + "grad_norm": 1.3165393104660033, + "learning_rate": 4.293909100267491e-07, + "loss": 0.2291, + "step": 8762 + }, + { + "epoch": 0.56, + "grad_norm": 0.8868526873780667, + "learning_rate": 4.2928867257324495e-07, + "loss": 0.0805, + "step": 8763 + }, + { + "epoch": 0.56, + "grad_norm": 0.5407469980793979, + "learning_rate": 4.2918643813642836e-07, + "loss": 0.2776, + "step": 8764 + }, + { + "epoch": 0.56, + "grad_norm": 1.0047903624271113, + "learning_rate": 4.2908420672066075e-07, + "loss": 0.2345, + "step": 8765 + }, + { + "epoch": 0.56, + "grad_norm": 0.4653994504140275, + "learning_rate": 4.289819783303032e-07, + "loss": 0.1555, + "step": 8766 + }, + { + "epoch": 0.56, + "grad_norm": 1.46737109688055, + "learning_rate": 4.288797529697175e-07, + "loss": 0.3302, + "step": 8767 + }, + { + "epoch": 0.56, + "grad_norm": 0.6238645780029942, + "learning_rate": 4.287775306432642e-07, + "loss": 0.1174, + "step": 8768 + }, + { + "epoch": 0.56, + "grad_norm": 0.8079535255908654, + "learning_rate": 4.2867531135530484e-07, + "loss": 0.3148, + "step": 8769 + }, + { + "epoch": 0.56, + "grad_norm": 0.43723374040357554, + "learning_rate": 4.285730951101999e-07, + "loss": 0.1074, + "step": 8770 + }, + { + "epoch": 0.56, + "grad_norm": 0.38881263524925763, + "learning_rate": 4.284708819123104e-07, + "loss": 0.01, + "step": 8771 + }, + { + "epoch": 0.56, + "grad_norm": 1.1943730728812147, + "learning_rate": 4.2836867176599684e-07, + "loss": 0.0127, + "step": 8772 + }, + { + "epoch": 0.56, + "grad_norm": 0.48869727062370727, + "learning_rate": 4.282664646756198e-07, + "loss": 0.1828, + "step": 8773 + }, + { + "epoch": 0.56, + "grad_norm": 1.0197104140215727, + "learning_rate": 4.2816426064553946e-07, + "loss": 0.1231, + "step": 8774 + }, + { + "epoch": 0.56, + "grad_norm": 0.5393575259996369, + "learning_rate": 4.280620596801163e-07, + "loss": 0.0497, + "step": 8775 + }, + { + "epoch": 0.56, + "grad_norm": 0.05371980771129873, + "learning_rate": 4.279598617837102e-07, + "loss": 0.0002, + "step": 8776 + }, + { + "epoch": 0.56, + "grad_norm": 0.5745978682807066, + "learning_rate": 4.278576669606814e-07, + "loss": 0.1843, + "step": 8777 + }, + { + "epoch": 0.56, + "grad_norm": 0.9278652412815943, + "learning_rate": 4.277554752153895e-07, + "loss": 0.0959, + "step": 8778 + }, + { + "epoch": 0.56, + "grad_norm": 0.5250453545431085, + "learning_rate": 4.27653286552194e-07, + "loss": 0.2151, + "step": 8779 + }, + { + "epoch": 0.56, + "grad_norm": 0.9632878342014657, + "learning_rate": 4.2755110097545495e-07, + "loss": 0.023, + "step": 8780 + }, + { + "epoch": 0.56, + "grad_norm": 0.6455365855607985, + "learning_rate": 4.274489184895315e-07, + "loss": 0.1282, + "step": 8781 + }, + { + "epoch": 0.56, + "grad_norm": 4.19205089455886, + "learning_rate": 4.273467390987831e-07, + "loss": 0.0743, + "step": 8782 + }, + { + "epoch": 0.56, + "grad_norm": 0.854499994034134, + "learning_rate": 4.272445628075687e-07, + "loss": 0.216, + "step": 8783 + }, + { + "epoch": 0.56, + "grad_norm": 0.6115082054764824, + "learning_rate": 4.2714238962024775e-07, + "loss": 0.1696, + "step": 8784 + }, + { + "epoch": 0.56, + "grad_norm": 2.2685461559803914, + "learning_rate": 4.270402195411787e-07, + "loss": 0.1875, + "step": 8785 + }, + { + "epoch": 0.56, + "grad_norm": 0.8428912491270012, + "learning_rate": 4.269380525747207e-07, + "loss": 0.1702, + "step": 8786 + }, + { + "epoch": 0.56, + "grad_norm": 1.4900605117836085, + "learning_rate": 4.2683588872523214e-07, + "loss": 0.1229, + "step": 8787 + }, + { + "epoch": 0.56, + "grad_norm": 0.9344322384855469, + "learning_rate": 4.267337279970718e-07, + "loss": 0.2983, + "step": 8788 + }, + { + "epoch": 0.56, + "grad_norm": 1.3308066950851476, + "learning_rate": 4.2663157039459774e-07, + "loss": 0.1737, + "step": 8789 + }, + { + "epoch": 0.56, + "grad_norm": 1.4536736858630486, + "learning_rate": 4.265294159221684e-07, + "loss": 0.1757, + "step": 8790 + }, + { + "epoch": 0.56, + "grad_norm": 2.2401233737061514, + "learning_rate": 4.264272645841419e-07, + "loss": 0.2111, + "step": 8791 + }, + { + "epoch": 0.56, + "grad_norm": 1.7235916896888939, + "learning_rate": 4.263251163848762e-07, + "loss": 0.3137, + "step": 8792 + }, + { + "epoch": 0.56, + "grad_norm": 0.8752625380095372, + "learning_rate": 4.262229713287293e-07, + "loss": 0.5314, + "step": 8793 + }, + { + "epoch": 0.56, + "grad_norm": 4.771569810967823, + "learning_rate": 4.261208294200585e-07, + "loss": 0.196, + "step": 8794 + }, + { + "epoch": 0.56, + "grad_norm": 0.9800783063616554, + "learning_rate": 4.260186906632219e-07, + "loss": 0.2956, + "step": 8795 + }, + { + "epoch": 0.56, + "grad_norm": 0.9444654094315114, + "learning_rate": 4.259165550625764e-07, + "loss": 0.2544, + "step": 8796 + }, + { + "epoch": 0.56, + "grad_norm": 0.5934749999704413, + "learning_rate": 4.258144226224798e-07, + "loss": 0.1408, + "step": 8797 + }, + { + "epoch": 0.56, + "grad_norm": 2.3570317708605626, + "learning_rate": 4.257122933472889e-07, + "loss": 0.0839, + "step": 8798 + }, + { + "epoch": 0.56, + "grad_norm": 1.2242800026870266, + "learning_rate": 4.256101672413609e-07, + "loss": 0.1207, + "step": 8799 + }, + { + "epoch": 0.56, + "grad_norm": 1.241545716452189, + "learning_rate": 4.255080443090527e-07, + "loss": 0.4852, + "step": 8800 + }, + { + "epoch": 0.56, + "grad_norm": 0.35390348831937746, + "learning_rate": 4.254059245547212e-07, + "loss": 0.1465, + "step": 8801 + }, + { + "epoch": 0.56, + "grad_norm": 0.3990140449314784, + "learning_rate": 4.2530380798272283e-07, + "loss": 0.0983, + "step": 8802 + }, + { + "epoch": 0.56, + "grad_norm": 1.4236219228074876, + "learning_rate": 4.2520169459741427e-07, + "loss": 0.1796, + "step": 8803 + }, + { + "epoch": 0.56, + "grad_norm": 5.57621134386628, + "learning_rate": 4.250995844031516e-07, + "loss": 0.1206, + "step": 8804 + }, + { + "epoch": 0.56, + "grad_norm": 1.0245743971573507, + "learning_rate": 4.249974774042915e-07, + "loss": 0.3929, + "step": 8805 + }, + { + "epoch": 0.56, + "grad_norm": 0.9511849609887367, + "learning_rate": 4.2489537360518963e-07, + "loss": 0.1818, + "step": 8806 + }, + { + "epoch": 0.56, + "grad_norm": 0.36073045919090885, + "learning_rate": 4.2479327301020214e-07, + "loss": 0.275, + "step": 8807 + }, + { + "epoch": 0.56, + "grad_norm": 0.7016519427161496, + "learning_rate": 4.246911756236847e-07, + "loss": 0.1796, + "step": 8808 + }, + { + "epoch": 0.56, + "grad_norm": 5.261643534370843, + "learning_rate": 4.245890814499931e-07, + "loss": 0.2649, + "step": 8809 + }, + { + "epoch": 0.56, + "grad_norm": 0.17211187922867002, + "learning_rate": 4.244869904934831e-07, + "loss": 0.0906, + "step": 8810 + }, + { + "epoch": 0.56, + "grad_norm": 1.0630988415737792, + "learning_rate": 4.243849027585096e-07, + "loss": 0.0131, + "step": 8811 + }, + { + "epoch": 0.56, + "grad_norm": 0.9457028326375878, + "learning_rate": 4.2428281824942843e-07, + "loss": 0.4614, + "step": 8812 + }, + { + "epoch": 0.56, + "grad_norm": 1.3354028111491003, + "learning_rate": 4.2418073697059414e-07, + "loss": 0.1195, + "step": 8813 + }, + { + "epoch": 0.56, + "grad_norm": 0.486547982467088, + "learning_rate": 4.240786589263623e-07, + "loss": 0.3201, + "step": 8814 + }, + { + "epoch": 0.56, + "grad_norm": 0.608709704000889, + "learning_rate": 4.239765841210873e-07, + "loss": 0.0223, + "step": 8815 + }, + { + "epoch": 0.56, + "grad_norm": 0.8671122060277289, + "learning_rate": 4.2387451255912407e-07, + "loss": 0.2541, + "step": 8816 + }, + { + "epoch": 0.56, + "grad_norm": 1.11364371117671, + "learning_rate": 4.2377244424482726e-07, + "loss": 0.2379, + "step": 8817 + }, + { + "epoch": 0.56, + "grad_norm": 0.5695881297848178, + "learning_rate": 4.236703791825512e-07, + "loss": 0.0693, + "step": 8818 + }, + { + "epoch": 0.56, + "grad_norm": 1.2860772628905273, + "learning_rate": 4.235683173766503e-07, + "loss": 0.148, + "step": 8819 + }, + { + "epoch": 0.56, + "grad_norm": 1.9321914860380835, + "learning_rate": 4.234662588314784e-07, + "loss": 0.2268, + "step": 8820 + }, + { + "epoch": 0.56, + "grad_norm": 1.8848982594913803, + "learning_rate": 4.2336420355139e-07, + "loss": 0.0244, + "step": 8821 + }, + { + "epoch": 0.56, + "grad_norm": 0.5598738834934773, + "learning_rate": 4.232621515407385e-07, + "loss": 0.2702, + "step": 8822 + }, + { + "epoch": 0.56, + "grad_norm": 0.4768426556401251, + "learning_rate": 4.2316010280387803e-07, + "loss": 0.1238, + "step": 8823 + }, + { + "epoch": 0.56, + "grad_norm": 1.5368905908160633, + "learning_rate": 4.230580573451619e-07, + "loss": 0.2323, + "step": 8824 + }, + { + "epoch": 0.56, + "grad_norm": 3.4680823082719106, + "learning_rate": 4.2295601516894387e-07, + "loss": 0.2658, + "step": 8825 + }, + { + "epoch": 0.56, + "grad_norm": 0.30093073521511227, + "learning_rate": 4.2285397627957686e-07, + "loss": 0.0098, + "step": 8826 + }, + { + "epoch": 0.56, + "grad_norm": 0.6365285501257308, + "learning_rate": 4.227519406814146e-07, + "loss": 0.0815, + "step": 8827 + }, + { + "epoch": 0.56, + "grad_norm": 0.6121023613489245, + "learning_rate": 4.226499083788096e-07, + "loss": 0.3337, + "step": 8828 + }, + { + "epoch": 0.56, + "grad_norm": 0.7200664274228797, + "learning_rate": 4.225478793761153e-07, + "loss": 0.1372, + "step": 8829 + }, + { + "epoch": 0.56, + "grad_norm": 2.995027730163176, + "learning_rate": 4.2244585367768387e-07, + "loss": 0.1968, + "step": 8830 + }, + { + "epoch": 0.56, + "grad_norm": 1.1568868680182585, + "learning_rate": 4.2234383128786847e-07, + "loss": 0.2059, + "step": 8831 + }, + { + "epoch": 0.56, + "grad_norm": 1.065654123080128, + "learning_rate": 4.222418122110211e-07, + "loss": 0.0939, + "step": 8832 + }, + { + "epoch": 0.56, + "grad_norm": 1.760090562541034, + "learning_rate": 4.2213979645149455e-07, + "loss": 0.304, + "step": 8833 + }, + { + "epoch": 0.56, + "grad_norm": 1.1222549533889943, + "learning_rate": 4.220377840136407e-07, + "loss": 0.2764, + "step": 8834 + }, + { + "epoch": 0.56, + "grad_norm": 1.2180117061610305, + "learning_rate": 4.2193577490181163e-07, + "loss": 0.1748, + "step": 8835 + }, + { + "epoch": 0.56, + "grad_norm": 0.4405662940994313, + "learning_rate": 4.218337691203595e-07, + "loss": 0.1627, + "step": 8836 + }, + { + "epoch": 0.56, + "grad_norm": 0.9280427079148141, + "learning_rate": 4.217317666736357e-07, + "loss": 0.0307, + "step": 8837 + }, + { + "epoch": 0.56, + "grad_norm": 8.040666841572136, + "learning_rate": 4.2162976756599234e-07, + "loss": 0.0824, + "step": 8838 + }, + { + "epoch": 0.56, + "grad_norm": 0.408029469422152, + "learning_rate": 4.215277718017804e-07, + "loss": 0.3568, + "step": 8839 + }, + { + "epoch": 0.56, + "grad_norm": 0.850396461252373, + "learning_rate": 4.214257793853517e-07, + "loss": 0.3457, + "step": 8840 + }, + { + "epoch": 0.56, + "grad_norm": 0.130899098865247, + "learning_rate": 4.2132379032105693e-07, + "loss": 0.0031, + "step": 8841 + }, + { + "epoch": 0.56, + "grad_norm": 0.9850970598885355, + "learning_rate": 4.212218046132476e-07, + "loss": 0.2646, + "step": 8842 + }, + { + "epoch": 0.56, + "grad_norm": 0.7836241026560231, + "learning_rate": 4.2111982226627433e-07, + "loss": 0.0229, + "step": 8843 + }, + { + "epoch": 0.56, + "grad_norm": 10.300270262942005, + "learning_rate": 4.2101784328448814e-07, + "loss": 0.1831, + "step": 8844 + }, + { + "epoch": 0.56, + "grad_norm": 1.1229748622532114, + "learning_rate": 4.2091586767223936e-07, + "loss": 0.1617, + "step": 8845 + }, + { + "epoch": 0.56, + "grad_norm": 0.6543210638658481, + "learning_rate": 4.2081389543387893e-07, + "loss": 0.1861, + "step": 8846 + }, + { + "epoch": 0.56, + "grad_norm": 5.127326640082101, + "learning_rate": 4.2071192657375684e-07, + "loss": 0.056, + "step": 8847 + }, + { + "epoch": 0.56, + "grad_norm": 1.479982180174863, + "learning_rate": 4.206099610962231e-07, + "loss": 0.2553, + "step": 8848 + }, + { + "epoch": 0.56, + "grad_norm": 0.47943624358487036, + "learning_rate": 4.2050799900562826e-07, + "loss": 0.3061, + "step": 8849 + }, + { + "epoch": 0.56, + "grad_norm": 1.1833321862723516, + "learning_rate": 4.2040604030632176e-07, + "loss": 0.1738, + "step": 8850 + }, + { + "epoch": 0.56, + "grad_norm": 0.2990509725056782, + "learning_rate": 4.203040850026537e-07, + "loss": 0.0038, + "step": 8851 + }, + { + "epoch": 0.56, + "grad_norm": 0.6951384879437232, + "learning_rate": 4.202021330989736e-07, + "loss": 0.1756, + "step": 8852 + }, + { + "epoch": 0.56, + "grad_norm": 0.8608802564102809, + "learning_rate": 4.201001845996309e-07, + "loss": 0.1009, + "step": 8853 + }, + { + "epoch": 0.56, + "grad_norm": 0.575110980996379, + "learning_rate": 4.199982395089749e-07, + "loss": 0.1647, + "step": 8854 + }, + { + "epoch": 0.56, + "grad_norm": 8.192043742401415, + "learning_rate": 4.19896297831355e-07, + "loss": 0.1525, + "step": 8855 + }, + { + "epoch": 0.56, + "grad_norm": 0.5647354508329827, + "learning_rate": 4.197943595711198e-07, + "loss": 0.0924, + "step": 8856 + }, + { + "epoch": 0.56, + "grad_norm": 4.690594689503533, + "learning_rate": 4.1969242473261873e-07, + "loss": 0.1563, + "step": 8857 + }, + { + "epoch": 0.56, + "grad_norm": 15.318151122942464, + "learning_rate": 4.195904933202e-07, + "loss": 0.1284, + "step": 8858 + }, + { + "epoch": 0.56, + "grad_norm": 0.4100718526531126, + "learning_rate": 4.194885653382127e-07, + "loss": 0.1654, + "step": 8859 + }, + { + "epoch": 0.57, + "grad_norm": 0.6682643198475973, + "learning_rate": 4.1938664079100493e-07, + "loss": 0.1893, + "step": 8860 + }, + { + "epoch": 0.57, + "grad_norm": 4.702906979463967, + "learning_rate": 4.192847196829251e-07, + "loss": 0.3181, + "step": 8861 + }, + { + "epoch": 0.57, + "grad_norm": 2.2794038402298713, + "learning_rate": 4.1918280201832145e-07, + "loss": 0.0579, + "step": 8862 + }, + { + "epoch": 0.57, + "grad_norm": 0.43971315297669045, + "learning_rate": 4.1908088780154183e-07, + "loss": 0.1261, + "step": 8863 + }, + { + "epoch": 0.57, + "grad_norm": 0.44545082915315964, + "learning_rate": 4.1897897703693435e-07, + "loss": 0.1466, + "step": 8864 + }, + { + "epoch": 0.57, + "grad_norm": 0.2037949421885682, + "learning_rate": 4.188770697288464e-07, + "loss": 0.1159, + "step": 8865 + }, + { + "epoch": 0.57, + "grad_norm": 1.2637857986268712, + "learning_rate": 4.1877516588162596e-07, + "loss": 0.4962, + "step": 8866 + }, + { + "epoch": 0.57, + "grad_norm": 0.8129489178667497, + "learning_rate": 4.1867326549962e-07, + "loss": 0.2276, + "step": 8867 + }, + { + "epoch": 0.57, + "grad_norm": 0.6055254303719817, + "learning_rate": 4.185713685871762e-07, + "loss": 0.2519, + "step": 8868 + }, + { + "epoch": 0.57, + "grad_norm": 0.7323659725268498, + "learning_rate": 4.184694751486414e-07, + "loss": 0.1028, + "step": 8869 + }, + { + "epoch": 0.57, + "grad_norm": 0.684166843548358, + "learning_rate": 4.183675851883627e-07, + "loss": 0.2773, + "step": 8870 + }, + { + "epoch": 0.57, + "grad_norm": 0.1639504907915714, + "learning_rate": 4.1826569871068686e-07, + "loss": 0.0061, + "step": 8871 + }, + { + "epoch": 0.57, + "grad_norm": 3.8127810049258692, + "learning_rate": 4.1816381571996075e-07, + "loss": 0.1356, + "step": 8872 + }, + { + "epoch": 0.57, + "grad_norm": 0.9025274955719915, + "learning_rate": 4.180619362205306e-07, + "loss": 0.17, + "step": 8873 + }, + { + "epoch": 0.57, + "grad_norm": 0.8218113799740292, + "learning_rate": 4.1796006021674326e-07, + "loss": 0.2427, + "step": 8874 + }, + { + "epoch": 0.57, + "grad_norm": 0.22969596510198548, + "learning_rate": 4.1785818771294456e-07, + "loss": 0.0627, + "step": 8875 + }, + { + "epoch": 0.57, + "grad_norm": 1.013127341362521, + "learning_rate": 4.1775631871348056e-07, + "loss": 0.2197, + "step": 8876 + }, + { + "epoch": 0.57, + "grad_norm": 0.6531218599809984, + "learning_rate": 4.176544532226974e-07, + "loss": 0.1165, + "step": 8877 + }, + { + "epoch": 0.57, + "grad_norm": 3.6029073170787513, + "learning_rate": 4.175525912449408e-07, + "loss": 0.1084, + "step": 8878 + }, + { + "epoch": 0.57, + "grad_norm": 1.9247928800834706, + "learning_rate": 4.174507327845564e-07, + "loss": 0.0345, + "step": 8879 + }, + { + "epoch": 0.57, + "grad_norm": 2.4511928593583288, + "learning_rate": 4.173488778458896e-07, + "loss": 0.3031, + "step": 8880 + }, + { + "epoch": 0.57, + "grad_norm": 0.758286084876253, + "learning_rate": 4.17247026433286e-07, + "loss": 0.0033, + "step": 8881 + }, + { + "epoch": 0.57, + "grad_norm": 0.6638340939757755, + "learning_rate": 4.171451785510904e-07, + "loss": 0.419, + "step": 8882 + }, + { + "epoch": 0.57, + "grad_norm": 0.8397141346873115, + "learning_rate": 4.1704333420364827e-07, + "loss": 0.2656, + "step": 8883 + }, + { + "epoch": 0.57, + "grad_norm": 0.48375765804415893, + "learning_rate": 4.16941493395304e-07, + "loss": 0.0618, + "step": 8884 + }, + { + "epoch": 0.57, + "grad_norm": 2.4125366962194494, + "learning_rate": 4.1683965613040293e-07, + "loss": 0.3128, + "step": 8885 + }, + { + "epoch": 0.57, + "grad_norm": 1.7514592078671956, + "learning_rate": 4.16737822413289e-07, + "loss": 0.2279, + "step": 8886 + }, + { + "epoch": 0.57, + "grad_norm": 0.4256378297724906, + "learning_rate": 4.166359922483071e-07, + "loss": 0.2878, + "step": 8887 + }, + { + "epoch": 0.57, + "grad_norm": 0.6441216191541835, + "learning_rate": 4.165341656398014e-07, + "loss": 0.1651, + "step": 8888 + }, + { + "epoch": 0.57, + "grad_norm": 0.2115471476169245, + "learning_rate": 4.164323425921158e-07, + "loss": 0.1172, + "step": 8889 + }, + { + "epoch": 0.57, + "grad_norm": 0.9850882113832111, + "learning_rate": 4.163305231095947e-07, + "loss": 0.003, + "step": 8890 + }, + { + "epoch": 0.57, + "grad_norm": 0.8093672278466705, + "learning_rate": 4.1622870719658145e-07, + "loss": 0.2152, + "step": 8891 + }, + { + "epoch": 0.57, + "grad_norm": 0.6736874441809302, + "learning_rate": 4.1612689485742013e-07, + "loss": 0.1791, + "step": 8892 + }, + { + "epoch": 0.57, + "grad_norm": 9.01785573323617, + "learning_rate": 4.160250860964539e-07, + "loss": 0.1442, + "step": 8893 + }, + { + "epoch": 0.57, + "grad_norm": 1.8646779079782378, + "learning_rate": 4.1592328091802645e-07, + "loss": 0.2035, + "step": 8894 + }, + { + "epoch": 0.57, + "grad_norm": 0.3145613306400818, + "learning_rate": 4.158214793264807e-07, + "loss": 0.0532, + "step": 8895 + }, + { + "epoch": 0.57, + "grad_norm": 2.1389023483351055, + "learning_rate": 4.1571968132615996e-07, + "loss": 0.2556, + "step": 8896 + }, + { + "epoch": 0.57, + "grad_norm": 0.7143415719349784, + "learning_rate": 4.1561788692140707e-07, + "loss": 0.148, + "step": 8897 + }, + { + "epoch": 0.57, + "grad_norm": 1.090866347674417, + "learning_rate": 4.1551609611656473e-07, + "loss": 0.1279, + "step": 8898 + }, + { + "epoch": 0.57, + "grad_norm": 0.6558849385962897, + "learning_rate": 4.1541430891597544e-07, + "loss": 0.3018, + "step": 8899 + }, + { + "epoch": 0.57, + "grad_norm": 2.387434021177685, + "learning_rate": 4.15312525323982e-07, + "loss": 0.3073, + "step": 8900 + }, + { + "epoch": 0.57, + "grad_norm": 0.7054601023014728, + "learning_rate": 4.152107453449263e-07, + "loss": 0.0077, + "step": 8901 + }, + { + "epoch": 0.57, + "grad_norm": 0.5885797868645957, + "learning_rate": 4.151089689831508e-07, + "loss": 0.2542, + "step": 8902 + }, + { + "epoch": 0.57, + "grad_norm": 1.3560516496879205, + "learning_rate": 4.1500719624299734e-07, + "loss": 0.0821, + "step": 8903 + }, + { + "epoch": 0.57, + "grad_norm": 3.3467056027890014, + "learning_rate": 4.1490542712880754e-07, + "loss": 0.2704, + "step": 8904 + }, + { + "epoch": 0.57, + "grad_norm": 0.5524279259988052, + "learning_rate": 4.148036616449234e-07, + "loss": 0.112, + "step": 8905 + }, + { + "epoch": 0.57, + "grad_norm": 1.740823832942431, + "learning_rate": 4.147018997956862e-07, + "loss": 0.2801, + "step": 8906 + }, + { + "epoch": 0.57, + "grad_norm": 1.0735528649293915, + "learning_rate": 4.146001415854377e-07, + "loss": 0.2887, + "step": 8907 + }, + { + "epoch": 0.57, + "grad_norm": 2.7113644386011595, + "learning_rate": 4.144983870185185e-07, + "loss": 0.051, + "step": 8908 + }, + { + "epoch": 0.57, + "grad_norm": 0.9173098476106, + "learning_rate": 4.1439663609927033e-07, + "loss": 0.3603, + "step": 8909 + }, + { + "epoch": 0.57, + "grad_norm": 1.0221068373924442, + "learning_rate": 4.1429488883203346e-07, + "loss": 0.5015, + "step": 8910 + }, + { + "epoch": 0.57, + "grad_norm": 0.9748475047797515, + "learning_rate": 4.141931452211492e-07, + "loss": 0.0713, + "step": 8911 + }, + { + "epoch": 0.57, + "grad_norm": 0.877476302434798, + "learning_rate": 4.140914052709575e-07, + "loss": 0.3888, + "step": 8912 + }, + { + "epoch": 0.57, + "grad_norm": 2.035962076885889, + "learning_rate": 4.139896689857995e-07, + "loss": 0.0879, + "step": 8913 + }, + { + "epoch": 0.57, + "grad_norm": 0.3054973153770853, + "learning_rate": 4.138879363700149e-07, + "loss": 0.1952, + "step": 8914 + }, + { + "epoch": 0.57, + "grad_norm": 0.59316800489636, + "learning_rate": 4.1378620742794413e-07, + "loss": 0.2633, + "step": 8915 + }, + { + "epoch": 0.57, + "grad_norm": 1.9554768278231138, + "learning_rate": 4.136844821639272e-07, + "loss": 0.3325, + "step": 8916 + }, + { + "epoch": 0.57, + "grad_norm": 1.5777505092967516, + "learning_rate": 4.135827605823035e-07, + "loss": 0.2842, + "step": 8917 + }, + { + "epoch": 0.57, + "grad_norm": 0.4933923077079456, + "learning_rate": 4.134810426874131e-07, + "loss": 0.1007, + "step": 8918 + }, + { + "epoch": 0.57, + "grad_norm": 0.41674791695581065, + "learning_rate": 4.1337932848359526e-07, + "loss": 0.0506, + "step": 8919 + }, + { + "epoch": 0.57, + "grad_norm": 0.6713354467851044, + "learning_rate": 4.1327761797518955e-07, + "loss": 0.2607, + "step": 8920 + }, + { + "epoch": 0.57, + "grad_norm": 0.4323185746476104, + "learning_rate": 4.131759111665348e-07, + "loss": 0.0325, + "step": 8921 + }, + { + "epoch": 0.57, + "grad_norm": 1.0622512988011819, + "learning_rate": 4.1307420806197036e-07, + "loss": 0.2112, + "step": 8922 + }, + { + "epoch": 0.57, + "grad_norm": 1.4389750806798214, + "learning_rate": 4.129725086658349e-07, + "loss": 0.0492, + "step": 8923 + }, + { + "epoch": 0.57, + "grad_norm": 0.22444360001990274, + "learning_rate": 4.1287081298246716e-07, + "loss": 0.0761, + "step": 8924 + }, + { + "epoch": 0.57, + "grad_norm": 2.7183421669202206, + "learning_rate": 4.1276912101620564e-07, + "loss": 0.0647, + "step": 8925 + }, + { + "epoch": 0.57, + "grad_norm": 0.7079498184301862, + "learning_rate": 4.1266743277138894e-07, + "loss": 0.2825, + "step": 8926 + }, + { + "epoch": 0.57, + "grad_norm": 1.7627827987883815, + "learning_rate": 4.125657482523549e-07, + "loss": 0.297, + "step": 8927 + }, + { + "epoch": 0.57, + "grad_norm": 0.9324609714137633, + "learning_rate": 4.12464067463442e-07, + "loss": 0.3738, + "step": 8928 + }, + { + "epoch": 0.57, + "grad_norm": 0.4042411237926974, + "learning_rate": 4.12362390408988e-07, + "loss": 0.2373, + "step": 8929 + }, + { + "epoch": 0.57, + "grad_norm": 0.09771824803843174, + "learning_rate": 4.122607170933304e-07, + "loss": 0.0008, + "step": 8930 + }, + { + "epoch": 0.57, + "grad_norm": 0.6181588386577638, + "learning_rate": 4.1215904752080704e-07, + "loss": 0.0991, + "step": 8931 + }, + { + "epoch": 0.57, + "grad_norm": 0.4188993403668053, + "learning_rate": 4.120573816957552e-07, + "loss": 0.0067, + "step": 8932 + }, + { + "epoch": 0.57, + "grad_norm": 0.5478641896560902, + "learning_rate": 4.119557196225125e-07, + "loss": 0.278, + "step": 8933 + }, + { + "epoch": 0.57, + "grad_norm": 0.8758992113226542, + "learning_rate": 4.118540613054155e-07, + "loss": 0.372, + "step": 8934 + }, + { + "epoch": 0.57, + "grad_norm": 0.9157197980811437, + "learning_rate": 4.117524067488017e-07, + "loss": 0.3069, + "step": 8935 + }, + { + "epoch": 0.57, + "grad_norm": 0.892030904545098, + "learning_rate": 4.116507559570074e-07, + "loss": 0.2113, + "step": 8936 + }, + { + "epoch": 0.57, + "grad_norm": 0.4467649615776646, + "learning_rate": 4.1154910893436966e-07, + "loss": 0.0266, + "step": 8937 + }, + { + "epoch": 0.57, + "grad_norm": 4.770026343102141, + "learning_rate": 4.1144746568522457e-07, + "loss": 0.1038, + "step": 8938 + }, + { + "epoch": 0.57, + "grad_norm": 0.5433695625090664, + "learning_rate": 4.113458262139088e-07, + "loss": 0.2527, + "step": 8939 + }, + { + "epoch": 0.57, + "grad_norm": 0.8449894277474345, + "learning_rate": 4.112441905247581e-07, + "loss": 0.2222, + "step": 8940 + }, + { + "epoch": 0.57, + "grad_norm": 2.7217248814820465, + "learning_rate": 4.111425586221087e-07, + "loss": 0.3618, + "step": 8941 + }, + { + "epoch": 0.57, + "grad_norm": 1.8637140262740672, + "learning_rate": 4.1104093051029647e-07, + "loss": 0.1332, + "step": 8942 + }, + { + "epoch": 0.57, + "grad_norm": 0.5222302317550118, + "learning_rate": 4.109393061936569e-07, + "loss": 0.2407, + "step": 8943 + }, + { + "epoch": 0.57, + "grad_norm": 0.46888449597675147, + "learning_rate": 4.108376856765257e-07, + "loss": 0.106, + "step": 8944 + }, + { + "epoch": 0.57, + "grad_norm": 1.2237282015396125, + "learning_rate": 4.107360689632379e-07, + "loss": 0.2106, + "step": 8945 + }, + { + "epoch": 0.57, + "grad_norm": 1.468060232104511, + "learning_rate": 4.1063445605812894e-07, + "loss": 0.2647, + "step": 8946 + }, + { + "epoch": 0.57, + "grad_norm": 0.831866185765841, + "learning_rate": 4.105328469655336e-07, + "loss": 0.2963, + "step": 8947 + }, + { + "epoch": 0.57, + "grad_norm": 0.1882829784434609, + "learning_rate": 4.10431241689787e-07, + "loss": 0.0757, + "step": 8948 + }, + { + "epoch": 0.57, + "grad_norm": 0.9410353486862414, + "learning_rate": 4.103296402352236e-07, + "loss": 0.1798, + "step": 8949 + }, + { + "epoch": 0.57, + "grad_norm": 0.8360667018701529, + "learning_rate": 4.1022804260617805e-07, + "loss": 0.1084, + "step": 8950 + }, + { + "epoch": 0.57, + "grad_norm": 0.41446117427026946, + "learning_rate": 4.1012644880698455e-07, + "loss": 0.2458, + "step": 8951 + }, + { + "epoch": 0.57, + "grad_norm": 0.6538221590023136, + "learning_rate": 4.1002485884197765e-07, + "loss": 0.2322, + "step": 8952 + }, + { + "epoch": 0.57, + "grad_norm": 1.8797931882510255, + "learning_rate": 4.0992327271549087e-07, + "loss": 0.0174, + "step": 8953 + }, + { + "epoch": 0.57, + "grad_norm": 0.6596385415035146, + "learning_rate": 4.098216904318587e-07, + "loss": 0.146, + "step": 8954 + }, + { + "epoch": 0.57, + "grad_norm": 0.9258696029597476, + "learning_rate": 4.0972011199541414e-07, + "loss": 0.1677, + "step": 8955 + }, + { + "epoch": 0.57, + "grad_norm": 0.9562414673037318, + "learning_rate": 4.0961853741049135e-07, + "loss": 0.214, + "step": 8956 + }, + { + "epoch": 0.57, + "grad_norm": 0.6610291632584135, + "learning_rate": 4.095169666814233e-07, + "loss": 0.1226, + "step": 8957 + }, + { + "epoch": 0.57, + "grad_norm": 0.8198860743687632, + "learning_rate": 4.0941539981254336e-07, + "loss": 0.204, + "step": 8958 + }, + { + "epoch": 0.57, + "grad_norm": 3.905845755857973, + "learning_rate": 4.093138368081847e-07, + "loss": 0.1959, + "step": 8959 + }, + { + "epoch": 0.57, + "grad_norm": 1.211781261974601, + "learning_rate": 4.0921227767267974e-07, + "loss": 0.2329, + "step": 8960 + }, + { + "epoch": 0.57, + "grad_norm": 0.4863503643373439, + "learning_rate": 4.0911072241036185e-07, + "loss": 0.0428, + "step": 8961 + }, + { + "epoch": 0.57, + "grad_norm": 0.7256574497194364, + "learning_rate": 4.09009171025563e-07, + "loss": 0.1728, + "step": 8962 + }, + { + "epoch": 0.57, + "grad_norm": 18.89114656717362, + "learning_rate": 4.089076235226161e-07, + "loss": 0.4744, + "step": 8963 + }, + { + "epoch": 0.57, + "grad_norm": 0.9527457468840271, + "learning_rate": 4.0880607990585286e-07, + "loss": 0.2439, + "step": 8964 + }, + { + "epoch": 0.57, + "grad_norm": 0.748155632069046, + "learning_rate": 4.0870454017960577e-07, + "loss": 0.3099, + "step": 8965 + }, + { + "epoch": 0.57, + "grad_norm": 1.5801864280752493, + "learning_rate": 4.0860300434820633e-07, + "loss": 0.2227, + "step": 8966 + }, + { + "epoch": 0.57, + "grad_norm": 0.7190986270264014, + "learning_rate": 4.0850147241598654e-07, + "loss": 0.2481, + "step": 8967 + }, + { + "epoch": 0.57, + "grad_norm": 0.5443676111902176, + "learning_rate": 4.0839994438727785e-07, + "loss": 0.0259, + "step": 8968 + }, + { + "epoch": 0.57, + "grad_norm": 1.1092656664628424, + "learning_rate": 4.082984202664118e-07, + "loss": 0.2081, + "step": 8969 + }, + { + "epoch": 0.57, + "grad_norm": 0.7488173781044702, + "learning_rate": 4.0819690005771947e-07, + "loss": 0.1968, + "step": 8970 + }, + { + "epoch": 0.57, + "grad_norm": 1.4602423968032658, + "learning_rate": 4.080953837655317e-07, + "loss": 0.3081, + "step": 8971 + }, + { + "epoch": 0.57, + "grad_norm": 6.613948406488765, + "learning_rate": 4.079938713941799e-07, + "loss": 0.1172, + "step": 8972 + }, + { + "epoch": 0.57, + "grad_norm": 1.3240785831914836, + "learning_rate": 4.078923629479942e-07, + "loss": 0.1644, + "step": 8973 + }, + { + "epoch": 0.57, + "grad_norm": 0.49065242535318954, + "learning_rate": 4.077908584313058e-07, + "loss": 0.3614, + "step": 8974 + }, + { + "epoch": 0.57, + "grad_norm": 0.6036816709428935, + "learning_rate": 4.0768935784844444e-07, + "loss": 0.1832, + "step": 8975 + }, + { + "epoch": 0.57, + "grad_norm": 8.656090000346207, + "learning_rate": 4.0758786120374073e-07, + "loss": 0.1065, + "step": 8976 + }, + { + "epoch": 0.57, + "grad_norm": 2.1238109859775967, + "learning_rate": 4.0748636850152457e-07, + "loss": 0.4529, + "step": 8977 + }, + { + "epoch": 0.57, + "grad_norm": 0.658608595335291, + "learning_rate": 4.0738487974612613e-07, + "loss": 0.3871, + "step": 8978 + }, + { + "epoch": 0.57, + "grad_norm": 1.020767182452796, + "learning_rate": 4.0728339494187465e-07, + "loss": 0.2117, + "step": 8979 + }, + { + "epoch": 0.57, + "grad_norm": 11.587761079292793, + "learning_rate": 4.071819140931002e-07, + "loss": 0.225, + "step": 8980 + }, + { + "epoch": 0.57, + "grad_norm": 1.1746589604840403, + "learning_rate": 4.0708043720413157e-07, + "loss": 0.3331, + "step": 8981 + }, + { + "epoch": 0.57, + "grad_norm": 0.6432716813705855, + "learning_rate": 4.069789642792986e-07, + "loss": 0.009, + "step": 8982 + }, + { + "epoch": 0.57, + "grad_norm": 0.6342285927951091, + "learning_rate": 4.068774953229297e-07, + "loss": 0.0823, + "step": 8983 + }, + { + "epoch": 0.57, + "grad_norm": 7.316941299192119, + "learning_rate": 4.067760303393543e-07, + "loss": 0.0673, + "step": 8984 + }, + { + "epoch": 0.57, + "grad_norm": 0.7977403896202583, + "learning_rate": 4.066745693329007e-07, + "loss": 0.0088, + "step": 8985 + }, + { + "epoch": 0.57, + "grad_norm": 0.6602867804753216, + "learning_rate": 4.0657311230789757e-07, + "loss": 0.3205, + "step": 8986 + }, + { + "epoch": 0.57, + "grad_norm": 0.3875502905647615, + "learning_rate": 4.0647165926867345e-07, + "loss": 0.2616, + "step": 8987 + }, + { + "epoch": 0.57, + "grad_norm": 1.1612784449805174, + "learning_rate": 4.0637021021955615e-07, + "loss": 0.3196, + "step": 8988 + }, + { + "epoch": 0.57, + "grad_norm": 0.4985510654803668, + "learning_rate": 4.062687651648741e-07, + "loss": 0.0171, + "step": 8989 + }, + { + "epoch": 0.57, + "grad_norm": 0.27865756818428206, + "learning_rate": 4.061673241089547e-07, + "loss": 0.0365, + "step": 8990 + }, + { + "epoch": 0.57, + "grad_norm": 3.3307056343237686, + "learning_rate": 4.060658870561262e-07, + "loss": 0.0844, + "step": 8991 + }, + { + "epoch": 0.57, + "grad_norm": 2.243988055057629, + "learning_rate": 4.0596445401071547e-07, + "loss": 0.073, + "step": 8992 + }, + { + "epoch": 0.57, + "grad_norm": 1.2478361278616963, + "learning_rate": 4.058630249770504e-07, + "loss": 0.2899, + "step": 8993 + }, + { + "epoch": 0.57, + "grad_norm": 1.3104727381547325, + "learning_rate": 4.057615999594578e-07, + "loss": 0.2499, + "step": 8994 + }, + { + "epoch": 0.57, + "grad_norm": 0.32810876773928077, + "learning_rate": 4.0566017896226485e-07, + "loss": 0.1811, + "step": 8995 + }, + { + "epoch": 0.57, + "grad_norm": 0.8333586188888399, + "learning_rate": 4.0555876198979817e-07, + "loss": 0.3105, + "step": 8996 + }, + { + "epoch": 0.57, + "grad_norm": 0.2441767375955954, + "learning_rate": 4.054573490463848e-07, + "loss": 0.0094, + "step": 8997 + }, + { + "epoch": 0.57, + "grad_norm": 1.0921772024803698, + "learning_rate": 4.0535594013635093e-07, + "loss": 0.058, + "step": 8998 + }, + { + "epoch": 0.57, + "grad_norm": 1.2390422713542835, + "learning_rate": 4.0525453526402276e-07, + "loss": 0.1206, + "step": 8999 + }, + { + "epoch": 0.57, + "grad_norm": 1.1778885430337904, + "learning_rate": 4.0515313443372675e-07, + "loss": 0.2504, + "step": 9000 + }, + { + "epoch": 0.57, + "grad_norm": 0.08698806208004202, + "learning_rate": 4.050517376497885e-07, + "loss": 0.0028, + "step": 9001 + }, + { + "epoch": 0.57, + "grad_norm": 0.46755935882190264, + "learning_rate": 4.049503449165341e-07, + "loss": 0.0989, + "step": 9002 + }, + { + "epoch": 0.57, + "grad_norm": 0.7650949509322319, + "learning_rate": 4.04848956238289e-07, + "loss": 0.1436, + "step": 9003 + }, + { + "epoch": 0.57, + "grad_norm": 0.2979398501256509, + "learning_rate": 4.0474757161937876e-07, + "loss": 0.141, + "step": 9004 + }, + { + "epoch": 0.57, + "grad_norm": 1.0035517416848905, + "learning_rate": 4.046461910641285e-07, + "loss": 0.2083, + "step": 9005 + }, + { + "epoch": 0.57, + "grad_norm": 24.001824481540783, + "learning_rate": 4.045448145768636e-07, + "loss": 0.3099, + "step": 9006 + }, + { + "epoch": 0.57, + "grad_norm": 1.783926302428239, + "learning_rate": 4.0444344216190863e-07, + "loss": 0.2344, + "step": 9007 + }, + { + "epoch": 0.57, + "grad_norm": 8.939785030272276, + "learning_rate": 4.0434207382358865e-07, + "loss": 0.1586, + "step": 9008 + }, + { + "epoch": 0.57, + "grad_norm": 0.7242003816517104, + "learning_rate": 4.042407095662279e-07, + "loss": 0.129, + "step": 9009 + }, + { + "epoch": 0.57, + "grad_norm": 0.8403834322257356, + "learning_rate": 4.0413934939415126e-07, + "loss": 0.3, + "step": 9010 + }, + { + "epoch": 0.57, + "grad_norm": 1.4074080079376277, + "learning_rate": 4.0403799331168243e-07, + "loss": 0.2928, + "step": 9011 + }, + { + "epoch": 0.57, + "grad_norm": 1.4771753974956194, + "learning_rate": 4.0393664132314576e-07, + "loss": 0.1581, + "step": 9012 + }, + { + "epoch": 0.57, + "grad_norm": 0.6087305966614077, + "learning_rate": 4.0383529343286525e-07, + "loss": 0.0752, + "step": 9013 + }, + { + "epoch": 0.57, + "grad_norm": 0.4078878006792705, + "learning_rate": 4.037339496451642e-07, + "loss": 0.0535, + "step": 9014 + }, + { + "epoch": 0.57, + "grad_norm": 2.3292486987075627, + "learning_rate": 4.0363260996436666e-07, + "loss": 0.0814, + "step": 9015 + }, + { + "epoch": 0.57, + "grad_norm": 0.6243070981600599, + "learning_rate": 4.035312743947954e-07, + "loss": 0.1725, + "step": 9016 + }, + { + "epoch": 0.58, + "grad_norm": 1.4245762187153739, + "learning_rate": 4.0342994294077414e-07, + "loss": 0.1674, + "step": 9017 + }, + { + "epoch": 0.58, + "grad_norm": 0.22093514648280238, + "learning_rate": 4.0332861560662547e-07, + "loss": 0.0796, + "step": 9018 + }, + { + "epoch": 0.58, + "grad_norm": 0.7260767099304124, + "learning_rate": 4.0322729239667253e-07, + "loss": 0.1967, + "step": 9019 + }, + { + "epoch": 0.58, + "grad_norm": 0.7598463566865133, + "learning_rate": 4.031259733152377e-07, + "loss": 0.2083, + "step": 9020 + }, + { + "epoch": 0.58, + "grad_norm": 2.661778512892847, + "learning_rate": 4.030246583666437e-07, + "loss": 0.211, + "step": 9021 + }, + { + "epoch": 0.58, + "grad_norm": 0.5116657482186662, + "learning_rate": 4.029233475552125e-07, + "loss": 0.2962, + "step": 9022 + }, + { + "epoch": 0.58, + "grad_norm": 0.4688982985790497, + "learning_rate": 4.0282204088526674e-07, + "loss": 0.1241, + "step": 9023 + }, + { + "epoch": 0.58, + "grad_norm": 0.45749741855101583, + "learning_rate": 4.0272073836112786e-07, + "loss": 0.4121, + "step": 9024 + }, + { + "epoch": 0.58, + "grad_norm": 0.9434951366675195, + "learning_rate": 4.0261943998711803e-07, + "loss": 0.1673, + "step": 9025 + }, + { + "epoch": 0.58, + "grad_norm": 0.20550577244305945, + "learning_rate": 4.025181457675587e-07, + "loss": 0.0588, + "step": 9026 + }, + { + "epoch": 0.58, + "grad_norm": 2.428451287773308, + "learning_rate": 4.024168557067709e-07, + "loss": 0.1814, + "step": 9027 + }, + { + "epoch": 0.58, + "grad_norm": 0.8381884433585038, + "learning_rate": 4.023155698090764e-07, + "loss": 0.2038, + "step": 9028 + }, + { + "epoch": 0.58, + "grad_norm": 0.7965236383737759, + "learning_rate": 4.02214288078796e-07, + "loss": 0.1706, + "step": 9029 + }, + { + "epoch": 0.58, + "grad_norm": 0.6777519886371323, + "learning_rate": 4.021130105202507e-07, + "loss": 0.0079, + "step": 9030 + }, + { + "epoch": 0.58, + "grad_norm": 2.5632899213811604, + "learning_rate": 4.0201173713776105e-07, + "loss": 0.156, + "step": 9031 + }, + { + "epoch": 0.58, + "grad_norm": 2.9247967735208444, + "learning_rate": 4.0191046793564787e-07, + "loss": 0.3693, + "step": 9032 + }, + { + "epoch": 0.58, + "grad_norm": 0.5713296572055833, + "learning_rate": 4.0180920291823114e-07, + "loss": 0.3208, + "step": 9033 + }, + { + "epoch": 0.58, + "grad_norm": 0.5066304749829127, + "learning_rate": 4.017079420898314e-07, + "loss": 0.1027, + "step": 9034 + }, + { + "epoch": 0.58, + "grad_norm": 0.7147742170935585, + "learning_rate": 4.016066854547682e-07, + "loss": 0.1065, + "step": 9035 + }, + { + "epoch": 0.58, + "grad_norm": 1.3069685824116117, + "learning_rate": 4.015054330173618e-07, + "loss": 0.3056, + "step": 9036 + }, + { + "epoch": 0.58, + "grad_norm": 0.667330074309844, + "learning_rate": 4.014041847819314e-07, + "loss": 0.1823, + "step": 9037 + }, + { + "epoch": 0.58, + "grad_norm": 0.9773945518802425, + "learning_rate": 4.013029407527968e-07, + "loss": 0.1436, + "step": 9038 + }, + { + "epoch": 0.58, + "grad_norm": 1.2996877867987215, + "learning_rate": 4.012017009342773e-07, + "loss": 0.1427, + "step": 9039 + }, + { + "epoch": 0.58, + "grad_norm": 1.4452488615195211, + "learning_rate": 4.0110046533069157e-07, + "loss": 0.3065, + "step": 9040 + }, + { + "epoch": 0.58, + "grad_norm": 1.30645353995812, + "learning_rate": 4.00999233946359e-07, + "loss": 0.333, + "step": 9041 + }, + { + "epoch": 0.58, + "grad_norm": 0.4956683821175165, + "learning_rate": 4.0089800678559803e-07, + "loss": 0.1727, + "step": 9042 + }, + { + "epoch": 0.58, + "grad_norm": 5.739914002270914, + "learning_rate": 4.007967838527274e-07, + "loss": 0.0503, + "step": 9043 + }, + { + "epoch": 0.58, + "grad_norm": 0.3603300123768585, + "learning_rate": 4.0069556515206527e-07, + "loss": 0.1889, + "step": 9044 + }, + { + "epoch": 0.58, + "grad_norm": 0.9760889355895916, + "learning_rate": 4.005943506879301e-07, + "loss": 0.077, + "step": 9045 + }, + { + "epoch": 0.58, + "grad_norm": 1.081222799612338, + "learning_rate": 4.004931404646397e-07, + "loss": 0.3447, + "step": 9046 + }, + { + "epoch": 0.58, + "grad_norm": 1.4811212383192218, + "learning_rate": 4.003919344865119e-07, + "loss": 0.2423, + "step": 9047 + }, + { + "epoch": 0.58, + "grad_norm": 2.2905747419769944, + "learning_rate": 4.0029073275786436e-07, + "loss": 0.0074, + "step": 9048 + }, + { + "epoch": 0.58, + "grad_norm": 7.09817365571362, + "learning_rate": 4.001895352830148e-07, + "loss": 0.0707, + "step": 9049 + }, + { + "epoch": 0.58, + "grad_norm": 0.9445935114472639, + "learning_rate": 4.000883420662801e-07, + "loss": 0.2193, + "step": 9050 + }, + { + "epoch": 0.58, + "grad_norm": 1.4456565270365584, + "learning_rate": 3.9998715311197783e-07, + "loss": 0.2212, + "step": 9051 + }, + { + "epoch": 0.58, + "grad_norm": 1.0246092475515236, + "learning_rate": 3.998859684244244e-07, + "loss": 0.2089, + "step": 9052 + }, + { + "epoch": 0.58, + "grad_norm": 10.215196055215365, + "learning_rate": 3.99784788007937e-07, + "loss": 0.3612, + "step": 9053 + }, + { + "epoch": 0.58, + "grad_norm": 0.7285277373881145, + "learning_rate": 3.99683611866832e-07, + "loss": 0.1118, + "step": 9054 + }, + { + "epoch": 0.58, + "grad_norm": 7.310467250687471, + "learning_rate": 3.9958244000542566e-07, + "loss": 0.2895, + "step": 9055 + }, + { + "epoch": 0.58, + "grad_norm": 1.721133612694442, + "learning_rate": 3.9948127242803437e-07, + "loss": 0.1636, + "step": 9056 + }, + { + "epoch": 0.58, + "grad_norm": 3.025961345957209, + "learning_rate": 3.993801091389739e-07, + "loss": 0.1483, + "step": 9057 + }, + { + "epoch": 0.58, + "grad_norm": 0.9314000741301101, + "learning_rate": 3.9927895014256053e-07, + "loss": 0.1196, + "step": 9058 + }, + { + "epoch": 0.58, + "grad_norm": 16.420191387188993, + "learning_rate": 3.9917779544310935e-07, + "loss": 0.1613, + "step": 9059 + }, + { + "epoch": 0.58, + "grad_norm": 0.4235913058231204, + "learning_rate": 3.990766450449363e-07, + "loss": 0.1023, + "step": 9060 + }, + { + "epoch": 0.58, + "grad_norm": 1.2195491324790444, + "learning_rate": 3.989754989523563e-07, + "loss": 0.2947, + "step": 9061 + }, + { + "epoch": 0.58, + "grad_norm": 3.7037524693812025, + "learning_rate": 3.9887435716968484e-07, + "loss": 0.1254, + "step": 9062 + }, + { + "epoch": 0.58, + "grad_norm": 1.5073221242159822, + "learning_rate": 3.987732197012363e-07, + "loss": 0.2778, + "step": 9063 + }, + { + "epoch": 0.58, + "grad_norm": 1.8665317055890085, + "learning_rate": 3.9867208655132586e-07, + "loss": 0.0605, + "step": 9064 + }, + { + "epoch": 0.58, + "grad_norm": 4.762887116434521, + "learning_rate": 3.9857095772426784e-07, + "loss": 0.0755, + "step": 9065 + }, + { + "epoch": 0.58, + "grad_norm": 0.9474658750577679, + "learning_rate": 3.9846983322437667e-07, + "loss": 0.1098, + "step": 9066 + }, + { + "epoch": 0.58, + "grad_norm": 0.36958400160465527, + "learning_rate": 3.983687130559666e-07, + "loss": 0.1108, + "step": 9067 + }, + { + "epoch": 0.58, + "grad_norm": 0.7325477158345699, + "learning_rate": 3.982675972233514e-07, + "loss": 0.2775, + "step": 9068 + }, + { + "epoch": 0.58, + "grad_norm": 0.6345864761335269, + "learning_rate": 3.981664857308452e-07, + "loss": 0.07, + "step": 9069 + }, + { + "epoch": 0.58, + "grad_norm": 0.5382803473694683, + "learning_rate": 3.9806537858276124e-07, + "loss": 0.2797, + "step": 9070 + }, + { + "epoch": 0.58, + "grad_norm": 1.163239855383378, + "learning_rate": 3.979642757834133e-07, + "loss": 0.1866, + "step": 9071 + }, + { + "epoch": 0.58, + "grad_norm": 0.5826955762141282, + "learning_rate": 3.9786317733711427e-07, + "loss": 0.1656, + "step": 9072 + }, + { + "epoch": 0.58, + "grad_norm": 1.8460086516001164, + "learning_rate": 3.9776208324817755e-07, + "loss": 0.096, + "step": 9073 + }, + { + "epoch": 0.58, + "grad_norm": 0.8705191724337407, + "learning_rate": 3.9766099352091587e-07, + "loss": 0.3196, + "step": 9074 + }, + { + "epoch": 0.58, + "grad_norm": 0.5142661317455479, + "learning_rate": 3.975599081596419e-07, + "loss": 0.156, + "step": 9075 + }, + { + "epoch": 0.58, + "grad_norm": 0.6104047618995742, + "learning_rate": 3.9745882716866813e-07, + "loss": 0.031, + "step": 9076 + }, + { + "epoch": 0.58, + "grad_norm": 0.7345028381558942, + "learning_rate": 3.9735775055230714e-07, + "loss": 0.355, + "step": 9077 + }, + { + "epoch": 0.58, + "grad_norm": 6.588846068939626, + "learning_rate": 3.972566783148706e-07, + "loss": 0.1142, + "step": 9078 + }, + { + "epoch": 0.58, + "grad_norm": 7.341741735597102, + "learning_rate": 3.9715561046067094e-07, + "loss": 0.1682, + "step": 9079 + }, + { + "epoch": 0.58, + "grad_norm": 0.1621309997240818, + "learning_rate": 3.9705454699401963e-07, + "loss": 0.0015, + "step": 9080 + }, + { + "epoch": 0.58, + "grad_norm": 1.4465533222689928, + "learning_rate": 3.969534879192281e-07, + "loss": 0.2088, + "step": 9081 + }, + { + "epoch": 0.58, + "grad_norm": 0.3472903473872619, + "learning_rate": 3.9685243324060807e-07, + "loss": 0.1967, + "step": 9082 + }, + { + "epoch": 0.58, + "grad_norm": 0.89663315989137, + "learning_rate": 3.9675138296247045e-07, + "loss": 0.0303, + "step": 9083 + }, + { + "epoch": 0.58, + "grad_norm": 0.6255986993763517, + "learning_rate": 3.966503370891266e-07, + "loss": 0.1166, + "step": 9084 + }, + { + "epoch": 0.58, + "grad_norm": 2.3405934575042138, + "learning_rate": 3.9654929562488693e-07, + "loss": 0.2685, + "step": 9085 + }, + { + "epoch": 0.58, + "grad_norm": 1.5722119128549745, + "learning_rate": 3.964482585740625e-07, + "loss": 0.1406, + "step": 9086 + }, + { + "epoch": 0.58, + "grad_norm": 0.24404177857455117, + "learning_rate": 3.9634722594096325e-07, + "loss": 0.0997, + "step": 9087 + }, + { + "epoch": 0.58, + "grad_norm": 4.681805078870619, + "learning_rate": 3.962461977299e-07, + "loss": 0.2331, + "step": 9088 + }, + { + "epoch": 0.58, + "grad_norm": 0.8243376873980711, + "learning_rate": 3.961451739451823e-07, + "loss": 0.1161, + "step": 9089 + }, + { + "epoch": 0.58, + "grad_norm": 0.7757202223089231, + "learning_rate": 3.960441545911204e-07, + "loss": 0.0637, + "step": 9090 + }, + { + "epoch": 0.58, + "grad_norm": 2.7456104967572204, + "learning_rate": 3.959431396720237e-07, + "loss": 0.083, + "step": 9091 + }, + { + "epoch": 0.58, + "grad_norm": 1.1350634707790515, + "learning_rate": 3.9584212919220196e-07, + "loss": 0.2344, + "step": 9092 + }, + { + "epoch": 0.58, + "grad_norm": 0.878038497687454, + "learning_rate": 3.957411231559642e-07, + "loss": 0.1174, + "step": 9093 + }, + { + "epoch": 0.58, + "grad_norm": 1.0787155370288095, + "learning_rate": 3.9564012156761994e-07, + "loss": 0.1609, + "step": 9094 + }, + { + "epoch": 0.58, + "grad_norm": 0.7154063988683648, + "learning_rate": 3.955391244314779e-07, + "loss": 0.05, + "step": 9095 + }, + { + "epoch": 0.58, + "grad_norm": 0.8266407603287446, + "learning_rate": 3.954381317518466e-07, + "loss": 0.168, + "step": 9096 + }, + { + "epoch": 0.58, + "grad_norm": 2.122799645277665, + "learning_rate": 3.95337143533035e-07, + "loss": 0.3171, + "step": 9097 + }, + { + "epoch": 0.58, + "grad_norm": 0.6984134699333916, + "learning_rate": 3.9523615977935097e-07, + "loss": 0.3383, + "step": 9098 + }, + { + "epoch": 0.58, + "grad_norm": 1.1842718551636655, + "learning_rate": 3.951351804951032e-07, + "loss": 0.0621, + "step": 9099 + }, + { + "epoch": 0.58, + "grad_norm": 1.1785945127202282, + "learning_rate": 3.9503420568459936e-07, + "loss": 0.3907, + "step": 9100 + }, + { + "epoch": 0.58, + "grad_norm": 0.6463394683670811, + "learning_rate": 3.9493323535214737e-07, + "loss": 0.1419, + "step": 9101 + }, + { + "epoch": 0.58, + "grad_norm": 1.0666113227064726, + "learning_rate": 3.948322695020546e-07, + "loss": 0.158, + "step": 9102 + }, + { + "epoch": 0.58, + "grad_norm": 0.3029223594752769, + "learning_rate": 3.947313081386289e-07, + "loss": 0.0003, + "step": 9103 + }, + { + "epoch": 0.58, + "grad_norm": 1.2458121378905718, + "learning_rate": 3.9463035126617696e-07, + "loss": 0.1343, + "step": 9104 + }, + { + "epoch": 0.58, + "grad_norm": 1.5893834935333186, + "learning_rate": 3.945293988890062e-07, + "loss": 0.259, + "step": 9105 + }, + { + "epoch": 0.58, + "grad_norm": 2.3004152307314003, + "learning_rate": 3.944284510114232e-07, + "loss": 0.3243, + "step": 9106 + }, + { + "epoch": 0.58, + "grad_norm": 0.5497385037057883, + "learning_rate": 3.943275076377349e-07, + "loss": 0.1432, + "step": 9107 + }, + { + "epoch": 0.58, + "grad_norm": 0.7587435307983558, + "learning_rate": 3.942265687722474e-07, + "loss": 0.3035, + "step": 9108 + }, + { + "epoch": 0.58, + "grad_norm": 1.4121270975608575, + "learning_rate": 3.9412563441926706e-07, + "loss": 0.2223, + "step": 9109 + }, + { + "epoch": 0.58, + "grad_norm": 1.3152190806198152, + "learning_rate": 3.940247045831001e-07, + "loss": 0.3556, + "step": 9110 + }, + { + "epoch": 0.58, + "grad_norm": 1.077669034473468, + "learning_rate": 3.939237792680522e-07, + "loss": 0.3115, + "step": 9111 + }, + { + "epoch": 0.58, + "grad_norm": 0.5996167287361195, + "learning_rate": 3.938228584784292e-07, + "loss": 0.3228, + "step": 9112 + }, + { + "epoch": 0.58, + "grad_norm": 0.8498250818324076, + "learning_rate": 3.9372194221853636e-07, + "loss": 0.3961, + "step": 9113 + }, + { + "epoch": 0.58, + "grad_norm": 1.9441137172312675, + "learning_rate": 3.9362103049267934e-07, + "loss": 0.0193, + "step": 9114 + }, + { + "epoch": 0.58, + "grad_norm": 1.1079625665689703, + "learning_rate": 3.9352012330516275e-07, + "loss": 0.377, + "step": 9115 + }, + { + "epoch": 0.58, + "grad_norm": 0.5648126864513543, + "learning_rate": 3.9341922066029205e-07, + "loss": 0.1762, + "step": 9116 + }, + { + "epoch": 0.58, + "grad_norm": 0.2391696149516038, + "learning_rate": 3.9331832256237136e-07, + "loss": 0.0849, + "step": 9117 + }, + { + "epoch": 0.58, + "grad_norm": 28.27204448115714, + "learning_rate": 3.932174290157056e-07, + "loss": 0.291, + "step": 9118 + }, + { + "epoch": 0.58, + "grad_norm": 1.1459072452225505, + "learning_rate": 3.9311654002459896e-07, + "loss": 0.0047, + "step": 9119 + }, + { + "epoch": 0.58, + "grad_norm": 14.449940850162786, + "learning_rate": 3.9301565559335565e-07, + "loss": 0.3272, + "step": 9120 + }, + { + "epoch": 0.58, + "grad_norm": 0.907058481995523, + "learning_rate": 3.929147757262794e-07, + "loss": 0.099, + "step": 9121 + }, + { + "epoch": 0.58, + "grad_norm": 6.601482362661088, + "learning_rate": 3.9281390042767423e-07, + "loss": 0.0294, + "step": 9122 + }, + { + "epoch": 0.58, + "grad_norm": 0.695113172316583, + "learning_rate": 3.927130297018436e-07, + "loss": 0.2036, + "step": 9123 + }, + { + "epoch": 0.58, + "grad_norm": 0.4512606804656528, + "learning_rate": 3.9261216355309063e-07, + "loss": 0.1826, + "step": 9124 + }, + { + "epoch": 0.58, + "grad_norm": 0.40888229553450833, + "learning_rate": 3.925113019857187e-07, + "loss": 0.0128, + "step": 9125 + }, + { + "epoch": 0.58, + "grad_norm": 8.352895839425, + "learning_rate": 3.924104450040307e-07, + "loss": 0.2253, + "step": 9126 + }, + { + "epoch": 0.58, + "grad_norm": 0.8631112591496051, + "learning_rate": 3.9230959261232943e-07, + "loss": 0.1636, + "step": 9127 + }, + { + "epoch": 0.58, + "grad_norm": 0.9239130250737256, + "learning_rate": 3.9220874481491726e-07, + "loss": 0.4013, + "step": 9128 + }, + { + "epoch": 0.58, + "grad_norm": 0.8545318574040791, + "learning_rate": 3.92107901616097e-07, + "loss": 0.123, + "step": 9129 + }, + { + "epoch": 0.58, + "grad_norm": 0.6277111029733823, + "learning_rate": 3.920070630201703e-07, + "loss": 0.1028, + "step": 9130 + }, + { + "epoch": 0.58, + "grad_norm": 7.025476062430447, + "learning_rate": 3.9190622903143953e-07, + "loss": 0.0061, + "step": 9131 + }, + { + "epoch": 0.58, + "grad_norm": 0.43370590315597163, + "learning_rate": 3.918053996542062e-07, + "loss": 0.0681, + "step": 9132 + }, + { + "epoch": 0.58, + "grad_norm": 0.21568988514208015, + "learning_rate": 3.9170457489277227e-07, + "loss": 0.0934, + "step": 9133 + }, + { + "epoch": 0.58, + "grad_norm": 1.2134802163903566, + "learning_rate": 3.916037547514386e-07, + "loss": 0.48, + "step": 9134 + }, + { + "epoch": 0.58, + "grad_norm": 0.44384820722314655, + "learning_rate": 3.9150293923450684e-07, + "loss": 0.2024, + "step": 9135 + }, + { + "epoch": 0.58, + "grad_norm": 0.5290360688562474, + "learning_rate": 3.914021283462777e-07, + "loss": 0.2915, + "step": 9136 + }, + { + "epoch": 0.58, + "grad_norm": 4.199319608565975, + "learning_rate": 3.9130132209105207e-07, + "loss": 0.0644, + "step": 9137 + }, + { + "epoch": 0.58, + "grad_norm": 0.8110329089991334, + "learning_rate": 3.912005204731307e-07, + "loss": 0.2226, + "step": 9138 + }, + { + "epoch": 0.58, + "grad_norm": 1.6090097521712758, + "learning_rate": 3.9109972349681357e-07, + "loss": 0.1522, + "step": 9139 + }, + { + "epoch": 0.58, + "grad_norm": 1.0628346686616439, + "learning_rate": 3.909989311664014e-07, + "loss": 0.1917, + "step": 9140 + }, + { + "epoch": 0.58, + "grad_norm": 1.0460528081900293, + "learning_rate": 3.9089814348619376e-07, + "loss": 0.4451, + "step": 9141 + }, + { + "epoch": 0.58, + "grad_norm": 0.8136537433216505, + "learning_rate": 3.907973604604908e-07, + "loss": 0.1841, + "step": 9142 + }, + { + "epoch": 0.58, + "grad_norm": 1.2886978776955402, + "learning_rate": 3.9069658209359186e-07, + "loss": 0.266, + "step": 9143 + }, + { + "epoch": 0.58, + "grad_norm": 1.202882225819883, + "learning_rate": 3.905958083897965e-07, + "loss": 0.2374, + "step": 9144 + }, + { + "epoch": 0.58, + "grad_norm": 0.500113987439878, + "learning_rate": 3.9049503935340386e-07, + "loss": 0.2706, + "step": 9145 + }, + { + "epoch": 0.58, + "grad_norm": 1.1938992054164177, + "learning_rate": 3.90394274988713e-07, + "loss": 0.143, + "step": 9146 + }, + { + "epoch": 0.58, + "grad_norm": 3.7248861178693087, + "learning_rate": 3.9029351530002264e-07, + "loss": 0.0849, + "step": 9147 + }, + { + "epoch": 0.58, + "grad_norm": 0.8164298406079303, + "learning_rate": 3.901927602916316e-07, + "loss": 0.0368, + "step": 9148 + }, + { + "epoch": 0.58, + "grad_norm": 0.7645650435096253, + "learning_rate": 3.900920099678382e-07, + "loss": 0.1669, + "step": 9149 + }, + { + "epoch": 0.58, + "grad_norm": 0.6462155620964484, + "learning_rate": 3.899912643329403e-07, + "loss": 0.0824, + "step": 9150 + }, + { + "epoch": 0.58, + "grad_norm": 0.6100529014449996, + "learning_rate": 3.898905233912365e-07, + "loss": 0.3558, + "step": 9151 + }, + { + "epoch": 0.58, + "grad_norm": 2.44330946916766, + "learning_rate": 3.897897871470241e-07, + "loss": 0.004, + "step": 9152 + }, + { + "epoch": 0.58, + "grad_norm": 0.7378142353289706, + "learning_rate": 3.8968905560460095e-07, + "loss": 0.0732, + "step": 9153 + }, + { + "epoch": 0.58, + "grad_norm": 0.7624836196576208, + "learning_rate": 3.895883287682644e-07, + "loss": 0.2406, + "step": 9154 + }, + { + "epoch": 0.58, + "grad_norm": 1.1989838823723706, + "learning_rate": 3.8948760664231194e-07, + "loss": 0.2578, + "step": 9155 + }, + { + "epoch": 0.58, + "grad_norm": 0.38854518237118085, + "learning_rate": 3.893868892310401e-07, + "loss": 0.0291, + "step": 9156 + }, + { + "epoch": 0.58, + "grad_norm": 0.6847271574043243, + "learning_rate": 3.8928617653874616e-07, + "loss": 0.3524, + "step": 9157 + }, + { + "epoch": 0.58, + "grad_norm": 0.4334590327411633, + "learning_rate": 3.891854685697263e-07, + "loss": 0.0711, + "step": 9158 + }, + { + "epoch": 0.58, + "grad_norm": 0.411262064489176, + "learning_rate": 3.8908476532827727e-07, + "loss": 0.2449, + "step": 9159 + }, + { + "epoch": 0.58, + "grad_norm": 0.3337348207732892, + "learning_rate": 3.889840668186949e-07, + "loss": 0.1965, + "step": 9160 + }, + { + "epoch": 0.58, + "grad_norm": 1.129793996447344, + "learning_rate": 3.8888337304527564e-07, + "loss": 0.3591, + "step": 9161 + }, + { + "epoch": 0.58, + "grad_norm": 1.4164599484664437, + "learning_rate": 3.8878268401231487e-07, + "loss": 0.1566, + "step": 9162 + }, + { + "epoch": 0.58, + "grad_norm": 1.1135343748933653, + "learning_rate": 3.8868199972410846e-07, + "loss": 0.186, + "step": 9163 + }, + { + "epoch": 0.58, + "grad_norm": 0.9535641093883805, + "learning_rate": 3.8858132018495184e-07, + "loss": 0.2839, + "step": 9164 + }, + { + "epoch": 0.58, + "grad_norm": 0.4248583334239953, + "learning_rate": 3.884806453991398e-07, + "loss": 0.1362, + "step": 9165 + }, + { + "epoch": 0.58, + "grad_norm": 0.49284576972669875, + "learning_rate": 3.883799753709679e-07, + "loss": 0.3284, + "step": 9166 + }, + { + "epoch": 0.58, + "grad_norm": 1.028064848108264, + "learning_rate": 3.882793101047304e-07, + "loss": 0.2369, + "step": 9167 + }, + { + "epoch": 0.58, + "grad_norm": 0.2460391072109868, + "learning_rate": 3.8817864960472233e-07, + "loss": 0.009, + "step": 9168 + }, + { + "epoch": 0.58, + "grad_norm": 0.061349969952489876, + "learning_rate": 3.8807799387523765e-07, + "loss": 0.001, + "step": 9169 + }, + { + "epoch": 0.58, + "grad_norm": 0.5311125442366161, + "learning_rate": 3.8797734292057087e-07, + "loss": 0.0622, + "step": 9170 + }, + { + "epoch": 0.58, + "grad_norm": 0.8721603956515364, + "learning_rate": 3.8787669674501575e-07, + "loss": 0.1645, + "step": 9171 + }, + { + "epoch": 0.58, + "grad_norm": 1.1911385618025718, + "learning_rate": 3.877760553528663e-07, + "loss": 0.2388, + "step": 9172 + }, + { + "epoch": 0.58, + "grad_norm": 0.44380367715135, + "learning_rate": 3.876754187484157e-07, + "loss": 0.1058, + "step": 9173 + }, + { + "epoch": 0.59, + "grad_norm": 1.0451666161687792, + "learning_rate": 3.8757478693595777e-07, + "loss": 0.3121, + "step": 9174 + }, + { + "epoch": 0.59, + "grad_norm": 0.14264998394508208, + "learning_rate": 3.874741599197852e-07, + "loss": 0.0794, + "step": 9175 + }, + { + "epoch": 0.59, + "grad_norm": 1.6246629504104158, + "learning_rate": 3.873735377041914e-07, + "loss": 0.085, + "step": 9176 + }, + { + "epoch": 0.59, + "grad_norm": 0.5304588390550291, + "learning_rate": 3.872729202934689e-07, + "loss": 0.1211, + "step": 9177 + }, + { + "epoch": 0.59, + "grad_norm": 1.2139916211796347, + "learning_rate": 3.8717230769191003e-07, + "loss": 0.1638, + "step": 9178 + }, + { + "epoch": 0.59, + "grad_norm": 1.4945753870794083, + "learning_rate": 3.870716999038074e-07, + "loss": 0.4496, + "step": 9179 + }, + { + "epoch": 0.59, + "grad_norm": 0.4775768718401756, + "learning_rate": 3.869710969334531e-07, + "loss": 0.1229, + "step": 9180 + }, + { + "epoch": 0.59, + "grad_norm": 0.7109870809856619, + "learning_rate": 3.86870498785139e-07, + "loss": 0.2483, + "step": 9181 + }, + { + "epoch": 0.59, + "grad_norm": 0.9287785944134873, + "learning_rate": 3.8676990546315676e-07, + "loss": 0.1111, + "step": 9182 + }, + { + "epoch": 0.59, + "grad_norm": 0.7729595500918812, + "learning_rate": 3.8666931697179816e-07, + "loss": 0.2111, + "step": 9183 + }, + { + "epoch": 0.59, + "grad_norm": 0.6794761483883275, + "learning_rate": 3.8656873331535413e-07, + "loss": 0.2492, + "step": 9184 + }, + { + "epoch": 0.59, + "grad_norm": 0.4381350380622578, + "learning_rate": 3.8646815449811617e-07, + "loss": 0.0783, + "step": 9185 + }, + { + "epoch": 0.59, + "grad_norm": 0.7720450624517092, + "learning_rate": 3.863675805243747e-07, + "loss": 0.1668, + "step": 9186 + }, + { + "epoch": 0.59, + "grad_norm": 0.4921296405996789, + "learning_rate": 3.8626701139842094e-07, + "loss": 0.217, + "step": 9187 + }, + { + "epoch": 0.59, + "grad_norm": 0.8177338474651337, + "learning_rate": 3.8616644712454486e-07, + "loss": 0.1114, + "step": 9188 + }, + { + "epoch": 0.59, + "grad_norm": 5.166284876492476, + "learning_rate": 3.860658877070371e-07, + "loss": 0.1488, + "step": 9189 + }, + { + "epoch": 0.59, + "grad_norm": 12.143827540947076, + "learning_rate": 3.8596533315018775e-07, + "loss": 0.0627, + "step": 9190 + }, + { + "epoch": 0.59, + "grad_norm": 0.8860199853316247, + "learning_rate": 3.858647834582863e-07, + "loss": 0.2058, + "step": 9191 + }, + { + "epoch": 0.59, + "grad_norm": 0.8097965018820006, + "learning_rate": 3.857642386356228e-07, + "loss": 0.3329, + "step": 9192 + }, + { + "epoch": 0.59, + "grad_norm": 4.070774336932143, + "learning_rate": 3.8566369868648637e-07, + "loss": 0.1568, + "step": 9193 + }, + { + "epoch": 0.59, + "grad_norm": 1.2699900825217016, + "learning_rate": 3.855631636151666e-07, + "loss": 0.2194, + "step": 9194 + }, + { + "epoch": 0.59, + "grad_norm": 3.6469029270648288, + "learning_rate": 3.854626334259521e-07, + "loss": 0.2912, + "step": 9195 + }, + { + "epoch": 0.59, + "grad_norm": 1.5244764993476563, + "learning_rate": 3.853621081231321e-07, + "loss": 0.1268, + "step": 9196 + }, + { + "epoch": 0.59, + "grad_norm": 0.9962098492424587, + "learning_rate": 3.852615877109948e-07, + "loss": 0.2871, + "step": 9197 + }, + { + "epoch": 0.59, + "grad_norm": 4.7032776124392495, + "learning_rate": 3.851610721938289e-07, + "loss": 0.315, + "step": 9198 + }, + { + "epoch": 0.59, + "grad_norm": 0.8505050123015702, + "learning_rate": 3.850605615759225e-07, + "loss": 0.0576, + "step": 9199 + }, + { + "epoch": 0.59, + "grad_norm": 2.707335169164966, + "learning_rate": 3.849600558615637e-07, + "loss": 0.1038, + "step": 9200 + }, + { + "epoch": 0.59, + "grad_norm": 1.0816445579365792, + "learning_rate": 3.8485955505504004e-07, + "loss": 0.075, + "step": 9201 + }, + { + "epoch": 0.59, + "grad_norm": 0.1298037125380591, + "learning_rate": 3.8475905916063947e-07, + "loss": 0.0017, + "step": 9202 + }, + { + "epoch": 0.59, + "grad_norm": 1.2499967898855564, + "learning_rate": 3.8465856818264873e-07, + "loss": 0.0452, + "step": 9203 + }, + { + "epoch": 0.59, + "grad_norm": 0.7829148141341262, + "learning_rate": 3.8455808212535566e-07, + "loss": 0.2444, + "step": 9204 + }, + { + "epoch": 0.59, + "grad_norm": 0.45011960836309023, + "learning_rate": 3.844576009930468e-07, + "loss": 0.264, + "step": 9205 + }, + { + "epoch": 0.59, + "grad_norm": 2.1994074093175575, + "learning_rate": 3.843571247900089e-07, + "loss": 0.0186, + "step": 9206 + }, + { + "epoch": 0.59, + "grad_norm": 1.3517055045129158, + "learning_rate": 3.8425665352052854e-07, + "loss": 0.4352, + "step": 9207 + }, + { + "epoch": 0.59, + "grad_norm": 0.7661274484486612, + "learning_rate": 3.841561871888919e-07, + "loss": 0.1375, + "step": 9208 + }, + { + "epoch": 0.59, + "grad_norm": 0.7206736025970967, + "learning_rate": 3.8405572579938545e-07, + "loss": 0.1006, + "step": 9209 + }, + { + "epoch": 0.59, + "grad_norm": 1.4045134212470072, + "learning_rate": 3.839552693562945e-07, + "loss": 0.1919, + "step": 9210 + }, + { + "epoch": 0.59, + "grad_norm": 1.2670005349429003, + "learning_rate": 3.838548178639054e-07, + "loss": 0.191, + "step": 9211 + }, + { + "epoch": 0.59, + "grad_norm": 1.3860446118169985, + "learning_rate": 3.837543713265029e-07, + "loss": 0.1544, + "step": 9212 + }, + { + "epoch": 0.59, + "grad_norm": 0.6092347251192362, + "learning_rate": 3.8365392974837286e-07, + "loss": 0.0883, + "step": 9213 + }, + { + "epoch": 0.59, + "grad_norm": 0.7268909521142994, + "learning_rate": 3.8355349313379985e-07, + "loss": 0.507, + "step": 9214 + }, + { + "epoch": 0.59, + "grad_norm": 0.6860064248322543, + "learning_rate": 3.8345306148706903e-07, + "loss": 0.2763, + "step": 9215 + }, + { + "epoch": 0.59, + "grad_norm": 1.5713052815692077, + "learning_rate": 3.833526348124649e-07, + "loss": 0.4124, + "step": 9216 + }, + { + "epoch": 0.59, + "grad_norm": 0.7284454410432929, + "learning_rate": 3.8325221311427187e-07, + "loss": 0.3609, + "step": 9217 + }, + { + "epoch": 0.59, + "grad_norm": 0.5045497683987311, + "learning_rate": 3.831517963967742e-07, + "loss": 0.1256, + "step": 9218 + }, + { + "epoch": 0.59, + "grad_norm": 0.7560206150188047, + "learning_rate": 3.830513846642556e-07, + "loss": 0.3329, + "step": 9219 + }, + { + "epoch": 0.59, + "grad_norm": 1.5132619562727003, + "learning_rate": 3.829509779210002e-07, + "loss": 0.2106, + "step": 9220 + }, + { + "epoch": 0.59, + "grad_norm": 0.6174686597443946, + "learning_rate": 3.828505761712912e-07, + "loss": 0.2803, + "step": 9221 + }, + { + "epoch": 0.59, + "grad_norm": 1.8508200040860678, + "learning_rate": 3.827501794194123e-07, + "loss": 0.3196, + "step": 9222 + }, + { + "epoch": 0.59, + "grad_norm": 2.1476513717416568, + "learning_rate": 3.8264978766964627e-07, + "loss": 0.1843, + "step": 9223 + }, + { + "epoch": 0.59, + "grad_norm": 0.49750688205076526, + "learning_rate": 3.825494009262763e-07, + "loss": 0.1874, + "step": 9224 + }, + { + "epoch": 0.59, + "grad_norm": 0.6492510892939429, + "learning_rate": 3.8244901919358486e-07, + "loss": 0.354, + "step": 9225 + }, + { + "epoch": 0.59, + "grad_norm": 0.6144163882567673, + "learning_rate": 3.823486424758548e-07, + "loss": 0.2499, + "step": 9226 + }, + { + "epoch": 0.59, + "grad_norm": 1.003569649754277, + "learning_rate": 3.82248270777368e-07, + "loss": 0.2284, + "step": 9227 + }, + { + "epoch": 0.59, + "grad_norm": 1.2232116338891916, + "learning_rate": 3.8214790410240685e-07, + "loss": 0.2695, + "step": 9228 + }, + { + "epoch": 0.59, + "grad_norm": 0.5615557423366042, + "learning_rate": 3.820475424552527e-07, + "loss": 0.2816, + "step": 9229 + }, + { + "epoch": 0.59, + "grad_norm": 0.5172325479600991, + "learning_rate": 3.8194718584018787e-07, + "loss": 0.2323, + "step": 9230 + }, + { + "epoch": 0.59, + "grad_norm": 1.339617527615744, + "learning_rate": 3.8184683426149315e-07, + "loss": 0.2372, + "step": 9231 + }, + { + "epoch": 0.59, + "grad_norm": 8.635007534459703, + "learning_rate": 3.817464877234502e-07, + "loss": 0.2388, + "step": 9232 + }, + { + "epoch": 0.59, + "grad_norm": 1.1168657342703339, + "learning_rate": 3.816461462303397e-07, + "loss": 0.1081, + "step": 9233 + }, + { + "epoch": 0.59, + "grad_norm": 0.8265448121999057, + "learning_rate": 3.815458097864424e-07, + "loss": 0.3288, + "step": 9234 + }, + { + "epoch": 0.59, + "grad_norm": 1.4160038277515097, + "learning_rate": 3.8144547839603923e-07, + "loss": 0.0752, + "step": 9235 + }, + { + "epoch": 0.59, + "grad_norm": 1.2909499803288882, + "learning_rate": 3.8134515206341005e-07, + "loss": 0.0294, + "step": 9236 + }, + { + "epoch": 0.59, + "grad_norm": 0.9679006141827274, + "learning_rate": 3.8124483079283543e-07, + "loss": 0.2623, + "step": 9237 + }, + { + "epoch": 0.59, + "grad_norm": 1.1028104956836575, + "learning_rate": 3.8114451458859483e-07, + "loss": 0.2275, + "step": 9238 + }, + { + "epoch": 0.59, + "grad_norm": 0.22196268196621946, + "learning_rate": 3.8104420345496837e-07, + "loss": 0.1058, + "step": 9239 + }, + { + "epoch": 0.59, + "grad_norm": 1.993825894678243, + "learning_rate": 3.809438973962351e-07, + "loss": 0.2151, + "step": 9240 + }, + { + "epoch": 0.59, + "grad_norm": 1.0319915627413767, + "learning_rate": 3.8084359641667476e-07, + "loss": 0.076, + "step": 9241 + }, + { + "epoch": 0.59, + "grad_norm": 0.07997144953621751, + "learning_rate": 3.8074330052056587e-07, + "loss": 0.0012, + "step": 9242 + }, + { + "epoch": 0.59, + "grad_norm": 0.7991306537992611, + "learning_rate": 3.806430097121876e-07, + "loss": 0.2895, + "step": 9243 + }, + { + "epoch": 0.59, + "grad_norm": 0.8036803252125937, + "learning_rate": 3.8054272399581844e-07, + "loss": 0.1936, + "step": 9244 + }, + { + "epoch": 0.59, + "grad_norm": 1.8102653795342827, + "learning_rate": 3.8044244337573694e-07, + "loss": 0.2513, + "step": 9245 + }, + { + "epoch": 0.59, + "grad_norm": 0.8675800393996959, + "learning_rate": 3.8034216785622125e-07, + "loss": 0.0464, + "step": 9246 + }, + { + "epoch": 0.59, + "grad_norm": 1.2588834721619737, + "learning_rate": 3.802418974415489e-07, + "loss": 0.3701, + "step": 9247 + }, + { + "epoch": 0.59, + "grad_norm": 1.0249739483080074, + "learning_rate": 3.8014163213599824e-07, + "loss": 0.0737, + "step": 9248 + }, + { + "epoch": 0.59, + "grad_norm": 14.622398411848904, + "learning_rate": 3.800413719438463e-07, + "loss": 0.3292, + "step": 9249 + }, + { + "epoch": 0.59, + "grad_norm": 0.40679487772653566, + "learning_rate": 3.7994111686937066e-07, + "loss": 0.2561, + "step": 9250 + }, + { + "epoch": 0.59, + "grad_norm": 0.2984174953844867, + "learning_rate": 3.798408669168483e-07, + "loss": 0.0028, + "step": 9251 + }, + { + "epoch": 0.59, + "grad_norm": 0.4858604040673141, + "learning_rate": 3.7974062209055615e-07, + "loss": 0.1167, + "step": 9252 + }, + { + "epoch": 0.59, + "grad_norm": 1.563922569166749, + "learning_rate": 3.7964038239477065e-07, + "loss": 0.3193, + "step": 9253 + }, + { + "epoch": 0.59, + "grad_norm": 0.650429162744675, + "learning_rate": 3.7954014783376865e-07, + "loss": 0.1074, + "step": 9254 + }, + { + "epoch": 0.59, + "grad_norm": 9.61252308008292, + "learning_rate": 3.794399184118258e-07, + "loss": 0.1816, + "step": 9255 + }, + { + "epoch": 0.59, + "grad_norm": 0.6142621497538089, + "learning_rate": 3.793396941332187e-07, + "loss": 0.0874, + "step": 9256 + }, + { + "epoch": 0.59, + "grad_norm": 1.2884583985225477, + "learning_rate": 3.7923947500222256e-07, + "loss": 0.1975, + "step": 9257 + }, + { + "epoch": 0.59, + "grad_norm": 11.110770340974891, + "learning_rate": 3.7913926102311337e-07, + "loss": 0.1255, + "step": 9258 + }, + { + "epoch": 0.59, + "grad_norm": 0.3867503123187839, + "learning_rate": 3.790390522001662e-07, + "loss": 0.08, + "step": 9259 + }, + { + "epoch": 0.59, + "grad_norm": 1.3152861298882226, + "learning_rate": 3.789388485376561e-07, + "loss": 0.1842, + "step": 9260 + }, + { + "epoch": 0.59, + "grad_norm": 0.6016674065452318, + "learning_rate": 3.7883865003985824e-07, + "loss": 0.3036, + "step": 9261 + }, + { + "epoch": 0.59, + "grad_norm": 1.0199353647149971, + "learning_rate": 3.78738456711047e-07, + "loss": 0.1298, + "step": 9262 + }, + { + "epoch": 0.59, + "grad_norm": 1.5743923291555721, + "learning_rate": 3.786382685554972e-07, + "loss": 0.2609, + "step": 9263 + }, + { + "epoch": 0.59, + "grad_norm": 1.1420267270789695, + "learning_rate": 3.785380855774826e-07, + "loss": 0.2284, + "step": 9264 + }, + { + "epoch": 0.59, + "grad_norm": 0.3823484926973197, + "learning_rate": 3.784379077812776e-07, + "loss": 0.1094, + "step": 9265 + }, + { + "epoch": 0.59, + "grad_norm": 0.6768785129797791, + "learning_rate": 3.783377351711556e-07, + "loss": 0.4861, + "step": 9266 + }, + { + "epoch": 0.59, + "grad_norm": 0.493585718026856, + "learning_rate": 3.7823756775139064e-07, + "loss": 0.1025, + "step": 9267 + }, + { + "epoch": 0.59, + "grad_norm": 0.5133365280139011, + "learning_rate": 3.781374055262556e-07, + "loss": 0.1072, + "step": 9268 + }, + { + "epoch": 0.59, + "grad_norm": 0.6378779444642336, + "learning_rate": 3.7803724850002386e-07, + "loss": 0.3225, + "step": 9269 + }, + { + "epoch": 0.59, + "grad_norm": 1.6305097473210857, + "learning_rate": 3.779370966769683e-07, + "loss": 0.0843, + "step": 9270 + }, + { + "epoch": 0.59, + "grad_norm": 0.7640534442975933, + "learning_rate": 3.7783695006136166e-07, + "loss": 0.3095, + "step": 9271 + }, + { + "epoch": 0.59, + "grad_norm": 0.7972338856128529, + "learning_rate": 3.7773680865747614e-07, + "loss": 0.0946, + "step": 9272 + }, + { + "epoch": 0.59, + "grad_norm": 0.3971028010300925, + "learning_rate": 3.776366724695844e-07, + "loss": 0.3012, + "step": 9273 + }, + { + "epoch": 0.59, + "grad_norm": 2.669258071981314, + "learning_rate": 3.775365415019581e-07, + "loss": 0.1948, + "step": 9274 + }, + { + "epoch": 0.59, + "grad_norm": 0.8696804014722496, + "learning_rate": 3.77436415758869e-07, + "loss": 0.0936, + "step": 9275 + }, + { + "epoch": 0.59, + "grad_norm": 2.9205455098351596, + "learning_rate": 3.773362952445889e-07, + "loss": 0.1226, + "step": 9276 + }, + { + "epoch": 0.59, + "grad_norm": 0.7484490139593507, + "learning_rate": 3.77236179963389e-07, + "loss": 0.2307, + "step": 9277 + }, + { + "epoch": 0.59, + "grad_norm": 0.7675191985092316, + "learning_rate": 3.771360699195404e-07, + "loss": 0.088, + "step": 9278 + }, + { + "epoch": 0.59, + "grad_norm": 0.48411160950391235, + "learning_rate": 3.770359651173139e-07, + "loss": 0.1327, + "step": 9279 + }, + { + "epoch": 0.59, + "grad_norm": 2.966301264369236, + "learning_rate": 3.769358655609807e-07, + "loss": 0.1546, + "step": 9280 + }, + { + "epoch": 0.59, + "grad_norm": 0.6779362212885075, + "learning_rate": 3.7683577125481053e-07, + "loss": 0.2624, + "step": 9281 + }, + { + "epoch": 0.59, + "grad_norm": 0.7683336809436724, + "learning_rate": 3.7673568220307417e-07, + "loss": 0.0865, + "step": 9282 + }, + { + "epoch": 0.59, + "grad_norm": 0.36820261982633506, + "learning_rate": 3.766355984100412e-07, + "loss": 0.0808, + "step": 9283 + }, + { + "epoch": 0.59, + "grad_norm": 0.6542238951265417, + "learning_rate": 3.7653551987998185e-07, + "loss": 0.0406, + "step": 9284 + }, + { + "epoch": 0.59, + "grad_norm": 1.2466414346265247, + "learning_rate": 3.7643544661716514e-07, + "loss": 0.1965, + "step": 9285 + }, + { + "epoch": 0.59, + "grad_norm": 0.9634669945912673, + "learning_rate": 3.7633537862586085e-07, + "loss": 0.5376, + "step": 9286 + }, + { + "epoch": 0.59, + "grad_norm": 10.43694322912159, + "learning_rate": 3.7623531591033795e-07, + "loss": 0.1466, + "step": 9287 + }, + { + "epoch": 0.59, + "grad_norm": 4.7176798163861555, + "learning_rate": 3.76135258474865e-07, + "loss": 0.1206, + "step": 9288 + }, + { + "epoch": 0.59, + "grad_norm": 3.3216627653953292, + "learning_rate": 3.7603520632371127e-07, + "loss": 0.0452, + "step": 9289 + }, + { + "epoch": 0.59, + "grad_norm": 20.910602243800536, + "learning_rate": 3.7593515946114457e-07, + "loss": 0.2689, + "step": 9290 + }, + { + "epoch": 0.59, + "grad_norm": 1.3322446981226637, + "learning_rate": 3.758351178914336e-07, + "loss": 0.2586, + "step": 9291 + }, + { + "epoch": 0.59, + "grad_norm": 1.564679148620845, + "learning_rate": 3.7573508161884587e-07, + "loss": 0.0327, + "step": 9292 + }, + { + "epoch": 0.59, + "grad_norm": 0.7216668245555732, + "learning_rate": 3.7563505064764953e-07, + "loss": 0.2728, + "step": 9293 + }, + { + "epoch": 0.59, + "grad_norm": 0.9364007422457261, + "learning_rate": 3.755350249821118e-07, + "loss": 0.2889, + "step": 9294 + }, + { + "epoch": 0.59, + "grad_norm": 0.5974168987425299, + "learning_rate": 3.754350046265002e-07, + "loss": 0.1377, + "step": 9295 + }, + { + "epoch": 0.59, + "grad_norm": 0.6638751878720911, + "learning_rate": 3.753349895850817e-07, + "loss": 0.2111, + "step": 9296 + }, + { + "epoch": 0.59, + "grad_norm": 0.4677716160403327, + "learning_rate": 3.7523497986212317e-07, + "loss": 0.0553, + "step": 9297 + }, + { + "epoch": 0.59, + "grad_norm": 1.1163588469681387, + "learning_rate": 3.7513497546189103e-07, + "loss": 0.1391, + "step": 9298 + }, + { + "epoch": 0.59, + "grad_norm": 1.3841680899630626, + "learning_rate": 3.750349763886521e-07, + "loss": 0.4298, + "step": 9299 + }, + { + "epoch": 0.59, + "grad_norm": 2.948411230163331, + "learning_rate": 3.7493498264667234e-07, + "loss": 0.0064, + "step": 9300 + }, + { + "epoch": 0.59, + "grad_norm": 0.6125477209147805, + "learning_rate": 3.7483499424021737e-07, + "loss": 0.1465, + "step": 9301 + }, + { + "epoch": 0.59, + "grad_norm": 0.9717487418475622, + "learning_rate": 3.747350111735533e-07, + "loss": 0.2071, + "step": 9302 + }, + { + "epoch": 0.59, + "grad_norm": 0.6911711802058582, + "learning_rate": 3.7463503345094537e-07, + "loss": 0.246, + "step": 9303 + }, + { + "epoch": 0.59, + "grad_norm": 0.6588841672053187, + "learning_rate": 3.74535061076659e-07, + "loss": 0.0713, + "step": 9304 + }, + { + "epoch": 0.59, + "grad_norm": 1.1389535575238425, + "learning_rate": 3.7443509405495905e-07, + "loss": 0.1852, + "step": 9305 + }, + { + "epoch": 0.59, + "grad_norm": 0.6035440069910316, + "learning_rate": 3.7433513239011063e-07, + "loss": 0.2771, + "step": 9306 + }, + { + "epoch": 0.59, + "grad_norm": 0.8692971748620172, + "learning_rate": 3.742351760863778e-07, + "loss": 0.2165, + "step": 9307 + }, + { + "epoch": 0.59, + "grad_norm": 2.8784110511688423, + "learning_rate": 3.7413522514802544e-07, + "loss": 0.1849, + "step": 9308 + }, + { + "epoch": 0.59, + "grad_norm": 1.0342959203137772, + "learning_rate": 3.7403527957931713e-07, + "loss": 0.1187, + "step": 9309 + }, + { + "epoch": 0.59, + "grad_norm": 0.35069950752885065, + "learning_rate": 3.7393533938451733e-07, + "loss": 0.0126, + "step": 9310 + }, + { + "epoch": 0.59, + "grad_norm": 0.8453374061680935, + "learning_rate": 3.738354045678891e-07, + "loss": 0.2379, + "step": 9311 + }, + { + "epoch": 0.59, + "grad_norm": 7.790934286897221, + "learning_rate": 3.737354751336963e-07, + "loss": 0.195, + "step": 9312 + }, + { + "epoch": 0.59, + "grad_norm": 3.04097460610923, + "learning_rate": 3.736355510862018e-07, + "loss": 0.0642, + "step": 9313 + }, + { + "epoch": 0.59, + "grad_norm": 1.9689038288395968, + "learning_rate": 3.7353563242966877e-07, + "loss": 0.1235, + "step": 9314 + }, + { + "epoch": 0.59, + "grad_norm": 0.9070018954853412, + "learning_rate": 3.7343571916836005e-07, + "loss": 0.1238, + "step": 9315 + }, + { + "epoch": 0.59, + "grad_norm": 0.5193318612328358, + "learning_rate": 3.733358113065378e-07, + "loss": 0.1348, + "step": 9316 + }, + { + "epoch": 0.59, + "grad_norm": 6.589835579930599, + "learning_rate": 3.7323590884846454e-07, + "loss": 0.2745, + "step": 9317 + }, + { + "epoch": 0.59, + "grad_norm": 0.8207545384687512, + "learning_rate": 3.7313601179840214e-07, + "loss": 0.1382, + "step": 9318 + }, + { + "epoch": 0.59, + "grad_norm": 1.7543516932605283, + "learning_rate": 3.730361201606127e-07, + "loss": 0.2343, + "step": 9319 + }, + { + "epoch": 0.59, + "grad_norm": 2.319509783700083, + "learning_rate": 3.7293623393935736e-07, + "loss": 0.2488, + "step": 9320 + }, + { + "epoch": 0.59, + "grad_norm": 0.9649618460249436, + "learning_rate": 3.7283635313889784e-07, + "loss": 0.2595, + "step": 9321 + }, + { + "epoch": 0.59, + "grad_norm": 1.8895959725612803, + "learning_rate": 3.7273647776349514e-07, + "loss": 0.1304, + "step": 9322 + }, + { + "epoch": 0.59, + "grad_norm": 0.6724856053530631, + "learning_rate": 3.726366078174101e-07, + "loss": 0.2315, + "step": 9323 + }, + { + "epoch": 0.59, + "grad_norm": 0.494764728635633, + "learning_rate": 3.7253674330490324e-07, + "loss": 0.1345, + "step": 9324 + }, + { + "epoch": 0.59, + "grad_norm": 1.1241400815157059, + "learning_rate": 3.7243688423023546e-07, + "loss": 0.3665, + "step": 9325 + }, + { + "epoch": 0.59, + "grad_norm": 0.7634027749162353, + "learning_rate": 3.7233703059766643e-07, + "loss": 0.1472, + "step": 9326 + }, + { + "epoch": 0.59, + "grad_norm": 0.35743247768133973, + "learning_rate": 3.722371824114564e-07, + "loss": 0.0067, + "step": 9327 + }, + { + "epoch": 0.59, + "grad_norm": 0.19256407345676066, + "learning_rate": 3.7213733967586514e-07, + "loss": 0.1221, + "step": 9328 + }, + { + "epoch": 0.59, + "grad_norm": 1.8973184367805298, + "learning_rate": 3.720375023951517e-07, + "loss": 0.1271, + "step": 9329 + }, + { + "epoch": 0.59, + "grad_norm": 2.8442009928931453, + "learning_rate": 3.7193767057357577e-07, + "loss": 0.1489, + "step": 9330 + }, + { + "epoch": 0.6, + "grad_norm": 0.1138760101326204, + "learning_rate": 3.718378442153962e-07, + "loss": 0.0028, + "step": 9331 + }, + { + "epoch": 0.6, + "grad_norm": 0.8201737035455592, + "learning_rate": 3.7173802332487196e-07, + "loss": 0.1708, + "step": 9332 + }, + { + "epoch": 0.6, + "grad_norm": 0.4636916227535591, + "learning_rate": 3.716382079062613e-07, + "loss": 0.1483, + "step": 9333 + }, + { + "epoch": 0.6, + "grad_norm": 0.8163774505063767, + "learning_rate": 3.71538397963823e-07, + "loss": 0.352, + "step": 9334 + }, + { + "epoch": 0.6, + "grad_norm": 0.40018528842627643, + "learning_rate": 3.7143859350181464e-07, + "loss": 0.2831, + "step": 9335 + }, + { + "epoch": 0.6, + "grad_norm": 1.785518886776523, + "learning_rate": 3.7133879452449446e-07, + "loss": 0.4279, + "step": 9336 + }, + { + "epoch": 0.6, + "grad_norm": 0.7049248450393616, + "learning_rate": 3.712390010361198e-07, + "loss": 0.1501, + "step": 9337 + }, + { + "epoch": 0.6, + "grad_norm": 0.7123666464227902, + "learning_rate": 3.711392130409484e-07, + "loss": 0.2971, + "step": 9338 + }, + { + "epoch": 0.6, + "grad_norm": 2.368363434577095, + "learning_rate": 3.710394305432371e-07, + "loss": 0.2618, + "step": 9339 + }, + { + "epoch": 0.6, + "grad_norm": 0.8423457662773516, + "learning_rate": 3.7093965354724286e-07, + "loss": 0.1768, + "step": 9340 + }, + { + "epoch": 0.6, + "grad_norm": 1.1982493547049378, + "learning_rate": 3.708398820572225e-07, + "loss": 0.0443, + "step": 9341 + }, + { + "epoch": 0.6, + "grad_norm": 0.8572009384772475, + "learning_rate": 3.707401160774325e-07, + "loss": 0.074, + "step": 9342 + }, + { + "epoch": 0.6, + "grad_norm": 0.715077017393968, + "learning_rate": 3.70640355612129e-07, + "loss": 0.1425, + "step": 9343 + }, + { + "epoch": 0.6, + "grad_norm": 0.8019019532902004, + "learning_rate": 3.7054060066556793e-07, + "loss": 0.2665, + "step": 9344 + }, + { + "epoch": 0.6, + "grad_norm": 0.791625294891656, + "learning_rate": 3.7044085124200514e-07, + "loss": 0.087, + "step": 9345 + }, + { + "epoch": 0.6, + "grad_norm": 8.747912087089999, + "learning_rate": 3.703411073456959e-07, + "loss": 0.3365, + "step": 9346 + }, + { + "epoch": 0.6, + "grad_norm": 0.6407920231572916, + "learning_rate": 3.7024136898089576e-07, + "loss": 0.3517, + "step": 9347 + }, + { + "epoch": 0.6, + "grad_norm": 15.630045302440479, + "learning_rate": 3.701416361518597e-07, + "loss": 0.0223, + "step": 9348 + }, + { + "epoch": 0.6, + "grad_norm": 1.3720280989672002, + "learning_rate": 3.700419088628425e-07, + "loss": 0.3173, + "step": 9349 + }, + { + "epoch": 0.6, + "grad_norm": 0.2971003551245055, + "learning_rate": 3.6994218711809856e-07, + "loss": 0.097, + "step": 9350 + }, + { + "epoch": 0.6, + "grad_norm": 0.48855604220375287, + "learning_rate": 3.698424709218826e-07, + "loss": 0.2794, + "step": 9351 + }, + { + "epoch": 0.6, + "grad_norm": 1.434706071754007, + "learning_rate": 3.697427602784483e-07, + "loss": 0.2833, + "step": 9352 + }, + { + "epoch": 0.6, + "grad_norm": 0.7130226436187235, + "learning_rate": 3.696430551920499e-07, + "loss": 0.1784, + "step": 9353 + }, + { + "epoch": 0.6, + "grad_norm": 0.6419149544861212, + "learning_rate": 3.6954335566694057e-07, + "loss": 0.2665, + "step": 9354 + }, + { + "epoch": 0.6, + "grad_norm": 1.7860592936408222, + "learning_rate": 3.6944366170737415e-07, + "loss": 0.1419, + "step": 9355 + }, + { + "epoch": 0.6, + "grad_norm": 0.3870030044112588, + "learning_rate": 3.693439733176035e-07, + "loss": 0.1992, + "step": 9356 + }, + { + "epoch": 0.6, + "grad_norm": 0.4643972104686778, + "learning_rate": 3.6924429050188157e-07, + "loss": 0.1586, + "step": 9357 + }, + { + "epoch": 0.6, + "grad_norm": 3.778282250295543, + "learning_rate": 3.6914461326446104e-07, + "loss": 0.1099, + "step": 9358 + }, + { + "epoch": 0.6, + "grad_norm": 1.0465978772358504, + "learning_rate": 3.690449416095943e-07, + "loss": 0.1571, + "step": 9359 + }, + { + "epoch": 0.6, + "grad_norm": 1.5731659928726092, + "learning_rate": 3.6894527554153383e-07, + "loss": 0.3195, + "step": 9360 + }, + { + "epoch": 0.6, + "grad_norm": 2.2875317520675376, + "learning_rate": 3.688456150645311e-07, + "loss": 0.2305, + "step": 9361 + }, + { + "epoch": 0.6, + "grad_norm": 1.029337424805925, + "learning_rate": 3.6874596018283833e-07, + "loss": 0.088, + "step": 9362 + }, + { + "epoch": 0.6, + "grad_norm": 0.8809815467086525, + "learning_rate": 3.6864631090070653e-07, + "loss": 0.3054, + "step": 9363 + }, + { + "epoch": 0.6, + "grad_norm": 0.47611715679060396, + "learning_rate": 3.6854666722238737e-07, + "loss": 0.1209, + "step": 9364 + }, + { + "epoch": 0.6, + "grad_norm": 0.6557302681228317, + "learning_rate": 3.684470291521314e-07, + "loss": 0.0543, + "step": 9365 + }, + { + "epoch": 0.6, + "grad_norm": 0.31106441265079243, + "learning_rate": 3.683473966941898e-07, + "loss": 0.0051, + "step": 9366 + }, + { + "epoch": 0.6, + "grad_norm": 2.020456985395643, + "learning_rate": 3.682477698528128e-07, + "loss": 0.1233, + "step": 9367 + }, + { + "epoch": 0.6, + "grad_norm": 0.6984927839984826, + "learning_rate": 3.6814814863225083e-07, + "loss": 0.0894, + "step": 9368 + }, + { + "epoch": 0.6, + "grad_norm": 1.0236622478125843, + "learning_rate": 3.6804853303675407e-07, + "loss": 0.0866, + "step": 9369 + }, + { + "epoch": 0.6, + "grad_norm": 2.9520519063215946, + "learning_rate": 3.679489230705719e-07, + "loss": 0.1956, + "step": 9370 + }, + { + "epoch": 0.6, + "grad_norm": 0.9785352268345708, + "learning_rate": 3.6784931873795434e-07, + "loss": 0.2661, + "step": 9371 + }, + { + "epoch": 0.6, + "grad_norm": 0.5489439806777325, + "learning_rate": 3.677497200431503e-07, + "loss": 0.2252, + "step": 9372 + }, + { + "epoch": 0.6, + "grad_norm": 1.1660682395226845, + "learning_rate": 3.676501269904093e-07, + "loss": 0.2097, + "step": 9373 + }, + { + "epoch": 0.6, + "grad_norm": 0.9117422336753886, + "learning_rate": 3.6755053958397964e-07, + "loss": 0.1649, + "step": 9374 + }, + { + "epoch": 0.6, + "grad_norm": 1.3174871712267413, + "learning_rate": 3.6745095782811034e-07, + "loss": 0.1165, + "step": 9375 + }, + { + "epoch": 0.6, + "grad_norm": 1.1737490207513126, + "learning_rate": 3.673513817270496e-07, + "loss": 0.3228, + "step": 9376 + }, + { + "epoch": 0.6, + "grad_norm": 0.3779584403245971, + "learning_rate": 3.6725181128504566e-07, + "loss": 0.0662, + "step": 9377 + }, + { + "epoch": 0.6, + "grad_norm": 0.9040176754693658, + "learning_rate": 3.6715224650634623e-07, + "loss": 0.3983, + "step": 9378 + }, + { + "epoch": 0.6, + "grad_norm": 2.0949680380558715, + "learning_rate": 3.6705268739519916e-07, + "loss": 0.2969, + "step": 9379 + }, + { + "epoch": 0.6, + "grad_norm": 0.5585126973068211, + "learning_rate": 3.669531339558515e-07, + "loss": 0.0869, + "step": 9380 + }, + { + "epoch": 0.6, + "grad_norm": 3.8409218650921324, + "learning_rate": 3.668535861925509e-07, + "loss": 0.2927, + "step": 9381 + }, + { + "epoch": 0.6, + "grad_norm": 8.401990657290293, + "learning_rate": 3.667540441095436e-07, + "loss": 0.0338, + "step": 9382 + }, + { + "epoch": 0.6, + "grad_norm": 0.27809309699568374, + "learning_rate": 3.6665450771107697e-07, + "loss": 0.1003, + "step": 9383 + }, + { + "epoch": 0.6, + "grad_norm": 0.7875044896588642, + "learning_rate": 3.6655497700139693e-07, + "loss": 0.1111, + "step": 9384 + }, + { + "epoch": 0.6, + "grad_norm": 1.3888928888215062, + "learning_rate": 3.6645545198474973e-07, + "loss": 0.1958, + "step": 9385 + }, + { + "epoch": 0.6, + "grad_norm": 0.46544895086304044, + "learning_rate": 3.6635593266538167e-07, + "loss": 0.2682, + "step": 9386 + }, + { + "epoch": 0.6, + "grad_norm": 0.5980123759044917, + "learning_rate": 3.662564190475379e-07, + "loss": 0.1744, + "step": 9387 + }, + { + "epoch": 0.6, + "grad_norm": 1.738477076975794, + "learning_rate": 3.6615691113546445e-07, + "loss": 0.0572, + "step": 9388 + }, + { + "epoch": 0.6, + "grad_norm": 0.4997082116318408, + "learning_rate": 3.6605740893340596e-07, + "loss": 0.2915, + "step": 9389 + }, + { + "epoch": 0.6, + "grad_norm": 1.0518297815423197, + "learning_rate": 3.659579124456079e-07, + "loss": 0.2468, + "step": 9390 + }, + { + "epoch": 0.6, + "grad_norm": 0.8886369436167223, + "learning_rate": 3.6585842167631453e-07, + "loss": 0.3167, + "step": 9391 + }, + { + "epoch": 0.6, + "grad_norm": 0.7653665718881084, + "learning_rate": 3.6575893662977066e-07, + "loss": 0.171, + "step": 9392 + }, + { + "epoch": 0.6, + "grad_norm": 1.3841488873652306, + "learning_rate": 3.6565945731022036e-07, + "loss": 0.1029, + "step": 9393 + }, + { + "epoch": 0.6, + "grad_norm": 0.8641759029425111, + "learning_rate": 3.655599837219077e-07, + "loss": 0.2416, + "step": 9394 + }, + { + "epoch": 0.6, + "grad_norm": 0.9428812489335693, + "learning_rate": 3.6546051586907635e-07, + "loss": 0.1067, + "step": 9395 + }, + { + "epoch": 0.6, + "grad_norm": 1.3502908802967608, + "learning_rate": 3.6536105375596996e-07, + "loss": 0.0645, + "step": 9396 + }, + { + "epoch": 0.6, + "grad_norm": 0.8358373455391138, + "learning_rate": 3.652615973868317e-07, + "loss": 0.0722, + "step": 9397 + }, + { + "epoch": 0.6, + "grad_norm": 0.6973503012289795, + "learning_rate": 3.651621467659044e-07, + "loss": 0.1672, + "step": 9398 + }, + { + "epoch": 0.6, + "grad_norm": 0.926952821654789, + "learning_rate": 3.6506270189743116e-07, + "loss": 0.3482, + "step": 9399 + }, + { + "epoch": 0.6, + "grad_norm": 1.0162155844066454, + "learning_rate": 3.649632627856541e-07, + "loss": 0.3422, + "step": 9400 + }, + { + "epoch": 0.6, + "grad_norm": 1.170829378895211, + "learning_rate": 3.648638294348158e-07, + "loss": 0.0836, + "step": 9401 + }, + { + "epoch": 0.6, + "grad_norm": 0.33666530053854105, + "learning_rate": 3.647644018491581e-07, + "loss": 0.1849, + "step": 9402 + }, + { + "epoch": 0.6, + "grad_norm": 1.195738706653307, + "learning_rate": 3.646649800329231e-07, + "loss": 0.2266, + "step": 9403 + }, + { + "epoch": 0.6, + "grad_norm": 2.7221616589187154, + "learning_rate": 3.6456556399035186e-07, + "loss": 0.0995, + "step": 9404 + }, + { + "epoch": 0.6, + "grad_norm": 1.5300694572259874, + "learning_rate": 3.6446615372568616e-07, + "loss": 0.181, + "step": 9405 + }, + { + "epoch": 0.6, + "grad_norm": 14.89805046405319, + "learning_rate": 3.643667492431666e-07, + "loss": 0.0775, + "step": 9406 + }, + { + "epoch": 0.6, + "grad_norm": 0.38542962638776146, + "learning_rate": 3.642673505470344e-07, + "loss": 0.0659, + "step": 9407 + }, + { + "epoch": 0.6, + "grad_norm": 0.6983717939905597, + "learning_rate": 3.6416795764152964e-07, + "loss": 0.2694, + "step": 9408 + }, + { + "epoch": 0.6, + "grad_norm": 0.47082656016469765, + "learning_rate": 3.64068570530893e-07, + "loss": 0.1048, + "step": 9409 + }, + { + "epoch": 0.6, + "grad_norm": 0.9109312430517122, + "learning_rate": 3.639691892193644e-07, + "loss": 0.1162, + "step": 9410 + }, + { + "epoch": 0.6, + "grad_norm": 1.6899362271898326, + "learning_rate": 3.638698137111835e-07, + "loss": 0.1932, + "step": 9411 + }, + { + "epoch": 0.6, + "grad_norm": 0.8027858842048811, + "learning_rate": 3.637704440105902e-07, + "loss": 0.1846, + "step": 9412 + }, + { + "epoch": 0.6, + "grad_norm": 12.739760075312104, + "learning_rate": 3.636710801218235e-07, + "loss": 0.1182, + "step": 9413 + }, + { + "epoch": 0.6, + "grad_norm": 0.5382520406612367, + "learning_rate": 3.635717220491227e-07, + "loss": 0.2819, + "step": 9414 + }, + { + "epoch": 0.6, + "grad_norm": 1.0858281035483512, + "learning_rate": 3.6347236979672626e-07, + "loss": 0.3465, + "step": 9415 + }, + { + "epoch": 0.6, + "grad_norm": 1.9059460894166045, + "learning_rate": 3.633730233688732e-07, + "loss": 0.1408, + "step": 9416 + }, + { + "epoch": 0.6, + "grad_norm": 0.7336907181010045, + "learning_rate": 3.6327368276980145e-07, + "loss": 0.0576, + "step": 9417 + }, + { + "epoch": 0.6, + "grad_norm": 1.1336513371755648, + "learning_rate": 3.631743480037495e-07, + "loss": 0.2492, + "step": 9418 + }, + { + "epoch": 0.6, + "grad_norm": 0.4045115547659766, + "learning_rate": 3.630750190749546e-07, + "loss": 0.2621, + "step": 9419 + }, + { + "epoch": 0.6, + "grad_norm": 0.44199810016468766, + "learning_rate": 3.6297569598765475e-07, + "loss": 0.0086, + "step": 9420 + }, + { + "epoch": 0.6, + "grad_norm": 1.2029416627921536, + "learning_rate": 3.6287637874608713e-07, + "loss": 0.3133, + "step": 9421 + }, + { + "epoch": 0.6, + "grad_norm": 1.1686022841441073, + "learning_rate": 3.6277706735448903e-07, + "loss": 0.2998, + "step": 9422 + }, + { + "epoch": 0.6, + "grad_norm": 1.519242917613396, + "learning_rate": 3.6267776181709684e-07, + "loss": 0.3648, + "step": 9423 + }, + { + "epoch": 0.6, + "grad_norm": 0.8015422825334999, + "learning_rate": 3.6257846213814767e-07, + "loss": 0.0705, + "step": 9424 + }, + { + "epoch": 0.6, + "grad_norm": 1.5074650534128662, + "learning_rate": 3.6247916832187755e-07, + "loss": 0.0888, + "step": 9425 + }, + { + "epoch": 0.6, + "grad_norm": 1.4409350840386246, + "learning_rate": 3.6237988037252227e-07, + "loss": 0.3621, + "step": 9426 + }, + { + "epoch": 0.6, + "grad_norm": 8.588334078053489, + "learning_rate": 3.622805982943181e-07, + "loss": 0.0433, + "step": 9427 + }, + { + "epoch": 0.6, + "grad_norm": 1.690879103346311, + "learning_rate": 3.621813220915004e-07, + "loss": 0.2864, + "step": 9428 + }, + { + "epoch": 0.6, + "grad_norm": 3.641539238707806, + "learning_rate": 3.620820517683046e-07, + "loss": 0.094, + "step": 9429 + }, + { + "epoch": 0.6, + "grad_norm": 1.49798299721738, + "learning_rate": 3.619827873289657e-07, + "loss": 0.3174, + "step": 9430 + }, + { + "epoch": 0.6, + "grad_norm": 0.6268779811381258, + "learning_rate": 3.6188352877771865e-07, + "loss": 0.1791, + "step": 9431 + }, + { + "epoch": 0.6, + "grad_norm": 0.4414730057712469, + "learning_rate": 3.617842761187977e-07, + "loss": 0.3484, + "step": 9432 + }, + { + "epoch": 0.6, + "grad_norm": 5.086231279791825, + "learning_rate": 3.6168502935643763e-07, + "loss": 0.019, + "step": 9433 + }, + { + "epoch": 0.6, + "grad_norm": 0.4089373888040658, + "learning_rate": 3.61585788494872e-07, + "loss": 0.0678, + "step": 9434 + }, + { + "epoch": 0.6, + "grad_norm": 1.4732372969873673, + "learning_rate": 3.614865535383351e-07, + "loss": 0.1703, + "step": 9435 + }, + { + "epoch": 0.6, + "grad_norm": 1.5186388228339889, + "learning_rate": 3.613873244910601e-07, + "loss": 0.2791, + "step": 9436 + }, + { + "epoch": 0.6, + "grad_norm": 0.9842137879472515, + "learning_rate": 3.612881013572806e-07, + "loss": 0.1501, + "step": 9437 + }, + { + "epoch": 0.6, + "grad_norm": 0.5935332181125628, + "learning_rate": 3.611888841412296e-07, + "loss": 0.2552, + "step": 9438 + }, + { + "epoch": 0.6, + "grad_norm": 0.787286726230883, + "learning_rate": 3.6108967284713966e-07, + "loss": 0.193, + "step": 9439 + }, + { + "epoch": 0.6, + "grad_norm": 1.4755215820683798, + "learning_rate": 3.6099046747924373e-07, + "loss": 0.1659, + "step": 9440 + }, + { + "epoch": 0.6, + "grad_norm": 0.6750708548489052, + "learning_rate": 3.6089126804177364e-07, + "loss": 0.1033, + "step": 9441 + }, + { + "epoch": 0.6, + "grad_norm": 1.7863623723317774, + "learning_rate": 3.60792074538962e-07, + "loss": 0.1735, + "step": 9442 + }, + { + "epoch": 0.6, + "grad_norm": 16.245197149121086, + "learning_rate": 3.606928869750401e-07, + "loss": 0.2154, + "step": 9443 + }, + { + "epoch": 0.6, + "grad_norm": 0.6445252825729044, + "learning_rate": 3.605937053542398e-07, + "loss": 0.3043, + "step": 9444 + }, + { + "epoch": 0.6, + "grad_norm": 0.9737129284554178, + "learning_rate": 3.604945296807921e-07, + "loss": 0.3247, + "step": 9445 + }, + { + "epoch": 0.6, + "grad_norm": 0.3940290771510294, + "learning_rate": 3.6039535995892835e-07, + "loss": 0.0489, + "step": 9446 + }, + { + "epoch": 0.6, + "grad_norm": 0.610151977789657, + "learning_rate": 3.6029619619287897e-07, + "loss": 0.0484, + "step": 9447 + }, + { + "epoch": 0.6, + "grad_norm": 1.3523942494620698, + "learning_rate": 3.60197038386875e-07, + "loss": 0.2164, + "step": 9448 + }, + { + "epoch": 0.6, + "grad_norm": 0.4857908866675192, + "learning_rate": 3.600978865451462e-07, + "loss": 0.0688, + "step": 9449 + }, + { + "epoch": 0.6, + "grad_norm": 0.7683698264502922, + "learning_rate": 3.59998740671923e-07, + "loss": 0.1496, + "step": 9450 + }, + { + "epoch": 0.6, + "grad_norm": 0.6713109763358255, + "learning_rate": 3.5989960077143475e-07, + "loss": 0.141, + "step": 9451 + }, + { + "epoch": 0.6, + "grad_norm": 0.39428184273653033, + "learning_rate": 3.598004668479113e-07, + "loss": 0.1295, + "step": 9452 + }, + { + "epoch": 0.6, + "grad_norm": 0.6878095952961367, + "learning_rate": 3.597013389055818e-07, + "loss": 0.0199, + "step": 9453 + }, + { + "epoch": 0.6, + "grad_norm": 0.8524581325660707, + "learning_rate": 3.5960221694867516e-07, + "loss": 0.3448, + "step": 9454 + }, + { + "epoch": 0.6, + "grad_norm": 1.5628398563066372, + "learning_rate": 3.5950310098142016e-07, + "loss": 0.2538, + "step": 9455 + }, + { + "epoch": 0.6, + "grad_norm": 1.1186249288055055, + "learning_rate": 3.594039910080452e-07, + "loss": 0.0979, + "step": 9456 + }, + { + "epoch": 0.6, + "grad_norm": 0.5744758314234945, + "learning_rate": 3.5930488703277884e-07, + "loss": 0.3416, + "step": 9457 + }, + { + "epoch": 0.6, + "grad_norm": 0.990597953243976, + "learning_rate": 3.5920578905984867e-07, + "loss": 0.284, + "step": 9458 + }, + { + "epoch": 0.6, + "grad_norm": 1.5819561244282796, + "learning_rate": 3.591066970934827e-07, + "loss": 0.1504, + "step": 9459 + }, + { + "epoch": 0.6, + "grad_norm": 1.3169886141667242, + "learning_rate": 3.5900761113790813e-07, + "loss": 0.255, + "step": 9460 + }, + { + "epoch": 0.6, + "grad_norm": 1.165269825801838, + "learning_rate": 3.589085311973524e-07, + "loss": 0.3182, + "step": 9461 + }, + { + "epoch": 0.6, + "grad_norm": 12.824506802886921, + "learning_rate": 3.588094572760423e-07, + "loss": 0.2083, + "step": 9462 + }, + { + "epoch": 0.6, + "grad_norm": 0.6393293552065233, + "learning_rate": 3.587103893782046e-07, + "loss": 0.0955, + "step": 9463 + }, + { + "epoch": 0.6, + "grad_norm": 1.1996395451813144, + "learning_rate": 3.5861132750806566e-07, + "loss": 0.4389, + "step": 9464 + }, + { + "epoch": 0.6, + "grad_norm": 2.61484900781638, + "learning_rate": 3.5851227166985184e-07, + "loss": 0.2825, + "step": 9465 + }, + { + "epoch": 0.6, + "grad_norm": 6.172490924953922, + "learning_rate": 3.5841322186778894e-07, + "loss": 0.1773, + "step": 9466 + }, + { + "epoch": 0.6, + "grad_norm": 5.638159471201789, + "learning_rate": 3.583141781061024e-07, + "loss": 0.1132, + "step": 9467 + }, + { + "epoch": 0.6, + "grad_norm": 0.9351782254085814, + "learning_rate": 3.582151403890182e-07, + "loss": 0.2241, + "step": 9468 + }, + { + "epoch": 0.6, + "grad_norm": 0.9256261683344589, + "learning_rate": 3.581161087207608e-07, + "loss": 0.2268, + "step": 9469 + }, + { + "epoch": 0.6, + "grad_norm": 1.3051866291544902, + "learning_rate": 3.580170831055557e-07, + "loss": 0.2202, + "step": 9470 + }, + { + "epoch": 0.6, + "grad_norm": 1.2190511644389843, + "learning_rate": 3.57918063547627e-07, + "loss": 0.0569, + "step": 9471 + }, + { + "epoch": 0.6, + "grad_norm": 0.442025476340937, + "learning_rate": 3.578190500511994e-07, + "loss": 0.1541, + "step": 9472 + }, + { + "epoch": 0.6, + "grad_norm": 0.8236563443916848, + "learning_rate": 3.5772004262049705e-07, + "loss": 0.3391, + "step": 9473 + }, + { + "epoch": 0.6, + "grad_norm": 1.006509833273857, + "learning_rate": 3.5762104125974357e-07, + "loss": 0.1443, + "step": 9474 + }, + { + "epoch": 0.6, + "grad_norm": 5.159114976755673, + "learning_rate": 3.575220459731627e-07, + "loss": 0.1633, + "step": 9475 + }, + { + "epoch": 0.6, + "grad_norm": 0.4073080062571323, + "learning_rate": 3.5742305676497785e-07, + "loss": 0.0046, + "step": 9476 + }, + { + "epoch": 0.6, + "grad_norm": 0.9576682490917284, + "learning_rate": 3.573240736394119e-07, + "loss": 0.3445, + "step": 9477 + }, + { + "epoch": 0.6, + "grad_norm": 0.39686532945091335, + "learning_rate": 3.57225096600688e-07, + "loss": 0.0589, + "step": 9478 + }, + { + "epoch": 0.6, + "grad_norm": 0.8499172182669122, + "learning_rate": 3.5712612565302846e-07, + "loss": 0.183, + "step": 9479 + }, + { + "epoch": 0.6, + "grad_norm": 1.0256610774157449, + "learning_rate": 3.5702716080065544e-07, + "loss": 0.4106, + "step": 9480 + }, + { + "epoch": 0.6, + "grad_norm": 0.6932690415855755, + "learning_rate": 3.569282020477912e-07, + "loss": 0.115, + "step": 9481 + }, + { + "epoch": 0.6, + "grad_norm": 0.6391868470132124, + "learning_rate": 3.568292493986574e-07, + "loss": 0.2923, + "step": 9482 + }, + { + "epoch": 0.6, + "grad_norm": 2.0553878712678806, + "learning_rate": 3.5673030285747583e-07, + "loss": 0.126, + "step": 9483 + }, + { + "epoch": 0.6, + "grad_norm": 0.21637608116406298, + "learning_rate": 3.566313624284674e-07, + "loss": 0.1065, + "step": 9484 + }, + { + "epoch": 0.6, + "grad_norm": 1.875493412863062, + "learning_rate": 3.565324281158534e-07, + "loss": 0.158, + "step": 9485 + }, + { + "epoch": 0.6, + "grad_norm": 0.2088440849044081, + "learning_rate": 3.564334999238542e-07, + "loss": 0.0567, + "step": 9486 + }, + { + "epoch": 0.6, + "grad_norm": 0.6064417453901161, + "learning_rate": 3.563345778566907e-07, + "loss": 0.0139, + "step": 9487 + }, + { + "epoch": 0.61, + "grad_norm": 0.6574714732114035, + "learning_rate": 3.562356619185827e-07, + "loss": 0.2009, + "step": 9488 + }, + { + "epoch": 0.61, + "grad_norm": 1.155984125520488, + "learning_rate": 3.5613675211375063e-07, + "loss": 0.1546, + "step": 9489 + }, + { + "epoch": 0.61, + "grad_norm": 0.5766695840302901, + "learning_rate": 3.560378484464137e-07, + "loss": 0.3639, + "step": 9490 + }, + { + "epoch": 0.61, + "grad_norm": 0.48979977542295144, + "learning_rate": 3.559389509207916e-07, + "loss": 0.1652, + "step": 9491 + }, + { + "epoch": 0.61, + "grad_norm": 0.493927423119219, + "learning_rate": 3.5584005954110343e-07, + "loss": 0.1472, + "step": 9492 + }, + { + "epoch": 0.61, + "grad_norm": 0.7184802566700499, + "learning_rate": 3.5574117431156826e-07, + "loss": 0.3289, + "step": 9493 + }, + { + "epoch": 0.61, + "grad_norm": 0.5233670046628419, + "learning_rate": 3.5564229523640466e-07, + "loss": 0.1573, + "step": 9494 + }, + { + "epoch": 0.61, + "grad_norm": 1.5534037970393029, + "learning_rate": 3.5554342231983063e-07, + "loss": 0.3675, + "step": 9495 + }, + { + "epoch": 0.61, + "grad_norm": 2.866844066999668, + "learning_rate": 3.554445555660649e-07, + "loss": 0.0769, + "step": 9496 + }, + { + "epoch": 0.61, + "grad_norm": 0.7776888752887263, + "learning_rate": 3.5534569497932474e-07, + "loss": 0.2311, + "step": 9497 + }, + { + "epoch": 0.61, + "grad_norm": 1.7046551376374321, + "learning_rate": 3.552468405638282e-07, + "loss": 0.2205, + "step": 9498 + }, + { + "epoch": 0.61, + "grad_norm": 0.6692309773361224, + "learning_rate": 3.551479923237924e-07, + "loss": 0.1334, + "step": 9499 + }, + { + "epoch": 0.61, + "grad_norm": 2.104945431067929, + "learning_rate": 3.5504915026343443e-07, + "loss": 0.071, + "step": 9500 + }, + { + "epoch": 0.61, + "grad_norm": 0.5064598508166142, + "learning_rate": 3.54950314386971e-07, + "loss": 0.3589, + "step": 9501 + }, + { + "epoch": 0.61, + "grad_norm": 0.8520594832192026, + "learning_rate": 3.548514846986189e-07, + "loss": 0.1262, + "step": 9502 + }, + { + "epoch": 0.61, + "grad_norm": 15.362113891260808, + "learning_rate": 3.547526612025942e-07, + "loss": 0.3452, + "step": 9503 + }, + { + "epoch": 0.61, + "grad_norm": 0.9419430460431464, + "learning_rate": 3.5465384390311297e-07, + "loss": 0.447, + "step": 9504 + }, + { + "epoch": 0.61, + "grad_norm": 1.2961654803612481, + "learning_rate": 3.545550328043908e-07, + "loss": 0.1917, + "step": 9505 + }, + { + "epoch": 0.61, + "grad_norm": 0.5315887724165193, + "learning_rate": 3.5445622791064357e-07, + "loss": 0.2115, + "step": 9506 + }, + { + "epoch": 0.61, + "grad_norm": 0.4770144668206306, + "learning_rate": 3.543574292260861e-07, + "loss": 0.0086, + "step": 9507 + }, + { + "epoch": 0.61, + "grad_norm": 1.2355838871005962, + "learning_rate": 3.542586367549334e-07, + "loss": 0.1475, + "step": 9508 + }, + { + "epoch": 0.61, + "grad_norm": 1.2483751771922211, + "learning_rate": 3.541598505014004e-07, + "loss": 0.256, + "step": 9509 + }, + { + "epoch": 0.61, + "grad_norm": 0.4285539010868225, + "learning_rate": 3.540610704697011e-07, + "loss": 0.2119, + "step": 9510 + }, + { + "epoch": 0.61, + "grad_norm": 0.734726231846914, + "learning_rate": 3.5396229666405026e-07, + "loss": 0.2798, + "step": 9511 + }, + { + "epoch": 0.61, + "grad_norm": 0.482766810153079, + "learning_rate": 3.538635290886611e-07, + "loss": 0.2323, + "step": 9512 + }, + { + "epoch": 0.61, + "grad_norm": 2.0432071428183933, + "learning_rate": 3.5376476774774776e-07, + "loss": 0.3549, + "step": 9513 + }, + { + "epoch": 0.61, + "grad_norm": 0.7664898071153935, + "learning_rate": 3.536660126455233e-07, + "loss": 0.0092, + "step": 9514 + }, + { + "epoch": 0.61, + "grad_norm": 0.6304981059941067, + "learning_rate": 3.5356726378620103e-07, + "loss": 0.005, + "step": 9515 + }, + { + "epoch": 0.61, + "grad_norm": 0.38579326887007837, + "learning_rate": 3.5346852117399347e-07, + "loss": 0.0814, + "step": 9516 + }, + { + "epoch": 0.61, + "grad_norm": 1.3293810669383335, + "learning_rate": 3.533697848131134e-07, + "loss": 0.1155, + "step": 9517 + }, + { + "epoch": 0.61, + "grad_norm": 14.950871791178397, + "learning_rate": 3.5327105470777305e-07, + "loss": 0.1445, + "step": 9518 + }, + { + "epoch": 0.61, + "grad_norm": 1.7977325969398998, + "learning_rate": 3.531723308621847e-07, + "loss": 0.1063, + "step": 9519 + }, + { + "epoch": 0.61, + "grad_norm": 2.3291692439433445, + "learning_rate": 3.5307361328055976e-07, + "loss": 0.1668, + "step": 9520 + }, + { + "epoch": 0.61, + "grad_norm": 12.029977035286095, + "learning_rate": 3.529749019671097e-07, + "loss": 0.1566, + "step": 9521 + }, + { + "epoch": 0.61, + "grad_norm": 0.759593639531423, + "learning_rate": 3.5287619692604607e-07, + "loss": 0.1452, + "step": 9522 + }, + { + "epoch": 0.61, + "grad_norm": 5.215541996792754, + "learning_rate": 3.527774981615794e-07, + "loss": 0.1516, + "step": 9523 + }, + { + "epoch": 0.61, + "grad_norm": 0.9919919160786849, + "learning_rate": 3.526788056779208e-07, + "loss": 0.1444, + "step": 9524 + }, + { + "epoch": 0.61, + "grad_norm": 0.6100328855461785, + "learning_rate": 3.5258011947928047e-07, + "loss": 0.1438, + "step": 9525 + }, + { + "epoch": 0.61, + "grad_norm": 0.6662172389324932, + "learning_rate": 3.524814395698686e-07, + "loss": 0.1544, + "step": 9526 + }, + { + "epoch": 0.61, + "grad_norm": 1.8541939750322884, + "learning_rate": 3.5238276595389495e-07, + "loss": 0.2717, + "step": 9527 + }, + { + "epoch": 0.61, + "grad_norm": 0.9632005762172113, + "learning_rate": 3.522840986355694e-07, + "loss": 0.2894, + "step": 9528 + }, + { + "epoch": 0.61, + "grad_norm": 1.787435734934215, + "learning_rate": 3.5218543761910104e-07, + "loss": 0.0897, + "step": 9529 + }, + { + "epoch": 0.61, + "grad_norm": 0.33808649369160954, + "learning_rate": 3.520867829086992e-07, + "loss": 0.1316, + "step": 9530 + }, + { + "epoch": 0.61, + "grad_norm": 0.5051858975240087, + "learning_rate": 3.519881345085723e-07, + "loss": 0.0644, + "step": 9531 + }, + { + "epoch": 0.61, + "grad_norm": 0.5099164061138237, + "learning_rate": 3.5188949242292945e-07, + "loss": 0.2252, + "step": 9532 + }, + { + "epoch": 0.61, + "grad_norm": 1.8753734344434103, + "learning_rate": 3.517908566559783e-07, + "loss": 0.1211, + "step": 9533 + }, + { + "epoch": 0.61, + "grad_norm": 0.7426447517918439, + "learning_rate": 3.5169222721192735e-07, + "loss": 0.4824, + "step": 9534 + }, + { + "epoch": 0.61, + "grad_norm": 1.0894601162545856, + "learning_rate": 3.515936040949841e-07, + "loss": 0.0962, + "step": 9535 + }, + { + "epoch": 0.61, + "grad_norm": 1.8256599029554128, + "learning_rate": 3.514949873093559e-07, + "loss": 0.0358, + "step": 9536 + }, + { + "epoch": 0.61, + "grad_norm": 8.405568848617357, + "learning_rate": 3.513963768592502e-07, + "loss": 0.1053, + "step": 9537 + }, + { + "epoch": 0.61, + "grad_norm": 0.7117838901206692, + "learning_rate": 3.5129777274887363e-07, + "loss": 0.0553, + "step": 9538 + }, + { + "epoch": 0.61, + "grad_norm": 0.8556497796650047, + "learning_rate": 3.511991749824332e-07, + "loss": 0.1805, + "step": 9539 + }, + { + "epoch": 0.61, + "grad_norm": 0.9670599418896123, + "learning_rate": 3.5110058356413497e-07, + "loss": 0.3328, + "step": 9540 + }, + { + "epoch": 0.61, + "grad_norm": 1.2557616840936097, + "learning_rate": 3.510019984981853e-07, + "loss": 0.016, + "step": 9541 + }, + { + "epoch": 0.61, + "grad_norm": 1.192287509787701, + "learning_rate": 3.509034197887897e-07, + "loss": 0.3793, + "step": 9542 + }, + { + "epoch": 0.61, + "grad_norm": 0.5618701902969143, + "learning_rate": 3.5080484744015405e-07, + "loss": 0.1652, + "step": 9543 + }, + { + "epoch": 0.61, + "grad_norm": 0.4294697667042685, + "learning_rate": 3.5070628145648353e-07, + "loss": 0.1548, + "step": 9544 + }, + { + "epoch": 0.61, + "grad_norm": 1.2650894762362481, + "learning_rate": 3.5060772184198313e-07, + "loss": 0.0619, + "step": 9545 + }, + { + "epoch": 0.61, + "grad_norm": 0.2826780412617805, + "learning_rate": 3.5050916860085757e-07, + "loss": 0.1783, + "step": 9546 + }, + { + "epoch": 0.61, + "grad_norm": 0.48055308768303967, + "learning_rate": 3.5041062173731153e-07, + "loss": 0.0374, + "step": 9547 + }, + { + "epoch": 0.61, + "grad_norm": 0.26875555863780737, + "learning_rate": 3.503120812555491e-07, + "loss": 0.1087, + "step": 9548 + }, + { + "epoch": 0.61, + "grad_norm": 0.7210446372994427, + "learning_rate": 3.5021354715977404e-07, + "loss": 0.2469, + "step": 9549 + }, + { + "epoch": 0.61, + "grad_norm": 1.9612550213102113, + "learning_rate": 3.5011501945419034e-07, + "loss": 0.2909, + "step": 9550 + }, + { + "epoch": 0.61, + "grad_norm": 1.3144814339545225, + "learning_rate": 3.5001649814300103e-07, + "loss": 0.3016, + "step": 9551 + }, + { + "epoch": 0.61, + "grad_norm": 0.5657734398150366, + "learning_rate": 3.4991798323040957e-07, + "loss": 0.0682, + "step": 9552 + }, + { + "epoch": 0.61, + "grad_norm": 0.7252407911586491, + "learning_rate": 3.4981947472061846e-07, + "loss": 0.2639, + "step": 9553 + }, + { + "epoch": 0.61, + "grad_norm": 1.6242112103120891, + "learning_rate": 3.497209726178306e-07, + "loss": 0.0052, + "step": 9554 + }, + { + "epoch": 0.61, + "grad_norm": 0.2664016430341551, + "learning_rate": 3.4962247692624806e-07, + "loss": 0.1158, + "step": 9555 + }, + { + "epoch": 0.61, + "grad_norm": 1.3797610985119093, + "learning_rate": 3.495239876500732e-07, + "loss": 0.1901, + "step": 9556 + }, + { + "epoch": 0.61, + "grad_norm": 1.0646627807982256, + "learning_rate": 3.494255047935072e-07, + "loss": 0.264, + "step": 9557 + }, + { + "epoch": 0.61, + "grad_norm": 0.8334236528510394, + "learning_rate": 3.493270283607521e-07, + "loss": 0.3143, + "step": 9558 + }, + { + "epoch": 0.61, + "grad_norm": 0.6066992204145405, + "learning_rate": 3.492285583560086e-07, + "loss": 0.2633, + "step": 9559 + }, + { + "epoch": 0.61, + "grad_norm": 0.5321456602640795, + "learning_rate": 3.4913009478347824e-07, + "loss": 0.1904, + "step": 9560 + }, + { + "epoch": 0.61, + "grad_norm": 0.7261091688805966, + "learning_rate": 3.49031637647361e-07, + "loss": 0.4801, + "step": 9561 + }, + { + "epoch": 0.61, + "grad_norm": 1.033724474706224, + "learning_rate": 3.4893318695185767e-07, + "loss": 0.4179, + "step": 9562 + }, + { + "epoch": 0.61, + "grad_norm": 3.075835070526746, + "learning_rate": 3.488347427011684e-07, + "loss": 0.3301, + "step": 9563 + }, + { + "epoch": 0.61, + "grad_norm": 0.502068478136284, + "learning_rate": 3.4873630489949274e-07, + "loss": 0.0192, + "step": 9564 + }, + { + "epoch": 0.61, + "grad_norm": 0.395113070811715, + "learning_rate": 3.486378735510306e-07, + "loss": 0.0063, + "step": 9565 + }, + { + "epoch": 0.61, + "grad_norm": 1.7232580462657048, + "learning_rate": 3.4853944865998073e-07, + "loss": 0.2881, + "step": 9566 + }, + { + "epoch": 0.61, + "grad_norm": 0.9599165312685054, + "learning_rate": 3.484410302305427e-07, + "loss": 0.3233, + "step": 9567 + }, + { + "epoch": 0.61, + "grad_norm": 1.4703181695866034, + "learning_rate": 3.483426182669149e-07, + "loss": 0.1889, + "step": 9568 + }, + { + "epoch": 0.61, + "grad_norm": 1.146495240531375, + "learning_rate": 3.4824421277329585e-07, + "loss": 0.4046, + "step": 9569 + }, + { + "epoch": 0.61, + "grad_norm": 0.4314771979829906, + "learning_rate": 3.481458137538838e-07, + "loss": 0.1848, + "step": 9570 + }, + { + "epoch": 0.61, + "grad_norm": 0.9250065699509874, + "learning_rate": 3.4804742121287654e-07, + "loss": 0.4214, + "step": 9571 + }, + { + "epoch": 0.61, + "grad_norm": 0.3293318691339513, + "learning_rate": 3.4794903515447174e-07, + "loss": 0.188, + "step": 9572 + }, + { + "epoch": 0.61, + "grad_norm": 0.4910563428076659, + "learning_rate": 3.4785065558286696e-07, + "loss": 0.1863, + "step": 9573 + }, + { + "epoch": 0.61, + "grad_norm": 1.5969956253129487, + "learning_rate": 3.477522825022588e-07, + "loss": 0.0602, + "step": 9574 + }, + { + "epoch": 0.61, + "grad_norm": 0.3744545662743759, + "learning_rate": 3.476539159168446e-07, + "loss": 0.0669, + "step": 9575 + }, + { + "epoch": 0.61, + "grad_norm": 7.734299639347295, + "learning_rate": 3.475555558308205e-07, + "loss": 0.4074, + "step": 9576 + }, + { + "epoch": 0.61, + "grad_norm": 1.3504725250091854, + "learning_rate": 3.4745720224838275e-07, + "loss": 0.424, + "step": 9577 + }, + { + "epoch": 0.61, + "grad_norm": 0.4331875068739966, + "learning_rate": 3.4735885517372745e-07, + "loss": 0.0128, + "step": 9578 + }, + { + "epoch": 0.61, + "grad_norm": 0.42824816220074896, + "learning_rate": 3.472605146110501e-07, + "loss": 0.153, + "step": 9579 + }, + { + "epoch": 0.61, + "grad_norm": 0.4351001430071234, + "learning_rate": 3.4716218056454647e-07, + "loss": 0.1793, + "step": 9580 + }, + { + "epoch": 0.61, + "grad_norm": 1.4280842403548806, + "learning_rate": 3.4706385303841134e-07, + "loss": 0.1541, + "step": 9581 + }, + { + "epoch": 0.61, + "grad_norm": 0.5926766013134903, + "learning_rate": 3.4696553203683983e-07, + "loss": 0.1084, + "step": 9582 + }, + { + "epoch": 0.61, + "grad_norm": 1.0985680596498748, + "learning_rate": 3.4686721756402616e-07, + "loss": 0.2146, + "step": 9583 + }, + { + "epoch": 0.61, + "grad_norm": 0.8880529549801257, + "learning_rate": 3.4676890962416496e-07, + "loss": 0.017, + "step": 9584 + }, + { + "epoch": 0.61, + "grad_norm": 0.6677483589894967, + "learning_rate": 3.4667060822145e-07, + "loss": 0.2069, + "step": 9585 + }, + { + "epoch": 0.61, + "grad_norm": 1.0592907579725246, + "learning_rate": 3.4657231336007523e-07, + "loss": 0.13, + "step": 9586 + }, + { + "epoch": 0.61, + "grad_norm": 3.6592791785561625, + "learning_rate": 3.4647402504423385e-07, + "loss": 0.1032, + "step": 9587 + }, + { + "epoch": 0.61, + "grad_norm": 0.5331203369768155, + "learning_rate": 3.463757432781193e-07, + "loss": 0.2472, + "step": 9588 + }, + { + "epoch": 0.61, + "grad_norm": 7.921419468819464, + "learning_rate": 3.4627746806592444e-07, + "loss": 0.3503, + "step": 9589 + }, + { + "epoch": 0.61, + "grad_norm": 0.5934001506174543, + "learning_rate": 3.4617919941184166e-07, + "loss": 0.4153, + "step": 9590 + }, + { + "epoch": 0.61, + "grad_norm": 0.9025315697185343, + "learning_rate": 3.460809373200636e-07, + "loss": 0.051, + "step": 9591 + }, + { + "epoch": 0.61, + "grad_norm": 0.9931361896209535, + "learning_rate": 3.4598268179478194e-07, + "loss": 0.279, + "step": 9592 + }, + { + "epoch": 0.61, + "grad_norm": 6.088044584822752, + "learning_rate": 3.45884432840189e-07, + "loss": 0.059, + "step": 9593 + }, + { + "epoch": 0.61, + "grad_norm": 1.2314237640656722, + "learning_rate": 3.4578619046047567e-07, + "loss": 0.2023, + "step": 9594 + }, + { + "epoch": 0.61, + "grad_norm": 0.5092112323077743, + "learning_rate": 3.456879546598337e-07, + "loss": 0.093, + "step": 9595 + }, + { + "epoch": 0.61, + "grad_norm": 4.128925925754039, + "learning_rate": 3.4558972544245356e-07, + "loss": 0.1333, + "step": 9596 + }, + { + "epoch": 0.61, + "grad_norm": 1.8043015206803905, + "learning_rate": 3.454915028125263e-07, + "loss": 0.2976, + "step": 9597 + }, + { + "epoch": 0.61, + "grad_norm": 1.3068952477633422, + "learning_rate": 3.45393286774242e-07, + "loss": 0.2758, + "step": 9598 + }, + { + "epoch": 0.61, + "grad_norm": 0.9730817044959121, + "learning_rate": 3.452950773317911e-07, + "loss": 0.3222, + "step": 9599 + }, + { + "epoch": 0.61, + "grad_norm": 4.272755291092515, + "learning_rate": 3.4519687448936295e-07, + "loss": 0.2205, + "step": 9600 + }, + { + "epoch": 0.61, + "grad_norm": 1.5868444706628881, + "learning_rate": 3.4509867825114755e-07, + "loss": 0.1644, + "step": 9601 + }, + { + "epoch": 0.61, + "grad_norm": 1.8330587807219971, + "learning_rate": 3.450004886213337e-07, + "loss": 0.0324, + "step": 9602 + }, + { + "epoch": 0.61, + "grad_norm": 3.0068560904339283, + "learning_rate": 3.4490230560411085e-07, + "loss": 0.1586, + "step": 9603 + }, + { + "epoch": 0.61, + "grad_norm": 1.0001005538099705, + "learning_rate": 3.448041292036673e-07, + "loss": 0.2305, + "step": 9604 + }, + { + "epoch": 0.61, + "grad_norm": 0.4916345275421797, + "learning_rate": 3.447059594241916e-07, + "loss": 0.0538, + "step": 9605 + }, + { + "epoch": 0.61, + "grad_norm": 2.618687922918267, + "learning_rate": 3.446077962698718e-07, + "loss": 0.2956, + "step": 9606 + }, + { + "epoch": 0.61, + "grad_norm": 0.5696702585567011, + "learning_rate": 3.445096397448958e-07, + "loss": 0.2709, + "step": 9607 + }, + { + "epoch": 0.61, + "grad_norm": 0.1884662622061173, + "learning_rate": 3.4441148985345123e-07, + "loss": 0.0745, + "step": 9608 + }, + { + "epoch": 0.61, + "grad_norm": 1.1563562383245019, + "learning_rate": 3.443133465997251e-07, + "loss": 0.3654, + "step": 9609 + }, + { + "epoch": 0.61, + "grad_norm": 0.2414528424868526, + "learning_rate": 3.442152099879048e-07, + "loss": 0.0804, + "step": 9610 + }, + { + "epoch": 0.61, + "grad_norm": 0.9512809743923842, + "learning_rate": 3.441170800221765e-07, + "loss": 0.1162, + "step": 9611 + }, + { + "epoch": 0.61, + "grad_norm": 1.0785385980380031, + "learning_rate": 3.440189567067273e-07, + "loss": 0.0659, + "step": 9612 + }, + { + "epoch": 0.61, + "grad_norm": 1.5797862836251841, + "learning_rate": 3.4392084004574275e-07, + "loss": 0.3305, + "step": 9613 + }, + { + "epoch": 0.61, + "grad_norm": 0.6063870396351476, + "learning_rate": 3.4382273004340887e-07, + "loss": 0.2354, + "step": 9614 + }, + { + "epoch": 0.61, + "grad_norm": 2.5974500860148275, + "learning_rate": 3.4372462670391144e-07, + "loss": 0.1743, + "step": 9615 + }, + { + "epoch": 0.61, + "grad_norm": 1.3077113945833219, + "learning_rate": 3.436265300314355e-07, + "loss": 0.1657, + "step": 9616 + }, + { + "epoch": 0.61, + "grad_norm": 0.603166167192824, + "learning_rate": 3.4352844003016624e-07, + "loss": 0.2456, + "step": 9617 + }, + { + "epoch": 0.61, + "grad_norm": 0.2856105969290681, + "learning_rate": 3.434303567042881e-07, + "loss": 0.2376, + "step": 9618 + }, + { + "epoch": 0.61, + "grad_norm": 0.9429974027176391, + "learning_rate": 3.433322800579859e-07, + "loss": 0.2456, + "step": 9619 + }, + { + "epoch": 0.61, + "grad_norm": 0.880197302547717, + "learning_rate": 3.432342100954434e-07, + "loss": 0.1117, + "step": 9620 + }, + { + "epoch": 0.61, + "grad_norm": 0.2472669673392983, + "learning_rate": 3.431361468208448e-07, + "loss": 0.0912, + "step": 9621 + }, + { + "epoch": 0.61, + "grad_norm": 0.4222150174216388, + "learning_rate": 3.4303809023837327e-07, + "loss": 0.2073, + "step": 9622 + }, + { + "epoch": 0.61, + "grad_norm": 0.1100839309970587, + "learning_rate": 3.4294004035221246e-07, + "loss": 0.0011, + "step": 9623 + }, + { + "epoch": 0.61, + "grad_norm": 0.43061421143584283, + "learning_rate": 3.428419971665452e-07, + "loss": 0.1153, + "step": 9624 + }, + { + "epoch": 0.61, + "grad_norm": 1.1921919757851094, + "learning_rate": 3.4274396068555446e-07, + "loss": 0.3812, + "step": 9625 + }, + { + "epoch": 0.61, + "grad_norm": 0.4931510013296186, + "learning_rate": 3.4264593091342225e-07, + "loss": 0.144, + "step": 9626 + }, + { + "epoch": 0.61, + "grad_norm": 4.841924744830834, + "learning_rate": 3.425479078543312e-07, + "loss": 0.0298, + "step": 9627 + }, + { + "epoch": 0.61, + "grad_norm": 3.2763868997235948, + "learning_rate": 3.424498915124627e-07, + "loss": 0.2633, + "step": 9628 + }, + { + "epoch": 0.61, + "grad_norm": 0.4627046317141414, + "learning_rate": 3.423518818919987e-07, + "loss": 0.0877, + "step": 9629 + }, + { + "epoch": 0.61, + "grad_norm": 1.1415060188542492, + "learning_rate": 3.4225387899712017e-07, + "loss": 0.2889, + "step": 9630 + }, + { + "epoch": 0.61, + "grad_norm": 2.8697188070980397, + "learning_rate": 3.4215588283200847e-07, + "loss": 0.4644, + "step": 9631 + }, + { + "epoch": 0.61, + "grad_norm": 0.7868394702210959, + "learning_rate": 3.42057893400844e-07, + "loss": 0.1948, + "step": 9632 + }, + { + "epoch": 0.61, + "grad_norm": 0.7210343365033178, + "learning_rate": 3.419599107078072e-07, + "loss": 0.0843, + "step": 9633 + }, + { + "epoch": 0.61, + "grad_norm": 0.9195153597489307, + "learning_rate": 3.418619347570785e-07, + "loss": 0.0828, + "step": 9634 + }, + { + "epoch": 0.61, + "grad_norm": 0.31388945391280904, + "learning_rate": 3.4176396555283744e-07, + "loss": 0.2229, + "step": 9635 + }, + { + "epoch": 0.61, + "grad_norm": 16.59159076340234, + "learning_rate": 3.4166600309926387e-07, + "loss": 0.2786, + "step": 9636 + }, + { + "epoch": 0.61, + "grad_norm": 0.5840827296823353, + "learning_rate": 3.4156804740053665e-07, + "loss": 0.1857, + "step": 9637 + }, + { + "epoch": 0.61, + "grad_norm": 1.304655110458904, + "learning_rate": 3.414700984608352e-07, + "loss": 0.0982, + "step": 9638 + }, + { + "epoch": 0.61, + "grad_norm": 0.23187899125055195, + "learning_rate": 3.4137215628433793e-07, + "loss": 0.0079, + "step": 9639 + }, + { + "epoch": 0.61, + "grad_norm": 1.549274691583505, + "learning_rate": 3.412742208752234e-07, + "loss": 0.2203, + "step": 9640 + }, + { + "epoch": 0.61, + "grad_norm": 0.5426514275423491, + "learning_rate": 3.4117629223766965e-07, + "loss": 0.1101, + "step": 9641 + }, + { + "epoch": 0.61, + "grad_norm": 0.8701990932823046, + "learning_rate": 3.4107837037585456e-07, + "loss": 0.2096, + "step": 9642 + }, + { + "epoch": 0.61, + "grad_norm": 0.31101715403328684, + "learning_rate": 3.4098045529395557e-07, + "loss": 0.09, + "step": 9643 + }, + { + "epoch": 0.62, + "grad_norm": 0.49925692395483745, + "learning_rate": 3.408825469961503e-07, + "loss": 0.009, + "step": 9644 + }, + { + "epoch": 0.62, + "grad_norm": 1.0480465009927071, + "learning_rate": 3.407846454866153e-07, + "loss": 0.2926, + "step": 9645 + }, + { + "epoch": 0.62, + "grad_norm": 1.1996004083321903, + "learning_rate": 3.4068675076952735e-07, + "loss": 0.3273, + "step": 9646 + }, + { + "epoch": 0.62, + "grad_norm": 0.6171935971223315, + "learning_rate": 3.40588862849063e-07, + "loss": 0.1699, + "step": 9647 + }, + { + "epoch": 0.62, + "grad_norm": 0.4455311050832932, + "learning_rate": 3.40490981729398e-07, + "loss": 0.1556, + "step": 9648 + }, + { + "epoch": 0.62, + "grad_norm": 15.928084012225668, + "learning_rate": 3.403931074147085e-07, + "loss": 0.1599, + "step": 9649 + }, + { + "epoch": 0.62, + "grad_norm": 0.4388893883048158, + "learning_rate": 3.4029523990916984e-07, + "loss": 0.0156, + "step": 9650 + }, + { + "epoch": 0.62, + "grad_norm": 0.8493084512380781, + "learning_rate": 3.4019737921695734e-07, + "loss": 0.0307, + "step": 9651 + }, + { + "epoch": 0.62, + "grad_norm": 2.2978810644829313, + "learning_rate": 3.4009952534224573e-07, + "loss": 0.025, + "step": 9652 + }, + { + "epoch": 0.62, + "grad_norm": 1.0884515437922089, + "learning_rate": 3.400016782892101e-07, + "loss": 0.0912, + "step": 9653 + }, + { + "epoch": 0.62, + "grad_norm": 1.1045847972889378, + "learning_rate": 3.3990383806202427e-07, + "loss": 0.218, + "step": 9654 + }, + { + "epoch": 0.62, + "grad_norm": 1.3820573701876342, + "learning_rate": 3.398060046648627e-07, + "loss": 0.1362, + "step": 9655 + }, + { + "epoch": 0.62, + "grad_norm": 1.2204567299780504, + "learning_rate": 3.3970817810189883e-07, + "loss": 0.2979, + "step": 9656 + }, + { + "epoch": 0.62, + "grad_norm": 0.3358705612120792, + "learning_rate": 3.396103583773066e-07, + "loss": 0.0846, + "step": 9657 + }, + { + "epoch": 0.62, + "grad_norm": 0.7443790564141608, + "learning_rate": 3.3951254549525865e-07, + "loss": 0.2865, + "step": 9658 + }, + { + "epoch": 0.62, + "grad_norm": 1.864869465123004, + "learning_rate": 3.394147394599281e-07, + "loss": 0.1399, + "step": 9659 + }, + { + "epoch": 0.62, + "grad_norm": 1.0788401377321628, + "learning_rate": 3.3931694027548774e-07, + "loss": 0.1065, + "step": 9660 + }, + { + "epoch": 0.62, + "grad_norm": 0.8862960168875686, + "learning_rate": 3.392191479461096e-07, + "loss": 0.1619, + "step": 9661 + }, + { + "epoch": 0.62, + "grad_norm": 0.45105233462744826, + "learning_rate": 3.39121362475966e-07, + "loss": 0.011, + "step": 9662 + }, + { + "epoch": 0.62, + "grad_norm": 0.40562950885699883, + "learning_rate": 3.3902358386922823e-07, + "loss": 0.1351, + "step": 9663 + }, + { + "epoch": 0.62, + "grad_norm": 1.0528420955859363, + "learning_rate": 3.389258121300682e-07, + "loss": 0.142, + "step": 9664 + }, + { + "epoch": 0.62, + "grad_norm": 0.322898913995691, + "learning_rate": 3.388280472626567e-07, + "loss": 0.0512, + "step": 9665 + }, + { + "epoch": 0.62, + "grad_norm": 3.9924111999275667, + "learning_rate": 3.3873028927116474e-07, + "loss": 0.1011, + "step": 9666 + }, + { + "epoch": 0.62, + "grad_norm": 1.6810908359051329, + "learning_rate": 3.386325381597628e-07, + "loss": 0.1435, + "step": 9667 + }, + { + "epoch": 0.62, + "grad_norm": 0.973501401654767, + "learning_rate": 3.385347939326212e-07, + "loss": 0.161, + "step": 9668 + }, + { + "epoch": 0.62, + "grad_norm": 0.4225090616841344, + "learning_rate": 3.3843705659390977e-07, + "loss": 0.0649, + "step": 9669 + }, + { + "epoch": 0.62, + "grad_norm": 0.9910227797615438, + "learning_rate": 3.383393261477985e-07, + "loss": 0.1963, + "step": 9670 + }, + { + "epoch": 0.62, + "grad_norm": 0.8026710605913076, + "learning_rate": 3.382416025984563e-07, + "loss": 0.3476, + "step": 9671 + }, + { + "epoch": 0.62, + "grad_norm": 0.43782863834267566, + "learning_rate": 3.3814388595005274e-07, + "loss": 0.0489, + "step": 9672 + }, + { + "epoch": 0.62, + "grad_norm": 0.7834382778568777, + "learning_rate": 3.380461762067564e-07, + "loss": 0.3936, + "step": 9673 + }, + { + "epoch": 0.62, + "grad_norm": 1.140878511252451, + "learning_rate": 3.379484733727356e-07, + "loss": 0.2009, + "step": 9674 + }, + { + "epoch": 0.62, + "grad_norm": 2.145944887695228, + "learning_rate": 3.3785077745215867e-07, + "loss": 0.1061, + "step": 9675 + }, + { + "epoch": 0.62, + "grad_norm": 0.7828966360437236, + "learning_rate": 3.377530884491936e-07, + "loss": 0.133, + "step": 9676 + }, + { + "epoch": 0.62, + "grad_norm": 0.9790514628359507, + "learning_rate": 3.3765540636800795e-07, + "loss": 0.2241, + "step": 9677 + }, + { + "epoch": 0.62, + "grad_norm": 1.6244034945981407, + "learning_rate": 3.375577312127689e-07, + "loss": 0.0923, + "step": 9678 + }, + { + "epoch": 0.62, + "grad_norm": 2.1176109283196176, + "learning_rate": 3.374600629876437e-07, + "loss": 0.2063, + "step": 9679 + }, + { + "epoch": 0.62, + "grad_norm": 1.1400215488775414, + "learning_rate": 3.373624016967989e-07, + "loss": 0.339, + "step": 9680 + }, + { + "epoch": 0.62, + "grad_norm": 0.7266091394368728, + "learning_rate": 3.372647473444011e-07, + "loss": 0.1516, + "step": 9681 + }, + { + "epoch": 0.62, + "grad_norm": 0.998266862212202, + "learning_rate": 3.371670999346162e-07, + "loss": 0.226, + "step": 9682 + }, + { + "epoch": 0.62, + "grad_norm": 1.1303660680185235, + "learning_rate": 3.370694594716103e-07, + "loss": 0.3154, + "step": 9683 + }, + { + "epoch": 0.62, + "grad_norm": 0.5290700509410663, + "learning_rate": 3.369718259595486e-07, + "loss": 0.1477, + "step": 9684 + }, + { + "epoch": 0.62, + "grad_norm": 3.0769691318048444, + "learning_rate": 3.368741994025966e-07, + "loss": 0.1802, + "step": 9685 + }, + { + "epoch": 0.62, + "grad_norm": 1.1874478464550087, + "learning_rate": 3.367765798049193e-07, + "loss": 0.2394, + "step": 9686 + }, + { + "epoch": 0.62, + "grad_norm": 0.9981652510796589, + "learning_rate": 3.36678967170681e-07, + "loss": 0.2965, + "step": 9687 + }, + { + "epoch": 0.62, + "grad_norm": 0.38936891349843017, + "learning_rate": 3.365813615040465e-07, + "loss": 0.0568, + "step": 9688 + }, + { + "epoch": 0.62, + "grad_norm": 0.7970928639037027, + "learning_rate": 3.3648376280917946e-07, + "loss": 0.0278, + "step": 9689 + }, + { + "epoch": 0.62, + "grad_norm": 0.44544585074375437, + "learning_rate": 3.3638617109024405e-07, + "loss": 0.0273, + "step": 9690 + }, + { + "epoch": 0.62, + "grad_norm": 2.623778611811336, + "learning_rate": 3.3628858635140317e-07, + "loss": 0.3168, + "step": 9691 + }, + { + "epoch": 0.62, + "grad_norm": 0.838098790065471, + "learning_rate": 3.361910085968205e-07, + "loss": 0.1211, + "step": 9692 + }, + { + "epoch": 0.62, + "grad_norm": 0.5313964576854305, + "learning_rate": 3.360934378306586e-07, + "loss": 0.1963, + "step": 9693 + }, + { + "epoch": 0.62, + "grad_norm": 0.5870650851985146, + "learning_rate": 3.359958740570802e-07, + "loss": 0.2638, + "step": 9694 + }, + { + "epoch": 0.62, + "grad_norm": 0.7881511462464323, + "learning_rate": 3.3589831728024733e-07, + "loss": 0.2737, + "step": 9695 + }, + { + "epoch": 0.62, + "grad_norm": 1.0385252321720932, + "learning_rate": 3.358007675043224e-07, + "loss": 0.3148, + "step": 9696 + }, + { + "epoch": 0.62, + "grad_norm": 5.494414612317763, + "learning_rate": 3.357032247334666e-07, + "loss": 0.0761, + "step": 9697 + }, + { + "epoch": 0.62, + "grad_norm": 1.4388717112905187, + "learning_rate": 3.356056889718417e-07, + "loss": 0.2136, + "step": 9698 + }, + { + "epoch": 0.62, + "grad_norm": 1.1491435647868617, + "learning_rate": 3.355081602236086e-07, + "loss": 0.2962, + "step": 9699 + }, + { + "epoch": 0.62, + "grad_norm": 1.0983749076551765, + "learning_rate": 3.3541063849292785e-07, + "loss": 0.1626, + "step": 9700 + }, + { + "epoch": 0.62, + "grad_norm": 19.719457677317227, + "learning_rate": 3.3531312378396023e-07, + "loss": 0.1695, + "step": 9701 + }, + { + "epoch": 0.62, + "grad_norm": 0.5293311824003611, + "learning_rate": 3.352156161008658e-07, + "loss": 0.1906, + "step": 9702 + }, + { + "epoch": 0.62, + "grad_norm": 0.7800234034844945, + "learning_rate": 3.3511811544780445e-07, + "loss": 0.0977, + "step": 9703 + }, + { + "epoch": 0.62, + "grad_norm": 1.2912943653128102, + "learning_rate": 3.3502062182893563e-07, + "loss": 0.0991, + "step": 9704 + }, + { + "epoch": 0.62, + "grad_norm": 0.9972025373690413, + "learning_rate": 3.3492313524841896e-07, + "loss": 0.4256, + "step": 9705 + }, + { + "epoch": 0.62, + "grad_norm": 0.394740195822023, + "learning_rate": 3.34825655710413e-07, + "loss": 0.0541, + "step": 9706 + }, + { + "epoch": 0.62, + "grad_norm": 0.8023099451970512, + "learning_rate": 3.3472818321907677e-07, + "loss": 0.1039, + "step": 9707 + }, + { + "epoch": 0.62, + "grad_norm": 0.5450040172144064, + "learning_rate": 3.3463071777856826e-07, + "loss": 0.0747, + "step": 9708 + }, + { + "epoch": 0.62, + "grad_norm": 0.6629103130157872, + "learning_rate": 3.34533259393046e-07, + "loss": 0.2211, + "step": 9709 + }, + { + "epoch": 0.62, + "grad_norm": 1.0004173541174128, + "learning_rate": 3.344358080666674e-07, + "loss": 0.2864, + "step": 9710 + }, + { + "epoch": 0.62, + "grad_norm": 0.5784660571422798, + "learning_rate": 3.3433836380359017e-07, + "loss": 0.1308, + "step": 9711 + }, + { + "epoch": 0.62, + "grad_norm": 0.429189676289002, + "learning_rate": 3.342409266079711e-07, + "loss": 0.202, + "step": 9712 + }, + { + "epoch": 0.62, + "grad_norm": 5.455297020664025, + "learning_rate": 3.3414349648396755e-07, + "loss": 0.1935, + "step": 9713 + }, + { + "epoch": 0.62, + "grad_norm": 1.3197767456611609, + "learning_rate": 3.340460734357359e-07, + "loss": 0.015, + "step": 9714 + }, + { + "epoch": 0.62, + "grad_norm": 1.6251486683180227, + "learning_rate": 3.339486574674321e-07, + "loss": 0.105, + "step": 9715 + }, + { + "epoch": 0.62, + "grad_norm": 0.5383432674816208, + "learning_rate": 3.338512485832127e-07, + "loss": 0.2392, + "step": 9716 + }, + { + "epoch": 0.62, + "grad_norm": 0.5975307515061975, + "learning_rate": 3.3375384678723275e-07, + "loss": 0.2254, + "step": 9717 + }, + { + "epoch": 0.62, + "grad_norm": 1.2249868304715692, + "learning_rate": 3.3365645208364814e-07, + "loss": 0.206, + "step": 9718 + }, + { + "epoch": 0.62, + "grad_norm": 0.5727470827042784, + "learning_rate": 3.335590644766134e-07, + "loss": 0.1096, + "step": 9719 + }, + { + "epoch": 0.62, + "grad_norm": 0.26983518233568515, + "learning_rate": 3.3346168397028375e-07, + "loss": 0.1458, + "step": 9720 + }, + { + "epoch": 0.62, + "grad_norm": 0.7253371494377086, + "learning_rate": 3.333643105688134e-07, + "loss": 0.3655, + "step": 9721 + }, + { + "epoch": 0.62, + "grad_norm": 0.25924859035933084, + "learning_rate": 3.3326694427635657e-07, + "loss": 0.0089, + "step": 9722 + }, + { + "epoch": 0.62, + "grad_norm": 1.4638056171243157, + "learning_rate": 3.3316958509706695e-07, + "loss": 0.1153, + "step": 9723 + }, + { + "epoch": 0.62, + "grad_norm": 0.7050846533927227, + "learning_rate": 3.3307223303509835e-07, + "loss": 0.1181, + "step": 9724 + }, + { + "epoch": 0.62, + "grad_norm": 0.51324220020139, + "learning_rate": 3.329748880946037e-07, + "loss": 0.3162, + "step": 9725 + }, + { + "epoch": 0.62, + "grad_norm": 0.7059254358554043, + "learning_rate": 3.328775502797363e-07, + "loss": 0.1879, + "step": 9726 + }, + { + "epoch": 0.62, + "grad_norm": 1.454404948998227, + "learning_rate": 3.3278021959464857e-07, + "loss": 0.1415, + "step": 9727 + }, + { + "epoch": 0.62, + "grad_norm": 2.855437271924765, + "learning_rate": 3.3268289604349266e-07, + "loss": 0.2297, + "step": 9728 + }, + { + "epoch": 0.62, + "grad_norm": 0.8428411441037661, + "learning_rate": 3.325855796304208e-07, + "loss": 0.1808, + "step": 9729 + }, + { + "epoch": 0.62, + "grad_norm": 0.9022351116652456, + "learning_rate": 3.324882703595845e-07, + "loss": 0.3482, + "step": 9730 + }, + { + "epoch": 0.62, + "grad_norm": 0.5492724358666593, + "learning_rate": 3.3239096823513565e-07, + "loss": 0.1431, + "step": 9731 + }, + { + "epoch": 0.62, + "grad_norm": 4.918600336029918, + "learning_rate": 3.322936732612247e-07, + "loss": 0.1724, + "step": 9732 + }, + { + "epoch": 0.62, + "grad_norm": 0.7260838766547156, + "learning_rate": 3.321963854420031e-07, + "loss": 0.0123, + "step": 9733 + }, + { + "epoch": 0.62, + "grad_norm": 2.844162753335101, + "learning_rate": 3.3209910478162077e-07, + "loss": 0.2646, + "step": 9734 + }, + { + "epoch": 0.62, + "grad_norm": 1.2387889557100995, + "learning_rate": 3.320018312842284e-07, + "loss": 0.1458, + "step": 9735 + }, + { + "epoch": 0.62, + "grad_norm": 1.224096149135845, + "learning_rate": 3.3190456495397534e-07, + "loss": 0.2795, + "step": 9736 + }, + { + "epoch": 0.62, + "grad_norm": 0.9895912364231522, + "learning_rate": 3.318073057950117e-07, + "loss": 0.089, + "step": 9737 + }, + { + "epoch": 0.62, + "grad_norm": 9.013396254623336, + "learning_rate": 3.317100538114863e-07, + "loss": 0.1287, + "step": 9738 + }, + { + "epoch": 0.62, + "grad_norm": 4.944640453005538, + "learning_rate": 3.3161280900754845e-07, + "loss": 0.0451, + "step": 9739 + }, + { + "epoch": 0.62, + "grad_norm": 1.4478979865196622, + "learning_rate": 3.3151557138734655e-07, + "loss": 0.3024, + "step": 9740 + }, + { + "epoch": 0.62, + "grad_norm": 2.140879600682289, + "learning_rate": 3.314183409550292e-07, + "loss": 0.2722, + "step": 9741 + }, + { + "epoch": 0.62, + "grad_norm": 1.2735408774536647, + "learning_rate": 3.313211177147444e-07, + "loss": 0.0224, + "step": 9742 + }, + { + "epoch": 0.62, + "grad_norm": 1.2281358995379472, + "learning_rate": 3.3122390167063965e-07, + "loss": 0.2272, + "step": 9743 + }, + { + "epoch": 0.62, + "grad_norm": 0.4739552998196937, + "learning_rate": 3.311266928268626e-07, + "loss": 0.1578, + "step": 9744 + }, + { + "epoch": 0.62, + "grad_norm": 0.8496568020715792, + "learning_rate": 3.3102949118756017e-07, + "loss": 0.1799, + "step": 9745 + }, + { + "epoch": 0.62, + "grad_norm": 2.1675489398324537, + "learning_rate": 3.3093229675687945e-07, + "loss": 0.0326, + "step": 9746 + }, + { + "epoch": 0.62, + "grad_norm": 1.9339951663613293, + "learning_rate": 3.3083510953896676e-07, + "loss": 0.2022, + "step": 9747 + }, + { + "epoch": 0.62, + "grad_norm": 0.6903485604455774, + "learning_rate": 3.307379295379684e-07, + "loss": 0.2035, + "step": 9748 + }, + { + "epoch": 0.62, + "grad_norm": 1.0305568072894793, + "learning_rate": 3.3064075675803016e-07, + "loss": 0.2289, + "step": 9749 + }, + { + "epoch": 0.62, + "grad_norm": 3.941810575292261, + "learning_rate": 3.3054359120329786e-07, + "loss": 0.1156, + "step": 9750 + }, + { + "epoch": 0.62, + "grad_norm": 0.5583045424424432, + "learning_rate": 3.3044643287791643e-07, + "loss": 0.1339, + "step": 9751 + }, + { + "epoch": 0.62, + "grad_norm": 0.9404728184231548, + "learning_rate": 3.3034928178603115e-07, + "loss": 0.0288, + "step": 9752 + }, + { + "epoch": 0.62, + "grad_norm": 1.8899062681751952, + "learning_rate": 3.3025213793178643e-07, + "loss": 0.1342, + "step": 9753 + }, + { + "epoch": 0.62, + "grad_norm": 0.3018027126828369, + "learning_rate": 3.301550013193268e-07, + "loss": 0.2209, + "step": 9754 + }, + { + "epoch": 0.62, + "grad_norm": 0.4847358677401563, + "learning_rate": 3.300578719527963e-07, + "loss": 0.0411, + "step": 9755 + }, + { + "epoch": 0.62, + "grad_norm": 0.7310701196391066, + "learning_rate": 3.2996074983633846e-07, + "loss": 0.3191, + "step": 9756 + }, + { + "epoch": 0.62, + "grad_norm": 0.5122777123842382, + "learning_rate": 3.2986363497409707e-07, + "loss": 0.1452, + "step": 9757 + }, + { + "epoch": 0.62, + "grad_norm": 4.29930004610316, + "learning_rate": 3.2976652737021483e-07, + "loss": 0.1408, + "step": 9758 + }, + { + "epoch": 0.62, + "grad_norm": 0.7674445115922284, + "learning_rate": 3.2966942702883485e-07, + "loss": 0.1664, + "step": 9759 + }, + { + "epoch": 0.62, + "grad_norm": 0.2529105157325676, + "learning_rate": 3.2957233395409945e-07, + "loss": 0.0529, + "step": 9760 + }, + { + "epoch": 0.62, + "grad_norm": 0.7890271333132229, + "learning_rate": 3.29475248150151e-07, + "loss": 0.1605, + "step": 9761 + }, + { + "epoch": 0.62, + "grad_norm": 0.7592650520725693, + "learning_rate": 3.293781696211311e-07, + "loss": 0.2552, + "step": 9762 + }, + { + "epoch": 0.62, + "grad_norm": 1.1271683139864541, + "learning_rate": 3.2928109837118165e-07, + "loss": 0.1316, + "step": 9763 + }, + { + "epoch": 0.62, + "grad_norm": 12.793001244147618, + "learning_rate": 3.291840344044435e-07, + "loss": 0.0628, + "step": 9764 + }, + { + "epoch": 0.62, + "grad_norm": 0.2535944043000978, + "learning_rate": 3.2908697772505807e-07, + "loss": 0.0689, + "step": 9765 + }, + { + "epoch": 0.62, + "grad_norm": 5.561272604758733, + "learning_rate": 3.2898992833716563e-07, + "loss": 0.289, + "step": 9766 + }, + { + "epoch": 0.62, + "grad_norm": 0.3481681415578158, + "learning_rate": 3.288928862449066e-07, + "loss": 0.193, + "step": 9767 + }, + { + "epoch": 0.62, + "grad_norm": 2.1000027488492163, + "learning_rate": 3.2879585145242117e-07, + "loss": 0.1165, + "step": 9768 + }, + { + "epoch": 0.62, + "grad_norm": 0.7352536233546072, + "learning_rate": 3.2869882396384873e-07, + "loss": 0.1636, + "step": 9769 + }, + { + "epoch": 0.62, + "grad_norm": 2.1403764319347593, + "learning_rate": 3.286018037833289e-07, + "loss": 0.414, + "step": 9770 + }, + { + "epoch": 0.62, + "grad_norm": 0.6444888092476602, + "learning_rate": 3.2850479091500053e-07, + "loss": 0.0698, + "step": 9771 + }, + { + "epoch": 0.62, + "grad_norm": 0.8511599200217639, + "learning_rate": 3.284077853630027e-07, + "loss": 0.246, + "step": 9772 + }, + { + "epoch": 0.62, + "grad_norm": 0.27623584953854535, + "learning_rate": 3.2831078713147354e-07, + "loss": 0.0084, + "step": 9773 + }, + { + "epoch": 0.62, + "grad_norm": 1.2594911388306558, + "learning_rate": 3.2821379622455146e-07, + "loss": 0.2543, + "step": 9774 + }, + { + "epoch": 0.62, + "grad_norm": 18.190853615986125, + "learning_rate": 3.2811681264637405e-07, + "loss": 0.3407, + "step": 9775 + }, + { + "epoch": 0.62, + "grad_norm": 4.4178107490553025, + "learning_rate": 3.280198364010791e-07, + "loss": 0.3291, + "step": 9776 + }, + { + "epoch": 0.62, + "grad_norm": 2.980847201905259, + "learning_rate": 3.2792286749280346e-07, + "loss": 0.1604, + "step": 9777 + }, + { + "epoch": 0.62, + "grad_norm": 1.1848945969660434, + "learning_rate": 3.2782590592568436e-07, + "loss": 0.1019, + "step": 9778 + }, + { + "epoch": 0.62, + "grad_norm": 0.7675199347965304, + "learning_rate": 3.2772895170385816e-07, + "loss": 0.1948, + "step": 9779 + }, + { + "epoch": 0.62, + "grad_norm": 0.6438653739147412, + "learning_rate": 3.2763200483146125e-07, + "loss": 0.2464, + "step": 9780 + }, + { + "epoch": 0.62, + "grad_norm": 0.5752937854883382, + "learning_rate": 3.275350653126294e-07, + "loss": 0.2327, + "step": 9781 + }, + { + "epoch": 0.62, + "grad_norm": 0.3260922813825861, + "learning_rate": 3.274381331514986e-07, + "loss": 0.216, + "step": 9782 + }, + { + "epoch": 0.62, + "grad_norm": 0.6309515879234886, + "learning_rate": 3.2734120835220377e-07, + "loss": 0.1001, + "step": 9783 + }, + { + "epoch": 0.62, + "grad_norm": 0.9280792828751381, + "learning_rate": 3.2724429091887993e-07, + "loss": 0.2191, + "step": 9784 + }, + { + "epoch": 0.62, + "grad_norm": 1.1782041291181589, + "learning_rate": 3.271473808556622e-07, + "loss": 0.4337, + "step": 9785 + }, + { + "epoch": 0.62, + "grad_norm": 0.3325734765309915, + "learning_rate": 3.270504781666845e-07, + "loss": 0.1293, + "step": 9786 + }, + { + "epoch": 0.62, + "grad_norm": 0.11298133267691497, + "learning_rate": 3.269535828560812e-07, + "loss": 0.0005, + "step": 9787 + }, + { + "epoch": 0.62, + "grad_norm": 1.6464759162177716, + "learning_rate": 3.268566949279857e-07, + "loss": 0.1142, + "step": 9788 + }, + { + "epoch": 0.62, + "grad_norm": 11.900505378403551, + "learning_rate": 3.26759814386532e-07, + "loss": 0.1764, + "step": 9789 + }, + { + "epoch": 0.62, + "grad_norm": 1.108662414175651, + "learning_rate": 3.2666294123585253e-07, + "loss": 0.1668, + "step": 9790 + }, + { + "epoch": 0.62, + "grad_norm": 1.205937580649672, + "learning_rate": 3.2656607548008064e-07, + "loss": 0.2206, + "step": 9791 + }, + { + "epoch": 0.62, + "grad_norm": 1.1276834088401069, + "learning_rate": 3.2646921712334854e-07, + "loss": 0.1915, + "step": 9792 + }, + { + "epoch": 0.62, + "grad_norm": 1.041790434933491, + "learning_rate": 3.263723661697885e-07, + "loss": 0.0513, + "step": 9793 + }, + { + "epoch": 0.62, + "grad_norm": 0.45748085143230005, + "learning_rate": 3.2627552262353223e-07, + "loss": 0.1839, + "step": 9794 + }, + { + "epoch": 0.62, + "grad_norm": 0.37755251386908534, + "learning_rate": 3.2617868648871167e-07, + "loss": 0.0076, + "step": 9795 + }, + { + "epoch": 0.62, + "grad_norm": 0.9007939493937487, + "learning_rate": 3.2608185776945773e-07, + "loss": 0.0692, + "step": 9796 + }, + { + "epoch": 0.62, + "grad_norm": 0.41751782266332393, + "learning_rate": 3.259850364699012e-07, + "loss": 0.0901, + "step": 9797 + }, + { + "epoch": 0.62, + "grad_norm": 0.7244555180524256, + "learning_rate": 3.2588822259417294e-07, + "loss": 0.1607, + "step": 9798 + }, + { + "epoch": 0.62, + "grad_norm": 0.639742382083237, + "learning_rate": 3.2579141614640293e-07, + "loss": 0.0075, + "step": 9799 + }, + { + "epoch": 0.62, + "grad_norm": 0.83033785241535, + "learning_rate": 3.2569461713072145e-07, + "loss": 0.3367, + "step": 9800 + }, + { + "epoch": 0.63, + "grad_norm": 0.8677793501382356, + "learning_rate": 3.255978255512579e-07, + "loss": 0.2052, + "step": 9801 + }, + { + "epoch": 0.63, + "grad_norm": 0.8380638557553156, + "learning_rate": 3.2550104141214184e-07, + "loss": 0.251, + "step": 9802 + }, + { + "epoch": 0.63, + "grad_norm": 1.065585588617662, + "learning_rate": 3.2540426471750197e-07, + "loss": 0.0808, + "step": 9803 + }, + { + "epoch": 0.63, + "grad_norm": 1.1518702118880153, + "learning_rate": 3.253074954714674e-07, + "loss": 0.2295, + "step": 9804 + }, + { + "epoch": 0.63, + "grad_norm": 5.898787117839744, + "learning_rate": 3.2521073367816597e-07, + "loss": 0.0493, + "step": 9805 + }, + { + "epoch": 0.63, + "grad_norm": 0.6507183401082992, + "learning_rate": 3.251139793417263e-07, + "loss": 0.013, + "step": 9806 + }, + { + "epoch": 0.63, + "grad_norm": 1.3547308846636823, + "learning_rate": 3.250172324662756e-07, + "loss": 0.1052, + "step": 9807 + }, + { + "epoch": 0.63, + "grad_norm": 20.268166422279045, + "learning_rate": 3.249204930559417e-07, + "loss": 0.1381, + "step": 9808 + }, + { + "epoch": 0.63, + "grad_norm": 2.5414980160997245, + "learning_rate": 3.248237611148514e-07, + "loss": 0.2751, + "step": 9809 + }, + { + "epoch": 0.63, + "grad_norm": 1.27660086068766, + "learning_rate": 3.247270366471316e-07, + "loss": 0.1243, + "step": 9810 + }, + { + "epoch": 0.63, + "grad_norm": 1.5296173960649535, + "learning_rate": 3.246303196569089e-07, + "loss": 0.1863, + "step": 9811 + }, + { + "epoch": 0.63, + "grad_norm": 1.590121154826465, + "learning_rate": 3.2453361014830904e-07, + "loss": 0.023, + "step": 9812 + }, + { + "epoch": 0.63, + "grad_norm": 1.8378317787950598, + "learning_rate": 3.244369081254584e-07, + "loss": 0.1815, + "step": 9813 + }, + { + "epoch": 0.63, + "grad_norm": 1.678106466930295, + "learning_rate": 3.24340213592482e-07, + "loss": 0.2562, + "step": 9814 + }, + { + "epoch": 0.63, + "grad_norm": 8.061081552019047, + "learning_rate": 3.242435265535053e-07, + "loss": 0.2101, + "step": 9815 + }, + { + "epoch": 0.63, + "grad_norm": 1.4553365836982701, + "learning_rate": 3.241468470126529e-07, + "loss": 0.3213, + "step": 9816 + }, + { + "epoch": 0.63, + "grad_norm": 0.6583729938209066, + "learning_rate": 3.240501749740496e-07, + "loss": 0.1729, + "step": 9817 + }, + { + "epoch": 0.63, + "grad_norm": 0.5780146377092252, + "learning_rate": 3.239535104418196e-07, + "loss": 0.3041, + "step": 9818 + }, + { + "epoch": 0.63, + "grad_norm": 0.501996112418068, + "learning_rate": 3.2385685342008674e-07, + "loss": 0.2809, + "step": 9819 + }, + { + "epoch": 0.63, + "grad_norm": 0.6947544197855776, + "learning_rate": 3.2376020391297455e-07, + "loss": 0.1819, + "step": 9820 + }, + { + "epoch": 0.63, + "grad_norm": 0.5802086760688637, + "learning_rate": 3.236635619246064e-07, + "loss": 0.1085, + "step": 9821 + }, + { + "epoch": 0.63, + "grad_norm": 0.6207744953822706, + "learning_rate": 3.235669274591051e-07, + "loss": 0.074, + "step": 9822 + }, + { + "epoch": 0.63, + "grad_norm": 0.6310128693542639, + "learning_rate": 3.2347030052059354e-07, + "loss": 0.0043, + "step": 9823 + }, + { + "epoch": 0.63, + "grad_norm": 0.5035220755686219, + "learning_rate": 3.233736811131938e-07, + "loss": 0.3291, + "step": 9824 + }, + { + "epoch": 0.63, + "grad_norm": 1.3402357618229086, + "learning_rate": 3.232770692410277e-07, + "loss": 0.1443, + "step": 9825 + }, + { + "epoch": 0.63, + "grad_norm": 0.7722676121731585, + "learning_rate": 3.2318046490821716e-07, + "loss": 0.2044, + "step": 9826 + }, + { + "epoch": 0.63, + "grad_norm": 0.5447117817685545, + "learning_rate": 3.2308386811888346e-07, + "loss": 0.0425, + "step": 9827 + }, + { + "epoch": 0.63, + "grad_norm": 0.8505989702153116, + "learning_rate": 3.229872788771476e-07, + "loss": 0.4057, + "step": 9828 + }, + { + "epoch": 0.63, + "grad_norm": 0.6823708881589299, + "learning_rate": 3.2289069718713016e-07, + "loss": 0.1218, + "step": 9829 + }, + { + "epoch": 0.63, + "grad_norm": 1.0516683346630333, + "learning_rate": 3.227941230529517e-07, + "loss": 0.1493, + "step": 9830 + }, + { + "epoch": 0.63, + "grad_norm": 1.5065633695756446, + "learning_rate": 3.2269755647873214e-07, + "loss": 0.2123, + "step": 9831 + }, + { + "epoch": 0.63, + "grad_norm": 14.212562342131438, + "learning_rate": 3.226009974685914e-07, + "loss": 0.0154, + "step": 9832 + }, + { + "epoch": 0.63, + "grad_norm": 0.42259660204897154, + "learning_rate": 3.225044460266485e-07, + "loss": 0.1057, + "step": 9833 + }, + { + "epoch": 0.63, + "grad_norm": 0.89121376067872, + "learning_rate": 3.2240790215702297e-07, + "loss": 0.4477, + "step": 9834 + }, + { + "epoch": 0.63, + "grad_norm": 1.016530243754151, + "learning_rate": 3.223113658638331e-07, + "loss": 0.0705, + "step": 9835 + }, + { + "epoch": 0.63, + "grad_norm": 0.19237502739439066, + "learning_rate": 3.222148371511977e-07, + "loss": 0.1574, + "step": 9836 + }, + { + "epoch": 0.63, + "grad_norm": 0.9736830476847157, + "learning_rate": 3.221183160232348e-07, + "loss": 0.2513, + "step": 9837 + }, + { + "epoch": 0.63, + "grad_norm": 0.5915501474496268, + "learning_rate": 3.2202180248406196e-07, + "loss": 0.3128, + "step": 9838 + }, + { + "epoch": 0.63, + "grad_norm": 0.4582903970709344, + "learning_rate": 3.21925296537797e-07, + "loss": 0.0935, + "step": 9839 + }, + { + "epoch": 0.63, + "grad_norm": 0.6102361646494264, + "learning_rate": 3.218287981885567e-07, + "loss": 0.2526, + "step": 9840 + }, + { + "epoch": 0.63, + "grad_norm": 0.70409353789978, + "learning_rate": 3.2173230744045815e-07, + "loss": 0.3165, + "step": 9841 + }, + { + "epoch": 0.63, + "grad_norm": 0.3060056987874527, + "learning_rate": 3.216358242976176e-07, + "loss": 0.0452, + "step": 9842 + }, + { + "epoch": 0.63, + "grad_norm": 0.359948758444199, + "learning_rate": 3.2153934876415143e-07, + "loss": 0.1265, + "step": 9843 + }, + { + "epoch": 0.63, + "grad_norm": 1.0029068397163248, + "learning_rate": 3.214428808441754e-07, + "loss": 0.2236, + "step": 9844 + }, + { + "epoch": 0.63, + "grad_norm": 0.6554839420953738, + "learning_rate": 3.2134642054180493e-07, + "loss": 0.1272, + "step": 9845 + }, + { + "epoch": 0.63, + "grad_norm": 1.1511883519595218, + "learning_rate": 3.2124996786115524e-07, + "loss": 0.1783, + "step": 9846 + }, + { + "epoch": 0.63, + "grad_norm": 1.3290639556624124, + "learning_rate": 3.211535228063415e-07, + "loss": 0.3526, + "step": 9847 + }, + { + "epoch": 0.63, + "grad_norm": 6.436707333221691, + "learning_rate": 3.210570853814778e-07, + "loss": 0.0636, + "step": 9848 + }, + { + "epoch": 0.63, + "grad_norm": 0.9722318766730552, + "learning_rate": 3.209606555906788e-07, + "loss": 0.2851, + "step": 9849 + }, + { + "epoch": 0.63, + "grad_norm": 0.6854531809469963, + "learning_rate": 3.208642334380578e-07, + "loss": 0.2056, + "step": 9850 + }, + { + "epoch": 0.63, + "grad_norm": 2.1512496298708337, + "learning_rate": 3.2076781892772904e-07, + "loss": 0.1194, + "step": 9851 + }, + { + "epoch": 0.63, + "grad_norm": 0.9946664661922305, + "learning_rate": 3.2067141206380526e-07, + "loss": 0.0877, + "step": 9852 + }, + { + "epoch": 0.63, + "grad_norm": 0.7769647861589055, + "learning_rate": 3.2057501285039957e-07, + "loss": 0.2582, + "step": 9853 + }, + { + "epoch": 0.63, + "grad_norm": 0.6634267476529329, + "learning_rate": 3.204786212916245e-07, + "loss": 0.2364, + "step": 9854 + }, + { + "epoch": 0.63, + "grad_norm": 0.6357232327354747, + "learning_rate": 3.2038223739159225e-07, + "loss": 0.1074, + "step": 9855 + }, + { + "epoch": 0.63, + "grad_norm": 0.6956484779394868, + "learning_rate": 3.2028586115441504e-07, + "loss": 0.3608, + "step": 9856 + }, + { + "epoch": 0.63, + "grad_norm": 1.4483626126431408, + "learning_rate": 3.2018949258420404e-07, + "loss": 0.2857, + "step": 9857 + }, + { + "epoch": 0.63, + "grad_norm": 3.892691276179684, + "learning_rate": 3.2009313168507096e-07, + "loss": 0.1045, + "step": 9858 + }, + { + "epoch": 0.63, + "grad_norm": 5.604541284668023, + "learning_rate": 3.1999677846112634e-07, + "loss": 0.3529, + "step": 9859 + }, + { + "epoch": 0.63, + "grad_norm": 0.8117811118884894, + "learning_rate": 3.1990043291648116e-07, + "loss": 0.2828, + "step": 9860 + }, + { + "epoch": 0.63, + "grad_norm": 3.8544328729224833, + "learning_rate": 3.198040950552454e-07, + "loss": 0.2184, + "step": 9861 + }, + { + "epoch": 0.63, + "grad_norm": 1.107901832492373, + "learning_rate": 3.197077648815293e-07, + "loss": 0.211, + "step": 9862 + }, + { + "epoch": 0.63, + "grad_norm": 1.2094327134884784, + "learning_rate": 3.196114423994424e-07, + "loss": 0.1163, + "step": 9863 + }, + { + "epoch": 0.63, + "grad_norm": 1.2872753391107936, + "learning_rate": 3.19515127613094e-07, + "loss": 0.1766, + "step": 9864 + }, + { + "epoch": 0.63, + "grad_norm": 2.380807332028745, + "learning_rate": 3.1941882052659307e-07, + "loss": 0.2257, + "step": 9865 + }, + { + "epoch": 0.63, + "grad_norm": 1.1074461411061158, + "learning_rate": 3.1932252114404813e-07, + "loss": 0.3668, + "step": 9866 + }, + { + "epoch": 0.63, + "grad_norm": 0.38756982710411836, + "learning_rate": 3.1922622946956787e-07, + "loss": 0.1879, + "step": 9867 + }, + { + "epoch": 0.63, + "grad_norm": 1.1471559271998129, + "learning_rate": 3.191299455072598e-07, + "loss": 0.1824, + "step": 9868 + }, + { + "epoch": 0.63, + "grad_norm": 1.467852571762619, + "learning_rate": 3.190336692612321e-07, + "loss": 0.497, + "step": 9869 + }, + { + "epoch": 0.63, + "grad_norm": 0.8289119305925776, + "learning_rate": 3.1893740073559164e-07, + "loss": 0.0471, + "step": 9870 + }, + { + "epoch": 0.63, + "grad_norm": 0.2894254871999359, + "learning_rate": 3.1884113993444576e-07, + "loss": 0.1185, + "step": 9871 + }, + { + "epoch": 0.63, + "grad_norm": 1.2244586613989046, + "learning_rate": 3.18744886861901e-07, + "loss": 0.3413, + "step": 9872 + }, + { + "epoch": 0.63, + "grad_norm": 0.7645184562699384, + "learning_rate": 3.186486415220638e-07, + "loss": 0.3322, + "step": 9873 + }, + { + "epoch": 0.63, + "grad_norm": 0.7867369180073533, + "learning_rate": 3.1855240391903994e-07, + "loss": 0.2233, + "step": 9874 + }, + { + "epoch": 0.63, + "grad_norm": 0.21979088857614315, + "learning_rate": 3.1845617405693554e-07, + "loss": 0.0017, + "step": 9875 + }, + { + "epoch": 0.63, + "grad_norm": 4.412127161499392, + "learning_rate": 3.1835995193985546e-07, + "loss": 0.1026, + "step": 9876 + }, + { + "epoch": 0.63, + "grad_norm": 1.150974680950936, + "learning_rate": 3.1826373757190515e-07, + "loss": 0.0228, + "step": 9877 + }, + { + "epoch": 0.63, + "grad_norm": 0.5825104628199708, + "learning_rate": 3.1816753095718914e-07, + "loss": 0.2368, + "step": 9878 + }, + { + "epoch": 0.63, + "grad_norm": 4.99293123207642, + "learning_rate": 3.1807133209981167e-07, + "loss": 0.1077, + "step": 9879 + }, + { + "epoch": 0.63, + "grad_norm": 0.9097358517491003, + "learning_rate": 3.17975141003877e-07, + "loss": 0.3015, + "step": 9880 + }, + { + "epoch": 0.63, + "grad_norm": 1.226154462304313, + "learning_rate": 3.1787895767348856e-07, + "loss": 0.1915, + "step": 9881 + }, + { + "epoch": 0.63, + "grad_norm": 2.0056129689625313, + "learning_rate": 3.177827821127501e-07, + "loss": 0.1646, + "step": 9882 + }, + { + "epoch": 0.63, + "grad_norm": 1.5020985357185783, + "learning_rate": 3.1768661432576425e-07, + "loss": 0.4339, + "step": 9883 + }, + { + "epoch": 0.63, + "grad_norm": 10.327293405068815, + "learning_rate": 3.175904543166342e-07, + "loss": 0.238, + "step": 9884 + }, + { + "epoch": 0.63, + "grad_norm": 0.45995849192979227, + "learning_rate": 3.174943020894618e-07, + "loss": 0.2684, + "step": 9885 + }, + { + "epoch": 0.63, + "grad_norm": 0.683662106369194, + "learning_rate": 3.173981576483495e-07, + "loss": 0.3656, + "step": 9886 + }, + { + "epoch": 0.63, + "grad_norm": 3.639972937803985, + "learning_rate": 3.173020209973988e-07, + "loss": 0.0426, + "step": 9887 + }, + { + "epoch": 0.63, + "grad_norm": 0.6160806936059422, + "learning_rate": 3.172058921407112e-07, + "loss": 0.1846, + "step": 9888 + }, + { + "epoch": 0.63, + "grad_norm": 0.7642852608321751, + "learning_rate": 3.1710977108238764e-07, + "loss": 0.2798, + "step": 9889 + }, + { + "epoch": 0.63, + "grad_norm": 10.56581276208378, + "learning_rate": 3.170136578265289e-07, + "loss": 0.1302, + "step": 9890 + }, + { + "epoch": 0.63, + "grad_norm": 15.594260556517076, + "learning_rate": 3.1691755237723536e-07, + "loss": 0.1422, + "step": 9891 + }, + { + "epoch": 0.63, + "grad_norm": 1.893153347593312, + "learning_rate": 3.168214547386072e-07, + "loss": 0.2696, + "step": 9892 + }, + { + "epoch": 0.63, + "grad_norm": 1.4365150509337516, + "learning_rate": 3.1672536491474404e-07, + "loss": 0.1797, + "step": 9893 + }, + { + "epoch": 0.63, + "grad_norm": 5.122466333845425, + "learning_rate": 3.1662928290974514e-07, + "loss": 0.0621, + "step": 9894 + }, + { + "epoch": 0.63, + "grad_norm": 15.115066480424131, + "learning_rate": 3.165332087277097e-07, + "loss": 0.1453, + "step": 9895 + }, + { + "epoch": 0.63, + "grad_norm": 4.808008933131503, + "learning_rate": 3.164371423727362e-07, + "loss": 0.2404, + "step": 9896 + }, + { + "epoch": 0.63, + "grad_norm": 0.4273001064281147, + "learning_rate": 3.163410838489234e-07, + "loss": 0.1283, + "step": 9897 + }, + { + "epoch": 0.63, + "grad_norm": 1.0126728978340207, + "learning_rate": 3.162450331603691e-07, + "loss": 0.2811, + "step": 9898 + }, + { + "epoch": 0.63, + "grad_norm": 0.6151956085625986, + "learning_rate": 3.1614899031117113e-07, + "loss": 0.2404, + "step": 9899 + }, + { + "epoch": 0.63, + "grad_norm": 1.918530267726266, + "learning_rate": 3.160529553054267e-07, + "loss": 0.1715, + "step": 9900 + }, + { + "epoch": 0.63, + "grad_norm": 0.7940771940834439, + "learning_rate": 3.159569281472332e-07, + "loss": 0.0646, + "step": 9901 + }, + { + "epoch": 0.63, + "grad_norm": 8.132041643094976, + "learning_rate": 3.1586090884068685e-07, + "loss": 0.1998, + "step": 9902 + }, + { + "epoch": 0.63, + "grad_norm": 0.09350245611442531, + "learning_rate": 3.1576489738988455e-07, + "loss": 0.0017, + "step": 9903 + }, + { + "epoch": 0.63, + "grad_norm": 2.7649371838661816, + "learning_rate": 3.1566889379892193e-07, + "loss": 0.0868, + "step": 9904 + }, + { + "epoch": 0.63, + "grad_norm": 1.9467610536431719, + "learning_rate": 3.15572898071895e-07, + "loss": 0.2727, + "step": 9905 + }, + { + "epoch": 0.63, + "grad_norm": 1.8475691244522232, + "learning_rate": 3.1547691021289885e-07, + "loss": 0.1743, + "step": 9906 + }, + { + "epoch": 0.63, + "grad_norm": 1.3443031278498072, + "learning_rate": 3.1538093022602857e-07, + "loss": 0.0531, + "step": 9907 + }, + { + "epoch": 0.63, + "grad_norm": 0.4824038418934789, + "learning_rate": 3.1528495811537916e-07, + "loss": 0.251, + "step": 9908 + }, + { + "epoch": 0.63, + "grad_norm": 0.9748168955001419, + "learning_rate": 3.151889938850445e-07, + "loss": 0.074, + "step": 9909 + }, + { + "epoch": 0.63, + "grad_norm": 1.7990677806453492, + "learning_rate": 3.1509303753911916e-07, + "loss": 0.1646, + "step": 9910 + }, + { + "epoch": 0.63, + "grad_norm": 2.059348594169725, + "learning_rate": 3.149970890816963e-07, + "loss": 0.2524, + "step": 9911 + }, + { + "epoch": 0.63, + "grad_norm": 1.3788194423194713, + "learning_rate": 3.1490114851686975e-07, + "loss": 0.2684, + "step": 9912 + }, + { + "epoch": 0.63, + "grad_norm": 1.7774822863336062, + "learning_rate": 3.148052158487321e-07, + "loss": 0.0269, + "step": 9913 + }, + { + "epoch": 0.63, + "grad_norm": 1.3721846056736207, + "learning_rate": 3.147092910813764e-07, + "loss": 0.0962, + "step": 9914 + }, + { + "epoch": 0.63, + "grad_norm": 1.9218045254917826, + "learning_rate": 3.146133742188946e-07, + "loss": 0.1785, + "step": 9915 + }, + { + "epoch": 0.63, + "grad_norm": 0.2376069877986604, + "learning_rate": 3.145174652653791e-07, + "loss": 0.0841, + "step": 9916 + }, + { + "epoch": 0.63, + "grad_norm": 1.5314508960948032, + "learning_rate": 3.144215642249213e-07, + "loss": 0.129, + "step": 9917 + }, + { + "epoch": 0.63, + "grad_norm": 0.6824153588707195, + "learning_rate": 3.143256711016128e-07, + "loss": 0.2261, + "step": 9918 + }, + { + "epoch": 0.63, + "grad_norm": 1.1063518275775739, + "learning_rate": 3.1422978589954443e-07, + "loss": 0.1102, + "step": 9919 + }, + { + "epoch": 0.63, + "grad_norm": 0.5319474605010603, + "learning_rate": 3.1413390862280655e-07, + "loss": 0.2272, + "step": 9920 + }, + { + "epoch": 0.63, + "grad_norm": 1.5168683468987119, + "learning_rate": 3.1403803927549006e-07, + "loss": 0.595, + "step": 9921 + }, + { + "epoch": 0.63, + "grad_norm": 1.0728593540967328, + "learning_rate": 3.139421778616844e-07, + "loss": 0.1412, + "step": 9922 + }, + { + "epoch": 0.63, + "grad_norm": 1.1104532322306133, + "learning_rate": 3.1384632438547964e-07, + "loss": 0.2981, + "step": 9923 + }, + { + "epoch": 0.63, + "grad_norm": 0.8048437881054166, + "learning_rate": 3.137504788509648e-07, + "loss": 0.2572, + "step": 9924 + }, + { + "epoch": 0.63, + "grad_norm": 0.525706281740437, + "learning_rate": 3.1365464126222897e-07, + "loss": 0.1974, + "step": 9925 + }, + { + "epoch": 0.63, + "grad_norm": 0.3086253821196317, + "learning_rate": 3.135588116233607e-07, + "loss": 0.0059, + "step": 9926 + }, + { + "epoch": 0.63, + "grad_norm": 0.3348227131092952, + "learning_rate": 3.1346298993844844e-07, + "loss": 0.2634, + "step": 9927 + }, + { + "epoch": 0.63, + "grad_norm": 1.4285186408888528, + "learning_rate": 3.1336717621157986e-07, + "loss": 0.0997, + "step": 9928 + }, + { + "epoch": 0.63, + "grad_norm": 0.7934943600101988, + "learning_rate": 3.132713704468429e-07, + "loss": 0.2026, + "step": 9929 + }, + { + "epoch": 0.63, + "grad_norm": 4.100488292872332, + "learning_rate": 3.131755726483245e-07, + "loss": 0.1602, + "step": 9930 + }, + { + "epoch": 0.63, + "grad_norm": 0.5042898575656839, + "learning_rate": 3.1307978282011186e-07, + "loss": 0.089, + "step": 9931 + }, + { + "epoch": 0.63, + "grad_norm": 1.345181178870036, + "learning_rate": 3.129840009662913e-07, + "loss": 0.1918, + "step": 9932 + }, + { + "epoch": 0.63, + "grad_norm": 1.9917433650929446, + "learning_rate": 3.1288822709094933e-07, + "loss": 0.1131, + "step": 9933 + }, + { + "epoch": 0.63, + "grad_norm": 0.45394084261043055, + "learning_rate": 3.1279246119817174e-07, + "loss": 0.2909, + "step": 9934 + }, + { + "epoch": 0.63, + "grad_norm": 0.5376000378997712, + "learning_rate": 3.1269670329204393e-07, + "loss": 0.1752, + "step": 9935 + }, + { + "epoch": 0.63, + "grad_norm": 1.782579958651778, + "learning_rate": 3.126009533766515e-07, + "loss": 0.174, + "step": 9936 + }, + { + "epoch": 0.63, + "grad_norm": 3.1027129316539956, + "learning_rate": 3.1250521145607894e-07, + "loss": 0.1202, + "step": 9937 + }, + { + "epoch": 0.63, + "grad_norm": 1.6921228097307655, + "learning_rate": 3.124094775344112e-07, + "loss": 0.216, + "step": 9938 + }, + { + "epoch": 0.63, + "grad_norm": 0.8306429099856422, + "learning_rate": 3.1231375161573194e-07, + "loss": 0.1582, + "step": 9939 + }, + { + "epoch": 0.63, + "grad_norm": 1.0218445145187087, + "learning_rate": 3.122180337041256e-07, + "loss": 0.0916, + "step": 9940 + }, + { + "epoch": 0.63, + "grad_norm": 1.3578911721646219, + "learning_rate": 3.121223238036752e-07, + "loss": 0.3942, + "step": 9941 + }, + { + "epoch": 0.63, + "grad_norm": 6.109048368111545, + "learning_rate": 3.120266219184643e-07, + "loss": 0.1368, + "step": 9942 + }, + { + "epoch": 0.63, + "grad_norm": 0.5985847161275715, + "learning_rate": 3.1193092805257554e-07, + "loss": 0.22, + "step": 9943 + }, + { + "epoch": 0.63, + "grad_norm": 0.30522493991220756, + "learning_rate": 3.118352422100915e-07, + "loss": 0.0889, + "step": 9944 + }, + { + "epoch": 0.63, + "grad_norm": 0.5032988524392258, + "learning_rate": 3.117395643950941e-07, + "loss": 0.1759, + "step": 9945 + }, + { + "epoch": 0.63, + "grad_norm": 4.1292612129831685, + "learning_rate": 3.116438946116656e-07, + "loss": 0.294, + "step": 9946 + }, + { + "epoch": 0.63, + "grad_norm": 0.48900880580396, + "learning_rate": 3.115482328638872e-07, + "loss": 0.0879, + "step": 9947 + }, + { + "epoch": 0.63, + "grad_norm": 8.261090140424646, + "learning_rate": 3.1145257915583975e-07, + "loss": 0.1679, + "step": 9948 + }, + { + "epoch": 0.63, + "grad_norm": 1.0631672282215923, + "learning_rate": 3.1135693349160463e-07, + "loss": 0.1693, + "step": 9949 + }, + { + "epoch": 0.63, + "grad_norm": 0.1328929962914109, + "learning_rate": 3.112612958752617e-07, + "loss": 0.0741, + "step": 9950 + }, + { + "epoch": 0.63, + "grad_norm": 0.47584586979102117, + "learning_rate": 3.111656663108914e-07, + "loss": 0.0907, + "step": 9951 + }, + { + "epoch": 0.63, + "grad_norm": 1.8814139796048461, + "learning_rate": 3.110700448025732e-07, + "loss": 0.1669, + "step": 9952 + }, + { + "epoch": 0.63, + "grad_norm": 4.525102249279778, + "learning_rate": 3.1097443135438696e-07, + "loss": 0.143, + "step": 9953 + }, + { + "epoch": 0.63, + "grad_norm": 3.360034123319944, + "learning_rate": 3.1087882597041125e-07, + "loss": 0.2441, + "step": 9954 + }, + { + "epoch": 0.63, + "grad_norm": 0.8884149877409633, + "learning_rate": 3.1078322865472517e-07, + "loss": 0.2257, + "step": 9955 + }, + { + "epoch": 0.63, + "grad_norm": 1.4119783766819232, + "learning_rate": 3.1068763941140676e-07, + "loss": 0.456, + "step": 9956 + }, + { + "epoch": 0.63, + "grad_norm": 3.5584195161214445, + "learning_rate": 3.105920582445344e-07, + "loss": 0.2614, + "step": 9957 + }, + { + "epoch": 0.64, + "grad_norm": 1.6627540683760709, + "learning_rate": 3.104964851581855e-07, + "loss": 0.2102, + "step": 9958 + }, + { + "epoch": 0.64, + "grad_norm": 0.623198031090522, + "learning_rate": 3.104009201564376e-07, + "loss": 0.301, + "step": 9959 + }, + { + "epoch": 0.64, + "grad_norm": 5.718368521814724, + "learning_rate": 3.103053632433674e-07, + "loss": 0.0962, + "step": 9960 + }, + { + "epoch": 0.64, + "grad_norm": 0.4246775664980575, + "learning_rate": 3.102098144230518e-07, + "loss": 0.1491, + "step": 9961 + }, + { + "epoch": 0.64, + "grad_norm": 0.3173459440798687, + "learning_rate": 3.101142736995672e-07, + "loss": 0.0318, + "step": 9962 + }, + { + "epoch": 0.64, + "grad_norm": 4.97633954457249, + "learning_rate": 3.1001874107698916e-07, + "loss": 0.0127, + "step": 9963 + }, + { + "epoch": 0.64, + "grad_norm": 1.2916660812561684, + "learning_rate": 3.0992321655939377e-07, + "loss": 0.2368, + "step": 9964 + }, + { + "epoch": 0.64, + "grad_norm": 0.851010525009808, + "learning_rate": 3.0982770015085586e-07, + "loss": 0.112, + "step": 9965 + }, + { + "epoch": 0.64, + "grad_norm": 11.920113350536923, + "learning_rate": 3.097321918554507e-07, + "loss": 0.2247, + "step": 9966 + }, + { + "epoch": 0.64, + "grad_norm": 0.6563612146386266, + "learning_rate": 3.096366916772526e-07, + "loss": 0.1539, + "step": 9967 + }, + { + "epoch": 0.64, + "grad_norm": 0.80807293742905, + "learning_rate": 3.09541199620336e-07, + "loss": 0.0922, + "step": 9968 + }, + { + "epoch": 0.64, + "grad_norm": 0.29216820415518563, + "learning_rate": 3.0944571568877466e-07, + "loss": 0.2328, + "step": 9969 + }, + { + "epoch": 0.64, + "grad_norm": 0.7729377265353073, + "learning_rate": 3.093502398866422e-07, + "loss": 0.0259, + "step": 9970 + }, + { + "epoch": 0.64, + "grad_norm": 1.0381622148450527, + "learning_rate": 3.0925477221801156e-07, + "loss": 0.4978, + "step": 9971 + }, + { + "epoch": 0.64, + "grad_norm": 0.22838959937936235, + "learning_rate": 3.0915931268695604e-07, + "loss": 0.0997, + "step": 9972 + }, + { + "epoch": 0.64, + "grad_norm": 4.900846762164515, + "learning_rate": 3.090638612975477e-07, + "loss": 0.1945, + "step": 9973 + }, + { + "epoch": 0.64, + "grad_norm": 0.5443537803913264, + "learning_rate": 3.089684180538591e-07, + "loss": 0.1917, + "step": 9974 + }, + { + "epoch": 0.64, + "grad_norm": 1.2006464030075767, + "learning_rate": 3.0887298295996177e-07, + "loss": 0.2563, + "step": 9975 + }, + { + "epoch": 0.64, + "grad_norm": 0.7391239363071468, + "learning_rate": 3.0877755601992694e-07, + "loss": 0.1639, + "step": 9976 + }, + { + "epoch": 0.64, + "grad_norm": 1.6390049803932814, + "learning_rate": 3.0868213723782616e-07, + "loss": 0.0482, + "step": 9977 + }, + { + "epoch": 0.64, + "grad_norm": 0.4805558010039064, + "learning_rate": 3.0858672661772987e-07, + "loss": 0.3875, + "step": 9978 + }, + { + "epoch": 0.64, + "grad_norm": 0.9976928758079943, + "learning_rate": 3.0849132416370883e-07, + "loss": 0.1156, + "step": 9979 + }, + { + "epoch": 0.64, + "grad_norm": 0.45979684673637516, + "learning_rate": 3.0839592987983264e-07, + "loss": 0.1015, + "step": 9980 + }, + { + "epoch": 0.64, + "grad_norm": 0.4793416610666731, + "learning_rate": 3.0830054377017147e-07, + "loss": 0.1903, + "step": 9981 + }, + { + "epoch": 0.64, + "grad_norm": 0.5359966058214101, + "learning_rate": 3.082051658387943e-07, + "loss": 0.087, + "step": 9982 + }, + { + "epoch": 0.64, + "grad_norm": 3.385229362388537, + "learning_rate": 3.081097960897705e-07, + "loss": 0.27, + "step": 9983 + }, + { + "epoch": 0.64, + "grad_norm": 0.6884951815819341, + "learning_rate": 3.0801443452716827e-07, + "loss": 0.0878, + "step": 9984 + }, + { + "epoch": 0.64, + "grad_norm": 1.112456931004475, + "learning_rate": 3.079190811550565e-07, + "loss": 0.4376, + "step": 9985 + }, + { + "epoch": 0.64, + "grad_norm": 3.4666678772114836, + "learning_rate": 3.078237359775026e-07, + "loss": 0.2145, + "step": 9986 + }, + { + "epoch": 0.64, + "grad_norm": 0.46319190130088983, + "learning_rate": 3.0772839899857463e-07, + "loss": 0.1443, + "step": 9987 + }, + { + "epoch": 0.64, + "grad_norm": 0.9449454412277086, + "learning_rate": 3.0763307022233967e-07, + "loss": 0.2502, + "step": 9988 + }, + { + "epoch": 0.64, + "grad_norm": 0.5370357256102969, + "learning_rate": 3.075377496528645e-07, + "loss": 0.2487, + "step": 9989 + }, + { + "epoch": 0.64, + "grad_norm": 1.35345407023433, + "learning_rate": 3.07442437294216e-07, + "loss": 0.0602, + "step": 9990 + }, + { + "epoch": 0.64, + "grad_norm": 0.7936235863106439, + "learning_rate": 3.0734713315046004e-07, + "loss": 0.3272, + "step": 9991 + }, + { + "epoch": 0.64, + "grad_norm": 0.4594258122530349, + "learning_rate": 3.0725183722566286e-07, + "loss": 0.0157, + "step": 9992 + }, + { + "epoch": 0.64, + "grad_norm": 0.9367334480193928, + "learning_rate": 3.0715654952388954e-07, + "loss": 0.242, + "step": 9993 + }, + { + "epoch": 0.64, + "grad_norm": 9.361592574750528, + "learning_rate": 3.0706127004920557e-07, + "loss": 0.2584, + "step": 9994 + }, + { + "epoch": 0.64, + "grad_norm": 0.7880141354801808, + "learning_rate": 3.0696599880567576e-07, + "loss": 0.2367, + "step": 9995 + }, + { + "epoch": 0.64, + "grad_norm": 0.7058275558300987, + "learning_rate": 3.0687073579736443e-07, + "loss": 0.4972, + "step": 9996 + }, + { + "epoch": 0.64, + "grad_norm": 0.6251495941489278, + "learning_rate": 3.067754810283356e-07, + "loss": 0.2803, + "step": 9997 + }, + { + "epoch": 0.64, + "grad_norm": 0.5206111780187221, + "learning_rate": 3.0668023450265343e-07, + "loss": 0.075, + "step": 9998 + }, + { + "epoch": 0.64, + "grad_norm": 0.4054752988384419, + "learning_rate": 3.0658499622438093e-07, + "loss": 0.137, + "step": 9999 + }, + { + "epoch": 0.64, + "grad_norm": 0.7106065029504237, + "learning_rate": 3.0648976619758143e-07, + "loss": 0.1458, + "step": 10000 + }, + { + "epoch": 0.64, + "grad_norm": 3.6995757523269095, + "learning_rate": 3.0639454442631735e-07, + "loss": 0.0273, + "step": 10001 + }, + { + "epoch": 0.64, + "grad_norm": 0.5310302256045529, + "learning_rate": 3.0629933091465136e-07, + "loss": 0.1255, + "step": 10002 + }, + { + "epoch": 0.64, + "grad_norm": 0.745436216301801, + "learning_rate": 3.062041256666452e-07, + "loss": 0.2902, + "step": 10003 + }, + { + "epoch": 0.64, + "grad_norm": 1.6331744260702172, + "learning_rate": 3.0610892868636076e-07, + "loss": 0.3064, + "step": 10004 + }, + { + "epoch": 0.64, + "grad_norm": 2.9345158770750337, + "learning_rate": 3.06013739977859e-07, + "loss": 0.1613, + "step": 10005 + }, + { + "epoch": 0.64, + "grad_norm": 1.3799141040742937, + "learning_rate": 3.0591855954520106e-07, + "loss": 0.2232, + "step": 10006 + }, + { + "epoch": 0.64, + "grad_norm": 1.4388784706078745, + "learning_rate": 3.0582338739244765e-07, + "loss": 0.0198, + "step": 10007 + }, + { + "epoch": 0.64, + "grad_norm": 1.048807729296082, + "learning_rate": 3.0572822352365873e-07, + "loss": 0.4561, + "step": 10008 + }, + { + "epoch": 0.64, + "grad_norm": 0.7215751437856766, + "learning_rate": 3.0563306794289455e-07, + "loss": 0.2908, + "step": 10009 + }, + { + "epoch": 0.64, + "grad_norm": 0.5288074995535735, + "learning_rate": 3.055379206542142e-07, + "loss": 0.0895, + "step": 10010 + }, + { + "epoch": 0.64, + "grad_norm": 0.420119204931513, + "learning_rate": 3.0544278166167725e-07, + "loss": 0.2203, + "step": 10011 + }, + { + "epoch": 0.64, + "grad_norm": 0.9192937025417116, + "learning_rate": 3.0534765096934214e-07, + "loss": 0.324, + "step": 10012 + }, + { + "epoch": 0.64, + "grad_norm": 0.3216767944270263, + "learning_rate": 3.0525252858126765e-07, + "loss": 0.1034, + "step": 10013 + }, + { + "epoch": 0.64, + "grad_norm": 0.8025912749416866, + "learning_rate": 3.051574145015118e-07, + "loss": 0.419, + "step": 10014 + }, + { + "epoch": 0.64, + "grad_norm": 0.38860547799048384, + "learning_rate": 3.050623087341323e-07, + "loss": 0.2668, + "step": 10015 + }, + { + "epoch": 0.64, + "grad_norm": 0.30647173013463946, + "learning_rate": 3.049672112831867e-07, + "loss": 0.2673, + "step": 10016 + }, + { + "epoch": 0.64, + "grad_norm": 1.1634362172846398, + "learning_rate": 3.048721221527317e-07, + "loss": 0.1805, + "step": 10017 + }, + { + "epoch": 0.64, + "grad_norm": 1.2710136400778729, + "learning_rate": 3.047770413468245e-07, + "loss": 0.1133, + "step": 10018 + }, + { + "epoch": 0.64, + "grad_norm": 0.8712449299084689, + "learning_rate": 3.046819688695209e-07, + "loss": 0.3285, + "step": 10019 + }, + { + "epoch": 0.64, + "grad_norm": 2.6709309625179394, + "learning_rate": 3.0458690472487735e-07, + "loss": 0.1468, + "step": 10020 + }, + { + "epoch": 0.64, + "grad_norm": 2.8985259676730797, + "learning_rate": 3.0449184891694914e-07, + "loss": 0.1437, + "step": 10021 + }, + { + "epoch": 0.64, + "grad_norm": 2.161120386247777, + "learning_rate": 3.0439680144979174e-07, + "loss": 0.4697, + "step": 10022 + }, + { + "epoch": 0.64, + "grad_norm": 1.5839040079934381, + "learning_rate": 3.0430176232745984e-07, + "loss": 0.1546, + "step": 10023 + }, + { + "epoch": 0.64, + "grad_norm": 0.46234028993005594, + "learning_rate": 3.0420673155400846e-07, + "loss": 0.0834, + "step": 10024 + }, + { + "epoch": 0.64, + "grad_norm": 0.57650839058358, + "learning_rate": 3.041117091334913e-07, + "loss": 0.3336, + "step": 10025 + }, + { + "epoch": 0.64, + "grad_norm": 1.0261905910225066, + "learning_rate": 3.040166950699625e-07, + "loss": 0.3451, + "step": 10026 + }, + { + "epoch": 0.64, + "grad_norm": 0.9864322855566573, + "learning_rate": 3.039216893674753e-07, + "loss": 0.2225, + "step": 10027 + }, + { + "epoch": 0.64, + "grad_norm": 1.4240512723345102, + "learning_rate": 3.038266920300833e-07, + "loss": 0.2028, + "step": 10028 + }, + { + "epoch": 0.64, + "grad_norm": 0.4419891423604961, + "learning_rate": 3.037317030618388e-07, + "loss": 0.0258, + "step": 10029 + }, + { + "epoch": 0.64, + "grad_norm": 1.1329131507572303, + "learning_rate": 3.036367224667944e-07, + "loss": 0.3439, + "step": 10030 + }, + { + "epoch": 0.64, + "grad_norm": 1.5758442686255685, + "learning_rate": 3.0354175024900214e-07, + "loss": 0.1377, + "step": 10031 + }, + { + "epoch": 0.64, + "grad_norm": 1.2849863684374576, + "learning_rate": 3.0344678641251364e-07, + "loss": 0.4759, + "step": 10032 + }, + { + "epoch": 0.64, + "grad_norm": 1.2759617670586145, + "learning_rate": 3.0335183096138064e-07, + "loss": 0.351, + "step": 10033 + }, + { + "epoch": 0.64, + "grad_norm": 0.7502190509325263, + "learning_rate": 3.0325688389965355e-07, + "loss": 0.3014, + "step": 10034 + }, + { + "epoch": 0.64, + "grad_norm": 1.2910158057038357, + "learning_rate": 3.0316194523138355e-07, + "loss": 0.2676, + "step": 10035 + }, + { + "epoch": 0.64, + "grad_norm": 0.9492687373755502, + "learning_rate": 3.030670149606205e-07, + "loss": 0.2831, + "step": 10036 + }, + { + "epoch": 0.64, + "grad_norm": 0.22668594642335435, + "learning_rate": 3.029720930914146e-07, + "loss": 0.0998, + "step": 10037 + }, + { + "epoch": 0.64, + "grad_norm": 0.43713275265812845, + "learning_rate": 3.0287717962781506e-07, + "loss": 0.3113, + "step": 10038 + }, + { + "epoch": 0.64, + "grad_norm": 1.089456274733409, + "learning_rate": 3.0278227457387144e-07, + "loss": 0.1938, + "step": 10039 + }, + { + "epoch": 0.64, + "grad_norm": 0.5208155172137892, + "learning_rate": 3.0268737793363246e-07, + "loss": 0.1167, + "step": 10040 + }, + { + "epoch": 0.64, + "grad_norm": 0.9715374167494162, + "learning_rate": 3.025924897111466e-07, + "loss": 0.2882, + "step": 10041 + }, + { + "epoch": 0.64, + "grad_norm": 0.9099912635361547, + "learning_rate": 3.024976099104618e-07, + "loss": 0.583, + "step": 10042 + }, + { + "epoch": 0.64, + "grad_norm": 0.437331688200397, + "learning_rate": 3.0240273853562625e-07, + "loss": 0.1859, + "step": 10043 + }, + { + "epoch": 0.64, + "grad_norm": 1.2434999435163299, + "learning_rate": 3.023078755906871e-07, + "loss": 0.2225, + "step": 10044 + }, + { + "epoch": 0.64, + "grad_norm": 1.2345209468975693, + "learning_rate": 3.0221302107969114e-07, + "loss": 0.39, + "step": 10045 + }, + { + "epoch": 0.64, + "grad_norm": 0.37678068180372903, + "learning_rate": 3.021181750066856e-07, + "loss": 0.3625, + "step": 10046 + }, + { + "epoch": 0.64, + "grad_norm": 0.7321066389547666, + "learning_rate": 3.0202333737571616e-07, + "loss": 0.1433, + "step": 10047 + }, + { + "epoch": 0.64, + "grad_norm": 0.3389152957541529, + "learning_rate": 3.0192850819082937e-07, + "loss": 0.2074, + "step": 10048 + }, + { + "epoch": 0.64, + "grad_norm": 0.9342221122330095, + "learning_rate": 3.018336874560705e-07, + "loss": 0.2716, + "step": 10049 + }, + { + "epoch": 0.64, + "grad_norm": 2.2871994912978746, + "learning_rate": 3.017388751754849e-07, + "loss": 0.4026, + "step": 10050 + }, + { + "epoch": 0.64, + "grad_norm": 0.8573237685064583, + "learning_rate": 3.016440713531174e-07, + "loss": 0.3745, + "step": 10051 + }, + { + "epoch": 0.64, + "grad_norm": 0.9915866499862802, + "learning_rate": 3.0154927599301274e-07, + "loss": 0.5495, + "step": 10052 + }, + { + "epoch": 0.64, + "grad_norm": 0.1618492345434936, + "learning_rate": 3.014544890992147e-07, + "loss": 0.0037, + "step": 10053 + }, + { + "epoch": 0.64, + "grad_norm": 0.23930629093503197, + "learning_rate": 3.013597106757674e-07, + "loss": 0.0881, + "step": 10054 + }, + { + "epoch": 0.64, + "grad_norm": 2.7039139070001506, + "learning_rate": 3.0126494072671405e-07, + "loss": 0.0285, + "step": 10055 + }, + { + "epoch": 0.64, + "grad_norm": 2.682928743757131, + "learning_rate": 3.01170179256098e-07, + "loss": 0.1217, + "step": 10056 + }, + { + "epoch": 0.64, + "grad_norm": 1.1586214481233266, + "learning_rate": 3.0107542626796165e-07, + "loss": 0.2061, + "step": 10057 + }, + { + "epoch": 0.64, + "grad_norm": 0.33420648078880516, + "learning_rate": 3.009806817663475e-07, + "loss": 0.0271, + "step": 10058 + }, + { + "epoch": 0.64, + "grad_norm": 0.3803203391794372, + "learning_rate": 3.0088594575529774e-07, + "loss": 0.0549, + "step": 10059 + }, + { + "epoch": 0.64, + "grad_norm": 0.44753180417879, + "learning_rate": 3.0079121823885356e-07, + "loss": 0.0903, + "step": 10060 + }, + { + "epoch": 0.64, + "grad_norm": 0.38166879068954435, + "learning_rate": 3.006964992210567e-07, + "loss": 0.0024, + "step": 10061 + }, + { + "epoch": 0.64, + "grad_norm": 0.17810504323650683, + "learning_rate": 3.006017887059476e-07, + "loss": 0.1087, + "step": 10062 + }, + { + "epoch": 0.64, + "grad_norm": 1.1141266485442696, + "learning_rate": 3.005070866975673e-07, + "loss": 0.3222, + "step": 10063 + }, + { + "epoch": 0.64, + "grad_norm": 0.9520269321580211, + "learning_rate": 3.0041239319995544e-07, + "loss": 0.2189, + "step": 10064 + }, + { + "epoch": 0.64, + "grad_norm": 0.976190605386346, + "learning_rate": 3.003177082171523e-07, + "loss": 0.3072, + "step": 10065 + }, + { + "epoch": 0.64, + "grad_norm": 1.053057840162801, + "learning_rate": 3.0022303175319695e-07, + "loss": 0.2027, + "step": 10066 + }, + { + "epoch": 0.64, + "grad_norm": 0.5750296983297821, + "learning_rate": 3.001283638121288e-07, + "loss": 0.0482, + "step": 10067 + }, + { + "epoch": 0.64, + "grad_norm": 2.2917914081263246, + "learning_rate": 3.000337043979864e-07, + "loss": 0.2143, + "step": 10068 + }, + { + "epoch": 0.64, + "grad_norm": 2.191854693995644, + "learning_rate": 2.9993905351480823e-07, + "loss": 0.1763, + "step": 10069 + }, + { + "epoch": 0.64, + "grad_norm": 3.7433518076973855, + "learning_rate": 2.9984441116663206e-07, + "loss": 0.2507, + "step": 10070 + }, + { + "epoch": 0.64, + "grad_norm": 1.4834550371552282, + "learning_rate": 2.997497773574959e-07, + "loss": 0.3234, + "step": 10071 + }, + { + "epoch": 0.64, + "grad_norm": 11.049990209930852, + "learning_rate": 2.9965515209143674e-07, + "loss": 0.2025, + "step": 10072 + }, + { + "epoch": 0.64, + "grad_norm": 1.6302614591140923, + "learning_rate": 2.9956053537249137e-07, + "loss": 0.3305, + "step": 10073 + }, + { + "epoch": 0.64, + "grad_norm": 0.8719787278468222, + "learning_rate": 2.994659272046966e-07, + "loss": 0.1301, + "step": 10074 + }, + { + "epoch": 0.64, + "grad_norm": 1.6982705460270011, + "learning_rate": 2.993713275920885e-07, + "loss": 0.1801, + "step": 10075 + }, + { + "epoch": 0.64, + "grad_norm": 2.6275087308085543, + "learning_rate": 2.992767365387029e-07, + "loss": 0.0436, + "step": 10076 + }, + { + "epoch": 0.64, + "grad_norm": 0.2712916019561169, + "learning_rate": 2.9918215404857505e-07, + "loss": 0.0128, + "step": 10077 + }, + { + "epoch": 0.64, + "grad_norm": 1.1491099588932046, + "learning_rate": 2.990875801257404e-07, + "loss": 0.1069, + "step": 10078 + }, + { + "epoch": 0.64, + "grad_norm": 0.7451719885124801, + "learning_rate": 2.9899301477423336e-07, + "loss": 0.2882, + "step": 10079 + }, + { + "epoch": 0.64, + "grad_norm": 1.4185935208862142, + "learning_rate": 2.9889845799808854e-07, + "loss": 0.1663, + "step": 10080 + }, + { + "epoch": 0.64, + "grad_norm": 0.9512447606296824, + "learning_rate": 2.9880390980133954e-07, + "loss": 0.22, + "step": 10081 + }, + { + "epoch": 0.64, + "grad_norm": 0.7203886033678134, + "learning_rate": 2.9870937018802045e-07, + "loss": 0.1875, + "step": 10082 + }, + { + "epoch": 0.64, + "grad_norm": 1.80985039946878, + "learning_rate": 2.98614839162164e-07, + "loss": 0.3262, + "step": 10083 + }, + { + "epoch": 0.64, + "grad_norm": 0.3829490125097748, + "learning_rate": 2.985203167278035e-07, + "loss": 0.1044, + "step": 10084 + }, + { + "epoch": 0.64, + "grad_norm": 2.6240450139913647, + "learning_rate": 2.984258028889715e-07, + "loss": 0.0988, + "step": 10085 + }, + { + "epoch": 0.64, + "grad_norm": 7.005710200407289, + "learning_rate": 2.983312976496996e-07, + "loss": 0.3275, + "step": 10086 + }, + { + "epoch": 0.64, + "grad_norm": 0.9079486894107951, + "learning_rate": 2.982368010140203e-07, + "loss": 0.0682, + "step": 10087 + }, + { + "epoch": 0.64, + "grad_norm": 0.9466190574800312, + "learning_rate": 2.981423129859643e-07, + "loss": 0.385, + "step": 10088 + }, + { + "epoch": 0.64, + "grad_norm": 0.7014769328021367, + "learning_rate": 2.980478335695633e-07, + "loss": 0.1394, + "step": 10089 + }, + { + "epoch": 0.64, + "grad_norm": 1.0209469875880932, + "learning_rate": 2.9795336276884753e-07, + "loss": 0.2723, + "step": 10090 + }, + { + "epoch": 0.64, + "grad_norm": 0.6572692611290462, + "learning_rate": 2.9785890058784756e-07, + "loss": 0.1291, + "step": 10091 + }, + { + "epoch": 0.64, + "grad_norm": 0.7677575212627107, + "learning_rate": 2.977644470305931e-07, + "loss": 0.1856, + "step": 10092 + }, + { + "epoch": 0.64, + "grad_norm": 4.451153090437871, + "learning_rate": 2.9767000210111403e-07, + "loss": 0.2387, + "step": 10093 + }, + { + "epoch": 0.64, + "grad_norm": 0.898565856327848, + "learning_rate": 2.9757556580343923e-07, + "loss": 0.3278, + "step": 10094 + }, + { + "epoch": 0.64, + "grad_norm": 0.8480122659879772, + "learning_rate": 2.9748113814159795e-07, + "loss": 0.2639, + "step": 10095 + }, + { + "epoch": 0.64, + "grad_norm": 0.3145698739956969, + "learning_rate": 2.9738671911961826e-07, + "loss": 0.0033, + "step": 10096 + }, + { + "epoch": 0.64, + "grad_norm": 1.0598471603856254, + "learning_rate": 2.9729230874152863e-07, + "loss": 0.0895, + "step": 10097 + }, + { + "epoch": 0.64, + "grad_norm": 3.3706323597084116, + "learning_rate": 2.9719790701135656e-07, + "loss": 0.0065, + "step": 10098 + }, + { + "epoch": 0.64, + "grad_norm": 14.908254250704342, + "learning_rate": 2.9710351393312926e-07, + "loss": 0.2405, + "step": 10099 + }, + { + "epoch": 0.64, + "grad_norm": 0.7518823938104033, + "learning_rate": 2.970091295108741e-07, + "loss": 0.2358, + "step": 10100 + }, + { + "epoch": 0.64, + "grad_norm": 0.7617448569859814, + "learning_rate": 2.9691475374861747e-07, + "loss": 0.4019, + "step": 10101 + }, + { + "epoch": 0.64, + "grad_norm": 0.40944923641609, + "learning_rate": 2.968203866503857e-07, + "loss": 0.1179, + "step": 10102 + }, + { + "epoch": 0.64, + "grad_norm": 0.5376033138473534, + "learning_rate": 2.967260282202046e-07, + "loss": 0.1083, + "step": 10103 + }, + { + "epoch": 0.64, + "grad_norm": 0.6607908824701466, + "learning_rate": 2.9663167846209996e-07, + "loss": 0.1846, + "step": 10104 + }, + { + "epoch": 0.64, + "grad_norm": 0.5919673760987662, + "learning_rate": 2.9653733738009657e-07, + "loss": 0.1148, + "step": 10105 + }, + { + "epoch": 0.64, + "grad_norm": 1.1449331951659232, + "learning_rate": 2.964430049782195e-07, + "loss": 0.2059, + "step": 10106 + }, + { + "epoch": 0.64, + "grad_norm": 9.113220674165706, + "learning_rate": 2.9634868126049287e-07, + "loss": 0.2069, + "step": 10107 + }, + { + "epoch": 0.64, + "grad_norm": 8.432933815515144, + "learning_rate": 2.9625436623094113e-07, + "loss": 0.0945, + "step": 10108 + }, + { + "epoch": 0.64, + "grad_norm": 1.412151247822407, + "learning_rate": 2.961600598935875e-07, + "loss": 0.1954, + "step": 10109 + }, + { + "epoch": 0.64, + "grad_norm": 1.692059780788752, + "learning_rate": 2.960657622524556e-07, + "loss": 0.2773, + "step": 10110 + }, + { + "epoch": 0.64, + "grad_norm": 1.4465622939612175, + "learning_rate": 2.959714733115681e-07, + "loss": 0.1709, + "step": 10111 + }, + { + "epoch": 0.64, + "grad_norm": 1.035136004114922, + "learning_rate": 2.9587719307494787e-07, + "loss": 0.2606, + "step": 10112 + }, + { + "epoch": 0.64, + "grad_norm": 0.8765561261070781, + "learning_rate": 2.9578292154661696e-07, + "loss": 0.2269, + "step": 10113 + }, + { + "epoch": 0.64, + "grad_norm": 0.4644450559399976, + "learning_rate": 2.956886587305969e-07, + "loss": 0.1751, + "step": 10114 + }, + { + "epoch": 0.65, + "grad_norm": 0.8967843536064332, + "learning_rate": 2.9559440463090963e-07, + "loss": 0.149, + "step": 10115 + }, + { + "epoch": 0.65, + "grad_norm": 2.7431122814227695, + "learning_rate": 2.955001592515758e-07, + "loss": 0.1275, + "step": 10116 + }, + { + "epoch": 0.65, + "grad_norm": 2.1763049705796447, + "learning_rate": 2.9540592259661634e-07, + "loss": 0.1584, + "step": 10117 + }, + { + "epoch": 0.65, + "grad_norm": 0.8440220796152866, + "learning_rate": 2.9531169467005147e-07, + "loss": 0.109, + "step": 10118 + }, + { + "epoch": 0.65, + "grad_norm": 4.238583783068823, + "learning_rate": 2.9521747547590114e-07, + "loss": 0.3639, + "step": 10119 + }, + { + "epoch": 0.65, + "grad_norm": 0.4512614817524705, + "learning_rate": 2.9512326501818505e-07, + "loss": 0.116, + "step": 10120 + }, + { + "epoch": 0.65, + "grad_norm": 0.8378187597679194, + "learning_rate": 2.9502906330092233e-07, + "loss": 0.0733, + "step": 10121 + }, + { + "epoch": 0.65, + "grad_norm": 3.2417661979508154, + "learning_rate": 2.949348703281317e-07, + "loss": 0.2177, + "step": 10122 + }, + { + "epoch": 0.65, + "grad_norm": 0.989954464847215, + "learning_rate": 2.948406861038319e-07, + "loss": 0.3178, + "step": 10123 + }, + { + "epoch": 0.65, + "grad_norm": 0.924334248056165, + "learning_rate": 2.947465106320407e-07, + "loss": 0.197, + "step": 10124 + }, + { + "epoch": 0.65, + "grad_norm": 1.9169590362837634, + "learning_rate": 2.9465234391677614e-07, + "loss": 0.2571, + "step": 10125 + }, + { + "epoch": 0.65, + "grad_norm": 0.612481670372759, + "learning_rate": 2.945581859620554e-07, + "loss": 0.1421, + "step": 10126 + }, + { + "epoch": 0.65, + "grad_norm": 0.7364090504313137, + "learning_rate": 2.9446403677189523e-07, + "loss": 0.1013, + "step": 10127 + }, + { + "epoch": 0.65, + "grad_norm": 0.755564132742967, + "learning_rate": 2.943698963503125e-07, + "loss": 0.1658, + "step": 10128 + }, + { + "epoch": 0.65, + "grad_norm": 1.1002501104171318, + "learning_rate": 2.942757647013233e-07, + "loss": 0.2312, + "step": 10129 + }, + { + "epoch": 0.65, + "grad_norm": 3.8455882466354714, + "learning_rate": 2.941816418289438e-07, + "loss": 0.0073, + "step": 10130 + }, + { + "epoch": 0.65, + "grad_norm": 0.6175502859939063, + "learning_rate": 2.940875277371889e-07, + "loss": 0.0127, + "step": 10131 + }, + { + "epoch": 0.65, + "grad_norm": 1.3002325125406036, + "learning_rate": 2.9399342243007423e-07, + "loss": 0.2075, + "step": 10132 + }, + { + "epoch": 0.65, + "grad_norm": 0.6189822446095793, + "learning_rate": 2.938993259116141e-07, + "loss": 0.1336, + "step": 10133 + }, + { + "epoch": 0.65, + "grad_norm": 0.7771168105077464, + "learning_rate": 2.9380523818582325e-07, + "loss": 0.204, + "step": 10134 + }, + { + "epoch": 0.65, + "grad_norm": 4.246312885201733, + "learning_rate": 2.9371115925671517e-07, + "loss": 0.2103, + "step": 10135 + }, + { + "epoch": 0.65, + "grad_norm": 0.9666412012084128, + "learning_rate": 2.9361708912830403e-07, + "loss": 0.0553, + "step": 10136 + }, + { + "epoch": 0.65, + "grad_norm": 1.0650677233362091, + "learning_rate": 2.935230278046025e-07, + "loss": 0.4619, + "step": 10137 + }, + { + "epoch": 0.65, + "grad_norm": 0.4880920623331828, + "learning_rate": 2.934289752896238e-07, + "loss": 0.1457, + "step": 10138 + }, + { + "epoch": 0.65, + "grad_norm": 0.7050596125092834, + "learning_rate": 2.9333493158738033e-07, + "loss": 0.2861, + "step": 10139 + }, + { + "epoch": 0.65, + "grad_norm": 0.598019207815051, + "learning_rate": 2.9324089670188397e-07, + "loss": 0.2154, + "step": 10140 + }, + { + "epoch": 0.65, + "grad_norm": 1.3899083727109987, + "learning_rate": 2.931468706371468e-07, + "loss": 0.1716, + "step": 10141 + }, + { + "epoch": 0.65, + "grad_norm": 0.7443308974448324, + "learning_rate": 2.9305285339717964e-07, + "loss": 0.2573, + "step": 10142 + }, + { + "epoch": 0.65, + "grad_norm": 0.6111182814040098, + "learning_rate": 2.929588449859941e-07, + "loss": 0.0775, + "step": 10143 + }, + { + "epoch": 0.65, + "grad_norm": 0.6031787105155585, + "learning_rate": 2.9286484540760024e-07, + "loss": 0.174, + "step": 10144 + }, + { + "epoch": 0.65, + "grad_norm": 5.059288171325278, + "learning_rate": 2.927708546660085e-07, + "loss": 0.156, + "step": 10145 + }, + { + "epoch": 0.65, + "grad_norm": 0.6155850848380845, + "learning_rate": 2.926768727652287e-07, + "loss": 0.2203, + "step": 10146 + }, + { + "epoch": 0.65, + "grad_norm": 17.661522486178317, + "learning_rate": 2.925828997092703e-07, + "loss": 0.2288, + "step": 10147 + }, + { + "epoch": 0.65, + "grad_norm": 1.5909122067725574, + "learning_rate": 2.9248893550214225e-07, + "loss": 0.3839, + "step": 10148 + }, + { + "epoch": 0.65, + "grad_norm": 3.663937159106112, + "learning_rate": 2.9239498014785357e-07, + "loss": 0.1143, + "step": 10149 + }, + { + "epoch": 0.65, + "grad_norm": 0.6793887715076052, + "learning_rate": 2.923010336504121e-07, + "loss": 0.3644, + "step": 10150 + }, + { + "epoch": 0.65, + "grad_norm": 0.7528713435368135, + "learning_rate": 2.9220709601382643e-07, + "loss": 0.1757, + "step": 10151 + }, + { + "epoch": 0.65, + "grad_norm": 4.125473558541575, + "learning_rate": 2.9211316724210344e-07, + "loss": 0.0956, + "step": 10152 + }, + { + "epoch": 0.65, + "grad_norm": 0.3127818765202288, + "learning_rate": 2.920192473392509e-07, + "loss": 0.1067, + "step": 10153 + }, + { + "epoch": 0.65, + "grad_norm": 0.837624615422415, + "learning_rate": 2.919253363092753e-07, + "loss": 0.231, + "step": 10154 + }, + { + "epoch": 0.65, + "grad_norm": 1.2875846712266474, + "learning_rate": 2.918314341561829e-07, + "loss": 0.1165, + "step": 10155 + }, + { + "epoch": 0.65, + "grad_norm": 2.4005460474096294, + "learning_rate": 2.9173754088398027e-07, + "loss": 0.1164, + "step": 10156 + }, + { + "epoch": 0.65, + "grad_norm": 1.84564787952284, + "learning_rate": 2.9164365649667255e-07, + "loss": 0.3742, + "step": 10157 + }, + { + "epoch": 0.65, + "grad_norm": 0.7321126275754902, + "learning_rate": 2.915497809982653e-07, + "loss": 0.0675, + "step": 10158 + }, + { + "epoch": 0.65, + "grad_norm": 0.6524438418410831, + "learning_rate": 2.914559143927637e-07, + "loss": 0.1916, + "step": 10159 + }, + { + "epoch": 0.65, + "grad_norm": 0.7163088981075492, + "learning_rate": 2.913620566841718e-07, + "loss": 0.3322, + "step": 10160 + }, + { + "epoch": 0.65, + "grad_norm": 0.5007082039915902, + "learning_rate": 2.9126820787649397e-07, + "loss": 0.199, + "step": 10161 + }, + { + "epoch": 0.65, + "grad_norm": 0.6585978086439676, + "learning_rate": 2.911743679737342e-07, + "loss": 0.0193, + "step": 10162 + }, + { + "epoch": 0.65, + "grad_norm": 1.1463803896353832, + "learning_rate": 2.9108053697989543e-07, + "loss": 0.2195, + "step": 10163 + }, + { + "epoch": 0.65, + "grad_norm": 1.0451761528139054, + "learning_rate": 2.9098671489898114e-07, + "loss": 0.0989, + "step": 10164 + }, + { + "epoch": 0.65, + "grad_norm": 0.89734534298001, + "learning_rate": 2.908929017349936e-07, + "loss": 0.4223, + "step": 10165 + }, + { + "epoch": 0.65, + "grad_norm": 1.8270052705829065, + "learning_rate": 2.9079909749193544e-07, + "loss": 0.2895, + "step": 10166 + }, + { + "epoch": 0.65, + "grad_norm": 0.364870594769816, + "learning_rate": 2.907053021738083e-07, + "loss": 0.1091, + "step": 10167 + }, + { + "epoch": 0.65, + "grad_norm": 0.423504499918296, + "learning_rate": 2.906115157846135e-07, + "loss": 0.0939, + "step": 10168 + }, + { + "epoch": 0.65, + "grad_norm": 0.9835994607264935, + "learning_rate": 2.9051773832835257e-07, + "loss": 0.1069, + "step": 10169 + }, + { + "epoch": 0.65, + "grad_norm": 0.6078985931798069, + "learning_rate": 2.904239698090258e-07, + "loss": 0.2006, + "step": 10170 + }, + { + "epoch": 0.65, + "grad_norm": 1.8585419408299761, + "learning_rate": 2.9033021023063405e-07, + "loss": 0.0775, + "step": 10171 + }, + { + "epoch": 0.65, + "grad_norm": 0.9431968041486721, + "learning_rate": 2.9023645959717676e-07, + "loss": 0.1079, + "step": 10172 + }, + { + "epoch": 0.65, + "grad_norm": 0.2828754239310904, + "learning_rate": 2.90142717912654e-07, + "loss": 0.0871, + "step": 10173 + }, + { + "epoch": 0.65, + "grad_norm": 0.38468142708540887, + "learning_rate": 2.9004898518106457e-07, + "loss": 0.1511, + "step": 10174 + }, + { + "epoch": 0.65, + "grad_norm": 0.8713158391269873, + "learning_rate": 2.899552614064077e-07, + "loss": 0.115, + "step": 10175 + }, + { + "epoch": 0.65, + "grad_norm": 6.176435073014773, + "learning_rate": 2.8986154659268137e-07, + "loss": 0.1554, + "step": 10176 + }, + { + "epoch": 0.65, + "grad_norm": 2.0177193142856265, + "learning_rate": 2.8976784074388395e-07, + "loss": 0.4084, + "step": 10177 + }, + { + "epoch": 0.65, + "grad_norm": 7.369229236259131, + "learning_rate": 2.896741438640132e-07, + "loss": 0.2152, + "step": 10178 + }, + { + "epoch": 0.65, + "grad_norm": 1.4581195734418044, + "learning_rate": 2.8958045595706617e-07, + "loss": 0.1189, + "step": 10179 + }, + { + "epoch": 0.65, + "grad_norm": 3.9939602725598458, + "learning_rate": 2.894867770270398e-07, + "loss": 0.111, + "step": 10180 + }, + { + "epoch": 0.65, + "grad_norm": 0.5609244693542784, + "learning_rate": 2.8939310707793097e-07, + "loss": 0.1326, + "step": 10181 + }, + { + "epoch": 0.65, + "grad_norm": 1.1259466010058046, + "learning_rate": 2.8929944611373554e-07, + "loss": 0.2838, + "step": 10182 + }, + { + "epoch": 0.65, + "grad_norm": 0.833719703385876, + "learning_rate": 2.8920579413844904e-07, + "loss": 0.0472, + "step": 10183 + }, + { + "epoch": 0.65, + "grad_norm": 0.6790088170753138, + "learning_rate": 2.891121511560674e-07, + "loss": 0.1817, + "step": 10184 + }, + { + "epoch": 0.65, + "grad_norm": 1.0421877559803852, + "learning_rate": 2.8901851717058513e-07, + "loss": 0.1087, + "step": 10185 + }, + { + "epoch": 0.65, + "grad_norm": 0.5242384686078717, + "learning_rate": 2.889248921859972e-07, + "loss": 0.3666, + "step": 10186 + }, + { + "epoch": 0.65, + "grad_norm": 0.4096621419743181, + "learning_rate": 2.888312762062974e-07, + "loss": 0.0943, + "step": 10187 + }, + { + "epoch": 0.65, + "grad_norm": 1.1403917586988235, + "learning_rate": 2.887376692354803e-07, + "loss": 0.4638, + "step": 10188 + }, + { + "epoch": 0.65, + "grad_norm": 1.1251275356077421, + "learning_rate": 2.886440712775385e-07, + "loss": 0.2976, + "step": 10189 + }, + { + "epoch": 0.65, + "grad_norm": 0.40686203098349166, + "learning_rate": 2.8855048233646576e-07, + "loss": 0.3208, + "step": 10190 + }, + { + "epoch": 0.65, + "grad_norm": 0.81965736341543, + "learning_rate": 2.884569024162543e-07, + "loss": 0.3933, + "step": 10191 + }, + { + "epoch": 0.65, + "grad_norm": 0.6045867870038998, + "learning_rate": 2.8836333152089687e-07, + "loss": 0.1915, + "step": 10192 + }, + { + "epoch": 0.65, + "grad_norm": 0.6880109823058511, + "learning_rate": 2.88269769654385e-07, + "loss": 0.3868, + "step": 10193 + }, + { + "epoch": 0.65, + "grad_norm": 1.0855868710341854, + "learning_rate": 2.881762168207105e-07, + "loss": 0.1405, + "step": 10194 + }, + { + "epoch": 0.65, + "grad_norm": 1.7032886920100117, + "learning_rate": 2.8808267302386423e-07, + "loss": 0.2338, + "step": 10195 + }, + { + "epoch": 0.65, + "grad_norm": 0.7592264969634548, + "learning_rate": 2.8798913826783724e-07, + "loss": 0.2576, + "step": 10196 + }, + { + "epoch": 0.65, + "grad_norm": 1.3478853546872511, + "learning_rate": 2.8789561255661986e-07, + "loss": 0.1196, + "step": 10197 + }, + { + "epoch": 0.65, + "grad_norm": 1.6906784802526151, + "learning_rate": 2.878020958942019e-07, + "loss": 0.2686, + "step": 10198 + }, + { + "epoch": 0.65, + "grad_norm": 0.7904765996454155, + "learning_rate": 2.8770858828457336e-07, + "loss": 0.1888, + "step": 10199 + }, + { + "epoch": 0.65, + "grad_norm": 0.5154334002839001, + "learning_rate": 2.8761508973172286e-07, + "loss": 0.1782, + "step": 10200 + }, + { + "epoch": 0.65, + "grad_norm": 0.3354066006994746, + "learning_rate": 2.875216002396399e-07, + "loss": 0.1028, + "step": 10201 + }, + { + "epoch": 0.65, + "grad_norm": 3.444505493427869, + "learning_rate": 2.8742811981231236e-07, + "loss": 0.0521, + "step": 10202 + }, + { + "epoch": 0.65, + "grad_norm": 1.47817889508201, + "learning_rate": 2.873346484537288e-07, + "loss": 0.298, + "step": 10203 + }, + { + "epoch": 0.65, + "grad_norm": 0.6336947599145452, + "learning_rate": 2.8724118616787636e-07, + "loss": 0.5221, + "step": 10204 + }, + { + "epoch": 0.65, + "grad_norm": 0.928730037937413, + "learning_rate": 2.8714773295874283e-07, + "loss": 0.3397, + "step": 10205 + }, + { + "epoch": 0.65, + "grad_norm": 6.858571369987186, + "learning_rate": 2.870542888303148e-07, + "loss": 0.0332, + "step": 10206 + }, + { + "epoch": 0.65, + "grad_norm": 3.078718939124812, + "learning_rate": 2.86960853786579e-07, + "loss": 0.286, + "step": 10207 + }, + { + "epoch": 0.65, + "grad_norm": 1.1468493477250459, + "learning_rate": 2.868674278315214e-07, + "loss": 0.139, + "step": 10208 + }, + { + "epoch": 0.65, + "grad_norm": 1.4781322692514693, + "learning_rate": 2.8677401096912767e-07, + "loss": 0.268, + "step": 10209 + }, + { + "epoch": 0.65, + "grad_norm": 0.9173659262253786, + "learning_rate": 2.8668060320338345e-07, + "loss": 0.2243, + "step": 10210 + }, + { + "epoch": 0.65, + "grad_norm": 1.2222157176630717, + "learning_rate": 2.865872045382733e-07, + "loss": 0.3454, + "step": 10211 + }, + { + "epoch": 0.65, + "grad_norm": 0.3029370794229205, + "learning_rate": 2.86493814977782e-07, + "loss": 0.0399, + "step": 10212 + }, + { + "epoch": 0.65, + "grad_norm": 1.468774882671728, + "learning_rate": 2.864004345258938e-07, + "loss": 0.294, + "step": 10213 + }, + { + "epoch": 0.65, + "grad_norm": 1.6157720857306135, + "learning_rate": 2.863070631865926e-07, + "loss": 0.3866, + "step": 10214 + }, + { + "epoch": 0.65, + "grad_norm": 0.782503562281586, + "learning_rate": 2.8621370096386155e-07, + "loss": 0.3746, + "step": 10215 + }, + { + "epoch": 0.65, + "grad_norm": 1.248339048644885, + "learning_rate": 2.861203478616839e-07, + "loss": 0.2226, + "step": 10216 + }, + { + "epoch": 0.65, + "grad_norm": 0.3511410845734303, + "learning_rate": 2.86027003884042e-07, + "loss": 0.2554, + "step": 10217 + }, + { + "epoch": 0.65, + "grad_norm": 1.1966562512631873, + "learning_rate": 2.8593366903491845e-07, + "loss": 0.2706, + "step": 10218 + }, + { + "epoch": 0.65, + "grad_norm": 0.47867191486930216, + "learning_rate": 2.8584034331829465e-07, + "loss": 0.066, + "step": 10219 + }, + { + "epoch": 0.65, + "grad_norm": 0.5339949316047048, + "learning_rate": 2.8574702673815257e-07, + "loss": 0.12, + "step": 10220 + }, + { + "epoch": 0.65, + "grad_norm": 0.5611385857681584, + "learning_rate": 2.856537192984728e-07, + "loss": 0.1945, + "step": 10221 + }, + { + "epoch": 0.65, + "grad_norm": 0.7098057171282469, + "learning_rate": 2.8556042100323653e-07, + "loss": 0.2918, + "step": 10222 + }, + { + "epoch": 0.65, + "grad_norm": 3.142105614411486, + "learning_rate": 2.854671318564237e-07, + "loss": 0.1315, + "step": 10223 + }, + { + "epoch": 0.65, + "grad_norm": 0.35340259488248843, + "learning_rate": 2.853738518620141e-07, + "loss": 0.0505, + "step": 10224 + }, + { + "epoch": 0.65, + "grad_norm": 1.6201995876999788, + "learning_rate": 2.8528058102398767e-07, + "loss": 0.2407, + "step": 10225 + }, + { + "epoch": 0.65, + "grad_norm": 1.2426319278333497, + "learning_rate": 2.851873193463231e-07, + "loss": 0.3811, + "step": 10226 + }, + { + "epoch": 0.65, + "grad_norm": 2.9231925775715792, + "learning_rate": 2.850940668329995e-07, + "loss": 0.1911, + "step": 10227 + }, + { + "epoch": 0.65, + "grad_norm": 7.683992341126847, + "learning_rate": 2.8500082348799484e-07, + "loss": 0.1405, + "step": 10228 + }, + { + "epoch": 0.65, + "grad_norm": 3.030114905525364, + "learning_rate": 2.8490758931528733e-07, + "loss": 0.3582, + "step": 10229 + }, + { + "epoch": 0.65, + "grad_norm": 0.5730900085900599, + "learning_rate": 2.848143643188544e-07, + "loss": 0.2923, + "step": 10230 + }, + { + "epoch": 0.65, + "grad_norm": 1.3130173591499996, + "learning_rate": 2.847211485026732e-07, + "loss": 0.1366, + "step": 10231 + }, + { + "epoch": 0.65, + "grad_norm": 1.6563485023553393, + "learning_rate": 2.8462794187072056e-07, + "loss": 0.1694, + "step": 10232 + }, + { + "epoch": 0.65, + "grad_norm": 5.513843404179402, + "learning_rate": 2.8453474442697313e-07, + "loss": 0.0761, + "step": 10233 + }, + { + "epoch": 0.65, + "grad_norm": 1.4821740985558005, + "learning_rate": 2.8444155617540645e-07, + "loss": 0.3699, + "step": 10234 + }, + { + "epoch": 0.65, + "grad_norm": 3.937843973931371, + "learning_rate": 2.843483771199964e-07, + "loss": 0.2367, + "step": 10235 + }, + { + "epoch": 0.65, + "grad_norm": 0.6188196116057207, + "learning_rate": 2.842552072647182e-07, + "loss": 0.3673, + "step": 10236 + }, + { + "epoch": 0.65, + "grad_norm": 3.668748716083185, + "learning_rate": 2.8416204661354634e-07, + "loss": 0.1505, + "step": 10237 + }, + { + "epoch": 0.65, + "grad_norm": 1.6595810651174245, + "learning_rate": 2.8406889517045563e-07, + "loss": 0.3319, + "step": 10238 + }, + { + "epoch": 0.65, + "grad_norm": 0.41065654226966897, + "learning_rate": 2.839757529394197e-07, + "loss": 0.1962, + "step": 10239 + }, + { + "epoch": 0.65, + "grad_norm": 1.3955031550423198, + "learning_rate": 2.8388261992441263e-07, + "loss": 0.1096, + "step": 10240 + }, + { + "epoch": 0.65, + "grad_norm": 0.6508463178007073, + "learning_rate": 2.8378949612940726e-07, + "loss": 0.3858, + "step": 10241 + }, + { + "epoch": 0.65, + "grad_norm": 0.48423807947557324, + "learning_rate": 2.836963815583769e-07, + "loss": 0.1474, + "step": 10242 + }, + { + "epoch": 0.65, + "grad_norm": 2.8928769138359334, + "learning_rate": 2.8360327621529343e-07, + "loss": 0.0216, + "step": 10243 + }, + { + "epoch": 0.65, + "grad_norm": 0.42743112036302267, + "learning_rate": 2.835101801041294e-07, + "loss": 0.142, + "step": 10244 + }, + { + "epoch": 0.65, + "grad_norm": 1.0621584090643938, + "learning_rate": 2.834170932288562e-07, + "loss": 0.1203, + "step": 10245 + }, + { + "epoch": 0.65, + "grad_norm": 0.4533056235972029, + "learning_rate": 2.8332401559344534e-07, + "loss": 0.0911, + "step": 10246 + }, + { + "epoch": 0.65, + "grad_norm": 1.3373391990105874, + "learning_rate": 2.8323094720186724e-07, + "loss": 0.2234, + "step": 10247 + }, + { + "epoch": 0.65, + "grad_norm": 0.23324687665964133, + "learning_rate": 2.831378880580928e-07, + "loss": 0.0763, + "step": 10248 + }, + { + "epoch": 0.65, + "grad_norm": 0.6111975212027193, + "learning_rate": 2.8304483816609215e-07, + "loss": 0.1486, + "step": 10249 + }, + { + "epoch": 0.65, + "grad_norm": 1.4004848741687492, + "learning_rate": 2.8295179752983466e-07, + "loss": 0.1131, + "step": 10250 + }, + { + "epoch": 0.65, + "grad_norm": 0.6956776664200824, + "learning_rate": 2.828587661532901e-07, + "loss": 0.1478, + "step": 10251 + }, + { + "epoch": 0.65, + "grad_norm": 0.8217026929838352, + "learning_rate": 2.8276574404042677e-07, + "loss": 0.1402, + "step": 10252 + }, + { + "epoch": 0.65, + "grad_norm": 0.6513064563950473, + "learning_rate": 2.8267273119521365e-07, + "loss": 0.2293, + "step": 10253 + }, + { + "epoch": 0.65, + "grad_norm": 0.6472530386355813, + "learning_rate": 2.8257972762161863e-07, + "loss": 0.3021, + "step": 10254 + }, + { + "epoch": 0.65, + "grad_norm": 0.7732910536516439, + "learning_rate": 2.824867333236096e-07, + "loss": 0.1998, + "step": 10255 + }, + { + "epoch": 0.65, + "grad_norm": 0.6853220452332898, + "learning_rate": 2.823937483051536e-07, + "loss": 0.1586, + "step": 10256 + }, + { + "epoch": 0.65, + "grad_norm": 2.6189092250336454, + "learning_rate": 2.82300772570218e-07, + "loss": 0.3721, + "step": 10257 + }, + { + "epoch": 0.65, + "grad_norm": 1.3897431660961344, + "learning_rate": 2.8220780612276887e-07, + "loss": 0.2036, + "step": 10258 + }, + { + "epoch": 0.65, + "grad_norm": 4.048638907755914, + "learning_rate": 2.8211484896677274e-07, + "loss": 0.1098, + "step": 10259 + }, + { + "epoch": 0.65, + "grad_norm": 1.022173529418032, + "learning_rate": 2.820219011061949e-07, + "loss": 0.2273, + "step": 10260 + }, + { + "epoch": 0.65, + "grad_norm": 0.48019927111320554, + "learning_rate": 2.819289625450012e-07, + "loss": 0.3013, + "step": 10261 + }, + { + "epoch": 0.65, + "grad_norm": 0.3915675359680851, + "learning_rate": 2.8183603328715613e-07, + "loss": 0.248, + "step": 10262 + }, + { + "epoch": 0.65, + "grad_norm": 1.3131145352365399, + "learning_rate": 2.817431133366246e-07, + "loss": 0.1545, + "step": 10263 + }, + { + "epoch": 0.65, + "grad_norm": 0.41835883686822495, + "learning_rate": 2.8165020269737046e-07, + "loss": 0.0843, + "step": 10264 + }, + { + "epoch": 0.65, + "grad_norm": 5.489341998654538, + "learning_rate": 2.8155730137335777e-07, + "loss": 0.0821, + "step": 10265 + }, + { + "epoch": 0.65, + "grad_norm": 0.31110884249060594, + "learning_rate": 2.8146440936854953e-07, + "loss": 0.1006, + "step": 10266 + }, + { + "epoch": 0.65, + "grad_norm": 0.2558649006924264, + "learning_rate": 2.813715266869089e-07, + "loss": 0.0826, + "step": 10267 + }, + { + "epoch": 0.65, + "grad_norm": 1.138848363680206, + "learning_rate": 2.812786533323987e-07, + "loss": 0.1607, + "step": 10268 + }, + { + "epoch": 0.65, + "grad_norm": 0.3580398835841955, + "learning_rate": 2.811857893089806e-07, + "loss": 0.0585, + "step": 10269 + }, + { + "epoch": 0.65, + "grad_norm": 3.5828412543228145, + "learning_rate": 2.810929346206168e-07, + "loss": 0.1878, + "step": 10270 + }, + { + "epoch": 0.65, + "grad_norm": 5.606115308601487, + "learning_rate": 2.810000892712682e-07, + "loss": 0.1429, + "step": 10271 + }, + { + "epoch": 0.66, + "grad_norm": 0.22420079108152827, + "learning_rate": 2.809072532648963e-07, + "loss": 0.1768, + "step": 10272 + }, + { + "epoch": 0.66, + "grad_norm": 0.04813094441474049, + "learning_rate": 2.808144266054612e-07, + "loss": 0.0009, + "step": 10273 + }, + { + "epoch": 0.66, + "grad_norm": 0.47288197730564774, + "learning_rate": 2.8072160929692354e-07, + "loss": 0.1842, + "step": 10274 + }, + { + "epoch": 0.66, + "grad_norm": 0.738235164841927, + "learning_rate": 2.806288013432425e-07, + "loss": 0.1385, + "step": 10275 + }, + { + "epoch": 0.66, + "grad_norm": 0.7262208566464031, + "learning_rate": 2.8053600274837807e-07, + "loss": 0.1382, + "step": 10276 + }, + { + "epoch": 0.66, + "grad_norm": 21.905346040021723, + "learning_rate": 2.8044321351628897e-07, + "loss": 0.2486, + "step": 10277 + }, + { + "epoch": 0.66, + "grad_norm": 0.7732430553059354, + "learning_rate": 2.8035043365093347e-07, + "loss": 0.2504, + "step": 10278 + }, + { + "epoch": 0.66, + "grad_norm": 2.705032161380198, + "learning_rate": 2.802576631562703e-07, + "loss": 0.0047, + "step": 10279 + }, + { + "epoch": 0.66, + "grad_norm": 1.6248934088142746, + "learning_rate": 2.801649020362567e-07, + "loss": 0.0321, + "step": 10280 + }, + { + "epoch": 0.66, + "grad_norm": 1.5827335660059887, + "learning_rate": 2.8007215029485054e-07, + "loss": 0.0608, + "step": 10281 + }, + { + "epoch": 0.66, + "grad_norm": 0.8611950012380255, + "learning_rate": 2.799794079360083e-07, + "loss": 0.18, + "step": 10282 + }, + { + "epoch": 0.66, + "grad_norm": 1.7082351348512694, + "learning_rate": 2.798866749636869e-07, + "loss": 0.076, + "step": 10283 + }, + { + "epoch": 0.66, + "grad_norm": 0.9008111053865964, + "learning_rate": 2.797939513818426e-07, + "loss": 0.2403, + "step": 10284 + }, + { + "epoch": 0.66, + "grad_norm": 0.8943813175106697, + "learning_rate": 2.7970123719443073e-07, + "loss": 0.1797, + "step": 10285 + }, + { + "epoch": 0.66, + "grad_norm": 0.8584149610242897, + "learning_rate": 2.79608532405407e-07, + "loss": 0.0888, + "step": 10286 + }, + { + "epoch": 0.66, + "grad_norm": 15.404306424246116, + "learning_rate": 2.795158370187265e-07, + "loss": 0.1976, + "step": 10287 + }, + { + "epoch": 0.66, + "grad_norm": 0.7313523866606231, + "learning_rate": 2.794231510383435e-07, + "loss": 0.2239, + "step": 10288 + }, + { + "epoch": 0.66, + "grad_norm": 29.744310653057806, + "learning_rate": 2.7933047446821236e-07, + "loss": 0.1484, + "step": 10289 + }, + { + "epoch": 0.66, + "grad_norm": 0.4091252632914925, + "learning_rate": 2.7923780731228664e-07, + "loss": 0.3602, + "step": 10290 + }, + { + "epoch": 0.66, + "grad_norm": 0.17125294876779928, + "learning_rate": 2.7914514957452004e-07, + "loss": 0.0044, + "step": 10291 + }, + { + "epoch": 0.66, + "grad_norm": 0.8727869656872513, + "learning_rate": 2.7905250125886536e-07, + "loss": 0.33, + "step": 10292 + }, + { + "epoch": 0.66, + "grad_norm": 0.40791427587481266, + "learning_rate": 2.7895986236927495e-07, + "loss": 0.205, + "step": 10293 + }, + { + "epoch": 0.66, + "grad_norm": 0.808736533405036, + "learning_rate": 2.7886723290970134e-07, + "loss": 0.3945, + "step": 10294 + }, + { + "epoch": 0.66, + "grad_norm": 0.6901055506409601, + "learning_rate": 2.787746128840959e-07, + "loss": 0.1194, + "step": 10295 + }, + { + "epoch": 0.66, + "grad_norm": 0.5673610143798768, + "learning_rate": 2.7868200229641035e-07, + "loss": 0.2125, + "step": 10296 + }, + { + "epoch": 0.66, + "grad_norm": 25.81391849887281, + "learning_rate": 2.7858940115059537e-07, + "loss": 0.375, + "step": 10297 + }, + { + "epoch": 0.66, + "grad_norm": 0.9861928424107715, + "learning_rate": 2.7849680945060176e-07, + "loss": 0.0967, + "step": 10298 + }, + { + "epoch": 0.66, + "grad_norm": 0.45214295832004353, + "learning_rate": 2.784042272003794e-07, + "loss": 0.1678, + "step": 10299 + }, + { + "epoch": 0.66, + "grad_norm": 8.084999990659764, + "learning_rate": 2.7831165440387825e-07, + "loss": 0.3066, + "step": 10300 + }, + { + "epoch": 0.66, + "grad_norm": 1.2193244138626602, + "learning_rate": 2.7821909106504747e-07, + "loss": 0.3322, + "step": 10301 + }, + { + "epoch": 0.66, + "grad_norm": 5.810876479458508, + "learning_rate": 2.781265371878361e-07, + "loss": 0.1077, + "step": 10302 + }, + { + "epoch": 0.66, + "grad_norm": 3.3503260080641253, + "learning_rate": 2.7803399277619265e-07, + "loss": 0.3605, + "step": 10303 + }, + { + "epoch": 0.66, + "grad_norm": 1.420953137141911, + "learning_rate": 2.779414578340655e-07, + "loss": 0.1638, + "step": 10304 + }, + { + "epoch": 0.66, + "grad_norm": 0.8445873763051406, + "learning_rate": 2.778489323654022e-07, + "loss": 0.2076, + "step": 10305 + }, + { + "epoch": 0.66, + "grad_norm": 0.8129850797536065, + "learning_rate": 2.777564163741497e-07, + "loss": 0.1513, + "step": 10306 + }, + { + "epoch": 0.66, + "grad_norm": 1.4345052749212925, + "learning_rate": 2.7766390986425557e-07, + "loss": 0.2743, + "step": 10307 + }, + { + "epoch": 0.66, + "grad_norm": 0.608516525244101, + "learning_rate": 2.775714128396657e-07, + "loss": 0.3439, + "step": 10308 + }, + { + "epoch": 0.66, + "grad_norm": 1.3602859943341306, + "learning_rate": 2.7747892530432675e-07, + "loss": 0.1154, + "step": 10309 + }, + { + "epoch": 0.66, + "grad_norm": 0.8412252413073988, + "learning_rate": 2.7738644726218396e-07, + "loss": 0.0925, + "step": 10310 + }, + { + "epoch": 0.66, + "grad_norm": 0.4230985042881405, + "learning_rate": 2.77293978717183e-07, + "loss": 0.0543, + "step": 10311 + }, + { + "epoch": 0.66, + "grad_norm": 1.730359179745662, + "learning_rate": 2.7720151967326845e-07, + "loss": 0.083, + "step": 10312 + }, + { + "epoch": 0.66, + "grad_norm": 1.1155228647285207, + "learning_rate": 2.771090701343851e-07, + "loss": 0.4335, + "step": 10313 + }, + { + "epoch": 0.66, + "grad_norm": 0.6157355401303554, + "learning_rate": 2.7701663010447673e-07, + "loss": 0.099, + "step": 10314 + }, + { + "epoch": 0.66, + "grad_norm": 0.607567660844606, + "learning_rate": 2.7692419958748734e-07, + "loss": 0.0942, + "step": 10315 + }, + { + "epoch": 0.66, + "grad_norm": 1.7522680907313777, + "learning_rate": 2.7683177858736e-07, + "loss": 0.1124, + "step": 10316 + }, + { + "epoch": 0.66, + "grad_norm": 3.149502614901415, + "learning_rate": 2.767393671080376e-07, + "loss": 0.1974, + "step": 10317 + }, + { + "epoch": 0.66, + "grad_norm": 0.7498093110361993, + "learning_rate": 2.766469651534624e-07, + "loss": 0.2381, + "step": 10318 + }, + { + "epoch": 0.66, + "grad_norm": 0.6138349791888598, + "learning_rate": 2.765545727275768e-07, + "loss": 0.1838, + "step": 10319 + }, + { + "epoch": 0.66, + "grad_norm": 0.8325056592051004, + "learning_rate": 2.7646218983432245e-07, + "loss": 0.2273, + "step": 10320 + }, + { + "epoch": 0.66, + "grad_norm": 0.5471731317832644, + "learning_rate": 2.7636981647764024e-07, + "loss": 0.1571, + "step": 10321 + }, + { + "epoch": 0.66, + "grad_norm": 0.39690254731037833, + "learning_rate": 2.762774526614714e-07, + "loss": 0.0376, + "step": 10322 + }, + { + "epoch": 0.66, + "grad_norm": 0.24004478182410177, + "learning_rate": 2.76185098389756e-07, + "loss": 0.1198, + "step": 10323 + }, + { + "epoch": 0.66, + "grad_norm": 5.3566657921106415, + "learning_rate": 2.760927536664344e-07, + "loss": 0.2131, + "step": 10324 + }, + { + "epoch": 0.66, + "grad_norm": 0.9512974098221918, + "learning_rate": 2.7600041849544584e-07, + "loss": 0.1373, + "step": 10325 + }, + { + "epoch": 0.66, + "grad_norm": 0.7863187991648417, + "learning_rate": 2.7590809288073e-07, + "loss": 0.1968, + "step": 10326 + }, + { + "epoch": 0.66, + "grad_norm": 0.788712920122992, + "learning_rate": 2.7581577682622513e-07, + "loss": 0.2357, + "step": 10327 + }, + { + "epoch": 0.66, + "grad_norm": 1.284394747997247, + "learning_rate": 2.757234703358701e-07, + "loss": 0.1383, + "step": 10328 + }, + { + "epoch": 0.66, + "grad_norm": 6.363789118705277, + "learning_rate": 2.7563117341360244e-07, + "loss": 0.0422, + "step": 10329 + }, + { + "epoch": 0.66, + "grad_norm": 0.31012208836261174, + "learning_rate": 2.7553888606336016e-07, + "loss": 0.0392, + "step": 10330 + }, + { + "epoch": 0.66, + "grad_norm": 0.7872579617347853, + "learning_rate": 2.754466082890801e-07, + "loss": 0.0694, + "step": 10331 + }, + { + "epoch": 0.66, + "grad_norm": 0.14947528566196186, + "learning_rate": 2.7535434009469924e-07, + "loss": 0.0021, + "step": 10332 + }, + { + "epoch": 0.66, + "grad_norm": 4.8144802725901945, + "learning_rate": 2.7526208148415394e-07, + "loss": 0.1461, + "step": 10333 + }, + { + "epoch": 0.66, + "grad_norm": 1.3009842356808723, + "learning_rate": 2.7516983246137977e-07, + "loss": 0.3036, + "step": 10334 + }, + { + "epoch": 0.66, + "grad_norm": 0.9552782663324134, + "learning_rate": 2.750775930303125e-07, + "loss": 0.3527, + "step": 10335 + }, + { + "epoch": 0.66, + "grad_norm": 2.1074123808822676, + "learning_rate": 2.749853631948875e-07, + "loss": 0.1009, + "step": 10336 + }, + { + "epoch": 0.66, + "grad_norm": 1.7717845227744466, + "learning_rate": 2.74893142959039e-07, + "loss": 0.1866, + "step": 10337 + }, + { + "epoch": 0.66, + "grad_norm": 3.847477736795236, + "learning_rate": 2.7480093232670155e-07, + "loss": 0.0834, + "step": 10338 + }, + { + "epoch": 0.66, + "grad_norm": 0.8115874522348948, + "learning_rate": 2.747087313018092e-07, + "loss": 0.29, + "step": 10339 + }, + { + "epoch": 0.66, + "grad_norm": 1.3672825170076108, + "learning_rate": 2.7461653988829515e-07, + "loss": 0.1825, + "step": 10340 + }, + { + "epoch": 0.66, + "grad_norm": 0.8866651351700062, + "learning_rate": 2.745243580900927e-07, + "loss": 0.2698, + "step": 10341 + }, + { + "epoch": 0.66, + "grad_norm": 0.5706974465252797, + "learning_rate": 2.7443218591113427e-07, + "loss": 0.2585, + "step": 10342 + }, + { + "epoch": 0.66, + "grad_norm": 1.6401080035984237, + "learning_rate": 2.7434002335535233e-07, + "loss": 0.5122, + "step": 10343 + }, + { + "epoch": 0.66, + "grad_norm": 1.0598099448529625, + "learning_rate": 2.742478704266785e-07, + "loss": 0.152, + "step": 10344 + }, + { + "epoch": 0.66, + "grad_norm": 1.573205755417549, + "learning_rate": 2.7415572712904454e-07, + "loss": 0.0744, + "step": 10345 + }, + { + "epoch": 0.66, + "grad_norm": 0.9795329057276252, + "learning_rate": 2.740635934663813e-07, + "loss": 0.1945, + "step": 10346 + }, + { + "epoch": 0.66, + "grad_norm": 4.036694928135541, + "learning_rate": 2.739714694426191e-07, + "loss": 0.1459, + "step": 10347 + }, + { + "epoch": 0.66, + "grad_norm": 1.1962920417360572, + "learning_rate": 2.7387935506168857e-07, + "loss": 0.1081, + "step": 10348 + }, + { + "epoch": 0.66, + "grad_norm": 1.047942062306148, + "learning_rate": 2.7378725032751915e-07, + "loss": 0.1798, + "step": 10349 + }, + { + "epoch": 0.66, + "grad_norm": 0.8520817851554787, + "learning_rate": 2.7369515524404064e-07, + "loss": 0.4585, + "step": 10350 + }, + { + "epoch": 0.66, + "grad_norm": 1.03077299174891, + "learning_rate": 2.7360306981518147e-07, + "loss": 0.3425, + "step": 10351 + }, + { + "epoch": 0.66, + "grad_norm": 0.4387898763168881, + "learning_rate": 2.735109940448706e-07, + "loss": 0.0154, + "step": 10352 + }, + { + "epoch": 0.66, + "grad_norm": 0.6529751277320938, + "learning_rate": 2.734189279370359e-07, + "loss": 0.1431, + "step": 10353 + }, + { + "epoch": 0.66, + "grad_norm": 1.8642363768247914, + "learning_rate": 2.733268714956052e-07, + "loss": 0.1578, + "step": 10354 + }, + { + "epoch": 0.66, + "grad_norm": 0.5916966740097632, + "learning_rate": 2.7323482472450597e-07, + "loss": 0.3505, + "step": 10355 + }, + { + "epoch": 0.66, + "grad_norm": 0.9445762314627908, + "learning_rate": 2.7314278762766483e-07, + "loss": 0.3951, + "step": 10356 + }, + { + "epoch": 0.66, + "grad_norm": 0.7871275307321242, + "learning_rate": 2.7305076020900843e-07, + "loss": 0.2309, + "step": 10357 + }, + { + "epoch": 0.66, + "grad_norm": 2.1404180942625537, + "learning_rate": 2.7295874247246303e-07, + "loss": 0.1901, + "step": 10358 + }, + { + "epoch": 0.66, + "grad_norm": 1.4365060851096445, + "learning_rate": 2.7286673442195406e-07, + "loss": 0.1272, + "step": 10359 + }, + { + "epoch": 0.66, + "grad_norm": 0.6029931644642945, + "learning_rate": 2.727747360614066e-07, + "loss": 0.1351, + "step": 10360 + }, + { + "epoch": 0.66, + "grad_norm": 0.47347890544347404, + "learning_rate": 2.726827473947458e-07, + "loss": 0.3076, + "step": 10361 + }, + { + "epoch": 0.66, + "grad_norm": 0.47592070419616117, + "learning_rate": 2.725907684258959e-07, + "loss": 0.0055, + "step": 10362 + }, + { + "epoch": 0.66, + "grad_norm": 8.371716287788669, + "learning_rate": 2.7249879915878117e-07, + "loss": 0.0995, + "step": 10363 + }, + { + "epoch": 0.66, + "grad_norm": 12.834889662434467, + "learning_rate": 2.724068395973247e-07, + "loss": 0.2137, + "step": 10364 + }, + { + "epoch": 0.66, + "grad_norm": 0.8000913131057464, + "learning_rate": 2.7231488974545017e-07, + "loss": 0.2315, + "step": 10365 + }, + { + "epoch": 0.66, + "grad_norm": 0.564255357468439, + "learning_rate": 2.7222294960708004e-07, + "loss": 0.0632, + "step": 10366 + }, + { + "epoch": 0.66, + "grad_norm": 0.7205279558390081, + "learning_rate": 2.721310191861369e-07, + "loss": 0.1315, + "step": 10367 + }, + { + "epoch": 0.66, + "grad_norm": 1.7295300427974798, + "learning_rate": 2.720390984865424e-07, + "loss": 0.2025, + "step": 10368 + }, + { + "epoch": 0.66, + "grad_norm": 1.3406120790995233, + "learning_rate": 2.7194718751221836e-07, + "loss": 0.3642, + "step": 10369 + }, + { + "epoch": 0.66, + "grad_norm": 1.258689744960688, + "learning_rate": 2.7185528626708553e-07, + "loss": 0.1006, + "step": 10370 + }, + { + "epoch": 0.66, + "grad_norm": 0.43996053820520636, + "learning_rate": 2.717633947550651e-07, + "loss": 0.1191, + "step": 10371 + }, + { + "epoch": 0.66, + "grad_norm": 0.4180444976354502, + "learning_rate": 2.7167151298007686e-07, + "loss": 0.0946, + "step": 10372 + }, + { + "epoch": 0.66, + "grad_norm": 0.3772174713897463, + "learning_rate": 2.715796409460408e-07, + "loss": 0.1083, + "step": 10373 + }, + { + "epoch": 0.66, + "grad_norm": 0.7817672051214644, + "learning_rate": 2.714877786568767e-07, + "loss": 0.395, + "step": 10374 + }, + { + "epoch": 0.66, + "grad_norm": 5.325419730471386, + "learning_rate": 2.713959261165031e-07, + "loss": 0.2452, + "step": 10375 + }, + { + "epoch": 0.66, + "grad_norm": 0.5344302760002922, + "learning_rate": 2.7130408332883904e-07, + "loss": 0.1106, + "step": 10376 + }, + { + "epoch": 0.66, + "grad_norm": 0.9452148141559257, + "learning_rate": 2.7121225029780235e-07, + "loss": 0.2956, + "step": 10377 + }, + { + "epoch": 0.66, + "grad_norm": 1.8962247123977751, + "learning_rate": 2.7112042702731117e-07, + "loss": 0.1983, + "step": 10378 + }, + { + "epoch": 0.66, + "grad_norm": 1.8331461827843374, + "learning_rate": 2.7102861352128246e-07, + "loss": 0.4281, + "step": 10379 + }, + { + "epoch": 0.66, + "grad_norm": 0.9961725351184338, + "learning_rate": 2.709368097836336e-07, + "loss": 0.4122, + "step": 10380 + }, + { + "epoch": 0.66, + "grad_norm": 3.1622811223499467, + "learning_rate": 2.708450158182807e-07, + "loss": 0.0962, + "step": 10381 + }, + { + "epoch": 0.66, + "grad_norm": 0.5044438682947123, + "learning_rate": 2.707532316291403e-07, + "loss": 0.1427, + "step": 10382 + }, + { + "epoch": 0.66, + "grad_norm": 1.0408060350720334, + "learning_rate": 2.7066145722012767e-07, + "loss": 0.1227, + "step": 10383 + }, + { + "epoch": 0.66, + "grad_norm": 0.9981343484997779, + "learning_rate": 2.7056969259515856e-07, + "loss": 0.3258, + "step": 10384 + }, + { + "epoch": 0.66, + "grad_norm": 11.124303491774027, + "learning_rate": 2.704779377581473e-07, + "loss": 0.1624, + "step": 10385 + }, + { + "epoch": 0.66, + "grad_norm": 0.8375855156700935, + "learning_rate": 2.7038619271300886e-07, + "loss": 0.5048, + "step": 10386 + }, + { + "epoch": 0.66, + "grad_norm": 0.3768231972231354, + "learning_rate": 2.70294457463657e-07, + "loss": 0.1078, + "step": 10387 + }, + { + "epoch": 0.66, + "grad_norm": 0.6236407902616248, + "learning_rate": 2.7020273201400514e-07, + "loss": 0.282, + "step": 10388 + }, + { + "epoch": 0.66, + "grad_norm": 0.8953272419992317, + "learning_rate": 2.7011101636796674e-07, + "loss": 0.3133, + "step": 10389 + }, + { + "epoch": 0.66, + "grad_norm": 2.074712334373866, + "learning_rate": 2.700193105294545e-07, + "loss": 0.2425, + "step": 10390 + }, + { + "epoch": 0.66, + "grad_norm": 0.7647295096150315, + "learning_rate": 2.6992761450238086e-07, + "loss": 0.1282, + "step": 10391 + }, + { + "epoch": 0.66, + "grad_norm": 0.6549380344787635, + "learning_rate": 2.6983592829065765e-07, + "loss": 0.1558, + "step": 10392 + }, + { + "epoch": 0.66, + "grad_norm": 2.145107928758458, + "learning_rate": 2.697442518981966e-07, + "loss": 0.2234, + "step": 10393 + }, + { + "epoch": 0.66, + "grad_norm": 0.47280586014705855, + "learning_rate": 2.696525853289084e-07, + "loss": 0.057, + "step": 10394 + }, + { + "epoch": 0.66, + "grad_norm": 0.5057971735520573, + "learning_rate": 2.6956092858670416e-07, + "loss": 0.0897, + "step": 10395 + }, + { + "epoch": 0.66, + "grad_norm": 3.2933726887911243, + "learning_rate": 2.6946928167549377e-07, + "loss": 0.1144, + "step": 10396 + }, + { + "epoch": 0.66, + "grad_norm": 0.4718624220367011, + "learning_rate": 2.693776445991874e-07, + "loss": 0.0291, + "step": 10397 + }, + { + "epoch": 0.66, + "grad_norm": 1.0816261079841452, + "learning_rate": 2.692860173616942e-07, + "loss": 0.1048, + "step": 10398 + }, + { + "epoch": 0.66, + "grad_norm": 0.4821952474066926, + "learning_rate": 2.6919439996692343e-07, + "loss": 0.1043, + "step": 10399 + }, + { + "epoch": 0.66, + "grad_norm": 1.3739327583129264, + "learning_rate": 2.6910279241878337e-07, + "loss": 0.2589, + "step": 10400 + }, + { + "epoch": 0.66, + "grad_norm": 0.6883030170882999, + "learning_rate": 2.690111947211825e-07, + "loss": 0.173, + "step": 10401 + }, + { + "epoch": 0.66, + "grad_norm": 0.5572504594788301, + "learning_rate": 2.689196068780285e-07, + "loss": 0.2161, + "step": 10402 + }, + { + "epoch": 0.66, + "grad_norm": 0.8375524659754772, + "learning_rate": 2.688280288932283e-07, + "loss": 0.044, + "step": 10403 + }, + { + "epoch": 0.66, + "grad_norm": 1.1238918985541626, + "learning_rate": 2.687364607706893e-07, + "loss": 0.2673, + "step": 10404 + }, + { + "epoch": 0.66, + "grad_norm": 0.9614332658496733, + "learning_rate": 2.686449025143177e-07, + "loss": 0.2368, + "step": 10405 + }, + { + "epoch": 0.66, + "grad_norm": 5.139127536266855, + "learning_rate": 2.685533541280197e-07, + "loss": 0.2419, + "step": 10406 + }, + { + "epoch": 0.66, + "grad_norm": 0.9242720014787659, + "learning_rate": 2.684618156157008e-07, + "loss": 0.3948, + "step": 10407 + }, + { + "epoch": 0.66, + "grad_norm": 8.721545218688634, + "learning_rate": 2.683702869812662e-07, + "loss": 0.0945, + "step": 10408 + }, + { + "epoch": 0.66, + "grad_norm": 15.050138743858733, + "learning_rate": 2.682787682286209e-07, + "loss": 0.1971, + "step": 10409 + }, + { + "epoch": 0.66, + "grad_norm": 0.5187178695507517, + "learning_rate": 2.681872593616693e-07, + "loss": 0.2237, + "step": 10410 + }, + { + "epoch": 0.66, + "grad_norm": 0.28910169011590614, + "learning_rate": 2.6809576038431504e-07, + "loss": 0.0723, + "step": 10411 + }, + { + "epoch": 0.66, + "grad_norm": 0.7739970350627858, + "learning_rate": 2.68004271300462e-07, + "loss": 0.3179, + "step": 10412 + }, + { + "epoch": 0.66, + "grad_norm": 0.8477935669559543, + "learning_rate": 2.6791279211401284e-07, + "loss": 0.3444, + "step": 10413 + }, + { + "epoch": 0.66, + "grad_norm": 0.3818642354491543, + "learning_rate": 2.678213228288708e-07, + "loss": 0.0815, + "step": 10414 + }, + { + "epoch": 0.66, + "grad_norm": 0.528649656653528, + "learning_rate": 2.677298634489379e-07, + "loss": 0.3162, + "step": 10415 + }, + { + "epoch": 0.66, + "grad_norm": 0.9286483054980392, + "learning_rate": 2.676384139781157e-07, + "loss": 0.0572, + "step": 10416 + }, + { + "epoch": 0.66, + "grad_norm": 0.7625097904677913, + "learning_rate": 2.675469744203061e-07, + "loss": 0.3784, + "step": 10417 + }, + { + "epoch": 0.66, + "grad_norm": 0.2864610662133286, + "learning_rate": 2.6745554477940967e-07, + "loss": 0.0046, + "step": 10418 + }, + { + "epoch": 0.66, + "grad_norm": 0.6962038626822891, + "learning_rate": 2.6736412505932737e-07, + "loss": 0.2113, + "step": 10419 + }, + { + "epoch": 0.66, + "grad_norm": 1.0503986618584422, + "learning_rate": 2.672727152639589e-07, + "loss": 0.1527, + "step": 10420 + }, + { + "epoch": 0.66, + "grad_norm": 0.8225215419075138, + "learning_rate": 2.6718131539720445e-07, + "loss": 0.0783, + "step": 10421 + }, + { + "epoch": 0.66, + "grad_norm": 0.5404830518009177, + "learning_rate": 2.6708992546296294e-07, + "loss": 0.1638, + "step": 10422 + }, + { + "epoch": 0.66, + "grad_norm": 0.9482240485776333, + "learning_rate": 2.6699854546513354e-07, + "loss": 0.2334, + "step": 10423 + }, + { + "epoch": 0.66, + "grad_norm": 2.5889709783066235, + "learning_rate": 2.6690717540761446e-07, + "loss": 0.293, + "step": 10424 + }, + { + "epoch": 0.66, + "grad_norm": 0.558957254022895, + "learning_rate": 2.6681581529430384e-07, + "loss": 0.1498, + "step": 10425 + }, + { + "epoch": 0.66, + "grad_norm": 1.316452324124796, + "learning_rate": 2.667244651290995e-07, + "loss": 0.0172, + "step": 10426 + }, + { + "epoch": 0.66, + "grad_norm": 1.0084936543747833, + "learning_rate": 2.666331249158982e-07, + "loss": 0.0781, + "step": 10427 + }, + { + "epoch": 0.67, + "grad_norm": 1.1415900491388742, + "learning_rate": 2.665417946585972e-07, + "loss": 0.3482, + "step": 10428 + }, + { + "epoch": 0.67, + "grad_norm": 1.230059651396525, + "learning_rate": 2.6645047436109224e-07, + "loss": 0.1471, + "step": 10429 + }, + { + "epoch": 0.67, + "grad_norm": 0.6010759257443067, + "learning_rate": 2.6635916402727975e-07, + "loss": 0.1955, + "step": 10430 + }, + { + "epoch": 0.67, + "grad_norm": 1.5991172116467856, + "learning_rate": 2.662678636610549e-07, + "loss": 0.4037, + "step": 10431 + }, + { + "epoch": 0.67, + "grad_norm": 1.3643282391243785, + "learning_rate": 2.6617657326631296e-07, + "loss": 0.1668, + "step": 10432 + }, + { + "epoch": 0.67, + "grad_norm": 0.5806538082505955, + "learning_rate": 2.6608529284694823e-07, + "loss": 0.1467, + "step": 10433 + }, + { + "epoch": 0.67, + "grad_norm": 0.6803427975121148, + "learning_rate": 2.6599402240685543e-07, + "loss": 0.0592, + "step": 10434 + }, + { + "epoch": 0.67, + "grad_norm": 1.2480218245656194, + "learning_rate": 2.659027619499278e-07, + "loss": 0.1108, + "step": 10435 + }, + { + "epoch": 0.67, + "grad_norm": 0.7961858815220818, + "learning_rate": 2.658115114800591e-07, + "loss": 0.2516, + "step": 10436 + }, + { + "epoch": 0.67, + "grad_norm": 0.9066160613403413, + "learning_rate": 2.657202710011419e-07, + "loss": 0.2253, + "step": 10437 + }, + { + "epoch": 0.67, + "grad_norm": 2.6353668729752076, + "learning_rate": 2.6562904051706913e-07, + "loss": 0.1116, + "step": 10438 + }, + { + "epoch": 0.67, + "grad_norm": 0.8542947566658707, + "learning_rate": 2.655378200317324e-07, + "loss": 0.1967, + "step": 10439 + }, + { + "epoch": 0.67, + "grad_norm": 0.6447993647207726, + "learning_rate": 2.6544660954902385e-07, + "loss": 0.2617, + "step": 10440 + }, + { + "epoch": 0.67, + "grad_norm": 0.8252586569785209, + "learning_rate": 2.653554090728342e-07, + "loss": 0.2064, + "step": 10441 + }, + { + "epoch": 0.67, + "grad_norm": 1.551786203138509, + "learning_rate": 2.6526421860705473e-07, + "loss": 0.1988, + "step": 10442 + }, + { + "epoch": 0.67, + "grad_norm": 0.888358863306191, + "learning_rate": 2.6517303815557533e-07, + "loss": 0.168, + "step": 10443 + }, + { + "epoch": 0.67, + "grad_norm": 0.39348950210940503, + "learning_rate": 2.6508186772228626e-07, + "loss": 0.2157, + "step": 10444 + }, + { + "epoch": 0.67, + "grad_norm": 0.9599256798437353, + "learning_rate": 2.649907073110771e-07, + "loss": 0.2491, + "step": 10445 + }, + { + "epoch": 0.67, + "grad_norm": 0.7617142381150503, + "learning_rate": 2.648995569258366e-07, + "loss": 0.4391, + "step": 10446 + }, + { + "epoch": 0.67, + "grad_norm": 3.397491569318164, + "learning_rate": 2.6480841657045384e-07, + "loss": 0.0671, + "step": 10447 + }, + { + "epoch": 0.67, + "grad_norm": 1.3137672202187631, + "learning_rate": 2.6471728624881657e-07, + "loss": 0.2771, + "step": 10448 + }, + { + "epoch": 0.67, + "grad_norm": 0.9264509751527862, + "learning_rate": 2.6462616596481306e-07, + "loss": 0.1368, + "step": 10449 + }, + { + "epoch": 0.67, + "grad_norm": 1.1262196550745276, + "learning_rate": 2.645350557223303e-07, + "loss": 0.0926, + "step": 10450 + }, + { + "epoch": 0.67, + "grad_norm": 0.6099039134737922, + "learning_rate": 2.6444395552525556e-07, + "loss": 0.1469, + "step": 10451 + }, + { + "epoch": 0.67, + "grad_norm": 0.7466957962216927, + "learning_rate": 2.6435286537747507e-07, + "loss": 0.1395, + "step": 10452 + }, + { + "epoch": 0.67, + "grad_norm": 0.9883365727329495, + "learning_rate": 2.6426178528287526e-07, + "loss": 0.1304, + "step": 10453 + }, + { + "epoch": 0.67, + "grad_norm": 0.7112658868253172, + "learning_rate": 2.641707152453413e-07, + "loss": 0.1583, + "step": 10454 + }, + { + "epoch": 0.67, + "grad_norm": 3.708473714266294, + "learning_rate": 2.64079655268759e-07, + "loss": 0.0373, + "step": 10455 + }, + { + "epoch": 0.67, + "grad_norm": 4.028738953398234, + "learning_rate": 2.6398860535701273e-07, + "loss": 0.167, + "step": 10456 + }, + { + "epoch": 0.67, + "grad_norm": 2.0053975009335385, + "learning_rate": 2.638975655139869e-07, + "loss": 0.0174, + "step": 10457 + }, + { + "epoch": 0.67, + "grad_norm": 0.2956561433469352, + "learning_rate": 2.6380653574356576e-07, + "loss": 0.0927, + "step": 10458 + }, + { + "epoch": 0.67, + "grad_norm": 0.7033163724987198, + "learning_rate": 2.637155160496324e-07, + "loss": 0.3044, + "step": 10459 + }, + { + "epoch": 0.67, + "grad_norm": 0.7629600639173423, + "learning_rate": 2.636245064360701e-07, + "loss": 0.3183, + "step": 10460 + }, + { + "epoch": 0.67, + "grad_norm": 0.30311969857539905, + "learning_rate": 2.635335069067617e-07, + "loss": 0.2666, + "step": 10461 + }, + { + "epoch": 0.67, + "grad_norm": 3.8735044339443045, + "learning_rate": 2.634425174655891e-07, + "loss": 0.1904, + "step": 10462 + }, + { + "epoch": 0.67, + "grad_norm": 0.4713183463796078, + "learning_rate": 2.633515381164342e-07, + "loss": 0.0081, + "step": 10463 + }, + { + "epoch": 0.67, + "grad_norm": 2.2370586944612736, + "learning_rate": 2.632605688631787e-07, + "loss": 0.2827, + "step": 10464 + }, + { + "epoch": 0.67, + "grad_norm": 0.6483202060911514, + "learning_rate": 2.631696097097029e-07, + "loss": 0.1949, + "step": 10465 + }, + { + "epoch": 0.67, + "grad_norm": 0.606500670229192, + "learning_rate": 2.630786606598879e-07, + "loss": 0.2757, + "step": 10466 + }, + { + "epoch": 0.67, + "grad_norm": 0.8034149397610436, + "learning_rate": 2.629877217176134e-07, + "loss": 0.3208, + "step": 10467 + }, + { + "epoch": 0.67, + "grad_norm": 1.0127645399924585, + "learning_rate": 2.6289679288675923e-07, + "loss": 0.0089, + "step": 10468 + }, + { + "epoch": 0.67, + "grad_norm": 1.1079613960310797, + "learning_rate": 2.628058741712046e-07, + "loss": 0.0142, + "step": 10469 + }, + { + "epoch": 0.67, + "grad_norm": 1.124947011569078, + "learning_rate": 2.627149655748279e-07, + "loss": 0.1319, + "step": 10470 + }, + { + "epoch": 0.67, + "grad_norm": 1.8200726489442691, + "learning_rate": 2.626240671015081e-07, + "loss": 0.0614, + "step": 10471 + }, + { + "epoch": 0.67, + "grad_norm": 11.110121682498992, + "learning_rate": 2.625331787551225e-07, + "loss": 0.2593, + "step": 10472 + }, + { + "epoch": 0.67, + "grad_norm": 0.5619631755302068, + "learning_rate": 2.62442300539549e-07, + "loss": 0.0547, + "step": 10473 + }, + { + "epoch": 0.67, + "grad_norm": 0.5592244554126755, + "learning_rate": 2.6235143245866445e-07, + "loss": 0.176, + "step": 10474 + }, + { + "epoch": 0.67, + "grad_norm": 0.31486389004552046, + "learning_rate": 2.622605745163456e-07, + "loss": 0.2191, + "step": 10475 + }, + { + "epoch": 0.67, + "grad_norm": 0.6316466594089674, + "learning_rate": 2.621697267164684e-07, + "loss": 0.1195, + "step": 10476 + }, + { + "epoch": 0.67, + "grad_norm": 0.5717075827821232, + "learning_rate": 2.62078889062909e-07, + "loss": 0.4002, + "step": 10477 + }, + { + "epoch": 0.67, + "grad_norm": 0.8042376967286633, + "learning_rate": 2.619880615595422e-07, + "loss": 0.2113, + "step": 10478 + }, + { + "epoch": 0.67, + "grad_norm": 0.4112285756893933, + "learning_rate": 2.6189724421024315e-07, + "loss": 0.158, + "step": 10479 + }, + { + "epoch": 0.67, + "grad_norm": 2.8920847339347455, + "learning_rate": 2.618064370188864e-07, + "loss": 0.3246, + "step": 10480 + }, + { + "epoch": 0.67, + "grad_norm": 1.4096801217003756, + "learning_rate": 2.61715639989346e-07, + "loss": 0.0097, + "step": 10481 + }, + { + "epoch": 0.67, + "grad_norm": 0.45938981760695313, + "learning_rate": 2.616248531254953e-07, + "loss": 0.2131, + "step": 10482 + }, + { + "epoch": 0.67, + "grad_norm": 0.7588298537674147, + "learning_rate": 2.615340764312077e-07, + "loss": 0.2523, + "step": 10483 + }, + { + "epoch": 0.67, + "grad_norm": 1.1579948270642288, + "learning_rate": 2.614433099103558e-07, + "loss": 0.0244, + "step": 10484 + }, + { + "epoch": 0.67, + "grad_norm": 0.4989591486028849, + "learning_rate": 2.613525535668116e-07, + "loss": 0.0517, + "step": 10485 + }, + { + "epoch": 0.67, + "grad_norm": 1.6484474008402006, + "learning_rate": 2.612618074044475e-07, + "loss": 0.1227, + "step": 10486 + }, + { + "epoch": 0.67, + "grad_norm": 1.5772858373783285, + "learning_rate": 2.611710714271343e-07, + "loss": 0.0385, + "step": 10487 + }, + { + "epoch": 0.67, + "grad_norm": 0.9127399629002068, + "learning_rate": 2.610803456387436e-07, + "loss": 0.2728, + "step": 10488 + }, + { + "epoch": 0.67, + "grad_norm": 1.1753583013101303, + "learning_rate": 2.6098963004314536e-07, + "loss": 0.0091, + "step": 10489 + }, + { + "epoch": 0.67, + "grad_norm": 0.6615157221653731, + "learning_rate": 2.6089892464421025e-07, + "loss": 0.1522, + "step": 10490 + }, + { + "epoch": 0.67, + "grad_norm": 0.3786927597294344, + "learning_rate": 2.6080822944580736e-07, + "loss": 0.0326, + "step": 10491 + }, + { + "epoch": 0.67, + "grad_norm": 1.8774515974536057, + "learning_rate": 2.607175444518066e-07, + "loss": 0.1332, + "step": 10492 + }, + { + "epoch": 0.67, + "grad_norm": 0.7859571173425213, + "learning_rate": 2.606268696660761e-07, + "loss": 0.0165, + "step": 10493 + }, + { + "epoch": 0.67, + "grad_norm": 1.4774352893560765, + "learning_rate": 2.6053620509248475e-07, + "loss": 0.2128, + "step": 10494 + }, + { + "epoch": 0.67, + "grad_norm": 0.8849807851339067, + "learning_rate": 2.604455507349001e-07, + "loss": 0.2165, + "step": 10495 + }, + { + "epoch": 0.67, + "grad_norm": 0.3053876242631579, + "learning_rate": 2.603549065971898e-07, + "loss": 0.1507, + "step": 10496 + }, + { + "epoch": 0.67, + "grad_norm": 1.236602850657951, + "learning_rate": 2.6026427268322113e-07, + "loss": 0.2823, + "step": 10497 + }, + { + "epoch": 0.67, + "grad_norm": 0.9731953198793252, + "learning_rate": 2.6017364899686035e-07, + "loss": 0.353, + "step": 10498 + }, + { + "epoch": 0.67, + "grad_norm": 12.849028799996507, + "learning_rate": 2.6008303554197396e-07, + "loss": 0.2629, + "step": 10499 + }, + { + "epoch": 0.67, + "grad_norm": 1.609483280714231, + "learning_rate": 2.5999243232242745e-07, + "loss": 0.019, + "step": 10500 + }, + { + "epoch": 0.67, + "grad_norm": 1.3888003858011602, + "learning_rate": 2.599018393420864e-07, + "loss": 0.1603, + "step": 10501 + }, + { + "epoch": 0.67, + "grad_norm": 2.2085305407941447, + "learning_rate": 2.5981125660481535e-07, + "loss": 0.1441, + "step": 10502 + }, + { + "epoch": 0.67, + "grad_norm": 10.44957664609367, + "learning_rate": 2.5972068411447914e-07, + "loss": 0.0748, + "step": 10503 + }, + { + "epoch": 0.67, + "grad_norm": 1.6362945088703087, + "learning_rate": 2.596301218749414e-07, + "loss": 0.0895, + "step": 10504 + }, + { + "epoch": 0.67, + "grad_norm": 0.7594616526315587, + "learning_rate": 2.5953956989006596e-07, + "loss": 0.1739, + "step": 10505 + }, + { + "epoch": 0.67, + "grad_norm": 1.0883217998113466, + "learning_rate": 2.594490281637157e-07, + "loss": 0.0775, + "step": 10506 + }, + { + "epoch": 0.67, + "grad_norm": 12.490342342348187, + "learning_rate": 2.5935849669975373e-07, + "loss": 0.2393, + "step": 10507 + }, + { + "epoch": 0.67, + "grad_norm": 1.0084608531775603, + "learning_rate": 2.5926797550204175e-07, + "loss": 0.2081, + "step": 10508 + }, + { + "epoch": 0.67, + "grad_norm": 0.577018754148711, + "learning_rate": 2.591774645744421e-07, + "loss": 0.2774, + "step": 10509 + }, + { + "epoch": 0.67, + "grad_norm": 0.6290297424700538, + "learning_rate": 2.5908696392081573e-07, + "loss": 0.2795, + "step": 10510 + }, + { + "epoch": 0.67, + "grad_norm": 0.6718054667773171, + "learning_rate": 2.5899647354502403e-07, + "loss": 0.1014, + "step": 10511 + }, + { + "epoch": 0.67, + "grad_norm": 0.45768586251432564, + "learning_rate": 2.58905993450927e-07, + "loss": 0.1617, + "step": 10512 + }, + { + "epoch": 0.67, + "grad_norm": 12.646220834732189, + "learning_rate": 2.588155236423851e-07, + "loss": 0.2365, + "step": 10513 + }, + { + "epoch": 0.67, + "grad_norm": 0.7789869069369135, + "learning_rate": 2.5872506412325767e-07, + "loss": 0.3558, + "step": 10514 + }, + { + "epoch": 0.67, + "grad_norm": 1.151739340656819, + "learning_rate": 2.5863461489740403e-07, + "loss": 0.3409, + "step": 10515 + }, + { + "epoch": 0.67, + "grad_norm": 0.5370364545434133, + "learning_rate": 2.585441759686831e-07, + "loss": 0.2612, + "step": 10516 + }, + { + "epoch": 0.67, + "grad_norm": 0.5427067013972633, + "learning_rate": 2.584537473409528e-07, + "loss": 0.1106, + "step": 10517 + }, + { + "epoch": 0.67, + "grad_norm": 1.617124445554166, + "learning_rate": 2.5836332901807145e-07, + "loss": 0.0732, + "step": 10518 + }, + { + "epoch": 0.67, + "grad_norm": 1.7087900502440856, + "learning_rate": 2.58272921003896e-07, + "loss": 0.1356, + "step": 10519 + }, + { + "epoch": 0.67, + "grad_norm": 2.951110245258846, + "learning_rate": 2.5818252330228387e-07, + "loss": 0.0826, + "step": 10520 + }, + { + "epoch": 0.67, + "grad_norm": 1.1491656278638571, + "learning_rate": 2.580921359170912e-07, + "loss": 0.076, + "step": 10521 + }, + { + "epoch": 0.67, + "grad_norm": 1.1568027759612993, + "learning_rate": 2.580017588521746e-07, + "loss": 0.2779, + "step": 10522 + }, + { + "epoch": 0.67, + "grad_norm": 3.8358940530762977, + "learning_rate": 2.579113921113891e-07, + "loss": 0.5154, + "step": 10523 + }, + { + "epoch": 0.67, + "grad_norm": 1.2170483844360773, + "learning_rate": 2.5782103569859054e-07, + "loss": 0.0438, + "step": 10524 + }, + { + "epoch": 0.67, + "grad_norm": 0.7479360058026202, + "learning_rate": 2.5773068961763343e-07, + "loss": 0.2036, + "step": 10525 + }, + { + "epoch": 0.67, + "grad_norm": 0.617781493911869, + "learning_rate": 2.576403538723719e-07, + "loss": 0.1514, + "step": 10526 + }, + { + "epoch": 0.67, + "grad_norm": 0.7265854935703562, + "learning_rate": 2.5755002846666033e-07, + "loss": 0.2367, + "step": 10527 + }, + { + "epoch": 0.67, + "grad_norm": 0.737124315075192, + "learning_rate": 2.5745971340435177e-07, + "loss": 0.293, + "step": 10528 + }, + { + "epoch": 0.67, + "grad_norm": 1.6024153526531448, + "learning_rate": 2.5736940868929953e-07, + "loss": 0.333, + "step": 10529 + }, + { + "epoch": 0.67, + "grad_norm": 7.599186344179326, + "learning_rate": 2.572791143253559e-07, + "loss": 0.1295, + "step": 10530 + }, + { + "epoch": 0.67, + "grad_norm": 0.7085186447716115, + "learning_rate": 2.5718883031637314e-07, + "loss": 0.2426, + "step": 10531 + }, + { + "epoch": 0.67, + "grad_norm": 2.8266295058925617, + "learning_rate": 2.5709855666620316e-07, + "loss": 0.1897, + "step": 10532 + }, + { + "epoch": 0.67, + "grad_norm": 0.7798735228341888, + "learning_rate": 2.5700829337869694e-07, + "loss": 0.215, + "step": 10533 + }, + { + "epoch": 0.67, + "grad_norm": 2.002867064307344, + "learning_rate": 2.5691804045770535e-07, + "loss": 0.038, + "step": 10534 + }, + { + "epoch": 0.67, + "grad_norm": 0.8655546053823612, + "learning_rate": 2.56827797907079e-07, + "loss": 0.11, + "step": 10535 + }, + { + "epoch": 0.67, + "grad_norm": 1.872437944181974, + "learning_rate": 2.5673756573066747e-07, + "loss": 0.1736, + "step": 10536 + }, + { + "epoch": 0.67, + "grad_norm": 1.3986849148475318, + "learning_rate": 2.5664734393232054e-07, + "loss": 0.2444, + "step": 10537 + }, + { + "epoch": 0.67, + "grad_norm": 0.5756746969524624, + "learning_rate": 2.5655713251588715e-07, + "loss": 0.1042, + "step": 10538 + }, + { + "epoch": 0.67, + "grad_norm": 2.2697659380365542, + "learning_rate": 2.564669314852157e-07, + "loss": 0.0931, + "step": 10539 + }, + { + "epoch": 0.67, + "grad_norm": 1.3407578119448709, + "learning_rate": 2.5637674084415473e-07, + "loss": 0.2629, + "step": 10540 + }, + { + "epoch": 0.67, + "grad_norm": 0.39097716840919244, + "learning_rate": 2.5628656059655153e-07, + "loss": 0.2376, + "step": 10541 + }, + { + "epoch": 0.67, + "grad_norm": 0.6718139343989389, + "learning_rate": 2.561963907462537e-07, + "loss": 0.2885, + "step": 10542 + }, + { + "epoch": 0.67, + "grad_norm": 1.8800900704244332, + "learning_rate": 2.5610623129710784e-07, + "loss": 0.1508, + "step": 10543 + }, + { + "epoch": 0.67, + "grad_norm": 1.0095528779448446, + "learning_rate": 2.560160822529606e-07, + "loss": 0.499, + "step": 10544 + }, + { + "epoch": 0.67, + "grad_norm": 0.7794417974665724, + "learning_rate": 2.5592594361765753e-07, + "loss": 0.1611, + "step": 10545 + }, + { + "epoch": 0.67, + "grad_norm": 0.18154471465380143, + "learning_rate": 2.5583581539504463e-07, + "loss": 0.1021, + "step": 10546 + }, + { + "epoch": 0.67, + "grad_norm": 0.7474633808438751, + "learning_rate": 2.557456975889664e-07, + "loss": 0.0125, + "step": 10547 + }, + { + "epoch": 0.67, + "grad_norm": 4.031072803768003, + "learning_rate": 2.55655590203268e-07, + "loss": 0.0846, + "step": 10548 + }, + { + "epoch": 0.67, + "grad_norm": 0.7108431360941682, + "learning_rate": 2.55565493241793e-07, + "loss": 0.3651, + "step": 10549 + }, + { + "epoch": 0.67, + "grad_norm": 0.6395022418878527, + "learning_rate": 2.5547540670838553e-07, + "loss": 0.3176, + "step": 10550 + }, + { + "epoch": 0.67, + "grad_norm": 1.2489121349943053, + "learning_rate": 2.553853306068888e-07, + "loss": 0.3709, + "step": 10551 + }, + { + "epoch": 0.67, + "grad_norm": 4.126379091094442, + "learning_rate": 2.552952649411457e-07, + "loss": 0.0413, + "step": 10552 + }, + { + "epoch": 0.67, + "grad_norm": 0.8528101579995798, + "learning_rate": 2.5520520971499857e-07, + "loss": 0.1108, + "step": 10553 + }, + { + "epoch": 0.67, + "grad_norm": 0.5092444165228933, + "learning_rate": 2.551151649322891e-07, + "loss": 0.0573, + "step": 10554 + }, + { + "epoch": 0.67, + "grad_norm": 1.123056991807261, + "learning_rate": 2.5502513059685916e-07, + "loss": 0.0976, + "step": 10555 + }, + { + "epoch": 0.67, + "grad_norm": 0.21528667818707273, + "learning_rate": 2.549351067125494e-07, + "loss": 0.004, + "step": 10556 + }, + { + "epoch": 0.67, + "grad_norm": 1.1022126568579682, + "learning_rate": 2.5484509328320083e-07, + "loss": 0.2352, + "step": 10557 + }, + { + "epoch": 0.67, + "grad_norm": 4.963214973942761, + "learning_rate": 2.5475509031265316e-07, + "loss": 0.1623, + "step": 10558 + }, + { + "epoch": 0.67, + "grad_norm": 0.33563736633622543, + "learning_rate": 2.5466509780474667e-07, + "loss": 0.0764, + "step": 10559 + }, + { + "epoch": 0.67, + "grad_norm": 0.9615137551229664, + "learning_rate": 2.5457511576332e-07, + "loss": 0.2759, + "step": 10560 + }, + { + "epoch": 0.67, + "grad_norm": 0.8480390854122233, + "learning_rate": 2.5448514419221243e-07, + "loss": 0.2355, + "step": 10561 + }, + { + "epoch": 0.67, + "grad_norm": 2.800579296234656, + "learning_rate": 2.5439518309526203e-07, + "loss": 0.1037, + "step": 10562 + }, + { + "epoch": 0.67, + "grad_norm": 0.8229825352930443, + "learning_rate": 2.5430523247630703e-07, + "loss": 0.1112, + "step": 10563 + }, + { + "epoch": 0.67, + "grad_norm": 0.4302533019832532, + "learning_rate": 2.542152923391845e-07, + "loss": 0.0144, + "step": 10564 + }, + { + "epoch": 0.67, + "grad_norm": 0.36507556899829297, + "learning_rate": 2.5412536268773186e-07, + "loss": 0.1387, + "step": 10565 + }, + { + "epoch": 0.67, + "grad_norm": 2.479189561026578, + "learning_rate": 2.5403544352578544e-07, + "loss": 0.3136, + "step": 10566 + }, + { + "epoch": 0.67, + "grad_norm": 0.4284166623846381, + "learning_rate": 2.539455348571813e-07, + "loss": 0.022, + "step": 10567 + }, + { + "epoch": 0.67, + "grad_norm": 2.206928190817757, + "learning_rate": 2.538556366857556e-07, + "loss": 0.2967, + "step": 10568 + }, + { + "epoch": 0.67, + "grad_norm": 0.25602016189714777, + "learning_rate": 2.5376574901534296e-07, + "loss": 0.0336, + "step": 10569 + }, + { + "epoch": 0.67, + "grad_norm": 0.667943144170874, + "learning_rate": 2.536758718497787e-07, + "loss": 0.0874, + "step": 10570 + }, + { + "epoch": 0.67, + "grad_norm": 1.1676065572146814, + "learning_rate": 2.535860051928967e-07, + "loss": 0.1725, + "step": 10571 + }, + { + "epoch": 0.67, + "grad_norm": 0.06383168117739876, + "learning_rate": 2.534961490485313e-07, + "loss": 0.0017, + "step": 10572 + }, + { + "epoch": 0.67, + "grad_norm": 0.612009773288431, + "learning_rate": 2.534063034205155e-07, + "loss": 0.3274, + "step": 10573 + }, + { + "epoch": 0.67, + "grad_norm": 0.5469969393598104, + "learning_rate": 2.5331646831268274e-07, + "loss": 0.156, + "step": 10574 + }, + { + "epoch": 0.67, + "grad_norm": 11.353782065378933, + "learning_rate": 2.53226643728865e-07, + "loss": 0.3062, + "step": 10575 + }, + { + "epoch": 0.67, + "grad_norm": 0.6720243725775021, + "learning_rate": 2.531368296728951e-07, + "loss": 0.1248, + "step": 10576 + }, + { + "epoch": 0.67, + "grad_norm": 0.4131062860017984, + "learning_rate": 2.53047026148604e-07, + "loss": 0.2135, + "step": 10577 + }, + { + "epoch": 0.67, + "grad_norm": 0.6516688749793922, + "learning_rate": 2.5295723315982345e-07, + "loss": 0.1384, + "step": 10578 + }, + { + "epoch": 0.67, + "grad_norm": 0.46519008824416797, + "learning_rate": 2.5286745071038396e-07, + "loss": 0.1715, + "step": 10579 + }, + { + "epoch": 0.67, + "grad_norm": 0.4264081869372595, + "learning_rate": 2.527776788041156e-07, + "loss": 0.1294, + "step": 10580 + }, + { + "epoch": 0.67, + "grad_norm": 1.1450129377251033, + "learning_rate": 2.526879174448486e-07, + "loss": 0.4197, + "step": 10581 + }, + { + "epoch": 0.67, + "grad_norm": 0.9483996792354955, + "learning_rate": 2.5259816663641205e-07, + "loss": 0.1096, + "step": 10582 + }, + { + "epoch": 0.67, + "grad_norm": 0.5704949117557274, + "learning_rate": 2.5250842638263526e-07, + "loss": 0.0159, + "step": 10583 + }, + { + "epoch": 0.67, + "grad_norm": 1.0299182900440016, + "learning_rate": 2.524186966873463e-07, + "loss": 0.3862, + "step": 10584 + }, + { + "epoch": 0.68, + "grad_norm": 4.038388330939207, + "learning_rate": 2.5232897755437346e-07, + "loss": 0.3125, + "step": 10585 + }, + { + "epoch": 0.68, + "grad_norm": 1.8743035505804668, + "learning_rate": 2.522392689875442e-07, + "loss": 0.1385, + "step": 10586 + }, + { + "epoch": 0.68, + "grad_norm": 0.1298631708065149, + "learning_rate": 2.521495709906861e-07, + "loss": 0.0042, + "step": 10587 + }, + { + "epoch": 0.68, + "grad_norm": 1.5824709882596042, + "learning_rate": 2.5205988356762536e-07, + "loss": 0.1166, + "step": 10588 + }, + { + "epoch": 0.68, + "grad_norm": 5.152329124092868, + "learning_rate": 2.519702067221886e-07, + "loss": 0.1667, + "step": 10589 + }, + { + "epoch": 0.68, + "grad_norm": 0.8882923684298127, + "learning_rate": 2.5188054045820115e-07, + "loss": 0.1068, + "step": 10590 + }, + { + "epoch": 0.68, + "grad_norm": 0.43831150842493927, + "learning_rate": 2.517908847794889e-07, + "loss": 0.4529, + "step": 10591 + }, + { + "epoch": 0.68, + "grad_norm": 1.1712969157414352, + "learning_rate": 2.517012396898762e-07, + "loss": 0.006, + "step": 10592 + }, + { + "epoch": 0.68, + "grad_norm": 0.5727387135701465, + "learning_rate": 2.51611605193188e-07, + "loss": 0.0746, + "step": 10593 + }, + { + "epoch": 0.68, + "grad_norm": 0.9156049890281388, + "learning_rate": 2.515219812932481e-07, + "loss": 0.3767, + "step": 10594 + }, + { + "epoch": 0.68, + "grad_norm": 1.43717320243928, + "learning_rate": 2.5143236799387975e-07, + "loss": 0.1063, + "step": 10595 + }, + { + "epoch": 0.68, + "grad_norm": 1.3269259054658187, + "learning_rate": 2.5134276529890644e-07, + "loss": 0.0593, + "step": 10596 + }, + { + "epoch": 0.68, + "grad_norm": 1.3760722979262, + "learning_rate": 2.5125317321215046e-07, + "loss": 0.2649, + "step": 10597 + }, + { + "epoch": 0.68, + "grad_norm": 0.9689287146562792, + "learning_rate": 2.511635917374343e-07, + "loss": 0.1306, + "step": 10598 + }, + { + "epoch": 0.68, + "grad_norm": 0.7995139365785888, + "learning_rate": 2.510740208785793e-07, + "loss": 0.1817, + "step": 10599 + }, + { + "epoch": 0.68, + "grad_norm": 1.1968501978495556, + "learning_rate": 2.5098446063940725e-07, + "loss": 0.1242, + "step": 10600 + }, + { + "epoch": 0.68, + "grad_norm": 0.47176432883077934, + "learning_rate": 2.508949110237385e-07, + "loss": 0.0039, + "step": 10601 + }, + { + "epoch": 0.68, + "grad_norm": 0.8108725101723075, + "learning_rate": 2.5080537203539364e-07, + "loss": 0.2779, + "step": 10602 + }, + { + "epoch": 0.68, + "grad_norm": 0.8583643549900759, + "learning_rate": 2.5071584367819267e-07, + "loss": 0.4885, + "step": 10603 + }, + { + "epoch": 0.68, + "grad_norm": 0.26309118018076055, + "learning_rate": 2.506263259559548e-07, + "loss": 0.0486, + "step": 10604 + }, + { + "epoch": 0.68, + "grad_norm": 0.8450317211287066, + "learning_rate": 2.505368188724991e-07, + "loss": 0.2276, + "step": 10605 + }, + { + "epoch": 0.68, + "grad_norm": 0.9268324661988067, + "learning_rate": 2.5044732243164444e-07, + "loss": 0.0802, + "step": 10606 + }, + { + "epoch": 0.68, + "grad_norm": 0.5004466655700627, + "learning_rate": 2.5035783663720866e-07, + "loss": 0.2071, + "step": 10607 + }, + { + "epoch": 0.68, + "grad_norm": 0.34058912591187507, + "learning_rate": 2.5026836149300924e-07, + "loss": 0.198, + "step": 10608 + }, + { + "epoch": 0.68, + "grad_norm": 1.3687588008158, + "learning_rate": 2.5017889700286366e-07, + "loss": 0.0463, + "step": 10609 + }, + { + "epoch": 0.68, + "grad_norm": 1.3371554072172445, + "learning_rate": 2.5008944317058844e-07, + "loss": 0.1614, + "step": 10610 + }, + { + "epoch": 0.68, + "grad_norm": 0.7631054936767125, + "learning_rate": 2.500000000000001e-07, + "loss": 0.2467, + "step": 10611 + }, + { + "epoch": 0.68, + "grad_norm": 12.570913992693844, + "learning_rate": 2.4991056749491414e-07, + "loss": 0.0508, + "step": 10612 + }, + { + "epoch": 0.68, + "grad_norm": 0.5580753188516984, + "learning_rate": 2.498211456591463e-07, + "loss": 0.1841, + "step": 10613 + }, + { + "epoch": 0.68, + "grad_norm": 3.311823748840983, + "learning_rate": 2.4973173449651106e-07, + "loss": 0.2012, + "step": 10614 + }, + { + "epoch": 0.68, + "grad_norm": 0.3992919977999924, + "learning_rate": 2.496423340108234e-07, + "loss": 0.0016, + "step": 10615 + }, + { + "epoch": 0.68, + "grad_norm": 0.7304015725613393, + "learning_rate": 2.4955294420589687e-07, + "loss": 0.2443, + "step": 10616 + }, + { + "epoch": 0.68, + "grad_norm": 0.7688464051008319, + "learning_rate": 2.494635650855453e-07, + "loss": 0.1986, + "step": 10617 + }, + { + "epoch": 0.68, + "grad_norm": 0.3838141955276136, + "learning_rate": 2.493741966535816e-07, + "loss": 0.0887, + "step": 10618 + }, + { + "epoch": 0.68, + "grad_norm": 1.1487147416039272, + "learning_rate": 2.4928483891381863e-07, + "loss": 0.1145, + "step": 10619 + }, + { + "epoch": 0.68, + "grad_norm": 1.5174528699626204, + "learning_rate": 2.4919549187006826e-07, + "loss": 0.2569, + "step": 10620 + }, + { + "epoch": 0.68, + "grad_norm": 0.8653498544965659, + "learning_rate": 2.491061555261424e-07, + "loss": 0.2112, + "step": 10621 + }, + { + "epoch": 0.68, + "grad_norm": 0.8580172433265398, + "learning_rate": 2.4901682988585244e-07, + "loss": 0.1548, + "step": 10622 + }, + { + "epoch": 0.68, + "grad_norm": 0.7678532818603045, + "learning_rate": 2.489275149530089e-07, + "loss": 0.2632, + "step": 10623 + }, + { + "epoch": 0.68, + "grad_norm": 0.852388175325931, + "learning_rate": 2.488382107314225e-07, + "loss": 0.2685, + "step": 10624 + }, + { + "epoch": 0.68, + "grad_norm": 1.4027321008603786, + "learning_rate": 2.487489172249027e-07, + "loss": 0.2474, + "step": 10625 + }, + { + "epoch": 0.68, + "grad_norm": 1.049118659300945, + "learning_rate": 2.486596344372594e-07, + "loss": 0.1608, + "step": 10626 + }, + { + "epoch": 0.68, + "grad_norm": 0.8708467854010007, + "learning_rate": 2.485703623723011e-07, + "loss": 0.214, + "step": 10627 + }, + { + "epoch": 0.68, + "grad_norm": 0.4338989155347018, + "learning_rate": 2.4848110103383683e-07, + "loss": 0.1835, + "step": 10628 + }, + { + "epoch": 0.68, + "grad_norm": 1.0908190163622082, + "learning_rate": 2.4839185042567424e-07, + "loss": 0.2569, + "step": 10629 + }, + { + "epoch": 0.68, + "grad_norm": 0.6196024690562174, + "learning_rate": 2.483026105516212e-07, + "loss": 0.111, + "step": 10630 + }, + { + "epoch": 0.68, + "grad_norm": 0.7471693799119881, + "learning_rate": 2.4821338141548465e-07, + "loss": 0.1557, + "step": 10631 + }, + { + "epoch": 0.68, + "grad_norm": 5.471578282580022, + "learning_rate": 2.481241630210716e-07, + "loss": 0.2435, + "step": 10632 + }, + { + "epoch": 0.68, + "grad_norm": 0.48844098333069136, + "learning_rate": 2.4803495537218787e-07, + "loss": 0.0122, + "step": 10633 + }, + { + "epoch": 0.68, + "grad_norm": 0.9526328234776237, + "learning_rate": 2.4794575847263963e-07, + "loss": 0.2759, + "step": 10634 + }, + { + "epoch": 0.68, + "grad_norm": 1.0131510610706103, + "learning_rate": 2.478565723262321e-07, + "loss": 0.4307, + "step": 10635 + }, + { + "epoch": 0.68, + "grad_norm": 0.9358636516171898, + "learning_rate": 2.4776739693676977e-07, + "loss": 0.2347, + "step": 10636 + }, + { + "epoch": 0.68, + "grad_norm": 2.165739279034136, + "learning_rate": 2.476782323080574e-07, + "loss": 0.2701, + "step": 10637 + }, + { + "epoch": 0.68, + "grad_norm": 0.5396514702615218, + "learning_rate": 2.4758907844389905e-07, + "loss": 0.1629, + "step": 10638 + }, + { + "epoch": 0.68, + "grad_norm": 8.320681171798618, + "learning_rate": 2.474999353480978e-07, + "loss": 0.3476, + "step": 10639 + }, + { + "epoch": 0.68, + "grad_norm": 1.0111865266702131, + "learning_rate": 2.47410803024457e-07, + "loss": 0.2517, + "step": 10640 + }, + { + "epoch": 0.68, + "grad_norm": 1.0803292059452372, + "learning_rate": 2.473216814767792e-07, + "loss": 0.1364, + "step": 10641 + }, + { + "epoch": 0.68, + "grad_norm": 0.501200864153896, + "learning_rate": 2.472325707088663e-07, + "loss": 0.1743, + "step": 10642 + }, + { + "epoch": 0.68, + "grad_norm": 0.47594962004949454, + "learning_rate": 2.471434707245202e-07, + "loss": 0.2192, + "step": 10643 + }, + { + "epoch": 0.68, + "grad_norm": 0.720951558466603, + "learning_rate": 2.4705438152754174e-07, + "loss": 0.2886, + "step": 10644 + }, + { + "epoch": 0.68, + "grad_norm": 0.49906162606421556, + "learning_rate": 2.469653031217321e-07, + "loss": 0.0609, + "step": 10645 + }, + { + "epoch": 0.68, + "grad_norm": 0.5942776477775826, + "learning_rate": 2.4687623551089104e-07, + "loss": 0.1346, + "step": 10646 + }, + { + "epoch": 0.68, + "grad_norm": 1.1049866050200776, + "learning_rate": 2.4678717869881884e-07, + "loss": 0.214, + "step": 10647 + }, + { + "epoch": 0.68, + "grad_norm": 1.0522141300105585, + "learning_rate": 2.466981326893146e-07, + "loss": 0.0121, + "step": 10648 + }, + { + "epoch": 0.68, + "grad_norm": 9.595667859330842, + "learning_rate": 2.46609097486177e-07, + "loss": 0.1439, + "step": 10649 + }, + { + "epoch": 0.68, + "grad_norm": 0.6189656097202366, + "learning_rate": 2.465200730932049e-07, + "loss": 0.4181, + "step": 10650 + }, + { + "epoch": 0.68, + "grad_norm": 0.4350202001291532, + "learning_rate": 2.464310595141959e-07, + "loss": 0.3342, + "step": 10651 + }, + { + "epoch": 0.68, + "grad_norm": 1.9072968235831722, + "learning_rate": 2.4634205675294777e-07, + "loss": 0.5667, + "step": 10652 + }, + { + "epoch": 0.68, + "grad_norm": 2.433996284579106, + "learning_rate": 2.4625306481325727e-07, + "loss": 0.0617, + "step": 10653 + }, + { + "epoch": 0.68, + "grad_norm": 0.8957853909505935, + "learning_rate": 2.4616408369892126e-07, + "loss": 0.2836, + "step": 10654 + }, + { + "epoch": 0.68, + "grad_norm": 1.8956053562597286, + "learning_rate": 2.4607511341373557e-07, + "loss": 0.1735, + "step": 10655 + }, + { + "epoch": 0.68, + "grad_norm": 0.9634464743769201, + "learning_rate": 2.4598615396149597e-07, + "loss": 0.4158, + "step": 10656 + }, + { + "epoch": 0.68, + "grad_norm": 3.2599236002697625, + "learning_rate": 2.458972053459977e-07, + "loss": 0.1456, + "step": 10657 + }, + { + "epoch": 0.68, + "grad_norm": 0.4171246802869317, + "learning_rate": 2.4580826757103564e-07, + "loss": 0.162, + "step": 10658 + }, + { + "epoch": 0.68, + "grad_norm": 4.632731797597872, + "learning_rate": 2.457193406404036e-07, + "loss": 0.1327, + "step": 10659 + }, + { + "epoch": 0.68, + "grad_norm": 1.463570315065967, + "learning_rate": 2.4563042455789593e-07, + "loss": 0.2327, + "step": 10660 + }, + { + "epoch": 0.68, + "grad_norm": 0.7475601521375592, + "learning_rate": 2.455415193273055e-07, + "loss": 0.3889, + "step": 10661 + }, + { + "epoch": 0.68, + "grad_norm": 0.9511196635599971, + "learning_rate": 2.4545262495242556e-07, + "loss": 0.2351, + "step": 10662 + }, + { + "epoch": 0.68, + "grad_norm": 7.354419344303781, + "learning_rate": 2.4536374143704834e-07, + "loss": 0.1062, + "step": 10663 + }, + { + "epoch": 0.68, + "grad_norm": 2.7422148530732846, + "learning_rate": 2.452748687849656e-07, + "loss": 0.239, + "step": 10664 + }, + { + "epoch": 0.68, + "grad_norm": 0.9694439957071486, + "learning_rate": 2.4518600699996925e-07, + "loss": 0.2528, + "step": 10665 + }, + { + "epoch": 0.68, + "grad_norm": 0.38941484134385324, + "learning_rate": 2.4509715608584986e-07, + "loss": 0.0716, + "step": 10666 + }, + { + "epoch": 0.68, + "grad_norm": 0.53835049220775, + "learning_rate": 2.4500831604639843e-07, + "loss": 0.2671, + "step": 10667 + }, + { + "epoch": 0.68, + "grad_norm": 1.4995410768084054, + "learning_rate": 2.449194868854046e-07, + "loss": 0.0492, + "step": 10668 + }, + { + "epoch": 0.68, + "grad_norm": 0.688961576178236, + "learning_rate": 2.448306686066585e-07, + "loss": 0.3869, + "step": 10669 + }, + { + "epoch": 0.68, + "grad_norm": 0.7985312278403478, + "learning_rate": 2.4474186121394875e-07, + "loss": 0.3686, + "step": 10670 + }, + { + "epoch": 0.68, + "grad_norm": 1.116955759566959, + "learning_rate": 2.446530647110646e-07, + "loss": 0.3511, + "step": 10671 + }, + { + "epoch": 0.68, + "grad_norm": 1.3933397101762621, + "learning_rate": 2.4456427910179374e-07, + "loss": 0.1386, + "step": 10672 + }, + { + "epoch": 0.68, + "grad_norm": 0.24738398884795554, + "learning_rate": 2.444755043899243e-07, + "loss": 0.1014, + "step": 10673 + }, + { + "epoch": 0.68, + "grad_norm": 0.5006446506991662, + "learning_rate": 2.4438674057924365e-07, + "loss": 0.2221, + "step": 10674 + }, + { + "epoch": 0.68, + "grad_norm": 0.689609795630209, + "learning_rate": 2.442979876735383e-07, + "loss": 0.2426, + "step": 10675 + }, + { + "epoch": 0.68, + "grad_norm": 1.3975373833477966, + "learning_rate": 2.4420924567659504e-07, + "loss": 0.2743, + "step": 10676 + }, + { + "epoch": 0.68, + "grad_norm": 0.592271887308199, + "learning_rate": 2.441205145921994e-07, + "loss": 0.1441, + "step": 10677 + }, + { + "epoch": 0.68, + "grad_norm": 0.36363260404378417, + "learning_rate": 2.440317944241372e-07, + "loss": 0.0879, + "step": 10678 + }, + { + "epoch": 0.68, + "grad_norm": 0.8731244608591947, + "learning_rate": 2.4394308517619293e-07, + "loss": 0.2039, + "step": 10679 + }, + { + "epoch": 0.68, + "grad_norm": 3.1850744507161584, + "learning_rate": 2.4385438685215165e-07, + "loss": 0.0939, + "step": 10680 + }, + { + "epoch": 0.68, + "grad_norm": 2.922497198506668, + "learning_rate": 2.437656994557969e-07, + "loss": 0.2345, + "step": 10681 + }, + { + "epoch": 0.68, + "grad_norm": 0.9969140916400155, + "learning_rate": 2.4367702299091274e-07, + "loss": 0.5095, + "step": 10682 + }, + { + "epoch": 0.68, + "grad_norm": 0.6505041781957592, + "learning_rate": 2.4358835746128186e-07, + "loss": 0.2, + "step": 10683 + }, + { + "epoch": 0.68, + "grad_norm": 0.5692669734993312, + "learning_rate": 2.4349970287068725e-07, + "loss": 0.1239, + "step": 10684 + }, + { + "epoch": 0.68, + "grad_norm": 2.4186398298078355, + "learning_rate": 2.4341105922291076e-07, + "loss": 0.1111, + "step": 10685 + }, + { + "epoch": 0.68, + "grad_norm": 0.5287317822196954, + "learning_rate": 2.4332242652173455e-07, + "loss": 0.1459, + "step": 10686 + }, + { + "epoch": 0.68, + "grad_norm": 1.2826770114363877, + "learning_rate": 2.4323380477093934e-07, + "loss": 0.5146, + "step": 10687 + }, + { + "epoch": 0.68, + "grad_norm": 0.4946248138867597, + "learning_rate": 2.4314519397430646e-07, + "loss": 0.0826, + "step": 10688 + }, + { + "epoch": 0.68, + "grad_norm": 1.4667660044199442, + "learning_rate": 2.430565941356157e-07, + "loss": 0.3147, + "step": 10689 + }, + { + "epoch": 0.68, + "grad_norm": 1.3180618724213369, + "learning_rate": 2.4296800525864736e-07, + "loss": 0.2997, + "step": 10690 + }, + { + "epoch": 0.68, + "grad_norm": 7.671803698899248, + "learning_rate": 2.4287942734718044e-07, + "loss": 0.2124, + "step": 10691 + }, + { + "epoch": 0.68, + "grad_norm": 0.36901619813841613, + "learning_rate": 2.4279086040499394e-07, + "loss": 0.1636, + "step": 10692 + }, + { + "epoch": 0.68, + "grad_norm": 8.125111527344528, + "learning_rate": 2.427023044358667e-07, + "loss": 0.195, + "step": 10693 + }, + { + "epoch": 0.68, + "grad_norm": 0.5461668614907347, + "learning_rate": 2.4261375944357607e-07, + "loss": 0.3006, + "step": 10694 + }, + { + "epoch": 0.68, + "grad_norm": 1.36798634203906, + "learning_rate": 2.4252522543190017e-07, + "loss": 0.1184, + "step": 10695 + }, + { + "epoch": 0.68, + "grad_norm": 0.9025822251560608, + "learning_rate": 2.4243670240461556e-07, + "loss": 0.2122, + "step": 10696 + }, + { + "epoch": 0.68, + "grad_norm": 1.2385314046509113, + "learning_rate": 2.4234819036549916e-07, + "loss": 0.3901, + "step": 10697 + }, + { + "epoch": 0.68, + "grad_norm": 0.509657875785536, + "learning_rate": 2.422596893183266e-07, + "loss": 0.2317, + "step": 10698 + }, + { + "epoch": 0.68, + "grad_norm": 1.2800454280133113, + "learning_rate": 2.421711992668741e-07, + "loss": 0.3699, + "step": 10699 + }, + { + "epoch": 0.68, + "grad_norm": 0.9437888866957841, + "learning_rate": 2.420827202149164e-07, + "loss": 0.1995, + "step": 10700 + }, + { + "epoch": 0.68, + "grad_norm": 4.7303262059632845, + "learning_rate": 2.419942521662285e-07, + "loss": 0.1273, + "step": 10701 + }, + { + "epoch": 0.68, + "grad_norm": 3.096351887508121, + "learning_rate": 2.419057951245842e-07, + "loss": 0.1125, + "step": 10702 + }, + { + "epoch": 0.68, + "grad_norm": 7.0985929843028215, + "learning_rate": 2.418173490937578e-07, + "loss": 0.0228, + "step": 10703 + }, + { + "epoch": 0.68, + "grad_norm": 0.3791057710799882, + "learning_rate": 2.417289140775222e-07, + "loss": 0.13, + "step": 10704 + }, + { + "epoch": 0.68, + "grad_norm": 0.587168419506907, + "learning_rate": 2.4164049007965023e-07, + "loss": 0.2207, + "step": 10705 + }, + { + "epoch": 0.68, + "grad_norm": 0.6911265857887287, + "learning_rate": 2.415520771039144e-07, + "loss": 0.4728, + "step": 10706 + }, + { + "epoch": 0.68, + "grad_norm": 1.4532718760205119, + "learning_rate": 2.414636751540864e-07, + "loss": 0.1215, + "step": 10707 + }, + { + "epoch": 0.68, + "grad_norm": 1.0060944484966456, + "learning_rate": 2.413752842339377e-07, + "loss": 0.2105, + "step": 10708 + }, + { + "epoch": 0.68, + "grad_norm": 0.9268099357017542, + "learning_rate": 2.4128690434723947e-07, + "loss": 0.2565, + "step": 10709 + }, + { + "epoch": 0.68, + "grad_norm": 0.2520195283701715, + "learning_rate": 2.411985354977618e-07, + "loss": 0.2335, + "step": 10710 + }, + { + "epoch": 0.68, + "grad_norm": 3.0977907578748356, + "learning_rate": 2.411101776892749e-07, + "loss": 0.0649, + "step": 10711 + }, + { + "epoch": 0.68, + "grad_norm": 6.7218064755685125, + "learning_rate": 2.410218309255484e-07, + "loss": 0.0321, + "step": 10712 + }, + { + "epoch": 0.68, + "grad_norm": 0.7625233089280851, + "learning_rate": 2.40933495210351e-07, + "loss": 0.1131, + "step": 10713 + }, + { + "epoch": 0.68, + "grad_norm": 1.0855095888509445, + "learning_rate": 2.408451705474517e-07, + "loss": 0.114, + "step": 10714 + }, + { + "epoch": 0.68, + "grad_norm": 2.1842261955563496, + "learning_rate": 2.407568569406182e-07, + "loss": 0.0506, + "step": 10715 + }, + { + "epoch": 0.68, + "grad_norm": 0.38620217678899577, + "learning_rate": 2.406685543936185e-07, + "loss": 0.0084, + "step": 10716 + }, + { + "epoch": 0.68, + "grad_norm": 0.5924790759334992, + "learning_rate": 2.405802629102196e-07, + "loss": 0.0855, + "step": 10717 + }, + { + "epoch": 0.68, + "grad_norm": 0.5337281680297228, + "learning_rate": 2.4049198249418803e-07, + "loss": 0.2453, + "step": 10718 + }, + { + "epoch": 0.68, + "grad_norm": 0.5770910264205996, + "learning_rate": 2.404037131492903e-07, + "loss": 0.1786, + "step": 10719 + }, + { + "epoch": 0.68, + "grad_norm": 0.8558103821620981, + "learning_rate": 2.403154548792918e-07, + "loss": 0.172, + "step": 10720 + }, + { + "epoch": 0.68, + "grad_norm": 0.8557960799844515, + "learning_rate": 2.402272076879583e-07, + "loss": 0.1622, + "step": 10721 + }, + { + "epoch": 0.68, + "grad_norm": 2.80220731283959, + "learning_rate": 2.401389715790541e-07, + "loss": 0.2381, + "step": 10722 + }, + { + "epoch": 0.68, + "grad_norm": 1.076657403162898, + "learning_rate": 2.4005074655634393e-07, + "loss": 0.3262, + "step": 10723 + }, + { + "epoch": 0.68, + "grad_norm": 0.3533423049794706, + "learning_rate": 2.3996253262359133e-07, + "loss": 0.2099, + "step": 10724 + }, + { + "epoch": 0.68, + "grad_norm": 0.5668099182172478, + "learning_rate": 2.3987432978455995e-07, + "loss": 0.1811, + "step": 10725 + }, + { + "epoch": 0.68, + "grad_norm": 0.5371098300354171, + "learning_rate": 2.3978613804301246e-07, + "loss": 0.4094, + "step": 10726 + }, + { + "epoch": 0.68, + "grad_norm": 1.838505960343988, + "learning_rate": 2.3969795740271145e-07, + "loss": 0.2311, + "step": 10727 + }, + { + "epoch": 0.68, + "grad_norm": 1.667570296835776, + "learning_rate": 2.3960978786741874e-07, + "loss": 0.1847, + "step": 10728 + }, + { + "epoch": 0.68, + "grad_norm": 0.5828788587723586, + "learning_rate": 2.3952162944089616e-07, + "loss": 0.1691, + "step": 10729 + }, + { + "epoch": 0.68, + "grad_norm": 0.9476949999625318, + "learning_rate": 2.3943348212690433e-07, + "loss": 0.3167, + "step": 10730 + }, + { + "epoch": 0.68, + "grad_norm": 0.8783290069773235, + "learning_rate": 2.3934534592920413e-07, + "loss": 0.1312, + "step": 10731 + }, + { + "epoch": 0.68, + "grad_norm": 2.081775542810553, + "learning_rate": 2.392572208515555e-07, + "loss": 0.397, + "step": 10732 + }, + { + "epoch": 0.68, + "grad_norm": 0.5762764388574344, + "learning_rate": 2.3916910689771773e-07, + "loss": 0.0017, + "step": 10733 + }, + { + "epoch": 0.68, + "grad_norm": 0.7355044397297507, + "learning_rate": 2.390810040714504e-07, + "loss": 0.2677, + "step": 10734 + }, + { + "epoch": 0.68, + "grad_norm": 0.5964938536065715, + "learning_rate": 2.389929123765118e-07, + "loss": 0.0749, + "step": 10735 + }, + { + "epoch": 0.68, + "grad_norm": 0.7905799500317098, + "learning_rate": 2.3890483181666036e-07, + "loss": 0.1636, + "step": 10736 + }, + { + "epoch": 0.68, + "grad_norm": 0.36713854956127034, + "learning_rate": 2.3881676239565356e-07, + "loss": 0.0206, + "step": 10737 + }, + { + "epoch": 0.68, + "grad_norm": 7.347921022010893, + "learning_rate": 2.3872870411724887e-07, + "loss": 0.1419, + "step": 10738 + }, + { + "epoch": 0.68, + "grad_norm": 0.6525773147906483, + "learning_rate": 2.386406569852027e-07, + "loss": 0.4326, + "step": 10739 + }, + { + "epoch": 0.68, + "grad_norm": 0.5300134958343342, + "learning_rate": 2.3855262100327167e-07, + "loss": 0.1283, + "step": 10740 + }, + { + "epoch": 0.68, + "grad_norm": 2.8747068130142694, + "learning_rate": 2.3846459617521123e-07, + "loss": 0.0996, + "step": 10741 + }, + { + "epoch": 0.69, + "grad_norm": 0.8111566195875461, + "learning_rate": 2.3837658250477704e-07, + "loss": 0.2887, + "step": 10742 + }, + { + "epoch": 0.69, + "grad_norm": 0.6136206640377274, + "learning_rate": 2.382885799957236e-07, + "loss": 0.2516, + "step": 10743 + }, + { + "epoch": 0.69, + "grad_norm": 0.35564433647555616, + "learning_rate": 2.3820058865180548e-07, + "loss": 0.1259, + "step": 10744 + }, + { + "epoch": 0.69, + "grad_norm": 5.30969938791369, + "learning_rate": 2.3811260847677668e-07, + "loss": 0.0474, + "step": 10745 + }, + { + "epoch": 0.69, + "grad_norm": 0.982035371250566, + "learning_rate": 2.3802463947439028e-07, + "loss": 0.2785, + "step": 10746 + }, + { + "epoch": 0.69, + "grad_norm": 0.5894925748322754, + "learning_rate": 2.3793668164839954e-07, + "loss": 0.3994, + "step": 10747 + }, + { + "epoch": 0.69, + "grad_norm": 0.8366667867151247, + "learning_rate": 2.3784873500255666e-07, + "loss": 0.0728, + "step": 10748 + }, + { + "epoch": 0.69, + "grad_norm": 0.4545383376816283, + "learning_rate": 2.3776079954061385e-07, + "loss": 0.2984, + "step": 10749 + }, + { + "epoch": 0.69, + "grad_norm": 1.0425947107482705, + "learning_rate": 2.3767287526632234e-07, + "loss": 0.4034, + "step": 10750 + }, + { + "epoch": 0.69, + "grad_norm": 1.0457992927079485, + "learning_rate": 2.3758496218343355e-07, + "loss": 0.0778, + "step": 10751 + }, + { + "epoch": 0.69, + "grad_norm": 0.5266467923509143, + "learning_rate": 2.3749706029569754e-07, + "loss": 0.2213, + "step": 10752 + }, + { + "epoch": 0.69, + "grad_norm": 0.39514589901137304, + "learning_rate": 2.3740916960686486e-07, + "loss": 0.0478, + "step": 10753 + }, + { + "epoch": 0.69, + "grad_norm": 0.7346820868868575, + "learning_rate": 2.373212901206847e-07, + "loss": 0.1854, + "step": 10754 + }, + { + "epoch": 0.69, + "grad_norm": 1.701917774586411, + "learning_rate": 2.372334218409065e-07, + "loss": 0.0562, + "step": 10755 + }, + { + "epoch": 0.69, + "grad_norm": 2.841902311073271, + "learning_rate": 2.3714556477127861e-07, + "loss": 0.3454, + "step": 10756 + }, + { + "epoch": 0.69, + "grad_norm": 0.9812225346183727, + "learning_rate": 2.3705771891554948e-07, + "loss": 0.2398, + "step": 10757 + }, + { + "epoch": 0.69, + "grad_norm": 0.5400856289738735, + "learning_rate": 2.3696988427746667e-07, + "loss": 0.0455, + "step": 10758 + }, + { + "epoch": 0.69, + "grad_norm": 1.0962111199743132, + "learning_rate": 2.3688206086077716e-07, + "loss": 0.2525, + "step": 10759 + }, + { + "epoch": 0.69, + "grad_norm": 0.5658892060409247, + "learning_rate": 2.3679424866922805e-07, + "loss": 0.2609, + "step": 10760 + }, + { + "epoch": 0.69, + "grad_norm": 0.8507019580896611, + "learning_rate": 2.3670644770656517e-07, + "loss": 0.417, + "step": 10761 + }, + { + "epoch": 0.69, + "grad_norm": 6.051728849599386, + "learning_rate": 2.3661865797653458e-07, + "loss": 0.2295, + "step": 10762 + }, + { + "epoch": 0.69, + "grad_norm": 0.3231275283644623, + "learning_rate": 2.3653087948288147e-07, + "loss": 0.1155, + "step": 10763 + }, + { + "epoch": 0.69, + "grad_norm": 0.17446752350544095, + "learning_rate": 2.3644311222935088e-07, + "loss": 0.0846, + "step": 10764 + }, + { + "epoch": 0.69, + "grad_norm": 0.2839051739428037, + "learning_rate": 2.3635535621968678e-07, + "loss": 0.1953, + "step": 10765 + }, + { + "epoch": 0.69, + "grad_norm": 0.34255789800130965, + "learning_rate": 2.362676114576333e-07, + "loss": 0.1236, + "step": 10766 + }, + { + "epoch": 0.69, + "grad_norm": 0.9428923488254283, + "learning_rate": 2.3617987794693357e-07, + "loss": 0.0381, + "step": 10767 + }, + { + "epoch": 0.69, + "grad_norm": 1.0815575565411712, + "learning_rate": 2.3609215569133074e-07, + "loss": 0.2537, + "step": 10768 + }, + { + "epoch": 0.69, + "grad_norm": 1.2725050593944605, + "learning_rate": 2.3600444469456688e-07, + "loss": 0.0969, + "step": 10769 + }, + { + "epoch": 0.69, + "grad_norm": 4.320496287192785, + "learning_rate": 2.3591674496038428e-07, + "loss": 0.0124, + "step": 10770 + }, + { + "epoch": 0.69, + "grad_norm": 0.3960283998846838, + "learning_rate": 2.3582905649252404e-07, + "loss": 0.3139, + "step": 10771 + }, + { + "epoch": 0.69, + "grad_norm": 0.9080930515887877, + "learning_rate": 2.357413792947275e-07, + "loss": 0.3573, + "step": 10772 + }, + { + "epoch": 0.69, + "grad_norm": 1.022924753631725, + "learning_rate": 2.3565371337073493e-07, + "loss": 0.2909, + "step": 10773 + }, + { + "epoch": 0.69, + "grad_norm": 8.192332589148641, + "learning_rate": 2.355660587242862e-07, + "loss": 0.3369, + "step": 10774 + }, + { + "epoch": 0.69, + "grad_norm": 1.5991252494796782, + "learning_rate": 2.3547841535912112e-07, + "loss": 0.3739, + "step": 10775 + }, + { + "epoch": 0.69, + "grad_norm": 1.3910590544628323, + "learning_rate": 2.3539078327897843e-07, + "loss": 0.1886, + "step": 10776 + }, + { + "epoch": 0.69, + "grad_norm": 0.5203742713285756, + "learning_rate": 2.35303162487597e-07, + "loss": 0.1422, + "step": 10777 + }, + { + "epoch": 0.69, + "grad_norm": 10.009354655764911, + "learning_rate": 2.3521555298871466e-07, + "loss": 0.0497, + "step": 10778 + }, + { + "epoch": 0.69, + "grad_norm": 0.4855409972449557, + "learning_rate": 2.3512795478606905e-07, + "loss": 0.3071, + "step": 10779 + }, + { + "epoch": 0.69, + "grad_norm": 0.6035946861246638, + "learning_rate": 2.350403678833976e-07, + "loss": 0.2212, + "step": 10780 + }, + { + "epoch": 0.69, + "grad_norm": 0.43697590959260857, + "learning_rate": 2.3495279228443643e-07, + "loss": 0.0041, + "step": 10781 + }, + { + "epoch": 0.69, + "grad_norm": 0.761721033718258, + "learning_rate": 2.3486522799292202e-07, + "loss": 0.3627, + "step": 10782 + }, + { + "epoch": 0.69, + "grad_norm": 0.8386367542748692, + "learning_rate": 2.347776750125901e-07, + "loss": 0.2445, + "step": 10783 + }, + { + "epoch": 0.69, + "grad_norm": 0.670499795764103, + "learning_rate": 2.346901333471756e-07, + "loss": 0.187, + "step": 10784 + }, + { + "epoch": 0.69, + "grad_norm": 0.6148808205684445, + "learning_rate": 2.346026030004135e-07, + "loss": 0.0973, + "step": 10785 + }, + { + "epoch": 0.69, + "grad_norm": 0.4434147233251777, + "learning_rate": 2.3451508397603785e-07, + "loss": 0.1887, + "step": 10786 + }, + { + "epoch": 0.69, + "grad_norm": 1.134159405329597, + "learning_rate": 2.3442757627778227e-07, + "loss": 0.2923, + "step": 10787 + }, + { + "epoch": 0.69, + "grad_norm": 0.9278861527308889, + "learning_rate": 2.343400799093803e-07, + "loss": 0.1494, + "step": 10788 + }, + { + "epoch": 0.69, + "grad_norm": 0.6902228714397468, + "learning_rate": 2.3425259487456439e-07, + "loss": 0.1855, + "step": 10789 + }, + { + "epoch": 0.69, + "grad_norm": 0.7393512624553055, + "learning_rate": 2.341651211770672e-07, + "loss": 0.0571, + "step": 10790 + }, + { + "epoch": 0.69, + "grad_norm": 1.2855576620004634, + "learning_rate": 2.340776588206202e-07, + "loss": 0.3084, + "step": 10791 + }, + { + "epoch": 0.69, + "grad_norm": 1.208128575753836, + "learning_rate": 2.3399020780895495e-07, + "loss": 0.2589, + "step": 10792 + }, + { + "epoch": 0.69, + "grad_norm": 1.068699523543247, + "learning_rate": 2.33902768145802e-07, + "loss": 0.1964, + "step": 10793 + }, + { + "epoch": 0.69, + "grad_norm": 1.622485600677907, + "learning_rate": 2.338153398348921e-07, + "loss": 0.1287, + "step": 10794 + }, + { + "epoch": 0.69, + "grad_norm": 0.8692939431777988, + "learning_rate": 2.3372792287995473e-07, + "loss": 0.2133, + "step": 10795 + }, + { + "epoch": 0.69, + "grad_norm": 1.51608069027204, + "learning_rate": 2.336405172847196e-07, + "loss": 0.4074, + "step": 10796 + }, + { + "epoch": 0.69, + "grad_norm": 1.036266952579445, + "learning_rate": 2.3355312305291526e-07, + "loss": 0.2128, + "step": 10797 + }, + { + "epoch": 0.69, + "grad_norm": 0.727967646911707, + "learning_rate": 2.3346574018827037e-07, + "loss": 0.3428, + "step": 10798 + }, + { + "epoch": 0.69, + "grad_norm": 1.3616547796227862, + "learning_rate": 2.3337836869451294e-07, + "loss": 0.2615, + "step": 10799 + }, + { + "epoch": 0.69, + "grad_norm": 1.447780603213033, + "learning_rate": 2.332910085753701e-07, + "loss": 0.1512, + "step": 10800 + }, + { + "epoch": 0.69, + "grad_norm": 0.9427087413710671, + "learning_rate": 2.3320365983456918e-07, + "loss": 0.267, + "step": 10801 + }, + { + "epoch": 0.69, + "grad_norm": 9.508688698731216, + "learning_rate": 2.3311632247583623e-07, + "loss": 0.3934, + "step": 10802 + }, + { + "epoch": 0.69, + "grad_norm": 0.9400835084229531, + "learning_rate": 2.330289965028977e-07, + "loss": 0.3936, + "step": 10803 + }, + { + "epoch": 0.69, + "grad_norm": 0.14746324843519315, + "learning_rate": 2.3294168191947867e-07, + "loss": 0.0015, + "step": 10804 + }, + { + "epoch": 0.69, + "grad_norm": 1.1419398616777385, + "learning_rate": 2.3285437872930452e-07, + "loss": 0.1238, + "step": 10805 + }, + { + "epoch": 0.69, + "grad_norm": 0.7147358235020845, + "learning_rate": 2.3276708693609942e-07, + "loss": 0.3919, + "step": 10806 + }, + { + "epoch": 0.69, + "grad_norm": 0.5466781663745297, + "learning_rate": 2.3267980654358782e-07, + "loss": 0.1859, + "step": 10807 + }, + { + "epoch": 0.69, + "grad_norm": 2.135015677342683, + "learning_rate": 2.325925375554928e-07, + "loss": 0.2184, + "step": 10808 + }, + { + "epoch": 0.69, + "grad_norm": 0.826877852968951, + "learning_rate": 2.3250527997553794e-07, + "loss": 0.2951, + "step": 10809 + }, + { + "epoch": 0.69, + "grad_norm": 0.5517360802098473, + "learning_rate": 2.324180338074453e-07, + "loss": 0.0864, + "step": 10810 + }, + { + "epoch": 0.69, + "grad_norm": 1.1360274930485523, + "learning_rate": 2.3233079905493747e-07, + "loss": 0.2908, + "step": 10811 + }, + { + "epoch": 0.69, + "grad_norm": 0.7841358997135572, + "learning_rate": 2.322435757217357e-07, + "loss": 0.0843, + "step": 10812 + }, + { + "epoch": 0.69, + "grad_norm": 0.419906399602515, + "learning_rate": 2.3215636381156135e-07, + "loss": 0.0172, + "step": 10813 + }, + { + "epoch": 0.69, + "grad_norm": 1.0568777606934483, + "learning_rate": 2.3206916332813481e-07, + "loss": 0.2334, + "step": 10814 + }, + { + "epoch": 0.69, + "grad_norm": 0.2047534685914261, + "learning_rate": 2.3198197427517658e-07, + "loss": 0.0967, + "step": 10815 + }, + { + "epoch": 0.69, + "grad_norm": 0.7356720773304193, + "learning_rate": 2.3189479665640588e-07, + "loss": 0.1231, + "step": 10816 + }, + { + "epoch": 0.69, + "grad_norm": 0.9034519010183022, + "learning_rate": 2.3180763047554213e-07, + "loss": 0.1016, + "step": 10817 + }, + { + "epoch": 0.69, + "grad_norm": 6.186849603714974, + "learning_rate": 2.3172047573630416e-07, + "loss": 0.1949, + "step": 10818 + }, + { + "epoch": 0.69, + "grad_norm": 0.9214053839934992, + "learning_rate": 2.3163333244240984e-07, + "loss": 0.1373, + "step": 10819 + }, + { + "epoch": 0.69, + "grad_norm": 0.678931933551839, + "learning_rate": 2.3154620059757723e-07, + "loss": 0.0885, + "step": 10820 + }, + { + "epoch": 0.69, + "grad_norm": 1.9878662707130919, + "learning_rate": 2.3145908020552318e-07, + "loss": 0.3005, + "step": 10821 + }, + { + "epoch": 0.69, + "grad_norm": 0.47761683792383697, + "learning_rate": 2.3137197126996482e-07, + "loss": 0.2039, + "step": 10822 + }, + { + "epoch": 0.69, + "grad_norm": 0.3679451424814524, + "learning_rate": 2.3128487379461798e-07, + "loss": 0.1906, + "step": 10823 + }, + { + "epoch": 0.69, + "grad_norm": 1.0166061374711701, + "learning_rate": 2.3119778778319877e-07, + "loss": 0.2979, + "step": 10824 + }, + { + "epoch": 0.69, + "grad_norm": 1.216325736730003, + "learning_rate": 2.311107132394221e-07, + "loss": 0.29, + "step": 10825 + }, + { + "epoch": 0.69, + "grad_norm": 2.482344311760232, + "learning_rate": 2.3102365016700315e-07, + "loss": 0.1998, + "step": 10826 + }, + { + "epoch": 0.69, + "grad_norm": 0.8380530062664374, + "learning_rate": 2.3093659856965596e-07, + "loss": 0.0783, + "step": 10827 + }, + { + "epoch": 0.69, + "grad_norm": 0.6588364842038741, + "learning_rate": 2.3084955845109416e-07, + "loss": 0.1647, + "step": 10828 + }, + { + "epoch": 0.69, + "grad_norm": 2.2935703272833226, + "learning_rate": 2.3076252981503154e-07, + "loss": 0.0233, + "step": 10829 + }, + { + "epoch": 0.69, + "grad_norm": 0.47247374926229946, + "learning_rate": 2.3067551266518037e-07, + "loss": 0.1596, + "step": 10830 + }, + { + "epoch": 0.69, + "grad_norm": 0.9566299353721107, + "learning_rate": 2.305885070052534e-07, + "loss": 0.532, + "step": 10831 + }, + { + "epoch": 0.69, + "grad_norm": 0.8751567444118277, + "learning_rate": 2.305015128389622e-07, + "loss": 0.1801, + "step": 10832 + }, + { + "epoch": 0.69, + "grad_norm": 0.5742905626408723, + "learning_rate": 2.3041453017001815e-07, + "loss": 0.0756, + "step": 10833 + }, + { + "epoch": 0.69, + "grad_norm": 0.38439957698341465, + "learning_rate": 2.3032755900213223e-07, + "loss": 0.2279, + "step": 10834 + }, + { + "epoch": 0.69, + "grad_norm": 0.18531328291539825, + "learning_rate": 2.302405993390149e-07, + "loss": 0.1327, + "step": 10835 + }, + { + "epoch": 0.69, + "grad_norm": 12.60385236376719, + "learning_rate": 2.3015365118437574e-07, + "loss": 0.2272, + "step": 10836 + }, + { + "epoch": 0.69, + "grad_norm": 0.6486143980048088, + "learning_rate": 2.3006671454192444e-07, + "loss": 0.2544, + "step": 10837 + }, + { + "epoch": 0.69, + "grad_norm": 1.6628286454833892, + "learning_rate": 2.2997978941536955e-07, + "loss": 0.4387, + "step": 10838 + }, + { + "epoch": 0.69, + "grad_norm": 0.450387559009505, + "learning_rate": 2.2989287580841981e-07, + "loss": 0.1608, + "step": 10839 + }, + { + "epoch": 0.69, + "grad_norm": 2.560444908888854, + "learning_rate": 2.2980597372478282e-07, + "loss": 0.1453, + "step": 10840 + }, + { + "epoch": 0.69, + "grad_norm": 0.7652646336107973, + "learning_rate": 2.2971908316816635e-07, + "loss": 0.4461, + "step": 10841 + }, + { + "epoch": 0.69, + "grad_norm": 9.641306707553857, + "learning_rate": 2.2963220414227708e-07, + "loss": 0.2179, + "step": 10842 + }, + { + "epoch": 0.69, + "grad_norm": 1.1646865114753244, + "learning_rate": 2.2954533665082132e-07, + "loss": 0.1115, + "step": 10843 + }, + { + "epoch": 0.69, + "grad_norm": 0.5558736048088738, + "learning_rate": 2.2945848069750538e-07, + "loss": 0.0706, + "step": 10844 + }, + { + "epoch": 0.69, + "grad_norm": 1.0829731712122816, + "learning_rate": 2.2937163628603434e-07, + "loss": 0.1866, + "step": 10845 + }, + { + "epoch": 0.69, + "grad_norm": 0.7158053352819196, + "learning_rate": 2.2928480342011347e-07, + "loss": 0.4227, + "step": 10846 + }, + { + "epoch": 0.69, + "grad_norm": 0.5870431288921848, + "learning_rate": 2.2919798210344694e-07, + "loss": 0.1103, + "step": 10847 + }, + { + "epoch": 0.69, + "grad_norm": 0.518641319599222, + "learning_rate": 2.2911117233973905e-07, + "loss": 0.1914, + "step": 10848 + }, + { + "epoch": 0.69, + "grad_norm": 23.7094736476411, + "learning_rate": 2.2902437413269293e-07, + "loss": 0.0784, + "step": 10849 + }, + { + "epoch": 0.69, + "grad_norm": 1.037859043760364, + "learning_rate": 2.2893758748601172e-07, + "loss": 0.2207, + "step": 10850 + }, + { + "epoch": 0.69, + "grad_norm": 1.5423624933920133, + "learning_rate": 2.2885081240339809e-07, + "loss": 0.3081, + "step": 10851 + }, + { + "epoch": 0.69, + "grad_norm": 0.5829197775705188, + "learning_rate": 2.2876404888855372e-07, + "loss": 0.1074, + "step": 10852 + }, + { + "epoch": 0.69, + "grad_norm": 11.54594707465134, + "learning_rate": 2.286772969451803e-07, + "loss": 0.1657, + "step": 10853 + }, + { + "epoch": 0.69, + "grad_norm": 0.7077938283153192, + "learning_rate": 2.28590556576979e-07, + "loss": 0.2317, + "step": 10854 + }, + { + "epoch": 0.69, + "grad_norm": 0.8975389192393, + "learning_rate": 2.2850382778765014e-07, + "loss": 0.227, + "step": 10855 + }, + { + "epoch": 0.69, + "grad_norm": 0.35601280740426633, + "learning_rate": 2.2841711058089358e-07, + "loss": 0.2147, + "step": 10856 + }, + { + "epoch": 0.69, + "grad_norm": 0.9712915419698354, + "learning_rate": 2.2833040496040922e-07, + "loss": 0.1006, + "step": 10857 + }, + { + "epoch": 0.69, + "grad_norm": 0.9216084415325392, + "learning_rate": 2.2824371092989574e-07, + "loss": 0.1229, + "step": 10858 + }, + { + "epoch": 0.69, + "grad_norm": 0.6577588361517682, + "learning_rate": 2.2815702849305202e-07, + "loss": 0.2169, + "step": 10859 + }, + { + "epoch": 0.69, + "grad_norm": 0.28307925170456116, + "learning_rate": 2.2807035765357573e-07, + "loss": 0.105, + "step": 10860 + }, + { + "epoch": 0.69, + "grad_norm": 4.071701063090803, + "learning_rate": 2.2798369841516484e-07, + "loss": 0.0923, + "step": 10861 + }, + { + "epoch": 0.69, + "grad_norm": 1.607538185220634, + "learning_rate": 2.27897050781516e-07, + "loss": 0.4002, + "step": 10862 + }, + { + "epoch": 0.69, + "grad_norm": 2.5820780420207257, + "learning_rate": 2.2781041475632617e-07, + "loss": 0.1472, + "step": 10863 + }, + { + "epoch": 0.69, + "grad_norm": 0.6536469069476663, + "learning_rate": 2.2772379034329103e-07, + "loss": 0.1771, + "step": 10864 + }, + { + "epoch": 0.69, + "grad_norm": 0.8923911434698747, + "learning_rate": 2.2763717754610652e-07, + "loss": 0.2508, + "step": 10865 + }, + { + "epoch": 0.69, + "grad_norm": 6.736036544110356, + "learning_rate": 2.2755057636846737e-07, + "loss": 0.0673, + "step": 10866 + }, + { + "epoch": 0.69, + "grad_norm": 5.13211128099521, + "learning_rate": 2.2746398681406847e-07, + "loss": 0.2356, + "step": 10867 + }, + { + "epoch": 0.69, + "grad_norm": 0.47241730232045226, + "learning_rate": 2.2737740888660356e-07, + "loss": 0.1102, + "step": 10868 + }, + { + "epoch": 0.69, + "grad_norm": 1.4254458946121775, + "learning_rate": 2.272908425897665e-07, + "loss": 0.2835, + "step": 10869 + }, + { + "epoch": 0.69, + "grad_norm": 0.7895794471402303, + "learning_rate": 2.2720428792725044e-07, + "loss": 0.2342, + "step": 10870 + }, + { + "epoch": 0.69, + "grad_norm": 0.6911187701869503, + "learning_rate": 2.2711774490274766e-07, + "loss": 0.0379, + "step": 10871 + }, + { + "epoch": 0.69, + "grad_norm": 0.949568196928449, + "learning_rate": 2.2703121351995064e-07, + "loss": 0.3727, + "step": 10872 + }, + { + "epoch": 0.69, + "grad_norm": 0.6162927476970472, + "learning_rate": 2.269446937825506e-07, + "loss": 0.218, + "step": 10873 + }, + { + "epoch": 0.69, + "grad_norm": 0.7294745669729295, + "learning_rate": 2.2685818569423904e-07, + "loss": 0.1375, + "step": 10874 + }, + { + "epoch": 0.69, + "grad_norm": 0.7484061186990685, + "learning_rate": 2.2677168925870615e-07, + "loss": 0.5, + "step": 10875 + }, + { + "epoch": 0.69, + "grad_norm": 1.86320809631737, + "learning_rate": 2.2668520447964246e-07, + "loss": 0.1564, + "step": 10876 + }, + { + "epoch": 0.69, + "grad_norm": 0.3315311656569516, + "learning_rate": 2.2659873136073715e-07, + "loss": 0.0126, + "step": 10877 + }, + { + "epoch": 0.69, + "grad_norm": 0.46869695401621636, + "learning_rate": 2.265122699056798e-07, + "loss": 0.1943, + "step": 10878 + }, + { + "epoch": 0.69, + "grad_norm": 0.8318506761180977, + "learning_rate": 2.2642582011815858e-07, + "loss": 0.312, + "step": 10879 + }, + { + "epoch": 0.69, + "grad_norm": 0.7972055452495757, + "learning_rate": 2.2633938200186203e-07, + "loss": 0.1897, + "step": 10880 + }, + { + "epoch": 0.69, + "grad_norm": 0.8801000207391041, + "learning_rate": 2.2625295556047736e-07, + "loss": 0.2444, + "step": 10881 + }, + { + "epoch": 0.69, + "grad_norm": 0.4887721157384821, + "learning_rate": 2.261665407976921e-07, + "loss": 0.1556, + "step": 10882 + }, + { + "epoch": 0.69, + "grad_norm": 2.0530104310733335, + "learning_rate": 2.260801377171927e-07, + "loss": 0.1596, + "step": 10883 + }, + { + "epoch": 0.69, + "grad_norm": 0.3159872398057218, + "learning_rate": 2.2599374632266511e-07, + "loss": 0.1374, + "step": 10884 + }, + { + "epoch": 0.69, + "grad_norm": 1.611180244577775, + "learning_rate": 2.259073666177951e-07, + "loss": 0.2115, + "step": 10885 + }, + { + "epoch": 0.69, + "grad_norm": 2.0627240304031664, + "learning_rate": 2.2582099860626797e-07, + "loss": 0.0233, + "step": 10886 + }, + { + "epoch": 0.69, + "grad_norm": 0.6653248825304307, + "learning_rate": 2.257346422917681e-07, + "loss": 0.1105, + "step": 10887 + }, + { + "epoch": 0.69, + "grad_norm": 0.9950886138225368, + "learning_rate": 2.2564829767797965e-07, + "loss": 0.1554, + "step": 10888 + }, + { + "epoch": 0.69, + "grad_norm": 0.46491460431634946, + "learning_rate": 2.2556196476858657e-07, + "loss": 0.0984, + "step": 10889 + }, + { + "epoch": 0.69, + "grad_norm": 3.293669330584775, + "learning_rate": 2.2547564356727155e-07, + "loss": 0.2248, + "step": 10890 + }, + { + "epoch": 0.69, + "grad_norm": 5.088819784265053, + "learning_rate": 2.2538933407771766e-07, + "loss": 0.0916, + "step": 10891 + }, + { + "epoch": 0.69, + "grad_norm": 0.110023347891709, + "learning_rate": 2.2530303630360664e-07, + "loss": 0.0023, + "step": 10892 + }, + { + "epoch": 0.69, + "grad_norm": 0.9093858880895302, + "learning_rate": 2.2521675024862047e-07, + "loss": 0.3635, + "step": 10893 + }, + { + "epoch": 0.69, + "grad_norm": 0.7695666115831168, + "learning_rate": 2.2513047591643992e-07, + "loss": 0.13, + "step": 10894 + }, + { + "epoch": 0.69, + "grad_norm": 1.32660121975726, + "learning_rate": 2.2504421331074602e-07, + "loss": 0.1307, + "step": 10895 + }, + { + "epoch": 0.69, + "grad_norm": 0.5935487063161089, + "learning_rate": 2.249579624352187e-07, + "loss": 0.181, + "step": 10896 + }, + { + "epoch": 0.69, + "grad_norm": 0.7901801674782892, + "learning_rate": 2.2487172329353742e-07, + "loss": 0.239, + "step": 10897 + }, + { + "epoch": 0.69, + "grad_norm": 0.7021555975048261, + "learning_rate": 2.2478549588938166e-07, + "loss": 0.233, + "step": 10898 + }, + { + "epoch": 0.7, + "grad_norm": 0.5346683612322068, + "learning_rate": 2.2469928022642975e-07, + "loss": 0.1757, + "step": 10899 + }, + { + "epoch": 0.7, + "grad_norm": 0.5942130167894475, + "learning_rate": 2.2461307630836019e-07, + "loss": 0.4709, + "step": 10900 + }, + { + "epoch": 0.7, + "grad_norm": 0.7249753989402374, + "learning_rate": 2.2452688413885013e-07, + "loss": 0.0288, + "step": 10901 + }, + { + "epoch": 0.7, + "grad_norm": 4.289366193772501, + "learning_rate": 2.244407037215772e-07, + "loss": 0.1315, + "step": 10902 + }, + { + "epoch": 0.7, + "grad_norm": 10.954304440433352, + "learning_rate": 2.2435453506021756e-07, + "loss": 0.1926, + "step": 10903 + }, + { + "epoch": 0.7, + "grad_norm": 1.2178358002772534, + "learning_rate": 2.2426837815844763e-07, + "loss": 0.1126, + "step": 10904 + }, + { + "epoch": 0.7, + "grad_norm": 0.44983159921230526, + "learning_rate": 2.2418223301994293e-07, + "loss": 0.1848, + "step": 10905 + }, + { + "epoch": 0.7, + "grad_norm": 1.1048548497985862, + "learning_rate": 2.2409609964837883e-07, + "loss": 0.1964, + "step": 10906 + }, + { + "epoch": 0.7, + "grad_norm": 3.967727743154967, + "learning_rate": 2.2400997804742961e-07, + "loss": 0.1552, + "step": 10907 + }, + { + "epoch": 0.7, + "grad_norm": 0.13480417299554695, + "learning_rate": 2.239238682207697e-07, + "loss": 0.0921, + "step": 10908 + }, + { + "epoch": 0.7, + "grad_norm": 0.4985602003788864, + "learning_rate": 2.238377701720725e-07, + "loss": 0.1493, + "step": 10909 + }, + { + "epoch": 0.7, + "grad_norm": 1.815357305600492, + "learning_rate": 2.237516839050111e-07, + "loss": 0.3343, + "step": 10910 + }, + { + "epoch": 0.7, + "grad_norm": 0.9960945528187201, + "learning_rate": 2.2366560942325828e-07, + "loss": 0.2111, + "step": 10911 + }, + { + "epoch": 0.7, + "grad_norm": 1.4115713506311849, + "learning_rate": 2.2357954673048591e-07, + "loss": 0.0647, + "step": 10912 + }, + { + "epoch": 0.7, + "grad_norm": 1.0007216294650185, + "learning_rate": 2.2349349583036598e-07, + "loss": 0.1253, + "step": 10913 + }, + { + "epoch": 0.7, + "grad_norm": 0.821385428051082, + "learning_rate": 2.2340745672656914e-07, + "loss": 0.0354, + "step": 10914 + }, + { + "epoch": 0.7, + "grad_norm": 0.586873288271951, + "learning_rate": 2.2332142942276639e-07, + "loss": 0.0837, + "step": 10915 + }, + { + "epoch": 0.7, + "grad_norm": 0.7014841527889852, + "learning_rate": 2.2323541392262745e-07, + "loss": 0.196, + "step": 10916 + }, + { + "epoch": 0.7, + "grad_norm": 3.967562145076491, + "learning_rate": 2.2314941022982237e-07, + "loss": 0.3864, + "step": 10917 + }, + { + "epoch": 0.7, + "grad_norm": 0.9130360703446458, + "learning_rate": 2.230634183480198e-07, + "loss": 0.2439, + "step": 10918 + }, + { + "epoch": 0.7, + "grad_norm": 0.5969503230851588, + "learning_rate": 2.2297743828088867e-07, + "loss": 0.0972, + "step": 10919 + }, + { + "epoch": 0.7, + "grad_norm": 0.496305039123906, + "learning_rate": 2.228914700320967e-07, + "loss": 0.1178, + "step": 10920 + }, + { + "epoch": 0.7, + "grad_norm": 1.2448276710422292, + "learning_rate": 2.2280551360531168e-07, + "loss": 0.0335, + "step": 10921 + }, + { + "epoch": 0.7, + "grad_norm": 1.0203334600594705, + "learning_rate": 2.227195690042009e-07, + "loss": 0.195, + "step": 10922 + }, + { + "epoch": 0.7, + "grad_norm": 0.7703721701369108, + "learning_rate": 2.2263363623243054e-07, + "loss": 0.2266, + "step": 10923 + }, + { + "epoch": 0.7, + "grad_norm": 2.049210719399762, + "learning_rate": 2.22547715293667e-07, + "loss": 0.0976, + "step": 10924 + }, + { + "epoch": 0.7, + "grad_norm": 2.135960356496401, + "learning_rate": 2.2246180619157546e-07, + "loss": 0.0766, + "step": 10925 + }, + { + "epoch": 0.7, + "grad_norm": 0.31557956273237464, + "learning_rate": 2.2237590892982138e-07, + "loss": 0.2627, + "step": 10926 + }, + { + "epoch": 0.7, + "grad_norm": 2.608551494496266, + "learning_rate": 2.22290023512069e-07, + "loss": 0.1915, + "step": 10927 + }, + { + "epoch": 0.7, + "grad_norm": 0.35726329297766696, + "learning_rate": 2.2220414994198262e-07, + "loss": 0.1034, + "step": 10928 + }, + { + "epoch": 0.7, + "grad_norm": 1.886101922517762, + "learning_rate": 2.2211828822322544e-07, + "loss": 0.0925, + "step": 10929 + }, + { + "epoch": 0.7, + "grad_norm": 0.1361921865049236, + "learning_rate": 2.220324383594609e-07, + "loss": 0.104, + "step": 10930 + }, + { + "epoch": 0.7, + "grad_norm": 0.17799373833339124, + "learning_rate": 2.2194660035435115e-07, + "loss": 0.021, + "step": 10931 + }, + { + "epoch": 0.7, + "grad_norm": 14.42703875159493, + "learning_rate": 2.2186077421155853e-07, + "loss": 0.0421, + "step": 10932 + }, + { + "epoch": 0.7, + "grad_norm": 0.6544839328708031, + "learning_rate": 2.2177495993474426e-07, + "loss": 0.0781, + "step": 10933 + }, + { + "epoch": 0.7, + "grad_norm": 0.5053969140401248, + "learning_rate": 2.2168915752756966e-07, + "loss": 0.1136, + "step": 10934 + }, + { + "epoch": 0.7, + "grad_norm": 0.4204335796883389, + "learning_rate": 2.2160336699369496e-07, + "loss": 0.2632, + "step": 10935 + }, + { + "epoch": 0.7, + "grad_norm": 1.1065380945059267, + "learning_rate": 2.215175883367804e-07, + "loss": 0.2669, + "step": 10936 + }, + { + "epoch": 0.7, + "grad_norm": 1.061545908147558, + "learning_rate": 2.2143182156048533e-07, + "loss": 0.0885, + "step": 10937 + }, + { + "epoch": 0.7, + "grad_norm": 0.7378857275320583, + "learning_rate": 2.213460666684686e-07, + "loss": 0.2206, + "step": 10938 + }, + { + "epoch": 0.7, + "grad_norm": 1.050980108589889, + "learning_rate": 2.2126032366438884e-07, + "loss": 0.18, + "step": 10939 + }, + { + "epoch": 0.7, + "grad_norm": 2.3873979101593528, + "learning_rate": 2.2117459255190395e-07, + "loss": 0.2091, + "step": 10940 + }, + { + "epoch": 0.7, + "grad_norm": 0.81192411700409, + "learning_rate": 2.2108887333467168e-07, + "loss": 0.2052, + "step": 10941 + }, + { + "epoch": 0.7, + "grad_norm": 1.1025429172170629, + "learning_rate": 2.2100316601634856e-07, + "loss": 0.2563, + "step": 10942 + }, + { + "epoch": 0.7, + "grad_norm": 0.6293557518228154, + "learning_rate": 2.2091747060059141e-07, + "loss": 0.3961, + "step": 10943 + }, + { + "epoch": 0.7, + "grad_norm": 0.5601243607070677, + "learning_rate": 2.2083178709105583e-07, + "loss": 0.1867, + "step": 10944 + }, + { + "epoch": 0.7, + "grad_norm": 4.018311267316071, + "learning_rate": 2.2074611549139754e-07, + "loss": 0.2061, + "step": 10945 + }, + { + "epoch": 0.7, + "grad_norm": 0.7922089652905071, + "learning_rate": 2.206604558052712e-07, + "loss": 0.1672, + "step": 10946 + }, + { + "epoch": 0.7, + "grad_norm": 1.7070052739808517, + "learning_rate": 2.2057480803633154e-07, + "loss": 0.2281, + "step": 10947 + }, + { + "epoch": 0.7, + "grad_norm": 2.7016793010266547, + "learning_rate": 2.204891721882321e-07, + "loss": 0.2014, + "step": 10948 + }, + { + "epoch": 0.7, + "grad_norm": 0.8402525364923409, + "learning_rate": 2.2040354826462664e-07, + "loss": 0.2437, + "step": 10949 + }, + { + "epoch": 0.7, + "grad_norm": 0.5643340185649347, + "learning_rate": 2.2031793626916768e-07, + "loss": 0.1824, + "step": 10950 + }, + { + "epoch": 0.7, + "grad_norm": 0.8265373224511069, + "learning_rate": 2.2023233620550797e-07, + "loss": 0.1035, + "step": 10951 + }, + { + "epoch": 0.7, + "grad_norm": 0.348838993143465, + "learning_rate": 2.2014674807729923e-07, + "loss": 0.0987, + "step": 10952 + }, + { + "epoch": 0.7, + "grad_norm": 1.7134912311271417, + "learning_rate": 2.2006117188819257e-07, + "loss": 0.2111, + "step": 10953 + }, + { + "epoch": 0.7, + "grad_norm": 4.039798570227404, + "learning_rate": 2.1997560764183926e-07, + "loss": 0.1505, + "step": 10954 + }, + { + "epoch": 0.7, + "grad_norm": 1.2631960550321324, + "learning_rate": 2.1989005534188927e-07, + "loss": 0.1331, + "step": 10955 + }, + { + "epoch": 0.7, + "grad_norm": 0.41869817689472827, + "learning_rate": 2.198045149919926e-07, + "loss": 0.1506, + "step": 10956 + }, + { + "epoch": 0.7, + "grad_norm": 1.6820577255079567, + "learning_rate": 2.197189865957988e-07, + "loss": 0.0211, + "step": 10957 + }, + { + "epoch": 0.7, + "grad_norm": 0.8595514898710638, + "learning_rate": 2.1963347015695627e-07, + "loss": 0.1329, + "step": 10958 + }, + { + "epoch": 0.7, + "grad_norm": 0.4475700125823077, + "learning_rate": 2.195479656791135e-07, + "loss": 0.2221, + "step": 10959 + }, + { + "epoch": 0.7, + "grad_norm": 3.7665736691231024, + "learning_rate": 2.1946247316591843e-07, + "loss": 0.0957, + "step": 10960 + }, + { + "epoch": 0.7, + "grad_norm": 1.5043782022654189, + "learning_rate": 2.193769926210181e-07, + "loss": 0.0801, + "step": 10961 + }, + { + "epoch": 0.7, + "grad_norm": 0.1340649885550248, + "learning_rate": 2.1929152404805956e-07, + "loss": 0.0045, + "step": 10962 + }, + { + "epoch": 0.7, + "grad_norm": 0.9196294150621533, + "learning_rate": 2.1920606745068864e-07, + "loss": 0.2139, + "step": 10963 + }, + { + "epoch": 0.7, + "grad_norm": 3.5396771173298256, + "learning_rate": 2.1912062283255162e-07, + "loss": 0.0046, + "step": 10964 + }, + { + "epoch": 0.7, + "grad_norm": 1.6335613203423425, + "learning_rate": 2.1903519019729343e-07, + "loss": 0.1939, + "step": 10965 + }, + { + "epoch": 0.7, + "grad_norm": 0.7146708155614051, + "learning_rate": 2.1894976954855865e-07, + "loss": 0.3615, + "step": 10966 + }, + { + "epoch": 0.7, + "grad_norm": 0.39300506891577236, + "learning_rate": 2.188643608899919e-07, + "loss": 0.2438, + "step": 10967 + }, + { + "epoch": 0.7, + "grad_norm": 0.23469147985959943, + "learning_rate": 2.1877896422523644e-07, + "loss": 0.0026, + "step": 10968 + }, + { + "epoch": 0.7, + "grad_norm": 0.659546256345295, + "learning_rate": 2.1869357955793594e-07, + "loss": 0.1957, + "step": 10969 + }, + { + "epoch": 0.7, + "grad_norm": 1.1810606116773537, + "learning_rate": 2.186082068917326e-07, + "loss": 0.3302, + "step": 10970 + }, + { + "epoch": 0.7, + "grad_norm": 1.2789898034190668, + "learning_rate": 2.18522846230269e-07, + "loss": 0.1007, + "step": 10971 + }, + { + "epoch": 0.7, + "grad_norm": 1.8152696515338478, + "learning_rate": 2.1843749757718642e-07, + "loss": 0.2352, + "step": 10972 + }, + { + "epoch": 0.7, + "grad_norm": 0.506016568426306, + "learning_rate": 2.1835216093612646e-07, + "loss": 0.098, + "step": 10973 + }, + { + "epoch": 0.7, + "grad_norm": 1.3521813685806305, + "learning_rate": 2.182668363107293e-07, + "loss": 0.0706, + "step": 10974 + }, + { + "epoch": 0.7, + "grad_norm": 7.784555173968003, + "learning_rate": 2.1818152370463523e-07, + "loss": 0.0722, + "step": 10975 + }, + { + "epoch": 0.7, + "grad_norm": 1.952886471051704, + "learning_rate": 2.1809622312148402e-07, + "loss": 0.2057, + "step": 10976 + }, + { + "epoch": 0.7, + "grad_norm": 0.772372475430498, + "learning_rate": 2.180109345649145e-07, + "loss": 0.1928, + "step": 10977 + }, + { + "epoch": 0.7, + "grad_norm": 0.3889697536058163, + "learning_rate": 2.1792565803856556e-07, + "loss": 0.0522, + "step": 10978 + }, + { + "epoch": 0.7, + "grad_norm": 0.8443163771593503, + "learning_rate": 2.178403935460749e-07, + "loss": 0.1573, + "step": 10979 + }, + { + "epoch": 0.7, + "grad_norm": 0.549392684872409, + "learning_rate": 2.1775514109108046e-07, + "loss": 0.1038, + "step": 10980 + }, + { + "epoch": 0.7, + "grad_norm": 1.2069482018807078, + "learning_rate": 2.176699006772189e-07, + "loss": 0.2588, + "step": 10981 + }, + { + "epoch": 0.7, + "grad_norm": 7.342735477229369, + "learning_rate": 2.175846723081271e-07, + "loss": 0.3343, + "step": 10982 + }, + { + "epoch": 0.7, + "grad_norm": 0.6949236444913175, + "learning_rate": 2.1749945598744075e-07, + "loss": 0.0976, + "step": 10983 + }, + { + "epoch": 0.7, + "grad_norm": 3.4281699533945873, + "learning_rate": 2.1741425171879563e-07, + "loss": 0.2621, + "step": 10984 + }, + { + "epoch": 0.7, + "grad_norm": 1.0020370195970825, + "learning_rate": 2.1732905950582648e-07, + "loss": 0.1802, + "step": 10985 + }, + { + "epoch": 0.7, + "grad_norm": 2.0932002911407386, + "learning_rate": 2.1724387935216802e-07, + "loss": 0.1798, + "step": 10986 + }, + { + "epoch": 0.7, + "grad_norm": 1.2957926083056028, + "learning_rate": 2.171587112614539e-07, + "loss": 0.2829, + "step": 10987 + }, + { + "epoch": 0.7, + "grad_norm": 7.253034759683275, + "learning_rate": 2.1707355523731796e-07, + "loss": 0.1463, + "step": 10988 + }, + { + "epoch": 0.7, + "grad_norm": 0.34662369243637214, + "learning_rate": 2.1698841128339273e-07, + "loss": 0.0779, + "step": 10989 + }, + { + "epoch": 0.7, + "grad_norm": 0.8748200788607298, + "learning_rate": 2.1690327940331093e-07, + "loss": 0.163, + "step": 10990 + }, + { + "epoch": 0.7, + "grad_norm": 0.9269464993348918, + "learning_rate": 2.168181596007042e-07, + "loss": 0.118, + "step": 10991 + }, + { + "epoch": 0.7, + "grad_norm": 0.5689704725961378, + "learning_rate": 2.167330518792042e-07, + "loss": 0.1489, + "step": 10992 + }, + { + "epoch": 0.7, + "grad_norm": 0.1384217527504339, + "learning_rate": 2.1664795624244147e-07, + "loss": 0.0393, + "step": 10993 + }, + { + "epoch": 0.7, + "grad_norm": 1.7500700286501576, + "learning_rate": 2.1656287269404656e-07, + "loss": 0.1911, + "step": 10994 + }, + { + "epoch": 0.7, + "grad_norm": 1.1563941203100896, + "learning_rate": 2.1647780123764942e-07, + "loss": 0.1127, + "step": 10995 + }, + { + "epoch": 0.7, + "grad_norm": 0.7440228102081966, + "learning_rate": 2.1639274187687906e-07, + "loss": 0.0976, + "step": 10996 + }, + { + "epoch": 0.7, + "grad_norm": 0.3423135838816568, + "learning_rate": 2.1630769461536463e-07, + "loss": 0.0711, + "step": 10997 + }, + { + "epoch": 0.7, + "grad_norm": 0.6920210219335216, + "learning_rate": 2.1622265945673406e-07, + "loss": 0.2363, + "step": 10998 + }, + { + "epoch": 0.7, + "grad_norm": 1.477111108642288, + "learning_rate": 2.1613763640461552e-07, + "loss": 0.1506, + "step": 10999 + }, + { + "epoch": 0.7, + "grad_norm": 0.42656076104498575, + "learning_rate": 2.1605262546263587e-07, + "loss": 0.047, + "step": 11000 + }, + { + "epoch": 0.7, + "grad_norm": 0.8438581654792345, + "learning_rate": 2.1596762663442213e-07, + "loss": 0.0957, + "step": 11001 + }, + { + "epoch": 0.7, + "grad_norm": 7.809254864804685, + "learning_rate": 2.158826399236003e-07, + "loss": 0.1341, + "step": 11002 + }, + { + "epoch": 0.7, + "grad_norm": 0.8724682548909874, + "learning_rate": 2.1579766533379635e-07, + "loss": 0.3811, + "step": 11003 + }, + { + "epoch": 0.7, + "grad_norm": 0.2784828289476305, + "learning_rate": 2.1571270286863514e-07, + "loss": 0.2726, + "step": 11004 + }, + { + "epoch": 0.7, + "grad_norm": 1.325186343899838, + "learning_rate": 2.156277525317417e-07, + "loss": 0.2104, + "step": 11005 + }, + { + "epoch": 0.7, + "grad_norm": 0.7809667381097108, + "learning_rate": 2.1554281432674e-07, + "loss": 0.203, + "step": 11006 + }, + { + "epoch": 0.7, + "grad_norm": 0.5387177855910273, + "learning_rate": 2.1545788825725348e-07, + "loss": 0.2158, + "step": 11007 + }, + { + "epoch": 0.7, + "grad_norm": 1.7752514065628473, + "learning_rate": 2.1537297432690566e-07, + "loss": 0.0828, + "step": 11008 + }, + { + "epoch": 0.7, + "grad_norm": 0.6494091713904622, + "learning_rate": 2.152880725393187e-07, + "loss": 0.1067, + "step": 11009 + }, + { + "epoch": 0.7, + "grad_norm": 0.8775734501206596, + "learning_rate": 2.152031828981149e-07, + "loss": 0.4599, + "step": 11010 + }, + { + "epoch": 0.7, + "grad_norm": 0.4376476542465838, + "learning_rate": 2.151183054069159e-07, + "loss": 0.1312, + "step": 11011 + }, + { + "epoch": 0.7, + "grad_norm": 1.1787140873996134, + "learning_rate": 2.1503344006934283e-07, + "loss": 0.1617, + "step": 11012 + }, + { + "epoch": 0.7, + "grad_norm": 0.9153597476126069, + "learning_rate": 2.1494858688901585e-07, + "loss": 0.2475, + "step": 11013 + }, + { + "epoch": 0.7, + "grad_norm": 0.7634750254238873, + "learning_rate": 2.1486374586955535e-07, + "loss": 0.1348, + "step": 11014 + }, + { + "epoch": 0.7, + "grad_norm": 0.2705824767832749, + "learning_rate": 2.1477891701458052e-07, + "loss": 0.098, + "step": 11015 + }, + { + "epoch": 0.7, + "grad_norm": 0.5972037289624058, + "learning_rate": 2.1469410032771056e-07, + "loss": 0.0561, + "step": 11016 + }, + { + "epoch": 0.7, + "grad_norm": 9.543764948171805, + "learning_rate": 2.146092958125637e-07, + "loss": 0.1716, + "step": 11017 + }, + { + "epoch": 0.7, + "grad_norm": 0.20522270832625228, + "learning_rate": 2.145245034727582e-07, + "loss": 0.0167, + "step": 11018 + }, + { + "epoch": 0.7, + "grad_norm": 1.2037803633222162, + "learning_rate": 2.1443972331191118e-07, + "loss": 0.0692, + "step": 11019 + }, + { + "epoch": 0.7, + "grad_norm": 0.4281920119549205, + "learning_rate": 2.1435495533363946e-07, + "loss": 0.1621, + "step": 11020 + }, + { + "epoch": 0.7, + "grad_norm": 2.1511074131109007, + "learning_rate": 2.1427019954155979e-07, + "loss": 0.3641, + "step": 11021 + }, + { + "epoch": 0.7, + "grad_norm": 1.4625029465831232, + "learning_rate": 2.1418545593928756e-07, + "loss": 0.3489, + "step": 11022 + }, + { + "epoch": 0.7, + "grad_norm": 1.3047263123238109, + "learning_rate": 2.1410072453043853e-07, + "loss": 0.1995, + "step": 11023 + }, + { + "epoch": 0.7, + "grad_norm": 1.369664270063948, + "learning_rate": 2.1401600531862713e-07, + "loss": 0.1859, + "step": 11024 + }, + { + "epoch": 0.7, + "grad_norm": 4.689149430334241, + "learning_rate": 2.1393129830746804e-07, + "loss": 0.1718, + "step": 11025 + }, + { + "epoch": 0.7, + "grad_norm": 1.6200892886489244, + "learning_rate": 2.1384660350057465e-07, + "loss": 0.1687, + "step": 11026 + }, + { + "epoch": 0.7, + "grad_norm": 0.706330514924541, + "learning_rate": 2.1376192090156043e-07, + "loss": 0.1297, + "step": 11027 + }, + { + "epoch": 0.7, + "grad_norm": 1.0425024830100322, + "learning_rate": 2.1367725051403817e-07, + "loss": 0.3272, + "step": 11028 + }, + { + "epoch": 0.7, + "grad_norm": 0.946566279496054, + "learning_rate": 2.1359259234161985e-07, + "loss": 0.1678, + "step": 11029 + }, + { + "epoch": 0.7, + "grad_norm": 0.48638936657029846, + "learning_rate": 2.135079463879173e-07, + "loss": 0.0274, + "step": 11030 + }, + { + "epoch": 0.7, + "grad_norm": 1.5038371680377733, + "learning_rate": 2.134233126565419e-07, + "loss": 0.1905, + "step": 11031 + }, + { + "epoch": 0.7, + "grad_norm": 0.9346990238157602, + "learning_rate": 2.1333869115110382e-07, + "loss": 0.5061, + "step": 11032 + }, + { + "epoch": 0.7, + "grad_norm": 3.0362277480262794, + "learning_rate": 2.1325408187521364e-07, + "loss": 0.167, + "step": 11033 + }, + { + "epoch": 0.7, + "grad_norm": 4.861370803338594, + "learning_rate": 2.1316948483248082e-07, + "loss": 0.2061, + "step": 11034 + }, + { + "epoch": 0.7, + "grad_norm": 0.7567563176852425, + "learning_rate": 2.1308490002651413e-07, + "loss": 0.2294, + "step": 11035 + }, + { + "epoch": 0.7, + "grad_norm": 0.30553706264457836, + "learning_rate": 2.1300032746092257e-07, + "loss": 0.0625, + "step": 11036 + }, + { + "epoch": 0.7, + "grad_norm": 3.7933743165339, + "learning_rate": 2.129157671393138e-07, + "loss": 0.1537, + "step": 11037 + }, + { + "epoch": 0.7, + "grad_norm": 1.335773703985514, + "learning_rate": 2.1283121906529571e-07, + "loss": 0.3533, + "step": 11038 + }, + { + "epoch": 0.7, + "grad_norm": 0.4964445480855572, + "learning_rate": 2.1274668324247491e-07, + "loss": 0.2921, + "step": 11039 + }, + { + "epoch": 0.7, + "grad_norm": 2.930152862201858, + "learning_rate": 2.126621596744582e-07, + "loss": 0.1646, + "step": 11040 + }, + { + "epoch": 0.7, + "grad_norm": 4.248724251053854, + "learning_rate": 2.1257764836485127e-07, + "loss": 0.2651, + "step": 11041 + }, + { + "epoch": 0.7, + "grad_norm": 0.3113870893272026, + "learning_rate": 2.1249314931725974e-07, + "loss": 0.2208, + "step": 11042 + }, + { + "epoch": 0.7, + "grad_norm": 0.4539614691084134, + "learning_rate": 2.1240866253528832e-07, + "loss": 0.2331, + "step": 11043 + }, + { + "epoch": 0.7, + "grad_norm": 0.506758943529494, + "learning_rate": 2.1232418802254165e-07, + "loss": 0.3359, + "step": 11044 + }, + { + "epoch": 0.7, + "grad_norm": 0.48030673453778105, + "learning_rate": 2.1223972578262322e-07, + "loss": 0.0153, + "step": 11045 + }, + { + "epoch": 0.7, + "grad_norm": 0.8853162385891539, + "learning_rate": 2.1215527581913655e-07, + "loss": 0.1202, + "step": 11046 + }, + { + "epoch": 0.7, + "grad_norm": 0.7863355873264025, + "learning_rate": 2.1207083813568466e-07, + "loss": 0.2481, + "step": 11047 + }, + { + "epoch": 0.7, + "grad_norm": 1.0899537164999757, + "learning_rate": 2.1198641273586947e-07, + "loss": 0.2548, + "step": 11048 + }, + { + "epoch": 0.7, + "grad_norm": 7.11510099776902, + "learning_rate": 2.1190199962329302e-07, + "loss": 0.026, + "step": 11049 + }, + { + "epoch": 0.7, + "grad_norm": 0.6448255195451303, + "learning_rate": 2.1181759880155625e-07, + "loss": 0.2012, + "step": 11050 + }, + { + "epoch": 0.7, + "grad_norm": 1.3651832298687614, + "learning_rate": 2.1173321027426022e-07, + "loss": 0.0762, + "step": 11051 + }, + { + "epoch": 0.7, + "grad_norm": 0.7536512939188647, + "learning_rate": 2.1164883404500476e-07, + "loss": 0.1388, + "step": 11052 + }, + { + "epoch": 0.7, + "grad_norm": 0.9228609114338816, + "learning_rate": 2.1156447011738986e-07, + "loss": 0.064, + "step": 11053 + }, + { + "epoch": 0.7, + "grad_norm": 1.2032538379935707, + "learning_rate": 2.1148011849501436e-07, + "loss": 0.1504, + "step": 11054 + }, + { + "epoch": 0.7, + "grad_norm": 0.7894577062338723, + "learning_rate": 2.1139577918147711e-07, + "loss": 0.3238, + "step": 11055 + }, + { + "epoch": 0.71, + "grad_norm": 0.2614407387535882, + "learning_rate": 2.1131145218037593e-07, + "loss": 0.0887, + "step": 11056 + }, + { + "epoch": 0.71, + "grad_norm": 1.045079877596098, + "learning_rate": 2.1122713749530875e-07, + "loss": 0.2197, + "step": 11057 + }, + { + "epoch": 0.71, + "grad_norm": 0.5737296747467094, + "learning_rate": 2.1114283512987218e-07, + "loss": 0.223, + "step": 11058 + }, + { + "epoch": 0.71, + "grad_norm": 3.231003810412058, + "learning_rate": 2.1105854508766312e-07, + "loss": 0.2532, + "step": 11059 + }, + { + "epoch": 0.71, + "grad_norm": 16.87974904872534, + "learning_rate": 2.1097426737227724e-07, + "loss": 0.1487, + "step": 11060 + }, + { + "epoch": 0.71, + "grad_norm": 0.6033239733857554, + "learning_rate": 2.1089000198731027e-07, + "loss": 0.0204, + "step": 11061 + }, + { + "epoch": 0.71, + "grad_norm": 0.7984236334952097, + "learning_rate": 2.1080574893635683e-07, + "loss": 0.1942, + "step": 11062 + }, + { + "epoch": 0.71, + "grad_norm": 1.9847381829793511, + "learning_rate": 2.1072150822301167e-07, + "loss": 0.0893, + "step": 11063 + }, + { + "epoch": 0.71, + "grad_norm": 0.7396772250488312, + "learning_rate": 2.1063727985086827e-07, + "loss": 0.1299, + "step": 11064 + }, + { + "epoch": 0.71, + "grad_norm": 0.5982581324806512, + "learning_rate": 2.1055306382352022e-07, + "loss": 0.2079, + "step": 11065 + }, + { + "epoch": 0.71, + "grad_norm": 2.247004817230564, + "learning_rate": 2.1046886014456054e-07, + "loss": 0.2978, + "step": 11066 + }, + { + "epoch": 0.71, + "grad_norm": 1.2188244200151201, + "learning_rate": 2.1038466881758116e-07, + "loss": 0.2978, + "step": 11067 + }, + { + "epoch": 0.71, + "grad_norm": 1.0301709760929476, + "learning_rate": 2.1030048984617416e-07, + "loss": 0.2754, + "step": 11068 + }, + { + "epoch": 0.71, + "grad_norm": 5.343686261228263, + "learning_rate": 2.102163232339304e-07, + "loss": 0.2055, + "step": 11069 + }, + { + "epoch": 0.71, + "grad_norm": 1.0485427604293511, + "learning_rate": 2.1013216898444109e-07, + "loss": 0.3121, + "step": 11070 + }, + { + "epoch": 0.71, + "grad_norm": 1.1570134171494262, + "learning_rate": 2.1004802710129592e-07, + "loss": 0.1822, + "step": 11071 + }, + { + "epoch": 0.71, + "grad_norm": 1.5084985592317486, + "learning_rate": 2.0996389758808498e-07, + "loss": 0.0521, + "step": 11072 + }, + { + "epoch": 0.71, + "grad_norm": 0.4328263965227327, + "learning_rate": 2.0987978044839705e-07, + "loss": 0.0071, + "step": 11073 + }, + { + "epoch": 0.71, + "grad_norm": 0.42636946733427256, + "learning_rate": 2.0979567568582108e-07, + "loss": 0.1459, + "step": 11074 + }, + { + "epoch": 0.71, + "grad_norm": 0.8818847458483785, + "learning_rate": 2.0971158330394495e-07, + "loss": 0.4688, + "step": 11075 + }, + { + "epoch": 0.71, + "grad_norm": 0.6775631918795026, + "learning_rate": 2.096275033063561e-07, + "loss": 0.019, + "step": 11076 + }, + { + "epoch": 0.71, + "grad_norm": 2.3100042645637124, + "learning_rate": 2.0954343569664185e-07, + "loss": 0.1081, + "step": 11077 + }, + { + "epoch": 0.71, + "grad_norm": 1.3202207840361624, + "learning_rate": 2.094593804783883e-07, + "loss": 0.2358, + "step": 11078 + }, + { + "epoch": 0.71, + "grad_norm": 2.1424883672675796, + "learning_rate": 2.0937533765518185e-07, + "loss": 0.0199, + "step": 11079 + }, + { + "epoch": 0.71, + "grad_norm": 10.759146111867244, + "learning_rate": 2.0929130723060752e-07, + "loss": 0.257, + "step": 11080 + }, + { + "epoch": 0.71, + "grad_norm": 2.8304879441262054, + "learning_rate": 2.0920728920825043e-07, + "loss": 0.1505, + "step": 11081 + }, + { + "epoch": 0.71, + "grad_norm": 0.29801680293990124, + "learning_rate": 2.0912328359169495e-07, + "loss": 0.0428, + "step": 11082 + }, + { + "epoch": 0.71, + "grad_norm": 0.3461982760630799, + "learning_rate": 2.090392903845251e-07, + "loss": 0.0044, + "step": 11083 + }, + { + "epoch": 0.71, + "grad_norm": 2.0081329296310395, + "learning_rate": 2.0895530959032388e-07, + "loss": 0.3769, + "step": 11084 + }, + { + "epoch": 0.71, + "grad_norm": 0.8026771160791514, + "learning_rate": 2.0887134121267437e-07, + "loss": 0.2868, + "step": 11085 + }, + { + "epoch": 0.71, + "grad_norm": 0.45385461846890257, + "learning_rate": 2.0878738525515853e-07, + "loss": 0.2828, + "step": 11086 + }, + { + "epoch": 0.71, + "grad_norm": 0.41979306423280627, + "learning_rate": 2.087034417213584e-07, + "loss": 0.2649, + "step": 11087 + }, + { + "epoch": 0.71, + "grad_norm": 4.705398689810544, + "learning_rate": 2.0861951061485504e-07, + "loss": 0.1281, + "step": 11088 + }, + { + "epoch": 0.71, + "grad_norm": 1.0492582653181994, + "learning_rate": 2.0853559193922892e-07, + "loss": 0.2019, + "step": 11089 + }, + { + "epoch": 0.71, + "grad_norm": 2.050281563306293, + "learning_rate": 2.084516856980606e-07, + "loss": 0.2285, + "step": 11090 + }, + { + "epoch": 0.71, + "grad_norm": 0.6808190367480089, + "learning_rate": 2.083677918949292e-07, + "loss": 0.011, + "step": 11091 + }, + { + "epoch": 0.71, + "grad_norm": 2.999061353887269, + "learning_rate": 2.0828391053341427e-07, + "loss": 0.1825, + "step": 11092 + }, + { + "epoch": 0.71, + "grad_norm": 3.2763628861309195, + "learning_rate": 2.0820004161709393e-07, + "loss": 0.2497, + "step": 11093 + }, + { + "epoch": 0.71, + "grad_norm": 3.360521841048718, + "learning_rate": 2.0811618514954665e-07, + "loss": 0.158, + "step": 11094 + }, + { + "epoch": 0.71, + "grad_norm": 0.9218195814754128, + "learning_rate": 2.0803234113434942e-07, + "loss": 0.1443, + "step": 11095 + }, + { + "epoch": 0.71, + "grad_norm": 0.6991316452549218, + "learning_rate": 2.0794850957507965e-07, + "loss": 0.1132, + "step": 11096 + }, + { + "epoch": 0.71, + "grad_norm": 0.744254439839101, + "learning_rate": 2.0786469047531341e-07, + "loss": 0.2021, + "step": 11097 + }, + { + "epoch": 0.71, + "grad_norm": 0.8041557797444049, + "learning_rate": 2.0778088383862686e-07, + "loss": 0.3431, + "step": 11098 + }, + { + "epoch": 0.71, + "grad_norm": 0.6138279733706382, + "learning_rate": 2.0769708966859512e-07, + "loss": 0.1224, + "step": 11099 + }, + { + "epoch": 0.71, + "grad_norm": 0.6659031859543398, + "learning_rate": 2.0761330796879306e-07, + "loss": 0.1869, + "step": 11100 + }, + { + "epoch": 0.71, + "grad_norm": 0.576919203865755, + "learning_rate": 2.075295387427951e-07, + "loss": 0.3169, + "step": 11101 + }, + { + "epoch": 0.71, + "grad_norm": 1.0270864257226362, + "learning_rate": 2.0744578199417519e-07, + "loss": 0.1338, + "step": 11102 + }, + { + "epoch": 0.71, + "grad_norm": 0.9645019146817351, + "learning_rate": 2.0736203772650622e-07, + "loss": 0.2736, + "step": 11103 + }, + { + "epoch": 0.71, + "grad_norm": 1.1034516439933453, + "learning_rate": 2.0727830594336088e-07, + "loss": 0.3325, + "step": 11104 + }, + { + "epoch": 0.71, + "grad_norm": 0.6279409222422017, + "learning_rate": 2.0719458664831164e-07, + "loss": 0.1725, + "step": 11105 + }, + { + "epoch": 0.71, + "grad_norm": 1.0000270743273245, + "learning_rate": 2.0711087984492976e-07, + "loss": 0.3225, + "step": 11106 + }, + { + "epoch": 0.71, + "grad_norm": 2.489929596811043, + "learning_rate": 2.0702718553678672e-07, + "loss": 0.1607, + "step": 11107 + }, + { + "epoch": 0.71, + "grad_norm": 0.5464027832140982, + "learning_rate": 2.0694350372745267e-07, + "loss": 0.0064, + "step": 11108 + }, + { + "epoch": 0.71, + "grad_norm": 0.4753609948604213, + "learning_rate": 2.068598344204981e-07, + "loss": 0.1583, + "step": 11109 + }, + { + "epoch": 0.71, + "grad_norm": 1.0716566064726825, + "learning_rate": 2.0677617761949206e-07, + "loss": 0.2261, + "step": 11110 + }, + { + "epoch": 0.71, + "grad_norm": 0.31560982199835835, + "learning_rate": 2.066925333280039e-07, + "loss": 0.0317, + "step": 11111 + }, + { + "epoch": 0.71, + "grad_norm": 1.8557832334185258, + "learning_rate": 2.0660890154960175e-07, + "loss": 0.2705, + "step": 11112 + }, + { + "epoch": 0.71, + "grad_norm": 0.16970641697493014, + "learning_rate": 2.0652528228785382e-07, + "loss": 0.1145, + "step": 11113 + }, + { + "epoch": 0.71, + "grad_norm": 0.7323255733169622, + "learning_rate": 2.0644167554632712e-07, + "loss": 0.1083, + "step": 11114 + }, + { + "epoch": 0.71, + "grad_norm": 0.423845452669845, + "learning_rate": 2.0635808132858883e-07, + "loss": 0.2142, + "step": 11115 + }, + { + "epoch": 0.71, + "grad_norm": 0.7703557994381832, + "learning_rate": 2.0627449963820494e-07, + "loss": 0.0975, + "step": 11116 + }, + { + "epoch": 0.71, + "grad_norm": 1.397097604885472, + "learning_rate": 2.0619093047874136e-07, + "loss": 0.3368, + "step": 11117 + }, + { + "epoch": 0.71, + "grad_norm": 5.648458511106753, + "learning_rate": 2.0610737385376348e-07, + "loss": 0.0169, + "step": 11118 + }, + { + "epoch": 0.71, + "grad_norm": 1.4159630617314296, + "learning_rate": 2.0602382976683564e-07, + "loss": 0.3397, + "step": 11119 + }, + { + "epoch": 0.71, + "grad_norm": 0.40372083127383873, + "learning_rate": 2.0594029822152238e-07, + "loss": 0.1969, + "step": 11120 + }, + { + "epoch": 0.71, + "grad_norm": 1.3412550776805925, + "learning_rate": 2.0585677922138694e-07, + "loss": 0.3395, + "step": 11121 + }, + { + "epoch": 0.71, + "grad_norm": 0.8002680589122609, + "learning_rate": 2.057732727699928e-07, + "loss": 0.3023, + "step": 11122 + }, + { + "epoch": 0.71, + "grad_norm": 9.79165682852856, + "learning_rate": 2.056897788709021e-07, + "loss": 0.2777, + "step": 11123 + }, + { + "epoch": 0.71, + "grad_norm": 0.6588782726645641, + "learning_rate": 2.0560629752767727e-07, + "loss": 0.2617, + "step": 11124 + }, + { + "epoch": 0.71, + "grad_norm": 2.9772970872969657, + "learning_rate": 2.0552282874387944e-07, + "loss": 0.1517, + "step": 11125 + }, + { + "epoch": 0.71, + "grad_norm": 8.680089317186349, + "learning_rate": 2.0543937252306986e-07, + "loss": 0.1008, + "step": 11126 + }, + { + "epoch": 0.71, + "grad_norm": 0.16421178474496412, + "learning_rate": 2.0535592886880858e-07, + "loss": 0.0016, + "step": 11127 + }, + { + "epoch": 0.71, + "grad_norm": 1.0169762939722602, + "learning_rate": 2.0527249778465595e-07, + "loss": 0.3602, + "step": 11128 + }, + { + "epoch": 0.71, + "grad_norm": 0.4693658530747357, + "learning_rate": 2.051890792741708e-07, + "loss": 0.1672, + "step": 11129 + }, + { + "epoch": 0.71, + "grad_norm": 4.377894875031313, + "learning_rate": 2.0510567334091233e-07, + "loss": 0.3077, + "step": 11130 + }, + { + "epoch": 0.71, + "grad_norm": 0.9291163569811182, + "learning_rate": 2.050222799884387e-07, + "loss": 0.1542, + "step": 11131 + }, + { + "epoch": 0.71, + "grad_norm": 0.8849080450106809, + "learning_rate": 2.0493889922030738e-07, + "loss": 0.2231, + "step": 11132 + }, + { + "epoch": 0.71, + "grad_norm": 0.9583189714214655, + "learning_rate": 2.048555310400758e-07, + "loss": 0.3332, + "step": 11133 + }, + { + "epoch": 0.71, + "grad_norm": 0.33436524652268007, + "learning_rate": 2.0477217545130072e-07, + "loss": 0.0046, + "step": 11134 + }, + { + "epoch": 0.71, + "grad_norm": 0.3954086713344425, + "learning_rate": 2.0468883245753794e-07, + "loss": 0.0819, + "step": 11135 + }, + { + "epoch": 0.71, + "grad_norm": 0.39441446712178774, + "learning_rate": 2.0460550206234323e-07, + "loss": 0.2405, + "step": 11136 + }, + { + "epoch": 0.71, + "grad_norm": 0.7869768509866532, + "learning_rate": 2.0452218426927182e-07, + "loss": 0.1657, + "step": 11137 + }, + { + "epoch": 0.71, + "grad_norm": 0.31599890181855755, + "learning_rate": 2.0443887908187778e-07, + "loss": 0.1003, + "step": 11138 + }, + { + "epoch": 0.71, + "grad_norm": 0.9200400092405193, + "learning_rate": 2.0435558650371553e-07, + "loss": 0.2184, + "step": 11139 + }, + { + "epoch": 0.71, + "grad_norm": 0.5725922337780283, + "learning_rate": 2.042723065383381e-07, + "loss": 0.2143, + "step": 11140 + }, + { + "epoch": 0.71, + "grad_norm": 1.8036918149895718, + "learning_rate": 2.0418903918929875e-07, + "loss": 0.1671, + "step": 11141 + }, + { + "epoch": 0.71, + "grad_norm": 1.0754672093221846, + "learning_rate": 2.0410578446014943e-07, + "loss": 0.1978, + "step": 11142 + }, + { + "epoch": 0.71, + "grad_norm": 4.009860347668434, + "learning_rate": 2.0402254235444237e-07, + "loss": 0.263, + "step": 11143 + }, + { + "epoch": 0.71, + "grad_norm": 2.0077324420036544, + "learning_rate": 2.0393931287572863e-07, + "loss": 0.4108, + "step": 11144 + }, + { + "epoch": 0.71, + "grad_norm": 0.7975073861023216, + "learning_rate": 2.0385609602755877e-07, + "loss": 0.2923, + "step": 11145 + }, + { + "epoch": 0.71, + "grad_norm": 0.7617043569621263, + "learning_rate": 2.0377289181348338e-07, + "loss": 0.2321, + "step": 11146 + }, + { + "epoch": 0.71, + "grad_norm": 2.8149967886397724, + "learning_rate": 2.0368970023705174e-07, + "loss": 0.1248, + "step": 11147 + }, + { + "epoch": 0.71, + "grad_norm": 0.3126689947528291, + "learning_rate": 2.0360652130181329e-07, + "loss": 0.0073, + "step": 11148 + }, + { + "epoch": 0.71, + "grad_norm": 0.5347447803184486, + "learning_rate": 2.0352335501131634e-07, + "loss": 0.0089, + "step": 11149 + }, + { + "epoch": 0.71, + "grad_norm": 1.1174960807730312, + "learning_rate": 2.0344020136910915e-07, + "loss": 0.1651, + "step": 11150 + }, + { + "epoch": 0.71, + "grad_norm": 0.3891793644539086, + "learning_rate": 2.0335706037873907e-07, + "loss": 0.1932, + "step": 11151 + }, + { + "epoch": 0.71, + "grad_norm": 3.4109825230115582, + "learning_rate": 2.0327393204375303e-07, + "loss": 0.0103, + "step": 11152 + }, + { + "epoch": 0.71, + "grad_norm": 0.44008456427840037, + "learning_rate": 2.0319081636769775e-07, + "loss": 0.0624, + "step": 11153 + }, + { + "epoch": 0.71, + "grad_norm": 0.6804454130050315, + "learning_rate": 2.0310771335411876e-07, + "loss": 0.1748, + "step": 11154 + }, + { + "epoch": 0.71, + "grad_norm": 0.6616182441697905, + "learning_rate": 2.0302462300656148e-07, + "loss": 0.1779, + "step": 11155 + }, + { + "epoch": 0.71, + "grad_norm": 1.194886232200801, + "learning_rate": 2.02941545328571e-07, + "loss": 0.0157, + "step": 11156 + }, + { + "epoch": 0.71, + "grad_norm": 1.0210193060768187, + "learning_rate": 2.0285848032369136e-07, + "loss": 0.1991, + "step": 11157 + }, + { + "epoch": 0.71, + "grad_norm": 0.42220279880617184, + "learning_rate": 2.027754279954661e-07, + "loss": 0.0531, + "step": 11158 + }, + { + "epoch": 0.71, + "grad_norm": 1.3413695279247972, + "learning_rate": 2.0269238834743873e-07, + "loss": 0.0783, + "step": 11159 + }, + { + "epoch": 0.71, + "grad_norm": 0.1783040252237527, + "learning_rate": 2.026093613831516e-07, + "loss": 0.0881, + "step": 11160 + }, + { + "epoch": 0.71, + "grad_norm": 1.352888585774589, + "learning_rate": 2.0252634710614708e-07, + "loss": 0.1885, + "step": 11161 + }, + { + "epoch": 0.71, + "grad_norm": 0.7125407256154703, + "learning_rate": 2.0244334551996644e-07, + "loss": 0.207, + "step": 11162 + }, + { + "epoch": 0.71, + "grad_norm": 1.0810572743366624, + "learning_rate": 2.02360356628151e-07, + "loss": 0.1821, + "step": 11163 + }, + { + "epoch": 0.71, + "grad_norm": 0.9800343033106649, + "learning_rate": 2.0227738043424092e-07, + "loss": 0.0958, + "step": 11164 + }, + { + "epoch": 0.71, + "grad_norm": 1.1925215461899459, + "learning_rate": 2.0219441694177646e-07, + "loss": 0.13, + "step": 11165 + }, + { + "epoch": 0.71, + "grad_norm": 2.488269609454193, + "learning_rate": 2.0211146615429663e-07, + "loss": 0.0184, + "step": 11166 + }, + { + "epoch": 0.71, + "grad_norm": 0.5923949634576661, + "learning_rate": 2.0202852807534072e-07, + "loss": 0.1575, + "step": 11167 + }, + { + "epoch": 0.71, + "grad_norm": 1.0325760284176058, + "learning_rate": 2.0194560270844656e-07, + "loss": 0.2335, + "step": 11168 + }, + { + "epoch": 0.71, + "grad_norm": 1.616832799167069, + "learning_rate": 2.0186269005715238e-07, + "loss": 0.0999, + "step": 11169 + }, + { + "epoch": 0.71, + "grad_norm": 0.24812581154032837, + "learning_rate": 2.0177979012499496e-07, + "loss": 0.0109, + "step": 11170 + }, + { + "epoch": 0.71, + "grad_norm": 0.7395334250891737, + "learning_rate": 2.0169690291551122e-07, + "loss": 0.1651, + "step": 11171 + }, + { + "epoch": 0.71, + "grad_norm": 4.678953458118523, + "learning_rate": 2.0161402843223746e-07, + "loss": 0.2679, + "step": 11172 + }, + { + "epoch": 0.71, + "grad_norm": 1.2235245198901974, + "learning_rate": 2.0153116667870884e-07, + "loss": 0.1278, + "step": 11173 + }, + { + "epoch": 0.71, + "grad_norm": 7.413046949698867, + "learning_rate": 2.0144831765846086e-07, + "loss": 0.0327, + "step": 11174 + }, + { + "epoch": 0.71, + "grad_norm": 4.374152723337225, + "learning_rate": 2.0136548137502762e-07, + "loss": 0.0914, + "step": 11175 + }, + { + "epoch": 0.71, + "grad_norm": 0.5635293048822592, + "learning_rate": 2.0128265783194347e-07, + "loss": 0.2037, + "step": 11176 + }, + { + "epoch": 0.71, + "grad_norm": 0.696824857039066, + "learning_rate": 2.0119984703274145e-07, + "loss": 0.2893, + "step": 11177 + }, + { + "epoch": 0.71, + "grad_norm": 1.4448810036308517, + "learning_rate": 2.0111704898095484e-07, + "loss": 0.299, + "step": 11178 + }, + { + "epoch": 0.71, + "grad_norm": 1.4842321762944772, + "learning_rate": 2.010342636801155e-07, + "loss": 0.2445, + "step": 11179 + }, + { + "epoch": 0.71, + "grad_norm": 1.2315378854292929, + "learning_rate": 2.009514911337557e-07, + "loss": 0.1501, + "step": 11180 + }, + { + "epoch": 0.71, + "grad_norm": 0.9680214184034847, + "learning_rate": 2.0086873134540622e-07, + "loss": 0.1119, + "step": 11181 + }, + { + "epoch": 0.71, + "grad_norm": 0.6481634616597933, + "learning_rate": 2.007859843185982e-07, + "loss": 0.3295, + "step": 11182 + }, + { + "epoch": 0.71, + "grad_norm": 0.8337402788030434, + "learning_rate": 2.0070325005686146e-07, + "loss": 0.4265, + "step": 11183 + }, + { + "epoch": 0.71, + "grad_norm": 1.1277671168274772, + "learning_rate": 2.006205285637258e-07, + "loss": 0.1004, + "step": 11184 + }, + { + "epoch": 0.71, + "grad_norm": 1.1033904171117666, + "learning_rate": 2.0053781984272028e-07, + "loss": 0.2123, + "step": 11185 + }, + { + "epoch": 0.71, + "grad_norm": 2.831684019145874, + "learning_rate": 2.0045512389737317e-07, + "loss": 0.1505, + "step": 11186 + }, + { + "epoch": 0.71, + "grad_norm": 0.9398543804586794, + "learning_rate": 2.0037244073121268e-07, + "loss": 0.0956, + "step": 11187 + }, + { + "epoch": 0.71, + "grad_norm": 0.33626345490035, + "learning_rate": 2.0028977034776617e-07, + "loss": 0.1765, + "step": 11188 + }, + { + "epoch": 0.71, + "grad_norm": 0.7330600531312977, + "learning_rate": 2.0020711275056068e-07, + "loss": 0.2654, + "step": 11189 + }, + { + "epoch": 0.71, + "grad_norm": 5.071430163760048, + "learning_rate": 2.0012446794312232e-07, + "loss": 0.1128, + "step": 11190 + }, + { + "epoch": 0.71, + "grad_norm": 0.4393179279597202, + "learning_rate": 2.000418359289771e-07, + "loss": 0.0701, + "step": 11191 + }, + { + "epoch": 0.71, + "grad_norm": 0.7261389888422688, + "learning_rate": 1.9995921671165e-07, + "loss": 0.2169, + "step": 11192 + }, + { + "epoch": 0.71, + "grad_norm": 3.6970623023922435, + "learning_rate": 1.9987661029466606e-07, + "loss": 0.4113, + "step": 11193 + }, + { + "epoch": 0.71, + "grad_norm": 0.2783452929849227, + "learning_rate": 1.9979401668154905e-07, + "loss": 0.1111, + "step": 11194 + }, + { + "epoch": 0.71, + "grad_norm": 0.7138841297634648, + "learning_rate": 1.9971143587582296e-07, + "loss": 0.1301, + "step": 11195 + }, + { + "epoch": 0.71, + "grad_norm": 0.7634042944827867, + "learning_rate": 1.9962886788101047e-07, + "loss": 0.3775, + "step": 11196 + }, + { + "epoch": 0.71, + "grad_norm": 0.8514063734546307, + "learning_rate": 1.9954631270063455e-07, + "loss": 0.3355, + "step": 11197 + }, + { + "epoch": 0.71, + "grad_norm": 0.36701638430003297, + "learning_rate": 1.9946377033821682e-07, + "loss": 0.1232, + "step": 11198 + }, + { + "epoch": 0.71, + "grad_norm": 0.8138968763301555, + "learning_rate": 1.9938124079727868e-07, + "loss": 0.2306, + "step": 11199 + }, + { + "epoch": 0.71, + "grad_norm": 4.957162666514625, + "learning_rate": 1.9929872408134128e-07, + "loss": 0.01, + "step": 11200 + }, + { + "epoch": 0.71, + "grad_norm": 1.663272276955558, + "learning_rate": 1.9921622019392465e-07, + "loss": 0.3103, + "step": 11201 + }, + { + "epoch": 0.71, + "grad_norm": 0.6185255139594323, + "learning_rate": 1.9913372913854887e-07, + "loss": 0.1556, + "step": 11202 + }, + { + "epoch": 0.71, + "grad_norm": 2.517211480643752, + "learning_rate": 1.9905125091873286e-07, + "loss": 0.1336, + "step": 11203 + }, + { + "epoch": 0.71, + "grad_norm": 4.136301256768579, + "learning_rate": 1.9896878553799552e-07, + "loss": 0.0989, + "step": 11204 + }, + { + "epoch": 0.71, + "grad_norm": 0.5318233109332234, + "learning_rate": 1.9888633299985501e-07, + "loss": 0.0444, + "step": 11205 + }, + { + "epoch": 0.71, + "grad_norm": 0.6815359606972862, + "learning_rate": 1.988038933078287e-07, + "loss": 0.1588, + "step": 11206 + }, + { + "epoch": 0.71, + "grad_norm": 1.064440810281863, + "learning_rate": 1.9872146646543385e-07, + "loss": 0.1855, + "step": 11207 + }, + { + "epoch": 0.71, + "grad_norm": 0.9657875998630698, + "learning_rate": 1.98639052476187e-07, + "loss": 0.1648, + "step": 11208 + }, + { + "epoch": 0.71, + "grad_norm": 0.6574026522270735, + "learning_rate": 1.9855665134360384e-07, + "loss": 0.2409, + "step": 11209 + }, + { + "epoch": 0.71, + "grad_norm": 1.4096503991301412, + "learning_rate": 1.984742630712001e-07, + "loss": 0.1128, + "step": 11210 + }, + { + "epoch": 0.71, + "grad_norm": 1.5949327327874128, + "learning_rate": 1.983918876624902e-07, + "loss": 0.1528, + "step": 11211 + }, + { + "epoch": 0.72, + "grad_norm": 2.2938873804657227, + "learning_rate": 1.9830952512098887e-07, + "loss": 0.3501, + "step": 11212 + }, + { + "epoch": 0.72, + "grad_norm": 0.7971925638047858, + "learning_rate": 1.9822717545020968e-07, + "loss": 0.1957, + "step": 11213 + }, + { + "epoch": 0.72, + "grad_norm": 0.7505609000325931, + "learning_rate": 1.9814483865366565e-07, + "loss": 0.452, + "step": 11214 + }, + { + "epoch": 0.72, + "grad_norm": 1.1554099347542885, + "learning_rate": 1.9806251473486985e-07, + "loss": 0.4074, + "step": 11215 + }, + { + "epoch": 0.72, + "grad_norm": 1.612928200475958, + "learning_rate": 1.9798020369733387e-07, + "loss": 0.2485, + "step": 11216 + }, + { + "epoch": 0.72, + "grad_norm": 1.1516989032552714, + "learning_rate": 1.9789790554456975e-07, + "loss": 0.2485, + "step": 11217 + }, + { + "epoch": 0.72, + "grad_norm": 0.7193849460650937, + "learning_rate": 1.9781562028008815e-07, + "loss": 0.2538, + "step": 11218 + }, + { + "epoch": 0.72, + "grad_norm": 2.1335841146218795, + "learning_rate": 1.9773334790739977e-07, + "loss": 0.3156, + "step": 11219 + }, + { + "epoch": 0.72, + "grad_norm": 0.9071096834049142, + "learning_rate": 1.9765108843001422e-07, + "loss": 0.2181, + "step": 11220 + }, + { + "epoch": 0.72, + "grad_norm": 0.4201123633870905, + "learning_rate": 1.9756884185144124e-07, + "loss": 0.126, + "step": 11221 + }, + { + "epoch": 0.72, + "grad_norm": 8.74766202741582, + "learning_rate": 1.9748660817518924e-07, + "loss": 0.1011, + "step": 11222 + }, + { + "epoch": 0.72, + "grad_norm": 1.5162123783421584, + "learning_rate": 1.9740438740476667e-07, + "loss": 0.0893, + "step": 11223 + }, + { + "epoch": 0.72, + "grad_norm": 0.8705645587691554, + "learning_rate": 1.9732217954368142e-07, + "loss": 0.2566, + "step": 11224 + }, + { + "epoch": 0.72, + "grad_norm": 0.7949028927690419, + "learning_rate": 1.9723998459544027e-07, + "loss": 0.2387, + "step": 11225 + }, + { + "epoch": 0.72, + "grad_norm": 7.079413507175972, + "learning_rate": 1.9715780256355013e-07, + "loss": 0.1022, + "step": 11226 + }, + { + "epoch": 0.72, + "grad_norm": 1.1397131273361476, + "learning_rate": 1.970756334515168e-07, + "loss": 0.0882, + "step": 11227 + }, + { + "epoch": 0.72, + "grad_norm": 0.6633143255294004, + "learning_rate": 1.969934772628461e-07, + "loss": 0.0782, + "step": 11228 + }, + { + "epoch": 0.72, + "grad_norm": 0.7519215211373192, + "learning_rate": 1.9691133400104254e-07, + "loss": 0.145, + "step": 11229 + }, + { + "epoch": 0.72, + "grad_norm": 0.8070537655189025, + "learning_rate": 1.9682920366961098e-07, + "loss": 0.2521, + "step": 11230 + }, + { + "epoch": 0.72, + "grad_norm": 1.1612635234765778, + "learning_rate": 1.9674708627205484e-07, + "loss": 0.4225, + "step": 11231 + }, + { + "epoch": 0.72, + "grad_norm": 0.6298576287659362, + "learning_rate": 1.9666498181187775e-07, + "loss": 0.4885, + "step": 11232 + }, + { + "epoch": 0.72, + "grad_norm": 0.6332193205907097, + "learning_rate": 1.9658289029258218e-07, + "loss": 0.1921, + "step": 11233 + }, + { + "epoch": 0.72, + "grad_norm": 0.9558343194950308, + "learning_rate": 1.9650081171767063e-07, + "loss": 0.2455, + "step": 11234 + }, + { + "epoch": 0.72, + "grad_norm": 8.19114937827609, + "learning_rate": 1.964187460906444e-07, + "loss": 0.1037, + "step": 11235 + }, + { + "epoch": 0.72, + "grad_norm": 3.9883718376886304, + "learning_rate": 1.9633669341500492e-07, + "loss": 0.2626, + "step": 11236 + }, + { + "epoch": 0.72, + "grad_norm": 1.5499604013139279, + "learning_rate": 1.962546536942523e-07, + "loss": 0.3044, + "step": 11237 + }, + { + "epoch": 0.72, + "grad_norm": 0.9565536379392627, + "learning_rate": 1.9617262693188703e-07, + "loss": 0.0567, + "step": 11238 + }, + { + "epoch": 0.72, + "grad_norm": 0.45903994597422537, + "learning_rate": 1.96090613131408e-07, + "loss": 0.0997, + "step": 11239 + }, + { + "epoch": 0.72, + "grad_norm": 0.9446472463424355, + "learning_rate": 1.9600861229631456e-07, + "loss": 0.2746, + "step": 11240 + }, + { + "epoch": 0.72, + "grad_norm": 1.0899076842399467, + "learning_rate": 1.9592662443010466e-07, + "loss": 0.1573, + "step": 11241 + }, + { + "epoch": 0.72, + "grad_norm": 0.718774540486392, + "learning_rate": 1.9584464953627621e-07, + "loss": 0.135, + "step": 11242 + }, + { + "epoch": 0.72, + "grad_norm": 0.6652962312181652, + "learning_rate": 1.957626876183266e-07, + "loss": 0.1435, + "step": 11243 + }, + { + "epoch": 0.72, + "grad_norm": 0.600922276086795, + "learning_rate": 1.9568073867975217e-07, + "loss": 0.2011, + "step": 11244 + }, + { + "epoch": 0.72, + "grad_norm": 0.7627624830365518, + "learning_rate": 1.9559880272404937e-07, + "loss": 0.1666, + "step": 11245 + }, + { + "epoch": 0.72, + "grad_norm": 0.34654085612832575, + "learning_rate": 1.9551687975471337e-07, + "loss": 0.0986, + "step": 11246 + }, + { + "epoch": 0.72, + "grad_norm": 0.44899840641322625, + "learning_rate": 1.9543496977523954e-07, + "loss": 0.2565, + "step": 11247 + }, + { + "epoch": 0.72, + "grad_norm": 4.913897145425443, + "learning_rate": 1.95353072789122e-07, + "loss": 0.0238, + "step": 11248 + }, + { + "epoch": 0.72, + "grad_norm": 0.5162736989873675, + "learning_rate": 1.9527118879985498e-07, + "loss": 0.0036, + "step": 11249 + }, + { + "epoch": 0.72, + "grad_norm": 1.4226692172058273, + "learning_rate": 1.9518931781093145e-07, + "loss": 0.2843, + "step": 11250 + }, + { + "epoch": 0.72, + "grad_norm": 0.7849040542404795, + "learning_rate": 1.9510745982584452e-07, + "loss": 0.1482, + "step": 11251 + }, + { + "epoch": 0.72, + "grad_norm": 0.9902365652357928, + "learning_rate": 1.9502561484808612e-07, + "loss": 0.0985, + "step": 11252 + }, + { + "epoch": 0.72, + "grad_norm": 1.4831877618226033, + "learning_rate": 1.9494378288114816e-07, + "loss": 0.2056, + "step": 11253 + }, + { + "epoch": 0.72, + "grad_norm": 2.2272332823805097, + "learning_rate": 1.948619639285217e-07, + "loss": 0.257, + "step": 11254 + }, + { + "epoch": 0.72, + "grad_norm": 1.3251243194056916, + "learning_rate": 1.9478015799369711e-07, + "loss": 0.2012, + "step": 11255 + }, + { + "epoch": 0.72, + "grad_norm": 1.0066410634636844, + "learning_rate": 1.9469836508016475e-07, + "loss": 0.242, + "step": 11256 + }, + { + "epoch": 0.72, + "grad_norm": 0.44470279731483114, + "learning_rate": 1.9461658519141368e-07, + "loss": 0.1918, + "step": 11257 + }, + { + "epoch": 0.72, + "grad_norm": 0.9107649630220954, + "learning_rate": 1.9453481833093298e-07, + "loss": 0.2812, + "step": 11258 + }, + { + "epoch": 0.72, + "grad_norm": 1.8946927979470785, + "learning_rate": 1.94453064502211e-07, + "loss": 0.1912, + "step": 11259 + }, + { + "epoch": 0.72, + "grad_norm": 0.46426163624716443, + "learning_rate": 1.9437132370873565e-07, + "loss": 0.0628, + "step": 11260 + }, + { + "epoch": 0.72, + "grad_norm": 0.6507929639380663, + "learning_rate": 1.9428959595399385e-07, + "loss": 0.2525, + "step": 11261 + }, + { + "epoch": 0.72, + "grad_norm": 0.8699678919806615, + "learning_rate": 1.9420788124147264e-07, + "loss": 0.1519, + "step": 11262 + }, + { + "epoch": 0.72, + "grad_norm": 0.8073902475761016, + "learning_rate": 1.9412617957465777e-07, + "loss": 0.0252, + "step": 11263 + }, + { + "epoch": 0.72, + "grad_norm": 0.34797149864818405, + "learning_rate": 1.9404449095703512e-07, + "loss": 0.1325, + "step": 11264 + }, + { + "epoch": 0.72, + "grad_norm": 0.8124671326892062, + "learning_rate": 1.9396281539208937e-07, + "loss": 0.2661, + "step": 11265 + }, + { + "epoch": 0.72, + "grad_norm": 4.250268017497989, + "learning_rate": 1.9388115288330526e-07, + "loss": 0.0244, + "step": 11266 + }, + { + "epoch": 0.72, + "grad_norm": 0.6363677745593981, + "learning_rate": 1.9379950343416656e-07, + "loss": 0.1499, + "step": 11267 + }, + { + "epoch": 0.72, + "grad_norm": 1.158412443101855, + "learning_rate": 1.9371786704815645e-07, + "loss": 0.3666, + "step": 11268 + }, + { + "epoch": 0.72, + "grad_norm": 1.3719723092275262, + "learning_rate": 1.93636243728758e-07, + "loss": 0.2845, + "step": 11269 + }, + { + "epoch": 0.72, + "grad_norm": 1.8101719245667807, + "learning_rate": 1.9355463347945305e-07, + "loss": 0.2269, + "step": 11270 + }, + { + "epoch": 0.72, + "grad_norm": 1.3636657007926116, + "learning_rate": 1.934730363037237e-07, + "loss": 0.2541, + "step": 11271 + }, + { + "epoch": 0.72, + "grad_norm": 0.5334193361930671, + "learning_rate": 1.933914522050506e-07, + "loss": 0.1261, + "step": 11272 + }, + { + "epoch": 0.72, + "grad_norm": 1.9624972864915775, + "learning_rate": 1.933098811869147e-07, + "loss": 0.2005, + "step": 11273 + }, + { + "epoch": 0.72, + "grad_norm": 0.4614518387255773, + "learning_rate": 1.9322832325279558e-07, + "loss": 0.1322, + "step": 11274 + }, + { + "epoch": 0.72, + "grad_norm": 0.35987495361670174, + "learning_rate": 1.931467784061731e-07, + "loss": 0.0917, + "step": 11275 + }, + { + "epoch": 0.72, + "grad_norm": 0.3751024634717288, + "learning_rate": 1.930652466505257e-07, + "loss": 0.2347, + "step": 11276 + }, + { + "epoch": 0.72, + "grad_norm": 13.791205175829184, + "learning_rate": 1.9298372798933193e-07, + "loss": 0.1651, + "step": 11277 + }, + { + "epoch": 0.72, + "grad_norm": 0.5227542350670976, + "learning_rate": 1.9290222242606946e-07, + "loss": 0.0922, + "step": 11278 + }, + { + "epoch": 0.72, + "grad_norm": 0.39238810146444875, + "learning_rate": 1.9282072996421577e-07, + "loss": 0.1329, + "step": 11279 + }, + { + "epoch": 0.72, + "grad_norm": 1.8366118709173087, + "learning_rate": 1.92739250607247e-07, + "loss": 0.1964, + "step": 11280 + }, + { + "epoch": 0.72, + "grad_norm": 0.8545381031191396, + "learning_rate": 1.9265778435863967e-07, + "loss": 0.2031, + "step": 11281 + }, + { + "epoch": 0.72, + "grad_norm": 0.5449701366996338, + "learning_rate": 1.925763312218691e-07, + "loss": 0.1661, + "step": 11282 + }, + { + "epoch": 0.72, + "grad_norm": 0.3373603343141682, + "learning_rate": 1.9249489120041007e-07, + "loss": 0.4064, + "step": 11283 + }, + { + "epoch": 0.72, + "grad_norm": 1.7810633429509313, + "learning_rate": 1.924134642977373e-07, + "loss": 0.2826, + "step": 11284 + }, + { + "epoch": 0.72, + "grad_norm": 0.5539469380350881, + "learning_rate": 1.9233205051732431e-07, + "loss": 0.3932, + "step": 11285 + }, + { + "epoch": 0.72, + "grad_norm": 1.0544716698876138, + "learning_rate": 1.9225064986264473e-07, + "loss": 0.2043, + "step": 11286 + }, + { + "epoch": 0.72, + "grad_norm": 1.4614833600306727, + "learning_rate": 1.9216926233717084e-07, + "loss": 0.0585, + "step": 11287 + }, + { + "epoch": 0.72, + "grad_norm": 1.2431582573504985, + "learning_rate": 1.920878879443753e-07, + "loss": 0.3204, + "step": 11288 + }, + { + "epoch": 0.72, + "grad_norm": 0.7717165785479521, + "learning_rate": 1.9200652668772922e-07, + "loss": 0.4098, + "step": 11289 + }, + { + "epoch": 0.72, + "grad_norm": 1.27975978791258, + "learning_rate": 1.9192517857070402e-07, + "loss": 0.3203, + "step": 11290 + }, + { + "epoch": 0.72, + "grad_norm": 0.5398482857016399, + "learning_rate": 1.9184384359676986e-07, + "loss": 0.1996, + "step": 11291 + }, + { + "epoch": 0.72, + "grad_norm": 0.8720094451731067, + "learning_rate": 1.9176252176939696e-07, + "loss": 0.3041, + "step": 11292 + }, + { + "epoch": 0.72, + "grad_norm": 1.40526003036035, + "learning_rate": 1.916812130920543e-07, + "loss": 0.2855, + "step": 11293 + }, + { + "epoch": 0.72, + "grad_norm": 0.7492185619361499, + "learning_rate": 1.9159991756821097e-07, + "loss": 0.1638, + "step": 11294 + }, + { + "epoch": 0.72, + "grad_norm": 3.8863006868943675, + "learning_rate": 1.9151863520133527e-07, + "loss": 0.0419, + "step": 11295 + }, + { + "epoch": 0.72, + "grad_norm": 1.2509224150024811, + "learning_rate": 1.914373659948945e-07, + "loss": 0.0916, + "step": 11296 + }, + { + "epoch": 0.72, + "grad_norm": 1.734187662025367, + "learning_rate": 1.9135610995235618e-07, + "loss": 0.2635, + "step": 11297 + }, + { + "epoch": 0.72, + "grad_norm": 0.18010752757270784, + "learning_rate": 1.9127486707718648e-07, + "loss": 0.0547, + "step": 11298 + }, + { + "epoch": 0.72, + "grad_norm": 0.3652400883110703, + "learning_rate": 1.9119363737285177e-07, + "loss": 0.0909, + "step": 11299 + }, + { + "epoch": 0.72, + "grad_norm": 0.8036609510321918, + "learning_rate": 1.911124208428171e-07, + "loss": 0.1154, + "step": 11300 + }, + { + "epoch": 0.72, + "grad_norm": 0.9157834260756635, + "learning_rate": 1.9103121749054767e-07, + "loss": 0.1498, + "step": 11301 + }, + { + "epoch": 0.72, + "grad_norm": 0.44321398656095967, + "learning_rate": 1.9095002731950738e-07, + "loss": 0.1158, + "step": 11302 + }, + { + "epoch": 0.72, + "grad_norm": 1.6170540323583107, + "learning_rate": 1.9086885033316042e-07, + "loss": 0.2886, + "step": 11303 + }, + { + "epoch": 0.72, + "grad_norm": 1.156481405504817, + "learning_rate": 1.9078768653496957e-07, + "loss": 0.3127, + "step": 11304 + }, + { + "epoch": 0.72, + "grad_norm": 1.371487411194727, + "learning_rate": 1.9070653592839774e-07, + "loss": 0.0578, + "step": 11305 + }, + { + "epoch": 0.72, + "grad_norm": 1.4820827101920844, + "learning_rate": 1.906253985169067e-07, + "loss": 0.2871, + "step": 11306 + }, + { + "epoch": 0.72, + "grad_norm": 8.743525703499758, + "learning_rate": 1.9054427430395825e-07, + "loss": 0.2672, + "step": 11307 + }, + { + "epoch": 0.72, + "grad_norm": 1.3482562172310517, + "learning_rate": 1.904631632930131e-07, + "loss": 0.0837, + "step": 11308 + }, + { + "epoch": 0.72, + "grad_norm": 0.7090816574899531, + "learning_rate": 1.9038206548753156e-07, + "loss": 0.3691, + "step": 11309 + }, + { + "epoch": 0.72, + "grad_norm": 0.6656375800217357, + "learning_rate": 1.9030098089097345e-07, + "loss": 0.1905, + "step": 11310 + }, + { + "epoch": 0.72, + "grad_norm": 0.9876900600288355, + "learning_rate": 1.9021990950679821e-07, + "loss": 0.0927, + "step": 11311 + }, + { + "epoch": 0.72, + "grad_norm": 0.6965926457653281, + "learning_rate": 1.901388513384643e-07, + "loss": 0.1765, + "step": 11312 + }, + { + "epoch": 0.72, + "grad_norm": 0.423903362526783, + "learning_rate": 1.900578063894298e-07, + "loss": 0.0063, + "step": 11313 + }, + { + "epoch": 0.72, + "grad_norm": 1.227600379231426, + "learning_rate": 1.8997677466315253e-07, + "loss": 0.2589, + "step": 11314 + }, + { + "epoch": 0.72, + "grad_norm": 1.3112065002380018, + "learning_rate": 1.8989575616308916e-07, + "loss": 0.1935, + "step": 11315 + }, + { + "epoch": 0.72, + "grad_norm": 1.5071865985285988, + "learning_rate": 1.898147508926964e-07, + "loss": 0.5045, + "step": 11316 + }, + { + "epoch": 0.72, + "grad_norm": 0.6361604339442415, + "learning_rate": 1.8973375885542963e-07, + "loss": 0.2456, + "step": 11317 + }, + { + "epoch": 0.72, + "grad_norm": 0.49806507043030224, + "learning_rate": 1.8965278005474473e-07, + "loss": 0.2286, + "step": 11318 + }, + { + "epoch": 0.72, + "grad_norm": 0.5761747681338725, + "learning_rate": 1.8957181449409582e-07, + "loss": 0.0065, + "step": 11319 + }, + { + "epoch": 0.72, + "grad_norm": 0.7775147011624235, + "learning_rate": 1.894908621769376e-07, + "loss": 0.0907, + "step": 11320 + }, + { + "epoch": 0.72, + "grad_norm": 0.3591939137242455, + "learning_rate": 1.8940992310672315e-07, + "loss": 0.2522, + "step": 11321 + }, + { + "epoch": 0.72, + "grad_norm": 0.975254351357844, + "learning_rate": 1.89328997286906e-07, + "loss": 0.2037, + "step": 11322 + }, + { + "epoch": 0.72, + "grad_norm": 1.0850047961635327, + "learning_rate": 1.892480847209383e-07, + "loss": 0.183, + "step": 11323 + }, + { + "epoch": 0.72, + "grad_norm": 0.9839293972390819, + "learning_rate": 1.8916718541227185e-07, + "loss": 0.3225, + "step": 11324 + }, + { + "epoch": 0.72, + "grad_norm": 0.7497703514236261, + "learning_rate": 1.8908629936435827e-07, + "loss": 0.1687, + "step": 11325 + }, + { + "epoch": 0.72, + "grad_norm": 0.2926129797374673, + "learning_rate": 1.8900542658064805e-07, + "loss": 0.1619, + "step": 11326 + }, + { + "epoch": 0.72, + "grad_norm": 1.8220806442208703, + "learning_rate": 1.8892456706459163e-07, + "loss": 0.133, + "step": 11327 + }, + { + "epoch": 0.72, + "grad_norm": 1.0574883016902499, + "learning_rate": 1.8884372081963835e-07, + "loss": 0.0633, + "step": 11328 + }, + { + "epoch": 0.72, + "grad_norm": 0.775531667402893, + "learning_rate": 1.8876288784923745e-07, + "loss": 0.0403, + "step": 11329 + }, + { + "epoch": 0.72, + "grad_norm": 3.709690766797677, + "learning_rate": 1.8868206815683763e-07, + "loss": 0.0914, + "step": 11330 + }, + { + "epoch": 0.72, + "grad_norm": 0.9142820871214081, + "learning_rate": 1.8860126174588636e-07, + "loss": 0.2878, + "step": 11331 + }, + { + "epoch": 0.72, + "grad_norm": 1.953645651570006, + "learning_rate": 1.8852046861983134e-07, + "loss": 0.0113, + "step": 11332 + }, + { + "epoch": 0.72, + "grad_norm": 0.5171814683896292, + "learning_rate": 1.884396887821194e-07, + "loss": 0.1535, + "step": 11333 + }, + { + "epoch": 0.72, + "grad_norm": 0.9780494805901289, + "learning_rate": 1.883589222361965e-07, + "loss": 0.3118, + "step": 11334 + }, + { + "epoch": 0.72, + "grad_norm": 0.4719540877982585, + "learning_rate": 1.8827816898550863e-07, + "loss": 0.1827, + "step": 11335 + }, + { + "epoch": 0.72, + "grad_norm": 0.5489758011674478, + "learning_rate": 1.8819742903350068e-07, + "loss": 0.1459, + "step": 11336 + }, + { + "epoch": 0.72, + "grad_norm": 1.3944660254622898, + "learning_rate": 1.8811670238361703e-07, + "loss": 0.1734, + "step": 11337 + }, + { + "epoch": 0.72, + "grad_norm": 0.577398856012256, + "learning_rate": 1.8803598903930205e-07, + "loss": 0.0038, + "step": 11338 + }, + { + "epoch": 0.72, + "grad_norm": 0.821320998314224, + "learning_rate": 1.8795528900399872e-07, + "loss": 0.2236, + "step": 11339 + }, + { + "epoch": 0.72, + "grad_norm": 0.7722991064448721, + "learning_rate": 1.878746022811502e-07, + "loss": 0.1255, + "step": 11340 + }, + { + "epoch": 0.72, + "grad_norm": 0.36684766591212525, + "learning_rate": 1.8779392887419843e-07, + "loss": 0.1759, + "step": 11341 + }, + { + "epoch": 0.72, + "grad_norm": 2.3443206899330606, + "learning_rate": 1.8771326878658545e-07, + "loss": 0.374, + "step": 11342 + }, + { + "epoch": 0.72, + "grad_norm": 0.8821809580335769, + "learning_rate": 1.8763262202175202e-07, + "loss": 0.2174, + "step": 11343 + }, + { + "epoch": 0.72, + "grad_norm": 18.54331626413916, + "learning_rate": 1.8755198858313903e-07, + "loss": 0.1073, + "step": 11344 + }, + { + "epoch": 0.72, + "grad_norm": 1.3406909347627889, + "learning_rate": 1.874713684741861e-07, + "loss": 0.1718, + "step": 11345 + }, + { + "epoch": 0.72, + "grad_norm": 1.7917407665217508, + "learning_rate": 1.8739076169833308e-07, + "loss": 0.1452, + "step": 11346 + }, + { + "epoch": 0.72, + "grad_norm": 0.7331813446289264, + "learning_rate": 1.8731016825901842e-07, + "loss": 0.2365, + "step": 11347 + }, + { + "epoch": 0.72, + "grad_norm": 1.423763078498533, + "learning_rate": 1.8722958815968054e-07, + "loss": 0.1211, + "step": 11348 + }, + { + "epoch": 0.72, + "grad_norm": 9.238856469415873, + "learning_rate": 1.871490214037572e-07, + "loss": 0.0339, + "step": 11349 + }, + { + "epoch": 0.72, + "grad_norm": 0.38855925532029184, + "learning_rate": 1.8706846799468568e-07, + "loss": 0.0896, + "step": 11350 + }, + { + "epoch": 0.72, + "grad_norm": 1.112503105090476, + "learning_rate": 1.8698792793590235e-07, + "loss": 0.1634, + "step": 11351 + }, + { + "epoch": 0.72, + "grad_norm": 0.6381406907226377, + "learning_rate": 1.8690740123084315e-07, + "loss": 0.0893, + "step": 11352 + }, + { + "epoch": 0.72, + "grad_norm": 0.9092041439344964, + "learning_rate": 1.868268878829437e-07, + "loss": 0.2248, + "step": 11353 + }, + { + "epoch": 0.72, + "grad_norm": 5.6039574825094425, + "learning_rate": 1.8674638789563869e-07, + "loss": 0.1151, + "step": 11354 + }, + { + "epoch": 0.72, + "grad_norm": 0.7360579684385566, + "learning_rate": 1.866659012723626e-07, + "loss": 0.2566, + "step": 11355 + }, + { + "epoch": 0.72, + "grad_norm": 0.8850195778522192, + "learning_rate": 1.8658542801654887e-07, + "loss": 0.1602, + "step": 11356 + }, + { + "epoch": 0.72, + "grad_norm": 0.7711100564924428, + "learning_rate": 1.8650496813163096e-07, + "loss": 0.0459, + "step": 11357 + }, + { + "epoch": 0.72, + "grad_norm": 4.383151805890166, + "learning_rate": 1.864245216210412e-07, + "loss": 0.0569, + "step": 11358 + }, + { + "epoch": 0.72, + "grad_norm": 1.6917029957197063, + "learning_rate": 1.8634408848821186e-07, + "loss": 0.0785, + "step": 11359 + }, + { + "epoch": 0.72, + "grad_norm": 0.8159387198461046, + "learning_rate": 1.8626366873657413e-07, + "loss": 0.1036, + "step": 11360 + }, + { + "epoch": 0.72, + "grad_norm": 2.240459356947686, + "learning_rate": 1.8618326236955906e-07, + "loss": 0.1267, + "step": 11361 + }, + { + "epoch": 0.72, + "grad_norm": 7.495084438111064, + "learning_rate": 1.8610286939059676e-07, + "loss": 0.1815, + "step": 11362 + }, + { + "epoch": 0.72, + "grad_norm": 2.692074811260936, + "learning_rate": 1.860224898031172e-07, + "loss": 0.1091, + "step": 11363 + }, + { + "epoch": 0.72, + "grad_norm": 3.4801473968529875, + "learning_rate": 1.8594212361054922e-07, + "loss": 0.0339, + "step": 11364 + }, + { + "epoch": 0.72, + "grad_norm": 1.0618883813365898, + "learning_rate": 1.8586177081632158e-07, + "loss": 0.3165, + "step": 11365 + }, + { + "epoch": 0.72, + "grad_norm": 21.182772692848125, + "learning_rate": 1.8578143142386248e-07, + "loss": 0.2846, + "step": 11366 + }, + { + "epoch": 0.72, + "grad_norm": 1.5496385039487668, + "learning_rate": 1.8570110543659907e-07, + "loss": 0.4394, + "step": 11367 + }, + { + "epoch": 0.72, + "grad_norm": 0.8117317447610425, + "learning_rate": 1.856207928579584e-07, + "loss": 0.1669, + "step": 11368 + }, + { + "epoch": 0.73, + "grad_norm": 0.6162402209721237, + "learning_rate": 1.8554049369136655e-07, + "loss": 0.0026, + "step": 11369 + }, + { + "epoch": 0.73, + "grad_norm": 0.7646794922934054, + "learning_rate": 1.8546020794024954e-07, + "loss": 0.1577, + "step": 11370 + }, + { + "epoch": 0.73, + "grad_norm": 0.79595816006261, + "learning_rate": 1.853799356080322e-07, + "loss": 0.3337, + "step": 11371 + }, + { + "epoch": 0.73, + "grad_norm": 1.1560967344600477, + "learning_rate": 1.8529967669813945e-07, + "loss": 0.2162, + "step": 11372 + }, + { + "epoch": 0.73, + "grad_norm": 1.9609528498824436, + "learning_rate": 1.8521943121399497e-07, + "loss": 0.3063, + "step": 11373 + }, + { + "epoch": 0.73, + "grad_norm": 0.7273985933698603, + "learning_rate": 1.8513919915902248e-07, + "loss": 0.2677, + "step": 11374 + }, + { + "epoch": 0.73, + "grad_norm": 0.8282398453406957, + "learning_rate": 1.8505898053664455e-07, + "loss": 0.28, + "step": 11375 + }, + { + "epoch": 0.73, + "grad_norm": 10.672463712150755, + "learning_rate": 1.849787753502838e-07, + "loss": 0.1784, + "step": 11376 + }, + { + "epoch": 0.73, + "grad_norm": 0.5570384879018646, + "learning_rate": 1.848985836033617e-07, + "loss": 0.1562, + "step": 11377 + }, + { + "epoch": 0.73, + "grad_norm": 5.666323930178118, + "learning_rate": 1.8481840529929938e-07, + "loss": 0.223, + "step": 11378 + }, + { + "epoch": 0.73, + "grad_norm": 1.0651202670771267, + "learning_rate": 1.8473824044151758e-07, + "loss": 0.0999, + "step": 11379 + }, + { + "epoch": 0.73, + "grad_norm": 10.857961344166354, + "learning_rate": 1.8465808903343606e-07, + "loss": 0.1848, + "step": 11380 + }, + { + "epoch": 0.73, + "grad_norm": 0.3676116604325516, + "learning_rate": 1.8457795107847435e-07, + "loss": 0.0898, + "step": 11381 + }, + { + "epoch": 0.73, + "grad_norm": 10.171026555137782, + "learning_rate": 1.8449782658005152e-07, + "loss": 0.3671, + "step": 11382 + }, + { + "epoch": 0.73, + "grad_norm": 0.5106940084247752, + "learning_rate": 1.8441771554158554e-07, + "loss": 0.1456, + "step": 11383 + }, + { + "epoch": 0.73, + "grad_norm": 0.7994769509285218, + "learning_rate": 1.843376179664941e-07, + "loss": 0.2311, + "step": 11384 + }, + { + "epoch": 0.73, + "grad_norm": 1.5945871463968089, + "learning_rate": 1.842575338581947e-07, + "loss": 0.2475, + "step": 11385 + }, + { + "epoch": 0.73, + "grad_norm": 1.3367533168219845, + "learning_rate": 1.8417746322010342e-07, + "loss": 0.2572, + "step": 11386 + }, + { + "epoch": 0.73, + "grad_norm": 0.212180939838843, + "learning_rate": 1.8409740605563662e-07, + "loss": 0.0048, + "step": 11387 + }, + { + "epoch": 0.73, + "grad_norm": 3.529879221363101, + "learning_rate": 1.8401736236820931e-07, + "loss": 0.0993, + "step": 11388 + }, + { + "epoch": 0.73, + "grad_norm": 0.5938335393745138, + "learning_rate": 1.8393733216123675e-07, + "loss": 0.2476, + "step": 11389 + }, + { + "epoch": 0.73, + "grad_norm": 0.2951656965333097, + "learning_rate": 1.8385731543813278e-07, + "loss": 0.0676, + "step": 11390 + }, + { + "epoch": 0.73, + "grad_norm": 1.1143514561504615, + "learning_rate": 1.837773122023114e-07, + "loss": 0.1534, + "step": 11391 + }, + { + "epoch": 0.73, + "grad_norm": 0.3308697602123853, + "learning_rate": 1.8369732245718562e-07, + "loss": 0.1973, + "step": 11392 + }, + { + "epoch": 0.73, + "grad_norm": 0.354457923485755, + "learning_rate": 1.836173462061677e-07, + "loss": 0.0941, + "step": 11393 + }, + { + "epoch": 0.73, + "grad_norm": 0.6274520460016499, + "learning_rate": 1.8353738345267e-07, + "loss": 0.1106, + "step": 11394 + }, + { + "epoch": 0.73, + "grad_norm": 0.5221636998211159, + "learning_rate": 1.834574342001035e-07, + "loss": 0.3306, + "step": 11395 + }, + { + "epoch": 0.73, + "grad_norm": 7.324097496094751, + "learning_rate": 1.8337749845187934e-07, + "loss": 0.0962, + "step": 11396 + }, + { + "epoch": 0.73, + "grad_norm": 0.675354180486095, + "learning_rate": 1.8329757621140746e-07, + "loss": 0.0131, + "step": 11397 + }, + { + "epoch": 0.73, + "grad_norm": 1.5524860551788853, + "learning_rate": 1.8321766748209778e-07, + "loss": 0.1393, + "step": 11398 + }, + { + "epoch": 0.73, + "grad_norm": 0.5590398221356457, + "learning_rate": 1.8313777226735904e-07, + "loss": 0.1419, + "step": 11399 + }, + { + "epoch": 0.73, + "grad_norm": 0.5652153307301052, + "learning_rate": 1.8305789057059995e-07, + "loss": 0.1761, + "step": 11400 + }, + { + "epoch": 0.73, + "grad_norm": 1.1758393193850527, + "learning_rate": 1.8297802239522847e-07, + "loss": 0.133, + "step": 11401 + }, + { + "epoch": 0.73, + "grad_norm": 0.7437972372207307, + "learning_rate": 1.8289816774465178e-07, + "loss": 0.1527, + "step": 11402 + }, + { + "epoch": 0.73, + "grad_norm": 1.3463829145471895, + "learning_rate": 1.8281832662227665e-07, + "loss": 0.6341, + "step": 11403 + }, + { + "epoch": 0.73, + "grad_norm": 1.2889171271947935, + "learning_rate": 1.8273849903150946e-07, + "loss": 0.0184, + "step": 11404 + }, + { + "epoch": 0.73, + "grad_norm": 1.3106307738070322, + "learning_rate": 1.8265868497575576e-07, + "loss": 0.0877, + "step": 11405 + }, + { + "epoch": 0.73, + "grad_norm": 0.5275145363797861, + "learning_rate": 1.8257888445842023e-07, + "loss": 0.2441, + "step": 11406 + }, + { + "epoch": 0.73, + "grad_norm": 0.11567962623004249, + "learning_rate": 1.824990974829078e-07, + "loss": 0.0019, + "step": 11407 + }, + { + "epoch": 0.73, + "grad_norm": 0.3797639917060166, + "learning_rate": 1.82419324052622e-07, + "loss": 0.1215, + "step": 11408 + }, + { + "epoch": 0.73, + "grad_norm": 6.989969077827051, + "learning_rate": 1.823395641709664e-07, + "loss": 0.3756, + "step": 11409 + }, + { + "epoch": 0.73, + "grad_norm": 0.8942893852458406, + "learning_rate": 1.8225981784134336e-07, + "loss": 0.0271, + "step": 11410 + }, + { + "epoch": 0.73, + "grad_norm": 1.9581676941039796, + "learning_rate": 1.8218008506715544e-07, + "loss": 0.0194, + "step": 11411 + }, + { + "epoch": 0.73, + "grad_norm": 0.600710384600168, + "learning_rate": 1.8210036585180383e-07, + "loss": 0.0841, + "step": 11412 + }, + { + "epoch": 0.73, + "grad_norm": 2.112113519964818, + "learning_rate": 1.8202066019868979e-07, + "loss": 0.4596, + "step": 11413 + }, + { + "epoch": 0.73, + "grad_norm": 1.0098642959598396, + "learning_rate": 1.8194096811121346e-07, + "loss": 0.1137, + "step": 11414 + }, + { + "epoch": 0.73, + "grad_norm": 0.250568542399658, + "learning_rate": 1.8186128959277497e-07, + "loss": 0.0919, + "step": 11415 + }, + { + "epoch": 0.73, + "grad_norm": 0.9247788958931612, + "learning_rate": 1.8178162464677328e-07, + "loss": 0.2528, + "step": 11416 + }, + { + "epoch": 0.73, + "grad_norm": 0.4988442802089735, + "learning_rate": 1.817019732766073e-07, + "loss": 0.0702, + "step": 11417 + }, + { + "epoch": 0.73, + "grad_norm": 1.0537970175279456, + "learning_rate": 1.8162233548567486e-07, + "loss": 0.3904, + "step": 11418 + }, + { + "epoch": 0.73, + "grad_norm": 1.4204321740646035, + "learning_rate": 1.8154271127737357e-07, + "loss": 0.0892, + "step": 11419 + }, + { + "epoch": 0.73, + "grad_norm": 0.7301438334478592, + "learning_rate": 1.814631006551006e-07, + "loss": 0.1586, + "step": 11420 + }, + { + "epoch": 0.73, + "grad_norm": 4.607254441125836, + "learning_rate": 1.813835036222519e-07, + "loss": 0.2037, + "step": 11421 + }, + { + "epoch": 0.73, + "grad_norm": 1.4555645094974445, + "learning_rate": 1.8130392018222362e-07, + "loss": 0.1012, + "step": 11422 + }, + { + "epoch": 0.73, + "grad_norm": 0.5832270929414122, + "learning_rate": 1.812243503384106e-07, + "loss": 0.2479, + "step": 11423 + }, + { + "epoch": 0.73, + "grad_norm": 0.9202720334256785, + "learning_rate": 1.811447940942078e-07, + "loss": 0.2745, + "step": 11424 + }, + { + "epoch": 0.73, + "grad_norm": 0.8422475057212134, + "learning_rate": 1.810652514530089e-07, + "loss": 0.1688, + "step": 11425 + }, + { + "epoch": 0.73, + "grad_norm": 1.267528395624066, + "learning_rate": 1.8098572241820764e-07, + "loss": 0.1757, + "step": 11426 + }, + { + "epoch": 0.73, + "grad_norm": 0.42829894387744827, + "learning_rate": 1.8090620699319658e-07, + "loss": 0.1465, + "step": 11427 + }, + { + "epoch": 0.73, + "grad_norm": 1.3229782021811989, + "learning_rate": 1.8082670518136839e-07, + "loss": 0.1382, + "step": 11428 + }, + { + "epoch": 0.73, + "grad_norm": 13.010708927575749, + "learning_rate": 1.807472169861144e-07, + "loss": 0.4057, + "step": 11429 + }, + { + "epoch": 0.73, + "grad_norm": 0.46407104850424413, + "learning_rate": 1.806677424108261e-07, + "loss": 0.1411, + "step": 11430 + }, + { + "epoch": 0.73, + "grad_norm": 2.3753019193240372, + "learning_rate": 1.8058828145889367e-07, + "loss": 0.2436, + "step": 11431 + }, + { + "epoch": 0.73, + "grad_norm": 2.601650084403245, + "learning_rate": 1.8050883413370738e-07, + "loss": 0.0615, + "step": 11432 + }, + { + "epoch": 0.73, + "grad_norm": 0.10839587137874689, + "learning_rate": 1.8042940043865655e-07, + "loss": 0.0044, + "step": 11433 + }, + { + "epoch": 0.73, + "grad_norm": 1.94888379640534, + "learning_rate": 1.8034998037712967e-07, + "loss": 0.2202, + "step": 11434 + }, + { + "epoch": 0.73, + "grad_norm": 1.0849138426443623, + "learning_rate": 1.8027057395251528e-07, + "loss": 0.4132, + "step": 11435 + }, + { + "epoch": 0.73, + "grad_norm": 1.4985725447357996, + "learning_rate": 1.8019118116820091e-07, + "loss": 0.1153, + "step": 11436 + }, + { + "epoch": 0.73, + "grad_norm": 1.0127380697798904, + "learning_rate": 1.8011180202757382e-07, + "loss": 0.0446, + "step": 11437 + }, + { + "epoch": 0.73, + "grad_norm": 0.21527906885469447, + "learning_rate": 1.8003243653402013e-07, + "loss": 0.1046, + "step": 11438 + }, + { + "epoch": 0.73, + "grad_norm": 0.5323627967733878, + "learning_rate": 1.7995308469092608e-07, + "loss": 0.0247, + "step": 11439 + }, + { + "epoch": 0.73, + "grad_norm": 1.1324239374290164, + "learning_rate": 1.7987374650167664e-07, + "loss": 0.1062, + "step": 11440 + }, + { + "epoch": 0.73, + "grad_norm": 1.2843530420456393, + "learning_rate": 1.797944219696569e-07, + "loss": 0.0342, + "step": 11441 + }, + { + "epoch": 0.73, + "grad_norm": 1.1114536875301562, + "learning_rate": 1.797151110982506e-07, + "loss": 0.2956, + "step": 11442 + }, + { + "epoch": 0.73, + "grad_norm": 1.5086169328961192, + "learning_rate": 1.7963581389084175e-07, + "loss": 0.1714, + "step": 11443 + }, + { + "epoch": 0.73, + "grad_norm": 0.919644543589692, + "learning_rate": 1.7955653035081287e-07, + "loss": 0.249, + "step": 11444 + }, + { + "epoch": 0.73, + "grad_norm": 0.42747007523985503, + "learning_rate": 1.7947726048154676e-07, + "loss": 0.0351, + "step": 11445 + }, + { + "epoch": 0.73, + "grad_norm": 0.7128928853881998, + "learning_rate": 1.793980042864251e-07, + "loss": 0.2259, + "step": 11446 + }, + { + "epoch": 0.73, + "grad_norm": 2.726557080274698, + "learning_rate": 1.7931876176882883e-07, + "loss": 0.1929, + "step": 11447 + }, + { + "epoch": 0.73, + "grad_norm": 2.1988796467009433, + "learning_rate": 1.79239532932139e-07, + "loss": 0.255, + "step": 11448 + }, + { + "epoch": 0.73, + "grad_norm": 1.0306298807776058, + "learning_rate": 1.791603177797354e-07, + "loss": 0.3409, + "step": 11449 + }, + { + "epoch": 0.73, + "grad_norm": 0.8206713277340335, + "learning_rate": 1.7908111631499772e-07, + "loss": 0.2763, + "step": 11450 + }, + { + "epoch": 0.73, + "grad_norm": 0.906822347935492, + "learning_rate": 1.7900192854130464e-07, + "loss": 0.1326, + "step": 11451 + }, + { + "epoch": 0.73, + "grad_norm": 0.7884778066634541, + "learning_rate": 1.789227544620347e-07, + "loss": 0.1866, + "step": 11452 + }, + { + "epoch": 0.73, + "grad_norm": 0.4145810436910572, + "learning_rate": 1.7884359408056532e-07, + "loss": 0.031, + "step": 11453 + }, + { + "epoch": 0.73, + "grad_norm": 0.6210063643046614, + "learning_rate": 1.7876444740027384e-07, + "loss": 0.1273, + "step": 11454 + }, + { + "epoch": 0.73, + "grad_norm": 0.9570963601069166, + "learning_rate": 1.7868531442453677e-07, + "loss": 0.23, + "step": 11455 + }, + { + "epoch": 0.73, + "grad_norm": 0.9909922353503083, + "learning_rate": 1.7860619515673032e-07, + "loss": 0.3218, + "step": 11456 + }, + { + "epoch": 0.73, + "grad_norm": 0.436526150111342, + "learning_rate": 1.785270896002295e-07, + "loss": 0.0065, + "step": 11457 + }, + { + "epoch": 0.73, + "grad_norm": 1.4145354786040616, + "learning_rate": 1.784479977584094e-07, + "loss": 0.1989, + "step": 11458 + }, + { + "epoch": 0.73, + "grad_norm": 1.0745897571282355, + "learning_rate": 1.78368919634644e-07, + "loss": 0.1913, + "step": 11459 + }, + { + "epoch": 0.73, + "grad_norm": 2.6166655301826944, + "learning_rate": 1.7828985523230722e-07, + "loss": 0.3184, + "step": 11460 + }, + { + "epoch": 0.73, + "grad_norm": 0.5085237659998391, + "learning_rate": 1.782108045547719e-07, + "loss": 0.0833, + "step": 11461 + }, + { + "epoch": 0.73, + "grad_norm": 0.3943343845860209, + "learning_rate": 1.7813176760541037e-07, + "loss": 0.2125, + "step": 11462 + }, + { + "epoch": 0.73, + "grad_norm": 1.625016401705777, + "learning_rate": 1.7805274438759482e-07, + "loss": 0.0823, + "step": 11463 + }, + { + "epoch": 0.73, + "grad_norm": 2.275557332623727, + "learning_rate": 1.7797373490469624e-07, + "loss": 0.2726, + "step": 11464 + }, + { + "epoch": 0.73, + "grad_norm": 1.0708090834780841, + "learning_rate": 1.7789473916008568e-07, + "loss": 0.3473, + "step": 11465 + }, + { + "epoch": 0.73, + "grad_norm": 1.224580636042428, + "learning_rate": 1.7781575715713287e-07, + "loss": 0.2007, + "step": 11466 + }, + { + "epoch": 0.73, + "grad_norm": 0.8447087835760301, + "learning_rate": 1.7773678889920774e-07, + "loss": 0.0485, + "step": 11467 + }, + { + "epoch": 0.73, + "grad_norm": 4.765010029835095, + "learning_rate": 1.776578343896788e-07, + "loss": 0.0837, + "step": 11468 + }, + { + "epoch": 0.73, + "grad_norm": 0.4329833036547865, + "learning_rate": 1.775788936319148e-07, + "loss": 0.1843, + "step": 11469 + }, + { + "epoch": 0.73, + "grad_norm": 0.5821034965963179, + "learning_rate": 1.7749996662928317e-07, + "loss": 0.1486, + "step": 11470 + }, + { + "epoch": 0.73, + "grad_norm": 1.621258290149381, + "learning_rate": 1.7742105338515135e-07, + "loss": 0.3242, + "step": 11471 + }, + { + "epoch": 0.73, + "grad_norm": 1.129873094277355, + "learning_rate": 1.7734215390288598e-07, + "loss": 0.2612, + "step": 11472 + }, + { + "epoch": 0.73, + "grad_norm": 0.49494560339519383, + "learning_rate": 1.7726326818585275e-07, + "loss": 0.0985, + "step": 11473 + }, + { + "epoch": 0.73, + "grad_norm": 0.08218488388321017, + "learning_rate": 1.771843962374175e-07, + "loss": 0.0018, + "step": 11474 + }, + { + "epoch": 0.73, + "grad_norm": 2.6028904083027404, + "learning_rate": 1.7710553806094465e-07, + "loss": 0.2327, + "step": 11475 + }, + { + "epoch": 0.73, + "grad_norm": 12.263165722367507, + "learning_rate": 1.770266936597988e-07, + "loss": 0.2769, + "step": 11476 + }, + { + "epoch": 0.73, + "grad_norm": 1.504074391140485, + "learning_rate": 1.7694786303734326e-07, + "loss": 0.2694, + "step": 11477 + }, + { + "epoch": 0.73, + "grad_norm": 1.7620545644939543, + "learning_rate": 1.7686904619694154e-07, + "loss": 0.0712, + "step": 11478 + }, + { + "epoch": 0.73, + "grad_norm": 0.7622744474898832, + "learning_rate": 1.7679024314195567e-07, + "loss": 0.2432, + "step": 11479 + }, + { + "epoch": 0.73, + "grad_norm": 0.9690519051497881, + "learning_rate": 1.7671145387574798e-07, + "loss": 0.4384, + "step": 11480 + }, + { + "epoch": 0.73, + "grad_norm": 0.6862667021500114, + "learning_rate": 1.7663267840167934e-07, + "loss": 0.3638, + "step": 11481 + }, + { + "epoch": 0.73, + "grad_norm": 0.46122680833365964, + "learning_rate": 1.7655391672311088e-07, + "loss": 0.1538, + "step": 11482 + }, + { + "epoch": 0.73, + "grad_norm": 10.401978850357725, + "learning_rate": 1.7647516884340235e-07, + "loss": 0.2038, + "step": 11483 + }, + { + "epoch": 0.73, + "grad_norm": 2.1790502517707164, + "learning_rate": 1.763964347659137e-07, + "loss": 0.267, + "step": 11484 + }, + { + "epoch": 0.73, + "grad_norm": 0.9168656808320996, + "learning_rate": 1.7631771449400346e-07, + "loss": 0.2097, + "step": 11485 + }, + { + "epoch": 0.73, + "grad_norm": 0.5945793788160334, + "learning_rate": 1.7623900803103036e-07, + "loss": 0.2263, + "step": 11486 + }, + { + "epoch": 0.73, + "grad_norm": 1.7329442615887327, + "learning_rate": 1.7616031538035187e-07, + "loss": 0.1219, + "step": 11487 + }, + { + "epoch": 0.73, + "grad_norm": 0.7730545468778828, + "learning_rate": 1.7608163654532548e-07, + "loss": 0.3914, + "step": 11488 + }, + { + "epoch": 0.73, + "grad_norm": 0.5839277464470503, + "learning_rate": 1.760029715293075e-07, + "loss": 0.0079, + "step": 11489 + }, + { + "epoch": 0.73, + "grad_norm": 0.5553975994262143, + "learning_rate": 1.7592432033565407e-07, + "loss": 0.2914, + "step": 11490 + }, + { + "epoch": 0.73, + "grad_norm": 7.576759306034706, + "learning_rate": 1.7584568296772073e-07, + "loss": 0.0139, + "step": 11491 + }, + { + "epoch": 0.73, + "grad_norm": 0.7765750848745309, + "learning_rate": 1.7576705942886206e-07, + "loss": 0.1938, + "step": 11492 + }, + { + "epoch": 0.73, + "grad_norm": 0.7595096649079054, + "learning_rate": 1.7568844972243257e-07, + "loss": 0.0817, + "step": 11493 + }, + { + "epoch": 0.73, + "grad_norm": 1.6963008825521375, + "learning_rate": 1.7560985385178561e-07, + "loss": 0.1989, + "step": 11494 + }, + { + "epoch": 0.73, + "grad_norm": 5.610266027254652, + "learning_rate": 1.7553127182027456e-07, + "loss": 0.2918, + "step": 11495 + }, + { + "epoch": 0.73, + "grad_norm": 2.4262986009962106, + "learning_rate": 1.7545270363125153e-07, + "loss": 0.1154, + "step": 11496 + }, + { + "epoch": 0.73, + "grad_norm": 1.1687380590174075, + "learning_rate": 1.7537414928806876e-07, + "loss": 0.299, + "step": 11497 + }, + { + "epoch": 0.73, + "grad_norm": 0.3245236831780917, + "learning_rate": 1.7529560879407718e-07, + "loss": 0.1032, + "step": 11498 + }, + { + "epoch": 0.73, + "grad_norm": 1.3085804959006284, + "learning_rate": 1.7521708215262788e-07, + "loss": 0.0966, + "step": 11499 + }, + { + "epoch": 0.73, + "grad_norm": 0.5337805967239032, + "learning_rate": 1.751385693670705e-07, + "loss": 0.0953, + "step": 11500 + } + ], + "logging_steps": 1.0, + "max_steps": 15681, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 1464249587613696.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}