{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 5532, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00036153289949385393, "grad_norm": 3.6605888466350693, "learning_rate": 0.0, "loss": 0.1924, "step": 1 }, { "epoch": 0.0007230657989877079, "grad_norm": 4.7286637737438335, "learning_rate": 3.6101083032490976e-08, "loss": 0.1846, "step": 2 }, { "epoch": 0.0010845986984815619, "grad_norm": 3.263956469813783, "learning_rate": 7.220216606498195e-08, "loss": 0.0986, "step": 3 }, { "epoch": 0.0014461315979754157, "grad_norm": 3.5302678156135103, "learning_rate": 1.0830324909747293e-07, "loss": 0.373, "step": 4 }, { "epoch": 0.0018076644974692696, "grad_norm": 3.749980941071601, "learning_rate": 1.444043321299639e-07, "loss": 0.3086, "step": 5 }, { "epoch": 0.0021691973969631237, "grad_norm": 4.855489408552276, "learning_rate": 1.805054151624549e-07, "loss": 0.2109, "step": 6 }, { "epoch": 0.0025307302964569776, "grad_norm": 3.947042151855127, "learning_rate": 2.1660649819494586e-07, "loss": 0.3496, "step": 7 }, { "epoch": 0.0028922631959508315, "grad_norm": 4.088508922330489, "learning_rate": 2.527075812274368e-07, "loss": 0.4004, "step": 8 }, { "epoch": 0.0032537960954446853, "grad_norm": 4.2973844760826045, "learning_rate": 2.888086642599278e-07, "loss": 0.2949, "step": 9 }, { "epoch": 0.0036153289949385392, "grad_norm": 3.5126007911631847, "learning_rate": 3.2490974729241875e-07, "loss": 0.1504, "step": 10 }, { "epoch": 0.0039768618944323935, "grad_norm": 3.607958802293626, "learning_rate": 3.610108303249098e-07, "loss": 0.1904, "step": 11 }, { "epoch": 0.004338394793926247, "grad_norm": 4.546779650140174, "learning_rate": 3.971119133574008e-07, "loss": 0.1621, "step": 12 }, { "epoch": 0.004699927693420101, "grad_norm": 3.122968629661034, "learning_rate": 4.332129963898917e-07, "loss": 0.1079, "step": 13 }, { "epoch": 0.005061460592913955, "grad_norm": 3.736762558153893, "learning_rate": 4.693140794223827e-07, "loss": 0.3008, "step": 14 }, { "epoch": 0.005422993492407809, "grad_norm": 3.7724534250464767, "learning_rate": 5.054151624548736e-07, "loss": 0.1914, "step": 15 }, { "epoch": 0.005784526391901663, "grad_norm": 4.224349184593737, "learning_rate": 5.415162454873646e-07, "loss": 0.5156, "step": 16 }, { "epoch": 0.006146059291395517, "grad_norm": 3.489482549541498, "learning_rate": 5.776173285198556e-07, "loss": 0.3301, "step": 17 }, { "epoch": 0.006507592190889371, "grad_norm": 3.794629280858908, "learning_rate": 6.137184115523466e-07, "loss": 0.3262, "step": 18 }, { "epoch": 0.006869125090383225, "grad_norm": 4.232467522494649, "learning_rate": 6.498194945848375e-07, "loss": 0.2246, "step": 19 }, { "epoch": 0.0072306579898770785, "grad_norm": 3.5077380768516115, "learning_rate": 6.859205776173286e-07, "loss": 0.3691, "step": 20 }, { "epoch": 0.007592190889370932, "grad_norm": 4.438049042057491, "learning_rate": 7.220216606498196e-07, "loss": 0.1924, "step": 21 }, { "epoch": 0.007953723788864787, "grad_norm": 11.360423037562322, "learning_rate": 7.581227436823105e-07, "loss": 0.1611, "step": 22 }, { "epoch": 0.008315256688358641, "grad_norm": 3.377493542387236, "learning_rate": 7.942238267148016e-07, "loss": 0.2969, "step": 23 }, { "epoch": 0.008676789587852495, "grad_norm": 3.957672150568997, "learning_rate": 8.303249097472924e-07, "loss": 0.4844, "step": 24 }, { "epoch": 0.009038322487346349, "grad_norm": 2.757346747767065, "learning_rate": 8.664259927797834e-07, "loss": 0.1143, "step": 25 }, { "epoch": 0.009399855386840203, "grad_norm": 3.186898101664302, "learning_rate": 9.025270758122745e-07, "loss": 0.1777, "step": 26 }, { "epoch": 0.009761388286334056, "grad_norm": 3.8719764026840475, "learning_rate": 9.386281588447654e-07, "loss": 0.3125, "step": 27 }, { "epoch": 0.01012292118582791, "grad_norm": 3.8434430811575306, "learning_rate": 9.747292418772564e-07, "loss": 0.4375, "step": 28 }, { "epoch": 0.010484454085321764, "grad_norm": 4.887789918898309, "learning_rate": 1.0108303249097473e-06, "loss": 0.3789, "step": 29 }, { "epoch": 0.010845986984815618, "grad_norm": 4.374622828184665, "learning_rate": 1.0469314079422384e-06, "loss": 0.2363, "step": 30 }, { "epoch": 0.011207519884309472, "grad_norm": 4.327069256206455, "learning_rate": 1.0830324909747293e-06, "loss": 0.3105, "step": 31 }, { "epoch": 0.011569052783803326, "grad_norm": 3.610471382096879, "learning_rate": 1.1191335740072204e-06, "loss": 0.3086, "step": 32 }, { "epoch": 0.01193058568329718, "grad_norm": 3.4595355978167324, "learning_rate": 1.1552346570397112e-06, "loss": 0.1953, "step": 33 }, { "epoch": 0.012292118582791034, "grad_norm": 3.633128319973182, "learning_rate": 1.1913357400722023e-06, "loss": 0.3691, "step": 34 }, { "epoch": 0.012653651482284888, "grad_norm": 3.827743481348059, "learning_rate": 1.2274368231046932e-06, "loss": 0.3086, "step": 35 }, { "epoch": 0.013015184381778741, "grad_norm": 4.417650512809707, "learning_rate": 1.263537906137184e-06, "loss": 0.1855, "step": 36 }, { "epoch": 0.013376717281272595, "grad_norm": 4.856436625430131, "learning_rate": 1.299638989169675e-06, "loss": 3.7812, "step": 37 }, { "epoch": 0.01373825018076645, "grad_norm": 3.854688942273255, "learning_rate": 1.3357400722021663e-06, "loss": 0.2451, "step": 38 }, { "epoch": 0.014099783080260303, "grad_norm": 6.157073542717689, "learning_rate": 1.3718411552346572e-06, "loss": 0.4102, "step": 39 }, { "epoch": 0.014461315979754157, "grad_norm": 9.68514602926723, "learning_rate": 1.407942238267148e-06, "loss": 0.168, "step": 40 }, { "epoch": 0.01482284887924801, "grad_norm": 3.5987428972706037, "learning_rate": 1.4440433212996392e-06, "loss": 0.3184, "step": 41 }, { "epoch": 0.015184381778741865, "grad_norm": 1.3132865861875302, "learning_rate": 1.48014440433213e-06, "loss": 0.0349, "step": 42 }, { "epoch": 0.01554591467823572, "grad_norm": 3.1015885869991644, "learning_rate": 1.516245487364621e-06, "loss": 0.1719, "step": 43 }, { "epoch": 0.015907447577729574, "grad_norm": 4.446995464239063, "learning_rate": 1.5523465703971122e-06, "loss": 0.1787, "step": 44 }, { "epoch": 0.016268980477223426, "grad_norm": 3.613890996700636, "learning_rate": 1.5884476534296031e-06, "loss": 0.2422, "step": 45 }, { "epoch": 0.016630513376717282, "grad_norm": 3.4161090819941133, "learning_rate": 1.624548736462094e-06, "loss": 0.124, "step": 46 }, { "epoch": 0.016992046276211134, "grad_norm": 6.508335666660144, "learning_rate": 1.6606498194945849e-06, "loss": 0.1816, "step": 47 }, { "epoch": 0.01735357917570499, "grad_norm": 4.112097490041782, "learning_rate": 1.696750902527076e-06, "loss": 0.2109, "step": 48 }, { "epoch": 0.017715112075198842, "grad_norm": 1.9612265463242018, "learning_rate": 1.7328519855595669e-06, "loss": 0.1514, "step": 49 }, { "epoch": 0.018076644974692697, "grad_norm": 10.115572841411181, "learning_rate": 1.7689530685920577e-06, "loss": 0.125, "step": 50 }, { "epoch": 0.01843817787418655, "grad_norm": 3.5662412093268396, "learning_rate": 1.805054151624549e-06, "loss": 0.1855, "step": 51 }, { "epoch": 0.018799710773680405, "grad_norm": 1.907051786878937, "learning_rate": 1.84115523465704e-06, "loss": 0.0967, "step": 52 }, { "epoch": 0.019161243673174257, "grad_norm": 1.9828616654535387, "learning_rate": 1.8772563176895308e-06, "loss": 0.3086, "step": 53 }, { "epoch": 0.019522776572668113, "grad_norm": 2.2461171113508316, "learning_rate": 1.9133574007220217e-06, "loss": 0.2988, "step": 54 }, { "epoch": 0.019884309472161965, "grad_norm": 1.0476692010405984, "learning_rate": 1.949458483754513e-06, "loss": 0.0239, "step": 55 }, { "epoch": 0.02024584237165582, "grad_norm": 0.7463791879628975, "learning_rate": 1.9855595667870035e-06, "loss": 0.0194, "step": 56 }, { "epoch": 0.020607375271149676, "grad_norm": 0.9652128995517196, "learning_rate": 2.0216606498194946e-06, "loss": 0.0435, "step": 57 }, { "epoch": 0.02096890817064353, "grad_norm": 0.036151360515368734, "learning_rate": 2.0577617328519857e-06, "loss": 0.0031, "step": 58 }, { "epoch": 0.021330441070137384, "grad_norm": 0.4171038140326745, "learning_rate": 2.0938628158844768e-06, "loss": 0.016, "step": 59 }, { "epoch": 0.021691973969631236, "grad_norm": 1.548761606921243, "learning_rate": 2.129963898916968e-06, "loss": 0.1338, "step": 60 }, { "epoch": 0.022053506869125092, "grad_norm": 1.7355809382520426, "learning_rate": 2.1660649819494585e-06, "loss": 0.0366, "step": 61 }, { "epoch": 0.022415039768618944, "grad_norm": 9.116915680780515, "learning_rate": 2.2021660649819496e-06, "loss": 3.6719, "step": 62 }, { "epoch": 0.0227765726681128, "grad_norm": 0.1400465870733404, "learning_rate": 2.2382671480144407e-06, "loss": 0.007, "step": 63 }, { "epoch": 0.023138105567606652, "grad_norm": 0.27653957302229665, "learning_rate": 2.274368231046932e-06, "loss": 0.0107, "step": 64 }, { "epoch": 0.023499638467100507, "grad_norm": 1.2169899660406236, "learning_rate": 2.3104693140794225e-06, "loss": 0.0967, "step": 65 }, { "epoch": 0.02386117136659436, "grad_norm": 0.7397943751755311, "learning_rate": 2.3465703971119136e-06, "loss": 0.042, "step": 66 }, { "epoch": 0.024222704266088215, "grad_norm": 0.21695591619389237, "learning_rate": 2.3826714801444047e-06, "loss": 0.0109, "step": 67 }, { "epoch": 0.024584237165582067, "grad_norm": 0.18282322200591783, "learning_rate": 2.4187725631768953e-06, "loss": 0.0098, "step": 68 }, { "epoch": 0.024945770065075923, "grad_norm": 1.5789240725407354, "learning_rate": 2.4548736462093864e-06, "loss": 0.0674, "step": 69 }, { "epoch": 0.025307302964569775, "grad_norm": 1.2632210095127947, "learning_rate": 2.4909747292418775e-06, "loss": 0.0693, "step": 70 }, { "epoch": 0.02566883586406363, "grad_norm": 0.39259255266841475, "learning_rate": 2.527075812274368e-06, "loss": 0.0291, "step": 71 }, { "epoch": 0.026030368763557483, "grad_norm": 0.0921060531847573, "learning_rate": 2.5631768953068593e-06, "loss": 0.005, "step": 72 }, { "epoch": 0.02639190166305134, "grad_norm": 1.2607347948826981, "learning_rate": 2.59927797833935e-06, "loss": 0.0786, "step": 73 }, { "epoch": 0.02675343456254519, "grad_norm": 0.14028552185215687, "learning_rate": 2.6353790613718415e-06, "loss": 0.006, "step": 74 }, { "epoch": 0.027114967462039046, "grad_norm": 2.184708865303066, "learning_rate": 2.6714801444043326e-06, "loss": 0.2158, "step": 75 }, { "epoch": 0.0274765003615329, "grad_norm": 1.9094956685886104, "learning_rate": 2.7075812274368237e-06, "loss": 0.2158, "step": 76 }, { "epoch": 0.027838033261026754, "grad_norm": 0.4983219921846807, "learning_rate": 2.7436823104693144e-06, "loss": 0.0247, "step": 77 }, { "epoch": 0.028199566160520606, "grad_norm": 0.3850375269614138, "learning_rate": 2.7797833935018055e-06, "loss": 0.0211, "step": 78 }, { "epoch": 0.02856109906001446, "grad_norm": 1.756746605737981, "learning_rate": 2.815884476534296e-06, "loss": 0.1426, "step": 79 }, { "epoch": 0.028922631959508314, "grad_norm": 0.08127159945015831, "learning_rate": 2.8519855595667872e-06, "loss": 0.0037, "step": 80 }, { "epoch": 0.02928416485900217, "grad_norm": 1.1821840023792478, "learning_rate": 2.8880866425992783e-06, "loss": 0.0337, "step": 81 }, { "epoch": 0.02964569775849602, "grad_norm": 0.15916062176456147, "learning_rate": 2.924187725631769e-06, "loss": 0.008, "step": 82 }, { "epoch": 0.030007230657989877, "grad_norm": 1.2012436510806208, "learning_rate": 2.96028880866426e-06, "loss": 0.0654, "step": 83 }, { "epoch": 0.03036876355748373, "grad_norm": 2.029651360701056, "learning_rate": 2.996389891696751e-06, "loss": 0.1621, "step": 84 }, { "epoch": 0.030730296456977585, "grad_norm": 0.7755972393487421, "learning_rate": 3.032490974729242e-06, "loss": 0.033, "step": 85 }, { "epoch": 0.03109182935647144, "grad_norm": 0.19346877417570849, "learning_rate": 3.068592057761733e-06, "loss": 0.0128, "step": 86 }, { "epoch": 0.031453362255965296, "grad_norm": 1.1963139256936253, "learning_rate": 3.1046931407942245e-06, "loss": 0.0601, "step": 87 }, { "epoch": 0.03181489515545915, "grad_norm": 0.06066573768761247, "learning_rate": 3.140794223826715e-06, "loss": 0.0034, "step": 88 }, { "epoch": 0.032176428054953, "grad_norm": 0.8688922272452481, "learning_rate": 3.1768953068592062e-06, "loss": 0.0547, "step": 89 }, { "epoch": 0.03253796095444685, "grad_norm": 1.4224794404320202, "learning_rate": 3.2129963898916973e-06, "loss": 0.0669, "step": 90 }, { "epoch": 0.03289949385394071, "grad_norm": 2.701399316688088, "learning_rate": 3.249097472924188e-06, "loss": 0.1738, "step": 91 }, { "epoch": 0.033261026753434564, "grad_norm": 0.07368279743019308, "learning_rate": 3.285198555956679e-06, "loss": 0.0039, "step": 92 }, { "epoch": 0.033622559652928416, "grad_norm": 2.017726897542109, "learning_rate": 3.3212996389891698e-06, "loss": 0.2598, "step": 93 }, { "epoch": 0.03398409255242227, "grad_norm": 2.093159805452815, "learning_rate": 3.357400722021661e-06, "loss": 0.3789, "step": 94 }, { "epoch": 0.03434562545191613, "grad_norm": 0.07761694824190786, "learning_rate": 3.393501805054152e-06, "loss": 0.0051, "step": 95 }, { "epoch": 0.03470715835140998, "grad_norm": 0.10437347076633914, "learning_rate": 3.4296028880866426e-06, "loss": 0.0043, "step": 96 }, { "epoch": 0.03506869125090383, "grad_norm": 0.2910934330045492, "learning_rate": 3.4657039711191337e-06, "loss": 0.0156, "step": 97 }, { "epoch": 0.035430224150397684, "grad_norm": 0.2870315143808603, "learning_rate": 3.501805054151625e-06, "loss": 0.0178, "step": 98 }, { "epoch": 0.03579175704989154, "grad_norm": 0.46792304402446167, "learning_rate": 3.5379061371841155e-06, "loss": 0.028, "step": 99 }, { "epoch": 0.036153289949385395, "grad_norm": 0.03286806761567982, "learning_rate": 3.574007220216607e-06, "loss": 0.0019, "step": 100 }, { "epoch": 0.03651482284887925, "grad_norm": 1.2136860322689467, "learning_rate": 3.610108303249098e-06, "loss": 0.0776, "step": 101 }, { "epoch": 0.0368763557483731, "grad_norm": 0.07253659074127626, "learning_rate": 3.6462093862815888e-06, "loss": 0.0038, "step": 102 }, { "epoch": 0.03723788864786696, "grad_norm": 2.1403186898898903, "learning_rate": 3.68231046931408e-06, "loss": 0.2617, "step": 103 }, { "epoch": 0.03759942154736081, "grad_norm": 0.22392099615857705, "learning_rate": 3.718411552346571e-06, "loss": 0.0098, "step": 104 }, { "epoch": 0.03796095444685466, "grad_norm": 0.11783618277431182, "learning_rate": 3.7545126353790616e-06, "loss": 0.0063, "step": 105 }, { "epoch": 0.038322487346348515, "grad_norm": 0.23773640685829375, "learning_rate": 3.7906137184115527e-06, "loss": 0.0165, "step": 106 }, { "epoch": 0.038684020245842374, "grad_norm": 0.6915004888904146, "learning_rate": 3.826714801444043e-06, "loss": 0.0291, "step": 107 }, { "epoch": 0.039045553145336226, "grad_norm": 0.36244456317725854, "learning_rate": 3.862815884476535e-06, "loss": 0.0153, "step": 108 }, { "epoch": 0.03940708604483008, "grad_norm": 11.717423952058446, "learning_rate": 3.898916967509026e-06, "loss": 3.2812, "step": 109 }, { "epoch": 0.03976861894432393, "grad_norm": 0.11031541123044156, "learning_rate": 3.935018050541516e-06, "loss": 0.0058, "step": 110 }, { "epoch": 0.04013015184381779, "grad_norm": 1.1528569811866702, "learning_rate": 3.971119133574007e-06, "loss": 0.054, "step": 111 }, { "epoch": 0.04049168474331164, "grad_norm": 0.1981991807717241, "learning_rate": 4.0072202166064985e-06, "loss": 0.0112, "step": 112 }, { "epoch": 0.040853217642805494, "grad_norm": 1.4407448308863193, "learning_rate": 4.043321299638989e-06, "loss": 0.1621, "step": 113 }, { "epoch": 0.04121475054229935, "grad_norm": 0.5029277949641657, "learning_rate": 4.079422382671481e-06, "loss": 0.0271, "step": 114 }, { "epoch": 0.041576283441793205, "grad_norm": 1.0785309373949357, "learning_rate": 4.115523465703971e-06, "loss": 0.0493, "step": 115 }, { "epoch": 0.04193781634128706, "grad_norm": 2.1055676605894904, "learning_rate": 4.151624548736463e-06, "loss": 0.125, "step": 116 }, { "epoch": 0.04229934924078091, "grad_norm": 0.7371565391327046, "learning_rate": 4.1877256317689535e-06, "loss": 0.0408, "step": 117 }, { "epoch": 0.04266088214027477, "grad_norm": 0.8702402908789656, "learning_rate": 4.223826714801444e-06, "loss": 0.0415, "step": 118 }, { "epoch": 0.04302241503976862, "grad_norm": 3.0613051744676616, "learning_rate": 4.259927797833936e-06, "loss": 0.1426, "step": 119 }, { "epoch": 0.04338394793926247, "grad_norm": 0.42952080439505574, "learning_rate": 4.296028880866426e-06, "loss": 0.0266, "step": 120 }, { "epoch": 0.043745480838756325, "grad_norm": 0.7090837144245338, "learning_rate": 4.332129963898917e-06, "loss": 0.0374, "step": 121 }, { "epoch": 0.044107013738250184, "grad_norm": 1.3264516376841167, "learning_rate": 4.3682310469314086e-06, "loss": 0.0869, "step": 122 }, { "epoch": 0.044468546637744036, "grad_norm": 0.4506487066032254, "learning_rate": 4.404332129963899e-06, "loss": 0.0271, "step": 123 }, { "epoch": 0.04483007953723789, "grad_norm": 0.4195246671827901, "learning_rate": 4.44043321299639e-06, "loss": 0.0189, "step": 124 }, { "epoch": 0.04519161243673174, "grad_norm": 0.301095798905983, "learning_rate": 4.4765342960288814e-06, "loss": 0.0173, "step": 125 }, { "epoch": 0.0455531453362256, "grad_norm": 0.2451738492488577, "learning_rate": 4.512635379061372e-06, "loss": 0.0145, "step": 126 }, { "epoch": 0.04591467823571945, "grad_norm": 2.011170365330564, "learning_rate": 4.548736462093864e-06, "loss": 0.1934, "step": 127 }, { "epoch": 0.046276211135213303, "grad_norm": 0.4222859280036451, "learning_rate": 4.584837545126354e-06, "loss": 0.0167, "step": 128 }, { "epoch": 0.046637744034707156, "grad_norm": 1.232340135455101, "learning_rate": 4.620938628158845e-06, "loss": 0.0591, "step": 129 }, { "epoch": 0.046999276934201015, "grad_norm": 0.27204986030543926, "learning_rate": 4.6570397111913365e-06, "loss": 0.0132, "step": 130 }, { "epoch": 0.04736080983369487, "grad_norm": 0.28130152087192667, "learning_rate": 4.693140794223827e-06, "loss": 0.0121, "step": 131 }, { "epoch": 0.04772234273318872, "grad_norm": 0.17940740377489436, "learning_rate": 4.729241877256318e-06, "loss": 0.0086, "step": 132 }, { "epoch": 0.04808387563268257, "grad_norm": 0.6216000722261609, "learning_rate": 4.765342960288809e-06, "loss": 0.033, "step": 133 }, { "epoch": 0.04844540853217643, "grad_norm": 0.6351451381282092, "learning_rate": 4.8014440433213e-06, "loss": 0.0234, "step": 134 }, { "epoch": 0.04880694143167028, "grad_norm": 0.7751474441463657, "learning_rate": 4.837545126353791e-06, "loss": 0.0366, "step": 135 }, { "epoch": 0.049168474331164135, "grad_norm": 0.1568460778847388, "learning_rate": 4.873646209386282e-06, "loss": 0.0033, "step": 136 }, { "epoch": 0.04953000723065799, "grad_norm": 0.8670987365289148, "learning_rate": 4.909747292418773e-06, "loss": 0.0364, "step": 137 }, { "epoch": 0.049891540130151846, "grad_norm": 2.078992110366091, "learning_rate": 4.9458483754512636e-06, "loss": 0.1426, "step": 138 }, { "epoch": 0.0502530730296457, "grad_norm": 0.07809413039878006, "learning_rate": 4.981949458483755e-06, "loss": 0.0052, "step": 139 }, { "epoch": 0.05061460592913955, "grad_norm": 0.14137121852322102, "learning_rate": 5.018050541516246e-06, "loss": 0.0069, "step": 140 }, { "epoch": 0.0509761388286334, "grad_norm": 0.6227051856703806, "learning_rate": 5.054151624548736e-06, "loss": 0.0232, "step": 141 }, { "epoch": 0.05133767172812726, "grad_norm": 0.8678987932473161, "learning_rate": 5.090252707581228e-06, "loss": 0.0325, "step": 142 }, { "epoch": 0.05169920462762111, "grad_norm": 0.1262307995086629, "learning_rate": 5.126353790613719e-06, "loss": 0.0068, "step": 143 }, { "epoch": 0.052060737527114966, "grad_norm": 0.046562087586063096, "learning_rate": 5.16245487364621e-06, "loss": 0.0029, "step": 144 }, { "epoch": 0.052422270426608825, "grad_norm": 0.053082000069478885, "learning_rate": 5.1985559566787e-06, "loss": 0.003, "step": 145 }, { "epoch": 0.05278380332610268, "grad_norm": 1.786997630731407, "learning_rate": 5.2346570397111915e-06, "loss": 0.0771, "step": 146 }, { "epoch": 0.05314533622559653, "grad_norm": 0.042428537201088364, "learning_rate": 5.270758122743683e-06, "loss": 0.0026, "step": 147 }, { "epoch": 0.05350686912509038, "grad_norm": 2.278154706278263, "learning_rate": 5.306859205776174e-06, "loss": 0.4258, "step": 148 }, { "epoch": 0.05386840202458424, "grad_norm": 4.280736972909031, "learning_rate": 5.342960288808665e-06, "loss": 0.2471, "step": 149 }, { "epoch": 0.05422993492407809, "grad_norm": 2.0293704739161256, "learning_rate": 5.379061371841156e-06, "loss": 0.1064, "step": 150 }, { "epoch": 0.054591467823571944, "grad_norm": 2.663647064942399, "learning_rate": 5.415162454873647e-06, "loss": 0.1709, "step": 151 }, { "epoch": 0.0549530007230658, "grad_norm": 1.5254630221319587, "learning_rate": 5.451263537906137e-06, "loss": 0.0645, "step": 152 }, { "epoch": 0.055314533622559656, "grad_norm": 2.2055768659947605, "learning_rate": 5.487364620938629e-06, "loss": 0.248, "step": 153 }, { "epoch": 0.05567606652205351, "grad_norm": 0.10401289735231972, "learning_rate": 5.523465703971119e-06, "loss": 0.0054, "step": 154 }, { "epoch": 0.05603759942154736, "grad_norm": 0.13136152261786382, "learning_rate": 5.559566787003611e-06, "loss": 0.0048, "step": 155 }, { "epoch": 0.05639913232104121, "grad_norm": 1.2568623265273653, "learning_rate": 5.595667870036101e-06, "loss": 0.0535, "step": 156 }, { "epoch": 0.05676066522053507, "grad_norm": 0.18693424067739858, "learning_rate": 5.631768953068592e-06, "loss": 0.0084, "step": 157 }, { "epoch": 0.05712219812002892, "grad_norm": 0.125383602480092, "learning_rate": 5.667870036101083e-06, "loss": 0.0082, "step": 158 }, { "epoch": 0.057483731019522775, "grad_norm": 1.447587124694376, "learning_rate": 5.7039711191335744e-06, "loss": 0.0835, "step": 159 }, { "epoch": 0.05784526391901663, "grad_norm": 0.6996131493346345, "learning_rate": 5.740072202166066e-06, "loss": 0.0309, "step": 160 }, { "epoch": 0.05820679681851049, "grad_norm": 0.506943643540143, "learning_rate": 5.776173285198557e-06, "loss": 0.0242, "step": 161 }, { "epoch": 0.05856832971800434, "grad_norm": 2.0216986904916467, "learning_rate": 5.812274368231048e-06, "loss": 0.248, "step": 162 }, { "epoch": 0.05892986261749819, "grad_norm": 2.35855148849068, "learning_rate": 5.848375451263538e-06, "loss": 0.332, "step": 163 }, { "epoch": 0.05929139551699204, "grad_norm": 0.25239475585382953, "learning_rate": 5.8844765342960295e-06, "loss": 0.0133, "step": 164 }, { "epoch": 0.0596529284164859, "grad_norm": 0.6177356012159485, "learning_rate": 5.92057761732852e-06, "loss": 0.0232, "step": 165 }, { "epoch": 0.060014461315979754, "grad_norm": 0.05140409417357922, "learning_rate": 5.956678700361012e-06, "loss": 0.0031, "step": 166 }, { "epoch": 0.060375994215473607, "grad_norm": 0.3079500786722837, "learning_rate": 5.992779783393502e-06, "loss": 0.0145, "step": 167 }, { "epoch": 0.06073752711496746, "grad_norm": 0.11120638457912364, "learning_rate": 6.028880866425994e-06, "loss": 0.0063, "step": 168 }, { "epoch": 0.06109906001446132, "grad_norm": 0.0317784392939129, "learning_rate": 6.064981949458484e-06, "loss": 0.0018, "step": 169 }, { "epoch": 0.06146059291395517, "grad_norm": 0.22820338111542132, "learning_rate": 6.101083032490975e-06, "loss": 0.0105, "step": 170 }, { "epoch": 0.06182212581344902, "grad_norm": 1.02118507340362, "learning_rate": 6.137184115523466e-06, "loss": 0.0483, "step": 171 }, { "epoch": 0.06218365871294288, "grad_norm": 0.8784066890378911, "learning_rate": 6.173285198555957e-06, "loss": 0.0442, "step": 172 }, { "epoch": 0.06254519161243673, "grad_norm": 1.7416218216278252, "learning_rate": 6.209386281588449e-06, "loss": 0.1426, "step": 173 }, { "epoch": 0.06290672451193059, "grad_norm": 2.199539531454235, "learning_rate": 6.245487364620939e-06, "loss": 0.2832, "step": 174 }, { "epoch": 0.06326825741142444, "grad_norm": 0.23899681278649237, "learning_rate": 6.28158844765343e-06, "loss": 0.0118, "step": 175 }, { "epoch": 0.0636297903109183, "grad_norm": 1.1244832573090435, "learning_rate": 6.317689530685921e-06, "loss": 0.0398, "step": 176 }, { "epoch": 0.06399132321041215, "grad_norm": 1.0380352858475266, "learning_rate": 6.3537906137184125e-06, "loss": 0.0535, "step": 177 }, { "epoch": 0.064352856109906, "grad_norm": 0.02831285018335768, "learning_rate": 6.389891696750903e-06, "loss": 0.0017, "step": 178 }, { "epoch": 0.06471438900939985, "grad_norm": 0.33857413554746607, "learning_rate": 6.425992779783395e-06, "loss": 0.0167, "step": 179 }, { "epoch": 0.0650759219088937, "grad_norm": 1.9798317852364773, "learning_rate": 6.4620938628158845e-06, "loss": 0.1064, "step": 180 }, { "epoch": 0.06543745480838756, "grad_norm": 0.16453084416710198, "learning_rate": 6.498194945848376e-06, "loss": 0.0083, "step": 181 }, { "epoch": 0.06579898770788142, "grad_norm": 1.6666875099738394, "learning_rate": 6.534296028880867e-06, "loss": 0.2363, "step": 182 }, { "epoch": 0.06616052060737528, "grad_norm": 0.07689504910381935, "learning_rate": 6.570397111913358e-06, "loss": 0.0046, "step": 183 }, { "epoch": 0.06652205350686913, "grad_norm": 0.2641312934558719, "learning_rate": 6.606498194945848e-06, "loss": 0.015, "step": 184 }, { "epoch": 0.06688358640636298, "grad_norm": 0.01257103767355615, "learning_rate": 6.6425992779783395e-06, "loss": 0.0008, "step": 185 }, { "epoch": 0.06724511930585683, "grad_norm": 1.998771230102778, "learning_rate": 6.678700361010831e-06, "loss": 0.1924, "step": 186 }, { "epoch": 0.06760665220535068, "grad_norm": 1.8117801321139935, "learning_rate": 6.714801444043322e-06, "loss": 0.3047, "step": 187 }, { "epoch": 0.06796818510484454, "grad_norm": 0.17066199455018724, "learning_rate": 6.750902527075813e-06, "loss": 0.0087, "step": 188 }, { "epoch": 0.06832971800433839, "grad_norm": 1.5038719025326959, "learning_rate": 6.787003610108304e-06, "loss": 0.0771, "step": 189 }, { "epoch": 0.06869125090383225, "grad_norm": 2.1337013420276945, "learning_rate": 6.8231046931407954e-06, "loss": 0.2129, "step": 190 }, { "epoch": 0.0690527838033261, "grad_norm": 0.1424086380477965, "learning_rate": 6.859205776173285e-06, "loss": 0.0067, "step": 191 }, { "epoch": 0.06941431670281996, "grad_norm": 0.3084920507921112, "learning_rate": 6.895306859205777e-06, "loss": 0.0166, "step": 192 }, { "epoch": 0.06977584960231381, "grad_norm": 1.7192621353410442, "learning_rate": 6.9314079422382674e-06, "loss": 0.2363, "step": 193 }, { "epoch": 0.07013738250180766, "grad_norm": 0.05799796568786894, "learning_rate": 6.967509025270759e-06, "loss": 0.0036, "step": 194 }, { "epoch": 0.07049891540130152, "grad_norm": 1.700065620781472, "learning_rate": 7.00361010830325e-06, "loss": 0.0986, "step": 195 }, { "epoch": 0.07086044830079537, "grad_norm": 0.20576252093084021, "learning_rate": 7.039711191335741e-06, "loss": 0.0105, "step": 196 }, { "epoch": 0.07122198120028922, "grad_norm": 1.8673401926673838, "learning_rate": 7.075812274368231e-06, "loss": 0.2139, "step": 197 }, { "epoch": 0.07158351409978309, "grad_norm": 0.27863252658925025, "learning_rate": 7.1119133574007225e-06, "loss": 0.0123, "step": 198 }, { "epoch": 0.07194504699927694, "grad_norm": 0.12570761881473974, "learning_rate": 7.148014440433214e-06, "loss": 0.0062, "step": 199 }, { "epoch": 0.07230657989877079, "grad_norm": 1.1120031291702366, "learning_rate": 7.184115523465705e-06, "loss": 0.064, "step": 200 }, { "epoch": 0.07266811279826464, "grad_norm": 1.5335928371522158, "learning_rate": 7.220216606498196e-06, "loss": 0.0986, "step": 201 }, { "epoch": 0.0730296456977585, "grad_norm": 1.358353842009612, "learning_rate": 7.256317689530686e-06, "loss": 0.1147, "step": 202 }, { "epoch": 0.07339117859725235, "grad_norm": 0.07520490634565677, "learning_rate": 7.2924187725631776e-06, "loss": 0.0042, "step": 203 }, { "epoch": 0.0737527114967462, "grad_norm": 1.8158939121892834, "learning_rate": 7.328519855595668e-06, "loss": 0.3066, "step": 204 }, { "epoch": 0.07411424439624006, "grad_norm": 0.2198642439288841, "learning_rate": 7.36462093862816e-06, "loss": 0.0042, "step": 205 }, { "epoch": 0.07447577729573392, "grad_norm": 1.446837702138085, "learning_rate": 7.40072202166065e-06, "loss": 0.0835, "step": 206 }, { "epoch": 0.07483731019522777, "grad_norm": 0.1364037515546864, "learning_rate": 7.436823104693142e-06, "loss": 0.0074, "step": 207 }, { "epoch": 0.07519884309472162, "grad_norm": 0.3982480476354511, "learning_rate": 7.472924187725632e-06, "loss": 0.0325, "step": 208 }, { "epoch": 0.07556037599421547, "grad_norm": 0.37663809420655553, "learning_rate": 7.509025270758123e-06, "loss": 0.0325, "step": 209 }, { "epoch": 0.07592190889370933, "grad_norm": 0.41567142399397417, "learning_rate": 7.545126353790614e-06, "loss": 0.0325, "step": 210 }, { "epoch": 0.07628344179320318, "grad_norm": 0.15577865325034582, "learning_rate": 7.5812274368231055e-06, "loss": 0.0107, "step": 211 }, { "epoch": 0.07664497469269703, "grad_norm": 0.1900103644047026, "learning_rate": 7.617328519855596e-06, "loss": 0.0106, "step": 212 }, { "epoch": 0.0770065075921909, "grad_norm": 1.0133896223172936, "learning_rate": 7.653429602888087e-06, "loss": 0.0483, "step": 213 }, { "epoch": 0.07736804049168475, "grad_norm": 1.9362415361391703, "learning_rate": 7.68953068592058e-06, "loss": 0.2246, "step": 214 }, { "epoch": 0.0777295733911786, "grad_norm": 0.23406858081617238, "learning_rate": 7.72563176895307e-06, "loss": 0.0132, "step": 215 }, { "epoch": 0.07809110629067245, "grad_norm": 0.34763607258075485, "learning_rate": 7.76173285198556e-06, "loss": 0.026, "step": 216 }, { "epoch": 0.0784526391901663, "grad_norm": 0.10758812703934928, "learning_rate": 7.797833935018051e-06, "loss": 0.0059, "step": 217 }, { "epoch": 0.07881417208966016, "grad_norm": 0.07599561252714801, "learning_rate": 7.833935018050542e-06, "loss": 0.0041, "step": 218 }, { "epoch": 0.07917570498915401, "grad_norm": 0.2676052827661171, "learning_rate": 7.870036101083033e-06, "loss": 0.0186, "step": 219 }, { "epoch": 0.07953723788864786, "grad_norm": 0.9931345697824643, "learning_rate": 7.906137184115525e-06, "loss": 0.0703, "step": 220 }, { "epoch": 0.07989877078814173, "grad_norm": 0.07387757178358598, "learning_rate": 7.942238267148014e-06, "loss": 0.0041, "step": 221 }, { "epoch": 0.08026030368763558, "grad_norm": 0.6863465919452939, "learning_rate": 7.978339350180506e-06, "loss": 0.0483, "step": 222 }, { "epoch": 0.08062183658712943, "grad_norm": 0.037725337159705316, "learning_rate": 8.014440433212997e-06, "loss": 0.002, "step": 223 }, { "epoch": 0.08098336948662328, "grad_norm": 0.11762453604135134, "learning_rate": 8.050541516245488e-06, "loss": 0.0067, "step": 224 }, { "epoch": 0.08134490238611713, "grad_norm": 0.47812406547487046, "learning_rate": 8.086642599277978e-06, "loss": 0.0437, "step": 225 }, { "epoch": 0.08170643528561099, "grad_norm": 0.17880636914817644, "learning_rate": 8.12274368231047e-06, "loss": 0.0105, "step": 226 }, { "epoch": 0.08206796818510484, "grad_norm": 0.09248335310001357, "learning_rate": 8.158844765342961e-06, "loss": 0.0052, "step": 227 }, { "epoch": 0.0824295010845987, "grad_norm": 0.06728302667001494, "learning_rate": 8.194945848375452e-06, "loss": 0.0036, "step": 228 }, { "epoch": 0.08279103398409256, "grad_norm": 1.7514108383763156, "learning_rate": 8.231046931407943e-06, "loss": 0.3047, "step": 229 }, { "epoch": 0.08315256688358641, "grad_norm": 0.4692094405549025, "learning_rate": 8.267148014440433e-06, "loss": 0.0208, "step": 230 }, { "epoch": 0.08351409978308026, "grad_norm": 1.5845857511888966, "learning_rate": 8.303249097472926e-06, "loss": 0.0913, "step": 231 }, { "epoch": 0.08387563268257411, "grad_norm": 0.1799143336753989, "learning_rate": 8.339350180505416e-06, "loss": 0.0105, "step": 232 }, { "epoch": 0.08423716558206797, "grad_norm": 0.3020802213883477, "learning_rate": 8.375451263537907e-06, "loss": 0.0132, "step": 233 }, { "epoch": 0.08459869848156182, "grad_norm": 0.03603022484079047, "learning_rate": 8.411552346570398e-06, "loss": 0.002, "step": 234 }, { "epoch": 0.08496023138105567, "grad_norm": 2.7020901809736544, "learning_rate": 8.447653429602888e-06, "loss": 0.3164, "step": 235 }, { "epoch": 0.08532176428054954, "grad_norm": 0.314346943819451, "learning_rate": 8.483754512635379e-06, "loss": 0.0186, "step": 236 }, { "epoch": 0.08568329718004339, "grad_norm": 0.04801718924660537, "learning_rate": 8.519855595667871e-06, "loss": 0.0028, "step": 237 }, { "epoch": 0.08604483007953724, "grad_norm": 0.9139660488560585, "learning_rate": 8.55595667870036e-06, "loss": 0.0581, "step": 238 }, { "epoch": 0.08640636297903109, "grad_norm": 0.39292031413713063, "learning_rate": 8.592057761732853e-06, "loss": 0.0232, "step": 239 }, { "epoch": 0.08676789587852494, "grad_norm": 0.3982769753320758, "learning_rate": 8.628158844765343e-06, "loss": 0.0287, "step": 240 }, { "epoch": 0.0871294287780188, "grad_norm": 1.6001498269632586, "learning_rate": 8.664259927797834e-06, "loss": 0.293, "step": 241 }, { "epoch": 0.08749096167751265, "grad_norm": 0.15947479743285498, "learning_rate": 8.700361010830326e-06, "loss": 0.0096, "step": 242 }, { "epoch": 0.0878524945770065, "grad_norm": 0.44432554269164615, "learning_rate": 8.736462093862817e-06, "loss": 0.0317, "step": 243 }, { "epoch": 0.08821402747650037, "grad_norm": 0.08382151532085393, "learning_rate": 8.772563176895308e-06, "loss": 0.0051, "step": 244 }, { "epoch": 0.08857556037599422, "grad_norm": 1.0458754314179184, "learning_rate": 8.808664259927798e-06, "loss": 0.0698, "step": 245 }, { "epoch": 0.08893709327548807, "grad_norm": 0.1436902879745156, "learning_rate": 8.84476534296029e-06, "loss": 0.0105, "step": 246 }, { "epoch": 0.08929862617498192, "grad_norm": 1.534436612945545, "learning_rate": 8.88086642599278e-06, "loss": 0.2139, "step": 247 }, { "epoch": 0.08966015907447578, "grad_norm": 0.08528043617651035, "learning_rate": 8.916967509025272e-06, "loss": 0.0058, "step": 248 }, { "epoch": 0.09002169197396963, "grad_norm": 0.30096259223139904, "learning_rate": 8.953068592057763e-06, "loss": 0.0166, "step": 249 }, { "epoch": 0.09038322487346348, "grad_norm": 0.5311157741843285, "learning_rate": 8.989169675090254e-06, "loss": 0.0354, "step": 250 }, { "epoch": 0.09074475777295733, "grad_norm": 0.3831533544419753, "learning_rate": 9.025270758122744e-06, "loss": 0.0264, "step": 251 }, { "epoch": 0.0911062906724512, "grad_norm": 1.7084479055674515, "learning_rate": 9.061371841155235e-06, "loss": 0.2471, "step": 252 }, { "epoch": 0.09146782357194505, "grad_norm": 1.7005698087483865, "learning_rate": 9.097472924187727e-06, "loss": 0.2129, "step": 253 }, { "epoch": 0.0918293564714389, "grad_norm": 0.2454753826567496, "learning_rate": 9.133574007220218e-06, "loss": 0.0131, "step": 254 }, { "epoch": 0.09219088937093275, "grad_norm": 0.09441441030641944, "learning_rate": 9.169675090252709e-06, "loss": 0.0052, "step": 255 }, { "epoch": 0.09255242227042661, "grad_norm": 0.8134809321955924, "learning_rate": 9.2057761732852e-06, "loss": 0.0481, "step": 256 }, { "epoch": 0.09291395516992046, "grad_norm": 0.30680982770149967, "learning_rate": 9.24187725631769e-06, "loss": 0.0165, "step": 257 }, { "epoch": 0.09327548806941431, "grad_norm": 0.6023894943022899, "learning_rate": 9.27797833935018e-06, "loss": 0.0435, "step": 258 }, { "epoch": 0.09363702096890818, "grad_norm": 1.3040611481317754, "learning_rate": 9.314079422382673e-06, "loss": 0.1318, "step": 259 }, { "epoch": 0.09399855386840203, "grad_norm": 0.41682357895436695, "learning_rate": 9.350180505415164e-06, "loss": 0.032, "step": 260 }, { "epoch": 0.09436008676789588, "grad_norm": 0.19917166982213608, "learning_rate": 9.386281588447654e-06, "loss": 0.0118, "step": 261 }, { "epoch": 0.09472161966738973, "grad_norm": 1.2006208616293785, "learning_rate": 9.422382671480145e-06, "loss": 0.0698, "step": 262 }, { "epoch": 0.09508315256688359, "grad_norm": 0.4954880317152259, "learning_rate": 9.458483754512636e-06, "loss": 0.0286, "step": 263 }, { "epoch": 0.09544468546637744, "grad_norm": 0.32153572070750136, "learning_rate": 9.494584837545126e-06, "loss": 0.0166, "step": 264 }, { "epoch": 0.09580621836587129, "grad_norm": 0.7207328580983915, "learning_rate": 9.530685920577619e-06, "loss": 0.0479, "step": 265 }, { "epoch": 0.09616775126536514, "grad_norm": 0.2838340968855046, "learning_rate": 9.56678700361011e-06, "loss": 0.0148, "step": 266 }, { "epoch": 0.09652928416485901, "grad_norm": 0.053557701008028945, "learning_rate": 9.6028880866426e-06, "loss": 0.0037, "step": 267 }, { "epoch": 0.09689081706435286, "grad_norm": 1.4339453036227474, "learning_rate": 9.63898916967509e-06, "loss": 0.1309, "step": 268 }, { "epoch": 0.09725234996384671, "grad_norm": 1.8166915310216438, "learning_rate": 9.675090252707581e-06, "loss": 0.1226, "step": 269 }, { "epoch": 0.09761388286334056, "grad_norm": 1.785913593315668, "learning_rate": 9.711191335740074e-06, "loss": 0.1309, "step": 270 }, { "epoch": 0.09797541576283442, "grad_norm": 0.32454295226792246, "learning_rate": 9.747292418772564e-06, "loss": 0.0204, "step": 271 }, { "epoch": 0.09833694866232827, "grad_norm": 1.0437554178181252, "learning_rate": 9.783393501805055e-06, "loss": 0.0767, "step": 272 }, { "epoch": 0.09869848156182212, "grad_norm": 1.4035847638139847, "learning_rate": 9.819494584837546e-06, "loss": 0.0903, "step": 273 }, { "epoch": 0.09906001446131597, "grad_norm": 0.036623879848847705, "learning_rate": 9.855595667870036e-06, "loss": 0.0019, "step": 274 }, { "epoch": 0.09942154736080984, "grad_norm": 0.8739457396259983, "learning_rate": 9.891696750902527e-06, "loss": 0.0393, "step": 275 }, { "epoch": 0.09978308026030369, "grad_norm": 1.9874889603687207, "learning_rate": 9.92779783393502e-06, "loss": 0.1602, "step": 276 }, { "epoch": 0.10014461315979754, "grad_norm": 0.3300771855630241, "learning_rate": 9.96389891696751e-06, "loss": 0.0184, "step": 277 }, { "epoch": 0.1005061460592914, "grad_norm": 0.15708963236660564, "learning_rate": 1e-05, "loss": 0.0094, "step": 278 }, { "epoch": 0.10086767895878525, "grad_norm": 1.33227005992789, "learning_rate": 9.999999106500529e-06, "loss": 0.1709, "step": 279 }, { "epoch": 0.1012292118582791, "grad_norm": 0.589778974358704, "learning_rate": 9.99999642600243e-06, "loss": 0.0286, "step": 280 }, { "epoch": 0.10159074475777295, "grad_norm": 0.6020114302016507, "learning_rate": 9.999991958506665e-06, "loss": 0.032, "step": 281 }, { "epoch": 0.1019522776572668, "grad_norm": 1.7380657681274472, "learning_rate": 9.999985704014829e-06, "loss": 0.2471, "step": 282 }, { "epoch": 0.10231381055676067, "grad_norm": 1.6350700144628563, "learning_rate": 9.999977662529157e-06, "loss": 0.0981, "step": 283 }, { "epoch": 0.10267534345625452, "grad_norm": 0.13541628520014456, "learning_rate": 9.999967834052524e-06, "loss": 0.0075, "step": 284 }, { "epoch": 0.10303687635574837, "grad_norm": 0.7886176785317593, "learning_rate": 9.999956218588443e-06, "loss": 0.0532, "step": 285 }, { "epoch": 0.10339840925524223, "grad_norm": 1.3219012092028188, "learning_rate": 9.999942816141063e-06, "loss": 0.1055, "step": 286 }, { "epoch": 0.10375994215473608, "grad_norm": 0.3865189393829623, "learning_rate": 9.999927626715178e-06, "loss": 0.0286, "step": 287 }, { "epoch": 0.10412147505422993, "grad_norm": 0.9620776684274479, "learning_rate": 9.999910650316214e-06, "loss": 0.0698, "step": 288 }, { "epoch": 0.10448300795372378, "grad_norm": 0.2764035050114357, "learning_rate": 9.999891886950236e-06, "loss": 0.0205, "step": 289 }, { "epoch": 0.10484454085321765, "grad_norm": 0.1586613421447814, "learning_rate": 9.999871336623956e-06, "loss": 0.0106, "step": 290 }, { "epoch": 0.1052060737527115, "grad_norm": 0.27891553006505265, "learning_rate": 9.999848999344714e-06, "loss": 0.0183, "step": 291 }, { "epoch": 0.10556760665220535, "grad_norm": 0.27085400770298856, "learning_rate": 9.999824875120495e-06, "loss": 0.0183, "step": 292 }, { "epoch": 0.1059291395516992, "grad_norm": 0.5452622065400079, "learning_rate": 9.99979896395992e-06, "loss": 0.0256, "step": 293 }, { "epoch": 0.10629067245119306, "grad_norm": 1.5315828311114081, "learning_rate": 9.99977126587225e-06, "loss": 0.1226, "step": 294 }, { "epoch": 0.10665220535068691, "grad_norm": 0.7570916733456831, "learning_rate": 9.999741780867388e-06, "loss": 0.0527, "step": 295 }, { "epoch": 0.10701373825018076, "grad_norm": 1.3687093979078684, "learning_rate": 9.999710508955866e-06, "loss": 0.2344, "step": 296 }, { "epoch": 0.10737527114967461, "grad_norm": 0.14034431842065026, "learning_rate": 9.999677450148864e-06, "loss": 0.0102, "step": 297 }, { "epoch": 0.10773680404916848, "grad_norm": 0.6811774104441453, "learning_rate": 9.999642604458196e-06, "loss": 0.0581, "step": 298 }, { "epoch": 0.10809833694866233, "grad_norm": 1.4885810315414838, "learning_rate": 9.999605971896317e-06, "loss": 0.2344, "step": 299 }, { "epoch": 0.10845986984815618, "grad_norm": 1.3810498706512706, "learning_rate": 9.999567552476318e-06, "loss": 0.1309, "step": 300 }, { "epoch": 0.10882140274765004, "grad_norm": 0.1499788169773227, "learning_rate": 9.99952734621193e-06, "loss": 0.0092, "step": 301 }, { "epoch": 0.10918293564714389, "grad_norm": 1.3034401057444005, "learning_rate": 9.999485353117526e-06, "loss": 0.2363, "step": 302 }, { "epoch": 0.10954446854663774, "grad_norm": 0.11206366114651548, "learning_rate": 9.99944157320811e-06, "loss": 0.0081, "step": 303 }, { "epoch": 0.1099060014461316, "grad_norm": 1.3096361830934975, "learning_rate": 9.999396006499331e-06, "loss": 0.2695, "step": 304 }, { "epoch": 0.11026753434562545, "grad_norm": 1.4873596338089368, "learning_rate": 9.999348653007475e-06, "loss": 0.2344, "step": 305 }, { "epoch": 0.11062906724511931, "grad_norm": 1.3401319895310078, "learning_rate": 9.999299512749465e-06, "loss": 0.1504, "step": 306 }, { "epoch": 0.11099060014461316, "grad_norm": 0.21867526192932749, "learning_rate": 9.999248585742865e-06, "loss": 0.0164, "step": 307 }, { "epoch": 0.11135213304410702, "grad_norm": 0.3354827604954443, "learning_rate": 9.999195872005874e-06, "loss": 0.0258, "step": 308 }, { "epoch": 0.11171366594360087, "grad_norm": 0.22572560512609238, "learning_rate": 9.999141371557334e-06, "loss": 0.0183, "step": 309 }, { "epoch": 0.11207519884309472, "grad_norm": 0.1732831676960955, "learning_rate": 9.999085084416724e-06, "loss": 0.0133, "step": 310 }, { "epoch": 0.11243673174258857, "grad_norm": 0.1298775577677329, "learning_rate": 9.999027010604159e-06, "loss": 0.0094, "step": 311 }, { "epoch": 0.11279826464208242, "grad_norm": 0.19098677927683883, "learning_rate": 9.998967150140395e-06, "loss": 0.0145, "step": 312 }, { "epoch": 0.11315979754157629, "grad_norm": 0.4764213995967436, "learning_rate": 9.998905503046827e-06, "loss": 0.0356, "step": 313 }, { "epoch": 0.11352133044107014, "grad_norm": 0.560754259929798, "learning_rate": 9.998842069345486e-06, "loss": 0.0435, "step": 314 }, { "epoch": 0.113882863340564, "grad_norm": 0.7918802829946592, "learning_rate": 9.998776849059046e-06, "loss": 0.0532, "step": 315 }, { "epoch": 0.11424439624005785, "grad_norm": 1.808006136964854, "learning_rate": 9.998709842210815e-06, "loss": 0.0581, "step": 316 }, { "epoch": 0.1146059291395517, "grad_norm": 0.8933773278968613, "learning_rate": 9.998641048824741e-06, "loss": 0.064, "step": 317 }, { "epoch": 0.11496746203904555, "grad_norm": 1.079635851231079, "learning_rate": 9.998570468925411e-06, "loss": 0.1143, "step": 318 }, { "epoch": 0.1153289949385394, "grad_norm": 0.3293486430294729, "learning_rate": 9.99849810253805e-06, "loss": 0.0228, "step": 319 }, { "epoch": 0.11569052783803326, "grad_norm": 0.8663468442154871, "learning_rate": 9.998423949688523e-06, "loss": 0.0698, "step": 320 }, { "epoch": 0.11605206073752712, "grad_norm": 0.26683925191190916, "learning_rate": 9.99834801040333e-06, "loss": 0.0204, "step": 321 }, { "epoch": 0.11641359363702097, "grad_norm": 0.41062021039163105, "learning_rate": 9.998270284709612e-06, "loss": 0.0256, "step": 322 }, { "epoch": 0.11677512653651483, "grad_norm": 1.0626279093525661, "learning_rate": 9.998190772635151e-06, "loss": 0.1602, "step": 323 }, { "epoch": 0.11713665943600868, "grad_norm": 0.6962768678651388, "learning_rate": 9.998109474208363e-06, "loss": 0.083, "step": 324 }, { "epoch": 0.11749819233550253, "grad_norm": 0.11910003499246159, "learning_rate": 9.998026389458301e-06, "loss": 0.0073, "step": 325 }, { "epoch": 0.11785972523499638, "grad_norm": 0.4397818787475158, "learning_rate": 9.997941518414665e-06, "loss": 0.0317, "step": 326 }, { "epoch": 0.11822125813449023, "grad_norm": 0.2836308592508433, "learning_rate": 9.997854861107786e-06, "loss": 0.0184, "step": 327 }, { "epoch": 0.11858279103398409, "grad_norm": 0.89435088297694, "learning_rate": 9.99776641756863e-06, "loss": 0.0532, "step": 328 }, { "epoch": 0.11894432393347795, "grad_norm": 1.0717922630693053, "learning_rate": 9.997676187828816e-06, "loss": 0.0981, "step": 329 }, { "epoch": 0.1193058568329718, "grad_norm": 0.3460295905352031, "learning_rate": 9.997584171920583e-06, "loss": 0.0228, "step": 330 }, { "epoch": 0.11966738973246566, "grad_norm": 0.48648516452974466, "learning_rate": 9.997490369876823e-06, "loss": 0.0317, "step": 331 }, { "epoch": 0.12002892263195951, "grad_norm": 0.2524854239978728, "learning_rate": 9.99739478173106e-06, "loss": 0.0183, "step": 332 }, { "epoch": 0.12039045553145336, "grad_norm": 0.21600525361102088, "learning_rate": 9.997297407517456e-06, "loss": 0.0164, "step": 333 }, { "epoch": 0.12075198843094721, "grad_norm": 0.2803693288654307, "learning_rate": 9.997198247270816e-06, "loss": 0.0205, "step": 334 }, { "epoch": 0.12111352133044107, "grad_norm": 0.3921676752784009, "learning_rate": 9.997097301026573e-06, "loss": 0.0317, "step": 335 }, { "epoch": 0.12147505422993492, "grad_norm": 0.576955720037896, "learning_rate": 9.996994568820811e-06, "loss": 0.0532, "step": 336 }, { "epoch": 0.12183658712942878, "grad_norm": 0.2865754912382121, "learning_rate": 9.996890050690246e-06, "loss": 0.0208, "step": 337 }, { "epoch": 0.12219812002892264, "grad_norm": 0.21848336566906362, "learning_rate": 9.996783746672229e-06, "loss": 0.0164, "step": 338 }, { "epoch": 0.12255965292841649, "grad_norm": 0.10162029695216006, "learning_rate": 9.996675656804757e-06, "loss": 0.0063, "step": 339 }, { "epoch": 0.12292118582791034, "grad_norm": 0.27231137057927673, "learning_rate": 9.99656578112646e-06, "loss": 0.0205, "step": 340 }, { "epoch": 0.12328271872740419, "grad_norm": 0.13348103901948474, "learning_rate": 9.996454119676607e-06, "loss": 0.0103, "step": 341 }, { "epoch": 0.12364425162689804, "grad_norm": 0.49607702147080524, "learning_rate": 9.996340672495104e-06, "loss": 0.0391, "step": 342 }, { "epoch": 0.1240057845263919, "grad_norm": 0.09994195577345365, "learning_rate": 9.996225439622501e-06, "loss": 0.0081, "step": 343 }, { "epoch": 0.12436731742588576, "grad_norm": 0.8258396389537327, "learning_rate": 9.99610842109998e-06, "loss": 0.0581, "step": 344 }, { "epoch": 0.12472885032537961, "grad_norm": 0.1580676805056003, "learning_rate": 9.995989616969363e-06, "loss": 0.0115, "step": 345 }, { "epoch": 0.12509038322487345, "grad_norm": 0.04698350604357887, "learning_rate": 9.995869027273113e-06, "loss": 0.0035, "step": 346 }, { "epoch": 0.12545191612436732, "grad_norm": 0.21953244774503688, "learning_rate": 9.995746652054325e-06, "loss": 0.0146, "step": 347 }, { "epoch": 0.12581344902386118, "grad_norm": 0.14864488537401305, "learning_rate": 9.99562249135674e-06, "loss": 0.0116, "step": 348 }, { "epoch": 0.12617498192335502, "grad_norm": 1.88205837575642, "learning_rate": 9.995496545224729e-06, "loss": 0.1807, "step": 349 }, { "epoch": 0.1265365148228489, "grad_norm": 0.18426004922745876, "learning_rate": 9.995368813703307e-06, "loss": 0.0115, "step": 350 }, { "epoch": 0.12689804772234273, "grad_norm": 0.1440665042774869, "learning_rate": 9.995239296838126e-06, "loss": 0.0118, "step": 351 }, { "epoch": 0.1272595806218366, "grad_norm": 0.09272703129042259, "learning_rate": 9.995107994675475e-06, "loss": 0.0057, "step": 352 }, { "epoch": 0.12762111352133043, "grad_norm": 1.4158321243504723, "learning_rate": 9.99497490726228e-06, "loss": 0.2129, "step": 353 }, { "epoch": 0.1279826464208243, "grad_norm": 0.04817735133649258, "learning_rate": 9.994840034646108e-06, "loss": 0.0035, "step": 354 }, { "epoch": 0.12834417932031814, "grad_norm": 1.396993420631158, "learning_rate": 9.99470337687516e-06, "loss": 0.2344, "step": 355 }, { "epoch": 0.128705712219812, "grad_norm": 0.4251435182477467, "learning_rate": 9.994564933998281e-06, "loss": 0.0391, "step": 356 }, { "epoch": 0.12906724511930587, "grad_norm": 0.6785301984179631, "learning_rate": 9.994424706064946e-06, "loss": 0.0581, "step": 357 }, { "epoch": 0.1294287780187997, "grad_norm": 0.06993371803424339, "learning_rate": 9.99428269312528e-06, "loss": 0.005, "step": 358 }, { "epoch": 0.12979031091829357, "grad_norm": 0.062450966855185766, "learning_rate": 9.994138895230029e-06, "loss": 0.0045, "step": 359 }, { "epoch": 0.1301518438177874, "grad_norm": 0.07215935727277095, "learning_rate": 9.993993312430592e-06, "loss": 0.0045, "step": 360 }, { "epoch": 0.13051337671728128, "grad_norm": 0.13152684909939805, "learning_rate": 9.993845944779e-06, "loss": 0.0083, "step": 361 }, { "epoch": 0.13087490961677511, "grad_norm": 0.02807995684039971, "learning_rate": 9.99369679232792e-06, "loss": 0.0019, "step": 362 }, { "epoch": 0.13123644251626898, "grad_norm": 0.08401117023785826, "learning_rate": 9.993545855130662e-06, "loss": 0.0056, "step": 363 }, { "epoch": 0.13159797541576285, "grad_norm": 0.19839780548446784, "learning_rate": 9.993393133241167e-06, "loss": 0.0116, "step": 364 }, { "epoch": 0.13195950831525669, "grad_norm": 16.54476626549626, "learning_rate": 9.993238626714021e-06, "loss": 4.0312, "step": 365 }, { "epoch": 0.13232104121475055, "grad_norm": 0.04020172228055538, "learning_rate": 9.993082335604445e-06, "loss": 0.0027, "step": 366 }, { "epoch": 0.1326825741142444, "grad_norm": 1.1332647127453634, "learning_rate": 9.992924259968292e-06, "loss": 0.1309, "step": 367 }, { "epoch": 0.13304410701373826, "grad_norm": 0.5257653278610752, "learning_rate": 9.992764399862067e-06, "loss": 0.0356, "step": 368 }, { "epoch": 0.1334056399132321, "grad_norm": 1.193241571997317, "learning_rate": 9.992602755342896e-06, "loss": 0.0767, "step": 369 }, { "epoch": 0.13376717281272596, "grad_norm": 0.3538625501418642, "learning_rate": 9.992439326468554e-06, "loss": 0.0231, "step": 370 }, { "epoch": 0.13412870571221983, "grad_norm": 0.15260121336527704, "learning_rate": 9.992274113297453e-06, "loss": 0.0116, "step": 371 }, { "epoch": 0.13449023861171366, "grad_norm": 0.04723445818597678, "learning_rate": 9.992107115888637e-06, "loss": 0.0035, "step": 372 }, { "epoch": 0.13485177151120753, "grad_norm": 0.052619094284140355, "learning_rate": 9.991938334301789e-06, "loss": 0.004, "step": 373 }, { "epoch": 0.13521330441070137, "grad_norm": 0.042680298995111415, "learning_rate": 9.991767768597233e-06, "loss": 0.0024, "step": 374 }, { "epoch": 0.13557483731019523, "grad_norm": 0.0974829764400351, "learning_rate": 9.991595418835933e-06, "loss": 0.0037, "step": 375 }, { "epoch": 0.13593637020968907, "grad_norm": 0.7256473345217507, "learning_rate": 9.991421285079484e-06, "loss": 0.0435, "step": 376 }, { "epoch": 0.13629790310918294, "grad_norm": 0.13852339969412572, "learning_rate": 9.991245367390119e-06, "loss": 0.0118, "step": 377 }, { "epoch": 0.13665943600867678, "grad_norm": 1.1093188199928268, "learning_rate": 9.991067665830714e-06, "loss": 0.1602, "step": 378 }, { "epoch": 0.13702096890817064, "grad_norm": 0.031378998039082305, "learning_rate": 9.990888180464777e-06, "loss": 0.0027, "step": 379 }, { "epoch": 0.1373825018076645, "grad_norm": 1.1998459102757006, "learning_rate": 9.990706911356459e-06, "loss": 0.1416, "step": 380 }, { "epoch": 0.13774403470715835, "grad_norm": 1.4540765107615834, "learning_rate": 9.990523858570544e-06, "loss": 0.2695, "step": 381 }, { "epoch": 0.1381055676066522, "grad_norm": 0.37932473942316747, "learning_rate": 9.990339022172454e-06, "loss": 0.0256, "step": 382 }, { "epoch": 0.13846710050614605, "grad_norm": 0.06425664454677434, "learning_rate": 9.990152402228252e-06, "loss": 0.0045, "step": 383 }, { "epoch": 0.13882863340563992, "grad_norm": 0.2821199562157908, "learning_rate": 9.989963998804636e-06, "loss": 0.0258, "step": 384 }, { "epoch": 0.13919016630513376, "grad_norm": 2.819309804640581, "learning_rate": 9.989773811968938e-06, "loss": 0.2344, "step": 385 }, { "epoch": 0.13955169920462762, "grad_norm": 0.36472457392308844, "learning_rate": 9.989581841789132e-06, "loss": 0.0286, "step": 386 }, { "epoch": 0.1399132321041215, "grad_norm": 0.4092388006179798, "learning_rate": 9.989388088333829e-06, "loss": 0.0231, "step": 387 }, { "epoch": 0.14027476500361533, "grad_norm": 0.10003702577675377, "learning_rate": 9.989192551672278e-06, "loss": 0.0071, "step": 388 }, { "epoch": 0.1406362979031092, "grad_norm": 0.6417038917376602, "learning_rate": 9.98899523187436e-06, "loss": 0.0317, "step": 389 }, { "epoch": 0.14099783080260303, "grad_norm": 0.15421342296154822, "learning_rate": 9.9887961290106e-06, "loss": 0.0093, "step": 390 }, { "epoch": 0.1413593637020969, "grad_norm": 0.19427154842494754, "learning_rate": 9.988595243152155e-06, "loss": 0.0129, "step": 391 }, { "epoch": 0.14172089660159073, "grad_norm": 0.1550241583519594, "learning_rate": 9.988392574370825e-06, "loss": 0.0103, "step": 392 }, { "epoch": 0.1420824295010846, "grad_norm": 0.6874235500405838, "learning_rate": 9.988188122739039e-06, "loss": 0.0479, "step": 393 }, { "epoch": 0.14244396240057844, "grad_norm": 0.16338613451868542, "learning_rate": 9.987981888329874e-06, "loss": 0.0067, "step": 394 }, { "epoch": 0.1428054953000723, "grad_norm": 2.2477348607184493, "learning_rate": 9.987773871217033e-06, "loss": 0.1602, "step": 395 }, { "epoch": 0.14316702819956617, "grad_norm": 0.10353992762809043, "learning_rate": 9.987564071474862e-06, "loss": 0.0072, "step": 396 }, { "epoch": 0.14352856109906, "grad_norm": 0.3202457214495913, "learning_rate": 9.987352489178346e-06, "loss": 0.0183, "step": 397 }, { "epoch": 0.14389009399855388, "grad_norm": 0.1128539258905928, "learning_rate": 9.987139124403102e-06, "loss": 0.0072, "step": 398 }, { "epoch": 0.1442516268980477, "grad_norm": 0.26041485248344903, "learning_rate": 9.986923977225388e-06, "loss": 0.0102, "step": 399 }, { "epoch": 0.14461315979754158, "grad_norm": 0.4914371207644942, "learning_rate": 9.986707047722097e-06, "loss": 0.0391, "step": 400 }, { "epoch": 0.14497469269703542, "grad_norm": 0.08402421950771481, "learning_rate": 9.986488335970759e-06, "loss": 0.0044, "step": 401 }, { "epoch": 0.14533622559652928, "grad_norm": 1.680507020796679, "learning_rate": 9.986267842049542e-06, "loss": 0.1055, "step": 402 }, { "epoch": 0.14569775849602315, "grad_norm": 1.3255348031682255, "learning_rate": 9.986045566037252e-06, "loss": 0.1914, "step": 403 }, { "epoch": 0.146059291395517, "grad_norm": 0.1164648466939482, "learning_rate": 9.985821508013327e-06, "loss": 0.0063, "step": 404 }, { "epoch": 0.14642082429501085, "grad_norm": 1.2486726216554493, "learning_rate": 9.985595668057848e-06, "loss": 0.2695, "step": 405 }, { "epoch": 0.1467823571945047, "grad_norm": 0.5918282917346741, "learning_rate": 9.98536804625153e-06, "loss": 0.0393, "step": 406 }, { "epoch": 0.14714389009399856, "grad_norm": 1.328388390363194, "learning_rate": 9.985138642675723e-06, "loss": 0.2695, "step": 407 }, { "epoch": 0.1475054229934924, "grad_norm": 1.2916310840543406, "learning_rate": 9.984907457412419e-06, "loss": 0.1602, "step": 408 }, { "epoch": 0.14786695589298626, "grad_norm": 0.1743614694887466, "learning_rate": 9.98467449054424e-06, "loss": 0.0115, "step": 409 }, { "epoch": 0.14822848879248013, "grad_norm": 0.0492563693586612, "learning_rate": 9.98443974215445e-06, "loss": 0.004, "step": 410 }, { "epoch": 0.14859002169197397, "grad_norm": 1.5450236329337432, "learning_rate": 9.98420321232695e-06, "loss": 0.0908, "step": 411 }, { "epoch": 0.14895155459146783, "grad_norm": 0.04703177138863534, "learning_rate": 9.983964901146272e-06, "loss": 0.0026, "step": 412 }, { "epoch": 0.14931308749096167, "grad_norm": 1.1724952618142779, "learning_rate": 9.983724808697591e-06, "loss": 0.1143, "step": 413 }, { "epoch": 0.14967462039045554, "grad_norm": 0.11384869044692837, "learning_rate": 9.983482935066716e-06, "loss": 0.0072, "step": 414 }, { "epoch": 0.15003615328994938, "grad_norm": 1.4162654109881407, "learning_rate": 9.98323928034009e-06, "loss": 0.2236, "step": 415 }, { "epoch": 0.15039768618944324, "grad_norm": 0.3272217882865507, "learning_rate": 9.982993844604799e-06, "loss": 0.0204, "step": 416 }, { "epoch": 0.15075921908893708, "grad_norm": 1.1034589963038668, "learning_rate": 9.982746627948556e-06, "loss": 0.1699, "step": 417 }, { "epoch": 0.15112075198843095, "grad_norm": 0.09597625475145034, "learning_rate": 9.982497630459723e-06, "loss": 0.0058, "step": 418 }, { "epoch": 0.1514822848879248, "grad_norm": 0.5516249092788658, "learning_rate": 9.982246852227287e-06, "loss": 0.0315, "step": 419 }, { "epoch": 0.15184381778741865, "grad_norm": 1.2409463767341053, "learning_rate": 9.981994293340878e-06, "loss": 0.1699, "step": 420 }, { "epoch": 0.15220535068691252, "grad_norm": 0.7604842149182292, "learning_rate": 9.98173995389076e-06, "loss": 0.0522, "step": 421 }, { "epoch": 0.15256688358640635, "grad_norm": 1.51100435646668, "learning_rate": 9.981483833967833e-06, "loss": 0.1807, "step": 422 }, { "epoch": 0.15292841648590022, "grad_norm": 0.8474354556889205, "learning_rate": 9.981225933663634e-06, "loss": 0.0479, "step": 423 }, { "epoch": 0.15328994938539406, "grad_norm": 1.3746532082550598, "learning_rate": 9.98096625307034e-06, "loss": 0.1309, "step": 424 }, { "epoch": 0.15365148228488792, "grad_norm": 0.0450290861416473, "learning_rate": 9.980704792280758e-06, "loss": 0.0018, "step": 425 }, { "epoch": 0.1540130151843818, "grad_norm": 1.1587452999919705, "learning_rate": 9.980441551388332e-06, "loss": 0.083, "step": 426 }, { "epoch": 0.15437454808387563, "grad_norm": 0.653761762614362, "learning_rate": 9.980176530487149e-06, "loss": 0.0349, "step": 427 }, { "epoch": 0.1547360809833695, "grad_norm": 1.0443877743689096, "learning_rate": 9.979909729671923e-06, "loss": 0.083, "step": 428 }, { "epoch": 0.15509761388286333, "grad_norm": 1.5307263408845766, "learning_rate": 9.979641149038013e-06, "loss": 0.1226, "step": 429 }, { "epoch": 0.1554591467823572, "grad_norm": 1.733453469883969, "learning_rate": 9.979370788681406e-06, "loss": 0.1699, "step": 430 }, { "epoch": 0.15582067968185104, "grad_norm": 1.1646010074099138, "learning_rate": 9.979098648698731e-06, "loss": 0.0903, "step": 431 }, { "epoch": 0.1561822125813449, "grad_norm": 0.4528965654574228, "learning_rate": 9.978824729187248e-06, "loss": 0.0388, "step": 432 }, { "epoch": 0.15654374548083877, "grad_norm": 2.7911245620506326, "learning_rate": 9.978549030244858e-06, "loss": 0.2363, "step": 433 }, { "epoch": 0.1569052783803326, "grad_norm": 0.3701514056894471, "learning_rate": 9.978271551970095e-06, "loss": 0.0283, "step": 434 }, { "epoch": 0.15726681127982647, "grad_norm": 0.32546800414936705, "learning_rate": 9.97799229446213e-06, "loss": 0.0254, "step": 435 }, { "epoch": 0.1576283441793203, "grad_norm": 0.9305285681208991, "learning_rate": 9.977711257820772e-06, "loss": 0.0479, "step": 436 }, { "epoch": 0.15798987707881418, "grad_norm": 0.26624288549974495, "learning_rate": 9.977428442146459e-06, "loss": 0.0227, "step": 437 }, { "epoch": 0.15835140997830802, "grad_norm": 1.2094423054166785, "learning_rate": 9.977143847540272e-06, "loss": 0.0903, "step": 438 }, { "epoch": 0.15871294287780188, "grad_norm": 0.25209674670509064, "learning_rate": 9.976857474103922e-06, "loss": 0.0145, "step": 439 }, { "epoch": 0.15907447577729572, "grad_norm": 0.05251113154062471, "learning_rate": 9.976569321939763e-06, "loss": 0.0036, "step": 440 }, { "epoch": 0.1594360086767896, "grad_norm": 0.8627078307468874, "learning_rate": 9.976279391150778e-06, "loss": 0.0635, "step": 441 }, { "epoch": 0.15979754157628345, "grad_norm": 0.345288179957694, "learning_rate": 9.975987681840589e-06, "loss": 0.0286, "step": 442 }, { "epoch": 0.1601590744757773, "grad_norm": 0.4626685144154535, "learning_rate": 9.975694194113452e-06, "loss": 0.0286, "step": 443 }, { "epoch": 0.16052060737527116, "grad_norm": 21.04561952358492, "learning_rate": 9.975398928074262e-06, "loss": 3.0938, "step": 444 }, { "epoch": 0.160882140274765, "grad_norm": 0.2108813602515678, "learning_rate": 9.975101883828543e-06, "loss": 0.0145, "step": 445 }, { "epoch": 0.16124367317425886, "grad_norm": 0.6215990636517958, "learning_rate": 9.97480306148246e-06, "loss": 0.0435, "step": 446 }, { "epoch": 0.1616052060737527, "grad_norm": 0.4126324433105087, "learning_rate": 9.974502461142815e-06, "loss": 0.0286, "step": 447 }, { "epoch": 0.16196673897324657, "grad_norm": 0.07263036015998989, "learning_rate": 9.97420008291704e-06, "loss": 0.0045, "step": 448 }, { "epoch": 0.16232827187274043, "grad_norm": 0.1412301630685498, "learning_rate": 9.973895926913203e-06, "loss": 0.009, "step": 449 }, { "epoch": 0.16268980477223427, "grad_norm": 1.020054567934596, "learning_rate": 9.973589993240015e-06, "loss": 0.0527, "step": 450 }, { "epoch": 0.16305133767172814, "grad_norm": 0.3298116011008264, "learning_rate": 9.973282282006812e-06, "loss": 0.0162, "step": 451 }, { "epoch": 0.16341287057122197, "grad_norm": 0.4237189223063582, "learning_rate": 9.972972793323568e-06, "loss": 0.0286, "step": 452 }, { "epoch": 0.16377440347071584, "grad_norm": 0.14770492212394631, "learning_rate": 9.9726615273009e-06, "loss": 0.0072, "step": 453 }, { "epoch": 0.16413593637020968, "grad_norm": 0.12100495256591423, "learning_rate": 9.97234848405005e-06, "loss": 0.009, "step": 454 }, { "epoch": 0.16449746926970354, "grad_norm": 1.2954354584808812, "learning_rate": 9.9720336636829e-06, "loss": 0.1699, "step": 455 }, { "epoch": 0.1648590021691974, "grad_norm": 0.12376449026965568, "learning_rate": 9.971717066311971e-06, "loss": 0.0092, "step": 456 }, { "epoch": 0.16522053506869125, "grad_norm": 0.2911866440764772, "learning_rate": 9.971398692050411e-06, "loss": 0.0183, "step": 457 }, { "epoch": 0.16558206796818511, "grad_norm": 1.4514601809891072, "learning_rate": 9.971078541012007e-06, "loss": 0.0698, "step": 458 }, { "epoch": 0.16594360086767895, "grad_norm": 0.329988342516131, "learning_rate": 9.97075661331118e-06, "loss": 0.0183, "step": 459 }, { "epoch": 0.16630513376717282, "grad_norm": 0.11562280348594006, "learning_rate": 9.97043290906299e-06, "loss": 0.009, "step": 460 }, { "epoch": 0.16666666666666666, "grad_norm": 0.9528881671159951, "learning_rate": 9.970107428383129e-06, "loss": 0.0693, "step": 461 }, { "epoch": 0.16702819956616052, "grad_norm": 1.682120046335089, "learning_rate": 9.969780171387919e-06, "loss": 0.1807, "step": 462 }, { "epoch": 0.16738973246565436, "grad_norm": 0.10472440063070734, "learning_rate": 9.969451138194326e-06, "loss": 0.0072, "step": 463 }, { "epoch": 0.16775126536514823, "grad_norm": 0.37840157023477566, "learning_rate": 9.969120328919946e-06, "loss": 0.0286, "step": 464 }, { "epoch": 0.1681127982646421, "grad_norm": 0.6370610328094083, "learning_rate": 9.968787743683008e-06, "loss": 0.0435, "step": 465 }, { "epoch": 0.16847433116413593, "grad_norm": 0.07257478844639459, "learning_rate": 9.968453382602378e-06, "loss": 0.0056, "step": 466 }, { "epoch": 0.1688358640636298, "grad_norm": 1.3637553443272885, "learning_rate": 9.968117245797559e-06, "loss": 0.2344, "step": 467 }, { "epoch": 0.16919739696312364, "grad_norm": 0.061096542134638, "learning_rate": 9.967779333388682e-06, "loss": 0.0031, "step": 468 }, { "epoch": 0.1695589298626175, "grad_norm": 0.11311754007750115, "learning_rate": 9.967439645496523e-06, "loss": 0.0072, "step": 469 }, { "epoch": 0.16992046276211134, "grad_norm": 1.2586278891917986, "learning_rate": 9.967098182242482e-06, "loss": 0.0693, "step": 470 }, { "epoch": 0.1702819956616052, "grad_norm": 13.424290244831134, "learning_rate": 9.9667549437486e-06, "loss": 3.7031, "step": 471 }, { "epoch": 0.17064352856109907, "grad_norm": 0.07076479579624033, "learning_rate": 9.966409930137548e-06, "loss": 0.0063, "step": 472 }, { "epoch": 0.1710050614605929, "grad_norm": 1.355769869831535, "learning_rate": 9.966063141532634e-06, "loss": 0.3164, "step": 473 }, { "epoch": 0.17136659436008678, "grad_norm": 1.3331175837074754, "learning_rate": 9.965714578057803e-06, "loss": 0.293, "step": 474 }, { "epoch": 0.17172812725958062, "grad_norm": 0.29768031328635153, "learning_rate": 9.965364239837629e-06, "loss": 0.0164, "step": 475 }, { "epoch": 0.17208966015907448, "grad_norm": 0.08788793953769752, "learning_rate": 9.965012126997325e-06, "loss": 0.0071, "step": 476 }, { "epoch": 0.17245119305856832, "grad_norm": 0.07988467273793763, "learning_rate": 9.964658239662734e-06, "loss": 0.0071, "step": 477 }, { "epoch": 0.17281272595806219, "grad_norm": 0.5355837571199866, "learning_rate": 9.964302577960334e-06, "loss": 0.0352, "step": 478 }, { "epoch": 0.17317425885755602, "grad_norm": 0.05871275084535953, "learning_rate": 9.963945142017241e-06, "loss": 0.0056, "step": 479 }, { "epoch": 0.1735357917570499, "grad_norm": 1.1508680976514487, "learning_rate": 9.963585931961203e-06, "loss": 0.1143, "step": 480 }, { "epoch": 0.17389732465654376, "grad_norm": 0.09066021094920092, "learning_rate": 9.963224947920601e-06, "loss": 0.0081, "step": 481 }, { "epoch": 0.1742588575560376, "grad_norm": 0.6715879448819038, "learning_rate": 9.962862190024449e-06, "loss": 0.0579, "step": 482 }, { "epoch": 0.17462039045553146, "grad_norm": 0.08077103438339728, "learning_rate": 9.962497658402396e-06, "loss": 0.0071, "step": 483 }, { "epoch": 0.1749819233550253, "grad_norm": 0.34418192249485874, "learning_rate": 9.96213135318473e-06, "loss": 0.0145, "step": 484 }, { "epoch": 0.17534345625451916, "grad_norm": 1.1686019468248323, "learning_rate": 9.961763274502364e-06, "loss": 0.2578, "step": 485 }, { "epoch": 0.175704989154013, "grad_norm": 0.6601191593030189, "learning_rate": 9.961393422486851e-06, "loss": 0.0693, "step": 486 }, { "epoch": 0.17606652205350687, "grad_norm": 0.14828998288593123, "learning_rate": 9.961021797270376e-06, "loss": 0.0145, "step": 487 }, { "epoch": 0.17642805495300073, "grad_norm": 0.1830955387013095, "learning_rate": 9.960648398985758e-06, "loss": 0.0101, "step": 488 }, { "epoch": 0.17678958785249457, "grad_norm": 0.5490493686848521, "learning_rate": 9.960273227766448e-06, "loss": 0.0435, "step": 489 }, { "epoch": 0.17715112075198844, "grad_norm": 0.40936954160892114, "learning_rate": 9.959896283746535e-06, "loss": 0.0352, "step": 490 }, { "epoch": 0.17751265365148228, "grad_norm": 0.8750694437536705, "learning_rate": 9.959517567060736e-06, "loss": 0.0693, "step": 491 }, { "epoch": 0.17787418655097614, "grad_norm": 0.5621120734875075, "learning_rate": 9.959137077844405e-06, "loss": 0.0317, "step": 492 }, { "epoch": 0.17823571945046998, "grad_norm": 0.763328590475873, "learning_rate": 9.95875481623353e-06, "loss": 0.0286, "step": 493 }, { "epoch": 0.17859725234996385, "grad_norm": 1.0774419922901806, "learning_rate": 9.958370782364728e-06, "loss": 0.1504, "step": 494 }, { "epoch": 0.1789587852494577, "grad_norm": 0.20837112353884174, "learning_rate": 9.957984976375258e-06, "loss": 0.0182, "step": 495 }, { "epoch": 0.17932031814895155, "grad_norm": 0.6979686047432003, "learning_rate": 9.957597398403e-06, "loss": 0.0354, "step": 496 }, { "epoch": 0.17968185104844542, "grad_norm": 0.27571785726679643, "learning_rate": 9.95720804858648e-06, "loss": 0.0256, "step": 497 }, { "epoch": 0.18004338394793926, "grad_norm": 0.2817913825787927, "learning_rate": 9.95681692706485e-06, "loss": 0.0229, "step": 498 }, { "epoch": 0.18040491684743312, "grad_norm": 1.0522620775543872, "learning_rate": 9.956424033977896e-06, "loss": 0.2236, "step": 499 }, { "epoch": 0.18076644974692696, "grad_norm": 0.03234300888663396, "learning_rate": 9.956029369466038e-06, "loss": 0.0025, "step": 500 }, { "epoch": 0.18112798264642083, "grad_norm": 0.40179768211967903, "learning_rate": 9.955632933670329e-06, "loss": 0.0256, "step": 501 }, { "epoch": 0.18148951554591466, "grad_norm": 1.0077767716392925, "learning_rate": 9.955234726732455e-06, "loss": 0.1143, "step": 502 }, { "epoch": 0.18185104844540853, "grad_norm": 0.2801458732133164, "learning_rate": 9.954834748794737e-06, "loss": 0.0256, "step": 503 }, { "epoch": 0.1822125813449024, "grad_norm": 2.4637209834817306, "learning_rate": 9.954433000000123e-06, "loss": 0.1055, "step": 504 }, { "epoch": 0.18257411424439624, "grad_norm": 0.2539369989064063, "learning_rate": 9.954029480492202e-06, "loss": 0.0203, "step": 505 }, { "epoch": 0.1829356471438901, "grad_norm": 0.41772707344131405, "learning_rate": 9.95362419041519e-06, "loss": 0.0352, "step": 506 }, { "epoch": 0.18329718004338394, "grad_norm": 0.6074147664746943, "learning_rate": 9.953217129913939e-06, "loss": 0.0354, "step": 507 }, { "epoch": 0.1836587129428778, "grad_norm": 13.144914497041693, "learning_rate": 9.95280829913393e-06, "loss": 1.8594, "step": 508 }, { "epoch": 0.18402024584237164, "grad_norm": 0.3824971135013609, "learning_rate": 9.95239769822128e-06, "loss": 0.0206, "step": 509 }, { "epoch": 0.1843817787418655, "grad_norm": 0.34220766315139245, "learning_rate": 9.951985327322738e-06, "loss": 0.0317, "step": 510 }, { "epoch": 0.18474331164135938, "grad_norm": 1.1338389085563783, "learning_rate": 9.951571186585685e-06, "loss": 0.1055, "step": 511 }, { "epoch": 0.18510484454085321, "grad_norm": 1.0839087886515977, "learning_rate": 9.951155276158134e-06, "loss": 0.0762, "step": 512 }, { "epoch": 0.18546637744034708, "grad_norm": 0.02687256229560281, "learning_rate": 9.950737596188733e-06, "loss": 0.0016, "step": 513 }, { "epoch": 0.18582791033984092, "grad_norm": 1.044144375503699, "learning_rate": 9.950318146826759e-06, "loss": 0.0981, "step": 514 }, { "epoch": 0.18618944323933478, "grad_norm": 1.5381640464967006, "learning_rate": 9.949896928222126e-06, "loss": 0.0903, "step": 515 }, { "epoch": 0.18655097613882862, "grad_norm": 1.457025472673116, "learning_rate": 9.949473940525374e-06, "loss": 0.1914, "step": 516 }, { "epoch": 0.1869125090383225, "grad_norm": 0.016152150998953365, "learning_rate": 9.94904918388768e-06, "loss": 0.0009, "step": 517 }, { "epoch": 0.18727404193781635, "grad_norm": 0.4357757068519825, "learning_rate": 9.948622658460853e-06, "loss": 0.0206, "step": 518 }, { "epoch": 0.1876355748373102, "grad_norm": 0.10839217231849783, "learning_rate": 9.948194364397332e-06, "loss": 0.0065, "step": 519 }, { "epoch": 0.18799710773680406, "grad_norm": 0.5488163111737587, "learning_rate": 9.94776430185019e-06, "loss": 0.0479, "step": 520 }, { "epoch": 0.1883586406362979, "grad_norm": 1.1785470810973853, "learning_rate": 9.94733247097313e-06, "loss": 0.1914, "step": 521 }, { "epoch": 0.18872017353579176, "grad_norm": 0.042568323681461455, "learning_rate": 9.946898871920489e-06, "loss": 0.0029, "step": 522 }, { "epoch": 0.1890817064352856, "grad_norm": 1.0159240752471912, "learning_rate": 9.946463504847235e-06, "loss": 0.2012, "step": 523 }, { "epoch": 0.18944323933477947, "grad_norm": 0.07367939750764008, "learning_rate": 9.946026369908968e-06, "loss": 0.0057, "step": 524 }, { "epoch": 0.1898047722342733, "grad_norm": 0.3066147960906041, "learning_rate": 9.945587467261922e-06, "loss": 0.0164, "step": 525 }, { "epoch": 0.19016630513376717, "grad_norm": 0.1881977492518138, "learning_rate": 9.945146797062957e-06, "loss": 0.0182, "step": 526 }, { "epoch": 0.19052783803326104, "grad_norm": 1.8767186920130172, "learning_rate": 9.94470435946957e-06, "loss": 0.1699, "step": 527 }, { "epoch": 0.19088937093275488, "grad_norm": 0.8134263578293847, "learning_rate": 9.944260154639891e-06, "loss": 0.1055, "step": 528 }, { "epoch": 0.19125090383224874, "grad_norm": 0.7573381127131121, "learning_rate": 9.943814182732674e-06, "loss": 0.0579, "step": 529 }, { "epoch": 0.19161243673174258, "grad_norm": 0.24740253197034265, "learning_rate": 9.943366443907312e-06, "loss": 0.0203, "step": 530 }, { "epoch": 0.19197396963123645, "grad_norm": 1.2245245602421206, "learning_rate": 9.942916938323825e-06, "loss": 0.1406, "step": 531 }, { "epoch": 0.19233550253073028, "grad_norm": 1.8107721452540593, "learning_rate": 9.94246566614287e-06, "loss": 0.1143, "step": 532 }, { "epoch": 0.19269703543022415, "grad_norm": 0.2693212594530967, "learning_rate": 9.942012627525728e-06, "loss": 0.0283, "step": 533 }, { "epoch": 0.19305856832971802, "grad_norm": 1.925475255401272, "learning_rate": 9.941557822634316e-06, "loss": 0.2012, "step": 534 }, { "epoch": 0.19342010122921185, "grad_norm": 0.8838528197573985, "learning_rate": 9.94110125163118e-06, "loss": 0.1226, "step": 535 }, { "epoch": 0.19378163412870572, "grad_norm": 0.7852667006717442, "learning_rate": 9.9406429146795e-06, "loss": 0.083, "step": 536 }, { "epoch": 0.19414316702819956, "grad_norm": 0.2238861946542774, "learning_rate": 9.940182811943084e-06, "loss": 0.013, "step": 537 }, { "epoch": 0.19450469992769343, "grad_norm": 0.5984364353218437, "learning_rate": 9.939720943586376e-06, "loss": 0.0579, "step": 538 }, { "epoch": 0.19486623282718726, "grad_norm": 0.40304031307851074, "learning_rate": 9.939257309774442e-06, "loss": 0.0388, "step": 539 }, { "epoch": 0.19522776572668113, "grad_norm": 0.363354657433479, "learning_rate": 9.93879191067299e-06, "loss": 0.0388, "step": 540 }, { "epoch": 0.195589298626175, "grad_norm": 0.020637256781322194, "learning_rate": 9.93832474644835e-06, "loss": 0.0011, "step": 541 }, { "epoch": 0.19595083152566883, "grad_norm": 0.402591804928837, "learning_rate": 9.93785581726749e-06, "loss": 0.043, "step": 542 }, { "epoch": 0.1963123644251627, "grad_norm": 0.16551250700908143, "learning_rate": 9.937385123298002e-06, "loss": 0.0092, "step": 543 }, { "epoch": 0.19667389732465654, "grad_norm": 0.09938728864918632, "learning_rate": 9.936912664708112e-06, "loss": 0.0073, "step": 544 }, { "epoch": 0.1970354302241504, "grad_norm": 0.6859909674711766, "learning_rate": 9.936438441666678e-06, "loss": 0.0432, "step": 545 }, { "epoch": 0.19739696312364424, "grad_norm": 0.5144059925040487, "learning_rate": 9.935962454343188e-06, "loss": 0.0527, "step": 546 }, { "epoch": 0.1977584960231381, "grad_norm": 0.8083353606232296, "learning_rate": 9.935484702907757e-06, "loss": 0.1055, "step": 547 }, { "epoch": 0.19812002892263195, "grad_norm": 0.9972335299294217, "learning_rate": 9.935005187531135e-06, "loss": 0.0762, "step": 548 }, { "epoch": 0.1984815618221258, "grad_norm": 0.1850508432868584, "learning_rate": 9.934523908384701e-06, "loss": 0.013, "step": 549 }, { "epoch": 0.19884309472161968, "grad_norm": 1.0014887893662228, "learning_rate": 9.934040865640463e-06, "loss": 0.083, "step": 550 }, { "epoch": 0.19920462762111352, "grad_norm": 0.3428006124117098, "learning_rate": 9.933556059471061e-06, "loss": 0.0315, "step": 551 }, { "epoch": 0.19956616052060738, "grad_norm": 0.4573266589035884, "learning_rate": 9.933069490049765e-06, "loss": 0.0391, "step": 552 }, { "epoch": 0.19992769342010122, "grad_norm": 0.47114174901708705, "learning_rate": 9.932581157550475e-06, "loss": 0.043, "step": 553 }, { "epoch": 0.2002892263195951, "grad_norm": 0.2735249311051171, "learning_rate": 9.932091062147717e-06, "loss": 0.0254, "step": 554 }, { "epoch": 0.20065075921908893, "grad_norm": 1.1658817306620102, "learning_rate": 9.931599204016658e-06, "loss": 0.1055, "step": 555 }, { "epoch": 0.2010122921185828, "grad_norm": 0.5665464581329824, "learning_rate": 9.931105583333082e-06, "loss": 0.0283, "step": 556 }, { "epoch": 0.20137382501807666, "grad_norm": 0.21492664591553653, "learning_rate": 9.930610200273412e-06, "loss": 0.0227, "step": 557 }, { "epoch": 0.2017353579175705, "grad_norm": 0.037830588609202236, "learning_rate": 9.930113055014696e-06, "loss": 0.0022, "step": 558 }, { "epoch": 0.20209689081706436, "grad_norm": 0.5551014765393786, "learning_rate": 9.929614147734617e-06, "loss": 0.0522, "step": 559 }, { "epoch": 0.2024584237165582, "grad_norm": 0.0675855769971716, "learning_rate": 9.92911347861148e-06, "loss": 0.0071, "step": 560 }, { "epoch": 0.20281995661605207, "grad_norm": 1.1328411717761049, "learning_rate": 9.928611047824226e-06, "loss": 0.1055, "step": 561 }, { "epoch": 0.2031814895155459, "grad_norm": 0.4596602321251785, "learning_rate": 9.928106855552424e-06, "loss": 0.0256, "step": 562 }, { "epoch": 0.20354302241503977, "grad_norm": 0.11737760630539908, "learning_rate": 9.927600901976273e-06, "loss": 0.0129, "step": 563 }, { "epoch": 0.2039045553145336, "grad_norm": 1.0667822521092951, "learning_rate": 9.927093187276597e-06, "loss": 0.1807, "step": 564 }, { "epoch": 0.20426608821402747, "grad_norm": 0.2168083510907658, "learning_rate": 9.926583711634857e-06, "loss": 0.0131, "step": 565 }, { "epoch": 0.20462762111352134, "grad_norm": 0.616640343961535, "learning_rate": 9.926072475233139e-06, "loss": 0.0391, "step": 566 }, { "epoch": 0.20498915401301518, "grad_norm": 1.0236598379103385, "learning_rate": 9.925559478254157e-06, "loss": 0.1807, "step": 567 }, { "epoch": 0.20535068691250905, "grad_norm": 0.8597302212951577, "learning_rate": 9.925044720881257e-06, "loss": 0.0479, "step": 568 }, { "epoch": 0.20571221981200288, "grad_norm": 1.561897811170505, "learning_rate": 9.924528203298413e-06, "loss": 0.1309, "step": 569 }, { "epoch": 0.20607375271149675, "grad_norm": 0.032176584374132795, "learning_rate": 9.924009925690229e-06, "loss": 0.0022, "step": 570 }, { "epoch": 0.2064352856109906, "grad_norm": 1.7714368081920286, "learning_rate": 9.923489888241936e-06, "loss": 0.1914, "step": 571 }, { "epoch": 0.20679681851048445, "grad_norm": 0.3329955970064571, "learning_rate": 9.922968091139397e-06, "loss": 0.0228, "step": 572 }, { "epoch": 0.20715835140997832, "grad_norm": 0.10581148667463856, "learning_rate": 9.9224445345691e-06, "loss": 0.0092, "step": 573 }, { "epoch": 0.20751988430947216, "grad_norm": 0.04188101273148882, "learning_rate": 9.921919218718165e-06, "loss": 0.0025, "step": 574 }, { "epoch": 0.20788141720896602, "grad_norm": 1.6603492430433033, "learning_rate": 9.921392143774342e-06, "loss": 0.1143, "step": 575 }, { "epoch": 0.20824295010845986, "grad_norm": 0.35477205935032613, "learning_rate": 9.920863309926003e-06, "loss": 0.0204, "step": 576 }, { "epoch": 0.20860448300795373, "grad_norm": 0.1101480010895025, "learning_rate": 9.920332717362157e-06, "loss": 0.0072, "step": 577 }, { "epoch": 0.20896601590744757, "grad_norm": 0.38212518839831494, "learning_rate": 9.919800366272436e-06, "loss": 0.0254, "step": 578 }, { "epoch": 0.20932754880694143, "grad_norm": 0.1538022153617307, "learning_rate": 9.919266256847102e-06, "loss": 0.0072, "step": 579 }, { "epoch": 0.2096890817064353, "grad_norm": 0.17182720983813848, "learning_rate": 9.918730389277046e-06, "loss": 0.0162, "step": 580 }, { "epoch": 0.21005061460592914, "grad_norm": 0.07793604987055368, "learning_rate": 9.918192763753788e-06, "loss": 0.0057, "step": 581 }, { "epoch": 0.210412147505423, "grad_norm": 0.2656487351531011, "learning_rate": 9.917653380469475e-06, "loss": 0.0227, "step": 582 }, { "epoch": 0.21077368040491684, "grad_norm": 1.2024971838480973, "learning_rate": 9.91711223961688e-06, "loss": 0.083, "step": 583 }, { "epoch": 0.2111352133044107, "grad_norm": 0.03642456337057671, "learning_rate": 9.916569341389405e-06, "loss": 0.0025, "step": 584 }, { "epoch": 0.21149674620390455, "grad_norm": 0.11494431341432185, "learning_rate": 9.91602468598109e-06, "loss": 0.0081, "step": 585 }, { "epoch": 0.2118582791033984, "grad_norm": 1.0279167493340438, "learning_rate": 9.915478273586587e-06, "loss": 0.1914, "step": 586 }, { "epoch": 0.21221981200289225, "grad_norm": 0.1894062428463986, "learning_rate": 9.914930104401187e-06, "loss": 0.0116, "step": 587 }, { "epoch": 0.21258134490238612, "grad_norm": 0.14985664424856426, "learning_rate": 9.914380178620807e-06, "loss": 0.0162, "step": 588 }, { "epoch": 0.21294287780187998, "grad_norm": 0.29528123413472873, "learning_rate": 9.913828496441985e-06, "loss": 0.0116, "step": 589 }, { "epoch": 0.21330441070137382, "grad_norm": 0.484300721878795, "learning_rate": 9.913275058061898e-06, "loss": 0.0317, "step": 590 }, { "epoch": 0.21366594360086769, "grad_norm": 0.2092275982744164, "learning_rate": 9.91271986367834e-06, "loss": 0.0183, "step": 591 }, { "epoch": 0.21402747650036152, "grad_norm": 0.2556660800473309, "learning_rate": 9.91216291348974e-06, "loss": 0.0204, "step": 592 }, { "epoch": 0.2143890093998554, "grad_norm": 1.070506565739671, "learning_rate": 9.911604207695153e-06, "loss": 0.1699, "step": 593 }, { "epoch": 0.21475054229934923, "grad_norm": 0.5030364257141847, "learning_rate": 9.911043746494258e-06, "loss": 0.0354, "step": 594 }, { "epoch": 0.2151120751988431, "grad_norm": 0.07463619935760867, "learning_rate": 9.910481530087363e-06, "loss": 0.008, "step": 595 }, { "epoch": 0.21547360809833696, "grad_norm": 0.8973395904803644, "learning_rate": 9.909917558675406e-06, "loss": 0.0635, "step": 596 }, { "epoch": 0.2158351409978308, "grad_norm": 0.09156374546581213, "learning_rate": 9.90935183245995e-06, "loss": 0.0045, "step": 597 }, { "epoch": 0.21619667389732466, "grad_norm": 2.1032547780618467, "learning_rate": 9.908784351643186e-06, "loss": 0.1699, "step": 598 }, { "epoch": 0.2165582067968185, "grad_norm": 0.9438077283003032, "learning_rate": 9.90821511642793e-06, "loss": 0.2236, "step": 599 }, { "epoch": 0.21691973969631237, "grad_norm": 0.1184608207583573, "learning_rate": 9.907644127017627e-06, "loss": 0.0114, "step": 600 }, { "epoch": 0.2172812725958062, "grad_norm": 0.897882869261813, "learning_rate": 9.907071383616349e-06, "loss": 0.1309, "step": 601 }, { "epoch": 0.21764280549530007, "grad_norm": 0.1178498187114061, "learning_rate": 9.906496886428793e-06, "loss": 0.0103, "step": 602 }, { "epoch": 0.21800433839479394, "grad_norm": 2.8566227507375666, "learning_rate": 9.905920635660286e-06, "loss": 0.1143, "step": 603 }, { "epoch": 0.21836587129428778, "grad_norm": 0.23372196432625408, "learning_rate": 9.90534263151678e-06, "loss": 0.0227, "step": 604 }, { "epoch": 0.21872740419378164, "grad_norm": 0.29399452381778884, "learning_rate": 9.904762874204853e-06, "loss": 0.0254, "step": 605 }, { "epoch": 0.21908893709327548, "grad_norm": 0.19554491207026506, "learning_rate": 9.90418136393171e-06, "loss": 0.0182, "step": 606 }, { "epoch": 0.21945046999276935, "grad_norm": 0.4202702353333835, "learning_rate": 9.90359810090518e-06, "loss": 0.0228, "step": 607 }, { "epoch": 0.2198120028922632, "grad_norm": 0.3333661376399739, "learning_rate": 9.903013085333727e-06, "loss": 0.0317, "step": 608 }, { "epoch": 0.22017353579175705, "grad_norm": 4.3756167123824445, "learning_rate": 9.902426317426428e-06, "loss": 0.2695, "step": 609 }, { "epoch": 0.2205350686912509, "grad_norm": 0.1966175911676928, "learning_rate": 9.901837797393e-06, "loss": 0.013, "step": 610 }, { "epoch": 0.22089660159074476, "grad_norm": 0.09701780395620066, "learning_rate": 9.901247525443778e-06, "loss": 0.0073, "step": 611 }, { "epoch": 0.22125813449023862, "grad_norm": 0.3971487468618244, "learning_rate": 9.900655501789725e-06, "loss": 0.0349, "step": 612 }, { "epoch": 0.22161966738973246, "grad_norm": 0.7622455284466854, "learning_rate": 9.900061726642428e-06, "loss": 0.1504, "step": 613 }, { "epoch": 0.22198120028922633, "grad_norm": 0.0427112175000243, "learning_rate": 9.899466200214105e-06, "loss": 0.0025, "step": 614 }, { "epoch": 0.22234273318872017, "grad_norm": 0.03256553877451598, "learning_rate": 9.898868922717598e-06, "loss": 0.0022, "step": 615 }, { "epoch": 0.22270426608821403, "grad_norm": 0.7051628704462445, "learning_rate": 9.89826989436637e-06, "loss": 0.0391, "step": 616 }, { "epoch": 0.22306579898770787, "grad_norm": 0.05953655269943623, "learning_rate": 9.897669115374516e-06, "loss": 0.004, "step": 617 }, { "epoch": 0.22342733188720174, "grad_norm": 0.3349950599408771, "learning_rate": 9.897066585956752e-06, "loss": 0.0315, "step": 618 }, { "epoch": 0.2237888647866956, "grad_norm": 0.3764996897699481, "learning_rate": 9.896462306328425e-06, "loss": 0.0283, "step": 619 }, { "epoch": 0.22415039768618944, "grad_norm": 0.24641261381050977, "learning_rate": 9.895856276705504e-06, "loss": 0.0254, "step": 620 }, { "epoch": 0.2245119305856833, "grad_norm": 0.3389325243309901, "learning_rate": 9.895248497304581e-06, "loss": 0.0352, "step": 621 }, { "epoch": 0.22487346348517714, "grad_norm": 0.6325819199561818, "learning_rate": 9.89463896834288e-06, "loss": 0.0579, "step": 622 }, { "epoch": 0.225234996384671, "grad_norm": 1.2180542551274387, "learning_rate": 9.894027690038244e-06, "loss": 0.0635, "step": 623 }, { "epoch": 0.22559652928416485, "grad_norm": 0.5174349238545933, "learning_rate": 9.893414662609144e-06, "loss": 0.0205, "step": 624 }, { "epoch": 0.22595806218365871, "grad_norm": 0.09556053234027861, "learning_rate": 9.892799886274676e-06, "loss": 0.005, "step": 625 }, { "epoch": 0.22631959508315258, "grad_norm": 0.221020450373278, "learning_rate": 9.892183361254561e-06, "loss": 0.0204, "step": 626 }, { "epoch": 0.22668112798264642, "grad_norm": 0.7801142441387207, "learning_rate": 9.891565087769145e-06, "loss": 0.0476, "step": 627 }, { "epoch": 0.22704266088214028, "grad_norm": 1.3319575115528866, "learning_rate": 9.890945066039402e-06, "loss": 0.293, "step": 628 }, { "epoch": 0.22740419378163412, "grad_norm": 0.9691350261583744, "learning_rate": 9.890323296286923e-06, "loss": 0.1504, "step": 629 }, { "epoch": 0.227765726681128, "grad_norm": 0.12298531627835381, "learning_rate": 9.889699778733928e-06, "loss": 0.0143, "step": 630 }, { "epoch": 0.22812725958062183, "grad_norm": 0.9679219391764615, "learning_rate": 9.889074513603265e-06, "loss": 0.1602, "step": 631 }, { "epoch": 0.2284887924801157, "grad_norm": 0.08399750910206483, "learning_rate": 9.888447501118404e-06, "loss": 0.0045, "step": 632 }, { "epoch": 0.22885032537960953, "grad_norm": 0.1969427524524217, "learning_rate": 9.887818741503436e-06, "loss": 0.0203, "step": 633 }, { "epoch": 0.2292118582791034, "grad_norm": 0.49237904462904186, "learning_rate": 9.887188234983082e-06, "loss": 0.0317, "step": 634 }, { "epoch": 0.22957339117859726, "grad_norm": 0.21181412312923956, "learning_rate": 9.886555981782685e-06, "loss": 0.0227, "step": 635 }, { "epoch": 0.2299349240780911, "grad_norm": 0.16587287236029039, "learning_rate": 9.885921982128211e-06, "loss": 0.0161, "step": 636 }, { "epoch": 0.23029645697758497, "grad_norm": 1.1645648538835998, "learning_rate": 9.88528623624625e-06, "loss": 0.1807, "step": 637 }, { "epoch": 0.2306579898770788, "grad_norm": 0.20421343876158612, "learning_rate": 9.884648744364021e-06, "loss": 0.0181, "step": 638 }, { "epoch": 0.23101952277657267, "grad_norm": 0.2845683966092849, "learning_rate": 9.884009506709361e-06, "loss": 0.0283, "step": 639 }, { "epoch": 0.2313810556760665, "grad_norm": 0.9766529548920172, "learning_rate": 9.883368523510734e-06, "loss": 0.1504, "step": 640 }, { "epoch": 0.23174258857556038, "grad_norm": 0.0346107586358212, "learning_rate": 9.882725794997228e-06, "loss": 0.0022, "step": 641 }, { "epoch": 0.23210412147505424, "grad_norm": 0.8980986633707699, "learning_rate": 9.882081321398554e-06, "loss": 0.1143, "step": 642 }, { "epoch": 0.23246565437454808, "grad_norm": 0.5118128699745492, "learning_rate": 9.881435102945043e-06, "loss": 0.0693, "step": 643 }, { "epoch": 0.23282718727404195, "grad_norm": 1.232898988584782, "learning_rate": 9.880787139867659e-06, "loss": 0.0762, "step": 644 }, { "epoch": 0.23318872017353579, "grad_norm": 1.3724142667776786, "learning_rate": 9.88013743239798e-06, "loss": 0.2363, "step": 645 }, { "epoch": 0.23355025307302965, "grad_norm": 0.989314193334668, "learning_rate": 9.879485980768213e-06, "loss": 0.1055, "step": 646 }, { "epoch": 0.2339117859725235, "grad_norm": 0.6223523891076347, "learning_rate": 9.878832785211187e-06, "loss": 0.0579, "step": 647 }, { "epoch": 0.23427331887201736, "grad_norm": 0.3506696145149089, "learning_rate": 9.878177845960351e-06, "loss": 0.0205, "step": 648 }, { "epoch": 0.2346348517715112, "grad_norm": 0.8558459928332556, "learning_rate": 9.877521163249785e-06, "loss": 0.0579, "step": 649 }, { "epoch": 0.23499638467100506, "grad_norm": 0.21776272584590448, "learning_rate": 9.876862737314184e-06, "loss": 0.0254, "step": 650 }, { "epoch": 0.23535791757049893, "grad_norm": 0.19730325164510615, "learning_rate": 9.876202568388868e-06, "loss": 0.0203, "step": 651 }, { "epoch": 0.23571945046999276, "grad_norm": 0.9673410365392268, "learning_rate": 9.875540656709784e-06, "loss": 0.0635, "step": 652 }, { "epoch": 0.23608098336948663, "grad_norm": 0.19242653658147238, "learning_rate": 9.874877002513499e-06, "loss": 0.0145, "step": 653 }, { "epoch": 0.23644251626898047, "grad_norm": 0.061490248859548687, "learning_rate": 9.874211606037201e-06, "loss": 0.004, "step": 654 }, { "epoch": 0.23680404916847433, "grad_norm": 0.3435247803924811, "learning_rate": 9.873544467518705e-06, "loss": 0.0283, "step": 655 }, { "epoch": 0.23716558206796817, "grad_norm": 0.37394547122969385, "learning_rate": 9.872875587196444e-06, "loss": 0.0349, "step": 656 }, { "epoch": 0.23752711496746204, "grad_norm": 1.0728419019965214, "learning_rate": 9.872204965309478e-06, "loss": 0.0579, "step": 657 }, { "epoch": 0.2378886478669559, "grad_norm": 0.13916585562008252, "learning_rate": 9.871532602097483e-06, "loss": 0.0143, "step": 658 }, { "epoch": 0.23825018076644974, "grad_norm": 0.4853724477065112, "learning_rate": 9.870858497800766e-06, "loss": 0.0432, "step": 659 }, { "epoch": 0.2386117136659436, "grad_norm": 0.05842418996329717, "learning_rate": 9.87018265266025e-06, "loss": 0.0051, "step": 660 }, { "epoch": 0.23897324656543745, "grad_norm": 0.3433684106965482, "learning_rate": 9.86950506691748e-06, "loss": 0.0283, "step": 661 }, { "epoch": 0.2393347794649313, "grad_norm": 0.8961303587556855, "learning_rate": 9.868825740814627e-06, "loss": 0.1309, "step": 662 }, { "epoch": 0.23969631236442515, "grad_norm": 1.2565262319149237, "learning_rate": 9.868144674594483e-06, "loss": 0.1807, "step": 663 }, { "epoch": 0.24005784526391902, "grad_norm": 0.2095762973858583, "learning_rate": 9.867461868500459e-06, "loss": 0.0204, "step": 664 }, { "epoch": 0.24041937816341288, "grad_norm": 0.07704455099943051, "learning_rate": 9.866777322776591e-06, "loss": 0.005, "step": 665 }, { "epoch": 0.24078091106290672, "grad_norm": 0.645501540713082, "learning_rate": 9.866091037667534e-06, "loss": 0.0527, "step": 666 }, { "epoch": 0.2411424439624006, "grad_norm": 0.21156210175274467, "learning_rate": 9.86540301341857e-06, "loss": 0.0161, "step": 667 }, { "epoch": 0.24150397686189443, "grad_norm": 0.8545890734364003, "learning_rate": 9.86471325027559e-06, "loss": 0.1699, "step": 668 }, { "epoch": 0.2418655097613883, "grad_norm": 0.15956974137299504, "learning_rate": 9.864021748485126e-06, "loss": 0.0161, "step": 669 }, { "epoch": 0.24222704266088213, "grad_norm": 1.0575440881199376, "learning_rate": 9.863328508294313e-06, "loss": 0.1602, "step": 670 }, { "epoch": 0.242588575560376, "grad_norm": 3.055885502243391, "learning_rate": 9.862633529950918e-06, "loss": 0.0762, "step": 671 }, { "epoch": 0.24295010845986983, "grad_norm": 0.06029897670973075, "learning_rate": 9.861936813703327e-06, "loss": 0.0057, "step": 672 }, { "epoch": 0.2433116413593637, "grad_norm": 4.98093696913841, "learning_rate": 9.861238359800543e-06, "loss": 0.1914, "step": 673 }, { "epoch": 0.24367317425885757, "grad_norm": 0.22967741622488322, "learning_rate": 9.860538168492198e-06, "loss": 0.0203, "step": 674 }, { "epoch": 0.2440347071583514, "grad_norm": 0.14648489364622558, "learning_rate": 9.859836240028534e-06, "loss": 0.0115, "step": 675 }, { "epoch": 0.24439624005784527, "grad_norm": 0.30788707027346124, "learning_rate": 9.859132574660426e-06, "loss": 0.0349, "step": 676 }, { "epoch": 0.2447577729573391, "grad_norm": 0.1425042248992247, "learning_rate": 9.85842717263936e-06, "loss": 0.0181, "step": 677 }, { "epoch": 0.24511930585683298, "grad_norm": 0.3036239745499691, "learning_rate": 9.857720034217446e-06, "loss": 0.0164, "step": 678 }, { "epoch": 0.2454808387563268, "grad_norm": 0.07426061630120113, "learning_rate": 9.857011159647419e-06, "loss": 0.0064, "step": 679 }, { "epoch": 0.24584237165582068, "grad_norm": 0.2101487256856219, "learning_rate": 9.85630054918263e-06, "loss": 0.0227, "step": 680 }, { "epoch": 0.24620390455531455, "grad_norm": 0.4231736415683055, "learning_rate": 9.855588203077047e-06, "loss": 0.0349, "step": 681 }, { "epoch": 0.24656543745480838, "grad_norm": 0.05479888086313799, "learning_rate": 9.854874121585266e-06, "loss": 0.0031, "step": 682 }, { "epoch": 0.24692697035430225, "grad_norm": 0.19895454325665832, "learning_rate": 9.854158304962498e-06, "loss": 0.0164, "step": 683 }, { "epoch": 0.2472885032537961, "grad_norm": 0.8308187316323804, "learning_rate": 9.853440753464578e-06, "loss": 0.1807, "step": 684 }, { "epoch": 0.24765003615328995, "grad_norm": 0.9017042065444678, "learning_rate": 9.852721467347954e-06, "loss": 0.1406, "step": 685 }, { "epoch": 0.2480115690527838, "grad_norm": 0.81076550151506, "learning_rate": 9.852000446869704e-06, "loss": 0.0635, "step": 686 }, { "epoch": 0.24837310195227766, "grad_norm": 0.40640177122194154, "learning_rate": 9.851277692287518e-06, "loss": 0.0205, "step": 687 }, { "epoch": 0.24873463485177152, "grad_norm": 0.28494155548730016, "learning_rate": 9.850553203859707e-06, "loss": 0.0349, "step": 688 }, { "epoch": 0.24909616775126536, "grad_norm": 0.8365702886724805, "learning_rate": 9.849826981845206e-06, "loss": 0.083, "step": 689 }, { "epoch": 0.24945770065075923, "grad_norm": 0.17532620945516664, "learning_rate": 9.849099026503565e-06, "loss": 0.0227, "step": 690 }, { "epoch": 0.24981923355025307, "grad_norm": 0.826207362950122, "learning_rate": 9.848369338094955e-06, "loss": 0.0977, "step": 691 }, { "epoch": 0.2501807664497469, "grad_norm": 0.39414328113923564, "learning_rate": 9.847637916880167e-06, "loss": 0.0476, "step": 692 }, { "epoch": 0.25054229934924077, "grad_norm": 0.6462702886175999, "learning_rate": 9.84690476312061e-06, "loss": 0.0757, "step": 693 }, { "epoch": 0.25090383224873464, "grad_norm": 0.7373471285549597, "learning_rate": 9.846169877078315e-06, "loss": 0.0635, "step": 694 }, { "epoch": 0.2512653651482285, "grad_norm": 0.41527022886017795, "learning_rate": 9.845433259015929e-06, "loss": 0.0435, "step": 695 }, { "epoch": 0.25162689804772237, "grad_norm": 0.22113815850277616, "learning_rate": 9.844694909196717e-06, "loss": 0.0183, "step": 696 }, { "epoch": 0.2519884309472162, "grad_norm": 0.7191365620948735, "learning_rate": 9.843954827884568e-06, "loss": 0.1406, "step": 697 }, { "epoch": 0.25234996384671005, "grad_norm": 0.14557497460897897, "learning_rate": 9.843213015343985e-06, "loss": 0.0081, "step": 698 }, { "epoch": 0.2527114967462039, "grad_norm": 0.23390522470765338, "learning_rate": 9.84246947184009e-06, "loss": 0.0145, "step": 699 }, { "epoch": 0.2530730296456978, "grad_norm": 0.6011048417765903, "learning_rate": 9.841724197638631e-06, "loss": 0.0317, "step": 700 }, { "epoch": 0.2534345625451916, "grad_norm": 0.6006404255900196, "learning_rate": 9.840977193005966e-06, "loss": 0.0522, "step": 701 }, { "epoch": 0.25379609544468545, "grad_norm": 0.3323446972745632, "learning_rate": 9.840228458209074e-06, "loss": 0.0388, "step": 702 }, { "epoch": 0.2541576283441793, "grad_norm": 0.638604358221921, "learning_rate": 9.839477993515549e-06, "loss": 0.1143, "step": 703 }, { "epoch": 0.2545191612436732, "grad_norm": 0.00948903807674972, "learning_rate": 9.838725799193614e-06, "loss": 0.0006, "step": 704 }, { "epoch": 0.25488069414316705, "grad_norm": 0.5807092307441285, "learning_rate": 9.837971875512098e-06, "loss": 0.0757, "step": 705 }, { "epoch": 0.25524222704266086, "grad_norm": 0.5856123047688412, "learning_rate": 9.837216222740456e-06, "loss": 0.0693, "step": 706 }, { "epoch": 0.25560375994215473, "grad_norm": 0.22949594715893837, "learning_rate": 9.836458841148755e-06, "loss": 0.013, "step": 707 }, { "epoch": 0.2559652928416486, "grad_norm": 0.017574328503032952, "learning_rate": 9.835699731007686e-06, "loss": 0.001, "step": 708 }, { "epoch": 0.25632682574114246, "grad_norm": 0.3291019632341502, "learning_rate": 9.834938892588553e-06, "loss": 0.043, "step": 709 }, { "epoch": 0.25668835864063627, "grad_norm": 0.8033889915704052, "learning_rate": 9.834176326163281e-06, "loss": 0.0903, "step": 710 }, { "epoch": 0.25704989154013014, "grad_norm": 1.6086604038615975, "learning_rate": 9.833412032004407e-06, "loss": 0.293, "step": 711 }, { "epoch": 0.257411424439624, "grad_norm": 0.8561329729135426, "learning_rate": 9.832646010385097e-06, "loss": 0.1143, "step": 712 }, { "epoch": 0.25777295733911787, "grad_norm": 3.7343709484229506, "learning_rate": 9.831878261579122e-06, "loss": 0.1226, "step": 713 }, { "epoch": 0.25813449023861174, "grad_norm": 0.08407139063488847, "learning_rate": 9.831108785860875e-06, "loss": 0.0035, "step": 714 }, { "epoch": 0.25849602313810555, "grad_norm": 1.5107658591843227, "learning_rate": 9.830337583505367e-06, "loss": 0.1309, "step": 715 }, { "epoch": 0.2588575560375994, "grad_norm": 0.42297741412919926, "learning_rate": 9.829564654788227e-06, "loss": 0.0254, "step": 716 }, { "epoch": 0.2592190889370933, "grad_norm": 0.5160853417093851, "learning_rate": 9.8287899999857e-06, "loss": 0.0476, "step": 717 }, { "epoch": 0.25958062183658714, "grad_norm": 0.6108322849206382, "learning_rate": 9.828013619374644e-06, "loss": 0.0635, "step": 718 }, { "epoch": 0.259942154736081, "grad_norm": 0.34578194183173727, "learning_rate": 9.827235513232539e-06, "loss": 0.0315, "step": 719 }, { "epoch": 0.2603036876355748, "grad_norm": 0.43123661701850613, "learning_rate": 9.82645568183748e-06, "loss": 0.0522, "step": 720 }, { "epoch": 0.2606652205350687, "grad_norm": 0.2580872769757765, "learning_rate": 9.82567412546818e-06, "loss": 0.0283, "step": 721 }, { "epoch": 0.26102675343456255, "grad_norm": 0.034736368150797704, "learning_rate": 9.824890844403968e-06, "loss": 0.0028, "step": 722 }, { "epoch": 0.2613882863340564, "grad_norm": 0.7297181140734669, "learning_rate": 9.824105838924784e-06, "loss": 0.1914, "step": 723 }, { "epoch": 0.26174981923355023, "grad_norm": 0.3324600863566299, "learning_rate": 9.82331910931119e-06, "loss": 0.0349, "step": 724 }, { "epoch": 0.2621113521330441, "grad_norm": 0.15967962372785535, "learning_rate": 9.822530655844367e-06, "loss": 0.0203, "step": 725 }, { "epoch": 0.26247288503253796, "grad_norm": 1.0085310331297528, "learning_rate": 9.821740478806104e-06, "loss": 0.083, "step": 726 }, { "epoch": 0.2628344179320318, "grad_norm": 0.1621000109244007, "learning_rate": 9.820948578478813e-06, "loss": 0.0181, "step": 727 }, { "epoch": 0.2631959508315257, "grad_norm": 0.22080379331385708, "learning_rate": 9.820154955145516e-06, "loss": 0.0183, "step": 728 }, { "epoch": 0.2635574837310195, "grad_norm": 0.33855806934142196, "learning_rate": 9.819359609089855e-06, "loss": 0.0432, "step": 729 }, { "epoch": 0.26391901663051337, "grad_norm": 0.36047929216788166, "learning_rate": 9.818562540596087e-06, "loss": 0.0388, "step": 730 }, { "epoch": 0.26428054953000724, "grad_norm": 0.38167748209139474, "learning_rate": 9.817763749949083e-06, "loss": 0.0432, "step": 731 }, { "epoch": 0.2646420824295011, "grad_norm": 1.118169570916192, "learning_rate": 9.816963237434334e-06, "loss": 0.0635, "step": 732 }, { "epoch": 0.2650036153289949, "grad_norm": 0.8110957731707067, "learning_rate": 9.816161003337938e-06, "loss": 0.1699, "step": 733 }, { "epoch": 0.2653651482284888, "grad_norm": 4.11947396090593, "learning_rate": 9.815357047946618e-06, "loss": 2.6094, "step": 734 }, { "epoch": 0.26572668112798264, "grad_norm": 0.1945192505523653, "learning_rate": 9.814551371547704e-06, "loss": 0.0254, "step": 735 }, { "epoch": 0.2660882140274765, "grad_norm": 0.462251052749884, "learning_rate": 9.813743974429147e-06, "loss": 0.0435, "step": 736 }, { "epoch": 0.2664497469269704, "grad_norm": 0.05442978991948815, "learning_rate": 9.812934856879507e-06, "loss": 0.0039, "step": 737 }, { "epoch": 0.2668112798264642, "grad_norm": 0.7222540708712984, "learning_rate": 9.812124019187967e-06, "loss": 0.1143, "step": 738 }, { "epoch": 0.26717281272595805, "grad_norm": 0.03845387688632828, "learning_rate": 9.811311461644317e-06, "loss": 0.0024, "step": 739 }, { "epoch": 0.2675343456254519, "grad_norm": 0.8880828713334664, "learning_rate": 9.810497184538967e-06, "loss": 0.0693, "step": 740 }, { "epoch": 0.2678958785249458, "grad_norm": 0.33304833011137236, "learning_rate": 9.809681188162938e-06, "loss": 0.0388, "step": 741 }, { "epoch": 0.26825741142443965, "grad_norm": 0.3882446345714656, "learning_rate": 9.808863472807868e-06, "loss": 0.0349, "step": 742 }, { "epoch": 0.26861894432393346, "grad_norm": 0.9059192302958727, "learning_rate": 9.808044038766006e-06, "loss": 0.1504, "step": 743 }, { "epoch": 0.26898047722342733, "grad_norm": 0.7539785807442558, "learning_rate": 9.807222886330221e-06, "loss": 0.1504, "step": 744 }, { "epoch": 0.2693420101229212, "grad_norm": 0.2781422440238081, "learning_rate": 9.806400015793991e-06, "loss": 0.0283, "step": 745 }, { "epoch": 0.26970354302241506, "grad_norm": 0.3541801406242717, "learning_rate": 9.805575427451409e-06, "loss": 0.0183, "step": 746 }, { "epoch": 0.27006507592190887, "grad_norm": 0.10685611188559183, "learning_rate": 9.804749121597182e-06, "loss": 0.0072, "step": 747 }, { "epoch": 0.27042660882140274, "grad_norm": 0.23857428462794417, "learning_rate": 9.803921098526634e-06, "loss": 0.0283, "step": 748 }, { "epoch": 0.2707881417208966, "grad_norm": 0.7374133771358058, "learning_rate": 9.8030913585357e-06, "loss": 0.1914, "step": 749 }, { "epoch": 0.27114967462039047, "grad_norm": 0.2184585364085346, "learning_rate": 9.802259901920927e-06, "loss": 0.0254, "step": 750 }, { "epoch": 0.27151120751988433, "grad_norm": 0.1139049192603137, "learning_rate": 9.80142672897948e-06, "loss": 0.0064, "step": 751 }, { "epoch": 0.27187274041937814, "grad_norm": 0.9193880649327146, "learning_rate": 9.800591840009133e-06, "loss": 0.0977, "step": 752 }, { "epoch": 0.272234273318872, "grad_norm": 0.03973074791167774, "learning_rate": 9.799755235308274e-06, "loss": 0.0025, "step": 753 }, { "epoch": 0.2725958062183659, "grad_norm": 0.19727485294550826, "learning_rate": 9.798916915175908e-06, "loss": 0.0254, "step": 754 }, { "epoch": 0.27295733911785974, "grad_norm": 0.7133231730916949, "learning_rate": 9.798076879911649e-06, "loss": 0.1055, "step": 755 }, { "epoch": 0.27331887201735355, "grad_norm": 0.058242069360513925, "learning_rate": 9.797235129815725e-06, "loss": 0.004, "step": 756 }, { "epoch": 0.2736804049168474, "grad_norm": 0.22635844243474804, "learning_rate": 9.796391665188979e-06, "loss": 0.0254, "step": 757 }, { "epoch": 0.2740419378163413, "grad_norm": 1.626293402336418, "learning_rate": 9.795546486332864e-06, "loss": 0.2363, "step": 758 }, { "epoch": 0.27440347071583515, "grad_norm": 0.18068506727529754, "learning_rate": 9.794699593549446e-06, "loss": 0.0283, "step": 759 }, { "epoch": 0.274765003615329, "grad_norm": 0.2073762753993741, "learning_rate": 9.793850987141407e-06, "loss": 0.0254, "step": 760 }, { "epoch": 0.27512653651482283, "grad_norm": 0.7636308073827189, "learning_rate": 9.793000667412034e-06, "loss": 0.083, "step": 761 }, { "epoch": 0.2754880694143167, "grad_norm": 1.102416922640137, "learning_rate": 9.792148634665237e-06, "loss": 0.0081, "step": 762 }, { "epoch": 0.27584960231381056, "grad_norm": 0.03872588109985107, "learning_rate": 9.791294889205528e-06, "loss": 0.0035, "step": 763 }, { "epoch": 0.2762111352133044, "grad_norm": 0.8177269441047764, "learning_rate": 9.790439431338037e-06, "loss": 0.1055, "step": 764 }, { "epoch": 0.2765726681127983, "grad_norm": 0.7951618646453824, "learning_rate": 9.789582261368504e-06, "loss": 0.0903, "step": 765 }, { "epoch": 0.2769342010122921, "grad_norm": 0.7404644971321356, "learning_rate": 9.78872337960328e-06, "loss": 0.083, "step": 766 }, { "epoch": 0.27729573391178597, "grad_norm": 0.9825932982555973, "learning_rate": 9.787862786349334e-06, "loss": 0.0693, "step": 767 }, { "epoch": 0.27765726681127983, "grad_norm": 0.33943192280339596, "learning_rate": 9.787000481914235e-06, "loss": 0.0432, "step": 768 }, { "epoch": 0.2780187997107737, "grad_norm": 0.7078841262657934, "learning_rate": 9.786136466606176e-06, "loss": 0.1226, "step": 769 }, { "epoch": 0.2783803326102675, "grad_norm": 0.6698351615804657, "learning_rate": 9.785270740733954e-06, "loss": 0.1602, "step": 770 }, { "epoch": 0.2787418655097614, "grad_norm": 0.4136556481952552, "learning_rate": 9.78440330460698e-06, "loss": 0.0256, "step": 771 }, { "epoch": 0.27910339840925524, "grad_norm": 0.14747170838043985, "learning_rate": 9.783534158535272e-06, "loss": 0.0081, "step": 772 }, { "epoch": 0.2794649313087491, "grad_norm": 0.06788218796232151, "learning_rate": 9.782663302829467e-06, "loss": 0.005, "step": 773 }, { "epoch": 0.279826464208243, "grad_norm": 0.23353377512107618, "learning_rate": 9.781790737800808e-06, "loss": 0.0227, "step": 774 }, { "epoch": 0.2801879971077368, "grad_norm": 0.26513811262509235, "learning_rate": 9.780916463761145e-06, "loss": 0.0352, "step": 775 }, { "epoch": 0.28054953000723065, "grad_norm": 1.1454187323556522, "learning_rate": 9.78004048102295e-06, "loss": 0.0693, "step": 776 }, { "epoch": 0.2809110629067245, "grad_norm": 0.7400365810929974, "learning_rate": 9.779162789899295e-06, "loss": 0.083, "step": 777 }, { "epoch": 0.2812725958062184, "grad_norm": 0.3260358090653474, "learning_rate": 9.778283390703867e-06, "loss": 0.0388, "step": 778 }, { "epoch": 0.2816341287057122, "grad_norm": 27.246222522627274, "learning_rate": 9.777402283750965e-06, "loss": 11.375, "step": 779 }, { "epoch": 0.28199566160520606, "grad_norm": 0.17714769020383517, "learning_rate": 9.776519469355492e-06, "loss": 0.0254, "step": 780 }, { "epoch": 0.2823571945046999, "grad_norm": 0.6272119937363023, "learning_rate": 9.775634947832971e-06, "loss": 0.1699, "step": 781 }, { "epoch": 0.2827187274041938, "grad_norm": 0.19992055554566296, "learning_rate": 9.774748719499528e-06, "loss": 0.0283, "step": 782 }, { "epoch": 0.28308026030368766, "grad_norm": 0.1578043383752355, "learning_rate": 9.773860784671898e-06, "loss": 0.0203, "step": 783 }, { "epoch": 0.28344179320318147, "grad_norm": 0.7244899746275777, "learning_rate": 9.772971143667433e-06, "loss": 0.1309, "step": 784 }, { "epoch": 0.28380332610267534, "grad_norm": 0.04109743636421771, "learning_rate": 9.772079796804088e-06, "loss": 0.0022, "step": 785 }, { "epoch": 0.2841648590021692, "grad_norm": 0.15791522225701488, "learning_rate": 9.77118674440043e-06, "loss": 0.0254, "step": 786 }, { "epoch": 0.28452639190166307, "grad_norm": 0.19507270122288306, "learning_rate": 9.770291986775637e-06, "loss": 0.0203, "step": 787 }, { "epoch": 0.2848879248011569, "grad_norm": 0.0752308649125829, "learning_rate": 9.769395524249496e-06, "loss": 0.0051, "step": 788 }, { "epoch": 0.28524945770065074, "grad_norm": 0.5966592186624274, "learning_rate": 9.768497357142399e-06, "loss": 0.1055, "step": 789 }, { "epoch": 0.2856109906001446, "grad_norm": 0.2797036245002511, "learning_rate": 9.767597485775355e-06, "loss": 0.0388, "step": 790 }, { "epoch": 0.2859725234996385, "grad_norm": 0.5966186180200447, "learning_rate": 9.766695910469974e-06, "loss": 0.1504, "step": 791 }, { "epoch": 0.28633405639913234, "grad_norm": 0.2062682906242401, "learning_rate": 9.76579263154848e-06, "loss": 0.0203, "step": 792 }, { "epoch": 0.28669558929862615, "grad_norm": 0.8692526601213223, "learning_rate": 9.764887649333707e-06, "loss": 0.0693, "step": 793 }, { "epoch": 0.28705712219812, "grad_norm": 0.3465667532914965, "learning_rate": 9.763980964149093e-06, "loss": 0.0254, "step": 794 }, { "epoch": 0.2874186550976139, "grad_norm": 0.35107233201134813, "learning_rate": 9.763072576318688e-06, "loss": 0.0388, "step": 795 }, { "epoch": 0.28778018799710775, "grad_norm": 0.5441667831783878, "learning_rate": 9.76216248616715e-06, "loss": 0.1406, "step": 796 }, { "epoch": 0.2881417208966016, "grad_norm": 0.12912885827574241, "learning_rate": 9.761250694019743e-06, "loss": 0.0181, "step": 797 }, { "epoch": 0.2885032537960954, "grad_norm": 0.2939789107715229, "learning_rate": 9.760337200202344e-06, "loss": 0.0432, "step": 798 }, { "epoch": 0.2888647866955893, "grad_norm": 0.8039808249598335, "learning_rate": 9.759422005041432e-06, "loss": 0.0977, "step": 799 }, { "epoch": 0.28922631959508316, "grad_norm": 6.481236113631918, "learning_rate": 9.758505108864103e-06, "loss": 0.0903, "step": 800 }, { "epoch": 0.289587852494577, "grad_norm": 0.20091836485155037, "learning_rate": 9.75758651199805e-06, "loss": 0.0283, "step": 801 }, { "epoch": 0.28994938539407084, "grad_norm": 0.18467737018518304, "learning_rate": 9.756666214771583e-06, "loss": 0.0254, "step": 802 }, { "epoch": 0.2903109182935647, "grad_norm": 0.22474663252264365, "learning_rate": 9.755744217513615e-06, "loss": 0.0315, "step": 803 }, { "epoch": 0.29067245119305857, "grad_norm": 0.6482383851440353, "learning_rate": 9.754820520553666e-06, "loss": 0.0977, "step": 804 }, { "epoch": 0.29103398409255243, "grad_norm": 0.2449535693387849, "learning_rate": 9.753895124221865e-06, "loss": 0.0352, "step": 805 }, { "epoch": 0.2913955169920463, "grad_norm": 0.1848443383646351, "learning_rate": 9.752968028848953e-06, "loss": 0.0283, "step": 806 }, { "epoch": 0.2917570498915401, "grad_norm": 0.21261920862326666, "learning_rate": 9.752039234766272e-06, "loss": 0.0283, "step": 807 }, { "epoch": 0.292118582791034, "grad_norm": 0.2332293390231207, "learning_rate": 9.751108742305766e-06, "loss": 0.0315, "step": 808 }, { "epoch": 0.29248011569052784, "grad_norm": 0.8434273819025344, "learning_rate": 9.750176551800001e-06, "loss": 0.1055, "step": 809 }, { "epoch": 0.2928416485900217, "grad_norm": 0.28640588106407905, "learning_rate": 9.74924266358214e-06, "loss": 0.0283, "step": 810 }, { "epoch": 0.2932031814895155, "grad_norm": 0.2589536572417334, "learning_rate": 9.748307077985951e-06, "loss": 0.0352, "step": 811 }, { "epoch": 0.2935647143890094, "grad_norm": 0.11536851618448346, "learning_rate": 9.747369795345815e-06, "loss": 0.0082, "step": 812 }, { "epoch": 0.29392624728850325, "grad_norm": 0.2380963061428969, "learning_rate": 9.746430815996717e-06, "loss": 0.0254, "step": 813 }, { "epoch": 0.2942877801879971, "grad_norm": 0.020934011785938882, "learning_rate": 9.745490140274248e-06, "loss": 0.0015, "step": 814 }, { "epoch": 0.294649313087491, "grad_norm": 0.8299346542016504, "learning_rate": 9.744547768514602e-06, "loss": 0.1504, "step": 815 }, { "epoch": 0.2950108459869848, "grad_norm": 0.5378957158118721, "learning_rate": 9.743603701054585e-06, "loss": 0.083, "step": 816 }, { "epoch": 0.29537237888647866, "grad_norm": 0.6899203472245415, "learning_rate": 9.742657938231607e-06, "loss": 0.0635, "step": 817 }, { "epoch": 0.2957339117859725, "grad_norm": 0.6241696476542495, "learning_rate": 9.741710480383683e-06, "loss": 0.1699, "step": 818 }, { "epoch": 0.2960954446854664, "grad_norm": 0.3850002938223796, "learning_rate": 9.740761327849435e-06, "loss": 0.0391, "step": 819 }, { "epoch": 0.29645697758496026, "grad_norm": 0.21406736981436592, "learning_rate": 9.739810480968088e-06, "loss": 0.0283, "step": 820 }, { "epoch": 0.29681851048445407, "grad_norm": 0.15592072012508923, "learning_rate": 9.738857940079474e-06, "loss": 0.0115, "step": 821 }, { "epoch": 0.29718004338394793, "grad_norm": 0.20606612346283423, "learning_rate": 9.737903705524034e-06, "loss": 0.0315, "step": 822 }, { "epoch": 0.2975415762834418, "grad_norm": 0.4989179416341259, "learning_rate": 9.736947777642809e-06, "loss": 0.0527, "step": 823 }, { "epoch": 0.29790310918293567, "grad_norm": 0.18703819300774618, "learning_rate": 9.735990156777447e-06, "loss": 0.0283, "step": 824 }, { "epoch": 0.2982646420824295, "grad_norm": 0.10183704598770103, "learning_rate": 9.735030843270203e-06, "loss": 0.0064, "step": 825 }, { "epoch": 0.29862617498192334, "grad_norm": 0.32534502975544716, "learning_rate": 9.734069837463935e-06, "loss": 0.0432, "step": 826 }, { "epoch": 0.2989877078814172, "grad_norm": 0.34760970553832227, "learning_rate": 9.733107139702107e-06, "loss": 0.0388, "step": 827 }, { "epoch": 0.2993492407809111, "grad_norm": 0.03273320597600549, "learning_rate": 9.732142750328786e-06, "loss": 0.0009, "step": 828 }, { "epoch": 0.29971077368040494, "grad_norm": 0.031366803113347656, "learning_rate": 9.731176669688645e-06, "loss": 0.0019, "step": 829 }, { "epoch": 0.30007230657989875, "grad_norm": 0.13826288872270828, "learning_rate": 9.73020889812696e-06, "loss": 0.0103, "step": 830 }, { "epoch": 0.3004338394793926, "grad_norm": 0.15317322851129744, "learning_rate": 9.729239435989613e-06, "loss": 0.0115, "step": 831 }, { "epoch": 0.3007953723788865, "grad_norm": 0.19674731086906705, "learning_rate": 9.72826828362309e-06, "loss": 0.0228, "step": 832 }, { "epoch": 0.30115690527838035, "grad_norm": 0.044612384051570725, "learning_rate": 9.72729544137448e-06, "loss": 0.0028, "step": 833 }, { "epoch": 0.30151843817787416, "grad_norm": 0.10164931300164662, "learning_rate": 9.726320909591475e-06, "loss": 0.0081, "step": 834 }, { "epoch": 0.301879971077368, "grad_norm": 0.8106986652795738, "learning_rate": 9.725344688622377e-06, "loss": 0.1504, "step": 835 }, { "epoch": 0.3022415039768619, "grad_norm": 0.8653325143125418, "learning_rate": 9.724366778816083e-06, "loss": 0.1504, "step": 836 }, { "epoch": 0.30260303687635576, "grad_norm": 0.7533717068361422, "learning_rate": 9.723387180522101e-06, "loss": 0.0432, "step": 837 }, { "epoch": 0.3029645697758496, "grad_norm": 8.41764626090047, "learning_rate": 9.722405894090536e-06, "loss": 1.6016, "step": 838 }, { "epoch": 0.30332610267534343, "grad_norm": 0.1105672538866791, "learning_rate": 9.721422919872102e-06, "loss": 0.0103, "step": 839 }, { "epoch": 0.3036876355748373, "grad_norm": 0.2116381637714212, "learning_rate": 9.720438258218112e-06, "loss": 0.0315, "step": 840 }, { "epoch": 0.30404916847433117, "grad_norm": 1.7597390950646545, "learning_rate": 9.719451909480487e-06, "loss": 0.1699, "step": 841 }, { "epoch": 0.30441070137382503, "grad_norm": 0.1090614426078719, "learning_rate": 9.718463874011742e-06, "loss": 0.0064, "step": 842 }, { "epoch": 0.3047722342733189, "grad_norm": 0.0051773466905160395, "learning_rate": 9.717474152165007e-06, "loss": 0.0003, "step": 843 }, { "epoch": 0.3051337671728127, "grad_norm": 3.617550414409873, "learning_rate": 9.716482744294004e-06, "loss": 1.5625, "step": 844 }, { "epoch": 0.3054953000723066, "grad_norm": 0.3514597957348166, "learning_rate": 9.715489650753064e-06, "loss": 0.0476, "step": 845 }, { "epoch": 0.30585683297180044, "grad_norm": 0.3109088944077699, "learning_rate": 9.714494871897118e-06, "loss": 0.043, "step": 846 }, { "epoch": 0.3062183658712943, "grad_norm": 0.5400039166996459, "learning_rate": 9.7134984080817e-06, "loss": 0.0352, "step": 847 }, { "epoch": 0.3065798987707881, "grad_norm": 0.05890324795051929, "learning_rate": 9.712500259662945e-06, "loss": 0.0039, "step": 848 }, { "epoch": 0.306941431670282, "grad_norm": 0.23938230236405114, "learning_rate": 9.711500426997593e-06, "loss": 0.0349, "step": 849 }, { "epoch": 0.30730296456977585, "grad_norm": 0.5806519680777371, "learning_rate": 9.71049891044298e-06, "loss": 0.083, "step": 850 }, { "epoch": 0.3076644974692697, "grad_norm": 0.07075749218039303, "learning_rate": 9.709495710357053e-06, "loss": 0.0025, "step": 851 }, { "epoch": 0.3080260303687636, "grad_norm": 0.24275398064311193, "learning_rate": 9.708490827098352e-06, "loss": 0.0349, "step": 852 }, { "epoch": 0.3083875632682574, "grad_norm": 0.1180186083792585, "learning_rate": 9.707484261026023e-06, "loss": 0.0072, "step": 853 }, { "epoch": 0.30874909616775126, "grad_norm": 0.21260269657821007, "learning_rate": 9.706476012499815e-06, "loss": 0.0315, "step": 854 }, { "epoch": 0.3091106290672451, "grad_norm": 0.4789103186427671, "learning_rate": 9.70546608188007e-06, "loss": 0.0349, "step": 855 }, { "epoch": 0.309472161966739, "grad_norm": 0.06482388612146898, "learning_rate": 9.704454469527741e-06, "loss": 0.004, "step": 856 }, { "epoch": 0.3098336948662328, "grad_norm": 0.8495180545412864, "learning_rate": 9.70344117580438e-06, "loss": 0.1143, "step": 857 }, { "epoch": 0.31019522776572667, "grad_norm": 0.29574596213579507, "learning_rate": 9.702426201072133e-06, "loss": 0.0349, "step": 858 }, { "epoch": 0.31055676066522053, "grad_norm": 1.4711207739307874, "learning_rate": 9.701409545693754e-06, "loss": 0.1406, "step": 859 }, { "epoch": 0.3109182935647144, "grad_norm": 0.13098934239524188, "learning_rate": 9.700391210032597e-06, "loss": 0.0103, "step": 860 }, { "epoch": 0.31127982646420826, "grad_norm": 0.02641612708945291, "learning_rate": 9.699371194452613e-06, "loss": 0.0015, "step": 861 }, { "epoch": 0.3116413593637021, "grad_norm": 0.6610553101675217, "learning_rate": 9.698349499318356e-06, "loss": 0.0635, "step": 862 }, { "epoch": 0.31200289226319594, "grad_norm": 0.15590585693487147, "learning_rate": 9.697326124994979e-06, "loss": 0.0091, "step": 863 }, { "epoch": 0.3123644251626898, "grad_norm": 0.7733250261997263, "learning_rate": 9.696301071848235e-06, "loss": 0.0762, "step": 864 }, { "epoch": 0.3127259580621837, "grad_norm": 0.7892814273775051, "learning_rate": 9.69527434024448e-06, "loss": 0.1226, "step": 865 }, { "epoch": 0.31308749096167754, "grad_norm": 0.7292399881247017, "learning_rate": 9.694245930550668e-06, "loss": 0.1309, "step": 866 }, { "epoch": 0.31344902386117135, "grad_norm": 0.058725281125170406, "learning_rate": 9.693215843134351e-06, "loss": 0.0035, "step": 867 }, { "epoch": 0.3138105567606652, "grad_norm": 0.22676147132523797, "learning_rate": 9.69218407836368e-06, "loss": 0.0254, "step": 868 }, { "epoch": 0.3141720896601591, "grad_norm": 0.09761518196857759, "learning_rate": 9.691150636607411e-06, "loss": 0.0045, "step": 869 }, { "epoch": 0.31453362255965295, "grad_norm": 0.06861556042058915, "learning_rate": 9.690115518234894e-06, "loss": 0.0045, "step": 870 }, { "epoch": 0.31489515545914676, "grad_norm": 0.1279502964102589, "learning_rate": 9.689078723616081e-06, "loss": 0.0015, "step": 871 }, { "epoch": 0.3152566883586406, "grad_norm": 2.309465012769175, "learning_rate": 9.688040253121523e-06, "loss": 0.1504, "step": 872 }, { "epoch": 0.3156182212581345, "grad_norm": 1.0334555904023908, "learning_rate": 9.687000107122367e-06, "loss": 0.0476, "step": 873 }, { "epoch": 0.31597975415762836, "grad_norm": 0.020814816115027588, "learning_rate": 9.68595828599036e-06, "loss": 0.0011, "step": 874 }, { "epoch": 0.3163412870571222, "grad_norm": 0.27402306041423097, "learning_rate": 9.684914790097852e-06, "loss": 0.0145, "step": 875 }, { "epoch": 0.31670281995661603, "grad_norm": 0.3082976395514401, "learning_rate": 9.683869619817788e-06, "loss": 0.0388, "step": 876 }, { "epoch": 0.3170643528561099, "grad_norm": 0.8409626459597344, "learning_rate": 9.682822775523709e-06, "loss": 0.1406, "step": 877 }, { "epoch": 0.31742588575560376, "grad_norm": 0.26162622388589407, "learning_rate": 9.681774257589758e-06, "loss": 0.013, "step": 878 }, { "epoch": 0.31778741865509763, "grad_norm": 2.2946244361357904, "learning_rate": 9.680724066390675e-06, "loss": 0.2012, "step": 879 }, { "epoch": 0.31814895155459144, "grad_norm": 0.03060478859505956, "learning_rate": 9.6796722023018e-06, "loss": 0.0013, "step": 880 }, { "epoch": 0.3185104844540853, "grad_norm": 0.19012518329812467, "learning_rate": 9.678618665699067e-06, "loss": 0.0254, "step": 881 }, { "epoch": 0.3188720173535792, "grad_norm": 0.31134616540370824, "learning_rate": 9.677563456959009e-06, "loss": 0.0315, "step": 882 }, { "epoch": 0.31923355025307304, "grad_norm": 0.018442279371226975, "learning_rate": 9.67650657645876e-06, "loss": 0.0003, "step": 883 }, { "epoch": 0.3195950831525669, "grad_norm": 0.7764911255321578, "learning_rate": 9.675448024576048e-06, "loss": 0.1504, "step": 884 }, { "epoch": 0.3199566160520607, "grad_norm": 1.5869287484444863, "learning_rate": 9.674387801689198e-06, "loss": 0.1504, "step": 885 }, { "epoch": 0.3203181489515546, "grad_norm": 0.5337518933057271, "learning_rate": 9.673325908177133e-06, "loss": 0.0388, "step": 886 }, { "epoch": 0.32067968185104845, "grad_norm": 0.010850971038678716, "learning_rate": 9.672262344419377e-06, "loss": 0.0007, "step": 887 }, { "epoch": 0.3210412147505423, "grad_norm": 0.29908056221334633, "learning_rate": 9.671197110796043e-06, "loss": 0.005, "step": 888 }, { "epoch": 0.3214027476500362, "grad_norm": 0.19151490337341362, "learning_rate": 9.670130207687848e-06, "loss": 0.0081, "step": 889 }, { "epoch": 0.32176428054953, "grad_norm": 0.48028346583544346, "learning_rate": 9.669061635476103e-06, "loss": 0.0254, "step": 890 }, { "epoch": 0.32212581344902386, "grad_norm": 0.3953187728261348, "learning_rate": 9.667991394542712e-06, "loss": 0.0205, "step": 891 }, { "epoch": 0.3224873463485177, "grad_norm": 0.03561412138242224, "learning_rate": 9.666919485270186e-06, "loss": 0.0025, "step": 892 }, { "epoch": 0.3228488792480116, "grad_norm": 0.75699494185242, "learning_rate": 9.66584590804162e-06, "loss": 0.1226, "step": 893 }, { "epoch": 0.3232104121475054, "grad_norm": 0.016925879272314624, "learning_rate": 9.664770663240708e-06, "loss": 0.0009, "step": 894 }, { "epoch": 0.32357194504699927, "grad_norm": 0.05250635559164235, "learning_rate": 9.663693751251749e-06, "loss": 0.0019, "step": 895 }, { "epoch": 0.32393347794649313, "grad_norm": 0.41904399909081064, "learning_rate": 9.662615172459626e-06, "loss": 0.0228, "step": 896 }, { "epoch": 0.324295010845987, "grad_norm": 0.2437158049926797, "learning_rate": 9.661534927249824e-06, "loss": 0.0349, "step": 897 }, { "epoch": 0.32465654374548086, "grad_norm": 0.43551363909344226, "learning_rate": 9.660453016008423e-06, "loss": 0.0388, "step": 898 }, { "epoch": 0.3250180766449747, "grad_norm": 1.060044607240746, "learning_rate": 9.659369439122096e-06, "loss": 0.1055, "step": 899 }, { "epoch": 0.32537960954446854, "grad_norm": 0.6691493987160547, "learning_rate": 9.658284196978118e-06, "loss": 0.0898, "step": 900 }, { "epoch": 0.3257411424439624, "grad_norm": 4.057070891275637, "learning_rate": 9.657197289964352e-06, "loss": 0.0349, "step": 901 }, { "epoch": 0.32610267534345627, "grad_norm": 0.6412168566582206, "learning_rate": 9.656108718469252e-06, "loss": 0.1143, "step": 902 }, { "epoch": 0.3264642082429501, "grad_norm": 0.25965802638316504, "learning_rate": 9.655018482881883e-06, "loss": 0.0349, "step": 903 }, { "epoch": 0.32682574114244395, "grad_norm": 0.9715408017149644, "learning_rate": 9.65392658359189e-06, "loss": 0.083, "step": 904 }, { "epoch": 0.3271872740419378, "grad_norm": 0.645737823956235, "learning_rate": 9.652833020989516e-06, "loss": 0.1055, "step": 905 }, { "epoch": 0.3275488069414317, "grad_norm": 0.5517514660206869, "learning_rate": 9.651737795465604e-06, "loss": 0.1143, "step": 906 }, { "epoch": 0.32791033984092555, "grad_norm": 0.6740296512538496, "learning_rate": 9.650640907411587e-06, "loss": 0.1143, "step": 907 }, { "epoch": 0.32827187274041936, "grad_norm": 0.5688087128015363, "learning_rate": 9.649542357219487e-06, "loss": 0.0898, "step": 908 }, { "epoch": 0.3286334056399132, "grad_norm": 2.0514220587021104, "learning_rate": 9.648442145281933e-06, "loss": 0.2129, "step": 909 }, { "epoch": 0.3289949385394071, "grad_norm": 0.26260135236940424, "learning_rate": 9.647340271992136e-06, "loss": 0.0183, "step": 910 }, { "epoch": 0.32935647143890096, "grad_norm": 0.0044597180079166045, "learning_rate": 9.646236737743907e-06, "loss": 0.0002, "step": 911 }, { "epoch": 0.3297180043383948, "grad_norm": 0.5338444455366229, "learning_rate": 9.64513154293165e-06, "loss": 0.0693, "step": 912 }, { "epoch": 0.33007953723788863, "grad_norm": 0.46186536331479555, "learning_rate": 9.644024687950358e-06, "loss": 0.0757, "step": 913 }, { "epoch": 0.3304410701373825, "grad_norm": 0.450497042383289, "learning_rate": 9.642916173195623e-06, "loss": 0.0762, "step": 914 }, { "epoch": 0.33080260303687636, "grad_norm": 0.5619477146989402, "learning_rate": 9.641805999063627e-06, "loss": 0.083, "step": 915 }, { "epoch": 0.33116413593637023, "grad_norm": 0.43612476947996554, "learning_rate": 9.640694165951148e-06, "loss": 0.0757, "step": 916 }, { "epoch": 0.33152566883586404, "grad_norm": 0.035024672489259785, "learning_rate": 9.639580674255553e-06, "loss": 0.0015, "step": 917 }, { "epoch": 0.3318872017353579, "grad_norm": 0.5901637245902818, "learning_rate": 9.638465524374803e-06, "loss": 0.0903, "step": 918 }, { "epoch": 0.3322487346348518, "grad_norm": 0.36204942321714395, "learning_rate": 9.637348716707455e-06, "loss": 0.0432, "step": 919 }, { "epoch": 0.33261026753434564, "grad_norm": 0.01268697033077961, "learning_rate": 9.636230251652653e-06, "loss": 0.0006, "step": 920 }, { "epoch": 0.3329718004338395, "grad_norm": 0.7810901708192713, "learning_rate": 9.635110129610138e-06, "loss": 0.0903, "step": 921 }, { "epoch": 0.3333333333333333, "grad_norm": 0.4823459642087694, "learning_rate": 9.633988350980241e-06, "loss": 0.0391, "step": 922 }, { "epoch": 0.3336948662328272, "grad_norm": 0.3808940451028547, "learning_rate": 9.632864916163886e-06, "loss": 0.0635, "step": 923 }, { "epoch": 0.33405639913232105, "grad_norm": 0.4023467839169962, "learning_rate": 9.631739825562586e-06, "loss": 0.0693, "step": 924 }, { "epoch": 0.3344179320318149, "grad_norm": 0.48473195223205917, "learning_rate": 9.63061307957845e-06, "loss": 0.0762, "step": 925 }, { "epoch": 0.3347794649313087, "grad_norm": 0.5575314110192315, "learning_rate": 9.629484678614179e-06, "loss": 0.0903, "step": 926 }, { "epoch": 0.3351409978308026, "grad_norm": 0.18107214980211006, "learning_rate": 9.628354623073059e-06, "loss": 0.0064, "step": 927 }, { "epoch": 0.33550253073029646, "grad_norm": 0.38332517050597, "learning_rate": 9.627222913358973e-06, "loss": 0.0635, "step": 928 }, { "epoch": 0.3358640636297903, "grad_norm": 0.18375028573262256, "learning_rate": 9.626089549876395e-06, "loss": 0.0064, "step": 929 }, { "epoch": 0.3362255965292842, "grad_norm": 0.5197850937159805, "learning_rate": 9.624954533030388e-06, "loss": 0.083, "step": 930 }, { "epoch": 0.336587129428778, "grad_norm": 0.45281363093707394, "learning_rate": 9.623817863226607e-06, "loss": 0.083, "step": 931 }, { "epoch": 0.33694866232827186, "grad_norm": 0.556799043235355, "learning_rate": 9.622679540871299e-06, "loss": 0.0903, "step": 932 }, { "epoch": 0.33731019522776573, "grad_norm": 0.2789905946068715, "learning_rate": 9.621539566371297e-06, "loss": 0.0162, "step": 933 }, { "epoch": 0.3376717281272596, "grad_norm": 2.4769349588858267, "learning_rate": 9.620397940134029e-06, "loss": 0.2236, "step": 934 }, { "epoch": 0.33803326102675346, "grad_norm": 0.5514012887167715, "learning_rate": 9.619254662567512e-06, "loss": 0.083, "step": 935 }, { "epoch": 0.3383947939262473, "grad_norm": 0.23896846515226336, "learning_rate": 9.618109734080355e-06, "loss": 0.0092, "step": 936 }, { "epoch": 0.33875632682574114, "grad_norm": 0.49694184737489633, "learning_rate": 9.616963155081753e-06, "loss": 0.0476, "step": 937 }, { "epoch": 0.339117859725235, "grad_norm": 0.48263545544764963, "learning_rate": 9.615814925981492e-06, "loss": 0.0693, "step": 938 }, { "epoch": 0.33947939262472887, "grad_norm": 0.012609828409812465, "learning_rate": 9.614665047189953e-06, "loss": 0.0006, "step": 939 }, { "epoch": 0.3398409255242227, "grad_norm": 1.255907208305617, "learning_rate": 9.6135135191181e-06, "loss": 0.1143, "step": 940 }, { "epoch": 0.34020245842371655, "grad_norm": 0.4624970791508704, "learning_rate": 9.612360342177487e-06, "loss": 0.0693, "step": 941 }, { "epoch": 0.3405639913232104, "grad_norm": 0.024162361888447018, "learning_rate": 9.611205516780262e-06, "loss": 0.0012, "step": 942 }, { "epoch": 0.3409255242227043, "grad_norm": 0.01154078448503239, "learning_rate": 9.61004904333916e-06, "loss": 0.0005, "step": 943 }, { "epoch": 0.34128705712219815, "grad_norm": 1.4163039482233417, "learning_rate": 9.608890922267501e-06, "loss": 0.2236, "step": 944 }, { "epoch": 0.34164859002169196, "grad_norm": 0.050516932522504984, "learning_rate": 9.607731153979198e-06, "loss": 0.0031, "step": 945 }, { "epoch": 0.3420101229211858, "grad_norm": 0.027375551422453896, "learning_rate": 9.606569738888755e-06, "loss": 0.0015, "step": 946 }, { "epoch": 0.3423716558206797, "grad_norm": 0.4756232165686125, "learning_rate": 9.60540667741126e-06, "loss": 0.0283, "step": 947 }, { "epoch": 0.34273318872017355, "grad_norm": 0.08097434352027547, "learning_rate": 9.604241969962389e-06, "loss": 0.0057, "step": 948 }, { "epoch": 0.34309472161966736, "grad_norm": 0.272057607544153, "learning_rate": 9.60307561695841e-06, "loss": 0.0349, "step": 949 }, { "epoch": 0.34345625451916123, "grad_norm": 0.26984555954586453, "learning_rate": 9.601907618816177e-06, "loss": 0.0388, "step": 950 }, { "epoch": 0.3438177874186551, "grad_norm": 0.28674313666447726, "learning_rate": 9.600737975953131e-06, "loss": 0.0476, "step": 951 }, { "epoch": 0.34417932031814896, "grad_norm": 0.09067101275099654, "learning_rate": 9.599566688787305e-06, "loss": 0.0056, "step": 952 }, { "epoch": 0.34454085321764283, "grad_norm": 0.6314369145559043, "learning_rate": 9.598393757737315e-06, "loss": 0.0693, "step": 953 }, { "epoch": 0.34490238611713664, "grad_norm": 0.05880600124216776, "learning_rate": 9.597219183222366e-06, "loss": 0.0035, "step": 954 }, { "epoch": 0.3452639190166305, "grad_norm": 0.39654834602713707, "learning_rate": 9.596042965662252e-06, "loss": 0.0349, "step": 955 }, { "epoch": 0.34562545191612437, "grad_norm": 0.17238020468185217, "learning_rate": 9.594865105477352e-06, "loss": 0.0145, "step": 956 }, { "epoch": 0.34598698481561824, "grad_norm": 2.2040022619963584, "learning_rate": 9.59368560308863e-06, "loss": 0.1807, "step": 957 }, { "epoch": 0.34634851771511205, "grad_norm": 0.7737620621401012, "learning_rate": 9.592504458917646e-06, "loss": 0.1226, "step": 958 }, { "epoch": 0.3467100506146059, "grad_norm": 0.23675096770972037, "learning_rate": 9.591321673386536e-06, "loss": 0.0315, "step": 959 }, { "epoch": 0.3470715835140998, "grad_norm": 0.12334875833766326, "learning_rate": 9.59013724691803e-06, "loss": 0.0063, "step": 960 }, { "epoch": 0.34743311641359365, "grad_norm": 1.3719190102394805, "learning_rate": 9.58895117993544e-06, "loss": 0.1406, "step": 961 }, { "epoch": 0.3477946493130875, "grad_norm": 0.42798544731212657, "learning_rate": 9.587763472862667e-06, "loss": 0.0476, "step": 962 }, { "epoch": 0.3481561822125813, "grad_norm": 0.25024532577481684, "learning_rate": 9.586574126124198e-06, "loss": 0.0352, "step": 963 }, { "epoch": 0.3485177151120752, "grad_norm": 0.5168703657315771, "learning_rate": 9.585383140145101e-06, "loss": 0.0476, "step": 964 }, { "epoch": 0.34887924801156905, "grad_norm": 0.2521716619069522, "learning_rate": 9.58419051535104e-06, "loss": 0.0317, "step": 965 }, { "epoch": 0.3492407809110629, "grad_norm": 0.05057441894673969, "learning_rate": 9.582996252168256e-06, "loss": 0.0019, "step": 966 }, { "epoch": 0.3496023138105568, "grad_norm": 0.9481702634864791, "learning_rate": 9.58180035102358e-06, "loss": 0.0388, "step": 967 }, { "epoch": 0.3499638467100506, "grad_norm": 2.0528866315160674, "learning_rate": 9.580602812344422e-06, "loss": 0.1226, "step": 968 }, { "epoch": 0.35032537960954446, "grad_norm": 2.303903846015961, "learning_rate": 9.579403636558789e-06, "loss": 0.1699, "step": 969 }, { "epoch": 0.35068691250903833, "grad_norm": 0.8162752655605114, "learning_rate": 9.57820282409526e-06, "loss": 0.1226, "step": 970 }, { "epoch": 0.3510484454085322, "grad_norm": 0.1766501866591628, "learning_rate": 9.57700037538301e-06, "loss": 0.0254, "step": 971 }, { "epoch": 0.351409978308026, "grad_norm": 0.9394457046178308, "learning_rate": 9.575796290851791e-06, "loss": 0.0762, "step": 972 }, { "epoch": 0.35177151120751987, "grad_norm": 0.6534243097836427, "learning_rate": 9.574590570931944e-06, "loss": 0.1602, "step": 973 }, { "epoch": 0.35213304410701374, "grad_norm": 0.316969323845766, "learning_rate": 9.57338321605439e-06, "loss": 0.0352, "step": 974 }, { "epoch": 0.3524945770065076, "grad_norm": 0.1914056781109621, "learning_rate": 9.572174226650641e-06, "loss": 0.0254, "step": 975 }, { "epoch": 0.35285610990600147, "grad_norm": 0.2490339053498248, "learning_rate": 9.57096360315279e-06, "loss": 0.0317, "step": 976 }, { "epoch": 0.3532176428054953, "grad_norm": 0.8404660877167074, "learning_rate": 9.56975134599351e-06, "loss": 0.0903, "step": 977 }, { "epoch": 0.35357917570498915, "grad_norm": 0.2682000144818748, "learning_rate": 9.568537455606064e-06, "loss": 0.0254, "step": 978 }, { "epoch": 0.353940708604483, "grad_norm": 0.5335395005215386, "learning_rate": 9.567321932424297e-06, "loss": 0.0317, "step": 979 }, { "epoch": 0.3543022415039769, "grad_norm": 0.18269224403957895, "learning_rate": 9.566104776882631e-06, "loss": 0.0254, "step": 980 }, { "epoch": 0.3546637744034707, "grad_norm": 0.27692912712357687, "learning_rate": 9.564885989416085e-06, "loss": 0.0315, "step": 981 }, { "epoch": 0.35502530730296455, "grad_norm": 0.5834562777738689, "learning_rate": 9.56366557046025e-06, "loss": 0.1602, "step": 982 }, { "epoch": 0.3553868402024584, "grad_norm": 0.6792927809586548, "learning_rate": 9.562443520451301e-06, "loss": 0.1055, "step": 983 }, { "epoch": 0.3557483731019523, "grad_norm": 0.06861448119202636, "learning_rate": 9.561219839826e-06, "loss": 0.0028, "step": 984 }, { "epoch": 0.35610990600144615, "grad_norm": 0.9483104741377114, "learning_rate": 9.559994529021695e-06, "loss": 0.0903, "step": 985 }, { "epoch": 0.35647143890093996, "grad_norm": 0.15086706644693873, "learning_rate": 9.558767588476305e-06, "loss": 0.0092, "step": 986 }, { "epoch": 0.35683297180043383, "grad_norm": 0.21801041013141473, "learning_rate": 9.55753901862834e-06, "loss": 0.0315, "step": 987 }, { "epoch": 0.3571945046999277, "grad_norm": 0.2629289008772857, "learning_rate": 9.556308819916892e-06, "loss": 0.0227, "step": 988 }, { "epoch": 0.35755603759942156, "grad_norm": 0.8333920839458012, "learning_rate": 9.555076992781636e-06, "loss": 0.0476, "step": 989 }, { "epoch": 0.3579175704989154, "grad_norm": 0.11859127379662215, "learning_rate": 9.55384353766282e-06, "loss": 0.0072, "step": 990 }, { "epoch": 0.35827910339840924, "grad_norm": 0.26685199383892283, "learning_rate": 9.552608455001287e-06, "loss": 0.0315, "step": 991 }, { "epoch": 0.3586406362979031, "grad_norm": 0.1936030638543042, "learning_rate": 9.551371745238454e-06, "loss": 0.0254, "step": 992 }, { "epoch": 0.35900216919739697, "grad_norm": 0.6409723125999514, "learning_rate": 9.550133408816317e-06, "loss": 0.1309, "step": 993 }, { "epoch": 0.35936370209689084, "grad_norm": 0.5862580233363474, "learning_rate": 9.548893446177463e-06, "loss": 0.1309, "step": 994 }, { "epoch": 0.35972523499638465, "grad_norm": 0.27400516250071677, "learning_rate": 9.547651857765049e-06, "loss": 0.0388, "step": 995 }, { "epoch": 0.3600867678958785, "grad_norm": 0.4465755315388589, "learning_rate": 9.546408644022822e-06, "loss": 0.0349, "step": 996 }, { "epoch": 0.3604483007953724, "grad_norm": 0.31983284797424694, "learning_rate": 9.545163805395103e-06, "loss": 0.0388, "step": 997 }, { "epoch": 0.36080983369486624, "grad_norm": 0.28827851988014214, "learning_rate": 9.543917342326802e-06, "loss": 0.0388, "step": 998 }, { "epoch": 0.3611713665943601, "grad_norm": 0.21665389335213, "learning_rate": 9.5426692552634e-06, "loss": 0.0315, "step": 999 }, { "epoch": 0.3615328994938539, "grad_norm": 0.15222855484312717, "learning_rate": 9.541419544650966e-06, "loss": 0.0227, "step": 1000 }, { "epoch": 0.3618944323933478, "grad_norm": 0.6179743448210696, "learning_rate": 9.540168210936145e-06, "loss": 0.1309, "step": 1001 }, { "epoch": 0.36225596529284165, "grad_norm": 0.43585308343006113, "learning_rate": 9.538915254566163e-06, "loss": 0.0579, "step": 1002 }, { "epoch": 0.3626174981923355, "grad_norm": 0.14775981409924477, "learning_rate": 9.537660675988827e-06, "loss": 0.0227, "step": 1003 }, { "epoch": 0.36297903109182933, "grad_norm": 0.15132872149034288, "learning_rate": 9.536404475652524e-06, "loss": 0.0203, "step": 1004 }, { "epoch": 0.3633405639913232, "grad_norm": 0.2595920992510271, "learning_rate": 9.535146654006216e-06, "loss": 0.0283, "step": 1005 }, { "epoch": 0.36370209689081706, "grad_norm": 0.012689288648002131, "learning_rate": 9.533887211499453e-06, "loss": 0.0003, "step": 1006 }, { "epoch": 0.3640636297903109, "grad_norm": 0.17421460825159601, "learning_rate": 9.532626148582358e-06, "loss": 0.0254, "step": 1007 }, { "epoch": 0.3644251626898048, "grad_norm": 0.1526495844506736, "learning_rate": 9.531363465705633e-06, "loss": 0.0227, "step": 1008 }, { "epoch": 0.3647866955892986, "grad_norm": 0.004570220648244808, "learning_rate": 9.530099163320562e-06, "loss": 0.0003, "step": 1009 }, { "epoch": 0.36514822848879247, "grad_norm": 0.8378118052812766, "learning_rate": 9.528833241879006e-06, "loss": 0.0579, "step": 1010 }, { "epoch": 0.36550976138828634, "grad_norm": 0.8759758079862531, "learning_rate": 9.527565701833405e-06, "loss": 0.0977, "step": 1011 }, { "epoch": 0.3658712942877802, "grad_norm": 0.22382366840949527, "learning_rate": 9.526296543636777e-06, "loss": 0.0317, "step": 1012 }, { "epoch": 0.36623282718727407, "grad_norm": 0.8138506691785357, "learning_rate": 9.525025767742721e-06, "loss": 0.0903, "step": 1013 }, { "epoch": 0.3665943600867679, "grad_norm": 0.1838059002154909, "learning_rate": 9.52375337460541e-06, "loss": 0.0103, "step": 1014 }, { "epoch": 0.36695589298626174, "grad_norm": 0.6750272165697171, "learning_rate": 9.522479364679598e-06, "loss": 0.1055, "step": 1015 }, { "epoch": 0.3673174258857556, "grad_norm": 0.3931582764565587, "learning_rate": 9.521203738420614e-06, "loss": 0.0283, "step": 1016 }, { "epoch": 0.3676789587852495, "grad_norm": 0.04726968611959931, "learning_rate": 9.519926496284369e-06, "loss": 0.0027, "step": 1017 }, { "epoch": 0.3680404916847433, "grad_norm": 0.016308011407592145, "learning_rate": 9.518647638727348e-06, "loss": 0.0009, "step": 1018 }, { "epoch": 0.36840202458423715, "grad_norm": 0.20139969473537767, "learning_rate": 9.517367166206615e-06, "loss": 0.0254, "step": 1019 }, { "epoch": 0.368763557483731, "grad_norm": 0.18090059329616762, "learning_rate": 9.516085079179809e-06, "loss": 0.0227, "step": 1020 }, { "epoch": 0.3691250903832249, "grad_norm": 0.06995981246235886, "learning_rate": 9.51480137810515e-06, "loss": 0.0045, "step": 1021 }, { "epoch": 0.36948662328271875, "grad_norm": 0.031268667027163334, "learning_rate": 9.513516063441431e-06, "loss": 0.0006, "step": 1022 }, { "epoch": 0.36984815618221256, "grad_norm": 0.00875488903123643, "learning_rate": 9.512229135648023e-06, "loss": 0.0004, "step": 1023 }, { "epoch": 0.37020968908170643, "grad_norm": 0.1864665005430027, "learning_rate": 9.510940595184875e-06, "loss": 0.0227, "step": 1024 }, { "epoch": 0.3705712219812003, "grad_norm": 1.2471853220160865, "learning_rate": 9.50965044251251e-06, "loss": 0.1807, "step": 1025 }, { "epoch": 0.37093275488069416, "grad_norm": 0.44359984195703095, "learning_rate": 9.508358678092028e-06, "loss": 0.0352, "step": 1026 }, { "epoch": 0.37129428778018797, "grad_norm": 0.018436852538993714, "learning_rate": 9.507065302385107e-06, "loss": 0.0009, "step": 1027 }, { "epoch": 0.37165582067968184, "grad_norm": 0.2928538634907696, "learning_rate": 9.505770315853998e-06, "loss": 0.0349, "step": 1028 }, { "epoch": 0.3720173535791757, "grad_norm": 0.014439979893405877, "learning_rate": 9.504473718961526e-06, "loss": 0.0007, "step": 1029 }, { "epoch": 0.37237888647866957, "grad_norm": 0.01122154955781484, "learning_rate": 9.503175512171102e-06, "loss": 0.0005, "step": 1030 }, { "epoch": 0.37274041937816343, "grad_norm": 0.1354031354087734, "learning_rate": 9.501875695946697e-06, "loss": 0.0181, "step": 1031 }, { "epoch": 0.37310195227765725, "grad_norm": 0.1453702318899419, "learning_rate": 9.50057427075287e-06, "loss": 0.0181, "step": 1032 }, { "epoch": 0.3734634851771511, "grad_norm": 1.429817670244538, "learning_rate": 9.499271237054748e-06, "loss": 0.0977, "step": 1033 }, { "epoch": 0.373825018076645, "grad_norm": 0.08162356893380883, "learning_rate": 9.497966595318035e-06, "loss": 0.0044, "step": 1034 }, { "epoch": 0.37418655097613884, "grad_norm": 0.0312445297835645, "learning_rate": 9.49666034600901e-06, "loss": 0.0005, "step": 1035 }, { "epoch": 0.3745480838756327, "grad_norm": 0.3593288791218259, "learning_rate": 9.495352489594528e-06, "loss": 0.0227, "step": 1036 }, { "epoch": 0.3749096167751265, "grad_norm": 0.01428168011478446, "learning_rate": 9.494043026542013e-06, "loss": 0.0006, "step": 1037 }, { "epoch": 0.3752711496746204, "grad_norm": 0.24977910451680477, "learning_rate": 9.492731957319467e-06, "loss": 0.0254, "step": 1038 }, { "epoch": 0.37563268257411425, "grad_norm": 0.17955774693409152, "learning_rate": 9.491419282395471e-06, "loss": 0.0203, "step": 1039 }, { "epoch": 0.3759942154736081, "grad_norm": 0.8256899875006966, "learning_rate": 9.49010500223917e-06, "loss": 0.1602, "step": 1040 }, { "epoch": 0.37635574837310193, "grad_norm": 0.24175815715060564, "learning_rate": 9.48878911732029e-06, "loss": 0.0283, "step": 1041 }, { "epoch": 0.3767172812725958, "grad_norm": 0.1508766369427608, "learning_rate": 9.487471628109124e-06, "loss": 0.0161, "step": 1042 }, { "epoch": 0.37707881417208966, "grad_norm": 0.1702932390795921, "learning_rate": 9.486152535076548e-06, "loss": 0.0182, "step": 1043 }, { "epoch": 0.3774403470715835, "grad_norm": 1.0120843549439038, "learning_rate": 9.484831838694002e-06, "loss": 0.0579, "step": 1044 }, { "epoch": 0.3778018799710774, "grad_norm": 0.14509030549002302, "learning_rate": 9.483509539433501e-06, "loss": 0.0181, "step": 1045 }, { "epoch": 0.3781634128705712, "grad_norm": 1.0914185219302555, "learning_rate": 9.48218563776764e-06, "loss": 0.1699, "step": 1046 }, { "epoch": 0.37852494577006507, "grad_norm": 0.25181366409480915, "learning_rate": 9.480860134169577e-06, "loss": 0.0161, "step": 1047 }, { "epoch": 0.37888647866955893, "grad_norm": 0.2942249139197522, "learning_rate": 9.479533029113047e-06, "loss": 0.0203, "step": 1048 }, { "epoch": 0.3792480115690528, "grad_norm": 0.14079201360180654, "learning_rate": 9.47820432307236e-06, "loss": 0.0161, "step": 1049 }, { "epoch": 0.3796095444685466, "grad_norm": 0.15935362441662712, "learning_rate": 9.47687401652239e-06, "loss": 0.0143, "step": 1050 }, { "epoch": 0.3799710773680405, "grad_norm": 2.0286979849530513, "learning_rate": 9.475542109938592e-06, "loss": 0.0762, "step": 1051 }, { "epoch": 0.38033261026753434, "grad_norm": 1.0115702055412132, "learning_rate": 9.47420860379699e-06, "loss": 0.1055, "step": 1052 }, { "epoch": 0.3806941431670282, "grad_norm": 0.9148509266678688, "learning_rate": 9.472873498574175e-06, "loss": 0.083, "step": 1053 }, { "epoch": 0.3810556760665221, "grad_norm": 0.5360733854086319, "learning_rate": 9.471536794747319e-06, "loss": 0.0317, "step": 1054 }, { "epoch": 0.3814172089660159, "grad_norm": 1.1272790174915226, "learning_rate": 9.470198492794152e-06, "loss": 0.0635, "step": 1055 }, { "epoch": 0.38177874186550975, "grad_norm": 2.2921554451595862, "learning_rate": 9.468858593192989e-06, "loss": 0.1406, "step": 1056 }, { "epoch": 0.3821402747650036, "grad_norm": 0.00710127659841989, "learning_rate": 9.467517096422709e-06, "loss": 0.0003, "step": 1057 }, { "epoch": 0.3825018076644975, "grad_norm": 0.34303880470767795, "learning_rate": 9.46617400296276e-06, "loss": 0.0227, "step": 1058 }, { "epoch": 0.38286334056399135, "grad_norm": 1.1010205134942703, "learning_rate": 9.464829313293164e-06, "loss": 0.1914, "step": 1059 }, { "epoch": 0.38322487346348516, "grad_norm": 0.6553384044364963, "learning_rate": 9.463483027894516e-06, "loss": 0.0435, "step": 1060 }, { "epoch": 0.383586406362979, "grad_norm": 0.3800480795161335, "learning_rate": 9.462135147247975e-06, "loss": 0.0228, "step": 1061 }, { "epoch": 0.3839479392624729, "grad_norm": 0.3502447745633044, "learning_rate": 9.460785671835275e-06, "loss": 0.0315, "step": 1062 }, { "epoch": 0.38430947216196676, "grad_norm": 0.10019592540119988, "learning_rate": 9.459434602138715e-06, "loss": 0.005, "step": 1063 }, { "epoch": 0.38467100506146057, "grad_norm": 0.09657437376320081, "learning_rate": 9.45808193864117e-06, "loss": 0.0128, "step": 1064 }, { "epoch": 0.38503253796095444, "grad_norm": 0.13962427899491436, "learning_rate": 9.456727681826082e-06, "loss": 0.0181, "step": 1065 }, { "epoch": 0.3853940708604483, "grad_norm": 0.09196182176851692, "learning_rate": 9.45537183217746e-06, "loss": 0.005, "step": 1066 }, { "epoch": 0.38575560375994217, "grad_norm": 0.5412190320759691, "learning_rate": 9.454014390179886e-06, "loss": 0.0203, "step": 1067 }, { "epoch": 0.38611713665943603, "grad_norm": 1.0832365888143378, "learning_rate": 9.45265535631851e-06, "loss": 0.0762, "step": 1068 }, { "epoch": 0.38647866955892984, "grad_norm": 0.12094407608979016, "learning_rate": 9.451294731079049e-06, "loss": 0.0072, "step": 1069 }, { "epoch": 0.3868402024584237, "grad_norm": 2.573032896705991, "learning_rate": 9.449932514947788e-06, "loss": 0.1309, "step": 1070 }, { "epoch": 0.3872017353579176, "grad_norm": 0.010233027816371908, "learning_rate": 9.448568708411588e-06, "loss": 0.0006, "step": 1071 }, { "epoch": 0.38756326825741144, "grad_norm": 0.8986138641395985, "learning_rate": 9.447203311957869e-06, "loss": 0.2129, "step": 1072 }, { "epoch": 0.38792480115690525, "grad_norm": 0.004759095228252381, "learning_rate": 9.445836326074625e-06, "loss": 0.0003, "step": 1073 }, { "epoch": 0.3882863340563991, "grad_norm": 0.7911566085081393, "learning_rate": 9.444467751250416e-06, "loss": 0.0527, "step": 1074 }, { "epoch": 0.388647866955893, "grad_norm": 0.7832924458026012, "learning_rate": 9.443097587974373e-06, "loss": 0.0349, "step": 1075 }, { "epoch": 0.38900939985538685, "grad_norm": 16.200852418927855, "learning_rate": 9.441725836736186e-06, "loss": 0.1406, "step": 1076 }, { "epoch": 0.3893709327548807, "grad_norm": 2.0600240113580353, "learning_rate": 9.440352498026126e-06, "loss": 0.1504, "step": 1077 }, { "epoch": 0.3897324656543745, "grad_norm": 0.003025727785315545, "learning_rate": 9.438977572335017e-06, "loss": 0.0002, "step": 1078 }, { "epoch": 0.3900939985538684, "grad_norm": 0.22527530015220049, "learning_rate": 9.437601060154263e-06, "loss": 0.0254, "step": 1079 }, { "epoch": 0.39045553145336226, "grad_norm": 0.1337068581852147, "learning_rate": 9.436222961975826e-06, "loss": 0.0143, "step": 1080 }, { "epoch": 0.3908170643528561, "grad_norm": 0.14259894352864805, "learning_rate": 9.434843278292239e-06, "loss": 0.0181, "step": 1081 }, { "epoch": 0.39117859725235, "grad_norm": 0.4810116295565453, "learning_rate": 9.433462009596598e-06, "loss": 0.0388, "step": 1082 }, { "epoch": 0.3915401301518438, "grad_norm": 0.0017578175913453785, "learning_rate": 9.432079156382572e-06, "loss": 0.0001, "step": 1083 }, { "epoch": 0.39190166305133767, "grad_norm": 0.11481760473500575, "learning_rate": 9.43069471914439e-06, "loss": 0.0143, "step": 1084 }, { "epoch": 0.39226319595083153, "grad_norm": 0.7212535363694256, "learning_rate": 9.429308698376854e-06, "loss": 0.0693, "step": 1085 }, { "epoch": 0.3926247288503254, "grad_norm": 1.6350032522021207, "learning_rate": 9.42792109457532e-06, "loss": 0.1699, "step": 1086 }, { "epoch": 0.3929862617498192, "grad_norm": 0.6856643768222361, "learning_rate": 9.426531908235721e-06, "loss": 0.0579, "step": 1087 }, { "epoch": 0.3933477946493131, "grad_norm": 0.3941473215117137, "learning_rate": 9.425141139854555e-06, "loss": 0.0183, "step": 1088 }, { "epoch": 0.39370932754880694, "grad_norm": 0.9681039699559233, "learning_rate": 9.423748789928878e-06, "loss": 0.0693, "step": 1089 }, { "epoch": 0.3940708604483008, "grad_norm": 0.08691855323345055, "learning_rate": 9.422354858956317e-06, "loss": 0.0114, "step": 1090 }, { "epoch": 0.3944323933477947, "grad_norm": 0.08948361618442341, "learning_rate": 9.420959347435064e-06, "loss": 0.0101, "step": 1091 }, { "epoch": 0.3947939262472885, "grad_norm": 0.013448388505894849, "learning_rate": 9.41956225586387e-06, "loss": 0.0006, "step": 1092 }, { "epoch": 0.39515545914678235, "grad_norm": 0.13543428711896488, "learning_rate": 9.418163584742061e-06, "loss": 0.0161, "step": 1093 }, { "epoch": 0.3955169920462762, "grad_norm": 0.18450660100870878, "learning_rate": 9.41676333456952e-06, "loss": 0.0203, "step": 1094 }, { "epoch": 0.3958785249457701, "grad_norm": 0.1461100026531346, "learning_rate": 9.415361505846693e-06, "loss": 0.0101, "step": 1095 }, { "epoch": 0.3962400578452639, "grad_norm": 0.4049892059012022, "learning_rate": 9.413958099074598e-06, "loss": 0.0227, "step": 1096 }, { "epoch": 0.39660159074475776, "grad_norm": 0.06105992232008616, "learning_rate": 9.412553114754807e-06, "loss": 0.0028, "step": 1097 }, { "epoch": 0.3969631236442516, "grad_norm": 1.5780566458025282, "learning_rate": 9.411146553389467e-06, "loss": 0.2236, "step": 1098 }, { "epoch": 0.3973246565437455, "grad_norm": 0.3470408432600259, "learning_rate": 9.409738415481278e-06, "loss": 0.0183, "step": 1099 }, { "epoch": 0.39768618944323936, "grad_norm": 0.13562214721399832, "learning_rate": 9.40832870153351e-06, "loss": 0.0143, "step": 1100 }, { "epoch": 0.39804772234273317, "grad_norm": 0.056393149672734975, "learning_rate": 9.406917412049995e-06, "loss": 0.007, "step": 1101 }, { "epoch": 0.39840925524222703, "grad_norm": 0.09325211705304107, "learning_rate": 9.405504547535127e-06, "loss": 0.0114, "step": 1102 }, { "epoch": 0.3987707881417209, "grad_norm": 1.0971215859878216, "learning_rate": 9.404090108493863e-06, "loss": 0.1055, "step": 1103 }, { "epoch": 0.39913232104121477, "grad_norm": 0.978588052837998, "learning_rate": 9.402674095431724e-06, "loss": 0.2344, "step": 1104 }, { "epoch": 0.39949385394070863, "grad_norm": 0.08664403357710915, "learning_rate": 9.401256508854793e-06, "loss": 0.009, "step": 1105 }, { "epoch": 0.39985538684020244, "grad_norm": 0.3979253582079351, "learning_rate": 9.399837349269713e-06, "loss": 0.0203, "step": 1106 }, { "epoch": 0.4002169197396963, "grad_norm": 0.2773800206765329, "learning_rate": 9.398416617183694e-06, "loss": 0.0182, "step": 1107 }, { "epoch": 0.4005784526391902, "grad_norm": 0.46925293416132846, "learning_rate": 9.396994313104504e-06, "loss": 0.0388, "step": 1108 }, { "epoch": 0.40093998553868404, "grad_norm": 1.2752773100673755, "learning_rate": 9.395570437540474e-06, "loss": 0.1055, "step": 1109 }, { "epoch": 0.40130151843817785, "grad_norm": 0.12065348452984719, "learning_rate": 9.394144991000497e-06, "loss": 0.0161, "step": 1110 }, { "epoch": 0.4016630513376717, "grad_norm": 0.814329996440552, "learning_rate": 9.392717973994028e-06, "loss": 0.2578, "step": 1111 }, { "epoch": 0.4020245842371656, "grad_norm": 1.1033032815828392, "learning_rate": 9.391289387031084e-06, "loss": 0.1055, "step": 1112 }, { "epoch": 0.40238611713665945, "grad_norm": 0.06597189094302595, "learning_rate": 9.389859230622237e-06, "loss": 0.009, "step": 1113 }, { "epoch": 0.4027476500361533, "grad_norm": 0.06292302087214882, "learning_rate": 9.38842750527863e-06, "loss": 0.007, "step": 1114 }, { "epoch": 0.4031091829356471, "grad_norm": 0.08494442949504416, "learning_rate": 9.386994211511957e-06, "loss": 0.0114, "step": 1115 }, { "epoch": 0.403470715835141, "grad_norm": 1.4193517672319234, "learning_rate": 9.385559349834478e-06, "loss": 0.1226, "step": 1116 }, { "epoch": 0.40383224873463486, "grad_norm": 0.014147192076410775, "learning_rate": 9.384122920759014e-06, "loss": 0.0007, "step": 1117 }, { "epoch": 0.4041937816341287, "grad_norm": 0.14291316386576222, "learning_rate": 9.382684924798944e-06, "loss": 0.0182, "step": 1118 }, { "epoch": 0.40455531453362253, "grad_norm": 0.19575420479070366, "learning_rate": 9.381245362468206e-06, "loss": 0.0181, "step": 1119 }, { "epoch": 0.4049168474331164, "grad_norm": 0.07403850611656816, "learning_rate": 9.3798042342813e-06, "loss": 0.0101, "step": 1120 }, { "epoch": 0.40527838033261027, "grad_norm": 1.1073614617199157, "learning_rate": 9.378361540753284e-06, "loss": 0.1143, "step": 1121 }, { "epoch": 0.40563991323210413, "grad_norm": 0.12106640196113036, "learning_rate": 9.37691728239978e-06, "loss": 0.0143, "step": 1122 }, { "epoch": 0.406001446131598, "grad_norm": 0.8965149648981541, "learning_rate": 9.37547145973696e-06, "loss": 0.1602, "step": 1123 }, { "epoch": 0.4063629790310918, "grad_norm": 0.7697552946347148, "learning_rate": 9.374024073281566e-06, "loss": 0.0527, "step": 1124 }, { "epoch": 0.4067245119305857, "grad_norm": 2.144202214778625, "learning_rate": 9.37257512355089e-06, "loss": 0.0903, "step": 1125 }, { "epoch": 0.40708604483007954, "grad_norm": 0.098280093162453, "learning_rate": 9.371124611062788e-06, "loss": 0.0101, "step": 1126 }, { "epoch": 0.4074475777295734, "grad_norm": 0.34883535139088484, "learning_rate": 9.369672536335673e-06, "loss": 0.0203, "step": 1127 }, { "epoch": 0.4078091106290672, "grad_norm": 0.08022474473708886, "learning_rate": 9.368218899888515e-06, "loss": 0.0114, "step": 1128 }, { "epoch": 0.4081706435285611, "grad_norm": 0.19726310328081267, "learning_rate": 9.366763702240844e-06, "loss": 0.0227, "step": 1129 }, { "epoch": 0.40853217642805495, "grad_norm": 0.5843015711000515, "learning_rate": 9.365306943912747e-06, "loss": 0.0283, "step": 1130 }, { "epoch": 0.4088937093275488, "grad_norm": 0.5421198335004562, "learning_rate": 9.36384862542487e-06, "loss": 0.0388, "step": 1131 }, { "epoch": 0.4092552422270427, "grad_norm": 0.6692204876198101, "learning_rate": 9.362388747298417e-06, "loss": 0.1406, "step": 1132 }, { "epoch": 0.4096167751265365, "grad_norm": 0.16386846880470202, "learning_rate": 9.360927310055144e-06, "loss": 0.0203, "step": 1133 }, { "epoch": 0.40997830802603036, "grad_norm": 0.12919835989542527, "learning_rate": 9.35946431421737e-06, "loss": 0.0101, "step": 1134 }, { "epoch": 0.4103398409255242, "grad_norm": 0.9182413377847383, "learning_rate": 9.357999760307973e-06, "loss": 0.1309, "step": 1135 }, { "epoch": 0.4107013738250181, "grad_norm": 0.792946360393989, "learning_rate": 9.356533648850378e-06, "loss": 0.0476, "step": 1136 }, { "epoch": 0.41106290672451196, "grad_norm": 0.12495526597410292, "learning_rate": 9.35506598036858e-06, "loss": 0.0161, "step": 1137 }, { "epoch": 0.41142443962400577, "grad_norm": 0.11672797671482336, "learning_rate": 9.353596755387117e-06, "loss": 0.009, "step": 1138 }, { "epoch": 0.41178597252349963, "grad_norm": 1.4075921757292031, "learning_rate": 9.352125974431095e-06, "loss": 0.083, "step": 1139 }, { "epoch": 0.4121475054229935, "grad_norm": 0.22986355535258104, "learning_rate": 9.350653638026165e-06, "loss": 0.0161, "step": 1140 }, { "epoch": 0.41250903832248736, "grad_norm": 0.8306698550109307, "learning_rate": 9.349179746698545e-06, "loss": 0.1226, "step": 1141 }, { "epoch": 0.4128705712219812, "grad_norm": 0.021781431910505373, "learning_rate": 9.347704300975e-06, "loss": 0.0015, "step": 1142 }, { "epoch": 0.41323210412147504, "grad_norm": 0.31359522632111836, "learning_rate": 9.346227301382857e-06, "loss": 0.0227, "step": 1143 }, { "epoch": 0.4135936370209689, "grad_norm": 0.7409009725594932, "learning_rate": 9.344748748449993e-06, "loss": 0.0903, "step": 1144 }, { "epoch": 0.4139551699204628, "grad_norm": 0.010472055824160364, "learning_rate": 9.343268642704843e-06, "loss": 0.0006, "step": 1145 }, { "epoch": 0.41431670281995664, "grad_norm": 0.23395422540439423, "learning_rate": 9.341786984676397e-06, "loss": 0.0283, "step": 1146 }, { "epoch": 0.41467823571945045, "grad_norm": 0.09350869256097366, "learning_rate": 9.340303774894198e-06, "loss": 0.0114, "step": 1147 }, { "epoch": 0.4150397686189443, "grad_norm": 0.23024057574033477, "learning_rate": 9.338819013888347e-06, "loss": 0.0254, "step": 1148 }, { "epoch": 0.4154013015184382, "grad_norm": 1.1459103784339888, "learning_rate": 9.337332702189494e-06, "loss": 0.1699, "step": 1149 }, { "epoch": 0.41576283441793205, "grad_norm": 0.6932525602772105, "learning_rate": 9.335844840328851e-06, "loss": 0.2236, "step": 1150 }, { "epoch": 0.41612436731742586, "grad_norm": 0.2030131541462344, "learning_rate": 9.334355428838179e-06, "loss": 0.0254, "step": 1151 }, { "epoch": 0.4164859002169197, "grad_norm": 0.09244064237495715, "learning_rate": 9.332864468249788e-06, "loss": 0.0114, "step": 1152 }, { "epoch": 0.4168474331164136, "grad_norm": 0.4169416623032936, "learning_rate": 9.33137195909655e-06, "loss": 0.0476, "step": 1153 }, { "epoch": 0.41720896601590746, "grad_norm": 1.8487764636095454, "learning_rate": 9.329877901911889e-06, "loss": 0.2363, "step": 1154 }, { "epoch": 0.4175704989154013, "grad_norm": 0.5674133177404711, "learning_rate": 9.32838229722978e-06, "loss": 0.1602, "step": 1155 }, { "epoch": 0.41793203181489513, "grad_norm": 0.013839452473954428, "learning_rate": 9.326885145584753e-06, "loss": 0.0007, "step": 1156 }, { "epoch": 0.418293564714389, "grad_norm": 0.8651345348552122, "learning_rate": 9.325386447511884e-06, "loss": 0.0349, "step": 1157 }, { "epoch": 0.41865509761388287, "grad_norm": 2.226165747168171, "learning_rate": 9.323886203546815e-06, "loss": 0.1914, "step": 1158 }, { "epoch": 0.41901663051337673, "grad_norm": 0.10647733699560955, "learning_rate": 9.322384414225727e-06, "loss": 0.0064, "step": 1159 }, { "epoch": 0.4193781634128706, "grad_norm": 0.9847510740243917, "learning_rate": 9.320881080085363e-06, "loss": 0.1226, "step": 1160 }, { "epoch": 0.4197396963123644, "grad_norm": 1.4609810951785365, "learning_rate": 9.319376201663012e-06, "loss": 0.1807, "step": 1161 }, { "epoch": 0.4201012292118583, "grad_norm": 0.5289491547758116, "learning_rate": 9.31786977949652e-06, "loss": 0.0317, "step": 1162 }, { "epoch": 0.42046276211135214, "grad_norm": 1.1933901125421524, "learning_rate": 9.316361814124278e-06, "loss": 0.0977, "step": 1163 }, { "epoch": 0.420824295010846, "grad_norm": 0.15374259570843488, "learning_rate": 9.314852306085235e-06, "loss": 0.0227, "step": 1164 }, { "epoch": 0.4211858279103398, "grad_norm": 0.2955666643071228, "learning_rate": 9.313341255918889e-06, "loss": 0.013, "step": 1165 }, { "epoch": 0.4215473608098337, "grad_norm": 1.1140166657820134, "learning_rate": 9.31182866416529e-06, "loss": 0.0762, "step": 1166 }, { "epoch": 0.42190889370932755, "grad_norm": 0.15440982315492727, "learning_rate": 9.310314531365033e-06, "loss": 0.0227, "step": 1167 }, { "epoch": 0.4222704266088214, "grad_norm": 0.23681697506718583, "learning_rate": 9.308798858059274e-06, "loss": 0.0349, "step": 1168 }, { "epoch": 0.4226319595083153, "grad_norm": 0.19842397232513373, "learning_rate": 9.307281644789712e-06, "loss": 0.0254, "step": 1169 }, { "epoch": 0.4229934924078091, "grad_norm": 1.2487862216731718, "learning_rate": 9.3057628920986e-06, "loss": 0.1055, "step": 1170 }, { "epoch": 0.42335502530730296, "grad_norm": 0.5897646762296794, "learning_rate": 9.304242600528738e-06, "loss": 0.1699, "step": 1171 }, { "epoch": 0.4237165582067968, "grad_norm": 0.010054443887121485, "learning_rate": 9.302720770623479e-06, "loss": 0.0007, "step": 1172 }, { "epoch": 0.4240780911062907, "grad_norm": 0.2849481922513313, "learning_rate": 9.301197402926726e-06, "loss": 0.0317, "step": 1173 }, { "epoch": 0.4244396240057845, "grad_norm": 0.8143750419344674, "learning_rate": 9.299672497982926e-06, "loss": 0.1055, "step": 1174 }, { "epoch": 0.42480115690527837, "grad_norm": 0.2272375500629241, "learning_rate": 9.298146056337085e-06, "loss": 0.0317, "step": 1175 }, { "epoch": 0.42516268980477223, "grad_norm": 0.03322068008112504, "learning_rate": 9.29661807853475e-06, "loss": 0.0022, "step": 1176 }, { "epoch": 0.4255242227042661, "grad_norm": 0.23978546253735886, "learning_rate": 9.29508856512202e-06, "loss": 0.0283, "step": 1177 }, { "epoch": 0.42588575560375996, "grad_norm": 0.4734156471363492, "learning_rate": 9.293557516645543e-06, "loss": 0.0283, "step": 1178 }, { "epoch": 0.4262472885032538, "grad_norm": 0.46150009777638407, "learning_rate": 9.292024933652518e-06, "loss": 0.1143, "step": 1179 }, { "epoch": 0.42660882140274764, "grad_norm": 0.17835034267883895, "learning_rate": 9.290490816690685e-06, "loss": 0.0283, "step": 1180 }, { "epoch": 0.4269703543022415, "grad_norm": 0.28064726714085125, "learning_rate": 9.28895516630834e-06, "loss": 0.0182, "step": 1181 }, { "epoch": 0.42733188720173537, "grad_norm": 0.13996755100190547, "learning_rate": 9.287417983054326e-06, "loss": 0.0203, "step": 1182 }, { "epoch": 0.42769342010122924, "grad_norm": 0.13210279664969737, "learning_rate": 9.285879267478027e-06, "loss": 0.0227, "step": 1183 }, { "epoch": 0.42805495300072305, "grad_norm": 0.3029250004088295, "learning_rate": 9.284339020129382e-06, "loss": 0.0432, "step": 1184 }, { "epoch": 0.4284164859002169, "grad_norm": 0.22453904486246956, "learning_rate": 9.282797241558876e-06, "loss": 0.0161, "step": 1185 }, { "epoch": 0.4287780187997108, "grad_norm": 0.3209676173696083, "learning_rate": 9.281253932317542e-06, "loss": 0.0432, "step": 1186 }, { "epoch": 0.42913955169920465, "grad_norm": 0.5996315964133384, "learning_rate": 9.279709092956955e-06, "loss": 0.1807, "step": 1187 }, { "epoch": 0.42950108459869846, "grad_norm": 0.11378138339981957, "learning_rate": 9.27816272402924e-06, "loss": 0.0161, "step": 1188 }, { "epoch": 0.4298626174981923, "grad_norm": 0.6019016501645528, "learning_rate": 9.27661482608707e-06, "loss": 0.0527, "step": 1189 }, { "epoch": 0.4302241503976862, "grad_norm": 0.0224452379627602, "learning_rate": 9.275065399683665e-06, "loss": 0.0013, "step": 1190 }, { "epoch": 0.43058568329718006, "grad_norm": 0.6691474124470183, "learning_rate": 9.273514445372787e-06, "loss": 0.083, "step": 1191 }, { "epoch": 0.4309472161966739, "grad_norm": 0.12517494758227235, "learning_rate": 9.27196196370875e-06, "loss": 0.0181, "step": 1192 }, { "epoch": 0.43130874909616773, "grad_norm": 0.17997803134059934, "learning_rate": 9.270407955246408e-06, "loss": 0.0283, "step": 1193 }, { "epoch": 0.4316702819956616, "grad_norm": 0.47391543646469647, "learning_rate": 9.268852420541163e-06, "loss": 0.0254, "step": 1194 }, { "epoch": 0.43203181489515546, "grad_norm": 0.9205196070322, "learning_rate": 9.267295360148965e-06, "loss": 0.0903, "step": 1195 }, { "epoch": 0.43239334779464933, "grad_norm": 0.8375062645349147, "learning_rate": 9.265736774626306e-06, "loss": 0.0903, "step": 1196 }, { "epoch": 0.43275488069414314, "grad_norm": 0.6142749427791061, "learning_rate": 9.264176664530223e-06, "loss": 0.0635, "step": 1197 }, { "epoch": 0.433116413593637, "grad_norm": 0.13851436261278546, "learning_rate": 9.262615030418301e-06, "loss": 0.0203, "step": 1198 }, { "epoch": 0.4334779464931309, "grad_norm": 0.04296612183606773, "learning_rate": 9.261051872848666e-06, "loss": 0.0024, "step": 1199 }, { "epoch": 0.43383947939262474, "grad_norm": 0.6580864318029653, "learning_rate": 9.259487192379991e-06, "loss": 0.1602, "step": 1200 }, { "epoch": 0.4342010122921186, "grad_norm": 0.2158289837998454, "learning_rate": 9.257920989571492e-06, "loss": 0.0283, "step": 1201 }, { "epoch": 0.4345625451916124, "grad_norm": 0.6188807865375571, "learning_rate": 9.25635326498293e-06, "loss": 0.083, "step": 1202 }, { "epoch": 0.4349240780911063, "grad_norm": 0.12555207312257172, "learning_rate": 9.254784019174611e-06, "loss": 0.0143, "step": 1203 }, { "epoch": 0.43528561099060015, "grad_norm": 0.771481182481317, "learning_rate": 9.253213252707381e-06, "loss": 0.1602, "step": 1204 }, { "epoch": 0.435647143890094, "grad_norm": 0.09201332842787867, "learning_rate": 9.25164096614263e-06, "loss": 0.0143, "step": 1205 }, { "epoch": 0.4360086767895879, "grad_norm": 0.1066445337384726, "learning_rate": 9.250067160042296e-06, "loss": 0.0161, "step": 1206 }, { "epoch": 0.4363702096890817, "grad_norm": 0.5806056835085686, "learning_rate": 9.248491834968857e-06, "loss": 0.1504, "step": 1207 }, { "epoch": 0.43673174258857556, "grad_norm": 0.6867898044938882, "learning_rate": 9.246914991485332e-06, "loss": 0.1602, "step": 1208 }, { "epoch": 0.4370932754880694, "grad_norm": 1.3608483577950767, "learning_rate": 9.245336630155285e-06, "loss": 0.1602, "step": 1209 }, { "epoch": 0.4374548083875633, "grad_norm": 0.29254908656597517, "learning_rate": 9.243756751542823e-06, "loss": 0.0317, "step": 1210 }, { "epoch": 0.4378163412870571, "grad_norm": 0.26138395052302066, "learning_rate": 9.242175356212592e-06, "loss": 0.0227, "step": 1211 }, { "epoch": 0.43817787418655096, "grad_norm": 0.6349084417903456, "learning_rate": 9.240592444729786e-06, "loss": 0.0762, "step": 1212 }, { "epoch": 0.43853940708604483, "grad_norm": 0.6521817381558306, "learning_rate": 9.239008017660133e-06, "loss": 0.0762, "step": 1213 }, { "epoch": 0.4389009399855387, "grad_norm": 0.12881439377571122, "learning_rate": 9.237422075569912e-06, "loss": 0.0092, "step": 1214 }, { "epoch": 0.43926247288503256, "grad_norm": 0.16769354356467672, "learning_rate": 9.235834619025934e-06, "loss": 0.0254, "step": 1215 }, { "epoch": 0.4396240057845264, "grad_norm": 0.5778311298605016, "learning_rate": 9.234245648595557e-06, "loss": 0.0977, "step": 1216 }, { "epoch": 0.43998553868402024, "grad_norm": 0.18277862718109236, "learning_rate": 9.232655164846678e-06, "loss": 0.0283, "step": 1217 }, { "epoch": 0.4403470715835141, "grad_norm": 0.3854733462251578, "learning_rate": 9.231063168347736e-06, "loss": 0.0476, "step": 1218 }, { "epoch": 0.44070860448300797, "grad_norm": 0.16472109874716762, "learning_rate": 9.229469659667713e-06, "loss": 0.0254, "step": 1219 }, { "epoch": 0.4410701373825018, "grad_norm": 0.4746453820846885, "learning_rate": 9.227874639376124e-06, "loss": 0.0254, "step": 1220 }, { "epoch": 0.44143167028199565, "grad_norm": 0.5785197482660971, "learning_rate": 9.226278108043032e-06, "loss": 0.0388, "step": 1221 }, { "epoch": 0.4417932031814895, "grad_norm": 0.007374769934217355, "learning_rate": 9.224680066239037e-06, "loss": 0.0006, "step": 1222 }, { "epoch": 0.4421547360809834, "grad_norm": 0.5014725040963731, "learning_rate": 9.223080514535277e-06, "loss": 0.0635, "step": 1223 }, { "epoch": 0.44251626898047725, "grad_norm": 0.39042883630789793, "learning_rate": 9.221479453503433e-06, "loss": 0.0476, "step": 1224 }, { "epoch": 0.44287780187997106, "grad_norm": 0.38057442351168763, "learning_rate": 9.219876883715722e-06, "loss": 0.0527, "step": 1225 }, { "epoch": 0.4432393347794649, "grad_norm": 0.650853237860797, "learning_rate": 9.218272805744903e-06, "loss": 0.1055, "step": 1226 }, { "epoch": 0.4436008676789588, "grad_norm": 0.023302065611783673, "learning_rate": 9.216667220164276e-06, "loss": 0.0007, "step": 1227 }, { "epoch": 0.44396240057845265, "grad_norm": 0.3115669265587802, "learning_rate": 9.215060127547671e-06, "loss": 0.0388, "step": 1228 }, { "epoch": 0.4443239334779465, "grad_norm": 0.6387213792149632, "learning_rate": 9.213451528469468e-06, "loss": 0.0903, "step": 1229 }, { "epoch": 0.44468546637744033, "grad_norm": 0.2055969933291953, "learning_rate": 9.211841423504577e-06, "loss": 0.0164, "step": 1230 }, { "epoch": 0.4450469992769342, "grad_norm": 0.844664434754044, "learning_rate": 9.21022981322845e-06, "loss": 0.0476, "step": 1231 }, { "epoch": 0.44540853217642806, "grad_norm": 0.5728309590708259, "learning_rate": 9.208616698217079e-06, "loss": 0.1309, "step": 1232 }, { "epoch": 0.44577006507592193, "grad_norm": 0.2093602828461574, "learning_rate": 9.207002079046985e-06, "loss": 0.0145, "step": 1233 }, { "epoch": 0.44613159797541574, "grad_norm": 0.22591647468857304, "learning_rate": 9.205385956295238e-06, "loss": 0.0388, "step": 1234 }, { "epoch": 0.4464931308749096, "grad_norm": 0.04863815852642659, "learning_rate": 9.203768330539436e-06, "loss": 0.0031, "step": 1235 }, { "epoch": 0.44685466377440347, "grad_norm": 0.2564009986927574, "learning_rate": 9.202149202357721e-06, "loss": 0.0388, "step": 1236 }, { "epoch": 0.44721619667389734, "grad_norm": 0.21098117125370078, "learning_rate": 9.200528572328768e-06, "loss": 0.0315, "step": 1237 }, { "epoch": 0.4475777295733912, "grad_norm": 4.21791946433083, "learning_rate": 9.19890644103179e-06, "loss": 1.0703, "step": 1238 }, { "epoch": 0.447939262472885, "grad_norm": 0.3740829587875207, "learning_rate": 9.197282809046533e-06, "loss": 0.0527, "step": 1239 }, { "epoch": 0.4483007953723789, "grad_norm": 0.357594435009163, "learning_rate": 9.195657676953288e-06, "loss": 0.0183, "step": 1240 }, { "epoch": 0.44866232827187275, "grad_norm": 0.39286910023642585, "learning_rate": 9.194031045332877e-06, "loss": 0.0527, "step": 1241 }, { "epoch": 0.4490238611713666, "grad_norm": 0.6493728089770354, "learning_rate": 9.19240291476665e-06, "loss": 0.0977, "step": 1242 }, { "epoch": 0.4493853940708604, "grad_norm": 0.384625826039215, "learning_rate": 9.190773285836513e-06, "loss": 0.0527, "step": 1243 }, { "epoch": 0.4497469269703543, "grad_norm": 0.5556833502193759, "learning_rate": 9.189142159124883e-06, "loss": 0.0579, "step": 1244 }, { "epoch": 0.45010845986984815, "grad_norm": 0.3335056325083359, "learning_rate": 9.187509535214731e-06, "loss": 0.043, "step": 1245 }, { "epoch": 0.450469992769342, "grad_norm": 0.2168944167282057, "learning_rate": 9.185875414689553e-06, "loss": 0.0315, "step": 1246 }, { "epoch": 0.4508315256688359, "grad_norm": 0.3494456475885598, "learning_rate": 9.184239798133387e-06, "loss": 0.0204, "step": 1247 }, { "epoch": 0.4511930585683297, "grad_norm": 0.1538475153317161, "learning_rate": 9.182602686130802e-06, "loss": 0.0227, "step": 1248 }, { "epoch": 0.45155459146782356, "grad_norm": 0.2260640753662232, "learning_rate": 9.180964079266897e-06, "loss": 0.0388, "step": 1249 }, { "epoch": 0.45191612436731743, "grad_norm": 0.048309551418342575, "learning_rate": 9.179323978127313e-06, "loss": 0.0028, "step": 1250 }, { "epoch": 0.4522776572668113, "grad_norm": 0.16484232121069076, "learning_rate": 9.17768238329822e-06, "loss": 0.0072, "step": 1251 }, { "epoch": 0.45263919016630516, "grad_norm": 0.24596405687353998, "learning_rate": 9.176039295366328e-06, "loss": 0.0183, "step": 1252 }, { "epoch": 0.45300072306579897, "grad_norm": 0.04126441703518941, "learning_rate": 9.17439471491887e-06, "loss": 0.0027, "step": 1253 }, { "epoch": 0.45336225596529284, "grad_norm": 0.03964513759696041, "learning_rate": 9.172748642543624e-06, "loss": 0.0007, "step": 1254 }, { "epoch": 0.4537237888647867, "grad_norm": 0.022681704863195793, "learning_rate": 9.171101078828893e-06, "loss": 0.0006, "step": 1255 }, { "epoch": 0.45408532176428057, "grad_norm": 0.09219765161327564, "learning_rate": 9.169452024363517e-06, "loss": 0.005, "step": 1256 }, { "epoch": 0.4544468546637744, "grad_norm": 0.007699041036828194, "learning_rate": 9.167801479736866e-06, "loss": 0.0004, "step": 1257 }, { "epoch": 0.45480838756326825, "grad_norm": 0.09427624176414731, "learning_rate": 9.166149445538848e-06, "loss": 0.0064, "step": 1258 }, { "epoch": 0.4551699204627621, "grad_norm": 0.42903559099031446, "learning_rate": 9.164495922359895e-06, "loss": 0.0254, "step": 1259 }, { "epoch": 0.455531453362256, "grad_norm": 0.09133890701175762, "learning_rate": 9.16284091079098e-06, "loss": 0.0114, "step": 1260 }, { "epoch": 0.45589298626174984, "grad_norm": 1.521179398793763, "learning_rate": 9.161184411423602e-06, "loss": 0.1602, "step": 1261 }, { "epoch": 0.45625451916124365, "grad_norm": 0.10447553013896549, "learning_rate": 9.159526424849792e-06, "loss": 0.0143, "step": 1262 }, { "epoch": 0.4566160520607375, "grad_norm": 0.8522177898881136, "learning_rate": 9.157866951662117e-06, "loss": 0.0527, "step": 1263 }, { "epoch": 0.4569775849602314, "grad_norm": 0.7240767678838493, "learning_rate": 9.15620599245367e-06, "loss": 0.1504, "step": 1264 }, { "epoch": 0.45733911785972525, "grad_norm": 0.011833224779869336, "learning_rate": 9.15454354781808e-06, "loss": 0.0006, "step": 1265 }, { "epoch": 0.45770065075921906, "grad_norm": 0.7134449999980215, "learning_rate": 9.152879618349502e-06, "loss": 0.0476, "step": 1266 }, { "epoch": 0.45806218365871293, "grad_norm": 0.2488493528047971, "learning_rate": 9.151214204642623e-06, "loss": 0.0254, "step": 1267 }, { "epoch": 0.4584237165582068, "grad_norm": 0.06128013957423213, "learning_rate": 9.149547307292665e-06, "loss": 0.0031, "step": 1268 }, { "epoch": 0.45878524945770066, "grad_norm": 0.3970325117546391, "learning_rate": 9.147878926895375e-06, "loss": 0.0476, "step": 1269 }, { "epoch": 0.4591467823571945, "grad_norm": 0.7488732927767204, "learning_rate": 9.146209064047031e-06, "loss": 0.2012, "step": 1270 }, { "epoch": 0.45950831525668834, "grad_norm": 0.05611503154747247, "learning_rate": 9.144537719344445e-06, "loss": 0.0035, "step": 1271 }, { "epoch": 0.4598698481561822, "grad_norm": 5.960484985062118, "learning_rate": 9.14286489338495e-06, "loss": 1.6641, "step": 1272 }, { "epoch": 0.46023138105567607, "grad_norm": 0.04517365887297904, "learning_rate": 9.141190586766418e-06, "loss": 0.0027, "step": 1273 }, { "epoch": 0.46059291395516994, "grad_norm": 2.647150394281298, "learning_rate": 9.139514800087243e-06, "loss": 0.1143, "step": 1274 }, { "epoch": 0.4609544468546638, "grad_norm": 0.18894804559078668, "learning_rate": 9.137837533946355e-06, "loss": 0.0161, "step": 1275 }, { "epoch": 0.4613159797541576, "grad_norm": 1.7094106354317071, "learning_rate": 9.136158788943203e-06, "loss": 0.1504, "step": 1276 }, { "epoch": 0.4616775126536515, "grad_norm": 11.576161751864165, "learning_rate": 9.134478565677772e-06, "loss": 1.0781, "step": 1277 }, { "epoch": 0.46203904555314534, "grad_norm": 0.10874732099920964, "learning_rate": 9.132796864750575e-06, "loss": 0.0143, "step": 1278 }, { "epoch": 0.4624005784526392, "grad_norm": 0.6263277422719161, "learning_rate": 9.131113686762652e-06, "loss": 0.0432, "step": 1279 }, { "epoch": 0.462762111352133, "grad_norm": 0.9239197955953701, "learning_rate": 9.129429032315568e-06, "loss": 0.1226, "step": 1280 }, { "epoch": 0.4631236442516269, "grad_norm": 0.12271325128422397, "learning_rate": 9.127742902011419e-06, "loss": 0.0143, "step": 1281 }, { "epoch": 0.46348517715112075, "grad_norm": 0.09657123528868523, "learning_rate": 9.12605529645283e-06, "loss": 0.0128, "step": 1282 }, { "epoch": 0.4638467100506146, "grad_norm": 0.012774256564710278, "learning_rate": 9.124366216242947e-06, "loss": 0.0006, "step": 1283 }, { "epoch": 0.4642082429501085, "grad_norm": 0.029915943869199487, "learning_rate": 9.122675661985449e-06, "loss": 0.0017, "step": 1284 }, { "epoch": 0.4645697758496023, "grad_norm": 0.23724000124850636, "learning_rate": 9.120983634284539e-06, "loss": 0.0164, "step": 1285 }, { "epoch": 0.46493130874909616, "grad_norm": 0.11993199775343626, "learning_rate": 9.119290133744947e-06, "loss": 0.0143, "step": 1286 }, { "epoch": 0.46529284164859, "grad_norm": 0.11524196674624239, "learning_rate": 9.117595160971932e-06, "loss": 0.0161, "step": 1287 }, { "epoch": 0.4656543745480839, "grad_norm": 0.23438417988144933, "learning_rate": 9.115898716571276e-06, "loss": 0.0254, "step": 1288 }, { "epoch": 0.4660159074475777, "grad_norm": 0.8541057254241771, "learning_rate": 9.114200801149286e-06, "loss": 0.083, "step": 1289 }, { "epoch": 0.46637744034707157, "grad_norm": 0.3700471660231949, "learning_rate": 9.1125014153128e-06, "loss": 0.0315, "step": 1290 }, { "epoch": 0.46673897324656544, "grad_norm": 0.2947376441693336, "learning_rate": 9.110800559669172e-06, "loss": 0.0352, "step": 1291 }, { "epoch": 0.4671005061460593, "grad_norm": 0.8433301986618615, "learning_rate": 9.109098234826295e-06, "loss": 0.1504, "step": 1292 }, { "epoch": 0.46746203904555317, "grad_norm": 0.016053527978448598, "learning_rate": 9.107394441392576e-06, "loss": 0.0005, "step": 1293 }, { "epoch": 0.467823571945047, "grad_norm": 0.1104375025275882, "learning_rate": 9.10568917997695e-06, "loss": 0.0143, "step": 1294 }, { "epoch": 0.46818510484454084, "grad_norm": 0.10378228546818791, "learning_rate": 9.103982451188876e-06, "loss": 0.0143, "step": 1295 }, { "epoch": 0.4685466377440347, "grad_norm": 0.14126745805663837, "learning_rate": 9.102274255638341e-06, "loss": 0.0181, "step": 1296 }, { "epoch": 0.4689081706435286, "grad_norm": 0.7885843062786007, "learning_rate": 9.100564593935854e-06, "loss": 0.2236, "step": 1297 }, { "epoch": 0.4692697035430224, "grad_norm": 0.24732453408157326, "learning_rate": 9.098853466692445e-06, "loss": 0.0254, "step": 1298 }, { "epoch": 0.46963123644251625, "grad_norm": 1.1872672055696016, "learning_rate": 9.097140874519672e-06, "loss": 0.0977, "step": 1299 }, { "epoch": 0.4699927693420101, "grad_norm": 0.8054698057090105, "learning_rate": 9.095426818029616e-06, "loss": 0.0635, "step": 1300 }, { "epoch": 0.470354302241504, "grad_norm": 0.004705917303105831, "learning_rate": 9.09371129783488e-06, "loss": 0.0002, "step": 1301 }, { "epoch": 0.47071583514099785, "grad_norm": 0.0679172393888412, "learning_rate": 9.09199431454859e-06, "loss": 0.0035, "step": 1302 }, { "epoch": 0.47107736804049166, "grad_norm": 0.04928791574036185, "learning_rate": 9.090275868784392e-06, "loss": 0.0024, "step": 1303 }, { "epoch": 0.47143890093998553, "grad_norm": 1.087964147640073, "learning_rate": 9.088555961156466e-06, "loss": 0.0145, "step": 1304 }, { "epoch": 0.4718004338394794, "grad_norm": 0.313571823428644, "learning_rate": 9.086834592279501e-06, "loss": 0.0352, "step": 1305 }, { "epoch": 0.47216196673897326, "grad_norm": 1.323461925608383, "learning_rate": 9.085111762768714e-06, "loss": 0.1602, "step": 1306 }, { "epoch": 0.4725234996384671, "grad_norm": 0.23503787488518071, "learning_rate": 9.083387473239847e-06, "loss": 0.0115, "step": 1307 }, { "epoch": 0.47288503253796094, "grad_norm": 0.15020017733051513, "learning_rate": 9.081661724309157e-06, "loss": 0.0056, "step": 1308 }, { "epoch": 0.4732465654374548, "grad_norm": 0.17755451356204552, "learning_rate": 9.07993451659343e-06, "loss": 0.0227, "step": 1309 }, { "epoch": 0.47360809833694867, "grad_norm": 0.4064418918138653, "learning_rate": 9.078205850709967e-06, "loss": 0.0388, "step": 1310 }, { "epoch": 0.47396963123644253, "grad_norm": 0.008559481307568135, "learning_rate": 9.076475727276592e-06, "loss": 0.0003, "step": 1311 }, { "epoch": 0.47433116413593635, "grad_norm": 0.1862947835915675, "learning_rate": 9.074744146911654e-06, "loss": 0.0227, "step": 1312 }, { "epoch": 0.4746926970354302, "grad_norm": 0.8149279812257123, "learning_rate": 9.073011110234017e-06, "loss": 0.0903, "step": 1313 }, { "epoch": 0.4750542299349241, "grad_norm": 0.9865182748462842, "learning_rate": 9.07127661786307e-06, "loss": 0.0693, "step": 1314 }, { "epoch": 0.47541576283441794, "grad_norm": 1.0211690638915645, "learning_rate": 9.06954067041872e-06, "loss": 0.1055, "step": 1315 }, { "epoch": 0.4757772957339118, "grad_norm": 0.015182055807125467, "learning_rate": 9.06780326852139e-06, "loss": 0.0007, "step": 1316 }, { "epoch": 0.4761388286334056, "grad_norm": 0.212272660976177, "learning_rate": 9.066064412792033e-06, "loss": 0.0114, "step": 1317 }, { "epoch": 0.4765003615328995, "grad_norm": 0.1319503458276417, "learning_rate": 9.06432410385211e-06, "loss": 0.0181, "step": 1318 }, { "epoch": 0.47686189443239335, "grad_norm": 0.6526314494622408, "learning_rate": 9.062582342323613e-06, "loss": 0.0432, "step": 1319 }, { "epoch": 0.4772234273318872, "grad_norm": 0.6773303320888976, "learning_rate": 9.060839128829044e-06, "loss": 0.1807, "step": 1320 }, { "epoch": 0.47758496023138103, "grad_norm": 0.8359690384928555, "learning_rate": 9.059094463991426e-06, "loss": 0.1143, "step": 1321 }, { "epoch": 0.4779464931308749, "grad_norm": 0.12305836170809041, "learning_rate": 9.057348348434304e-06, "loss": 0.0161, "step": 1322 }, { "epoch": 0.47830802603036876, "grad_norm": 1.5550645072911389, "learning_rate": 9.055600782781738e-06, "loss": 0.1602, "step": 1323 }, { "epoch": 0.4786695589298626, "grad_norm": 0.9404265914771953, "learning_rate": 9.053851767658309e-06, "loss": 0.1406, "step": 1324 }, { "epoch": 0.4790310918293565, "grad_norm": 0.018889618833561928, "learning_rate": 9.052101303689113e-06, "loss": 0.0009, "step": 1325 }, { "epoch": 0.4793926247288503, "grad_norm": 2.933196547610868, "learning_rate": 9.050349391499766e-06, "loss": 0.0283, "step": 1326 }, { "epoch": 0.47975415762834417, "grad_norm": 0.8580697480594512, "learning_rate": 9.0485960317164e-06, "loss": 0.1807, "step": 1327 }, { "epoch": 0.48011569052783803, "grad_norm": 0.7129938201580822, "learning_rate": 9.046841224965671e-06, "loss": 0.0254, "step": 1328 }, { "epoch": 0.4804772234273319, "grad_norm": 0.24408726559196256, "learning_rate": 9.045084971874738e-06, "loss": 0.0388, "step": 1329 }, { "epoch": 0.48083875632682577, "grad_norm": 0.7527739737921725, "learning_rate": 9.043327273071292e-06, "loss": 0.0762, "step": 1330 }, { "epoch": 0.4812002892263196, "grad_norm": 0.6443207735186283, "learning_rate": 9.04156812918353e-06, "loss": 0.1699, "step": 1331 }, { "epoch": 0.48156182212581344, "grad_norm": 0.5141120027558159, "learning_rate": 9.039807540840173e-06, "loss": 0.1309, "step": 1332 }, { "epoch": 0.4819233550253073, "grad_norm": 1.5356490350136016, "learning_rate": 9.038045508670453e-06, "loss": 0.1807, "step": 1333 }, { "epoch": 0.4822848879248012, "grad_norm": 0.11695330538785442, "learning_rate": 9.03628203330412e-06, "loss": 0.0063, "step": 1334 }, { "epoch": 0.482646420824295, "grad_norm": 0.33657704801283245, "learning_rate": 9.034517115371442e-06, "loss": 0.0388, "step": 1335 }, { "epoch": 0.48300795372378885, "grad_norm": 0.21307221645911442, "learning_rate": 9.032750755503196e-06, "loss": 0.0349, "step": 1336 }, { "epoch": 0.4833694866232827, "grad_norm": 0.66115989874695, "learning_rate": 9.030982954330683e-06, "loss": 0.0635, "step": 1337 }, { "epoch": 0.4837310195227766, "grad_norm": 0.006545541587034544, "learning_rate": 9.029213712485711e-06, "loss": 0.0003, "step": 1338 }, { "epoch": 0.48409255242227045, "grad_norm": 0.6597159543635837, "learning_rate": 9.02744303060061e-06, "loss": 0.0903, "step": 1339 }, { "epoch": 0.48445408532176426, "grad_norm": 0.8911044679076816, "learning_rate": 9.02567090930822e-06, "loss": 0.0903, "step": 1340 }, { "epoch": 0.4848156182212581, "grad_norm": 0.2731566579886056, "learning_rate": 9.023897349241895e-06, "loss": 0.0388, "step": 1341 }, { "epoch": 0.485177151120752, "grad_norm": 0.2536020428050146, "learning_rate": 9.022122351035507e-06, "loss": 0.0432, "step": 1342 }, { "epoch": 0.48553868402024586, "grad_norm": 0.5598112171630429, "learning_rate": 9.020345915323441e-06, "loss": 0.0903, "step": 1343 }, { "epoch": 0.48590021691973967, "grad_norm": 0.00593785442301689, "learning_rate": 9.018568042740593e-06, "loss": 0.0003, "step": 1344 }, { "epoch": 0.48626174981923354, "grad_norm": 0.2482404354995763, "learning_rate": 9.016788733922372e-06, "loss": 0.0388, "step": 1345 }, { "epoch": 0.4866232827187274, "grad_norm": 0.20058023766555, "learning_rate": 9.015007989504707e-06, "loss": 0.0349, "step": 1346 }, { "epoch": 0.48698481561822127, "grad_norm": 2.740843453784979, "learning_rate": 9.013225810124032e-06, "loss": 1.2344, "step": 1347 }, { "epoch": 0.48734634851771513, "grad_norm": 0.18605405141315048, "learning_rate": 9.011442196417304e-06, "loss": 0.0315, "step": 1348 }, { "epoch": 0.48770788141720894, "grad_norm": 0.09471391951641249, "learning_rate": 9.00965714902198e-06, "loss": 0.005, "step": 1349 }, { "epoch": 0.4880694143167028, "grad_norm": 0.5268237235884234, "learning_rate": 9.007870668576035e-06, "loss": 0.0391, "step": 1350 }, { "epoch": 0.4884309472161967, "grad_norm": 0.2516648041956773, "learning_rate": 9.00608275571796e-06, "loss": 0.0388, "step": 1351 }, { "epoch": 0.48879248011569054, "grad_norm": 0.33327577463512476, "learning_rate": 9.004293411086753e-06, "loss": 0.0432, "step": 1352 }, { "epoch": 0.4891540130151844, "grad_norm": 0.29042996468605686, "learning_rate": 9.002502635321925e-06, "loss": 0.0432, "step": 1353 }, { "epoch": 0.4895155459146782, "grad_norm": 0.6537033888393659, "learning_rate": 9.000710429063503e-06, "loss": 0.0527, "step": 1354 }, { "epoch": 0.4898770788141721, "grad_norm": 0.22501928630431192, "learning_rate": 8.998916792952016e-06, "loss": 0.0352, "step": 1355 }, { "epoch": 0.49023861171366595, "grad_norm": 0.5847320009093495, "learning_rate": 8.99712172762851e-06, "loss": 0.1309, "step": 1356 }, { "epoch": 0.4906001446131598, "grad_norm": 0.15530377846019366, "learning_rate": 8.995325233734544e-06, "loss": 0.0254, "step": 1357 }, { "epoch": 0.4909616775126536, "grad_norm": 0.2582438582745673, "learning_rate": 8.993527311912182e-06, "loss": 0.0254, "step": 1358 }, { "epoch": 0.4913232104121475, "grad_norm": 0.2709689788683993, "learning_rate": 8.991727962804002e-06, "loss": 0.0283, "step": 1359 }, { "epoch": 0.49168474331164136, "grad_norm": 0.1455888306764615, "learning_rate": 8.98992718705309e-06, "loss": 0.0057, "step": 1360 }, { "epoch": 0.4920462762111352, "grad_norm": 0.4980190919324519, "learning_rate": 8.988124985303045e-06, "loss": 0.0476, "step": 1361 }, { "epoch": 0.4924078091106291, "grad_norm": 0.13331601929347228, "learning_rate": 8.98632135819797e-06, "loss": 0.0227, "step": 1362 }, { "epoch": 0.4927693420101229, "grad_norm": 0.661550581069571, "learning_rate": 8.984516306382482e-06, "loss": 0.0579, "step": 1363 }, { "epoch": 0.49313087490961677, "grad_norm": 0.7126996263288167, "learning_rate": 8.982709830501708e-06, "loss": 0.1699, "step": 1364 }, { "epoch": 0.49349240780911063, "grad_norm": 0.5675267008281845, "learning_rate": 8.980901931201283e-06, "loss": 0.1602, "step": 1365 }, { "epoch": 0.4938539407086045, "grad_norm": 0.1601978336848755, "learning_rate": 8.979092609127344e-06, "loss": 0.0227, "step": 1366 }, { "epoch": 0.4942154736080983, "grad_norm": 0.824400181676824, "learning_rate": 8.97728186492655e-06, "loss": 0.0693, "step": 1367 }, { "epoch": 0.4945770065075922, "grad_norm": 0.7734490941168171, "learning_rate": 8.975469699246055e-06, "loss": 0.1807, "step": 1368 }, { "epoch": 0.49493853940708604, "grad_norm": 0.6037920070535662, "learning_rate": 8.973656112733529e-06, "loss": 0.1406, "step": 1369 }, { "epoch": 0.4953000723065799, "grad_norm": 0.8146444497492313, "learning_rate": 8.971841106037148e-06, "loss": 0.1406, "step": 1370 }, { "epoch": 0.4956616052060738, "grad_norm": 0.6432738411445708, "learning_rate": 8.970024679805592e-06, "loss": 0.0391, "step": 1371 }, { "epoch": 0.4960231381055676, "grad_norm": 0.5975189592790725, "learning_rate": 8.968206834688052e-06, "loss": 0.1309, "step": 1372 }, { "epoch": 0.49638467100506145, "grad_norm": 0.21157642258521367, "learning_rate": 8.96638757133423e-06, "loss": 0.0283, "step": 1373 }, { "epoch": 0.4967462039045553, "grad_norm": 0.5735719552954742, "learning_rate": 8.964566890394325e-06, "loss": 0.0635, "step": 1374 }, { "epoch": 0.4971077368040492, "grad_norm": 0.2759501381549015, "learning_rate": 8.962744792519052e-06, "loss": 0.0388, "step": 1375 }, { "epoch": 0.49746926970354305, "grad_norm": 0.2653324732683245, "learning_rate": 8.960921278359624e-06, "loss": 0.0476, "step": 1376 }, { "epoch": 0.49783080260303686, "grad_norm": 0.19342664656672584, "learning_rate": 8.959096348567769e-06, "loss": 0.0349, "step": 1377 }, { "epoch": 0.4981923355025307, "grad_norm": 0.5265711116653513, "learning_rate": 8.957270003795713e-06, "loss": 0.1055, "step": 1378 }, { "epoch": 0.4985538684020246, "grad_norm": 0.34801355184767757, "learning_rate": 8.955442244696193e-06, "loss": 0.0254, "step": 1379 }, { "epoch": 0.49891540130151846, "grad_norm": 0.45726091852383527, "learning_rate": 8.953613071922451e-06, "loss": 0.0476, "step": 1380 }, { "epoch": 0.49927693420101227, "grad_norm": 0.8455925099116176, "learning_rate": 8.95178248612823e-06, "loss": 0.0903, "step": 1381 }, { "epoch": 0.49963846710050613, "grad_norm": 0.026714209112377662, "learning_rate": 8.949950487967782e-06, "loss": 0.0015, "step": 1382 }, { "epoch": 0.5, "grad_norm": 0.16534373339514008, "learning_rate": 8.948117078095865e-06, "loss": 0.0115, "step": 1383 }, { "epoch": 0.5003615328994938, "grad_norm": 0.6829934744275646, "learning_rate": 8.946282257167736e-06, "loss": 0.0579, "step": 1384 }, { "epoch": 0.5007230657989877, "grad_norm": 0.2577683330672329, "learning_rate": 8.944446025839161e-06, "loss": 0.0432, "step": 1385 }, { "epoch": 0.5010845986984815, "grad_norm": 0.5240687755094331, "learning_rate": 8.942608384766412e-06, "loss": 0.0476, "step": 1386 }, { "epoch": 0.5014461315979755, "grad_norm": 0.21440340633859917, "learning_rate": 8.940769334606254e-06, "loss": 0.0145, "step": 1387 }, { "epoch": 0.5018076644974693, "grad_norm": 0.28617029780166076, "learning_rate": 8.93892887601597e-06, "loss": 0.0476, "step": 1388 }, { "epoch": 0.5021691973969631, "grad_norm": 0.22284804417321522, "learning_rate": 8.937087009653335e-06, "loss": 0.0388, "step": 1389 }, { "epoch": 0.502530730296457, "grad_norm": 0.36322622854880166, "learning_rate": 8.935243736176636e-06, "loss": 0.0476, "step": 1390 }, { "epoch": 0.5028922631959508, "grad_norm": 0.621897241448212, "learning_rate": 8.933399056244655e-06, "loss": 0.0527, "step": 1391 }, { "epoch": 0.5032537960954447, "grad_norm": 0.2919663744387686, "learning_rate": 8.931552970516681e-06, "loss": 0.0476, "step": 1392 }, { "epoch": 0.5036153289949385, "grad_norm": 0.1860185986411699, "learning_rate": 8.929705479652507e-06, "loss": 0.0315, "step": 1393 }, { "epoch": 0.5039768618944324, "grad_norm": 0.5676694013500811, "learning_rate": 8.927856584312422e-06, "loss": 0.0391, "step": 1394 }, { "epoch": 0.5043383947939263, "grad_norm": 0.26659080307905414, "learning_rate": 8.926006285157223e-06, "loss": 0.0432, "step": 1395 }, { "epoch": 0.5046999276934201, "grad_norm": 0.7085641183160952, "learning_rate": 8.924154582848205e-06, "loss": 0.0762, "step": 1396 }, { "epoch": 0.5050614605929139, "grad_norm": 0.40777675179058454, "learning_rate": 8.922301478047171e-06, "loss": 0.0527, "step": 1397 }, { "epoch": 0.5054229934924078, "grad_norm": 0.517032173271908, "learning_rate": 8.920446971416413e-06, "loss": 0.1055, "step": 1398 }, { "epoch": 0.5057845263919016, "grad_norm": 0.3341726458294827, "learning_rate": 8.918591063618735e-06, "loss": 0.0476, "step": 1399 }, { "epoch": 0.5061460592913956, "grad_norm": 0.21231147588107535, "learning_rate": 8.916733755317439e-06, "loss": 0.0349, "step": 1400 }, { "epoch": 0.5065075921908894, "grad_norm": 0.20770225084471536, "learning_rate": 8.914875047176325e-06, "loss": 0.0349, "step": 1401 }, { "epoch": 0.5068691250903832, "grad_norm": 0.0033207816837739386, "learning_rate": 8.913014939859697e-06, "loss": 0.0002, "step": 1402 }, { "epoch": 0.5072306579898771, "grad_norm": 0.014718958175427973, "learning_rate": 8.911153434032354e-06, "loss": 0.0007, "step": 1403 }, { "epoch": 0.5075921908893709, "grad_norm": 0.08013150666208875, "learning_rate": 8.909290530359597e-06, "loss": 0.005, "step": 1404 }, { "epoch": 0.5079537237888648, "grad_norm": 0.2511539941491908, "learning_rate": 8.907426229507233e-06, "loss": 0.0349, "step": 1405 }, { "epoch": 0.5083152566883586, "grad_norm": 0.5549028302984946, "learning_rate": 8.905560532141556e-06, "loss": 0.0283, "step": 1406 }, { "epoch": 0.5086767895878525, "grad_norm": 0.8315925703951591, "learning_rate": 8.903693438929371e-06, "loss": 0.0693, "step": 1407 }, { "epoch": 0.5090383224873464, "grad_norm": 1.1798540269585998, "learning_rate": 8.901824950537975e-06, "loss": 0.1309, "step": 1408 }, { "epoch": 0.5093998553868402, "grad_norm": 0.3577517363748292, "learning_rate": 8.899955067635164e-06, "loss": 0.0476, "step": 1409 }, { "epoch": 0.5097613882863341, "grad_norm": 0.38226077336712583, "learning_rate": 8.898083790889236e-06, "loss": 0.0388, "step": 1410 }, { "epoch": 0.5101229211858279, "grad_norm": 0.38834123960986683, "learning_rate": 8.896211120968983e-06, "loss": 0.0388, "step": 1411 }, { "epoch": 0.5104844540853217, "grad_norm": 2.3510765051123346, "learning_rate": 8.894337058543699e-06, "loss": 0.2363, "step": 1412 }, { "epoch": 0.5108459869848156, "grad_norm": 0.6340006338283316, "learning_rate": 8.892461604283169e-06, "loss": 0.1406, "step": 1413 }, { "epoch": 0.5112075198843095, "grad_norm": 0.19847263185587705, "learning_rate": 8.890584758857685e-06, "loss": 0.0283, "step": 1414 }, { "epoch": 0.5115690527838034, "grad_norm": 0.02535660621909047, "learning_rate": 8.88870652293803e-06, "loss": 0.001, "step": 1415 }, { "epoch": 0.5119305856832972, "grad_norm": 0.19054815670245495, "learning_rate": 8.886826897195484e-06, "loss": 0.0254, "step": 1416 }, { "epoch": 0.512292118582791, "grad_norm": 0.023570231196345368, "learning_rate": 8.884945882301825e-06, "loss": 0.001, "step": 1417 }, { "epoch": 0.5126536514822849, "grad_norm": 0.10846152946795935, "learning_rate": 8.883063478929327e-06, "loss": 0.0056, "step": 1418 }, { "epoch": 0.5130151843817787, "grad_norm": 0.24200328105869268, "learning_rate": 8.881179687750761e-06, "loss": 0.0182, "step": 1419 }, { "epoch": 0.5133767172812725, "grad_norm": 0.23515119129651008, "learning_rate": 8.879294509439394e-06, "loss": 0.0283, "step": 1420 }, { "epoch": 0.5137382501807665, "grad_norm": 0.12621944530278037, "learning_rate": 8.877407944668988e-06, "loss": 0.0057, "step": 1421 }, { "epoch": 0.5140997830802603, "grad_norm": 0.4121164993931168, "learning_rate": 8.875519994113802e-06, "loss": 0.0352, "step": 1422 }, { "epoch": 0.5144613159797542, "grad_norm": 0.167977432217891, "learning_rate": 8.873630658448586e-06, "loss": 0.0181, "step": 1423 }, { "epoch": 0.514822848879248, "grad_norm": 0.31101367691006954, "learning_rate": 8.871739938348591e-06, "loss": 0.0283, "step": 1424 }, { "epoch": 0.5151843817787418, "grad_norm": 0.816977782881466, "learning_rate": 8.86984783448956e-06, "loss": 0.1055, "step": 1425 }, { "epoch": 0.5155459146782357, "grad_norm": 0.00754809254538474, "learning_rate": 8.86795434754773e-06, "loss": 0.0003, "step": 1426 }, { "epoch": 0.5159074475777295, "grad_norm": 0.7003302104783516, "learning_rate": 8.86605947819983e-06, "loss": 0.1143, "step": 1427 }, { "epoch": 0.5162689804772235, "grad_norm": 0.033328493626781565, "learning_rate": 8.86416322712309e-06, "loss": 0.0017, "step": 1428 }, { "epoch": 0.5166305133767173, "grad_norm": 0.024115072578249394, "learning_rate": 8.862265594995227e-06, "loss": 0.0013, "step": 1429 }, { "epoch": 0.5169920462762111, "grad_norm": 0.6909805958966416, "learning_rate": 8.860366582494457e-06, "loss": 0.0391, "step": 1430 }, { "epoch": 0.517353579175705, "grad_norm": 0.936478629223443, "learning_rate": 8.858466190299486e-06, "loss": 0.0903, "step": 1431 }, { "epoch": 0.5177151120751988, "grad_norm": 0.858256001887543, "learning_rate": 8.856564419089511e-06, "loss": 0.1143, "step": 1432 }, { "epoch": 0.5180766449746927, "grad_norm": 2.276536073071561, "learning_rate": 8.854661269544227e-06, "loss": 0.1807, "step": 1433 }, { "epoch": 0.5184381778741866, "grad_norm": 0.6429225723218881, "learning_rate": 8.852756742343818e-06, "loss": 0.0635, "step": 1434 }, { "epoch": 0.5187997107736804, "grad_norm": 0.24779681260828385, "learning_rate": 8.850850838168963e-06, "loss": 0.0182, "step": 1435 }, { "epoch": 0.5191612436731743, "grad_norm": 0.23812422801517721, "learning_rate": 8.848943557700831e-06, "loss": 0.0145, "step": 1436 }, { "epoch": 0.5195227765726681, "grad_norm": 0.8266076258652062, "learning_rate": 8.847034901621083e-06, "loss": 0.0762, "step": 1437 }, { "epoch": 0.519884309472162, "grad_norm": 0.3619631384655703, "learning_rate": 8.845124870611875e-06, "loss": 0.0432, "step": 1438 }, { "epoch": 0.5202458423716558, "grad_norm": 0.14911885426467592, "learning_rate": 8.843213465355848e-06, "loss": 0.0203, "step": 1439 }, { "epoch": 0.5206073752711496, "grad_norm": 0.05641047226985446, "learning_rate": 8.841300686536141e-06, "loss": 0.0027, "step": 1440 }, { "epoch": 0.5209689081706436, "grad_norm": 0.4845264866755232, "learning_rate": 8.839386534836378e-06, "loss": 0.0227, "step": 1441 }, { "epoch": 0.5213304410701374, "grad_norm": 0.5824963137654208, "learning_rate": 8.837471010940678e-06, "loss": 0.0579, "step": 1442 }, { "epoch": 0.5216919739696312, "grad_norm": 0.20640671861151597, "learning_rate": 8.835554115533649e-06, "loss": 0.0227, "step": 1443 }, { "epoch": 0.5220535068691251, "grad_norm": 0.02455364542935939, "learning_rate": 8.833635849300389e-06, "loss": 0.0013, "step": 1444 }, { "epoch": 0.5224150397686189, "grad_norm": 0.4853178554971988, "learning_rate": 8.831716212926484e-06, "loss": 0.0476, "step": 1445 }, { "epoch": 0.5227765726681128, "grad_norm": 0.7284645477233389, "learning_rate": 8.829795207098013e-06, "loss": 0.0579, "step": 1446 }, { "epoch": 0.5231381055676066, "grad_norm": 0.0015569482464212515, "learning_rate": 8.827872832501545e-06, "loss": 0.0001, "step": 1447 }, { "epoch": 0.5234996384671005, "grad_norm": 5.81733546952873, "learning_rate": 8.825949089824133e-06, "loss": 1.1094, "step": 1448 }, { "epoch": 0.5238611713665944, "grad_norm": 0.48467319430836886, "learning_rate": 8.824023979753325e-06, "loss": 0.0476, "step": 1449 }, { "epoch": 0.5242227042660882, "grad_norm": 0.17050513491050548, "learning_rate": 8.822097502977153e-06, "loss": 0.0203, "step": 1450 }, { "epoch": 0.5245842371655821, "grad_norm": 0.6858635174473395, "learning_rate": 8.820169660184141e-06, "loss": 0.0476, "step": 1451 }, { "epoch": 0.5249457700650759, "grad_norm": 0.03630066971814514, "learning_rate": 8.818240452063297e-06, "loss": 0.0012, "step": 1452 }, { "epoch": 0.5253073029645697, "grad_norm": 1.0925577611276702, "learning_rate": 8.816309879304122e-06, "loss": 0.0762, "step": 1453 }, { "epoch": 0.5256688358640637, "grad_norm": 0.2909237184716103, "learning_rate": 8.814377942596602e-06, "loss": 0.0283, "step": 1454 }, { "epoch": 0.5260303687635575, "grad_norm": 0.3600955052642801, "learning_rate": 8.812444642631208e-06, "loss": 0.0352, "step": 1455 }, { "epoch": 0.5263919016630514, "grad_norm": 0.2502096076658029, "learning_rate": 8.810509980098907e-06, "loss": 0.0254, "step": 1456 }, { "epoch": 0.5267534345625452, "grad_norm": 0.19991549321722374, "learning_rate": 8.80857395569114e-06, "loss": 0.0161, "step": 1457 }, { "epoch": 0.527114967462039, "grad_norm": 0.14114472985546722, "learning_rate": 8.806636570099847e-06, "loss": 0.0181, "step": 1458 }, { "epoch": 0.5274765003615329, "grad_norm": 0.14921232884800573, "learning_rate": 8.804697824017447e-06, "loss": 0.0128, "step": 1459 }, { "epoch": 0.5278380332610267, "grad_norm": 0.416351610437047, "learning_rate": 8.802757718136846e-06, "loss": 0.0162, "step": 1460 }, { "epoch": 0.5281995661605207, "grad_norm": 0.2676699190027372, "learning_rate": 8.80081625315144e-06, "loss": 0.0227, "step": 1461 }, { "epoch": 0.5285610990600145, "grad_norm": 0.19734059757840164, "learning_rate": 8.798873429755108e-06, "loss": 0.0143, "step": 1462 }, { "epoch": 0.5289226319595083, "grad_norm": 0.2563582751650667, "learning_rate": 8.796929248642213e-06, "loss": 0.0254, "step": 1463 }, { "epoch": 0.5292841648590022, "grad_norm": 0.08140267184991538, "learning_rate": 8.794983710507607e-06, "loss": 0.0101, "step": 1464 }, { "epoch": 0.529645697758496, "grad_norm": 0.20053533144899904, "learning_rate": 8.793036816046622e-06, "loss": 0.0115, "step": 1465 }, { "epoch": 0.5300072306579898, "grad_norm": 0.07598019813373584, "learning_rate": 8.79108856595508e-06, "loss": 0.0079, "step": 1466 }, { "epoch": 0.5303687635574837, "grad_norm": 0.5028967887635823, "learning_rate": 8.789138960929286e-06, "loss": 0.0283, "step": 1467 }, { "epoch": 0.5307302964569776, "grad_norm": 0.6530588329138401, "learning_rate": 8.787188001666027e-06, "loss": 0.0315, "step": 1468 }, { "epoch": 0.5310918293564715, "grad_norm": 1.0776726325513744, "learning_rate": 8.785235688862574e-06, "loss": 0.1914, "step": 1469 }, { "epoch": 0.5314533622559653, "grad_norm": 0.11229598041096854, "learning_rate": 8.783282023216685e-06, "loss": 0.0114, "step": 1470 }, { "epoch": 0.5318148951554591, "grad_norm": 0.017700960325321712, "learning_rate": 8.7813270054266e-06, "loss": 0.0007, "step": 1471 }, { "epoch": 0.532176428054953, "grad_norm": 0.2517489043229539, "learning_rate": 8.77937063619104e-06, "loss": 0.0182, "step": 1472 }, { "epoch": 0.5325379609544468, "grad_norm": 1.5202484287388243, "learning_rate": 8.777412916209214e-06, "loss": 0.0476, "step": 1473 }, { "epoch": 0.5328994938539408, "grad_norm": 0.15854747273403866, "learning_rate": 8.775453846180807e-06, "loss": 0.0143, "step": 1474 }, { "epoch": 0.5332610267534346, "grad_norm": 0.0037758402372301846, "learning_rate": 8.773493426805993e-06, "loss": 0.0002, "step": 1475 }, { "epoch": 0.5336225596529284, "grad_norm": 0.006202081183347475, "learning_rate": 8.771531658785425e-06, "loss": 0.0003, "step": 1476 }, { "epoch": 0.5339840925524223, "grad_norm": 1.2271100753821043, "learning_rate": 8.769568542820238e-06, "loss": 0.0635, "step": 1477 }, { "epoch": 0.5343456254519161, "grad_norm": 0.28792913433413386, "learning_rate": 8.767604079612049e-06, "loss": 0.0143, "step": 1478 }, { "epoch": 0.53470715835141, "grad_norm": 0.7904396932930681, "learning_rate": 8.765638269862957e-06, "loss": 0.0635, "step": 1479 }, { "epoch": 0.5350686912509038, "grad_norm": 0.06050445361702226, "learning_rate": 8.763671114275542e-06, "loss": 0.007, "step": 1480 }, { "epoch": 0.5354302241503976, "grad_norm": 0.07827641145650241, "learning_rate": 8.761702613552866e-06, "loss": 0.009, "step": 1481 }, { "epoch": 0.5357917570498916, "grad_norm": 0.3631526096236626, "learning_rate": 8.759732768398468e-06, "loss": 0.0227, "step": 1482 }, { "epoch": 0.5361532899493854, "grad_norm": 0.0018419567991354352, "learning_rate": 8.757761579516372e-06, "loss": 0.0001, "step": 1483 }, { "epoch": 0.5365148228488793, "grad_norm": 0.2006011479695936, "learning_rate": 8.755789047611083e-06, "loss": 0.007, "step": 1484 }, { "epoch": 0.5368763557483731, "grad_norm": 0.2817064965037197, "learning_rate": 8.75381517338758e-06, "loss": 0.0081, "step": 1485 }, { "epoch": 0.5372378886478669, "grad_norm": 0.14055516725410697, "learning_rate": 8.751839957551326e-06, "loss": 0.0143, "step": 1486 }, { "epoch": 0.5375994215473608, "grad_norm": 0.03680617166824012, "learning_rate": 8.749863400808263e-06, "loss": 0.0013, "step": 1487 }, { "epoch": 0.5379609544468547, "grad_norm": 0.16234871715202193, "learning_rate": 8.74788550386481e-06, "loss": 0.0143, "step": 1488 }, { "epoch": 0.5383224873463485, "grad_norm": 0.2086123177129957, "learning_rate": 8.745906267427871e-06, "loss": 0.0203, "step": 1489 }, { "epoch": 0.5386840202458424, "grad_norm": 0.03821575223793575, "learning_rate": 8.743925692204823e-06, "loss": 0.0017, "step": 1490 }, { "epoch": 0.5390455531453362, "grad_norm": 0.18643143822981734, "learning_rate": 8.741943778903522e-06, "loss": 0.0114, "step": 1491 }, { "epoch": 0.5394070860448301, "grad_norm": 1.706602089469343, "learning_rate": 8.739960528232302e-06, "loss": 0.1807, "step": 1492 }, { "epoch": 0.5397686189443239, "grad_norm": 0.04849759989261618, "learning_rate": 8.737975940899981e-06, "loss": 0.0038, "step": 1493 }, { "epoch": 0.5401301518438177, "grad_norm": 0.10112634436447007, "learning_rate": 8.735990017615848e-06, "loss": 0.0056, "step": 1494 }, { "epoch": 0.5404916847433117, "grad_norm": 0.2990290008355621, "learning_rate": 8.73400275908967e-06, "loss": 0.0254, "step": 1495 }, { "epoch": 0.5408532176428055, "grad_norm": 0.01773064840351514, "learning_rate": 8.732014166031694e-06, "loss": 0.0006, "step": 1496 }, { "epoch": 0.5412147505422994, "grad_norm": 0.0007089709545889917, "learning_rate": 8.730024239152643e-06, "loss": 0.0, "step": 1497 }, { "epoch": 0.5415762834417932, "grad_norm": 0.004089319502183418, "learning_rate": 8.728032979163717e-06, "loss": 0.0, "step": 1498 }, { "epoch": 0.541937816341287, "grad_norm": 0.10753314238798899, "learning_rate": 8.72604038677659e-06, "loss": 0.0101, "step": 1499 }, { "epoch": 0.5422993492407809, "grad_norm": 0.8346860003883928, "learning_rate": 8.724046462703413e-06, "loss": 0.0203, "step": 1500 }, { "epoch": 0.5426608821402747, "grad_norm": 0.0026299696418298477, "learning_rate": 8.722051207656819e-06, "loss": 0.0001, "step": 1501 }, { "epoch": 0.5430224150397687, "grad_norm": 0.10154670058427401, "learning_rate": 8.720054622349906e-06, "loss": 0.0063, "step": 1502 }, { "epoch": 0.5433839479392625, "grad_norm": 0.09338848815216695, "learning_rate": 8.71805670749626e-06, "loss": 0.0049, "step": 1503 }, { "epoch": 0.5437454808387563, "grad_norm": 0.0037148416703960225, "learning_rate": 8.716057463809928e-06, "loss": 0.0001, "step": 1504 }, { "epoch": 0.5441070137382502, "grad_norm": 0.07852583904286158, "learning_rate": 8.714056892005445e-06, "loss": 0.0019, "step": 1505 }, { "epoch": 0.544468546637744, "grad_norm": 0.913942207785023, "learning_rate": 8.712054992797812e-06, "loss": 0.0432, "step": 1506 }, { "epoch": 0.5448300795372379, "grad_norm": 1.8025650062451013, "learning_rate": 8.710051766902508e-06, "loss": 0.3398, "step": 1507 }, { "epoch": 0.5451916124367318, "grad_norm": 1.1107775266516204, "learning_rate": 8.708047215035484e-06, "loss": 0.0349, "step": 1508 }, { "epoch": 0.5455531453362256, "grad_norm": 0.9982160849129911, "learning_rate": 8.706041337913169e-06, "loss": 0.0388, "step": 1509 }, { "epoch": 0.5459146782357195, "grad_norm": 0.07276156463658563, "learning_rate": 8.704034136252463e-06, "loss": 0.0056, "step": 1510 }, { "epoch": 0.5462762111352133, "grad_norm": 0.002640265752770372, "learning_rate": 8.702025610770738e-06, "loss": 0.0001, "step": 1511 }, { "epoch": 0.5466377440347071, "grad_norm": 1.241407181156008, "learning_rate": 8.700015762185839e-06, "loss": 0.2578, "step": 1512 }, { "epoch": 0.546999276934201, "grad_norm": 0.0650694141567609, "learning_rate": 8.69800459121609e-06, "loss": 0.0056, "step": 1513 }, { "epoch": 0.5473608098336948, "grad_norm": 0.16382119623255748, "learning_rate": 8.695992098580279e-06, "loss": 0.0114, "step": 1514 }, { "epoch": 0.5477223427331888, "grad_norm": 0.28180398782337046, "learning_rate": 8.693978284997672e-06, "loss": 0.0182, "step": 1515 }, { "epoch": 0.5480838756326826, "grad_norm": 1.6552138475718896, "learning_rate": 8.691963151188005e-06, "loss": 0.3047, "step": 1516 }, { "epoch": 0.5484454085321764, "grad_norm": 0.012562270749680618, "learning_rate": 8.68994669787149e-06, "loss": 0.0004, "step": 1517 }, { "epoch": 0.5488069414316703, "grad_norm": 1.8371305911717744, "learning_rate": 8.6879289257688e-06, "loss": 0.1226, "step": 1518 }, { "epoch": 0.5491684743311641, "grad_norm": 2.0161208235738854, "learning_rate": 8.685909835601091e-06, "loss": 0.083, "step": 1519 }, { "epoch": 0.549530007230658, "grad_norm": 1.5058664628681457, "learning_rate": 8.683889428089987e-06, "loss": 0.1055, "step": 1520 }, { "epoch": 0.5498915401301518, "grad_norm": 0.0003316313523282073, "learning_rate": 8.681867703957577e-06, "loss": 0.0, "step": 1521 }, { "epoch": 0.5502530730296457, "grad_norm": 1.1100167338421258, "learning_rate": 8.679844663926426e-06, "loss": 0.2129, "step": 1522 }, { "epoch": 0.5506146059291396, "grad_norm": 1.03255887886348, "learning_rate": 8.677820308719572e-06, "loss": 0.2578, "step": 1523 }, { "epoch": 0.5509761388286334, "grad_norm": 0.0005696777933491293, "learning_rate": 8.675794639060513e-06, "loss": 0.0, "step": 1524 }, { "epoch": 0.5513376717281273, "grad_norm": 0.4205483876564213, "learning_rate": 8.673767655673227e-06, "loss": 0.0283, "step": 1525 }, { "epoch": 0.5516992046276211, "grad_norm": 0.8128378755254214, "learning_rate": 8.671739359282154e-06, "loss": 0.1914, "step": 1526 }, { "epoch": 0.5520607375271149, "grad_norm": 1.111437016637432, "learning_rate": 8.66970975061221e-06, "loss": 0.1914, "step": 1527 }, { "epoch": 0.5524222704266089, "grad_norm": 0.136170085397605, "learning_rate": 8.667678830388777e-06, "loss": 0.0143, "step": 1528 }, { "epoch": 0.5527838033261027, "grad_norm": 0.16002444788876427, "learning_rate": 8.665646599337703e-06, "loss": 0.0181, "step": 1529 }, { "epoch": 0.5531453362255966, "grad_norm": 0.6110671634929289, "learning_rate": 8.663613058185307e-06, "loss": 0.1914, "step": 1530 }, { "epoch": 0.5535068691250904, "grad_norm": 0.2853191976419728, "learning_rate": 8.661578207658379e-06, "loss": 0.0283, "step": 1531 }, { "epoch": 0.5538684020245842, "grad_norm": 0.8165736676999455, "learning_rate": 8.659542048484172e-06, "loss": 0.0579, "step": 1532 }, { "epoch": 0.5542299349240781, "grad_norm": 0.12994121524643157, "learning_rate": 8.657504581390409e-06, "loss": 0.0039, "step": 1533 }, { "epoch": 0.5545914678235719, "grad_norm": 0.32966726595082174, "learning_rate": 8.655465807105279e-06, "loss": 0.0349, "step": 1534 }, { "epoch": 0.5549530007230657, "grad_norm": 0.8469969033552511, "learning_rate": 8.65342572635744e-06, "loss": 0.1602, "step": 1535 }, { "epoch": 0.5553145336225597, "grad_norm": 1.6768629752246593, "learning_rate": 8.651384339876022e-06, "loss": 0.1406, "step": 1536 }, { "epoch": 0.5556760665220535, "grad_norm": 0.00032152550981501775, "learning_rate": 8.649341648390608e-06, "loss": 0.0, "step": 1537 }, { "epoch": 0.5560375994215474, "grad_norm": 0.20846717018351527, "learning_rate": 8.64729765263126e-06, "loss": 0.0254, "step": 1538 }, { "epoch": 0.5563991323210412, "grad_norm": 0.25291273059660296, "learning_rate": 8.645252353328502e-06, "loss": 0.0283, "step": 1539 }, { "epoch": 0.556760665220535, "grad_norm": 14.298312080912025, "learning_rate": 8.64320575121332e-06, "loss": 1.0078, "step": 1540 }, { "epoch": 0.5571221981200289, "grad_norm": 0.5793096120890874, "learning_rate": 8.641157847017172e-06, "loss": 0.0527, "step": 1541 }, { "epoch": 0.5574837310195228, "grad_norm": 2.3121007024590963, "learning_rate": 8.639108641471978e-06, "loss": 0.4023, "step": 1542 }, { "epoch": 0.5578452639190167, "grad_norm": 0.2974168817340073, "learning_rate": 8.637058135310124e-06, "loss": 0.009, "step": 1543 }, { "epoch": 0.5582067968185105, "grad_norm": 0.798295495906039, "learning_rate": 8.635006329264459e-06, "loss": 0.1143, "step": 1544 }, { "epoch": 0.5585683297180043, "grad_norm": 0.17373252948298487, "learning_rate": 8.6329532240683e-06, "loss": 0.0227, "step": 1545 }, { "epoch": 0.5589298626174982, "grad_norm": 0.23878307755794376, "learning_rate": 8.630898820455425e-06, "loss": 0.0317, "step": 1546 }, { "epoch": 0.559291395516992, "grad_norm": 1.6428502751359362, "learning_rate": 8.628843119160079e-06, "loss": 0.1602, "step": 1547 }, { "epoch": 0.559652928416486, "grad_norm": 0.5966798743633441, "learning_rate": 8.626786120916967e-06, "loss": 0.1406, "step": 1548 }, { "epoch": 0.5600144613159798, "grad_norm": 0.1941909431980876, "learning_rate": 8.62472782646126e-06, "loss": 0.0315, "step": 1549 }, { "epoch": 0.5603759942154736, "grad_norm": 1.4427884401382238, "learning_rate": 8.622668236528594e-06, "loss": 0.1406, "step": 1550 }, { "epoch": 0.5607375271149675, "grad_norm": 0.7554661748546162, "learning_rate": 8.620607351855065e-06, "loss": 0.1406, "step": 1551 }, { "epoch": 0.5610990600144613, "grad_norm": 0.7509197595876246, "learning_rate": 8.618545173177231e-06, "loss": 0.0476, "step": 1552 }, { "epoch": 0.5614605929139552, "grad_norm": 1.1215381562522493, "learning_rate": 8.616481701232118e-06, "loss": 0.0476, "step": 1553 }, { "epoch": 0.561822125813449, "grad_norm": 0.7555719443451352, "learning_rate": 8.614416936757206e-06, "loss": 0.1143, "step": 1554 }, { "epoch": 0.5621836587129428, "grad_norm": 1.8341034225604853, "learning_rate": 8.612350880490446e-06, "loss": 0.1226, "step": 1555 }, { "epoch": 0.5625451916124368, "grad_norm": 0.6343323461584266, "learning_rate": 8.61028353317024e-06, "loss": 0.1309, "step": 1556 }, { "epoch": 0.5629067245119306, "grad_norm": 0.6311737434336753, "learning_rate": 8.608214895535465e-06, "loss": 0.1226, "step": 1557 }, { "epoch": 0.5632682574114244, "grad_norm": 0.23122335419724957, "learning_rate": 8.606144968325445e-06, "loss": 0.0388, "step": 1558 }, { "epoch": 0.5636297903109183, "grad_norm": 0.26437243948737754, "learning_rate": 8.604073752279975e-06, "loss": 0.0283, "step": 1559 }, { "epoch": 0.5639913232104121, "grad_norm": 0.009662367347184562, "learning_rate": 8.602001248139308e-06, "loss": 0.0004, "step": 1560 }, { "epoch": 0.564352856109906, "grad_norm": 0.40557875672412474, "learning_rate": 8.599927456644155e-06, "loss": 0.0352, "step": 1561 }, { "epoch": 0.5647143890093999, "grad_norm": 0.21501628144063947, "learning_rate": 8.597852378535687e-06, "loss": 0.0315, "step": 1562 }, { "epoch": 0.5650759219088937, "grad_norm": 0.008844957852264225, "learning_rate": 8.595776014555539e-06, "loss": 0.0004, "step": 1563 }, { "epoch": 0.5654374548083876, "grad_norm": 0.6467986182301078, "learning_rate": 8.593698365445802e-06, "loss": 0.0476, "step": 1564 }, { "epoch": 0.5657989877078814, "grad_norm": 0.6688653119270542, "learning_rate": 8.591619431949028e-06, "loss": 0.083, "step": 1565 }, { "epoch": 0.5661605206073753, "grad_norm": 0.5023219061526318, "learning_rate": 8.589539214808228e-06, "loss": 0.0317, "step": 1566 }, { "epoch": 0.5665220535068691, "grad_norm": 0.3886951439184475, "learning_rate": 8.587457714766868e-06, "loss": 0.0476, "step": 1567 }, { "epoch": 0.5668835864063629, "grad_norm": 0.5071522880544984, "learning_rate": 8.58537493256888e-06, "loss": 0.1143, "step": 1568 }, { "epoch": 0.5672451193058569, "grad_norm": 0.01061287341061197, "learning_rate": 8.583290868958646e-06, "loss": 0.0006, "step": 1569 }, { "epoch": 0.5676066522053507, "grad_norm": 0.34306531381906585, "learning_rate": 8.581205524681012e-06, "loss": 0.0183, "step": 1570 }, { "epoch": 0.5679681851048446, "grad_norm": 1.275317890563055, "learning_rate": 8.57911890048128e-06, "loss": 0.1318, "step": 1571 }, { "epoch": 0.5683297180043384, "grad_norm": 0.5272295247628038, "learning_rate": 8.577030997105206e-06, "loss": 0.1226, "step": 1572 }, { "epoch": 0.5686912509038322, "grad_norm": 0.22071342753884318, "learning_rate": 8.574941815299012e-06, "loss": 0.0349, "step": 1573 }, { "epoch": 0.5690527838033261, "grad_norm": 0.012908902543520801, "learning_rate": 8.572851355809366e-06, "loss": 0.0006, "step": 1574 }, { "epoch": 0.56941431670282, "grad_norm": 0.495330914501514, "learning_rate": 8.5707596193834e-06, "loss": 0.0204, "step": 1575 }, { "epoch": 0.5697758496023138, "grad_norm": 0.38206759331335277, "learning_rate": 8.5686666067687e-06, "loss": 0.0527, "step": 1576 }, { "epoch": 0.5701373825018077, "grad_norm": 2.383932757968484, "learning_rate": 8.566572318713306e-06, "loss": 0.0977, "step": 1577 }, { "epoch": 0.5704989154013015, "grad_norm": 0.004798900609304118, "learning_rate": 8.564476755965718e-06, "loss": 0.0002, "step": 1578 }, { "epoch": 0.5708604483007954, "grad_norm": 0.2565594305839693, "learning_rate": 8.562379919274892e-06, "loss": 0.0476, "step": 1579 }, { "epoch": 0.5712219812002892, "grad_norm": 0.17756823408828626, "learning_rate": 8.560281809390232e-06, "loss": 0.0315, "step": 1580 }, { "epoch": 0.571583514099783, "grad_norm": 0.7074813980508179, "learning_rate": 8.558182427061606e-06, "loss": 0.0432, "step": 1581 }, { "epoch": 0.571945046999277, "grad_norm": 0.009127672420165137, "learning_rate": 8.556081773039333e-06, "loss": 0.0005, "step": 1582 }, { "epoch": 0.5723065798987708, "grad_norm": 0.23592861807225324, "learning_rate": 8.553979848074182e-06, "loss": 0.0129, "step": 1583 }, { "epoch": 0.5726681127982647, "grad_norm": 0.26097382066422914, "learning_rate": 8.551876652917385e-06, "loss": 0.0391, "step": 1584 }, { "epoch": 0.5730296456977585, "grad_norm": 0.21794454081804318, "learning_rate": 8.54977218832062e-06, "loss": 0.0315, "step": 1585 }, { "epoch": 0.5733911785972523, "grad_norm": 0.32929350063698587, "learning_rate": 8.547666455036026e-06, "loss": 0.0391, "step": 1586 }, { "epoch": 0.5737527114967462, "grad_norm": 0.8166315971883084, "learning_rate": 8.54555945381619e-06, "loss": 0.0635, "step": 1587 }, { "epoch": 0.57411424439624, "grad_norm": 0.3385291324593594, "learning_rate": 8.543451185414152e-06, "loss": 0.0227, "step": 1588 }, { "epoch": 0.574475777295734, "grad_norm": 0.00460294751744897, "learning_rate": 8.541341650583407e-06, "loss": 0.0002, "step": 1589 }, { "epoch": 0.5748373101952278, "grad_norm": 0.5548953095358053, "learning_rate": 8.539230850077907e-06, "loss": 0.1143, "step": 1590 }, { "epoch": 0.5751988430947216, "grad_norm": 0.09120315160248693, "learning_rate": 8.537118784652045e-06, "loss": 0.0056, "step": 1591 }, { "epoch": 0.5755603759942155, "grad_norm": 0.5632019760414148, "learning_rate": 8.535005455060678e-06, "loss": 0.0693, "step": 1592 }, { "epoch": 0.5759219088937093, "grad_norm": 0.18028591994200052, "learning_rate": 8.532890862059107e-06, "loss": 0.0315, "step": 1593 }, { "epoch": 0.5762834417932032, "grad_norm": 0.6268271516184782, "learning_rate": 8.530775006403088e-06, "loss": 0.1055, "step": 1594 }, { "epoch": 0.576644974692697, "grad_norm": 0.22673143337222726, "learning_rate": 8.528657888848823e-06, "loss": 0.0352, "step": 1595 }, { "epoch": 0.5770065075921909, "grad_norm": 0.3635653236290076, "learning_rate": 8.526539510152978e-06, "loss": 0.0476, "step": 1596 }, { "epoch": 0.5773680404916848, "grad_norm": 1.2072632918922, "learning_rate": 8.524419871072653e-06, "loss": 0.1318, "step": 1597 }, { "epoch": 0.5777295733911786, "grad_norm": 0.2817885130290144, "learning_rate": 8.522298972365411e-06, "loss": 0.0145, "step": 1598 }, { "epoch": 0.5780911062906724, "grad_norm": 0.2952418453779865, "learning_rate": 8.52017681478926e-06, "loss": 0.0315, "step": 1599 }, { "epoch": 0.5784526391901663, "grad_norm": 0.39288034210308537, "learning_rate": 8.518053399102659e-06, "loss": 0.0204, "step": 1600 }, { "epoch": 0.5788141720896601, "grad_norm": 0.6992489674490814, "learning_rate": 8.515928726064514e-06, "loss": 0.0903, "step": 1601 }, { "epoch": 0.579175704989154, "grad_norm": 0.5611483812552152, "learning_rate": 8.513802796434186e-06, "loss": 0.0283, "step": 1602 }, { "epoch": 0.5795372378886479, "grad_norm": 1.128817677360383, "learning_rate": 8.511675610971479e-06, "loss": 0.1055, "step": 1603 }, { "epoch": 0.5798987707881417, "grad_norm": 0.5980082068603756, "learning_rate": 8.50954717043665e-06, "loss": 0.0476, "step": 1604 }, { "epoch": 0.5802603036876356, "grad_norm": 0.18963783861112576, "learning_rate": 8.507417475590403e-06, "loss": 0.0352, "step": 1605 }, { "epoch": 0.5806218365871294, "grad_norm": 0.943929921495858, "learning_rate": 8.50528652719389e-06, "loss": 0.0635, "step": 1606 }, { "epoch": 0.5809833694866233, "grad_norm": 0.5440102539004134, "learning_rate": 8.503154326008712e-06, "loss": 0.0283, "step": 1607 }, { "epoch": 0.5813449023861171, "grad_norm": 1.071710008463165, "learning_rate": 8.501020872796916e-06, "loss": 0.0432, "step": 1608 }, { "epoch": 0.581706435285611, "grad_norm": 0.004000661720811044, "learning_rate": 8.498886168321e-06, "loss": 0.0002, "step": 1609 }, { "epoch": 0.5820679681851049, "grad_norm": 0.17578489602757372, "learning_rate": 8.496750213343907e-06, "loss": 0.0254, "step": 1610 }, { "epoch": 0.5824295010845987, "grad_norm": 0.6127089831080316, "learning_rate": 8.494613008629025e-06, "loss": 0.0762, "step": 1611 }, { "epoch": 0.5827910339840926, "grad_norm": 0.14372594403969371, "learning_rate": 8.49247455494019e-06, "loss": 0.0081, "step": 1612 }, { "epoch": 0.5831525668835864, "grad_norm": 0.18013640277392587, "learning_rate": 8.49033485304169e-06, "loss": 0.0283, "step": 1613 }, { "epoch": 0.5835140997830802, "grad_norm": 0.15816715626280237, "learning_rate": 8.488193903698246e-06, "loss": 0.0103, "step": 1614 }, { "epoch": 0.5838756326825741, "grad_norm": 0.5712706026082975, "learning_rate": 8.486051707675039e-06, "loss": 0.0527, "step": 1615 }, { "epoch": 0.584237165582068, "grad_norm": 0.5408526511300596, "learning_rate": 8.483908265737685e-06, "loss": 0.1406, "step": 1616 }, { "epoch": 0.5845986984815619, "grad_norm": 0.20626986915367898, "learning_rate": 8.481763578652253e-06, "loss": 0.0254, "step": 1617 }, { "epoch": 0.5849602313810557, "grad_norm": 0.858632321116094, "learning_rate": 8.479617647185255e-06, "loss": 0.0762, "step": 1618 }, { "epoch": 0.5853217642805495, "grad_norm": 0.8437378608801609, "learning_rate": 8.477470472103641e-06, "loss": 0.0693, "step": 1619 }, { "epoch": 0.5856832971800434, "grad_norm": 0.5211938198256986, "learning_rate": 8.475322054174816e-06, "loss": 0.1602, "step": 1620 }, { "epoch": 0.5860448300795372, "grad_norm": 0.4357396497998162, "learning_rate": 8.473172394166622e-06, "loss": 0.0352, "step": 1621 }, { "epoch": 0.586406362979031, "grad_norm": 0.3995589491025988, "learning_rate": 8.471021492847348e-06, "loss": 0.0315, "step": 1622 }, { "epoch": 0.586767895878525, "grad_norm": 0.47534902621325575, "learning_rate": 8.468869350985725e-06, "loss": 0.0432, "step": 1623 }, { "epoch": 0.5871294287780188, "grad_norm": 0.5167788578247865, "learning_rate": 8.466715969350928e-06, "loss": 0.0762, "step": 1624 }, { "epoch": 0.5874909616775127, "grad_norm": 0.8799273598262808, "learning_rate": 8.464561348712575e-06, "loss": 0.1309, "step": 1625 }, { "epoch": 0.5878524945770065, "grad_norm": 0.20131871749849967, "learning_rate": 8.462405489840726e-06, "loss": 0.0315, "step": 1626 }, { "epoch": 0.5882140274765003, "grad_norm": 0.22929018854467553, "learning_rate": 8.460248393505885e-06, "loss": 0.0349, "step": 1627 }, { "epoch": 0.5885755603759942, "grad_norm": 0.002452005229382836, "learning_rate": 8.458090060479e-06, "loss": 0.0001, "step": 1628 }, { "epoch": 0.588937093275488, "grad_norm": 0.7324283424037026, "learning_rate": 8.455930491531456e-06, "loss": 0.0349, "step": 1629 }, { "epoch": 0.589298626174982, "grad_norm": 0.7362288768907421, "learning_rate": 8.453769687435086e-06, "loss": 0.0977, "step": 1630 }, { "epoch": 0.5896601590744758, "grad_norm": 0.2670327164055367, "learning_rate": 8.451607648962156e-06, "loss": 0.0071, "step": 1631 }, { "epoch": 0.5900216919739696, "grad_norm": 0.2109010601973659, "learning_rate": 8.449444376885383e-06, "loss": 0.0352, "step": 1632 }, { "epoch": 0.5903832248734635, "grad_norm": 0.0028810618622087053, "learning_rate": 8.447279871977916e-06, "loss": 0.0001, "step": 1633 }, { "epoch": 0.5907447577729573, "grad_norm": 0.0017253732101529023, "learning_rate": 8.44511413501335e-06, "loss": 0.0001, "step": 1634 }, { "epoch": 0.5911062906724512, "grad_norm": 0.0030343629490846864, "learning_rate": 8.44294716676572e-06, "loss": 0.0001, "step": 1635 }, { "epoch": 0.591467823571945, "grad_norm": 0.004011202934861907, "learning_rate": 8.4407789680095e-06, "loss": 0.0001, "step": 1636 }, { "epoch": 0.5918293564714389, "grad_norm": 0.22878284728938933, "learning_rate": 8.438609539519601e-06, "loss": 0.0317, "step": 1637 }, { "epoch": 0.5921908893709328, "grad_norm": 0.20191555447309706, "learning_rate": 8.43643888207138e-06, "loss": 0.0254, "step": 1638 }, { "epoch": 0.5925524222704266, "grad_norm": 0.33895369364175987, "learning_rate": 8.434266996440628e-06, "loss": 0.0388, "step": 1639 }, { "epoch": 0.5929139551699205, "grad_norm": 0.21989613700925378, "learning_rate": 8.432093883403576e-06, "loss": 0.0283, "step": 1640 }, { "epoch": 0.5932754880694143, "grad_norm": 0.705147151712581, "learning_rate": 8.429919543736892e-06, "loss": 0.1406, "step": 1641 }, { "epoch": 0.5936370209689081, "grad_norm": 0.7963482449213748, "learning_rate": 8.42774397821769e-06, "loss": 0.083, "step": 1642 }, { "epoch": 0.5939985538684021, "grad_norm": 0.7252821985739446, "learning_rate": 8.425567187623513e-06, "loss": 0.0391, "step": 1643 }, { "epoch": 0.5943600867678959, "grad_norm": 0.2614617220287872, "learning_rate": 8.423389172732345e-06, "loss": 0.0317, "step": 1644 }, { "epoch": 0.5947216196673897, "grad_norm": 0.08229024023941152, "learning_rate": 8.42120993432261e-06, "loss": 0.005, "step": 1645 }, { "epoch": 0.5950831525668836, "grad_norm": 0.3055047083769267, "learning_rate": 8.419029473173166e-06, "loss": 0.0388, "step": 1646 }, { "epoch": 0.5954446854663774, "grad_norm": 0.06748157616468597, "learning_rate": 8.41684779006331e-06, "loss": 0.0031, "step": 1647 }, { "epoch": 0.5958062183658713, "grad_norm": 0.03632850105662851, "learning_rate": 8.414664885772774e-06, "loss": 0.0019, "step": 1648 }, { "epoch": 0.5961677512653651, "grad_norm": 0.11354814743314467, "learning_rate": 8.412480761081729e-06, "loss": 0.0063, "step": 1649 }, { "epoch": 0.596529284164859, "grad_norm": 0.7027631707413158, "learning_rate": 8.410295416770782e-06, "loss": 0.0977, "step": 1650 }, { "epoch": 0.5968908170643529, "grad_norm": 0.011241491779254325, "learning_rate": 8.40810885362097e-06, "loss": 0.0006, "step": 1651 }, { "epoch": 0.5972523499638467, "grad_norm": 0.0022345374133962742, "learning_rate": 8.405921072413774e-06, "loss": 0.0001, "step": 1652 }, { "epoch": 0.5976138828633406, "grad_norm": 0.30877558189522575, "learning_rate": 8.403732073931106e-06, "loss": 0.0476, "step": 1653 }, { "epoch": 0.5979754157628344, "grad_norm": 0.24160908090632593, "learning_rate": 8.401541858955312e-06, "loss": 0.0349, "step": 1654 }, { "epoch": 0.5983369486623282, "grad_norm": 0.816853199396416, "learning_rate": 8.399350428269177e-06, "loss": 0.0527, "step": 1655 }, { "epoch": 0.5986984815618221, "grad_norm": 0.2822686227044238, "learning_rate": 8.397157782655915e-06, "loss": 0.0145, "step": 1656 }, { "epoch": 0.599060014461316, "grad_norm": 0.018579782367913546, "learning_rate": 8.394963922899178e-06, "loss": 0.0011, "step": 1657 }, { "epoch": 0.5994215473608099, "grad_norm": 0.19442118933026503, "learning_rate": 8.392768849783053e-06, "loss": 0.0283, "step": 1658 }, { "epoch": 0.5997830802603037, "grad_norm": 0.02958567964698857, "learning_rate": 8.390572564092056e-06, "loss": 0.0013, "step": 1659 }, { "epoch": 0.6001446131597975, "grad_norm": 0.30003469496980967, "learning_rate": 8.388375066611141e-06, "loss": 0.0204, "step": 1660 }, { "epoch": 0.6005061460592914, "grad_norm": 0.11129857397681463, "learning_rate": 8.386176358125693e-06, "loss": 0.0064, "step": 1661 }, { "epoch": 0.6008676789587852, "grad_norm": 0.8088473673133124, "learning_rate": 8.383976439421525e-06, "loss": 0.0635, "step": 1662 }, { "epoch": 0.6012292118582792, "grad_norm": 0.18103861070355382, "learning_rate": 8.381775311284896e-06, "loss": 0.0254, "step": 1663 }, { "epoch": 0.601590744757773, "grad_norm": 0.6625692722885713, "learning_rate": 8.379572974502483e-06, "loss": 0.1406, "step": 1664 }, { "epoch": 0.6019522776572668, "grad_norm": 1.2886657352247823, "learning_rate": 8.377369429861403e-06, "loss": 0.0693, "step": 1665 }, { "epoch": 0.6023138105567607, "grad_norm": 0.7603018003059739, "learning_rate": 8.375164678149201e-06, "loss": 0.0693, "step": 1666 }, { "epoch": 0.6026753434562545, "grad_norm": 0.22989351509097772, "learning_rate": 8.372958720153855e-06, "loss": 0.0091, "step": 1667 }, { "epoch": 0.6030368763557483, "grad_norm": 0.170617272219113, "learning_rate": 8.370751556663774e-06, "loss": 0.0254, "step": 1668 }, { "epoch": 0.6033984092552422, "grad_norm": 0.22576237042294828, "learning_rate": 8.3685431884678e-06, "loss": 0.0227, "step": 1669 }, { "epoch": 0.603759942154736, "grad_norm": 0.03663659330184921, "learning_rate": 8.366333616355198e-06, "loss": 0.0005, "step": 1670 }, { "epoch": 0.60412147505423, "grad_norm": 0.7048677759424695, "learning_rate": 8.364122841115675e-06, "loss": 0.0317, "step": 1671 }, { "epoch": 0.6044830079537238, "grad_norm": 0.25445970337643253, "learning_rate": 8.361910863539357e-06, "loss": 0.0317, "step": 1672 }, { "epoch": 0.6048445408532176, "grad_norm": 0.6347019575242633, "learning_rate": 8.359697684416805e-06, "loss": 0.1406, "step": 1673 }, { "epoch": 0.6052060737527115, "grad_norm": 0.9474788260193642, "learning_rate": 8.357483304539012e-06, "loss": 0.083, "step": 1674 }, { "epoch": 0.6055676066522053, "grad_norm": 0.563804969443867, "learning_rate": 8.355267724697394e-06, "loss": 0.0522, "step": 1675 }, { "epoch": 0.6059291395516992, "grad_norm": 0.14600245336997783, "learning_rate": 8.353050945683798e-06, "loss": 0.0181, "step": 1676 }, { "epoch": 0.6062906724511931, "grad_norm": 0.03572865205259862, "learning_rate": 8.350832968290502e-06, "loss": 0.0006, "step": 1677 }, { "epoch": 0.6066522053506869, "grad_norm": 0.39706663912520057, "learning_rate": 8.34861379331021e-06, "loss": 0.0352, "step": 1678 }, { "epoch": 0.6070137382501808, "grad_norm": 0.400433473826511, "learning_rate": 8.346393421536056e-06, "loss": 0.0162, "step": 1679 }, { "epoch": 0.6073752711496746, "grad_norm": 0.8131934407541974, "learning_rate": 8.344171853761599e-06, "loss": 0.1226, "step": 1680 }, { "epoch": 0.6077368040491685, "grad_norm": 1.1558789061535806, "learning_rate": 8.341949090780827e-06, "loss": 0.0693, "step": 1681 }, { "epoch": 0.6080983369486623, "grad_norm": 0.3454076934098343, "learning_rate": 8.339725133388154e-06, "loss": 0.0349, "step": 1682 }, { "epoch": 0.6084598698481561, "grad_norm": 0.7272691703762653, "learning_rate": 8.337499982378426e-06, "loss": 0.0635, "step": 1683 }, { "epoch": 0.6088214027476501, "grad_norm": 0.032540930844048646, "learning_rate": 8.335273638546906e-06, "loss": 0.0005, "step": 1684 }, { "epoch": 0.6091829356471439, "grad_norm": 0.3136747694726391, "learning_rate": 8.333046102689293e-06, "loss": 0.0349, "step": 1685 }, { "epoch": 0.6095444685466378, "grad_norm": 0.00035914139504547295, "learning_rate": 8.330817375601705e-06, "loss": 0.0, "step": 1686 }, { "epoch": 0.6099060014461316, "grad_norm": 0.21371545479024825, "learning_rate": 8.328587458080691e-06, "loss": 0.0254, "step": 1687 }, { "epoch": 0.6102675343456254, "grad_norm": 0.0002722699274678118, "learning_rate": 8.326356350923221e-06, "loss": 0.0, "step": 1688 }, { "epoch": 0.6106290672451193, "grad_norm": 0.0006367260808415793, "learning_rate": 8.324124054926695e-06, "loss": 0.0, "step": 1689 }, { "epoch": 0.6109906001446131, "grad_norm": 0.4036275944772169, "learning_rate": 8.32189057088893e-06, "loss": 0.0227, "step": 1690 }, { "epoch": 0.611352133044107, "grad_norm": 0.026692331619621326, "learning_rate": 8.319655899608182e-06, "loss": 0.0012, "step": 1691 }, { "epoch": 0.6117136659436009, "grad_norm": 1.0468621577840953, "learning_rate": 8.31742004188311e-06, "loss": 0.0693, "step": 1692 }, { "epoch": 0.6120751988430947, "grad_norm": 0.6734501478636129, "learning_rate": 8.315182998512817e-06, "loss": 0.0283, "step": 1693 }, { "epoch": 0.6124367317425886, "grad_norm": 0.005069567474169714, "learning_rate": 8.31294477029682e-06, "loss": 0.0002, "step": 1694 }, { "epoch": 0.6127982646420824, "grad_norm": 0.3899314418937365, "learning_rate": 8.310705358035062e-06, "loss": 0.0349, "step": 1695 }, { "epoch": 0.6131597975415762, "grad_norm": 2.499881862010968, "learning_rate": 8.308464762527907e-06, "loss": 0.1699, "step": 1696 }, { "epoch": 0.6135213304410702, "grad_norm": 0.003865444522388866, "learning_rate": 8.306222984576145e-06, "loss": 0.0002, "step": 1697 }, { "epoch": 0.613882863340564, "grad_norm": 0.18145900774996782, "learning_rate": 8.303980024980986e-06, "loss": 0.0056, "step": 1698 }, { "epoch": 0.6142443962400579, "grad_norm": 2.62674445071756, "learning_rate": 8.301735884544062e-06, "loss": 0.2236, "step": 1699 }, { "epoch": 0.6146059291395517, "grad_norm": 0.20918988039026146, "learning_rate": 8.29949056406743e-06, "loss": 0.0254, "step": 1700 }, { "epoch": 0.6149674620390455, "grad_norm": 0.6407427952869394, "learning_rate": 8.297244064353566e-06, "loss": 0.0254, "step": 1701 }, { "epoch": 0.6153289949385394, "grad_norm": 0.40494925851057023, "learning_rate": 8.294996386205372e-06, "loss": 0.0254, "step": 1702 }, { "epoch": 0.6156905278380332, "grad_norm": 0.6331740877722607, "learning_rate": 8.292747530426165e-06, "loss": 0.1504, "step": 1703 }, { "epoch": 0.6160520607375272, "grad_norm": 0.9265730979673779, "learning_rate": 8.290497497819682e-06, "loss": 0.0903, "step": 1704 }, { "epoch": 0.616413593637021, "grad_norm": 0.12311455560886958, "learning_rate": 8.28824628919009e-06, "loss": 0.0161, "step": 1705 }, { "epoch": 0.6167751265365148, "grad_norm": 0.29785218181757084, "learning_rate": 8.285993905341968e-06, "loss": 0.0254, "step": 1706 }, { "epoch": 0.6171366594360087, "grad_norm": 1.0234355397120931, "learning_rate": 8.283740347080318e-06, "loss": 0.0388, "step": 1707 }, { "epoch": 0.6174981923355025, "grad_norm": 0.2001633830380345, "learning_rate": 8.281485615210559e-06, "loss": 0.0227, "step": 1708 }, { "epoch": 0.6178597252349964, "grad_norm": 0.5804818711261612, "learning_rate": 8.279229710538536e-06, "loss": 0.0432, "step": 1709 }, { "epoch": 0.6182212581344902, "grad_norm": 1.0903433456457519, "learning_rate": 8.276972633870507e-06, "loss": 0.1309, "step": 1710 }, { "epoch": 0.6185827910339841, "grad_norm": 0.9994338995031923, "learning_rate": 8.274714386013147e-06, "loss": 0.0579, "step": 1711 }, { "epoch": 0.618944323933478, "grad_norm": 0.1652232547525637, "learning_rate": 8.272454967773559e-06, "loss": 0.0227, "step": 1712 }, { "epoch": 0.6193058568329718, "grad_norm": 0.41902484759728553, "learning_rate": 8.270194379959256e-06, "loss": 0.0352, "step": 1713 }, { "epoch": 0.6196673897324656, "grad_norm": 0.8246814839716903, "learning_rate": 8.26793262337817e-06, "loss": 0.1504, "step": 1714 }, { "epoch": 0.6200289226319595, "grad_norm": 0.9985667182716, "learning_rate": 8.265669698838656e-06, "loss": 0.0476, "step": 1715 }, { "epoch": 0.6203904555314533, "grad_norm": 1.3677276893088244, "learning_rate": 8.26340560714948e-06, "loss": 0.1699, "step": 1716 }, { "epoch": 0.6207519884309473, "grad_norm": 0.10457416664904881, "learning_rate": 8.261140349119829e-06, "loss": 0.0056, "step": 1717 }, { "epoch": 0.6211135213304411, "grad_norm": 0.007222472176080509, "learning_rate": 8.258873925559304e-06, "loss": 0.0003, "step": 1718 }, { "epoch": 0.6214750542299349, "grad_norm": 0.1134381069507584, "learning_rate": 8.256606337277926e-06, "loss": 0.0161, "step": 1719 }, { "epoch": 0.6218365871294288, "grad_norm": 1.1888038790108524, "learning_rate": 8.254337585086132e-06, "loss": 0.1406, "step": 1720 }, { "epoch": 0.6221981200289226, "grad_norm": 0.6221824462147065, "learning_rate": 8.252067669794772e-06, "loss": 0.0254, "step": 1721 }, { "epoch": 0.6225596529284165, "grad_norm": 5.173559583085803, "learning_rate": 8.249796592215112e-06, "loss": 0.5469, "step": 1722 }, { "epoch": 0.6229211858279103, "grad_norm": 0.6044450805657549, "learning_rate": 8.247524353158836e-06, "loss": 0.0527, "step": 1723 }, { "epoch": 0.6232827187274042, "grad_norm": 0.16210689232585682, "learning_rate": 8.245250953438041e-06, "loss": 0.0181, "step": 1724 }, { "epoch": 0.6236442516268981, "grad_norm": 0.8017533335271919, "learning_rate": 8.242976393865242e-06, "loss": 0.1699, "step": 1725 }, { "epoch": 0.6240057845263919, "grad_norm": 0.11939906977251073, "learning_rate": 8.240700675253362e-06, "loss": 0.0181, "step": 1726 }, { "epoch": 0.6243673174258858, "grad_norm": 0.002825352569288785, "learning_rate": 8.238423798415747e-06, "loss": 0.0001, "step": 1727 }, { "epoch": 0.6247288503253796, "grad_norm": 0.004505082042185973, "learning_rate": 8.236145764166147e-06, "loss": 0.0002, "step": 1728 }, { "epoch": 0.6250903832248734, "grad_norm": 0.0006117949056868747, "learning_rate": 8.233866573318736e-06, "loss": 0.0, "step": 1729 }, { "epoch": 0.6254519161243673, "grad_norm": 0.31042338360069766, "learning_rate": 8.231586226688093e-06, "loss": 0.0352, "step": 1730 }, { "epoch": 0.6258134490238612, "grad_norm": 0.27195363837137226, "learning_rate": 8.229304725089216e-06, "loss": 0.0317, "step": 1731 }, { "epoch": 0.6261749819233551, "grad_norm": 0.18331143684804863, "learning_rate": 8.22702206933751e-06, "loss": 0.0203, "step": 1732 }, { "epoch": 0.6265365148228489, "grad_norm": 0.6715672174152186, "learning_rate": 8.2247382602488e-06, "loss": 0.0283, "step": 1733 }, { "epoch": 0.6268980477223427, "grad_norm": 0.6691386522249371, "learning_rate": 8.222453298639314e-06, "loss": 0.0476, "step": 1734 }, { "epoch": 0.6272595806218366, "grad_norm": 0.15913164630266982, "learning_rate": 8.220167185325699e-06, "loss": 0.0045, "step": 1735 }, { "epoch": 0.6276211135213304, "grad_norm": 1.0335203752075763, "learning_rate": 8.217879921125012e-06, "loss": 0.0762, "step": 1736 }, { "epoch": 0.6279826464208242, "grad_norm": 0.7114272418171819, "learning_rate": 8.21559150685472e-06, "loss": 0.0349, "step": 1737 }, { "epoch": 0.6283441793203182, "grad_norm": 0.1349418875189994, "learning_rate": 8.213301943332703e-06, "loss": 0.0181, "step": 1738 }, { "epoch": 0.628705712219812, "grad_norm": 2.7767296719434693, "learning_rate": 8.211011231377251e-06, "loss": 0.0579, "step": 1739 }, { "epoch": 0.6290672451193059, "grad_norm": 0.009330120898378489, "learning_rate": 8.208719371807059e-06, "loss": 0.0005, "step": 1740 }, { "epoch": 0.6294287780187997, "grad_norm": 0.20868518574106168, "learning_rate": 8.206426365441243e-06, "loss": 0.0227, "step": 1741 }, { "epoch": 0.6297903109182935, "grad_norm": 0.2637370983317925, "learning_rate": 8.204132213099321e-06, "loss": 0.0315, "step": 1742 }, { "epoch": 0.6301518438177874, "grad_norm": 2.613018927338558, "learning_rate": 8.201836915601222e-06, "loss": 0.416, "step": 1743 }, { "epoch": 0.6305133767172812, "grad_norm": 0.5505936002366484, "learning_rate": 8.199540473767284e-06, "loss": 0.0388, "step": 1744 }, { "epoch": 0.6308749096167752, "grad_norm": 0.7355883051646753, "learning_rate": 8.197242888418255e-06, "loss": 0.0579, "step": 1745 }, { "epoch": 0.631236442516269, "grad_norm": 0.3368683125467279, "learning_rate": 8.194944160375294e-06, "loss": 0.0349, "step": 1746 }, { "epoch": 0.6315979754157628, "grad_norm": 1.8222373863499741, "learning_rate": 8.192644290459963e-06, "loss": 0.2012, "step": 1747 }, { "epoch": 0.6319595083152567, "grad_norm": 0.05219487087634193, "learning_rate": 8.19034327949424e-06, "loss": 0.0013, "step": 1748 }, { "epoch": 0.6323210412147505, "grad_norm": 2.4472936792283444, "learning_rate": 8.1880411283005e-06, "loss": 0.3535, "step": 1749 }, { "epoch": 0.6326825741142444, "grad_norm": 0.17786782911980575, "learning_rate": 8.185737837701532e-06, "loss": 0.0227, "step": 1750 }, { "epoch": 0.6330441070137383, "grad_norm": 0.2874246457676767, "learning_rate": 8.183433408520533e-06, "loss": 0.0203, "step": 1751 }, { "epoch": 0.6334056399132321, "grad_norm": 0.4135811424904911, "learning_rate": 8.181127841581109e-06, "loss": 0.0352, "step": 1752 }, { "epoch": 0.633767172812726, "grad_norm": 0.030824862507061526, "learning_rate": 8.178821137707263e-06, "loss": 0.0013, "step": 1753 }, { "epoch": 0.6341287057122198, "grad_norm": 0.0017914435494427383, "learning_rate": 8.176513297723413e-06, "loss": 0.0001, "step": 1754 }, { "epoch": 0.6344902386117137, "grad_norm": 0.0008229137013181858, "learning_rate": 8.174204322454382e-06, "loss": 0.0, "step": 1755 }, { "epoch": 0.6348517715112075, "grad_norm": 0.6018415053959089, "learning_rate": 8.171894212725397e-06, "loss": 0.0254, "step": 1756 }, { "epoch": 0.6352133044107013, "grad_norm": 0.15450779273498438, "learning_rate": 8.169582969362089e-06, "loss": 0.0056, "step": 1757 }, { "epoch": 0.6355748373101953, "grad_norm": 0.7644763640156402, "learning_rate": 8.167270593190495e-06, "loss": 0.0203, "step": 1758 }, { "epoch": 0.6359363702096891, "grad_norm": 0.002091866828820639, "learning_rate": 8.164957085037063e-06, "loss": 0.0001, "step": 1759 }, { "epoch": 0.6362979031091829, "grad_norm": 0.23240390136216252, "learning_rate": 8.162642445728632e-06, "loss": 0.0227, "step": 1760 }, { "epoch": 0.6366594360086768, "grad_norm": 0.20550528857348094, "learning_rate": 8.16032667609246e-06, "loss": 0.0227, "step": 1761 }, { "epoch": 0.6370209689081706, "grad_norm": 0.29882785280486, "learning_rate": 8.158009776956202e-06, "loss": 0.0227, "step": 1762 }, { "epoch": 0.6373825018076645, "grad_norm": 0.002005111422346491, "learning_rate": 8.155691749147917e-06, "loss": 0.0001, "step": 1763 }, { "epoch": 0.6377440347071583, "grad_norm": 0.0009936829487690593, "learning_rate": 8.153372593496065e-06, "loss": 0.0001, "step": 1764 }, { "epoch": 0.6381055676066522, "grad_norm": 0.0027068564504314574, "learning_rate": 8.151052310829515e-06, "loss": 0.0001, "step": 1765 }, { "epoch": 0.6384671005061461, "grad_norm": 0.0784677157746207, "learning_rate": 8.148730901977533e-06, "loss": 0.0114, "step": 1766 }, { "epoch": 0.6388286334056399, "grad_norm": 0.25702104268457976, "learning_rate": 8.146408367769792e-06, "loss": 0.0203, "step": 1767 }, { "epoch": 0.6391901663051338, "grad_norm": 0.16472687850391077, "learning_rate": 8.144084709036362e-06, "loss": 0.0182, "step": 1768 }, { "epoch": 0.6395516992046276, "grad_norm": 1.2236609487581884, "learning_rate": 8.141759926607724e-06, "loss": 0.0476, "step": 1769 }, { "epoch": 0.6399132321041214, "grad_norm": 1.2169999864165975, "learning_rate": 8.139434021314749e-06, "loss": 0.1226, "step": 1770 }, { "epoch": 0.6402747650036154, "grad_norm": 1.2649106208970626, "learning_rate": 8.137106993988717e-06, "loss": 0.1309, "step": 1771 }, { "epoch": 0.6406362979031092, "grad_norm": 0.3879531671130575, "learning_rate": 8.134778845461308e-06, "loss": 0.0203, "step": 1772 }, { "epoch": 0.6409978308026031, "grad_norm": 0.12632082807309292, "learning_rate": 8.132449576564603e-06, "loss": 0.0128, "step": 1773 }, { "epoch": 0.6413593637020969, "grad_norm": 4.521156128585228, "learning_rate": 8.130119188131078e-06, "loss": 0.0693, "step": 1774 }, { "epoch": 0.6417208966015907, "grad_norm": 0.6985186885666853, "learning_rate": 8.127787680993617e-06, "loss": 0.2344, "step": 1775 }, { "epoch": 0.6420824295010846, "grad_norm": 0.5493172958833231, "learning_rate": 8.125455055985499e-06, "loss": 0.0388, "step": 1776 }, { "epoch": 0.6424439624005784, "grad_norm": 0.7778155522963593, "learning_rate": 8.123121313940403e-06, "loss": 0.2471, "step": 1777 }, { "epoch": 0.6428054953000724, "grad_norm": 0.17770901085100776, "learning_rate": 8.12078645569241e-06, "loss": 0.0182, "step": 1778 }, { "epoch": 0.6431670281995662, "grad_norm": 0.9405619912508072, "learning_rate": 8.118450482075995e-06, "loss": 0.1699, "step": 1779 }, { "epoch": 0.64352856109906, "grad_norm": 1.1995647605834465, "learning_rate": 8.116113393926036e-06, "loss": 0.0762, "step": 1780 }, { "epoch": 0.6438900939985539, "grad_norm": 0.35148063728620754, "learning_rate": 8.113775192077806e-06, "loss": 0.0317, "step": 1781 }, { "epoch": 0.6442516268980477, "grad_norm": 0.17934723687354026, "learning_rate": 8.111435877366982e-06, "loss": 0.0203, "step": 1782 }, { "epoch": 0.6446131597975415, "grad_norm": 0.026580990325762784, "learning_rate": 8.109095450629629e-06, "loss": 0.0008, "step": 1783 }, { "epoch": 0.6449746926970354, "grad_norm": 0.12529568845751501, "learning_rate": 8.10675391270222e-06, "loss": 0.0162, "step": 1784 }, { "epoch": 0.6453362255965293, "grad_norm": 0.13514351711166073, "learning_rate": 8.10441126442162e-06, "loss": 0.0143, "step": 1785 }, { "epoch": 0.6456977584960232, "grad_norm": 0.17133497113016904, "learning_rate": 8.102067506625086e-06, "loss": 0.008, "step": 1786 }, { "epoch": 0.646059291395517, "grad_norm": 0.45194120702412677, "learning_rate": 8.099722640150283e-06, "loss": 0.0317, "step": 1787 }, { "epoch": 0.6464208242950108, "grad_norm": 0.041796869936875265, "learning_rate": 8.097376665835258e-06, "loss": 0.0017, "step": 1788 }, { "epoch": 0.6467823571945047, "grad_norm": 0.1207593845813763, "learning_rate": 8.095029584518472e-06, "loss": 0.0161, "step": 1789 }, { "epoch": 0.6471438900939985, "grad_norm": 0.14502283468653493, "learning_rate": 8.092681397038762e-06, "loss": 0.0143, "step": 1790 }, { "epoch": 0.6475054229934925, "grad_norm": 1.1735847768481897, "learning_rate": 8.090332104235375e-06, "loss": 0.0693, "step": 1791 }, { "epoch": 0.6478669558929863, "grad_norm": 0.001905948736736653, "learning_rate": 8.087981706947946e-06, "loss": 0.0001, "step": 1792 }, { "epoch": 0.6482284887924801, "grad_norm": 0.3419925249172993, "learning_rate": 8.085630206016505e-06, "loss": 0.0283, "step": 1793 }, { "epoch": 0.648590021691974, "grad_norm": 0.15474991244563985, "learning_rate": 8.083277602281481e-06, "loss": 0.0161, "step": 1794 }, { "epoch": 0.6489515545914678, "grad_norm": 0.1691865331801225, "learning_rate": 8.080923896583692e-06, "loss": 0.0203, "step": 1795 }, { "epoch": 0.6493130874909617, "grad_norm": 0.0112837016456128, "learning_rate": 8.078569089764352e-06, "loss": 0.0005, "step": 1796 }, { "epoch": 0.6496746203904555, "grad_norm": 1.1347332927974698, "learning_rate": 8.076213182665072e-06, "loss": 0.1504, "step": 1797 }, { "epoch": 0.6500361532899493, "grad_norm": 0.10807989936531905, "learning_rate": 8.073856176127845e-06, "loss": 0.0143, "step": 1798 }, { "epoch": 0.6503976861894433, "grad_norm": 0.009061862419332848, "learning_rate": 8.071498070995075e-06, "loss": 0.0005, "step": 1799 }, { "epoch": 0.6507592190889371, "grad_norm": 1.3603100023918389, "learning_rate": 8.069138868109539e-06, "loss": 0.1055, "step": 1800 }, { "epoch": 0.651120751988431, "grad_norm": 0.6907807224719349, "learning_rate": 8.066778568314418e-06, "loss": 0.0349, "step": 1801 }, { "epoch": 0.6514822848879248, "grad_norm": 0.010288092383542025, "learning_rate": 8.064417172453286e-06, "loss": 0.0005, "step": 1802 }, { "epoch": 0.6518438177874186, "grad_norm": 0.003158882205048261, "learning_rate": 8.062054681370102e-06, "loss": 0.0002, "step": 1803 }, { "epoch": 0.6522053506869125, "grad_norm": 1.6362257888997065, "learning_rate": 8.059691095909223e-06, "loss": 0.1055, "step": 1804 }, { "epoch": 0.6525668835864064, "grad_norm": 0.08974916127703955, "learning_rate": 8.057326416915393e-06, "loss": 0.0143, "step": 1805 }, { "epoch": 0.6529284164859002, "grad_norm": 0.28064663235795406, "learning_rate": 8.054960645233743e-06, "loss": 0.0283, "step": 1806 }, { "epoch": 0.6532899493853941, "grad_norm": 0.18180408852157465, "learning_rate": 8.052593781709806e-06, "loss": 0.0227, "step": 1807 }, { "epoch": 0.6536514822848879, "grad_norm": 0.1681183212308126, "learning_rate": 8.050225827189492e-06, "loss": 0.0203, "step": 1808 }, { "epoch": 0.6540130151843818, "grad_norm": 2.6398528287177023, "learning_rate": 8.047856782519114e-06, "loss": 0.2471, "step": 1809 }, { "epoch": 0.6543745480838756, "grad_norm": 0.3467041868072477, "learning_rate": 8.045486648545367e-06, "loss": 0.0145, "step": 1810 }, { "epoch": 0.6547360809833694, "grad_norm": 0.14436242275847486, "learning_rate": 8.04311542611533e-06, "loss": 0.0203, "step": 1811 }, { "epoch": 0.6550976138828634, "grad_norm": 0.1298256504427974, "learning_rate": 8.040743116076485e-06, "loss": 0.0161, "step": 1812 }, { "epoch": 0.6554591467823572, "grad_norm": 0.03102319296647661, "learning_rate": 8.038369719276692e-06, "loss": 0.0013, "step": 1813 }, { "epoch": 0.6558206796818511, "grad_norm": 0.6706134978683179, "learning_rate": 8.035995236564202e-06, "loss": 0.0227, "step": 1814 }, { "epoch": 0.6561822125813449, "grad_norm": 0.7061696371264174, "learning_rate": 8.033619668787656e-06, "loss": 0.0476, "step": 1815 }, { "epoch": 0.6565437454808387, "grad_norm": 0.8541524406156918, "learning_rate": 8.031243016796078e-06, "loss": 0.1914, "step": 1816 }, { "epoch": 0.6569052783803326, "grad_norm": 0.7658925397584904, "learning_rate": 8.028865281438888e-06, "loss": 0.1602, "step": 1817 }, { "epoch": 0.6572668112798264, "grad_norm": 0.9583509793581059, "learning_rate": 8.026486463565884e-06, "loss": 0.0635, "step": 1818 }, { "epoch": 0.6576283441793204, "grad_norm": 0.8705484143048631, "learning_rate": 8.024106564027257e-06, "loss": 0.1602, "step": 1819 }, { "epoch": 0.6579898770788142, "grad_norm": 0.11492726058332349, "learning_rate": 8.021725583673583e-06, "loss": 0.0162, "step": 1820 }, { "epoch": 0.658351409978308, "grad_norm": 1.0059435256381346, "learning_rate": 8.019343523355824e-06, "loss": 0.0977, "step": 1821 }, { "epoch": 0.6587129428778019, "grad_norm": 0.18290898549720797, "learning_rate": 8.016960383925326e-06, "loss": 0.0203, "step": 1822 }, { "epoch": 0.6590744757772957, "grad_norm": 0.20050702141759438, "learning_rate": 8.014576166233823e-06, "loss": 0.0283, "step": 1823 }, { "epoch": 0.6594360086767896, "grad_norm": 0.06245232251137809, "learning_rate": 8.012190871133434e-06, "loss": 0.0035, "step": 1824 }, { "epoch": 0.6597975415762835, "grad_norm": 0.12176616684024774, "learning_rate": 8.009804499476664e-06, "loss": 0.0161, "step": 1825 }, { "epoch": 0.6601590744757773, "grad_norm": 1.2326265205078333, "learning_rate": 8.007417052116401e-06, "loss": 0.0693, "step": 1826 }, { "epoch": 0.6605206073752712, "grad_norm": 0.5613341218740272, "learning_rate": 8.005028529905918e-06, "loss": 0.0635, "step": 1827 }, { "epoch": 0.660882140274765, "grad_norm": 0.09274630681683801, "learning_rate": 8.002638933698872e-06, "loss": 0.0035, "step": 1828 }, { "epoch": 0.6612436731742588, "grad_norm": 0.20213680633561482, "learning_rate": 8.000248264349306e-06, "loss": 0.0254, "step": 1829 }, { "epoch": 0.6616052060737527, "grad_norm": 0.27403204032846573, "learning_rate": 7.997856522711645e-06, "loss": 0.0091, "step": 1830 }, { "epoch": 0.6619667389732465, "grad_norm": 0.152962885893667, "learning_rate": 7.995463709640692e-06, "loss": 0.0227, "step": 1831 }, { "epoch": 0.6623282718727405, "grad_norm": 0.005119207288695607, "learning_rate": 7.993069825991643e-06, "loss": 0.0002, "step": 1832 }, { "epoch": 0.6626898047722343, "grad_norm": 0.4057956243022047, "learning_rate": 7.99067487262007e-06, "loss": 0.0254, "step": 1833 }, { "epoch": 0.6630513376717281, "grad_norm": 0.6838575257646317, "learning_rate": 7.988278850381927e-06, "loss": 0.1914, "step": 1834 }, { "epoch": 0.663412870571222, "grad_norm": 1.03140126194431, "learning_rate": 7.985881760133556e-06, "loss": 0.0432, "step": 1835 }, { "epoch": 0.6637744034707158, "grad_norm": 2.089058718301002, "learning_rate": 7.983483602731673e-06, "loss": 0.2012, "step": 1836 }, { "epoch": 0.6641359363702097, "grad_norm": 0.03862702180239089, "learning_rate": 7.98108437903338e-06, "loss": 0.0019, "step": 1837 }, { "epoch": 0.6644974692697035, "grad_norm": 0.6353748272385401, "learning_rate": 7.978684089896159e-06, "loss": 0.1807, "step": 1838 }, { "epoch": 0.6648590021691974, "grad_norm": 0.9301367144260171, "learning_rate": 7.976282736177872e-06, "loss": 0.1226, "step": 1839 }, { "epoch": 0.6652205350686913, "grad_norm": 0.20489236939281757, "learning_rate": 7.973880318736764e-06, "loss": 0.0283, "step": 1840 }, { "epoch": 0.6655820679681851, "grad_norm": 0.6933928020100255, "learning_rate": 7.971476838431459e-06, "loss": 0.0432, "step": 1841 }, { "epoch": 0.665943600867679, "grad_norm": 0.13380571273622963, "learning_rate": 7.969072296120958e-06, "loss": 0.0181, "step": 1842 }, { "epoch": 0.6663051337671728, "grad_norm": 0.3140236075419723, "learning_rate": 7.966666692664645e-06, "loss": 0.0349, "step": 1843 }, { "epoch": 0.6666666666666666, "grad_norm": 0.2949781975089199, "learning_rate": 7.964260028922282e-06, "loss": 0.0183, "step": 1844 }, { "epoch": 0.6670281995661606, "grad_norm": 1.1083946185130338, "learning_rate": 7.96185230575401e-06, "loss": 0.0762, "step": 1845 }, { "epoch": 0.6673897324656544, "grad_norm": 0.17859292296594945, "learning_rate": 7.95944352402035e-06, "loss": 0.0181, "step": 1846 }, { "epoch": 0.6677512653651483, "grad_norm": 0.24595228938077526, "learning_rate": 7.957033684582198e-06, "loss": 0.0283, "step": 1847 }, { "epoch": 0.6681127982646421, "grad_norm": 0.0012758144558212576, "learning_rate": 7.954622788300831e-06, "loss": 0.0, "step": 1848 }, { "epoch": 0.6684743311641359, "grad_norm": 0.05371974196847167, "learning_rate": 7.952210836037903e-06, "loss": 0.0024, "step": 1849 }, { "epoch": 0.6688358640636298, "grad_norm": 0.8953546329701039, "learning_rate": 7.949797828655446e-06, "loss": 0.0391, "step": 1850 }, { "epoch": 0.6691973969631236, "grad_norm": 0.03246350161002171, "learning_rate": 7.947383767015867e-06, "loss": 0.0015, "step": 1851 }, { "epoch": 0.6695589298626174, "grad_norm": 0.7633036379727791, "learning_rate": 7.944968651981953e-06, "loss": 0.0762, "step": 1852 }, { "epoch": 0.6699204627621114, "grad_norm": 0.7520809098377855, "learning_rate": 7.942552484416863e-06, "loss": 0.1699, "step": 1853 }, { "epoch": 0.6702819956616052, "grad_norm": 1.0347360436722817, "learning_rate": 7.940135265184135e-06, "loss": 0.1699, "step": 1854 }, { "epoch": 0.6706435285610991, "grad_norm": 0.14417239605647214, "learning_rate": 7.937716995147685e-06, "loss": 0.0227, "step": 1855 }, { "epoch": 0.6710050614605929, "grad_norm": 0.15848580796512252, "learning_rate": 7.935297675171802e-06, "loss": 0.0227, "step": 1856 }, { "epoch": 0.6713665943600867, "grad_norm": 0.6398643609736085, "learning_rate": 7.932877306121148e-06, "loss": 0.1226, "step": 1857 }, { "epoch": 0.6717281272595806, "grad_norm": 0.06851419327080793, "learning_rate": 7.930455888860764e-06, "loss": 0.0027, "step": 1858 }, { "epoch": 0.6720896601590745, "grad_norm": 0.5439386814696219, "learning_rate": 7.928033424256063e-06, "loss": 0.0162, "step": 1859 }, { "epoch": 0.6724511930585684, "grad_norm": 1.8612496911647032, "learning_rate": 7.925609913172834e-06, "loss": 0.1504, "step": 1860 }, { "epoch": 0.6728127259580622, "grad_norm": 0.08133456705805132, "learning_rate": 7.923185356477241e-06, "loss": 0.0035, "step": 1861 }, { "epoch": 0.673174258857556, "grad_norm": 0.9083828132765137, "learning_rate": 7.920759755035818e-06, "loss": 0.0762, "step": 1862 }, { "epoch": 0.6735357917570499, "grad_norm": 0.0012051797668098376, "learning_rate": 7.918333109715475e-06, "loss": 0.0001, "step": 1863 }, { "epoch": 0.6738973246565437, "grad_norm": 4.973006758262653, "learning_rate": 7.915905421383494e-06, "loss": 0.9688, "step": 1864 }, { "epoch": 0.6742588575560376, "grad_norm": 0.04393903030504444, "learning_rate": 7.913476690907532e-06, "loss": 0.0017, "step": 1865 }, { "epoch": 0.6746203904555315, "grad_norm": 0.35191000544560963, "learning_rate": 7.911046919155614e-06, "loss": 0.0115, "step": 1866 }, { "epoch": 0.6749819233550253, "grad_norm": 0.906230657483842, "learning_rate": 7.908616106996143e-06, "loss": 0.0527, "step": 1867 }, { "epoch": 0.6753434562545192, "grad_norm": 0.5242409677574112, "learning_rate": 7.906184255297887e-06, "loss": 0.0527, "step": 1868 }, { "epoch": 0.675704989154013, "grad_norm": 0.2719009794315623, "learning_rate": 7.903751364929993e-06, "loss": 0.0315, "step": 1869 }, { "epoch": 0.6760665220535069, "grad_norm": 0.2375103454899889, "learning_rate": 7.901317436761973e-06, "loss": 0.0388, "step": 1870 }, { "epoch": 0.6764280549530007, "grad_norm": 0.0005423098702890508, "learning_rate": 7.898882471663714e-06, "loss": 0.0, "step": 1871 }, { "epoch": 0.6767895878524945, "grad_norm": 2.7951879261590613, "learning_rate": 7.896446470505473e-06, "loss": 0.416, "step": 1872 }, { "epoch": 0.6771511207519885, "grad_norm": 8.495761252898303, "learning_rate": 7.894009434157873e-06, "loss": 0.0752, "step": 1873 }, { "epoch": 0.6775126536514823, "grad_norm": 0.3518728089470073, "learning_rate": 7.89157136349191e-06, "loss": 0.0432, "step": 1874 }, { "epoch": 0.6778741865509761, "grad_norm": 0.22201393552474974, "learning_rate": 7.889132259378954e-06, "loss": 0.0349, "step": 1875 }, { "epoch": 0.67823571945047, "grad_norm": 0.20870894693246606, "learning_rate": 7.886692122690737e-06, "loss": 0.0315, "step": 1876 }, { "epoch": 0.6785972523499638, "grad_norm": 0.6577099871310942, "learning_rate": 7.884250954299368e-06, "loss": 0.0693, "step": 1877 }, { "epoch": 0.6789587852494577, "grad_norm": 0.5358611060548991, "learning_rate": 7.881808755077314e-06, "loss": 0.0283, "step": 1878 }, { "epoch": 0.6793203181489516, "grad_norm": 0.6888069142090546, "learning_rate": 7.87936552589742e-06, "loss": 0.1143, "step": 1879 }, { "epoch": 0.6796818510484454, "grad_norm": 0.07599893104906252, "learning_rate": 7.876921267632894e-06, "loss": 0.0024, "step": 1880 }, { "epoch": 0.6800433839479393, "grad_norm": 0.31237603368950106, "learning_rate": 7.874475981157315e-06, "loss": 0.0349, "step": 1881 }, { "epoch": 0.6804049168474331, "grad_norm": 2.080951401403826, "learning_rate": 7.872029667344626e-06, "loss": 0.0527, "step": 1882 }, { "epoch": 0.680766449746927, "grad_norm": 0.291516626886813, "learning_rate": 7.86958232706914e-06, "loss": 0.0432, "step": 1883 }, { "epoch": 0.6811279826464208, "grad_norm": 0.23400016950179214, "learning_rate": 7.867133961205536e-06, "loss": 0.0072, "step": 1884 }, { "epoch": 0.6814895155459146, "grad_norm": 0.17513429548842238, "learning_rate": 7.86468457062886e-06, "loss": 0.0283, "step": 1885 }, { "epoch": 0.6818510484454086, "grad_norm": 0.0015548180310393667, "learning_rate": 7.862234156214523e-06, "loss": 0.0001, "step": 1886 }, { "epoch": 0.6822125813449024, "grad_norm": 0.3961235381333004, "learning_rate": 7.859782718838302e-06, "loss": 0.0432, "step": 1887 }, { "epoch": 0.6825741142443963, "grad_norm": 0.1830111240539302, "learning_rate": 7.857330259376341e-06, "loss": 0.0283, "step": 1888 }, { "epoch": 0.6829356471438901, "grad_norm": 0.07280752264792326, "learning_rate": 7.854876778705147e-06, "loss": 0.0019, "step": 1889 }, { "epoch": 0.6832971800433839, "grad_norm": 0.0008401089979923564, "learning_rate": 7.852422277701596e-06, "loss": 0.0, "step": 1890 }, { "epoch": 0.6836587129428778, "grad_norm": 2.250036084619133, "learning_rate": 7.849966757242926e-06, "loss": 0.1699, "step": 1891 }, { "epoch": 0.6840202458423716, "grad_norm": 0.011025836445104464, "learning_rate": 7.847510218206737e-06, "loss": 0.0003, "step": 1892 }, { "epoch": 0.6843817787418656, "grad_norm": 0.36529426812659105, "learning_rate": 7.845052661470998e-06, "loss": 0.0349, "step": 1893 }, { "epoch": 0.6847433116413594, "grad_norm": 0.5140866117592493, "learning_rate": 7.842594087914038e-06, "loss": 0.0228, "step": 1894 }, { "epoch": 0.6851048445408532, "grad_norm": 0.20774647656152873, "learning_rate": 7.840134498414548e-06, "loss": 0.0283, "step": 1895 }, { "epoch": 0.6854663774403471, "grad_norm": 2.9326491338522302, "learning_rate": 7.837673893851591e-06, "loss": 0.1914, "step": 1896 }, { "epoch": 0.6858279103398409, "grad_norm": 0.002815628998082322, "learning_rate": 7.835212275104584e-06, "loss": 0.0001, "step": 1897 }, { "epoch": 0.6861894432393347, "grad_norm": 0.12544377917481844, "learning_rate": 7.832749643053305e-06, "loss": 0.0045, "step": 1898 }, { "epoch": 0.6865509761388287, "grad_norm": 0.0014864956059083886, "learning_rate": 7.830285998577905e-06, "loss": 0.0001, "step": 1899 }, { "epoch": 0.6869125090383225, "grad_norm": 0.5787139746443142, "learning_rate": 7.827821342558883e-06, "loss": 0.083, "step": 1900 }, { "epoch": 0.6872740419378164, "grad_norm": 3.5410445574732767, "learning_rate": 7.825355675877111e-06, "loss": 0.2812, "step": 1901 }, { "epoch": 0.6876355748373102, "grad_norm": 0.0007199197089365021, "learning_rate": 7.822888999413818e-06, "loss": 0.0, "step": 1902 }, { "epoch": 0.687997107736804, "grad_norm": 0.06385608348366303, "learning_rate": 7.82042131405059e-06, "loss": 0.0031, "step": 1903 }, { "epoch": 0.6883586406362979, "grad_norm": 0.009369219856936232, "learning_rate": 7.817952620669383e-06, "loss": 0.0004, "step": 1904 }, { "epoch": 0.6887201735357917, "grad_norm": 0.1843498247448555, "learning_rate": 7.8154829201525e-06, "loss": 0.0254, "step": 1905 }, { "epoch": 0.6890817064352857, "grad_norm": 0.1600120116151609, "learning_rate": 7.813012213382618e-06, "loss": 0.0203, "step": 1906 }, { "epoch": 0.6894432393347795, "grad_norm": 0.17642933441426725, "learning_rate": 7.810540501242765e-06, "loss": 0.0254, "step": 1907 }, { "epoch": 0.6898047722342733, "grad_norm": 0.8133324882603922, "learning_rate": 7.808067784616328e-06, "loss": 0.1143, "step": 1908 }, { "epoch": 0.6901663051337672, "grad_norm": 0.011774610804056247, "learning_rate": 7.80559406438706e-06, "loss": 0.0003, "step": 1909 }, { "epoch": 0.690527838033261, "grad_norm": 0.12182028962040932, "learning_rate": 7.803119341439063e-06, "loss": 0.0182, "step": 1910 }, { "epoch": 0.6908893709327549, "grad_norm": 0.023539189686439883, "learning_rate": 7.800643616656805e-06, "loss": 0.0014, "step": 1911 }, { "epoch": 0.6912509038322487, "grad_norm": 0.0011976683023125069, "learning_rate": 7.79816689092511e-06, "loss": 0.0001, "step": 1912 }, { "epoch": 0.6916124367317426, "grad_norm": 1.6752896377425275, "learning_rate": 7.79568916512916e-06, "loss": 0.1143, "step": 1913 }, { "epoch": 0.6919739696312365, "grad_norm": 0.14629996950583202, "learning_rate": 7.793210440154492e-06, "loss": 0.0063, "step": 1914 }, { "epoch": 0.6923355025307303, "grad_norm": 0.016523356680871897, "learning_rate": 7.790730716887001e-06, "loss": 0.0007, "step": 1915 }, { "epoch": 0.6926970354302241, "grad_norm": 0.007488104905668844, "learning_rate": 7.78824999621294e-06, "loss": 0.0003, "step": 1916 }, { "epoch": 0.693058568329718, "grad_norm": 0.0016037755142554657, "learning_rate": 7.785768279018921e-06, "loss": 0.0001, "step": 1917 }, { "epoch": 0.6934201012292118, "grad_norm": 0.013978514340733444, "learning_rate": 7.783285566191907e-06, "loss": 0.0006, "step": 1918 }, { "epoch": 0.6937816341287057, "grad_norm": 0.28089146634331597, "learning_rate": 7.780801858619217e-06, "loss": 0.0283, "step": 1919 }, { "epoch": 0.6941431670281996, "grad_norm": 0.02891352002438457, "learning_rate": 7.77831715718853e-06, "loss": 0.001, "step": 1920 }, { "epoch": 0.6945046999276934, "grad_norm": 0.0965643782446606, "learning_rate": 7.77583146278788e-06, "loss": 0.0143, "step": 1921 }, { "epoch": 0.6948662328271873, "grad_norm": 0.009691033446742289, "learning_rate": 7.773344776305648e-06, "loss": 0.0003, "step": 1922 }, { "epoch": 0.6952277657266811, "grad_norm": 0.5669781232700812, "learning_rate": 7.77085709863058e-06, "loss": 0.1914, "step": 1923 }, { "epoch": 0.695589298626175, "grad_norm": 0.11497559346835062, "learning_rate": 7.76836843065177e-06, "loss": 0.0182, "step": 1924 }, { "epoch": 0.6959508315256688, "grad_norm": 0.1416810416768076, "learning_rate": 7.765878773258666e-06, "loss": 0.0203, "step": 1925 }, { "epoch": 0.6963123644251626, "grad_norm": 0.42813701239859336, "learning_rate": 7.763388127341071e-06, "loss": 0.0352, "step": 1926 }, { "epoch": 0.6966738973246566, "grad_norm": 0.4353446392365717, "learning_rate": 7.760896493789144e-06, "loss": 0.0432, "step": 1927 }, { "epoch": 0.6970354302241504, "grad_norm": 1.0137051069180187, "learning_rate": 7.758403873493393e-06, "loss": 0.1143, "step": 1928 }, { "epoch": 0.6973969631236443, "grad_norm": 0.0017271501355558842, "learning_rate": 7.75591026734468e-06, "loss": 0.0001, "step": 1929 }, { "epoch": 0.6977584960231381, "grad_norm": 1.2161293482907496, "learning_rate": 7.753415676234217e-06, "loss": 0.1309, "step": 1930 }, { "epoch": 0.6981200289226319, "grad_norm": 0.11786204398795375, "learning_rate": 7.750920101053574e-06, "loss": 0.0057, "step": 1931 }, { "epoch": 0.6984815618221258, "grad_norm": 0.9619127905568504, "learning_rate": 7.748423542694668e-06, "loss": 0.0977, "step": 1932 }, { "epoch": 0.6988430947216197, "grad_norm": 0.040851689650369324, "learning_rate": 7.745926002049766e-06, "loss": 0.0015, "step": 1933 }, { "epoch": 0.6992046276211136, "grad_norm": 0.7190350499722845, "learning_rate": 7.743427480011491e-06, "loss": 0.0903, "step": 1934 }, { "epoch": 0.6995661605206074, "grad_norm": 0.005040684344713329, "learning_rate": 7.740927977472814e-06, "loss": 0.0002, "step": 1935 }, { "epoch": 0.6999276934201012, "grad_norm": 0.7214871137187869, "learning_rate": 7.738427495327057e-06, "loss": 0.0432, "step": 1936 }, { "epoch": 0.7002892263195951, "grad_norm": 0.2986277254955887, "learning_rate": 7.73592603446789e-06, "loss": 0.0091, "step": 1937 }, { "epoch": 0.7006507592190889, "grad_norm": 0.11851380497049892, "learning_rate": 7.733423595789337e-06, "loss": 0.0181, "step": 1938 }, { "epoch": 0.7010122921185827, "grad_norm": 0.08434716102990653, "learning_rate": 7.730920180185765e-06, "loss": 0.0035, "step": 1939 }, { "epoch": 0.7013738250180767, "grad_norm": 0.7794204376187012, "learning_rate": 7.7284157885519e-06, "loss": 0.1055, "step": 1940 }, { "epoch": 0.7017353579175705, "grad_norm": 0.7769273102711095, "learning_rate": 7.725910421782808e-06, "loss": 0.0432, "step": 1941 }, { "epoch": 0.7020968908170644, "grad_norm": 0.25075428518626264, "learning_rate": 7.723404080773904e-06, "loss": 0.0102, "step": 1942 }, { "epoch": 0.7024584237165582, "grad_norm": 0.010557836981385554, "learning_rate": 7.720896766420957e-06, "loss": 0.0005, "step": 1943 }, { "epoch": 0.702819956616052, "grad_norm": 0.049896494492389226, "learning_rate": 7.71838847962008e-06, "loss": 0.0019, "step": 1944 }, { "epoch": 0.7031814895155459, "grad_norm": 0.07744781994602346, "learning_rate": 7.715879221267736e-06, "loss": 0.0039, "step": 1945 }, { "epoch": 0.7035430224150397, "grad_norm": 0.0050028654337478115, "learning_rate": 7.713368992260731e-06, "loss": 0.0002, "step": 1946 }, { "epoch": 0.7039045553145337, "grad_norm": 1.3860687438582284, "learning_rate": 7.710857793496218e-06, "loss": 0.0977, "step": 1947 }, { "epoch": 0.7042660882140275, "grad_norm": 0.015430918972036217, "learning_rate": 7.708345625871703e-06, "loss": 0.001, "step": 1948 }, { "epoch": 0.7046276211135213, "grad_norm": 0.271376572123242, "learning_rate": 7.705832490285034e-06, "loss": 0.0315, "step": 1949 }, { "epoch": 0.7049891540130152, "grad_norm": 0.18129385170284187, "learning_rate": 7.703318387634403e-06, "loss": 0.0254, "step": 1950 }, { "epoch": 0.705350686912509, "grad_norm": 0.001335375157013568, "learning_rate": 7.700803318818352e-06, "loss": 0.0, "step": 1951 }, { "epoch": 0.7057122198120029, "grad_norm": 0.16076792962188155, "learning_rate": 7.698287284735763e-06, "loss": 0.0063, "step": 1952 }, { "epoch": 0.7060737527114967, "grad_norm": 0.4664969658388093, "learning_rate": 7.695770286285869e-06, "loss": 0.0204, "step": 1953 }, { "epoch": 0.7064352856109906, "grad_norm": 0.3010153378245163, "learning_rate": 7.693252324368245e-06, "loss": 0.0315, "step": 1954 }, { "epoch": 0.7067968185104845, "grad_norm": 1.312207399810824, "learning_rate": 7.690733399882805e-06, "loss": 0.1055, "step": 1955 }, { "epoch": 0.7071583514099783, "grad_norm": 1.4484495589375193, "learning_rate": 7.688213513729819e-06, "loss": 0.2471, "step": 1956 }, { "epoch": 0.7075198843094722, "grad_norm": 0.07197249942606987, "learning_rate": 7.685692666809889e-06, "loss": 0.0022, "step": 1957 }, { "epoch": 0.707881417208966, "grad_norm": 0.34483957298760465, "learning_rate": 7.683170860023967e-06, "loss": 0.0388, "step": 1958 }, { "epoch": 0.7082429501084598, "grad_norm": 0.1655567621148282, "learning_rate": 7.680648094273346e-06, "loss": 0.0227, "step": 1959 }, { "epoch": 0.7086044830079538, "grad_norm": 0.02593033324175722, "learning_rate": 7.67812437045966e-06, "loss": 0.001, "step": 1960 }, { "epoch": 0.7089660159074476, "grad_norm": 0.0025268478702909374, "learning_rate": 7.675599689484892e-06, "loss": 0.0001, "step": 1961 }, { "epoch": 0.7093275488069414, "grad_norm": 0.00253426537347804, "learning_rate": 7.673074052251358e-06, "loss": 0.0001, "step": 1962 }, { "epoch": 0.7096890817064353, "grad_norm": 0.397723014874791, "learning_rate": 7.670547459661723e-06, "loss": 0.0388, "step": 1963 }, { "epoch": 0.7100506146059291, "grad_norm": 0.000990458324896655, "learning_rate": 7.66801991261899e-06, "loss": 0.0001, "step": 1964 }, { "epoch": 0.710412147505423, "grad_norm": 0.7517291896989645, "learning_rate": 7.6654914120265e-06, "loss": 0.1504, "step": 1965 }, { "epoch": 0.7107736804049168, "grad_norm": 0.49141686013798985, "learning_rate": 7.662961958787946e-06, "loss": 0.0432, "step": 1966 }, { "epoch": 0.7111352133044107, "grad_norm": 0.8488769634060238, "learning_rate": 7.660431553807348e-06, "loss": 0.0762, "step": 1967 }, { "epoch": 0.7114967462039046, "grad_norm": 0.20739740315011945, "learning_rate": 7.657900197989072e-06, "loss": 0.0254, "step": 1968 }, { "epoch": 0.7118582791033984, "grad_norm": 0.6305527102111768, "learning_rate": 7.655367892237831e-06, "loss": 0.0527, "step": 1969 }, { "epoch": 0.7122198120028923, "grad_norm": 1.723178137425632, "learning_rate": 7.652834637458662e-06, "loss": 0.0977, "step": 1970 }, { "epoch": 0.7125813449023861, "grad_norm": 0.15024088343447356, "learning_rate": 7.650300434556954e-06, "loss": 0.0056, "step": 1971 }, { "epoch": 0.7129428778018799, "grad_norm": 0.209014660938164, "learning_rate": 7.647765284438432e-06, "loss": 0.0254, "step": 1972 }, { "epoch": 0.7133044107013738, "grad_norm": 0.5584157046227802, "learning_rate": 7.645229188009153e-06, "loss": 0.1504, "step": 1973 }, { "epoch": 0.7136659436008677, "grad_norm": 0.6933703543431406, "learning_rate": 7.642692146175524e-06, "loss": 0.0432, "step": 1974 }, { "epoch": 0.7140274765003616, "grad_norm": 0.14928315614202695, "learning_rate": 7.640154159844275e-06, "loss": 0.0181, "step": 1975 }, { "epoch": 0.7143890093998554, "grad_norm": 0.7876208328063425, "learning_rate": 7.63761522992249e-06, "loss": 0.0693, "step": 1976 }, { "epoch": 0.7147505422993492, "grad_norm": 0.44656316107922994, "learning_rate": 7.635075357317577e-06, "loss": 0.0349, "step": 1977 }, { "epoch": 0.7151120751988431, "grad_norm": 0.16160287410132573, "learning_rate": 7.632534542937287e-06, "loss": 0.0203, "step": 1978 }, { "epoch": 0.7154736080983369, "grad_norm": 0.7249046634231283, "learning_rate": 7.629992787689708e-06, "loss": 0.0432, "step": 1979 }, { "epoch": 0.7158351409978309, "grad_norm": 0.01034656493894126, "learning_rate": 7.627450092483263e-06, "loss": 0.0005, "step": 1980 }, { "epoch": 0.7161966738973247, "grad_norm": 0.6037630867020093, "learning_rate": 7.624906458226708e-06, "loss": 0.1699, "step": 1981 }, { "epoch": 0.7165582067968185, "grad_norm": 0.34820776004024406, "learning_rate": 7.6223618858291374e-06, "loss": 0.0254, "step": 1982 }, { "epoch": 0.7169197396963124, "grad_norm": 0.13767178865393292, "learning_rate": 7.619816376199984e-06, "loss": 0.0143, "step": 1983 }, { "epoch": 0.7172812725958062, "grad_norm": 1.1630285390161506, "learning_rate": 7.617269930249011e-06, "loss": 0.0762, "step": 1984 }, { "epoch": 0.7176428054953, "grad_norm": 0.4613334132757271, "learning_rate": 7.614722548886316e-06, "loss": 0.0317, "step": 1985 }, { "epoch": 0.7180043383947939, "grad_norm": 0.10838401934480309, "learning_rate": 7.612174233022336e-06, "loss": 0.0143, "step": 1986 }, { "epoch": 0.7183658712942878, "grad_norm": 1.0113285416629951, "learning_rate": 7.609624983567834e-06, "loss": 0.1143, "step": 1987 }, { "epoch": 0.7187274041937817, "grad_norm": 0.0033920643933942245, "learning_rate": 7.607074801433914e-06, "loss": 0.0001, "step": 1988 }, { "epoch": 0.7190889370932755, "grad_norm": 0.08754815195300172, "learning_rate": 7.60452368753201e-06, "loss": 0.0143, "step": 1989 }, { "epoch": 0.7194504699927693, "grad_norm": 0.8410312150201433, "learning_rate": 7.601971642773891e-06, "loss": 0.1055, "step": 1990 }, { "epoch": 0.7198120028922632, "grad_norm": 0.004119286725497081, "learning_rate": 7.599418668071656e-06, "loss": 0.0002, "step": 1991 }, { "epoch": 0.720173535791757, "grad_norm": 0.16595063660415882, "learning_rate": 7.596864764337735e-06, "loss": 0.0114, "step": 1992 }, { "epoch": 0.720535068691251, "grad_norm": 0.09320724052893105, "learning_rate": 7.594309932484898e-06, "loss": 0.0057, "step": 1993 }, { "epoch": 0.7208966015907448, "grad_norm": 0.12907421896247645, "learning_rate": 7.5917541734262376e-06, "loss": 0.0182, "step": 1994 }, { "epoch": 0.7212581344902386, "grad_norm": 0.6440645492142192, "learning_rate": 7.589197488075183e-06, "loss": 0.0476, "step": 1995 }, { "epoch": 0.7216196673897325, "grad_norm": 0.10909937690089708, "learning_rate": 7.586639877345492e-06, "loss": 0.0143, "step": 1996 }, { "epoch": 0.7219812002892263, "grad_norm": 0.4260275740966396, "learning_rate": 7.584081342151255e-06, "loss": 0.0227, "step": 1997 }, { "epoch": 0.7223427331887202, "grad_norm": 0.016478886090810562, "learning_rate": 7.581521883406893e-06, "loss": 0.0007, "step": 1998 }, { "epoch": 0.722704266088214, "grad_norm": 0.09719440791607735, "learning_rate": 7.578961502027153e-06, "loss": 0.005, "step": 1999 }, { "epoch": 0.7230657989877078, "grad_norm": 0.7061285890702065, "learning_rate": 7.576400198927117e-06, "loss": 0.1699, "step": 2000 }, { "epoch": 0.7234273318872018, "grad_norm": 0.8281218943289413, "learning_rate": 7.5738379750221936e-06, "loss": 0.1309, "step": 2001 }, { "epoch": 0.7237888647866956, "grad_norm": 0.2819646670651866, "learning_rate": 7.571274831228119e-06, "loss": 0.0102, "step": 2002 }, { "epoch": 0.7241503976861895, "grad_norm": 1.9412770735969014, "learning_rate": 7.568710768460965e-06, "loss": 0.1309, "step": 2003 }, { "epoch": 0.7245119305856833, "grad_norm": 0.12885488203018688, "learning_rate": 7.566145787637125e-06, "loss": 0.0203, "step": 2004 }, { "epoch": 0.7248734634851771, "grad_norm": 0.3515863445738856, "learning_rate": 7.56357988967332e-06, "loss": 0.0317, "step": 2005 }, { "epoch": 0.725234996384671, "grad_norm": 0.027238215818319745, "learning_rate": 7.561013075486605e-06, "loss": 0.001, "step": 2006 }, { "epoch": 0.7255965292841648, "grad_norm": 0.8792073769243397, "learning_rate": 7.5584453459943565e-06, "loss": 0.0635, "step": 2007 }, { "epoch": 0.7259580621836587, "grad_norm": 0.027995390805012937, "learning_rate": 7.5558767021142795e-06, "loss": 0.0006, "step": 2008 }, { "epoch": 0.7263195950831526, "grad_norm": 0.0018959066446220885, "learning_rate": 7.553307144764412e-06, "loss": 0.0001, "step": 2009 }, { "epoch": 0.7266811279826464, "grad_norm": 0.12167255085686776, "learning_rate": 7.550736674863108e-06, "loss": 0.0182, "step": 2010 }, { "epoch": 0.7270426608821403, "grad_norm": 0.3251277061965914, "learning_rate": 7.548165293329055e-06, "loss": 0.0432, "step": 2011 }, { "epoch": 0.7274041937816341, "grad_norm": 0.14981155899062967, "learning_rate": 7.545593001081264e-06, "loss": 0.0182, "step": 2012 }, { "epoch": 0.7277657266811279, "grad_norm": 0.44397119079533615, "learning_rate": 7.543019799039071e-06, "loss": 0.0388, "step": 2013 }, { "epoch": 0.7281272595806219, "grad_norm": 0.26689726231124283, "learning_rate": 7.54044568812214e-06, "loss": 0.0315, "step": 2014 }, { "epoch": 0.7284887924801157, "grad_norm": 0.1627570419045774, "learning_rate": 7.537870669250453e-06, "loss": 0.0227, "step": 2015 }, { "epoch": 0.7288503253796096, "grad_norm": 0.5416207515289438, "learning_rate": 7.535294743344328e-06, "loss": 0.0476, "step": 2016 }, { "epoch": 0.7292118582791034, "grad_norm": 0.21299227011763538, "learning_rate": 7.532717911324394e-06, "loss": 0.0227, "step": 2017 }, { "epoch": 0.7295733911785972, "grad_norm": 1.292177081206163, "learning_rate": 7.530140174111616e-06, "loss": 0.1143, "step": 2018 }, { "epoch": 0.7299349240780911, "grad_norm": 0.2503063359643993, "learning_rate": 7.527561532627272e-06, "loss": 0.0254, "step": 2019 }, { "epoch": 0.7302964569775849, "grad_norm": 0.2548495113584195, "learning_rate": 7.5249819877929685e-06, "loss": 0.0254, "step": 2020 }, { "epoch": 0.7306579898770789, "grad_norm": 0.015920435857315764, "learning_rate": 7.5224015405306384e-06, "loss": 0.0006, "step": 2021 }, { "epoch": 0.7310195227765727, "grad_norm": 0.024522585806541385, "learning_rate": 7.519820191762528e-06, "loss": 0.0012, "step": 2022 }, { "epoch": 0.7313810556760665, "grad_norm": 0.5569571274779137, "learning_rate": 7.517237942411213e-06, "loss": 0.0254, "step": 2023 }, { "epoch": 0.7317425885755604, "grad_norm": 0.8099368043572152, "learning_rate": 7.514654793399589e-06, "loss": 0.1309, "step": 2024 }, { "epoch": 0.7321041214750542, "grad_norm": 0.2888862434502698, "learning_rate": 7.512070745650872e-06, "loss": 0.0227, "step": 2025 }, { "epoch": 0.7324656543745481, "grad_norm": 0.3563591443794249, "learning_rate": 7.5094858000886005e-06, "loss": 0.0315, "step": 2026 }, { "epoch": 0.732827187274042, "grad_norm": 0.5088567167992125, "learning_rate": 7.506899957636634e-06, "loss": 0.0579, "step": 2027 }, { "epoch": 0.7331887201735358, "grad_norm": 0.6042903398804123, "learning_rate": 7.504313219219153e-06, "loss": 0.0352, "step": 2028 }, { "epoch": 0.7335502530730297, "grad_norm": 0.7473993311844995, "learning_rate": 7.501725585760654e-06, "loss": 0.1504, "step": 2029 }, { "epoch": 0.7339117859725235, "grad_norm": 0.3374895838179765, "learning_rate": 7.499137058185959e-06, "loss": 0.0283, "step": 2030 }, { "epoch": 0.7342733188720173, "grad_norm": 0.07468938539033021, "learning_rate": 7.496547637420208e-06, "loss": 0.0101, "step": 2031 }, { "epoch": 0.7346348517715112, "grad_norm": 0.29341749918982757, "learning_rate": 7.493957324388856e-06, "loss": 0.0129, "step": 2032 }, { "epoch": 0.734996384671005, "grad_norm": 0.13707973820436753, "learning_rate": 7.491366120017684e-06, "loss": 0.008, "step": 2033 }, { "epoch": 0.735357917570499, "grad_norm": 0.08869680635415444, "learning_rate": 7.488774025232788e-06, "loss": 0.0031, "step": 2034 }, { "epoch": 0.7357194504699928, "grad_norm": 0.7470440886433145, "learning_rate": 7.486181040960579e-06, "loss": 0.0476, "step": 2035 }, { "epoch": 0.7360809833694866, "grad_norm": 0.17222099005326, "learning_rate": 7.483587168127791e-06, "loss": 0.0203, "step": 2036 }, { "epoch": 0.7364425162689805, "grad_norm": 0.636890443145729, "learning_rate": 7.480992407661473e-06, "loss": 0.0432, "step": 2037 }, { "epoch": 0.7368040491684743, "grad_norm": 0.12459136662691417, "learning_rate": 7.478396760488992e-06, "loss": 0.0161, "step": 2038 }, { "epoch": 0.7371655820679682, "grad_norm": 0.2808659129043671, "learning_rate": 7.475800227538032e-06, "loss": 0.0283, "step": 2039 }, { "epoch": 0.737527114967462, "grad_norm": 1.757099635188086, "learning_rate": 7.473202809736593e-06, "loss": 0.1055, "step": 2040 }, { "epoch": 0.7378886478669558, "grad_norm": 0.00939796456592391, "learning_rate": 7.470604508012992e-06, "loss": 0.0003, "step": 2041 }, { "epoch": 0.7382501807664498, "grad_norm": 0.8073121878426739, "learning_rate": 7.46800532329586e-06, "loss": 0.1602, "step": 2042 }, { "epoch": 0.7386117136659436, "grad_norm": 0.20728639259064396, "learning_rate": 7.465405256514149e-06, "loss": 0.0181, "step": 2043 }, { "epoch": 0.7389732465654375, "grad_norm": 0.5243356731332979, "learning_rate": 7.462804308597118e-06, "loss": 0.0254, "step": 2044 }, { "epoch": 0.7393347794649313, "grad_norm": 0.0012192736126610407, "learning_rate": 7.460202480474346e-06, "loss": 0.0001, "step": 2045 }, { "epoch": 0.7396963123644251, "grad_norm": 0.7321174998723526, "learning_rate": 7.457599773075728e-06, "loss": 0.0635, "step": 2046 }, { "epoch": 0.740057845263919, "grad_norm": 0.2752522692070828, "learning_rate": 7.454996187331469e-06, "loss": 0.0254, "step": 2047 }, { "epoch": 0.7404193781634129, "grad_norm": 0.034633594411478924, "learning_rate": 7.452391724172091e-06, "loss": 0.0013, "step": 2048 }, { "epoch": 0.7407809110629068, "grad_norm": 0.0012948380978982173, "learning_rate": 7.449786384528428e-06, "loss": 0.0001, "step": 2049 }, { "epoch": 0.7411424439624006, "grad_norm": 0.5065000641231673, "learning_rate": 7.447180169331628e-06, "loss": 0.0476, "step": 2050 }, { "epoch": 0.7415039768618944, "grad_norm": 0.1574834677686841, "learning_rate": 7.444573079513153e-06, "loss": 0.0143, "step": 2051 }, { "epoch": 0.7418655097613883, "grad_norm": 0.1477642454053232, "learning_rate": 7.441965116004775e-06, "loss": 0.0181, "step": 2052 }, { "epoch": 0.7422270426608821, "grad_norm": 0.1359291876635158, "learning_rate": 7.4393562797385795e-06, "loss": 0.0143, "step": 2053 }, { "epoch": 0.7425885755603759, "grad_norm": 0.30600388942466, "learning_rate": 7.436746571646965e-06, "loss": 0.0283, "step": 2054 }, { "epoch": 0.7429501084598699, "grad_norm": 0.14526358906940445, "learning_rate": 7.434135992662639e-06, "loss": 0.0161, "step": 2055 }, { "epoch": 0.7433116413593637, "grad_norm": 0.001918368608333997, "learning_rate": 7.431524543718624e-06, "loss": 0.0001, "step": 2056 }, { "epoch": 0.7436731742588576, "grad_norm": 0.11338571414579215, "learning_rate": 7.42891222574825e-06, "loss": 0.0101, "step": 2057 }, { "epoch": 0.7440347071583514, "grad_norm": 0.9470658213166777, "learning_rate": 7.426299039685159e-06, "loss": 0.1504, "step": 2058 }, { "epoch": 0.7443962400578452, "grad_norm": 0.1509274856902834, "learning_rate": 7.4236849864633034e-06, "loss": 0.0161, "step": 2059 }, { "epoch": 0.7447577729573391, "grad_norm": 0.8744428575619367, "learning_rate": 7.421070067016945e-06, "loss": 0.2471, "step": 2060 }, { "epoch": 0.745119305856833, "grad_norm": 0.08577058876572471, "learning_rate": 7.418454282280655e-06, "loss": 0.0079, "step": 2061 }, { "epoch": 0.7454808387563269, "grad_norm": 0.8305522164201261, "learning_rate": 7.415837633189316e-06, "loss": 0.0145, "step": 2062 }, { "epoch": 0.7458423716558207, "grad_norm": 0.24215388274988947, "learning_rate": 7.413220120678115e-06, "loss": 0.0203, "step": 2063 }, { "epoch": 0.7462039045553145, "grad_norm": 0.7096862050436726, "learning_rate": 7.410601745682554e-06, "loss": 0.2012, "step": 2064 }, { "epoch": 0.7465654374548084, "grad_norm": 0.004469086620917944, "learning_rate": 7.407982509138436e-06, "loss": 0.0002, "step": 2065 }, { "epoch": 0.7469269703543022, "grad_norm": 0.48912190614831685, "learning_rate": 7.405362411981879e-06, "loss": 0.0227, "step": 2066 }, { "epoch": 0.7472885032537961, "grad_norm": 0.1856432015414389, "learning_rate": 7.402741455149303e-06, "loss": 0.0203, "step": 2067 }, { "epoch": 0.74765003615329, "grad_norm": 0.1013227503273492, "learning_rate": 7.400119639577439e-06, "loss": 0.0128, "step": 2068 }, { "epoch": 0.7480115690527838, "grad_norm": 0.9347646139364241, "learning_rate": 7.397496966203321e-06, "loss": 0.1699, "step": 2069 }, { "epoch": 0.7483731019522777, "grad_norm": 3.4773209650926895, "learning_rate": 7.394873435964294e-06, "loss": 0.2129, "step": 2070 }, { "epoch": 0.7487346348517715, "grad_norm": 0.3764464425538426, "learning_rate": 7.392249049798006e-06, "loss": 0.0227, "step": 2071 }, { "epoch": 0.7490961677512654, "grad_norm": 0.004891672211028481, "learning_rate": 7.389623808642412e-06, "loss": 0.0002, "step": 2072 }, { "epoch": 0.7494577006507592, "grad_norm": 0.31610983548524396, "learning_rate": 7.386997713435774e-06, "loss": 0.0081, "step": 2073 }, { "epoch": 0.749819233550253, "grad_norm": 0.23428430210074444, "learning_rate": 7.384370765116657e-06, "loss": 0.0115, "step": 2074 }, { "epoch": 0.750180766449747, "grad_norm": 0.11087098880377332, "learning_rate": 7.38174296462393e-06, "loss": 0.005, "step": 2075 }, { "epoch": 0.7505422993492408, "grad_norm": 0.08597562557565731, "learning_rate": 7.379114312896772e-06, "loss": 0.0128, "step": 2076 }, { "epoch": 0.7509038322487346, "grad_norm": 17.92953489379399, "learning_rate": 7.37648481087466e-06, "loss": 1.2031, "step": 2077 }, { "epoch": 0.7512653651482285, "grad_norm": 2.6071925462494403, "learning_rate": 7.3738544594973785e-06, "loss": 0.1602, "step": 2078 }, { "epoch": 0.7516268980477223, "grad_norm": 0.3866226732523734, "learning_rate": 7.371223259705014e-06, "loss": 0.0283, "step": 2079 }, { "epoch": 0.7519884309472162, "grad_norm": 0.28148596014839455, "learning_rate": 7.368591212437957e-06, "loss": 0.0145, "step": 2080 }, { "epoch": 0.75234996384671, "grad_norm": 0.12061709516220015, "learning_rate": 7.3659583186369e-06, "loss": 0.0114, "step": 2081 }, { "epoch": 0.7527114967462039, "grad_norm": 0.3897745996519583, "learning_rate": 7.36332457924284e-06, "loss": 0.0349, "step": 2082 }, { "epoch": 0.7530730296456978, "grad_norm": 0.09966585360385148, "learning_rate": 7.360689995197073e-06, "loss": 0.005, "step": 2083 }, { "epoch": 0.7534345625451916, "grad_norm": 0.009146490487020216, "learning_rate": 7.3580545674412e-06, "loss": 0.0003, "step": 2084 }, { "epoch": 0.7537960954446855, "grad_norm": 0.26359596109446476, "learning_rate": 7.355418296917123e-06, "loss": 0.0227, "step": 2085 }, { "epoch": 0.7541576283441793, "grad_norm": 0.0011811092988430491, "learning_rate": 7.352781184567044e-06, "loss": 0.0, "step": 2086 }, { "epoch": 0.7545191612436731, "grad_norm": 0.24214803225352785, "learning_rate": 7.350143231333465e-06, "loss": 0.0283, "step": 2087 }, { "epoch": 0.754880694143167, "grad_norm": 0.5140275494199181, "learning_rate": 7.347504438159193e-06, "loss": 0.0432, "step": 2088 }, { "epoch": 0.7552422270426609, "grad_norm": 0.15934593248986356, "learning_rate": 7.344864805987329e-06, "loss": 0.0128, "step": 2089 }, { "epoch": 0.7556037599421548, "grad_norm": 0.23365745227283433, "learning_rate": 7.342224335761277e-06, "loss": 0.0227, "step": 2090 }, { "epoch": 0.7559652928416486, "grad_norm": 0.0027219378375090905, "learning_rate": 7.339583028424744e-06, "loss": 0.0001, "step": 2091 }, { "epoch": 0.7563268257411424, "grad_norm": 2.6910279312630734, "learning_rate": 7.336940884921728e-06, "loss": 0.3164, "step": 2092 }, { "epoch": 0.7566883586406363, "grad_norm": 0.615188060321142, "learning_rate": 7.334297906196535e-06, "loss": 0.0143, "step": 2093 }, { "epoch": 0.7570498915401301, "grad_norm": 0.6795224711698683, "learning_rate": 7.331654093193763e-06, "loss": 0.2129, "step": 2094 }, { "epoch": 0.7574114244396241, "grad_norm": 0.3547990789587372, "learning_rate": 7.329009446858308e-06, "loss": 0.0102, "step": 2095 }, { "epoch": 0.7577729573391179, "grad_norm": 1.7975316667738706, "learning_rate": 7.326363968135371e-06, "loss": 0.083, "step": 2096 }, { "epoch": 0.7581344902386117, "grad_norm": 0.13410546037765306, "learning_rate": 7.32371765797044e-06, "loss": 0.0181, "step": 2097 }, { "epoch": 0.7584960231381056, "grad_norm": 0.23119516201009807, "learning_rate": 7.321070517309311e-06, "loss": 0.0129, "step": 2098 }, { "epoch": 0.7588575560375994, "grad_norm": 0.0015460761414499945, "learning_rate": 7.318422547098068e-06, "loss": 0.0001, "step": 2099 }, { "epoch": 0.7592190889370932, "grad_norm": 0.2609730265996556, "learning_rate": 7.315773748283095e-06, "loss": 0.0203, "step": 2100 }, { "epoch": 0.7595806218365871, "grad_norm": 0.7998779293219012, "learning_rate": 7.313124121811074e-06, "loss": 0.0432, "step": 2101 }, { "epoch": 0.759942154736081, "grad_norm": 0.18799922232888672, "learning_rate": 7.310473668628979e-06, "loss": 0.0227, "step": 2102 }, { "epoch": 0.7603036876355749, "grad_norm": 1.0763715271021734, "learning_rate": 7.307822389684085e-06, "loss": 0.0579, "step": 2103 }, { "epoch": 0.7606652205350687, "grad_norm": 0.12364579724132801, "learning_rate": 7.3051702859239525e-06, "loss": 0.0161, "step": 2104 }, { "epoch": 0.7610267534345625, "grad_norm": 0.13365307307184926, "learning_rate": 7.3025173582964484e-06, "loss": 0.0161, "step": 2105 }, { "epoch": 0.7613882863340564, "grad_norm": 1.0807131839139397, "learning_rate": 7.299863607749727e-06, "loss": 0.2012, "step": 2106 }, { "epoch": 0.7617498192335502, "grad_norm": 0.31845987730790115, "learning_rate": 7.297209035232235e-06, "loss": 0.0254, "step": 2107 }, { "epoch": 0.7621113521330442, "grad_norm": 0.3717807197966765, "learning_rate": 7.294553641692721e-06, "loss": 0.0254, "step": 2108 }, { "epoch": 0.762472885032538, "grad_norm": 0.8410899461144983, "learning_rate": 7.291897428080218e-06, "loss": 0.1807, "step": 2109 }, { "epoch": 0.7628344179320318, "grad_norm": 0.5623515360071666, "learning_rate": 7.289240395344059e-06, "loss": 0.0203, "step": 2110 }, { "epoch": 0.7631959508315257, "grad_norm": 0.09534521837360946, "learning_rate": 7.2865825444338656e-06, "loss": 0.0024, "step": 2111 }, { "epoch": 0.7635574837310195, "grad_norm": 0.39467981962025483, "learning_rate": 7.283923876299552e-06, "loss": 0.0129, "step": 2112 }, { "epoch": 0.7639190166305134, "grad_norm": 0.22625822396459855, "learning_rate": 7.28126439189133e-06, "loss": 0.0143, "step": 2113 }, { "epoch": 0.7642805495300072, "grad_norm": 0.08389102498204547, "learning_rate": 7.278604092159694e-06, "loss": 0.0128, "step": 2114 }, { "epoch": 0.764642082429501, "grad_norm": 0.008888829217748685, "learning_rate": 7.275942978055436e-06, "loss": 0.0002, "step": 2115 }, { "epoch": 0.765003615328995, "grad_norm": 0.8422762497886878, "learning_rate": 7.2732810505296395e-06, "loss": 0.0317, "step": 2116 }, { "epoch": 0.7653651482284888, "grad_norm": 0.931810862654786, "learning_rate": 7.270618310533672e-06, "loss": 0.0527, "step": 2117 }, { "epoch": 0.7657266811279827, "grad_norm": 1.0088012419403227, "learning_rate": 7.267954759019203e-06, "loss": 0.1055, "step": 2118 }, { "epoch": 0.7660882140274765, "grad_norm": 0.08698590296729578, "learning_rate": 7.2652903969381805e-06, "loss": 0.0114, "step": 2119 }, { "epoch": 0.7664497469269703, "grad_norm": 0.023573031434779103, "learning_rate": 7.262625225242848e-06, "loss": 0.0011, "step": 2120 }, { "epoch": 0.7668112798264642, "grad_norm": 2.0960527150348027, "learning_rate": 7.259959244885738e-06, "loss": 0.1143, "step": 2121 }, { "epoch": 0.767172812725958, "grad_norm": 0.14287713328390278, "learning_rate": 7.257292456819669e-06, "loss": 0.0162, "step": 2122 }, { "epoch": 0.7675343456254519, "grad_norm": 1.0652629694641962, "learning_rate": 7.254624861997754e-06, "loss": 0.0579, "step": 2123 }, { "epoch": 0.7678958785249458, "grad_norm": 0.000657832142555495, "learning_rate": 7.25195646137339e-06, "loss": 0.0, "step": 2124 }, { "epoch": 0.7682574114244396, "grad_norm": 0.2137235557027147, "learning_rate": 7.24928725590026e-06, "loss": 0.0181, "step": 2125 }, { "epoch": 0.7686189443239335, "grad_norm": 0.12860887597332665, "learning_rate": 7.246617246532341e-06, "loss": 0.0143, "step": 2126 }, { "epoch": 0.7689804772234273, "grad_norm": 0.14179235192473957, "learning_rate": 7.243946434223891e-06, "loss": 0.0182, "step": 2127 }, { "epoch": 0.7693420101229211, "grad_norm": 0.7610590138915944, "learning_rate": 7.241274819929459e-06, "loss": 0.0143, "step": 2128 }, { "epoch": 0.7697035430224151, "grad_norm": 0.019786973140255147, "learning_rate": 7.2386024046038806e-06, "loss": 0.0007, "step": 2129 }, { "epoch": 0.7700650759219089, "grad_norm": 1.0971934338063067, "learning_rate": 7.235929189202274e-06, "loss": 0.1226, "step": 2130 }, { "epoch": 0.7704266088214028, "grad_norm": 0.15865504783203882, "learning_rate": 7.233255174680048e-06, "loss": 0.0071, "step": 2131 }, { "epoch": 0.7707881417208966, "grad_norm": 0.12865345946978293, "learning_rate": 7.230580361992893e-06, "loss": 0.0145, "step": 2132 }, { "epoch": 0.7711496746203904, "grad_norm": 0.7740437703838827, "learning_rate": 7.227904752096788e-06, "loss": 0.1406, "step": 2133 }, { "epoch": 0.7715112075198843, "grad_norm": 0.21889092714351663, "learning_rate": 7.225228345947995e-06, "loss": 0.0181, "step": 2134 }, { "epoch": 0.7718727404193781, "grad_norm": 0.10615236977243593, "learning_rate": 7.22255114450306e-06, "loss": 0.0089, "step": 2135 }, { "epoch": 0.7722342733188721, "grad_norm": 0.38046263144286124, "learning_rate": 7.2198731487188156e-06, "loss": 0.0388, "step": 2136 }, { "epoch": 0.7725958062183659, "grad_norm": 0.1595718808226001, "learning_rate": 7.217194359552375e-06, "loss": 0.0143, "step": 2137 }, { "epoch": 0.7729573391178597, "grad_norm": 0.971910939708221, "learning_rate": 7.214514777961139e-06, "loss": 0.2344, "step": 2138 }, { "epoch": 0.7733188720173536, "grad_norm": 0.07859840701664514, "learning_rate": 7.211834404902789e-06, "loss": 0.0101, "step": 2139 }, { "epoch": 0.7736804049168474, "grad_norm": 0.0782819199015963, "learning_rate": 7.209153241335289e-06, "loss": 0.0101, "step": 2140 }, { "epoch": 0.7740419378163413, "grad_norm": 0.714676664664037, "learning_rate": 7.206471288216888e-06, "loss": 0.1699, "step": 2141 }, { "epoch": 0.7744034707158352, "grad_norm": 0.25054925392106436, "learning_rate": 7.203788546506113e-06, "loss": 0.0203, "step": 2142 }, { "epoch": 0.774765003615329, "grad_norm": 1.3365410853652329, "learning_rate": 7.201105017161777e-06, "loss": 0.0432, "step": 2143 }, { "epoch": 0.7751265365148229, "grad_norm": 0.16359293026612187, "learning_rate": 7.198420701142973e-06, "loss": 0.0181, "step": 2144 }, { "epoch": 0.7754880694143167, "grad_norm": 0.04227248805488933, "learning_rate": 7.195735599409074e-06, "loss": 0.0015, "step": 2145 }, { "epoch": 0.7758496023138105, "grad_norm": 0.29913137258240036, "learning_rate": 7.193049712919735e-06, "loss": 0.0056, "step": 2146 }, { "epoch": 0.7762111352133044, "grad_norm": 1.5826206406002619, "learning_rate": 7.19036304263489e-06, "loss": 0.0693, "step": 2147 }, { "epoch": 0.7765726681127982, "grad_norm": 0.11501587356541336, "learning_rate": 7.187675589514757e-06, "loss": 0.0143, "step": 2148 }, { "epoch": 0.7769342010122922, "grad_norm": 0.43520977078357986, "learning_rate": 7.184987354519831e-06, "loss": 0.0388, "step": 2149 }, { "epoch": 0.777295733911786, "grad_norm": 0.02592952535513986, "learning_rate": 7.182298338610885e-06, "loss": 0.0009, "step": 2150 }, { "epoch": 0.7776572668112798, "grad_norm": 0.715464937967777, "learning_rate": 7.1796085427489725e-06, "loss": 0.1226, "step": 2151 }, { "epoch": 0.7780187997107737, "grad_norm": 0.017290479747667415, "learning_rate": 7.176917967895427e-06, "loss": 0.0004, "step": 2152 }, { "epoch": 0.7783803326102675, "grad_norm": 0.00566274886231742, "learning_rate": 7.17422661501186e-06, "loss": 0.0002, "step": 2153 }, { "epoch": 0.7787418655097614, "grad_norm": 0.3213578122923375, "learning_rate": 7.171534485060158e-06, "loss": 0.0315, "step": 2154 }, { "epoch": 0.7791033984092552, "grad_norm": 0.6984098796700668, "learning_rate": 7.168841579002492e-06, "loss": 0.1699, "step": 2155 }, { "epoch": 0.779464931308749, "grad_norm": 0.33210333574062334, "learning_rate": 7.166147897801302e-06, "loss": 0.0227, "step": 2156 }, { "epoch": 0.779826464208243, "grad_norm": 0.1615140813605612, "learning_rate": 7.1634534424193105e-06, "loss": 0.0203, "step": 2157 }, { "epoch": 0.7801879971077368, "grad_norm": 0.0023304551218522825, "learning_rate": 7.160758213819515e-06, "loss": 0.0001, "step": 2158 }, { "epoch": 0.7805495300072307, "grad_norm": 0.986197860506504, "learning_rate": 7.158062212965189e-06, "loss": 0.1406, "step": 2159 }, { "epoch": 0.7809110629067245, "grad_norm": 0.11485285058808752, "learning_rate": 7.155365440819886e-06, "loss": 0.0143, "step": 2160 }, { "epoch": 0.7812725958062183, "grad_norm": 0.10977095134209604, "learning_rate": 7.152667898347427e-06, "loss": 0.0161, "step": 2161 }, { "epoch": 0.7816341287057122, "grad_norm": 0.010990175482239345, "learning_rate": 7.149969586511916e-06, "loss": 0.0004, "step": 2162 }, { "epoch": 0.7819956616052061, "grad_norm": 0.01627267794497686, "learning_rate": 7.147270506277729e-06, "loss": 0.0006, "step": 2163 }, { "epoch": 0.7823571945047, "grad_norm": 0.3470693155580043, "learning_rate": 7.144570658609515e-06, "loss": 0.0254, "step": 2164 }, { "epoch": 0.7827187274041938, "grad_norm": 1.2863167308687082, "learning_rate": 7.141870044472203e-06, "loss": 0.083, "step": 2165 }, { "epoch": 0.7830802603036876, "grad_norm": 0.0016044229805434512, "learning_rate": 7.139168664830987e-06, "loss": 0.0001, "step": 2166 }, { "epoch": 0.7834417932031815, "grad_norm": 0.22633831157360823, "learning_rate": 7.136466520651342e-06, "loss": 0.0283, "step": 2167 }, { "epoch": 0.7838033261026753, "grad_norm": 0.7340512761483501, "learning_rate": 7.133763612899013e-06, "loss": 0.0283, "step": 2168 }, { "epoch": 0.7841648590021691, "grad_norm": 0.0004328809361506882, "learning_rate": 7.131059942540018e-06, "loss": 0.0, "step": 2169 }, { "epoch": 0.7845263919016631, "grad_norm": 0.0027860745191582424, "learning_rate": 7.128355510540652e-06, "loss": 0.0001, "step": 2170 }, { "epoch": 0.7848879248011569, "grad_norm": 0.1624182988001773, "learning_rate": 7.1256503178674756e-06, "loss": 0.0063, "step": 2171 }, { "epoch": 0.7852494577006508, "grad_norm": 2.7747678919199696, "learning_rate": 7.122944365487321e-06, "loss": 0.1602, "step": 2172 }, { "epoch": 0.7856109906001446, "grad_norm": 0.7252945138563206, "learning_rate": 7.120237654367301e-06, "loss": 0.1406, "step": 2173 }, { "epoch": 0.7859725234996384, "grad_norm": 1.0039916131602347, "learning_rate": 7.117530185474789e-06, "loss": 0.083, "step": 2174 }, { "epoch": 0.7863340563991323, "grad_norm": 0.0006824437559850482, "learning_rate": 7.114821959777438e-06, "loss": 0.0, "step": 2175 }, { "epoch": 0.7866955892986262, "grad_norm": 0.9656833772098998, "learning_rate": 7.112112978243162e-06, "loss": 0.0527, "step": 2176 }, { "epoch": 0.7870571221981201, "grad_norm": 1.8684499224332476, "learning_rate": 7.109403241840156e-06, "loss": 0.2471, "step": 2177 }, { "epoch": 0.7874186550976139, "grad_norm": 2.482498964971009, "learning_rate": 7.106692751536875e-06, "loss": 0.1406, "step": 2178 }, { "epoch": 0.7877801879971077, "grad_norm": 4.069140651201568, "learning_rate": 7.103981508302049e-06, "loss": 0.6133, "step": 2179 }, { "epoch": 0.7881417208966016, "grad_norm": 0.2793820608522027, "learning_rate": 7.101269513104677e-06, "loss": 0.0315, "step": 2180 }, { "epoch": 0.7885032537960954, "grad_norm": 0.0020158507275614217, "learning_rate": 7.098556766914023e-06, "loss": 0.0001, "step": 2181 }, { "epoch": 0.7888647866955893, "grad_norm": 0.3488079263004908, "learning_rate": 7.095843270699625e-06, "loss": 0.0349, "step": 2182 }, { "epoch": 0.7892263195950832, "grad_norm": 0.004470996404876482, "learning_rate": 7.093129025431283e-06, "loss": 0.0002, "step": 2183 }, { "epoch": 0.789587852494577, "grad_norm": 0.003187909434574915, "learning_rate": 7.09041403207907e-06, "loss": 0.0001, "step": 2184 }, { "epoch": 0.7899493853940709, "grad_norm": 0.8502912180911599, "learning_rate": 7.087698291613323e-06, "loss": 0.0317, "step": 2185 }, { "epoch": 0.7903109182935647, "grad_norm": 0.09838385595356482, "learning_rate": 7.084981805004647e-06, "loss": 0.0056, "step": 2186 }, { "epoch": 0.7906724511930586, "grad_norm": 0.35635021601418465, "learning_rate": 7.082264573223914e-06, "loss": 0.0349, "step": 2187 }, { "epoch": 0.7910339840925524, "grad_norm": 0.5121407579737626, "learning_rate": 7.079546597242262e-06, "loss": 0.0432, "step": 2188 }, { "epoch": 0.7913955169920462, "grad_norm": 0.27977503451116675, "learning_rate": 7.076827878031094e-06, "loss": 0.0349, "step": 2189 }, { "epoch": 0.7917570498915402, "grad_norm": 0.09122390389500908, "learning_rate": 7.074108416562081e-06, "loss": 0.0143, "step": 2190 }, { "epoch": 0.792118582791034, "grad_norm": 0.626621546821871, "learning_rate": 7.071388213807159e-06, "loss": 0.0693, "step": 2191 }, { "epoch": 0.7924801156905278, "grad_norm": 0.0014194010912059588, "learning_rate": 7.068667270738525e-06, "loss": 0.0001, "step": 2192 }, { "epoch": 0.7928416485900217, "grad_norm": 0.3801659901758602, "learning_rate": 7.065945588328646e-06, "loss": 0.0349, "step": 2193 }, { "epoch": 0.7932031814895155, "grad_norm": 0.026928145985178998, "learning_rate": 7.06322316755025e-06, "loss": 0.0013, "step": 2194 }, { "epoch": 0.7935647143890094, "grad_norm": 0.21296762279458412, "learning_rate": 7.060500009376327e-06, "loss": 0.009, "step": 2195 }, { "epoch": 0.7939262472885033, "grad_norm": 1.1253500727140051, "learning_rate": 7.0577761147801385e-06, "loss": 0.1406, "step": 2196 }, { "epoch": 0.7942877801879971, "grad_norm": 0.1678524286155856, "learning_rate": 7.0550514847352e-06, "loss": 0.0161, "step": 2197 }, { "epoch": 0.794649313087491, "grad_norm": 0.15039711105784415, "learning_rate": 7.052326120215294e-06, "loss": 0.0203, "step": 2198 }, { "epoch": 0.7950108459869848, "grad_norm": 0.1715983258664865, "learning_rate": 7.049600022194465e-06, "loss": 0.0203, "step": 2199 }, { "epoch": 0.7953723788864787, "grad_norm": 0.19060960264175567, "learning_rate": 7.046873191647022e-06, "loss": 0.0227, "step": 2200 }, { "epoch": 0.7957339117859725, "grad_norm": 0.11123982912488656, "learning_rate": 7.044145629547532e-06, "loss": 0.0161, "step": 2201 }, { "epoch": 0.7960954446854663, "grad_norm": 2.040498575608662, "learning_rate": 7.041417336870826e-06, "loss": 0.1807, "step": 2202 }, { "epoch": 0.7964569775849603, "grad_norm": 0.0015029888365933831, "learning_rate": 7.038688314591994e-06, "loss": 0.0001, "step": 2203 }, { "epoch": 0.7968185104844541, "grad_norm": 0.3327038000028796, "learning_rate": 7.035958563686387e-06, "loss": 0.0317, "step": 2204 }, { "epoch": 0.797180043383948, "grad_norm": 0.6708722341139495, "learning_rate": 7.033228085129621e-06, "loss": 0.0432, "step": 2205 }, { "epoch": 0.7975415762834418, "grad_norm": 1.760509214790743, "learning_rate": 7.030496879897566e-06, "loss": 0.3535, "step": 2206 }, { "epoch": 0.7979031091829356, "grad_norm": 0.24561020170076628, "learning_rate": 7.027764948966355e-06, "loss": 0.0102, "step": 2207 }, { "epoch": 0.7982646420824295, "grad_norm": 0.9130659733879595, "learning_rate": 7.02503229331238e-06, "loss": 0.0635, "step": 2208 }, { "epoch": 0.7986261749819233, "grad_norm": 3.2417003079013424, "learning_rate": 7.022298913912288e-06, "loss": 0.2578, "step": 2209 }, { "epoch": 0.7989877078814173, "grad_norm": 4.194436922820878, "learning_rate": 7.019564811742992e-06, "loss": 0.1504, "step": 2210 }, { "epoch": 0.7993492407809111, "grad_norm": 0.07224494988350538, "learning_rate": 7.016829987781659e-06, "loss": 0.0027, "step": 2211 }, { "epoch": 0.7997107736804049, "grad_norm": 1.5587911775923358, "learning_rate": 7.014094443005715e-06, "loss": 0.1309, "step": 2212 }, { "epoch": 0.8000723065798988, "grad_norm": 0.23072687297915384, "learning_rate": 7.011358178392841e-06, "loss": 0.0182, "step": 2213 }, { "epoch": 0.8004338394793926, "grad_norm": 1.1820163402099884, "learning_rate": 7.008621194920977e-06, "loss": 0.0762, "step": 2214 }, { "epoch": 0.8007953723788864, "grad_norm": 0.0866152806151098, "learning_rate": 7.005883493568324e-06, "loss": 0.0128, "step": 2215 }, { "epoch": 0.8011569052783803, "grad_norm": 0.11911805464422587, "learning_rate": 7.003145075313334e-06, "loss": 0.0145, "step": 2216 }, { "epoch": 0.8015184381778742, "grad_norm": 0.10671331652486347, "learning_rate": 7.000405941134716e-06, "loss": 0.0143, "step": 2217 }, { "epoch": 0.8018799710773681, "grad_norm": 0.5065857216592242, "learning_rate": 6.997666092011436e-06, "loss": 0.0432, "step": 2218 }, { "epoch": 0.8022415039768619, "grad_norm": 0.1691046350670752, "learning_rate": 6.994925528922716e-06, "loss": 0.0162, "step": 2219 }, { "epoch": 0.8026030368763557, "grad_norm": 0.3841376053322432, "learning_rate": 6.992184252848035e-06, "loss": 0.0254, "step": 2220 }, { "epoch": 0.8029645697758496, "grad_norm": 0.17906140114546776, "learning_rate": 6.989442264767121e-06, "loss": 0.0203, "step": 2221 }, { "epoch": 0.8033261026753434, "grad_norm": 1.6189687612998942, "learning_rate": 6.986699565659963e-06, "loss": 0.2695, "step": 2222 }, { "epoch": 0.8036876355748374, "grad_norm": 0.7131762911826163, "learning_rate": 6.983956156506798e-06, "loss": 0.0283, "step": 2223 }, { "epoch": 0.8040491684743312, "grad_norm": 1.0847262076808224, "learning_rate": 6.981212038288121e-06, "loss": 0.1699, "step": 2224 }, { "epoch": 0.804410701373825, "grad_norm": 0.19638029454445122, "learning_rate": 6.978467211984681e-06, "loss": 0.0162, "step": 2225 }, { "epoch": 0.8047722342733189, "grad_norm": 0.10963676820594943, "learning_rate": 6.975721678577476e-06, "loss": 0.0143, "step": 2226 }, { "epoch": 0.8051337671728127, "grad_norm": 0.08544573453702148, "learning_rate": 6.97297543904776e-06, "loss": 0.0056, "step": 2227 }, { "epoch": 0.8054953000723066, "grad_norm": 0.7883345774919194, "learning_rate": 6.970228494377039e-06, "loss": 0.0352, "step": 2228 }, { "epoch": 0.8058568329718004, "grad_norm": 0.1249578002925448, "learning_rate": 6.96748084554707e-06, "loss": 0.0128, "step": 2229 }, { "epoch": 0.8062183658712943, "grad_norm": 0.12121284588649478, "learning_rate": 6.964732493539861e-06, "loss": 0.009, "step": 2230 }, { "epoch": 0.8065798987707882, "grad_norm": 0.23939874244369708, "learning_rate": 6.961983439337675e-06, "loss": 0.0227, "step": 2231 }, { "epoch": 0.806941431670282, "grad_norm": 0.4422075390377522, "learning_rate": 6.959233683923022e-06, "loss": 0.009, "step": 2232 }, { "epoch": 0.8073029645697759, "grad_norm": 0.14716844058965897, "learning_rate": 6.956483228278662e-06, "loss": 0.0081, "step": 2233 }, { "epoch": 0.8076644974692697, "grad_norm": 0.316443914665201, "learning_rate": 6.953732073387609e-06, "loss": 0.0254, "step": 2234 }, { "epoch": 0.8080260303687635, "grad_norm": 0.12759470112839097, "learning_rate": 6.950980220233127e-06, "loss": 0.0162, "step": 2235 }, { "epoch": 0.8083875632682574, "grad_norm": 0.23431830749465285, "learning_rate": 6.948227669798725e-06, "loss": 0.0145, "step": 2236 }, { "epoch": 0.8087490961677513, "grad_norm": 0.9082212365733227, "learning_rate": 6.945474423068166e-06, "loss": 0.2471, "step": 2237 }, { "epoch": 0.8091106290672451, "grad_norm": 0.17615731155735562, "learning_rate": 6.942720481025458e-06, "loss": 0.0203, "step": 2238 }, { "epoch": 0.809472161966739, "grad_norm": 0.1184953588262382, "learning_rate": 6.939965844654859e-06, "loss": 0.0056, "step": 2239 }, { "epoch": 0.8098336948662328, "grad_norm": 0.5679418474939637, "learning_rate": 6.9372105149408775e-06, "loss": 0.0432, "step": 2240 }, { "epoch": 0.8101952277657267, "grad_norm": 1.4139172632874317, "learning_rate": 6.934454492868268e-06, "loss": 0.2236, "step": 2241 }, { "epoch": 0.8105567606652205, "grad_norm": 0.061401781039112466, "learning_rate": 6.93169777942203e-06, "loss": 0.0079, "step": 2242 }, { "epoch": 0.8109182935647143, "grad_norm": 0.08231757494792294, "learning_rate": 6.9289403755874126e-06, "loss": 0.0049, "step": 2243 }, { "epoch": 0.8112798264642083, "grad_norm": 0.5387693471994481, "learning_rate": 6.9261822823499124e-06, "loss": 0.0317, "step": 2244 }, { "epoch": 0.8116413593637021, "grad_norm": 0.10068397771885504, "learning_rate": 6.923423500695272e-06, "loss": 0.0101, "step": 2245 }, { "epoch": 0.812002892263196, "grad_norm": 1.1811308440214816, "learning_rate": 6.920664031609478e-06, "loss": 0.2012, "step": 2246 }, { "epoch": 0.8123644251626898, "grad_norm": 0.17504578854638853, "learning_rate": 6.917903876078764e-06, "loss": 0.0162, "step": 2247 }, { "epoch": 0.8127259580621836, "grad_norm": 0.9412696735728331, "learning_rate": 6.91514303508961e-06, "loss": 0.1914, "step": 2248 }, { "epoch": 0.8130874909616775, "grad_norm": 0.0874427623657035, "learning_rate": 6.912381509628737e-06, "loss": 0.0011, "step": 2249 }, { "epoch": 0.8134490238611713, "grad_norm": 0.08444754133610238, "learning_rate": 6.909619300683119e-06, "loss": 0.0079, "step": 2250 }, { "epoch": 0.8138105567606653, "grad_norm": 0.08665129450191664, "learning_rate": 6.906856409239964e-06, "loss": 0.0101, "step": 2251 }, { "epoch": 0.8141720896601591, "grad_norm": 0.09845750578285883, "learning_rate": 6.904092836286733e-06, "loss": 0.0114, "step": 2252 }, { "epoch": 0.8145336225596529, "grad_norm": 1.2437817917492435, "learning_rate": 6.901328582811123e-06, "loss": 0.1914, "step": 2253 }, { "epoch": 0.8148951554591468, "grad_norm": 0.5756470090222369, "learning_rate": 6.898563649801078e-06, "loss": 0.0391, "step": 2254 }, { "epoch": 0.8152566883586406, "grad_norm": 1.0298959952510658, "learning_rate": 6.895798038244786e-06, "loss": 0.1602, "step": 2255 }, { "epoch": 0.8156182212581344, "grad_norm": 0.19618206014601103, "learning_rate": 6.893031749130674e-06, "loss": 0.0203, "step": 2256 }, { "epoch": 0.8159797541576284, "grad_norm": 0.11793803781316152, "learning_rate": 6.890264783447417e-06, "loss": 0.0114, "step": 2257 }, { "epoch": 0.8163412870571222, "grad_norm": 0.3491451047685959, "learning_rate": 6.887497142183924e-06, "loss": 0.0317, "step": 2258 }, { "epoch": 0.8167028199566161, "grad_norm": 0.38846154363695484, "learning_rate": 6.884728826329349e-06, "loss": 0.0254, "step": 2259 }, { "epoch": 0.8170643528561099, "grad_norm": 0.7488587971155145, "learning_rate": 6.881959836873091e-06, "loss": 0.1406, "step": 2260 }, { "epoch": 0.8174258857556037, "grad_norm": 0.12815507998569056, "learning_rate": 6.879190174804783e-06, "loss": 0.0145, "step": 2261 }, { "epoch": 0.8177874186550976, "grad_norm": 0.12476347998711698, "learning_rate": 6.876419841114305e-06, "loss": 0.0182, "step": 2262 }, { "epoch": 0.8181489515545914, "grad_norm": 0.100109658787794, "learning_rate": 6.873648836791772e-06, "loss": 0.0161, "step": 2263 }, { "epoch": 0.8185104844540854, "grad_norm": 0.014391326097487308, "learning_rate": 6.870877162827538e-06, "loss": 0.0007, "step": 2264 }, { "epoch": 0.8188720173535792, "grad_norm": 0.6536635959775605, "learning_rate": 6.8681048202122026e-06, "loss": 0.1309, "step": 2265 }, { "epoch": 0.819233550253073, "grad_norm": 0.29400362710467287, "learning_rate": 6.865331809936597e-06, "loss": 0.0283, "step": 2266 }, { "epoch": 0.8195950831525669, "grad_norm": 0.14156467924174684, "learning_rate": 6.862558132991798e-06, "loss": 0.0227, "step": 2267 }, { "epoch": 0.8199566160520607, "grad_norm": 1.229777162573779, "learning_rate": 6.859783790369116e-06, "loss": 0.1699, "step": 2268 }, { "epoch": 0.8203181489515546, "grad_norm": 0.1965977071900588, "learning_rate": 6.857008783060097e-06, "loss": 0.0145, "step": 2269 }, { "epoch": 0.8206796818510484, "grad_norm": 0.8306368751663096, "learning_rate": 6.854233112056533e-06, "loss": 0.1602, "step": 2270 }, { "epoch": 0.8210412147505423, "grad_norm": 0.1375680820898948, "learning_rate": 6.851456778350445e-06, "loss": 0.0143, "step": 2271 }, { "epoch": 0.8214027476500362, "grad_norm": 0.5662376784163965, "learning_rate": 6.848679782934094e-06, "loss": 0.1309, "step": 2272 }, { "epoch": 0.82176428054953, "grad_norm": 0.008590706284401506, "learning_rate": 6.845902126799981e-06, "loss": 0.0007, "step": 2273 }, { "epoch": 0.8221258134490239, "grad_norm": 0.7019850919433389, "learning_rate": 6.843123810940837e-06, "loss": 0.1055, "step": 2274 }, { "epoch": 0.8224873463485177, "grad_norm": 0.1965848283624743, "learning_rate": 6.8403448363496315e-06, "loss": 0.0283, "step": 2275 }, { "epoch": 0.8228488792480115, "grad_norm": 0.5489382659357773, "learning_rate": 6.83756520401957e-06, "loss": 0.0388, "step": 2276 }, { "epoch": 0.8232104121475055, "grad_norm": 0.45584529092620435, "learning_rate": 6.834784914944092e-06, "loss": 0.0527, "step": 2277 }, { "epoch": 0.8235719450469993, "grad_norm": 0.006172686053335821, "learning_rate": 6.832003970116874e-06, "loss": 0.0003, "step": 2278 }, { "epoch": 0.8239334779464931, "grad_norm": 0.11099333547300437, "learning_rate": 6.829222370531823e-06, "loss": 0.0056, "step": 2279 }, { "epoch": 0.824295010845987, "grad_norm": 0.8001050183053927, "learning_rate": 6.826440117183082e-06, "loss": 0.0527, "step": 2280 }, { "epoch": 0.8246565437454808, "grad_norm": 0.595237960520753, "learning_rate": 6.823657211065028e-06, "loss": 0.1504, "step": 2281 }, { "epoch": 0.8250180766449747, "grad_norm": 0.5921620568551178, "learning_rate": 6.820873653172273e-06, "loss": 0.0352, "step": 2282 }, { "epoch": 0.8253796095444685, "grad_norm": 0.009171332780136135, "learning_rate": 6.818089444499658e-06, "loss": 0.0004, "step": 2283 }, { "epoch": 0.8257411424439624, "grad_norm": 0.23688769927244546, "learning_rate": 6.81530458604226e-06, "loss": 0.0182, "step": 2284 }, { "epoch": 0.8261026753434563, "grad_norm": 0.23976876765520455, "learning_rate": 6.812519078795386e-06, "loss": 0.0317, "step": 2285 }, { "epoch": 0.8264642082429501, "grad_norm": 0.1243748279069696, "learning_rate": 6.809732923754575e-06, "loss": 0.0203, "step": 2286 }, { "epoch": 0.826825741142444, "grad_norm": 0.042992724703711446, "learning_rate": 6.8069461219155985e-06, "loss": 0.0024, "step": 2287 }, { "epoch": 0.8271872740419378, "grad_norm": 0.5602074790460543, "learning_rate": 6.804158674274461e-06, "loss": 0.0693, "step": 2288 }, { "epoch": 0.8275488069414316, "grad_norm": 0.008786508454362312, "learning_rate": 6.801370581827393e-06, "loss": 0.0002, "step": 2289 }, { "epoch": 0.8279103398409255, "grad_norm": 0.059550025681842714, "learning_rate": 6.798581845570859e-06, "loss": 0.0031, "step": 2290 }, { "epoch": 0.8282718727404194, "grad_norm": 0.25476519949982, "learning_rate": 6.795792466501554e-06, "loss": 0.0349, "step": 2291 }, { "epoch": 0.8286334056399133, "grad_norm": 0.061674523199504366, "learning_rate": 6.7930024456164e-06, "loss": 0.0031, "step": 2292 }, { "epoch": 0.8289949385394071, "grad_norm": 0.043355721182610975, "learning_rate": 6.79021178391255e-06, "loss": 0.0024, "step": 2293 }, { "epoch": 0.8293564714389009, "grad_norm": 0.15246256400472424, "learning_rate": 6.787420482387387e-06, "loss": 0.0022, "step": 2294 }, { "epoch": 0.8297180043383948, "grad_norm": 0.46784019736053456, "learning_rate": 6.78462854203852e-06, "loss": 0.0579, "step": 2295 }, { "epoch": 0.8300795372378886, "grad_norm": 0.04566048203955086, "learning_rate": 6.781835963863789e-06, "loss": 0.0031, "step": 2296 }, { "epoch": 0.8304410701373826, "grad_norm": 0.6081569413819973, "learning_rate": 6.77904274886126e-06, "loss": 0.0762, "step": 2297 }, { "epoch": 0.8308026030368764, "grad_norm": 0.5533298867504312, "learning_rate": 6.77624889802923e-06, "loss": 0.0527, "step": 2298 }, { "epoch": 0.8311641359363702, "grad_norm": 0.47957700956187843, "learning_rate": 6.773454412366216e-06, "loss": 0.0352, "step": 2299 }, { "epoch": 0.8315256688358641, "grad_norm": 0.209698544753835, "learning_rate": 6.770659292870971e-06, "loss": 0.0283, "step": 2300 }, { "epoch": 0.8318872017353579, "grad_norm": 0.4956986610914273, "learning_rate": 6.767863540542467e-06, "loss": 0.0527, "step": 2301 }, { "epoch": 0.8322487346348517, "grad_norm": 0.6513803027622358, "learning_rate": 6.7650671563799075e-06, "loss": 0.0432, "step": 2302 }, { "epoch": 0.8326102675343456, "grad_norm": 0.5330121788798989, "learning_rate": 6.76227014138272e-06, "loss": 0.1504, "step": 2303 }, { "epoch": 0.8329718004338394, "grad_norm": 1.4094716578987234, "learning_rate": 6.759472496550554e-06, "loss": 0.1406, "step": 2304 }, { "epoch": 0.8333333333333334, "grad_norm": 0.4646570593731281, "learning_rate": 6.75667422288329e-06, "loss": 0.0476, "step": 2305 }, { "epoch": 0.8336948662328272, "grad_norm": 0.4067403223934056, "learning_rate": 6.75387532138103e-06, "loss": 0.0388, "step": 2306 }, { "epoch": 0.834056399132321, "grad_norm": 1.118462724207027, "learning_rate": 6.751075793044099e-06, "loss": 0.1807, "step": 2307 }, { "epoch": 0.8344179320318149, "grad_norm": 0.10509415797921304, "learning_rate": 6.74827563887305e-06, "loss": 0.0181, "step": 2308 }, { "epoch": 0.8347794649313087, "grad_norm": 0.5148412674775188, "learning_rate": 6.745474859868657e-06, "loss": 0.0527, "step": 2309 }, { "epoch": 0.8351409978308026, "grad_norm": 0.4739757678027344, "learning_rate": 6.742673457031917e-06, "loss": 0.0527, "step": 2310 }, { "epoch": 0.8355025307302965, "grad_norm": 0.08379190567664588, "learning_rate": 6.73987143136405e-06, "loss": 0.0056, "step": 2311 }, { "epoch": 0.8358640636297903, "grad_norm": 0.15889341361995013, "learning_rate": 6.737068783866501e-06, "loss": 0.0227, "step": 2312 }, { "epoch": 0.8362255965292842, "grad_norm": 0.1611303718002797, "learning_rate": 6.734265515540937e-06, "loss": 0.0254, "step": 2313 }, { "epoch": 0.836587129428778, "grad_norm": 0.05889344454916049, "learning_rate": 6.731461627389242e-06, "loss": 0.0022, "step": 2314 }, { "epoch": 0.8369486623282719, "grad_norm": 0.09781604688650082, "learning_rate": 6.728657120413529e-06, "loss": 0.0045, "step": 2315 }, { "epoch": 0.8373101952277657, "grad_norm": 0.007126077602513963, "learning_rate": 6.725851995616123e-06, "loss": 0.0003, "step": 2316 }, { "epoch": 0.8376717281272595, "grad_norm": 0.5323671556462629, "learning_rate": 6.723046253999579e-06, "loss": 0.1699, "step": 2317 }, { "epoch": 0.8380332610267535, "grad_norm": 0.018197906685208078, "learning_rate": 6.720239896566668e-06, "loss": 0.0009, "step": 2318 }, { "epoch": 0.8383947939262473, "grad_norm": 0.2724110107403459, "learning_rate": 6.717432924320382e-06, "loss": 0.0143, "step": 2319 }, { "epoch": 0.8387563268257412, "grad_norm": 0.2105919707205582, "learning_rate": 6.714625338263929e-06, "loss": 0.0129, "step": 2320 }, { "epoch": 0.839117859725235, "grad_norm": 0.717638440007994, "learning_rate": 6.711817139400743e-06, "loss": 0.1055, "step": 2321 }, { "epoch": 0.8394793926247288, "grad_norm": 2.142827033734126, "learning_rate": 6.709008328734472e-06, "loss": 0.2012, "step": 2322 }, { "epoch": 0.8398409255242227, "grad_norm": 0.8240446628684958, "learning_rate": 6.706198907268986e-06, "loss": 0.0693, "step": 2323 }, { "epoch": 0.8402024584237165, "grad_norm": 0.47852719862250126, "learning_rate": 6.703388876008371e-06, "loss": 0.0527, "step": 2324 }, { "epoch": 0.8405639913232104, "grad_norm": 1.3421315619555603, "learning_rate": 6.70057823595693e-06, "loss": 0.1226, "step": 2325 }, { "epoch": 0.8409255242227043, "grad_norm": 0.11426968190198747, "learning_rate": 6.697766988119187e-06, "loss": 0.0181, "step": 2326 }, { "epoch": 0.8412870571221981, "grad_norm": 0.32684798529276615, "learning_rate": 6.694955133499881e-06, "loss": 0.0352, "step": 2327 }, { "epoch": 0.841648590021692, "grad_norm": 0.6337249356480225, "learning_rate": 6.692142673103967e-06, "loss": 0.0254, "step": 2328 }, { "epoch": 0.8420101229211858, "grad_norm": 0.0791392317107167, "learning_rate": 6.6893296079366185e-06, "loss": 0.0128, "step": 2329 }, { "epoch": 0.8423716558206796, "grad_norm": 0.010039851628002283, "learning_rate": 6.686515939003226e-06, "loss": 0.0005, "step": 2330 }, { "epoch": 0.8427331887201736, "grad_norm": 0.5778172057139709, "learning_rate": 6.683701667309393e-06, "loss": 0.1699, "step": 2331 }, { "epoch": 0.8430947216196674, "grad_norm": 1.7937264343344186, "learning_rate": 6.680886793860939e-06, "loss": 0.1602, "step": 2332 }, { "epoch": 0.8434562545191613, "grad_norm": 0.35182038223490614, "learning_rate": 6.678071319663899e-06, "loss": 0.0182, "step": 2333 }, { "epoch": 0.8438177874186551, "grad_norm": 0.025880901179291312, "learning_rate": 6.675255245724524e-06, "loss": 0.0019, "step": 2334 }, { "epoch": 0.8441793203181489, "grad_norm": 0.11204887315343735, "learning_rate": 6.672438573049278e-06, "loss": 0.0203, "step": 2335 }, { "epoch": 0.8445408532176428, "grad_norm": 0.004409894306041762, "learning_rate": 6.669621302644838e-06, "loss": 0.0002, "step": 2336 }, { "epoch": 0.8449023861171366, "grad_norm": 0.14745451331071666, "learning_rate": 6.666803435518096e-06, "loss": 0.0161, "step": 2337 }, { "epoch": 0.8452639190166306, "grad_norm": 0.6657267167394009, "learning_rate": 6.66398497267616e-06, "loss": 0.0579, "step": 2338 }, { "epoch": 0.8456254519161244, "grad_norm": 1.281030561466528, "learning_rate": 6.661165915126344e-06, "loss": 0.1699, "step": 2339 }, { "epoch": 0.8459869848156182, "grad_norm": 0.15307504691892151, "learning_rate": 6.658346263876183e-06, "loss": 0.0227, "step": 2340 }, { "epoch": 0.8463485177151121, "grad_norm": 1.0151241717420252, "learning_rate": 6.655526019933416e-06, "loss": 0.0432, "step": 2341 }, { "epoch": 0.8467100506146059, "grad_norm": 0.1406360848622502, "learning_rate": 6.652705184305998e-06, "loss": 0.0203, "step": 2342 }, { "epoch": 0.8470715835140998, "grad_norm": 0.160893308705395, "learning_rate": 6.649883758002097e-06, "loss": 0.0182, "step": 2343 }, { "epoch": 0.8474331164135936, "grad_norm": 0.0783968894745582, "learning_rate": 6.647061742030087e-06, "loss": 0.0044, "step": 2344 }, { "epoch": 0.8477946493130875, "grad_norm": 0.1636359100408655, "learning_rate": 6.644239137398563e-06, "loss": 0.0227, "step": 2345 }, { "epoch": 0.8481561822125814, "grad_norm": 0.8377023381882648, "learning_rate": 6.641415945116313e-06, "loss": 0.083, "step": 2346 }, { "epoch": 0.8485177151120752, "grad_norm": 0.8790468293811021, "learning_rate": 6.638592166192353e-06, "loss": 0.1143, "step": 2347 }, { "epoch": 0.848879248011569, "grad_norm": 0.9708393266005668, "learning_rate": 6.635767801635897e-06, "loss": 0.0203, "step": 2348 }, { "epoch": 0.8492407809110629, "grad_norm": 0.2078080567554949, "learning_rate": 6.632942852456375e-06, "loss": 0.0162, "step": 2349 }, { "epoch": 0.8496023138105567, "grad_norm": 0.1800987761256538, "learning_rate": 6.630117319663425e-06, "loss": 0.0254, "step": 2350 }, { "epoch": 0.8499638467100507, "grad_norm": 0.681139710350677, "learning_rate": 6.627291204266885e-06, "loss": 0.1602, "step": 2351 }, { "epoch": 0.8503253796095445, "grad_norm": 0.6996131772326559, "learning_rate": 6.624464507276813e-06, "loss": 0.1504, "step": 2352 }, { "epoch": 0.8506869125090383, "grad_norm": 0.6496281241508214, "learning_rate": 6.621637229703468e-06, "loss": 0.1504, "step": 2353 }, { "epoch": 0.8510484454085322, "grad_norm": 0.590216532185255, "learning_rate": 6.618809372557322e-06, "loss": 0.1406, "step": 2354 }, { "epoch": 0.851409978308026, "grad_norm": 0.013307378258350653, "learning_rate": 6.6159809368490465e-06, "loss": 0.0008, "step": 2355 }, { "epoch": 0.8517715112075199, "grad_norm": 0.5837814120453376, "learning_rate": 6.613151923589525e-06, "loss": 0.083, "step": 2356 }, { "epoch": 0.8521330441070137, "grad_norm": 0.5605541332257334, "learning_rate": 6.610322333789847e-06, "loss": 0.0527, "step": 2357 }, { "epoch": 0.8524945770065075, "grad_norm": 0.6616548260210348, "learning_rate": 6.6074921684613045e-06, "loss": 0.0479, "step": 2358 }, { "epoch": 0.8528561099060015, "grad_norm": 0.1854049080441643, "learning_rate": 6.604661428615403e-06, "loss": 0.0315, "step": 2359 }, { "epoch": 0.8532176428054953, "grad_norm": 0.45630779753488254, "learning_rate": 6.601830115263845e-06, "loss": 0.0635, "step": 2360 }, { "epoch": 0.8535791757049892, "grad_norm": 0.14518947563527843, "learning_rate": 6.598998229418542e-06, "loss": 0.0254, "step": 2361 }, { "epoch": 0.853940708604483, "grad_norm": 0.43464947881441895, "learning_rate": 6.596165772091609e-06, "loss": 0.0476, "step": 2362 }, { "epoch": 0.8543022415039768, "grad_norm": 0.12905163127485242, "learning_rate": 6.593332744295364e-06, "loss": 0.0227, "step": 2363 }, { "epoch": 0.8546637744034707, "grad_norm": 0.525936263853853, "learning_rate": 6.590499147042335e-06, "loss": 0.1309, "step": 2364 }, { "epoch": 0.8550253073029646, "grad_norm": 0.6075344923271063, "learning_rate": 6.587664981345245e-06, "loss": 0.1143, "step": 2365 }, { "epoch": 0.8553868402024585, "grad_norm": 0.24469354164559107, "learning_rate": 6.5848302482170264e-06, "loss": 0.0145, "step": 2366 }, { "epoch": 0.8557483731019523, "grad_norm": 0.8108002354865863, "learning_rate": 6.5819949486708125e-06, "loss": 0.1143, "step": 2367 }, { "epoch": 0.8561099060014461, "grad_norm": 0.19169720487730868, "learning_rate": 6.579159083719936e-06, "loss": 0.0102, "step": 2368 }, { "epoch": 0.85647143890094, "grad_norm": 0.005081988807740432, "learning_rate": 6.576322654377937e-06, "loss": 0.0003, "step": 2369 }, { "epoch": 0.8568329718004338, "grad_norm": 0.2546302966268719, "learning_rate": 6.573485661658554e-06, "loss": 0.0349, "step": 2370 }, { "epoch": 0.8571945046999276, "grad_norm": 1.4661784971338125, "learning_rate": 6.5706481065757275e-06, "loss": 0.1406, "step": 2371 }, { "epoch": 0.8575560375994216, "grad_norm": 0.01696347821266655, "learning_rate": 6.5678099901436e-06, "loss": 0.0008, "step": 2372 }, { "epoch": 0.8579175704989154, "grad_norm": 0.49622958643703247, "learning_rate": 6.5649713133765115e-06, "loss": 0.0903, "step": 2373 }, { "epoch": 0.8582791033984093, "grad_norm": 0.30940928413461927, "learning_rate": 6.562132077289006e-06, "loss": 0.0527, "step": 2374 }, { "epoch": 0.8586406362979031, "grad_norm": 0.48376824051291295, "learning_rate": 6.559292282895827e-06, "loss": 0.0352, "step": 2375 }, { "epoch": 0.8590021691973969, "grad_norm": 0.15966485554584842, "learning_rate": 6.556451931211915e-06, "loss": 0.0254, "step": 2376 }, { "epoch": 0.8593637020968908, "grad_norm": 0.4799224126116426, "learning_rate": 6.553611023252411e-06, "loss": 0.0693, "step": 2377 }, { "epoch": 0.8597252349963846, "grad_norm": 0.6588253685785835, "learning_rate": 6.550769560032654e-06, "loss": 0.0693, "step": 2378 }, { "epoch": 0.8600867678958786, "grad_norm": 0.11519878015534106, "learning_rate": 6.547927542568184e-06, "loss": 0.009, "step": 2379 }, { "epoch": 0.8604483007953724, "grad_norm": 1.507144348102979, "learning_rate": 6.545084971874738e-06, "loss": 0.1055, "step": 2380 }, { "epoch": 0.8608098336948662, "grad_norm": 1.3944857117369158, "learning_rate": 6.5422418489682484e-06, "loss": 0.0977, "step": 2381 }, { "epoch": 0.8611713665943601, "grad_norm": 0.004382603439400229, "learning_rate": 6.5393981748648486e-06, "loss": 0.0002, "step": 2382 }, { "epoch": 0.8615328994938539, "grad_norm": 0.5222633511601508, "learning_rate": 6.536553950580864e-06, "loss": 0.0527, "step": 2383 }, { "epoch": 0.8618944323933478, "grad_norm": 0.3915712061361341, "learning_rate": 6.533709177132822e-06, "loss": 0.0352, "step": 2384 }, { "epoch": 0.8622559652928417, "grad_norm": 0.10826667277342826, "learning_rate": 6.530863855537445e-06, "loss": 0.0071, "step": 2385 }, { "epoch": 0.8626174981923355, "grad_norm": 0.521300805680891, "learning_rate": 6.528017986811649e-06, "loss": 0.0762, "step": 2386 }, { "epoch": 0.8629790310918294, "grad_norm": 0.045187676983686934, "learning_rate": 6.525171571972546e-06, "loss": 0.0024, "step": 2387 }, { "epoch": 0.8633405639913232, "grad_norm": 0.12307393769475715, "learning_rate": 6.522324612037445e-06, "loss": 0.0081, "step": 2388 }, { "epoch": 0.8637020968908171, "grad_norm": 0.1885186213236113, "learning_rate": 6.5194771080238495e-06, "loss": 0.0349, "step": 2389 }, { "epoch": 0.8640636297903109, "grad_norm": 0.2546978803478287, "learning_rate": 6.5166290609494566e-06, "loss": 0.0432, "step": 2390 }, { "epoch": 0.8644251626898047, "grad_norm": 0.24747599267833767, "learning_rate": 6.5137804718321576e-06, "loss": 0.0388, "step": 2391 }, { "epoch": 0.8647866955892987, "grad_norm": 0.31854376517090816, "learning_rate": 6.510931341690037e-06, "loss": 0.0432, "step": 2392 }, { "epoch": 0.8651482284887925, "grad_norm": 0.18898124131154018, "learning_rate": 6.508081671541373e-06, "loss": 0.0352, "step": 2393 }, { "epoch": 0.8655097613882863, "grad_norm": 0.5120323688884525, "learning_rate": 6.505231462404639e-06, "loss": 0.0352, "step": 2394 }, { "epoch": 0.8658712942877802, "grad_norm": 0.04615480447205407, "learning_rate": 6.502380715298497e-06, "loss": 0.0035, "step": 2395 }, { "epoch": 0.866232827187274, "grad_norm": 0.4815198072279806, "learning_rate": 6.499529431241804e-06, "loss": 0.0762, "step": 2396 }, { "epoch": 0.8665943600867679, "grad_norm": 0.27936023947329114, "learning_rate": 6.496677611253611e-06, "loss": 0.0432, "step": 2397 }, { "epoch": 0.8669558929862617, "grad_norm": 0.1825259907753582, "learning_rate": 6.493825256353153e-06, "loss": 0.0204, "step": 2398 }, { "epoch": 0.8673174258857556, "grad_norm": 0.002793041547988144, "learning_rate": 6.4909723675598655e-06, "loss": 0.0001, "step": 2399 }, { "epoch": 0.8676789587852495, "grad_norm": 1.4565843567417898, "learning_rate": 6.488118945893368e-06, "loss": 0.3047, "step": 2400 }, { "epoch": 0.8680404916847433, "grad_norm": 0.6994380151319288, "learning_rate": 6.4852649923734725e-06, "loss": 0.0579, "step": 2401 }, { "epoch": 0.8684020245842372, "grad_norm": 0.13373058089423898, "learning_rate": 6.4824105080201835e-06, "loss": 0.0227, "step": 2402 }, { "epoch": 0.868763557483731, "grad_norm": 0.5389250783150851, "learning_rate": 6.479555493853691e-06, "loss": 0.1406, "step": 2403 }, { "epoch": 0.8691250903832248, "grad_norm": 0.16534585070525423, "learning_rate": 6.476699950894377e-06, "loss": 0.0317, "step": 2404 }, { "epoch": 0.8694866232827188, "grad_norm": 0.5473118870212698, "learning_rate": 6.473843880162812e-06, "loss": 0.1226, "step": 2405 }, { "epoch": 0.8698481561822126, "grad_norm": 0.2568577820755873, "learning_rate": 6.470987282679756e-06, "loss": 0.0315, "step": 2406 }, { "epoch": 0.8702096890817065, "grad_norm": 0.5309359566331162, "learning_rate": 6.468130159466156e-06, "loss": 0.1143, "step": 2407 }, { "epoch": 0.8705712219812003, "grad_norm": 0.007512998900014446, "learning_rate": 6.465272511543146e-06, "loss": 0.0003, "step": 2408 }, { "epoch": 0.8709327548806941, "grad_norm": 0.16357137677262124, "learning_rate": 6.462414339932049e-06, "loss": 0.0315, "step": 2409 }, { "epoch": 0.871294287780188, "grad_norm": 0.06107418459212892, "learning_rate": 6.459555645654378e-06, "loss": 0.0044, "step": 2410 }, { "epoch": 0.8716558206796818, "grad_norm": 0.694055849628707, "learning_rate": 6.456696429731824e-06, "loss": 0.0762, "step": 2411 }, { "epoch": 0.8720173535791758, "grad_norm": 0.3177052832746466, "learning_rate": 6.453836693186276e-06, "loss": 0.0315, "step": 2412 }, { "epoch": 0.8723788864786696, "grad_norm": 0.777903071903183, "learning_rate": 6.4509764370398e-06, "loss": 0.0903, "step": 2413 }, { "epoch": 0.8727404193781634, "grad_norm": 0.5250650198708322, "learning_rate": 6.448115662314651e-06, "loss": 0.1504, "step": 2414 }, { "epoch": 0.8731019522776573, "grad_norm": 0.4983955083900525, "learning_rate": 6.44525437003327e-06, "loss": 0.0579, "step": 2415 }, { "epoch": 0.8734634851771511, "grad_norm": 0.2420101019505923, "learning_rate": 6.442392561218283e-06, "loss": 0.0388, "step": 2416 }, { "epoch": 0.8738250180766449, "grad_norm": 0.11052991417392996, "learning_rate": 6.439530236892498e-06, "loss": 0.0063, "step": 2417 }, { "epoch": 0.8741865509761388, "grad_norm": 0.5587197758056974, "learning_rate": 6.436667398078911e-06, "loss": 0.1055, "step": 2418 }, { "epoch": 0.8745480838756327, "grad_norm": 0.6243716726909685, "learning_rate": 6.433804045800698e-06, "loss": 0.0527, "step": 2419 }, { "epoch": 0.8749096167751266, "grad_norm": 0.3606303613239375, "learning_rate": 6.4309401810812225e-06, "loss": 0.0476, "step": 2420 }, { "epoch": 0.8752711496746204, "grad_norm": 0.13340025193341895, "learning_rate": 6.428075804944027e-06, "loss": 0.0091, "step": 2421 }, { "epoch": 0.8756326825741142, "grad_norm": 0.15957419147768664, "learning_rate": 6.425210918412843e-06, "loss": 0.0283, "step": 2422 }, { "epoch": 0.8759942154736081, "grad_norm": 0.4651580996349553, "learning_rate": 6.422345522511575e-06, "loss": 0.0527, "step": 2423 }, { "epoch": 0.8763557483731019, "grad_norm": 0.002725951169439106, "learning_rate": 6.419479618264318e-06, "loss": 0.0002, "step": 2424 }, { "epoch": 0.8767172812725958, "grad_norm": 0.04563454982231764, "learning_rate": 6.416613206695346e-06, "loss": 0.0027, "step": 2425 }, { "epoch": 0.8770788141720897, "grad_norm": 0.4051381770469542, "learning_rate": 6.413746288829112e-06, "loss": 0.0317, "step": 2426 }, { "epoch": 0.8774403470715835, "grad_norm": 0.4035460302735954, "learning_rate": 6.410878865690253e-06, "loss": 0.0635, "step": 2427 }, { "epoch": 0.8778018799710774, "grad_norm": 0.2885165081763022, "learning_rate": 6.408010938303584e-06, "loss": 0.0283, "step": 2428 }, { "epoch": 0.8781634128705712, "grad_norm": 0.544220697357082, "learning_rate": 6.4051425076941046e-06, "loss": 0.0391, "step": 2429 }, { "epoch": 0.8785249457700651, "grad_norm": 0.6311118921647718, "learning_rate": 6.402273574886989e-06, "loss": 0.0527, "step": 2430 }, { "epoch": 0.8788864786695589, "grad_norm": 0.07569711990764091, "learning_rate": 6.399404140907593e-06, "loss": 0.0056, "step": 2431 }, { "epoch": 0.8792480115690527, "grad_norm": 0.5189818666554066, "learning_rate": 6.3965342067814526e-06, "loss": 0.0352, "step": 2432 }, { "epoch": 0.8796095444685467, "grad_norm": 0.45040017816523165, "learning_rate": 6.393663773534281e-06, "loss": 0.0283, "step": 2433 }, { "epoch": 0.8799710773680405, "grad_norm": 0.004645447176847554, "learning_rate": 6.3907928421919715e-06, "loss": 0.0003, "step": 2434 }, { "epoch": 0.8803326102675344, "grad_norm": 0.023807967743983482, "learning_rate": 6.387921413780594e-06, "loss": 0.0013, "step": 2435 }, { "epoch": 0.8806941431670282, "grad_norm": 0.1340951324586457, "learning_rate": 6.385049489326395e-06, "loss": 0.0072, "step": 2436 }, { "epoch": 0.881055676066522, "grad_norm": 0.13225426029729476, "learning_rate": 6.382177069855802e-06, "loss": 0.0227, "step": 2437 }, { "epoch": 0.8814172089660159, "grad_norm": 0.49372197788901256, "learning_rate": 6.3793041563954165e-06, "loss": 0.0317, "step": 2438 }, { "epoch": 0.8817787418655098, "grad_norm": 0.29343632916987605, "learning_rate": 6.3764307499720145e-06, "loss": 0.0388, "step": 2439 }, { "epoch": 0.8821402747650036, "grad_norm": 0.6723253505462046, "learning_rate": 6.3735568516125545e-06, "loss": 0.0388, "step": 2440 }, { "epoch": 0.8825018076644975, "grad_norm": 1.1370368799029933, "learning_rate": 6.370682462344165e-06, "loss": 0.083, "step": 2441 }, { "epoch": 0.8828633405639913, "grad_norm": 0.23854894426729634, "learning_rate": 6.367807583194152e-06, "loss": 0.0349, "step": 2442 }, { "epoch": 0.8832248734634852, "grad_norm": 0.29212292143640495, "learning_rate": 6.364932215189998e-06, "loss": 0.0315, "step": 2443 }, { "epoch": 0.883586406362979, "grad_norm": 4.40141246033941, "learning_rate": 6.36205635935936e-06, "loss": 0.4531, "step": 2444 }, { "epoch": 0.8839479392624728, "grad_norm": 0.35406234860814884, "learning_rate": 6.359180016730064e-06, "loss": 0.0476, "step": 2445 }, { "epoch": 0.8843094721619668, "grad_norm": 2.030985568202003, "learning_rate": 6.356303188330118e-06, "loss": 0.1699, "step": 2446 }, { "epoch": 0.8846710050614606, "grad_norm": 0.5113138203486973, "learning_rate": 6.3534258751877e-06, "loss": 0.0476, "step": 2447 }, { "epoch": 0.8850325379609545, "grad_norm": 0.8479654252760185, "learning_rate": 6.350548078331158e-06, "loss": 0.083, "step": 2448 }, { "epoch": 0.8853940708604483, "grad_norm": 0.1038114711612738, "learning_rate": 6.347669798789019e-06, "loss": 0.0063, "step": 2449 }, { "epoch": 0.8857556037599421, "grad_norm": 0.5880570180746024, "learning_rate": 6.3447910375899764e-06, "loss": 0.1309, "step": 2450 }, { "epoch": 0.886117136659436, "grad_norm": 0.7832711936179578, "learning_rate": 6.341911795762903e-06, "loss": 0.0227, "step": 2451 }, { "epoch": 0.8864786695589298, "grad_norm": 0.8457937820899639, "learning_rate": 6.339032074336836e-06, "loss": 0.1309, "step": 2452 }, { "epoch": 0.8868402024584238, "grad_norm": 0.43650007492286164, "learning_rate": 6.3361518743409885e-06, "loss": 0.0579, "step": 2453 }, { "epoch": 0.8872017353579176, "grad_norm": 0.6214529707108892, "learning_rate": 6.333271196804743e-06, "loss": 0.0635, "step": 2454 }, { "epoch": 0.8875632682574114, "grad_norm": 0.008226296249496182, "learning_rate": 6.330390042757653e-06, "loss": 0.0005, "step": 2455 }, { "epoch": 0.8879248011569053, "grad_norm": 0.0723284312083935, "learning_rate": 6.3275084132294425e-06, "loss": 0.0031, "step": 2456 }, { "epoch": 0.8882863340563991, "grad_norm": 1.1981089936760594, "learning_rate": 6.324626309250006e-06, "loss": 0.0579, "step": 2457 }, { "epoch": 0.888647866955893, "grad_norm": 0.2827301914411205, "learning_rate": 6.321743731849406e-06, "loss": 0.0388, "step": 2458 }, { "epoch": 0.8890093998553869, "grad_norm": 0.11462559278234498, "learning_rate": 6.3188606820578744e-06, "loss": 0.0203, "step": 2459 }, { "epoch": 0.8893709327548807, "grad_norm": 0.0019869140031000356, "learning_rate": 6.315977160905813e-06, "loss": 0.0001, "step": 2460 }, { "epoch": 0.8897324656543746, "grad_norm": 0.1965527002423, "learning_rate": 6.313093169423793e-06, "loss": 0.0129, "step": 2461 }, { "epoch": 0.8900939985538684, "grad_norm": 0.17955214116848864, "learning_rate": 6.3102087086425516e-06, "loss": 0.0254, "step": 2462 }, { "epoch": 0.8904555314533622, "grad_norm": 0.6418319270856981, "learning_rate": 6.307323779592993e-06, "loss": 0.0693, "step": 2463 }, { "epoch": 0.8908170643528561, "grad_norm": 1.6340154535783336, "learning_rate": 6.304438383306193e-06, "loss": 0.1226, "step": 2464 }, { "epoch": 0.8911785972523499, "grad_norm": 0.21428892373640265, "learning_rate": 6.301552520813388e-06, "loss": 0.0317, "step": 2465 }, { "epoch": 0.8915401301518439, "grad_norm": 0.14268478147320957, "learning_rate": 6.298666193145988e-06, "loss": 0.0115, "step": 2466 }, { "epoch": 0.8919016630513377, "grad_norm": 0.21217033637495022, "learning_rate": 6.295779401335564e-06, "loss": 0.0315, "step": 2467 }, { "epoch": 0.8922631959508315, "grad_norm": 0.5937009540837043, "learning_rate": 6.292892146413856e-06, "loss": 0.0476, "step": 2468 }, { "epoch": 0.8926247288503254, "grad_norm": 0.0034234658431009026, "learning_rate": 6.290004429412768e-06, "loss": 0.0002, "step": 2469 }, { "epoch": 0.8929862617498192, "grad_norm": 0.9161521510322924, "learning_rate": 6.287116251364369e-06, "loss": 0.0432, "step": 2470 }, { "epoch": 0.8933477946493131, "grad_norm": 0.5573398917063439, "learning_rate": 6.284227613300893e-06, "loss": 0.1602, "step": 2471 }, { "epoch": 0.8937093275488069, "grad_norm": 1.0201382052446912, "learning_rate": 6.28133851625474e-06, "loss": 0.083, "step": 2472 }, { "epoch": 0.8940708604483008, "grad_norm": 0.1901494975828986, "learning_rate": 6.2784489612584695e-06, "loss": 0.0254, "step": 2473 }, { "epoch": 0.8944323933477947, "grad_norm": 0.8003496547950509, "learning_rate": 6.275558949344813e-06, "loss": 0.0693, "step": 2474 }, { "epoch": 0.8947939262472885, "grad_norm": 0.5779382847415852, "learning_rate": 6.272668481546655e-06, "loss": 0.0527, "step": 2475 }, { "epoch": 0.8951554591467824, "grad_norm": 0.9781493903191112, "learning_rate": 6.2697775588970526e-06, "loss": 0.0903, "step": 2476 }, { "epoch": 0.8955169920462762, "grad_norm": 0.8677911579605077, "learning_rate": 6.266886182429216e-06, "loss": 0.1309, "step": 2477 }, { "epoch": 0.89587852494577, "grad_norm": 0.8435143579213323, "learning_rate": 6.263994353176526e-06, "loss": 0.083, "step": 2478 }, { "epoch": 0.896240057845264, "grad_norm": 2.096483847889158, "learning_rate": 6.261102072172523e-06, "loss": 0.1807, "step": 2479 }, { "epoch": 0.8966015907447578, "grad_norm": 0.013286938235793355, "learning_rate": 6.258209340450903e-06, "loss": 0.0006, "step": 2480 }, { "epoch": 0.8969631236442517, "grad_norm": 0.7222503064544348, "learning_rate": 6.2553161590455305e-06, "loss": 0.1504, "step": 2481 }, { "epoch": 0.8973246565437455, "grad_norm": 0.18795240654738982, "learning_rate": 6.252422528990427e-06, "loss": 0.0254, "step": 2482 }, { "epoch": 0.8976861894432393, "grad_norm": 0.5220811375262224, "learning_rate": 6.249528451319777e-06, "loss": 0.0476, "step": 2483 }, { "epoch": 0.8980477223427332, "grad_norm": 0.22635870301184588, "learning_rate": 6.246633927067923e-06, "loss": 0.0081, "step": 2484 }, { "epoch": 0.898409255242227, "grad_norm": 0.10250630860709734, "learning_rate": 6.243738957269366e-06, "loss": 0.0181, "step": 2485 }, { "epoch": 0.8987707881417208, "grad_norm": 0.44734545012759336, "learning_rate": 6.240843542958768e-06, "loss": 0.0044, "step": 2486 }, { "epoch": 0.8991323210412148, "grad_norm": 0.8918726009911592, "learning_rate": 6.23794768517095e-06, "loss": 0.0693, "step": 2487 }, { "epoch": 0.8994938539407086, "grad_norm": 0.06472781771425314, "learning_rate": 6.235051384940889e-06, "loss": 0.0027, "step": 2488 }, { "epoch": 0.8998553868402025, "grad_norm": 0.7096808610788586, "learning_rate": 6.232154643303726e-06, "loss": 0.0903, "step": 2489 }, { "epoch": 0.9002169197396963, "grad_norm": 0.20381284458865787, "learning_rate": 6.229257461294752e-06, "loss": 0.0283, "step": 2490 }, { "epoch": 0.9005784526391901, "grad_norm": 0.003192459151334529, "learning_rate": 6.2263598399494205e-06, "loss": 0.0001, "step": 2491 }, { "epoch": 0.900939985538684, "grad_norm": 0.5225684239849214, "learning_rate": 6.2234617803033425e-06, "loss": 0.0476, "step": 2492 }, { "epoch": 0.9013015184381779, "grad_norm": 0.8117511879822352, "learning_rate": 6.2205632833922805e-06, "loss": 0.0903, "step": 2493 }, { "epoch": 0.9016630513376718, "grad_norm": 0.03215734023378558, "learning_rate": 6.217664350252162e-06, "loss": 0.0016, "step": 2494 }, { "epoch": 0.9020245842371656, "grad_norm": 1.0031710716482067, "learning_rate": 6.214764981919057e-06, "loss": 0.0762, "step": 2495 }, { "epoch": 0.9023861171366594, "grad_norm": 1.075014023338748, "learning_rate": 6.2118651794292075e-06, "loss": 0.0903, "step": 2496 }, { "epoch": 0.9027476500361533, "grad_norm": 0.6970803658691092, "learning_rate": 6.208964943818997e-06, "loss": 0.0579, "step": 2497 }, { "epoch": 0.9031091829356471, "grad_norm": 0.35210986655750837, "learning_rate": 6.20606427612497e-06, "loss": 0.0283, "step": 2498 }, { "epoch": 0.903470715835141, "grad_norm": 0.04778428348220648, "learning_rate": 6.203163177383828e-06, "loss": 0.0027, "step": 2499 }, { "epoch": 0.9038322487346349, "grad_norm": 0.357032834387012, "learning_rate": 6.200261648632417e-06, "loss": 0.0182, "step": 2500 }, { "epoch": 0.9041937816341287, "grad_norm": 0.10519701978472061, "learning_rate": 6.1973596909077485e-06, "loss": 0.0182, "step": 2501 }, { "epoch": 0.9045553145336226, "grad_norm": 0.035060998458572114, "learning_rate": 6.194457305246978e-06, "loss": 0.0021, "step": 2502 }, { "epoch": 0.9049168474331164, "grad_norm": 0.2404173695086655, "learning_rate": 6.191554492687418e-06, "loss": 0.0352, "step": 2503 }, { "epoch": 0.9052783803326103, "grad_norm": 0.26417287263533906, "learning_rate": 6.188651254266536e-06, "loss": 0.0283, "step": 2504 }, { "epoch": 0.9056399132321041, "grad_norm": 0.2285730702711088, "learning_rate": 6.185747591021944e-06, "loss": 0.0352, "step": 2505 }, { "epoch": 0.9060014461315979, "grad_norm": 0.7180612444323463, "learning_rate": 6.182843503991416e-06, "loss": 0.1055, "step": 2506 }, { "epoch": 0.9063629790310919, "grad_norm": 0.27568218813224604, "learning_rate": 6.179938994212868e-06, "loss": 0.0317, "step": 2507 }, { "epoch": 0.9067245119305857, "grad_norm": 0.4609018291428133, "learning_rate": 6.177034062724372e-06, "loss": 0.0432, "step": 2508 }, { "epoch": 0.9070860448300795, "grad_norm": 0.0028232747698242774, "learning_rate": 6.17412871056415e-06, "loss": 0.0001, "step": 2509 }, { "epoch": 0.9074475777295734, "grad_norm": 1.2137072299435663, "learning_rate": 6.171222938770576e-06, "loss": 0.0388, "step": 2510 }, { "epoch": 0.9078091106290672, "grad_norm": 0.23100616794656004, "learning_rate": 6.16831674838217e-06, "loss": 0.0283, "step": 2511 }, { "epoch": 0.9081706435285611, "grad_norm": 0.11486984912855269, "learning_rate": 6.165410140437605e-06, "loss": 0.0182, "step": 2512 }, { "epoch": 0.908532176428055, "grad_norm": 0.07196021233318126, "learning_rate": 6.162503115975701e-06, "loss": 0.0039, "step": 2513 }, { "epoch": 0.9088937093275488, "grad_norm": 0.061144999845551086, "learning_rate": 6.15959567603543e-06, "loss": 0.0031, "step": 2514 }, { "epoch": 0.9092552422270427, "grad_norm": 0.7717218711967054, "learning_rate": 6.156687821655909e-06, "loss": 0.0903, "step": 2515 }, { "epoch": 0.9096167751265365, "grad_norm": 0.49365112077854, "learning_rate": 6.153779553876403e-06, "loss": 0.0388, "step": 2516 }, { "epoch": 0.9099783080260304, "grad_norm": 0.6935790027110299, "learning_rate": 6.1508708737363295e-06, "loss": 0.0527, "step": 2517 }, { "epoch": 0.9103398409255242, "grad_norm": 0.007573317587724873, "learning_rate": 6.147961782275248e-06, "loss": 0.0002, "step": 2518 }, { "epoch": 0.910701373825018, "grad_norm": 1.060082847960898, "learning_rate": 6.145052280532868e-06, "loss": 0.1504, "step": 2519 }, { "epoch": 0.911062906724512, "grad_norm": 0.24310136316037204, "learning_rate": 6.142142369549045e-06, "loss": 0.0254, "step": 2520 }, { "epoch": 0.9114244396240058, "grad_norm": 0.552468083185058, "learning_rate": 6.139232050363779e-06, "loss": 0.0388, "step": 2521 }, { "epoch": 0.9117859725234997, "grad_norm": 1.0568431076711127, "learning_rate": 6.13632132401722e-06, "loss": 0.1406, "step": 2522 }, { "epoch": 0.9121475054229935, "grad_norm": 1.8966913270599064, "learning_rate": 6.133410191549658e-06, "loss": 0.2012, "step": 2523 }, { "epoch": 0.9125090383224873, "grad_norm": 1.8988244570960167, "learning_rate": 6.130498654001534e-06, "loss": 0.1504, "step": 2524 }, { "epoch": 0.9128705712219812, "grad_norm": 0.9501095098543237, "learning_rate": 6.127586712413429e-06, "loss": 0.0388, "step": 2525 }, { "epoch": 0.913232104121475, "grad_norm": 0.2569224188665428, "learning_rate": 6.124674367826072e-06, "loss": 0.0024, "step": 2526 }, { "epoch": 0.913593637020969, "grad_norm": 0.1226783997375203, "learning_rate": 6.121761621280333e-06, "loss": 0.0056, "step": 2527 }, { "epoch": 0.9139551699204628, "grad_norm": 0.10670258506467789, "learning_rate": 6.1188484738172264e-06, "loss": 0.0161, "step": 2528 }, { "epoch": 0.9143167028199566, "grad_norm": 0.03847681333501189, "learning_rate": 6.115934926477911e-06, "loss": 0.0027, "step": 2529 }, { "epoch": 0.9146782357194505, "grad_norm": 0.8516231413983774, "learning_rate": 6.11302098030369e-06, "loss": 0.0693, "step": 2530 }, { "epoch": 0.9150397686189443, "grad_norm": 0.21695590206745963, "learning_rate": 6.110106636336004e-06, "loss": 0.0283, "step": 2531 }, { "epoch": 0.9154013015184381, "grad_norm": 0.028051328734584786, "learning_rate": 6.107191895616442e-06, "loss": 0.0019, "step": 2532 }, { "epoch": 0.915762834417932, "grad_norm": 0.47434782511067514, "learning_rate": 6.104276759186728e-06, "loss": 0.0432, "step": 2533 }, { "epoch": 0.9161243673174259, "grad_norm": 0.14095254649574215, "learning_rate": 6.1013612280887344e-06, "loss": 0.0161, "step": 2534 }, { "epoch": 0.9164859002169198, "grad_norm": 0.8485142548389811, "learning_rate": 6.098445303364472e-06, "loss": 0.1602, "step": 2535 }, { "epoch": 0.9168474331164136, "grad_norm": 1.039117583759926, "learning_rate": 6.095528986056088e-06, "loss": 0.1807, "step": 2536 }, { "epoch": 0.9172089660159074, "grad_norm": 0.04789075784001543, "learning_rate": 6.092612277205876e-06, "loss": 0.0013, "step": 2537 }, { "epoch": 0.9175704989154013, "grad_norm": 1.1212657592550674, "learning_rate": 6.0896951778562665e-06, "loss": 0.0762, "step": 2538 }, { "epoch": 0.9179320318148951, "grad_norm": 2.970842503866357, "learning_rate": 6.086777689049831e-06, "loss": 0.0388, "step": 2539 }, { "epoch": 0.918293564714389, "grad_norm": 0.6581143674973507, "learning_rate": 6.083859811829278e-06, "loss": 0.0693, "step": 2540 }, { "epoch": 0.9186550976138829, "grad_norm": 0.1873161692255787, "learning_rate": 6.080941547237458e-06, "loss": 0.0102, "step": 2541 }, { "epoch": 0.9190166305133767, "grad_norm": 0.6581170592759135, "learning_rate": 6.078022896317356e-06, "loss": 0.1406, "step": 2542 }, { "epoch": 0.9193781634128706, "grad_norm": 0.0017064482959804374, "learning_rate": 6.075103860112099e-06, "loss": 0.0001, "step": 2543 }, { "epoch": 0.9197396963123644, "grad_norm": 0.6431507406254625, "learning_rate": 6.07218443966495e-06, "loss": 0.1309, "step": 2544 }, { "epoch": 0.9201012292118583, "grad_norm": 0.027221407876817063, "learning_rate": 6.069264636019306e-06, "loss": 0.0012, "step": 2545 }, { "epoch": 0.9204627621113521, "grad_norm": 0.2315919714685056, "learning_rate": 6.066344450218711e-06, "loss": 0.0227, "step": 2546 }, { "epoch": 0.920824295010846, "grad_norm": 0.16779202890807834, "learning_rate": 6.0634238833068315e-06, "loss": 0.0254, "step": 2547 }, { "epoch": 0.9211858279103399, "grad_norm": 1.9960238634879697, "learning_rate": 6.060502936327481e-06, "loss": 0.1602, "step": 2548 }, { "epoch": 0.9215473608098337, "grad_norm": 0.04362270762257802, "learning_rate": 6.057581610324605e-06, "loss": 0.0019, "step": 2549 }, { "epoch": 0.9219088937093276, "grad_norm": 0.13198682474971654, "learning_rate": 6.054659906342284e-06, "loss": 0.0203, "step": 2550 }, { "epoch": 0.9222704266088214, "grad_norm": 0.22814609164330657, "learning_rate": 6.051737825424737e-06, "loss": 0.0283, "step": 2551 }, { "epoch": 0.9226319595083152, "grad_norm": 0.06664152262213553, "learning_rate": 6.048815368616311e-06, "loss": 0.0031, "step": 2552 }, { "epoch": 0.9229934924078091, "grad_norm": 0.853858765980144, "learning_rate": 6.045892536961494e-06, "loss": 0.0693, "step": 2553 }, { "epoch": 0.923355025307303, "grad_norm": 0.5471581043445459, "learning_rate": 6.042969331504906e-06, "loss": 0.1406, "step": 2554 }, { "epoch": 0.9237165582067968, "grad_norm": 1.1192626592559665, "learning_rate": 6.040045753291298e-06, "loss": 0.0977, "step": 2555 }, { "epoch": 0.9240780911062907, "grad_norm": 0.12026333716140633, "learning_rate": 6.037121803365559e-06, "loss": 0.0039, "step": 2556 }, { "epoch": 0.9244396240057845, "grad_norm": 0.005076325947242294, "learning_rate": 6.034197482772705e-06, "loss": 0.0002, "step": 2557 }, { "epoch": 0.9248011569052784, "grad_norm": 0.07124580873661465, "learning_rate": 6.031272792557889e-06, "loss": 0.0024, "step": 2558 }, { "epoch": 0.9251626898047722, "grad_norm": 0.20488602876909862, "learning_rate": 6.028347733766394e-06, "loss": 0.0254, "step": 2559 }, { "epoch": 0.925524222704266, "grad_norm": 0.5265704817409208, "learning_rate": 6.025422307443636e-06, "loss": 0.1309, "step": 2560 }, { "epoch": 0.92588575560376, "grad_norm": 0.23644471488880475, "learning_rate": 6.022496514635163e-06, "loss": 0.0317, "step": 2561 }, { "epoch": 0.9262472885032538, "grad_norm": 0.014670370900017125, "learning_rate": 6.019570356386651e-06, "loss": 0.0007, "step": 2562 }, { "epoch": 0.9266088214027477, "grad_norm": 0.03698928907263239, "learning_rate": 6.016643833743908e-06, "loss": 0.0017, "step": 2563 }, { "epoch": 0.9269703543022415, "grad_norm": 0.501024522913144, "learning_rate": 6.0137169477528745e-06, "loss": 0.0635, "step": 2564 }, { "epoch": 0.9273318872017353, "grad_norm": 0.5212523840653417, "learning_rate": 6.010789699459616e-06, "loss": 0.1055, "step": 2565 }, { "epoch": 0.9276934201012292, "grad_norm": 0.06430484927517177, "learning_rate": 6.007862089910335e-06, "loss": 0.0044, "step": 2566 }, { "epoch": 0.928054953000723, "grad_norm": 0.34818993144216265, "learning_rate": 6.004934120151354e-06, "loss": 0.0388, "step": 2567 }, { "epoch": 0.928416485900217, "grad_norm": 0.002413320889174326, "learning_rate": 6.002005791229131e-06, "loss": 0.0001, "step": 2568 }, { "epoch": 0.9287780187997108, "grad_norm": 0.07927141965078001, "learning_rate": 5.999077104190249e-06, "loss": 0.0044, "step": 2569 }, { "epoch": 0.9291395516992046, "grad_norm": 0.42496705661630363, "learning_rate": 5.9961480600814205e-06, "loss": 0.0432, "step": 2570 }, { "epoch": 0.9295010845986985, "grad_norm": 0.019869394848858458, "learning_rate": 5.993218659949488e-06, "loss": 0.0009, "step": 2571 }, { "epoch": 0.9298626174981923, "grad_norm": 0.24571949208631436, "learning_rate": 5.9902889048414125e-06, "loss": 0.0349, "step": 2572 }, { "epoch": 0.9302241503976862, "grad_norm": 0.5751597664468464, "learning_rate": 5.987358795804294e-06, "loss": 0.0432, "step": 2573 }, { "epoch": 0.93058568329718, "grad_norm": 0.0012179519887772822, "learning_rate": 5.984428333885349e-06, "loss": 0.0001, "step": 2574 }, { "epoch": 0.9309472161966739, "grad_norm": 0.002383776251660681, "learning_rate": 5.981497520131926e-06, "loss": 0.0001, "step": 2575 }, { "epoch": 0.9313087490961678, "grad_norm": 5.778432923358336, "learning_rate": 5.9785663555914965e-06, "loss": 0.457, "step": 2576 }, { "epoch": 0.9316702819956616, "grad_norm": 0.37026986359636294, "learning_rate": 5.975634841311657e-06, "loss": 0.0476, "step": 2577 }, { "epoch": 0.9320318148951554, "grad_norm": 2.148459528742973, "learning_rate": 5.972702978340133e-06, "loss": 0.3652, "step": 2578 }, { "epoch": 0.9323933477946493, "grad_norm": 0.4177123992899832, "learning_rate": 5.969770767724768e-06, "loss": 0.0254, "step": 2579 }, { "epoch": 0.9327548806941431, "grad_norm": 0.10897514113413934, "learning_rate": 5.966838210513535e-06, "loss": 0.0203, "step": 2580 }, { "epoch": 0.9331164135936371, "grad_norm": 0.3751463998149892, "learning_rate": 5.963905307754531e-06, "loss": 0.0432, "step": 2581 }, { "epoch": 0.9334779464931309, "grad_norm": 0.12704773081845996, "learning_rate": 5.960972060495973e-06, "loss": 0.0227, "step": 2582 }, { "epoch": 0.9338394793926247, "grad_norm": 0.2723322103104546, "learning_rate": 5.958038469786203e-06, "loss": 0.0115, "step": 2583 }, { "epoch": 0.9342010122921186, "grad_norm": 0.6793475591157967, "learning_rate": 5.955104536673687e-06, "loss": 0.1406, "step": 2584 }, { "epoch": 0.9345625451916124, "grad_norm": 0.03500630486082458, "learning_rate": 5.95217026220701e-06, "loss": 0.0017, "step": 2585 }, { "epoch": 0.9349240780911063, "grad_norm": 0.03810601156859444, "learning_rate": 5.949235647434884e-06, "loss": 0.0027, "step": 2586 }, { "epoch": 0.9352856109906001, "grad_norm": 0.23616990949098035, "learning_rate": 5.946300693406136e-06, "loss": 0.0388, "step": 2587 }, { "epoch": 0.935647143890094, "grad_norm": 1.326896021233759, "learning_rate": 5.943365401169721e-06, "loss": 0.1143, "step": 2588 }, { "epoch": 0.9360086767895879, "grad_norm": 0.15616752286721886, "learning_rate": 5.94042977177471e-06, "loss": 0.0283, "step": 2589 }, { "epoch": 0.9363702096890817, "grad_norm": 2.260931631573249, "learning_rate": 5.937493806270297e-06, "loss": 0.1699, "step": 2590 }, { "epoch": 0.9367317425885756, "grad_norm": 0.1444369997180884, "learning_rate": 5.9345575057057955e-06, "loss": 0.0254, "step": 2591 }, { "epoch": 0.9370932754880694, "grad_norm": 0.11888446415842725, "learning_rate": 5.931620871130639e-06, "loss": 0.0091, "step": 2592 }, { "epoch": 0.9374548083875632, "grad_norm": 0.03109085220834391, "learning_rate": 5.928683903594381e-06, "loss": 0.0015, "step": 2593 }, { "epoch": 0.9378163412870572, "grad_norm": 0.20219728285279231, "learning_rate": 5.925746604146691e-06, "loss": 0.0283, "step": 2594 }, { "epoch": 0.938177874186551, "grad_norm": 0.15989738631783196, "learning_rate": 5.922808973837359e-06, "loss": 0.0227, "step": 2595 }, { "epoch": 0.9385394070860448, "grad_norm": 0.10870142617545471, "learning_rate": 5.919871013716294e-06, "loss": 0.0181, "step": 2596 }, { "epoch": 0.9389009399855387, "grad_norm": 0.002422221189708355, "learning_rate": 5.916932724833525e-06, "loss": 0.0001, "step": 2597 }, { "epoch": 0.9392624728850325, "grad_norm": 0.05387901747245953, "learning_rate": 5.913994108239193e-06, "loss": 0.0022, "step": 2598 }, { "epoch": 0.9396240057845264, "grad_norm": 0.003516893960438953, "learning_rate": 5.911055164983559e-06, "loss": 0.0001, "step": 2599 }, { "epoch": 0.9399855386840202, "grad_norm": 0.19257731341319811, "learning_rate": 5.908115896117e-06, "loss": 0.0254, "step": 2600 }, { "epoch": 0.940347071583514, "grad_norm": 0.09125978644684929, "learning_rate": 5.905176302690015e-06, "loss": 0.005, "step": 2601 }, { "epoch": 0.940708604483008, "grad_norm": 0.002463742557529108, "learning_rate": 5.902236385753207e-06, "loss": 0.0001, "step": 2602 }, { "epoch": 0.9410701373825018, "grad_norm": 0.3571760424908177, "learning_rate": 5.899296146357307e-06, "loss": 0.0352, "step": 2603 }, { "epoch": 0.9414316702819957, "grad_norm": 0.18428630716123065, "learning_rate": 5.896355585553154e-06, "loss": 0.0254, "step": 2604 }, { "epoch": 0.9417932031814895, "grad_norm": 0.5328697103092984, "learning_rate": 5.893414704391702e-06, "loss": 0.0432, "step": 2605 }, { "epoch": 0.9421547360809833, "grad_norm": 0.7833561775995802, "learning_rate": 5.890473503924026e-06, "loss": 0.0432, "step": 2606 }, { "epoch": 0.9425162689804772, "grad_norm": 0.21713486470091856, "learning_rate": 5.887531985201307e-06, "loss": 0.0081, "step": 2607 }, { "epoch": 0.9428778018799711, "grad_norm": 1.2445607532567853, "learning_rate": 5.884590149274843e-06, "loss": 0.1309, "step": 2608 }, { "epoch": 0.943239334779465, "grad_norm": 0.22088153826190352, "learning_rate": 5.881647997196046e-06, "loss": 0.0283, "step": 2609 }, { "epoch": 0.9436008676789588, "grad_norm": 0.057005830231524245, "learning_rate": 5.8787055300164406e-06, "loss": 0.0024, "step": 2610 }, { "epoch": 0.9439624005784526, "grad_norm": 0.7959726119105253, "learning_rate": 5.875762748787666e-06, "loss": 0.0903, "step": 2611 }, { "epoch": 0.9443239334779465, "grad_norm": 0.2361494803207636, "learning_rate": 5.872819654561468e-06, "loss": 0.0315, "step": 2612 }, { "epoch": 0.9446854663774403, "grad_norm": 0.19869486537428424, "learning_rate": 5.869876248389711e-06, "loss": 0.0283, "step": 2613 }, { "epoch": 0.9450469992769343, "grad_norm": 0.173652302909828, "learning_rate": 5.866932531324366e-06, "loss": 0.0227, "step": 2614 }, { "epoch": 0.9454085321764281, "grad_norm": 0.9834193202044288, "learning_rate": 5.863988504417516e-06, "loss": 0.0903, "step": 2615 }, { "epoch": 0.9457700650759219, "grad_norm": 1.0717988286417923, "learning_rate": 5.861044168721358e-06, "loss": 0.083, "step": 2616 }, { "epoch": 0.9461315979754158, "grad_norm": 0.6402711186664498, "learning_rate": 5.858099525288194e-06, "loss": 0.0527, "step": 2617 }, { "epoch": 0.9464931308749096, "grad_norm": 0.11359599185095476, "learning_rate": 5.855154575170445e-06, "loss": 0.0182, "step": 2618 }, { "epoch": 0.9468546637744034, "grad_norm": 0.6154964758678626, "learning_rate": 5.852209319420629e-06, "loss": 0.0693, "step": 2619 }, { "epoch": 0.9472161966738973, "grad_norm": 0.2736189467818607, "learning_rate": 5.849263759091382e-06, "loss": 0.0145, "step": 2620 }, { "epoch": 0.9475777295733911, "grad_norm": 0.001146002614739068, "learning_rate": 5.846317895235446e-06, "loss": 0.0001, "step": 2621 }, { "epoch": 0.9479392624728851, "grad_norm": 0.13166828537857939, "learning_rate": 5.843371728905673e-06, "loss": 0.0181, "step": 2622 }, { "epoch": 0.9483007953723789, "grad_norm": 0.12804326970893778, "learning_rate": 5.840425261155022e-06, "loss": 0.0091, "step": 2623 }, { "epoch": 0.9486623282718727, "grad_norm": 0.1232637072056232, "learning_rate": 5.8374784930365616e-06, "loss": 0.0161, "step": 2624 }, { "epoch": 0.9490238611713666, "grad_norm": 0.24647316131608601, "learning_rate": 5.8345314256034624e-06, "loss": 0.0317, "step": 2625 }, { "epoch": 0.9493853940708604, "grad_norm": 0.8334104089928566, "learning_rate": 5.8315840599090104e-06, "loss": 0.1309, "step": 2626 }, { "epoch": 0.9497469269703543, "grad_norm": 0.006469184942590384, "learning_rate": 5.8286363970065876e-06, "loss": 0.0003, "step": 2627 }, { "epoch": 0.9501084598698482, "grad_norm": 0.06473975499720963, "learning_rate": 5.8256884379496945e-06, "loss": 0.0044, "step": 2628 }, { "epoch": 0.950469992769342, "grad_norm": 0.12374491257627751, "learning_rate": 5.8227401837919275e-06, "loss": 0.0161, "step": 2629 }, { "epoch": 0.9508315256688359, "grad_norm": 0.01122976801190358, "learning_rate": 5.81979163558699e-06, "loss": 0.0006, "step": 2630 }, { "epoch": 0.9511930585683297, "grad_norm": 2.714571793125676, "learning_rate": 5.816842794388697e-06, "loss": 0.1602, "step": 2631 }, { "epoch": 0.9515545914678236, "grad_norm": 0.0019627498501025367, "learning_rate": 5.81389366125096e-06, "loss": 0.0001, "step": 2632 }, { "epoch": 0.9519161243673174, "grad_norm": 0.353227479018132, "learning_rate": 5.810944237227803e-06, "loss": 0.0349, "step": 2633 }, { "epoch": 0.9522776572668112, "grad_norm": 0.27167805417263086, "learning_rate": 5.807994523373345e-06, "loss": 0.0283, "step": 2634 }, { "epoch": 0.9526391901663052, "grad_norm": 0.21170525827312442, "learning_rate": 5.805044520741814e-06, "loss": 0.0254, "step": 2635 }, { "epoch": 0.953000723065799, "grad_norm": 0.09937228734211781, "learning_rate": 5.8020942303875425e-06, "loss": 0.0056, "step": 2636 }, { "epoch": 0.9533622559652929, "grad_norm": 0.028017467654699736, "learning_rate": 5.799143653364961e-06, "loss": 0.0008, "step": 2637 }, { "epoch": 0.9537237888647867, "grad_norm": 1.5007148768291994, "learning_rate": 5.796192790728608e-06, "loss": 0.1406, "step": 2638 }, { "epoch": 0.9540853217642805, "grad_norm": 0.620949899674873, "learning_rate": 5.793241643533119e-06, "loss": 0.0391, "step": 2639 }, { "epoch": 0.9544468546637744, "grad_norm": 0.5927046107541343, "learning_rate": 5.790290212833235e-06, "loss": 0.2012, "step": 2640 }, { "epoch": 0.9548083875632682, "grad_norm": 0.33068227892009544, "learning_rate": 5.787338499683794e-06, "loss": 0.0315, "step": 2641 }, { "epoch": 0.9551699204627621, "grad_norm": 0.7573835468871365, "learning_rate": 5.78438650513974e-06, "loss": 0.1055, "step": 2642 }, { "epoch": 0.955531453362256, "grad_norm": 0.014124091801159255, "learning_rate": 5.781434230256114e-06, "loss": 0.0007, "step": 2643 }, { "epoch": 0.9558929862617498, "grad_norm": 2.701523784594924, "learning_rate": 5.778481676088062e-06, "loss": 0.1055, "step": 2644 }, { "epoch": 0.9562545191612437, "grad_norm": 0.4230688669730386, "learning_rate": 5.7755288436908195e-06, "loss": 0.0388, "step": 2645 }, { "epoch": 0.9566160520607375, "grad_norm": 0.3552476677608464, "learning_rate": 5.772575734119734e-06, "loss": 0.0388, "step": 2646 }, { "epoch": 0.9569775849602313, "grad_norm": 0.08165291665577554, "learning_rate": 5.769622348430243e-06, "loss": 0.0039, "step": 2647 }, { "epoch": 0.9573391178597253, "grad_norm": 0.23259815702169884, "learning_rate": 5.766668687677888e-06, "loss": 0.0315, "step": 2648 }, { "epoch": 0.9577006507592191, "grad_norm": 0.007152375129378124, "learning_rate": 5.763714752918305e-06, "loss": 0.0003, "step": 2649 }, { "epoch": 0.958062183658713, "grad_norm": 0.12744903073008595, "learning_rate": 5.760760545207232e-06, "loss": 0.0181, "step": 2650 }, { "epoch": 0.9584237165582068, "grad_norm": 0.08797514340422877, "learning_rate": 5.757806065600499e-06, "loss": 0.0063, "step": 2651 }, { "epoch": 0.9587852494577006, "grad_norm": 1.117880736050199, "learning_rate": 5.754851315154038e-06, "loss": 0.0762, "step": 2652 }, { "epoch": 0.9591467823571945, "grad_norm": 0.13764877414761736, "learning_rate": 5.7518962949238786e-06, "loss": 0.0182, "step": 2653 }, { "epoch": 0.9595083152566883, "grad_norm": 0.30943912704064025, "learning_rate": 5.748941005966141e-06, "loss": 0.0388, "step": 2654 }, { "epoch": 0.9598698481561823, "grad_norm": 0.18059386522718876, "learning_rate": 5.745985449337045e-06, "loss": 0.0227, "step": 2655 }, { "epoch": 0.9602313810556761, "grad_norm": 0.06967517423981531, "learning_rate": 5.743029626092907e-06, "loss": 0.0039, "step": 2656 }, { "epoch": 0.9605929139551699, "grad_norm": 0.3593077380537216, "learning_rate": 5.740073537290137e-06, "loss": 0.0254, "step": 2657 }, { "epoch": 0.9609544468546638, "grad_norm": 0.17592621697810418, "learning_rate": 5.737117183985242e-06, "loss": 0.0227, "step": 2658 }, { "epoch": 0.9613159797541576, "grad_norm": 0.09855047698576605, "learning_rate": 5.734160567234821e-06, "loss": 0.0063, "step": 2659 }, { "epoch": 0.9616775126536515, "grad_norm": 0.0029658595210645487, "learning_rate": 5.731203688095569e-06, "loss": 0.0001, "step": 2660 }, { "epoch": 0.9620390455531453, "grad_norm": 0.07409924366647881, "learning_rate": 5.728246547624272e-06, "loss": 0.0089, "step": 2661 }, { "epoch": 0.9624005784526392, "grad_norm": 0.18804658631931917, "learning_rate": 5.725289146877812e-06, "loss": 0.0227, "step": 2662 }, { "epoch": 0.9627621113521331, "grad_norm": 0.023182602483907643, "learning_rate": 5.722331486913165e-06, "loss": 0.0007, "step": 2663 }, { "epoch": 0.9631236442516269, "grad_norm": 0.12279839588292894, "learning_rate": 5.719373568787396e-06, "loss": 0.0063, "step": 2664 }, { "epoch": 0.9634851771511207, "grad_norm": 0.10649848321350633, "learning_rate": 5.716415393557667e-06, "loss": 0.0161, "step": 2665 }, { "epoch": 0.9638467100506146, "grad_norm": 0.004005551492927241, "learning_rate": 5.713456962281227e-06, "loss": 0.0001, "step": 2666 }, { "epoch": 0.9642082429501084, "grad_norm": 0.269543938556988, "learning_rate": 5.7104982760154184e-06, "loss": 0.0254, "step": 2667 }, { "epoch": 0.9645697758496024, "grad_norm": 0.08335565657999064, "learning_rate": 5.707539335817676e-06, "loss": 0.0114, "step": 2668 }, { "epoch": 0.9649313087490962, "grad_norm": 0.7094696836564032, "learning_rate": 5.704580142745525e-06, "loss": 0.2129, "step": 2669 }, { "epoch": 0.96529284164859, "grad_norm": 0.1354245587892692, "learning_rate": 5.701620697856579e-06, "loss": 0.0203, "step": 2670 }, { "epoch": 0.9656543745480839, "grad_norm": 0.06889826821871177, "learning_rate": 5.6986610022085445e-06, "loss": 0.0101, "step": 2671 }, { "epoch": 0.9660159074475777, "grad_norm": 0.07598637902301007, "learning_rate": 5.695701056859213e-06, "loss": 0.0101, "step": 2672 }, { "epoch": 0.9663774403470716, "grad_norm": 0.70065839681715, "learning_rate": 5.692740862866472e-06, "loss": 0.1914, "step": 2673 }, { "epoch": 0.9667389732465654, "grad_norm": 0.26718374943341683, "learning_rate": 5.689780421288295e-06, "loss": 0.0162, "step": 2674 }, { "epoch": 0.9671005061460592, "grad_norm": 0.7585131339467793, "learning_rate": 5.686819733182739e-06, "loss": 0.1602, "step": 2675 }, { "epoch": 0.9674620390455532, "grad_norm": 0.01995742331600693, "learning_rate": 5.683858799607955e-06, "loss": 0.001, "step": 2676 }, { "epoch": 0.967823571945047, "grad_norm": 0.6981165277850486, "learning_rate": 5.68089762162218e-06, "loss": 0.2129, "step": 2677 }, { "epoch": 0.9681851048445409, "grad_norm": 0.03323267787600041, "learning_rate": 5.67793620028374e-06, "loss": 0.0013, "step": 2678 }, { "epoch": 0.9685466377440347, "grad_norm": 0.014297702041908385, "learning_rate": 5.674974536651045e-06, "loss": 0.0006, "step": 2679 }, { "epoch": 0.9689081706435285, "grad_norm": 2.188132766859537, "learning_rate": 5.672012631782593e-06, "loss": 0.1602, "step": 2680 }, { "epoch": 0.9692697035430224, "grad_norm": 0.4387468173697011, "learning_rate": 5.669050486736968e-06, "loss": 0.0283, "step": 2681 }, { "epoch": 0.9696312364425163, "grad_norm": 0.02698761730473109, "learning_rate": 5.666088102572838e-06, "loss": 0.0012, "step": 2682 }, { "epoch": 0.9699927693420102, "grad_norm": 0.06723403283429553, "learning_rate": 5.663125480348963e-06, "loss": 0.0027, "step": 2683 }, { "epoch": 0.970354302241504, "grad_norm": 0.1060116728605845, "learning_rate": 5.660162621124182e-06, "loss": 0.0161, "step": 2684 }, { "epoch": 0.9707158351409978, "grad_norm": 0.13705632166733814, "learning_rate": 5.657199525957419e-06, "loss": 0.0143, "step": 2685 }, { "epoch": 0.9710773680404917, "grad_norm": 0.4273489321766885, "learning_rate": 5.654236195907683e-06, "loss": 0.0432, "step": 2686 }, { "epoch": 0.9714389009399855, "grad_norm": 0.20696395637890375, "learning_rate": 5.65127263203407e-06, "loss": 0.0254, "step": 2687 }, { "epoch": 0.9718004338394793, "grad_norm": 0.5977271513120527, "learning_rate": 5.648308835395755e-06, "loss": 0.1504, "step": 2688 }, { "epoch": 0.9721619667389733, "grad_norm": 0.32520878243151885, "learning_rate": 5.645344807051999e-06, "loss": 0.0162, "step": 2689 }, { "epoch": 0.9725234996384671, "grad_norm": 0.0936451026002498, "learning_rate": 5.642380548062145e-06, "loss": 0.0161, "step": 2690 }, { "epoch": 0.972885032537961, "grad_norm": 0.1147870774895405, "learning_rate": 5.63941605948562e-06, "loss": 0.0181, "step": 2691 }, { "epoch": 0.9732465654374548, "grad_norm": 0.10508997891159798, "learning_rate": 5.636451342381928e-06, "loss": 0.0181, "step": 2692 }, { "epoch": 0.9736080983369486, "grad_norm": 0.6479437894673369, "learning_rate": 5.633486397810661e-06, "loss": 0.1504, "step": 2693 }, { "epoch": 0.9739696312364425, "grad_norm": 1.530665577505112, "learning_rate": 5.630521226831491e-06, "loss": 0.0762, "step": 2694 }, { "epoch": 0.9743311641359363, "grad_norm": 0.011553409035218986, "learning_rate": 5.627555830504167e-06, "loss": 0.0006, "step": 2695 }, { "epoch": 0.9746926970354303, "grad_norm": 2.28571249065441, "learning_rate": 5.6245902098885205e-06, "loss": 0.0933, "step": 2696 }, { "epoch": 0.9750542299349241, "grad_norm": 0.33622450409149546, "learning_rate": 5.621624366044464e-06, "loss": 0.0145, "step": 2697 }, { "epoch": 0.9754157628344179, "grad_norm": 0.055117946216880584, "learning_rate": 5.6186583000319925e-06, "loss": 0.0035, "step": 2698 }, { "epoch": 0.9757772957339118, "grad_norm": 0.6211164363817463, "learning_rate": 5.615692012911175e-06, "loss": 0.1309, "step": 2699 }, { "epoch": 0.9761388286334056, "grad_norm": 0.13050101274803483, "learning_rate": 5.612725505742161e-06, "loss": 0.0203, "step": 2700 }, { "epoch": 0.9765003615328995, "grad_norm": 0.25067368952706776, "learning_rate": 5.609758779585182e-06, "loss": 0.0352, "step": 2701 }, { "epoch": 0.9768618944323934, "grad_norm": 1.2326512786546733, "learning_rate": 5.606791835500543e-06, "loss": 0.1055, "step": 2702 }, { "epoch": 0.9772234273318872, "grad_norm": 0.15323547628886933, "learning_rate": 5.603824674548629e-06, "loss": 0.0203, "step": 2703 }, { "epoch": 0.9775849602313811, "grad_norm": 0.617835625572925, "learning_rate": 5.600857297789904e-06, "loss": 0.0476, "step": 2704 }, { "epoch": 0.9779464931308749, "grad_norm": 0.001588198715652149, "learning_rate": 5.597889706284909e-06, "loss": 0.0001, "step": 2705 }, { "epoch": 0.9783080260303688, "grad_norm": 0.22017140380118194, "learning_rate": 5.594921901094259e-06, "loss": 0.0283, "step": 2706 }, { "epoch": 0.9786695589298626, "grad_norm": 0.002188815617430732, "learning_rate": 5.591953883278645e-06, "loss": 0.0001, "step": 2707 }, { "epoch": 0.9790310918293564, "grad_norm": 0.0005119475536570048, "learning_rate": 5.58898565389884e-06, "loss": 0.0, "step": 2708 }, { "epoch": 0.9793926247288504, "grad_norm": 1.666655936148385, "learning_rate": 5.5860172140156866e-06, "loss": 0.2695, "step": 2709 }, { "epoch": 0.9797541576283442, "grad_norm": 1.9412451614131354, "learning_rate": 5.583048564690103e-06, "loss": 0.0903, "step": 2710 }, { "epoch": 0.980115690527838, "grad_norm": 0.3422903156036883, "learning_rate": 5.580079706983087e-06, "loss": 0.0182, "step": 2711 }, { "epoch": 0.9804772234273319, "grad_norm": 0.23834825647271485, "learning_rate": 5.577110641955705e-06, "loss": 0.0283, "step": 2712 }, { "epoch": 0.9808387563268257, "grad_norm": 0.08269136560387419, "learning_rate": 5.5741413706691015e-06, "loss": 0.0114, "step": 2713 }, { "epoch": 0.9812002892263196, "grad_norm": 0.18582691038583235, "learning_rate": 5.571171894184494e-06, "loss": 0.0254, "step": 2714 }, { "epoch": 0.9815618221258134, "grad_norm": 0.09604024672282606, "learning_rate": 5.568202213563172e-06, "loss": 0.0161, "step": 2715 }, { "epoch": 0.9819233550253073, "grad_norm": 0.1474039860090385, "learning_rate": 5.565232329866499e-06, "loss": 0.0227, "step": 2716 }, { "epoch": 0.9822848879248012, "grad_norm": 0.8627690036886527, "learning_rate": 5.562262244155909e-06, "loss": 0.0579, "step": 2717 }, { "epoch": 0.982646420824295, "grad_norm": 0.1341043793798538, "learning_rate": 5.559291957492914e-06, "loss": 0.0071, "step": 2718 }, { "epoch": 0.9830079537237889, "grad_norm": 0.3699843055990134, "learning_rate": 5.556321470939089e-06, "loss": 0.0162, "step": 2719 }, { "epoch": 0.9833694866232827, "grad_norm": 0.10372559440508476, "learning_rate": 5.553350785556089e-06, "loss": 0.0161, "step": 2720 }, { "epoch": 0.9837310195227765, "grad_norm": 0.045733541479671985, "learning_rate": 5.550379902405636e-06, "loss": 0.0024, "step": 2721 }, { "epoch": 0.9840925524222705, "grad_norm": 1.014042973493308, "learning_rate": 5.547408822549521e-06, "loss": 0.2012, "step": 2722 }, { "epoch": 0.9844540853217643, "grad_norm": 0.007077671189234269, "learning_rate": 5.544437547049608e-06, "loss": 0.0003, "step": 2723 }, { "epoch": 0.9848156182212582, "grad_norm": 0.419748603258725, "learning_rate": 5.5414660769678296e-06, "loss": 0.0182, "step": 2724 }, { "epoch": 0.985177151120752, "grad_norm": 1.0690720870601342, "learning_rate": 5.538494413366191e-06, "loss": 0.1226, "step": 2725 }, { "epoch": 0.9855386840202458, "grad_norm": 0.771761282324979, "learning_rate": 5.535522557306764e-06, "loss": 0.1602, "step": 2726 }, { "epoch": 0.9859002169197397, "grad_norm": 0.11434104225053418, "learning_rate": 5.532550509851687e-06, "loss": 0.0143, "step": 2727 }, { "epoch": 0.9862617498192335, "grad_norm": 0.7780465227652879, "learning_rate": 5.529578272063169e-06, "loss": 0.1406, "step": 2728 }, { "epoch": 0.9866232827187275, "grad_norm": 0.23461069363451273, "learning_rate": 5.526605845003488e-06, "loss": 0.0315, "step": 2729 }, { "epoch": 0.9869848156182213, "grad_norm": 0.1423174305133621, "learning_rate": 5.52363322973499e-06, "loss": 0.0203, "step": 2730 }, { "epoch": 0.9873463485177151, "grad_norm": 0.5844095072296102, "learning_rate": 5.520660427320088e-06, "loss": 0.2236, "step": 2731 }, { "epoch": 0.987707881417209, "grad_norm": 0.20008956971926511, "learning_rate": 5.517687438821256e-06, "loss": 0.0283, "step": 2732 }, { "epoch": 0.9880694143167028, "grad_norm": 0.037316791128872116, "learning_rate": 5.514714265301045e-06, "loss": 0.0017, "step": 2733 }, { "epoch": 0.9884309472161966, "grad_norm": 0.13389231636966628, "learning_rate": 5.511740907822063e-06, "loss": 0.0181, "step": 2734 }, { "epoch": 0.9887924801156905, "grad_norm": 0.38644459582694135, "learning_rate": 5.508767367446989e-06, "loss": 0.0432, "step": 2735 }, { "epoch": 0.9891540130151844, "grad_norm": 0.09188407129434113, "learning_rate": 5.5057936452385656e-06, "loss": 0.0161, "step": 2736 }, { "epoch": 0.9895155459146783, "grad_norm": 0.15663050090280864, "learning_rate": 5.502819742259599e-06, "loss": 0.0203, "step": 2737 }, { "epoch": 0.9898770788141721, "grad_norm": 0.12148501576237125, "learning_rate": 5.499845659572964e-06, "loss": 0.0071, "step": 2738 }, { "epoch": 0.9902386117136659, "grad_norm": 0.01639579904699984, "learning_rate": 5.496871398241595e-06, "loss": 0.0007, "step": 2739 }, { "epoch": 0.9906001446131598, "grad_norm": 0.6032866049516911, "learning_rate": 5.493896959328493e-06, "loss": 0.1309, "step": 2740 }, { "epoch": 0.9909616775126536, "grad_norm": 0.509792074324749, "learning_rate": 5.490922343896722e-06, "loss": 0.1602, "step": 2741 }, { "epoch": 0.9913232104121475, "grad_norm": 1.7659090893032128, "learning_rate": 5.487947553009409e-06, "loss": 0.0977, "step": 2742 }, { "epoch": 0.9916847433116414, "grad_norm": 0.12955241979519463, "learning_rate": 5.484972587729744e-06, "loss": 0.0081, "step": 2743 }, { "epoch": 0.9920462762111352, "grad_norm": 0.4934619222116009, "learning_rate": 5.481997449120977e-06, "loss": 0.1914, "step": 2744 }, { "epoch": 0.9924078091106291, "grad_norm": 0.4202087149098069, "learning_rate": 5.479022138246425e-06, "loss": 0.0283, "step": 2745 }, { "epoch": 0.9927693420101229, "grad_norm": 0.46786645893836104, "learning_rate": 5.476046656169461e-06, "loss": 0.0635, "step": 2746 }, { "epoch": 0.9931308749096168, "grad_norm": 0.2152213209974383, "learning_rate": 5.473071003953524e-06, "loss": 0.0315, "step": 2747 }, { "epoch": 0.9934924078091106, "grad_norm": 0.8017978845781769, "learning_rate": 5.47009518266211e-06, "loss": 0.0762, "step": 2748 }, { "epoch": 0.9938539407086044, "grad_norm": 0.18160992761117065, "learning_rate": 5.4671191933587746e-06, "loss": 0.0283, "step": 2749 }, { "epoch": 0.9942154736080984, "grad_norm": 0.289116651949262, "learning_rate": 5.464143037107139e-06, "loss": 0.0476, "step": 2750 }, { "epoch": 0.9945770065075922, "grad_norm": 0.003319643859228333, "learning_rate": 5.46116671497088e-06, "loss": 0.0002, "step": 2751 }, { "epoch": 0.9949385394070861, "grad_norm": 0.020357902019871206, "learning_rate": 5.458190228013736e-06, "loss": 0.001, "step": 2752 }, { "epoch": 0.9953000723065799, "grad_norm": 0.5353343002134852, "learning_rate": 5.455213577299499e-06, "loss": 0.1807, "step": 2753 }, { "epoch": 0.9956616052060737, "grad_norm": 0.23025199778688107, "learning_rate": 5.452236763892026e-06, "loss": 0.0129, "step": 2754 }, { "epoch": 0.9960231381055676, "grad_norm": 0.49583572509728774, "learning_rate": 5.4492597888552304e-06, "loss": 0.1309, "step": 2755 }, { "epoch": 0.9963846710050615, "grad_norm": 0.0036891632975371213, "learning_rate": 5.44628265325308e-06, "loss": 0.0002, "step": 2756 }, { "epoch": 0.9967462039045553, "grad_norm": 0.17952715634934854, "learning_rate": 5.443305358149603e-06, "loss": 0.0349, "step": 2757 }, { "epoch": 0.9971077368040492, "grad_norm": 0.08820649738727301, "learning_rate": 5.440327904608886e-06, "loss": 0.0056, "step": 2758 }, { "epoch": 0.997469269703543, "grad_norm": 0.1747573722002725, "learning_rate": 5.4373502936950674e-06, "loss": 0.0315, "step": 2759 }, { "epoch": 0.9978308026030369, "grad_norm": 0.24954356372556727, "learning_rate": 5.434372526472347e-06, "loss": 0.0349, "step": 2760 }, { "epoch": 0.9981923355025307, "grad_norm": 0.39087439892996456, "learning_rate": 5.431394604004977e-06, "loss": 0.0388, "step": 2761 }, { "epoch": 0.9985538684020245, "grad_norm": 0.16507508405002388, "learning_rate": 5.4284165273572665e-06, "loss": 0.0283, "step": 2762 }, { "epoch": 0.9989154013015185, "grad_norm": 0.27797231427371916, "learning_rate": 5.42543829759358e-06, "loss": 0.0476, "step": 2763 }, { "epoch": 0.9992769342010123, "grad_norm": 0.21897134801072288, "learning_rate": 5.422459915778334e-06, "loss": 0.0349, "step": 2764 }, { "epoch": 0.9996384671005062, "grad_norm": 0.5288411517327958, "learning_rate": 5.4194813829760055e-06, "loss": 0.1504, "step": 2765 }, { "epoch": 1.0, "grad_norm": 0.08941947609915558, "learning_rate": 5.416502700251118e-06, "loss": 0.0045, "step": 2766 }, { "epoch": 1.000361532899494, "grad_norm": 0.24194743597610216, "learning_rate": 5.4135238686682545e-06, "loss": 0.0388, "step": 2767 }, { "epoch": 1.0007230657989876, "grad_norm": 0.21935189321759965, "learning_rate": 5.410544889292047e-06, "loss": 0.0315, "step": 2768 }, { "epoch": 1.0010845986984815, "grad_norm": 0.0016789062810246807, "learning_rate": 5.407565763187182e-06, "loss": 0.0001, "step": 2769 }, { "epoch": 1.0014461315979755, "grad_norm": 0.48427412587249824, "learning_rate": 5.404586491418399e-06, "loss": 0.0579, "step": 2770 }, { "epoch": 1.0018076644974692, "grad_norm": 1.576348934769752, "learning_rate": 5.40160707505049e-06, "loss": 0.1226, "step": 2771 }, { "epoch": 1.002169197396963, "grad_norm": 0.17135883306157587, "learning_rate": 5.398627515148298e-06, "loss": 0.0315, "step": 2772 }, { "epoch": 1.002530730296457, "grad_norm": 0.24578520101792292, "learning_rate": 5.3956478127767155e-06, "loss": 0.0115, "step": 2773 }, { "epoch": 1.002892263195951, "grad_norm": 0.35523061389459454, "learning_rate": 5.392667969000688e-06, "loss": 0.0527, "step": 2774 }, { "epoch": 1.0032537960954446, "grad_norm": 0.17465573744205032, "learning_rate": 5.389687984885211e-06, "loss": 0.0283, "step": 2775 }, { "epoch": 1.0036153289949385, "grad_norm": 0.12642884998040677, "learning_rate": 5.3867078614953305e-06, "loss": 0.0227, "step": 2776 }, { "epoch": 1.0039768618944325, "grad_norm": 0.15897244297332977, "learning_rate": 5.383727599896143e-06, "loss": 0.0283, "step": 2777 }, { "epoch": 1.0043383947939262, "grad_norm": 0.12506582110997244, "learning_rate": 5.380747201152792e-06, "loss": 0.0227, "step": 2778 }, { "epoch": 1.00469992769342, "grad_norm": 0.0013248790787909115, "learning_rate": 5.37776666633047e-06, "loss": 0.0001, "step": 2779 }, { "epoch": 1.005061460592914, "grad_norm": 0.19094110400051437, "learning_rate": 5.374785996494423e-06, "loss": 0.0071, "step": 2780 }, { "epoch": 1.0054229934924077, "grad_norm": 0.14139712157570292, "learning_rate": 5.371805192709939e-06, "loss": 0.0254, "step": 2781 }, { "epoch": 1.0057845263919016, "grad_norm": 0.5605167615089163, "learning_rate": 5.3688242560423585e-06, "loss": 0.1699, "step": 2782 }, { "epoch": 1.0061460592913956, "grad_norm": 0.8288339011733215, "learning_rate": 5.365843187557066e-06, "loss": 0.1143, "step": 2783 }, { "epoch": 1.0065075921908895, "grad_norm": 0.8623032927638364, "learning_rate": 5.362861988319495e-06, "loss": 0.0527, "step": 2784 }, { "epoch": 1.0068691250903832, "grad_norm": 0.13980757842620992, "learning_rate": 5.359880659395127e-06, "loss": 0.0227, "step": 2785 }, { "epoch": 1.007230657989877, "grad_norm": 0.4301467368808841, "learning_rate": 5.356899201849487e-06, "loss": 0.0527, "step": 2786 }, { "epoch": 1.007592190889371, "grad_norm": 0.12309329257914002, "learning_rate": 5.353917616748147e-06, "loss": 0.0227, "step": 2787 }, { "epoch": 1.0079537237888647, "grad_norm": 0.14854430522887185, "learning_rate": 5.3509359051567265e-06, "loss": 0.0203, "step": 2788 }, { "epoch": 1.0083152566883586, "grad_norm": 0.7758869682284583, "learning_rate": 5.347954068140886e-06, "loss": 0.0977, "step": 2789 }, { "epoch": 1.0086767895878526, "grad_norm": 0.47425695256094114, "learning_rate": 5.344972106766336e-06, "loss": 0.1504, "step": 2790 }, { "epoch": 1.0090383224873463, "grad_norm": 0.009921109711938686, "learning_rate": 5.341990022098829e-06, "loss": 0.0004, "step": 2791 }, { "epoch": 1.0093998553868402, "grad_norm": 0.06938904326561793, "learning_rate": 5.339007815204157e-06, "loss": 0.0039, "step": 2792 }, { "epoch": 1.009761388286334, "grad_norm": 0.8820798381909076, "learning_rate": 5.336025487148167e-06, "loss": 0.0527, "step": 2793 }, { "epoch": 1.0101229211858278, "grad_norm": 0.776206494242474, "learning_rate": 5.333043038996737e-06, "loss": 0.1055, "step": 2794 }, { "epoch": 1.0104844540853217, "grad_norm": 0.20352832458311024, "learning_rate": 5.3300604718157955e-06, "loss": 0.0283, "step": 2795 }, { "epoch": 1.0108459869848156, "grad_norm": 0.010706147175801362, "learning_rate": 5.327077786671311e-06, "loss": 0.0003, "step": 2796 }, { "epoch": 1.0112075198843096, "grad_norm": 0.001088184531062685, "learning_rate": 5.324094984629293e-06, "loss": 0.0, "step": 2797 }, { "epoch": 1.0115690527838033, "grad_norm": 0.5397798651373978, "learning_rate": 5.321112066755799e-06, "loss": 0.0579, "step": 2798 }, { "epoch": 1.0119305856832972, "grad_norm": 1.2211171060197563, "learning_rate": 5.318129034116918e-06, "loss": 0.0635, "step": 2799 }, { "epoch": 1.0122921185827911, "grad_norm": 0.8634785209437307, "learning_rate": 5.315145887778788e-06, "loss": 0.0527, "step": 2800 }, { "epoch": 1.0126536514822848, "grad_norm": 0.8213985875799121, "learning_rate": 5.312162628807584e-06, "loss": 0.0762, "step": 2801 }, { "epoch": 1.0130151843817787, "grad_norm": 0.0043254291960958275, "learning_rate": 5.3091792582695215e-06, "loss": 0.0002, "step": 2802 }, { "epoch": 1.0133767172812727, "grad_norm": 93.8242689144651, "learning_rate": 5.306195777230859e-06, "loss": 1.3516, "step": 2803 }, { "epoch": 1.0137382501807664, "grad_norm": 0.36277650931824573, "learning_rate": 5.303212186757889e-06, "loss": 0.0283, "step": 2804 }, { "epoch": 1.0140997830802603, "grad_norm": 0.0014391883031976208, "learning_rate": 5.300228487916949e-06, "loss": 0.0001, "step": 2805 }, { "epoch": 1.0144613159797542, "grad_norm": 0.2540200923836589, "learning_rate": 5.29724468177441e-06, "loss": 0.0283, "step": 2806 }, { "epoch": 1.0148228488792481, "grad_norm": 0.3373903333039065, "learning_rate": 5.294260769396683e-06, "loss": 0.0476, "step": 2807 }, { "epoch": 1.0151843817787418, "grad_norm": 0.24577406510171776, "learning_rate": 5.291276751850222e-06, "loss": 0.0315, "step": 2808 }, { "epoch": 1.0155459146782357, "grad_norm": 0.8686086704181973, "learning_rate": 5.288292630201508e-06, "loss": 0.0476, "step": 2809 }, { "epoch": 1.0159074475777297, "grad_norm": 0.0006763022818982293, "learning_rate": 5.285308405517071e-06, "loss": 0.0, "step": 2810 }, { "epoch": 1.0162689804772234, "grad_norm": 1.1128875922987436, "learning_rate": 5.2823240788634685e-06, "loss": 0.1406, "step": 2811 }, { "epoch": 1.0166305133767173, "grad_norm": 0.15925168582331756, "learning_rate": 5.279339651307301e-06, "loss": 0.0254, "step": 2812 }, { "epoch": 1.0169920462762112, "grad_norm": 0.46201884485420325, "learning_rate": 5.276355123915203e-06, "loss": 0.0432, "step": 2813 }, { "epoch": 1.017353579175705, "grad_norm": 0.18311310701544492, "learning_rate": 5.273370497753839e-06, "loss": 0.0317, "step": 2814 }, { "epoch": 1.0177151120751988, "grad_norm": 0.5980603692162363, "learning_rate": 5.270385773889918e-06, "loss": 0.1055, "step": 2815 }, { "epoch": 1.0180766449746927, "grad_norm": 0.028368127529323963, "learning_rate": 5.267400953390177e-06, "loss": 0.0013, "step": 2816 }, { "epoch": 1.0184381778741864, "grad_norm": 0.20800937692326948, "learning_rate": 5.2644160373213935e-06, "loss": 0.0102, "step": 2817 }, { "epoch": 1.0187997107736804, "grad_norm": 1.0838913683754505, "learning_rate": 5.2614310267503745e-06, "loss": 0.1226, "step": 2818 }, { "epoch": 1.0191612436731743, "grad_norm": 0.635521036530611, "learning_rate": 5.25844592274396e-06, "loss": 0.1226, "step": 2819 }, { "epoch": 1.0195227765726682, "grad_norm": 0.725316339169824, "learning_rate": 5.2554607263690285e-06, "loss": 0.1143, "step": 2820 }, { "epoch": 1.019884309472162, "grad_norm": 0.13938792261412894, "learning_rate": 5.252475438692486e-06, "loss": 0.0254, "step": 2821 }, { "epoch": 1.0202458423716558, "grad_norm": 0.1919901287582335, "learning_rate": 5.249490060781276e-06, "loss": 0.0283, "step": 2822 }, { "epoch": 1.0206073752711498, "grad_norm": 0.04980843532010587, "learning_rate": 5.2465045937023704e-06, "loss": 0.0019, "step": 2823 }, { "epoch": 1.0209689081706435, "grad_norm": 0.0016435878407853365, "learning_rate": 5.2435190385227765e-06, "loss": 0.0001, "step": 2824 }, { "epoch": 1.0213304410701374, "grad_norm": 0.6382040421658566, "learning_rate": 5.240533396309528e-06, "loss": 0.0476, "step": 2825 }, { "epoch": 1.0216919739696313, "grad_norm": 0.7168084808553526, "learning_rate": 5.237547668129694e-06, "loss": 0.0527, "step": 2826 }, { "epoch": 1.022053506869125, "grad_norm": 0.23684351346946683, "learning_rate": 5.234561855050375e-06, "loss": 0.0352, "step": 2827 }, { "epoch": 1.022415039768619, "grad_norm": 11.02624829993462, "learning_rate": 5.2315759581386985e-06, "loss": 0.6641, "step": 2828 }, { "epoch": 1.0227765726681128, "grad_norm": 0.33495326393279945, "learning_rate": 5.228589978461824e-06, "loss": 0.0388, "step": 2829 }, { "epoch": 1.0231381055676068, "grad_norm": 0.2095792570821744, "learning_rate": 5.225603917086938e-06, "loss": 0.0254, "step": 2830 }, { "epoch": 1.0234996384671005, "grad_norm": 1.8395202294396966, "learning_rate": 5.222617775081259e-06, "loss": 0.1406, "step": 2831 }, { "epoch": 1.0238611713665944, "grad_norm": 1.207045155152342, "learning_rate": 5.219631553512034e-06, "loss": 0.1055, "step": 2832 }, { "epoch": 1.0242227042660883, "grad_norm": 0.31987229140659107, "learning_rate": 5.21664525344654e-06, "loss": 0.0352, "step": 2833 }, { "epoch": 1.024584237165582, "grad_norm": 0.0011998877191760399, "learning_rate": 5.2136588759520775e-06, "loss": 0.0, "step": 2834 }, { "epoch": 1.024945770065076, "grad_norm": 0.6644059177943337, "learning_rate": 5.210672422095978e-06, "loss": 0.0635, "step": 2835 }, { "epoch": 1.0253073029645698, "grad_norm": 0.11394417989462886, "learning_rate": 5.207685892945599e-06, "loss": 0.0044, "step": 2836 }, { "epoch": 1.0256688358640635, "grad_norm": 0.10194223924654201, "learning_rate": 5.204699289568326e-06, "loss": 0.0056, "step": 2837 }, { "epoch": 1.0260303687635575, "grad_norm": 2.2457496028721797, "learning_rate": 5.20171261303157e-06, "loss": 0.0283, "step": 2838 }, { "epoch": 1.0263919016630514, "grad_norm": 0.17364628690494976, "learning_rate": 5.198725864402768e-06, "loss": 0.0254, "step": 2839 }, { "epoch": 1.026753434562545, "grad_norm": 0.10507890548889053, "learning_rate": 5.195739044749385e-06, "loss": 0.0181, "step": 2840 }, { "epoch": 1.027114967462039, "grad_norm": 0.7058385263982379, "learning_rate": 5.192752155138907e-06, "loss": 0.1226, "step": 2841 }, { "epoch": 1.027476500361533, "grad_norm": 0.9207204294850175, "learning_rate": 5.189765196638852e-06, "loss": 0.1807, "step": 2842 }, { "epoch": 1.0278380332610269, "grad_norm": 0.16112718008390545, "learning_rate": 5.186778170316754e-06, "loss": 0.0227, "step": 2843 }, { "epoch": 1.0281995661605206, "grad_norm": 0.027631757767314757, "learning_rate": 5.183791077240178e-06, "loss": 0.0013, "step": 2844 }, { "epoch": 1.0285610990600145, "grad_norm": 1.0672433709373321, "learning_rate": 5.18080391847671e-06, "loss": 0.0977, "step": 2845 }, { "epoch": 1.0289226319595084, "grad_norm": 0.09265718807193075, "learning_rate": 5.177816695093958e-06, "loss": 0.0161, "step": 2846 }, { "epoch": 1.029284164859002, "grad_norm": 0.14987429481546613, "learning_rate": 5.174829408159558e-06, "loss": 0.0203, "step": 2847 }, { "epoch": 1.029645697758496, "grad_norm": 0.0031987130476004923, "learning_rate": 5.171842058741166e-06, "loss": 0.0001, "step": 2848 }, { "epoch": 1.03000723065799, "grad_norm": 0.8178285803061129, "learning_rate": 5.168854647906456e-06, "loss": 0.0527, "step": 2849 }, { "epoch": 1.0303687635574836, "grad_norm": 0.7844581532450796, "learning_rate": 5.165867176723132e-06, "loss": 0.0693, "step": 2850 }, { "epoch": 1.0307302964569776, "grad_norm": 0.36298988802956383, "learning_rate": 5.162879646258913e-06, "loss": 0.0283, "step": 2851 }, { "epoch": 1.0310918293564715, "grad_norm": 0.005208040514482499, "learning_rate": 5.159892057581542e-06, "loss": 0.0002, "step": 2852 }, { "epoch": 1.0314533622559654, "grad_norm": 1.116305740791178, "learning_rate": 5.156904411758785e-06, "loss": 0.1055, "step": 2853 }, { "epoch": 1.031814895155459, "grad_norm": 0.10140512619408838, "learning_rate": 5.153916709858423e-06, "loss": 0.0143, "step": 2854 }, { "epoch": 1.032176428054953, "grad_norm": 0.21561875854227905, "learning_rate": 5.1509289529482645e-06, "loss": 0.0317, "step": 2855 }, { "epoch": 1.032537960954447, "grad_norm": 0.36208403707841236, "learning_rate": 5.147941142096127e-06, "loss": 0.0182, "step": 2856 }, { "epoch": 1.0328994938539406, "grad_norm": 0.8154336083458242, "learning_rate": 5.144953278369858e-06, "loss": 0.2012, "step": 2857 }, { "epoch": 1.0332610267534346, "grad_norm": 0.09284757581885587, "learning_rate": 5.141965362837317e-06, "loss": 0.0161, "step": 2858 }, { "epoch": 1.0336225596529285, "grad_norm": 0.7162567067237137, "learning_rate": 5.138977396566384e-06, "loss": 0.1914, "step": 2859 }, { "epoch": 1.0339840925524222, "grad_norm": 0.5246528844942809, "learning_rate": 5.135989380624962e-06, "loss": 0.1807, "step": 2860 }, { "epoch": 1.034345625451916, "grad_norm": 0.0012123610779822846, "learning_rate": 5.133001316080961e-06, "loss": 0.0, "step": 2861 }, { "epoch": 1.03470715835141, "grad_norm": 0.10313083149973683, "learning_rate": 5.13001320400232e-06, "loss": 0.0161, "step": 2862 }, { "epoch": 1.0350686912509037, "grad_norm": 0.15240713545562187, "learning_rate": 5.127025045456986e-06, "loss": 0.0227, "step": 2863 }, { "epoch": 1.0354302241503976, "grad_norm": 0.017406856183586044, "learning_rate": 5.124036841512927e-06, "loss": 0.0008, "step": 2864 }, { "epoch": 1.0357917570498916, "grad_norm": 0.03271496410110351, "learning_rate": 5.121048593238129e-06, "loss": 0.0017, "step": 2865 }, { "epoch": 1.0361532899493855, "grad_norm": 0.09013884428989807, "learning_rate": 5.118060301700588e-06, "loss": 0.0143, "step": 2866 }, { "epoch": 1.0365148228488792, "grad_norm": 0.7846383212866118, "learning_rate": 5.1150719679683205e-06, "loss": 0.1143, "step": 2867 }, { "epoch": 1.0368763557483731, "grad_norm": 0.13952535870060584, "learning_rate": 5.112083593109356e-06, "loss": 0.0227, "step": 2868 }, { "epoch": 1.037237888647867, "grad_norm": 0.4564572730738217, "learning_rate": 5.109095178191739e-06, "loss": 0.1504, "step": 2869 }, { "epoch": 1.0375994215473607, "grad_norm": 0.18231314934358944, "learning_rate": 5.106106724283529e-06, "loss": 0.0227, "step": 2870 }, { "epoch": 1.0379609544468547, "grad_norm": 0.09633845935423485, "learning_rate": 5.103118232452796e-06, "loss": 0.0181, "step": 2871 }, { "epoch": 1.0383224873463486, "grad_norm": 0.0604779547585594, "learning_rate": 5.10012970376763e-06, "loss": 0.0034, "step": 2872 }, { "epoch": 1.0386840202458423, "grad_norm": 0.1861445338104647, "learning_rate": 5.097141139296129e-06, "loss": 0.0254, "step": 2873 }, { "epoch": 1.0390455531453362, "grad_norm": 0.15400369673797334, "learning_rate": 5.094152540106404e-06, "loss": 0.0227, "step": 2874 }, { "epoch": 1.0394070860448301, "grad_norm": 3.521995181393027, "learning_rate": 5.091163907266584e-06, "loss": 0.1836, "step": 2875 }, { "epoch": 1.0397686189443238, "grad_norm": 0.12446870145472089, "learning_rate": 5.0881752418448e-06, "loss": 0.0254, "step": 2876 }, { "epoch": 1.0401301518438177, "grad_norm": 0.162101689261527, "learning_rate": 5.085186544909204e-06, "loss": 0.0254, "step": 2877 }, { "epoch": 1.0404916847433117, "grad_norm": 0.011480825505296314, "learning_rate": 5.082197817527955e-06, "loss": 0.0004, "step": 2878 }, { "epoch": 1.0408532176428056, "grad_norm": 0.7582403986550905, "learning_rate": 5.0792090607692235e-06, "loss": 0.0693, "step": 2879 }, { "epoch": 1.0412147505422993, "grad_norm": 0.21152095833012868, "learning_rate": 5.076220275701191e-06, "loss": 0.0071, "step": 2880 }, { "epoch": 1.0415762834417932, "grad_norm": 0.625262400886256, "learning_rate": 5.073231463392047e-06, "loss": 0.1143, "step": 2881 }, { "epoch": 1.0419378163412871, "grad_norm": 0.5292951213791459, "learning_rate": 5.0702426249099935e-06, "loss": 0.1504, "step": 2882 }, { "epoch": 1.0422993492407808, "grad_norm": 1.2399629681591904, "learning_rate": 5.0672537613232405e-06, "loss": 0.1055, "step": 2883 }, { "epoch": 1.0426608821402747, "grad_norm": 0.626192377525811, "learning_rate": 5.0642648737000066e-06, "loss": 0.1143, "step": 2884 }, { "epoch": 1.0430224150397687, "grad_norm": 0.7155401115412208, "learning_rate": 5.061275963108524e-06, "loss": 0.1504, "step": 2885 }, { "epoch": 1.0433839479392624, "grad_norm": 0.6553985855388125, "learning_rate": 5.058287030617022e-06, "loss": 0.0432, "step": 2886 }, { "epoch": 1.0437454808387563, "grad_norm": 0.6506750521899987, "learning_rate": 5.055298077293748e-06, "loss": 0.0762, "step": 2887 }, { "epoch": 1.0441070137382502, "grad_norm": 0.673788670675608, "learning_rate": 5.052309104206953e-06, "loss": 0.0579, "step": 2888 }, { "epoch": 1.0444685466377441, "grad_norm": 0.6224961106964517, "learning_rate": 5.049320112424895e-06, "loss": 0.0388, "step": 2889 }, { "epoch": 1.0448300795372378, "grad_norm": 0.29678169918539526, "learning_rate": 5.046331103015839e-06, "loss": 0.0432, "step": 2890 }, { "epoch": 1.0451916124367318, "grad_norm": 0.4383581474664001, "learning_rate": 5.043342077048058e-06, "loss": 0.0204, "step": 2891 }, { "epoch": 1.0455531453362257, "grad_norm": 0.014326858280020447, "learning_rate": 5.040353035589826e-06, "loss": 0.0005, "step": 2892 }, { "epoch": 1.0459146782357194, "grad_norm": 0.45383263044490824, "learning_rate": 5.037363979709428e-06, "loss": 0.1226, "step": 2893 }, { "epoch": 1.0462762111352133, "grad_norm": 0.41880195223351957, "learning_rate": 5.034374910475153e-06, "loss": 0.0432, "step": 2894 }, { "epoch": 1.0466377440347072, "grad_norm": 0.5095785997696609, "learning_rate": 5.031385828955291e-06, "loss": 0.0527, "step": 2895 }, { "epoch": 1.046999276934201, "grad_norm": 0.16581721462850452, "learning_rate": 5.028396736218141e-06, "loss": 0.0283, "step": 2896 }, { "epoch": 1.0473608098336948, "grad_norm": 0.2312646402884425, "learning_rate": 5.025407633332003e-06, "loss": 0.0388, "step": 2897 }, { "epoch": 1.0477223427331888, "grad_norm": 0.22224196073625588, "learning_rate": 5.022418521365182e-06, "loss": 0.0349, "step": 2898 }, { "epoch": 1.0480838756326825, "grad_norm": 0.5070858325070965, "learning_rate": 5.019429401385985e-06, "loss": 0.0391, "step": 2899 }, { "epoch": 1.0484454085321764, "grad_norm": 0.24556921160473338, "learning_rate": 5.0164402744627275e-06, "loss": 0.0315, "step": 2900 }, { "epoch": 1.0488069414316703, "grad_norm": 0.5850036984080316, "learning_rate": 5.0134511416637164e-06, "loss": 0.0527, "step": 2901 }, { "epoch": 1.0491684743311642, "grad_norm": 0.0018234599394852966, "learning_rate": 5.010462004057272e-06, "loss": 0.0001, "step": 2902 }, { "epoch": 1.049530007230658, "grad_norm": 0.5662242032690553, "learning_rate": 5.007472862711708e-06, "loss": 0.0977, "step": 2903 }, { "epoch": 1.0498915401301518, "grad_norm": 1.2755526091583593, "learning_rate": 5.004483718695345e-06, "loss": 0.1143, "step": 2904 }, { "epoch": 1.0502530730296458, "grad_norm": 0.006403064252139817, "learning_rate": 5.0014945730765015e-06, "loss": 0.0002, "step": 2905 }, { "epoch": 1.0506146059291395, "grad_norm": 0.08961067065258309, "learning_rate": 4.9985054269234985e-06, "loss": 0.0031, "step": 2906 }, { "epoch": 1.0509761388286334, "grad_norm": 0.27377804803389033, "learning_rate": 4.9955162813046565e-06, "loss": 0.0388, "step": 2907 }, { "epoch": 1.0513376717281273, "grad_norm": 0.584207361288726, "learning_rate": 4.9925271372882925e-06, "loss": 0.0635, "step": 2908 }, { "epoch": 1.051699204627621, "grad_norm": 0.21492190974008707, "learning_rate": 4.98953799594273e-06, "loss": 0.0115, "step": 2909 }, { "epoch": 1.052060737527115, "grad_norm": 0.007936678716615967, "learning_rate": 4.986548858336286e-06, "loss": 0.0004, "step": 2910 }, { "epoch": 1.0524222704266089, "grad_norm": 0.028002846667768037, "learning_rate": 4.983559725537273e-06, "loss": 0.0012, "step": 2911 }, { "epoch": 1.0527838033261028, "grad_norm": 0.5720972593959619, "learning_rate": 4.9805705986140155e-06, "loss": 0.0693, "step": 2912 }, { "epoch": 1.0531453362255965, "grad_norm": 0.00240562648875095, "learning_rate": 4.977581478634819e-06, "loss": 0.0001, "step": 2913 }, { "epoch": 1.0535068691250904, "grad_norm": 2.4384715086748567, "learning_rate": 4.974592366667998e-06, "loss": 0.3789, "step": 2914 }, { "epoch": 1.0538684020245843, "grad_norm": 1.0876212586785134, "learning_rate": 4.971603263781862e-06, "loss": 0.0977, "step": 2915 }, { "epoch": 1.054229934924078, "grad_norm": 0.4046984113127395, "learning_rate": 4.96861417104471e-06, "loss": 0.0476, "step": 2916 }, { "epoch": 1.054591467823572, "grad_norm": 0.5514078299687779, "learning_rate": 4.965625089524849e-06, "loss": 0.0762, "step": 2917 }, { "epoch": 1.0549530007230659, "grad_norm": 0.6397547335816097, "learning_rate": 4.9626360202905725e-06, "loss": 0.043, "step": 2918 }, { "epoch": 1.0553145336225596, "grad_norm": 0.48126868129283606, "learning_rate": 4.959646964410175e-06, "loss": 0.1226, "step": 2919 }, { "epoch": 1.0556760665220535, "grad_norm": 0.06526921500620587, "learning_rate": 4.9566579229519455e-06, "loss": 0.0031, "step": 2920 }, { "epoch": 1.0560375994215474, "grad_norm": 0.131491590465938, "learning_rate": 4.953668896984161e-06, "loss": 0.0071, "step": 2921 }, { "epoch": 1.056399132321041, "grad_norm": 0.598767330850367, "learning_rate": 4.950679887575107e-06, "loss": 0.0762, "step": 2922 }, { "epoch": 1.056760665220535, "grad_norm": 0.1653529815014217, "learning_rate": 4.947690895793049e-06, "loss": 0.0254, "step": 2923 }, { "epoch": 1.057122198120029, "grad_norm": 0.11023167748988544, "learning_rate": 4.944701922706254e-06, "loss": 0.0071, "step": 2924 }, { "epoch": 1.0574837310195229, "grad_norm": 0.33208006228061543, "learning_rate": 4.941712969382981e-06, "loss": 0.0476, "step": 2925 }, { "epoch": 1.0578452639190166, "grad_norm": 0.08903871155722376, "learning_rate": 4.938724036891478e-06, "loss": 0.0031, "step": 2926 }, { "epoch": 1.0582067968185105, "grad_norm": 0.045412039536164135, "learning_rate": 4.935735126299994e-06, "loss": 0.0022, "step": 2927 }, { "epoch": 1.0585683297180044, "grad_norm": 0.5528018254209127, "learning_rate": 4.93274623867676e-06, "loss": 0.1143, "step": 2928 }, { "epoch": 1.058929862617498, "grad_norm": 0.6294737716095491, "learning_rate": 4.929757375090008e-06, "loss": 0.083, "step": 2929 }, { "epoch": 1.059291395516992, "grad_norm": 0.15337462346141043, "learning_rate": 4.9267685366079556e-06, "loss": 0.0315, "step": 2930 }, { "epoch": 1.059652928416486, "grad_norm": 0.25189421970702264, "learning_rate": 4.92377972429881e-06, "loss": 0.0388, "step": 2931 }, { "epoch": 1.0600144613159797, "grad_norm": 0.03557866671091932, "learning_rate": 4.920790939230778e-06, "loss": 0.0019, "step": 2932 }, { "epoch": 1.0603759942154736, "grad_norm": 0.27425271302448273, "learning_rate": 4.917802182472046e-06, "loss": 0.0388, "step": 2933 }, { "epoch": 1.0607375271149675, "grad_norm": 0.24004256112930591, "learning_rate": 4.914813455090797e-06, "loss": 0.0145, "step": 2934 }, { "epoch": 1.0610990600144614, "grad_norm": 0.0010271471018672967, "learning_rate": 4.911824758155201e-06, "loss": 0.0, "step": 2935 }, { "epoch": 1.0614605929139551, "grad_norm": 0.19535940145713152, "learning_rate": 4.908836092733417e-06, "loss": 0.0349, "step": 2936 }, { "epoch": 1.061822125813449, "grad_norm": 0.34031415329609493, "learning_rate": 4.905847459893597e-06, "loss": 0.0388, "step": 2937 }, { "epoch": 1.062183658712943, "grad_norm": 0.7419763110948553, "learning_rate": 4.902858860703872e-06, "loss": 0.0432, "step": 2938 }, { "epoch": 1.0625451916124367, "grad_norm": 0.44371790236347514, "learning_rate": 4.899870296232371e-06, "loss": 0.1143, "step": 2939 }, { "epoch": 1.0629067245119306, "grad_norm": 0.5938761542427745, "learning_rate": 4.896881767547205e-06, "loss": 0.1406, "step": 2940 }, { "epoch": 1.0632682574114245, "grad_norm": 0.33251523619293766, "learning_rate": 4.893893275716472e-06, "loss": 0.0315, "step": 2941 }, { "epoch": 1.0636297903109182, "grad_norm": 0.6036220438280604, "learning_rate": 4.890904821808263e-06, "loss": 0.1309, "step": 2942 }, { "epoch": 1.0639913232104121, "grad_norm": 0.18439258388826496, "learning_rate": 4.887916406890645e-06, "loss": 0.0315, "step": 2943 }, { "epoch": 1.064352856109906, "grad_norm": 0.0009354432777971604, "learning_rate": 4.88492803203168e-06, "loss": 0.0, "step": 2944 }, { "epoch": 1.0647143890093997, "grad_norm": 0.22028416387959457, "learning_rate": 4.881939698299413e-06, "loss": 0.0388, "step": 2945 }, { "epoch": 1.0650759219088937, "grad_norm": 0.39390017613993644, "learning_rate": 4.878951406761872e-06, "loss": 0.0388, "step": 2946 }, { "epoch": 1.0654374548083876, "grad_norm": 0.1410011646155064, "learning_rate": 4.875963158487074e-06, "loss": 0.0254, "step": 2947 }, { "epoch": 1.0657989877078815, "grad_norm": 1.3332716643186682, "learning_rate": 4.872974954543015e-06, "loss": 0.2129, "step": 2948 }, { "epoch": 1.0661605206073752, "grad_norm": 0.023397084293810703, "learning_rate": 4.8699867959976824e-06, "loss": 0.0004, "step": 2949 }, { "epoch": 1.0665220535068691, "grad_norm": 0.43933605409139925, "learning_rate": 4.866998683919041e-06, "loss": 0.0254, "step": 2950 }, { "epoch": 1.066883586406363, "grad_norm": 0.0006455160205819056, "learning_rate": 4.86401061937504e-06, "loss": 0.0, "step": 2951 }, { "epoch": 1.0672451193058567, "grad_norm": 1.0440221809260029, "learning_rate": 4.861022603433617e-06, "loss": 0.1309, "step": 2952 }, { "epoch": 1.0676066522053507, "grad_norm": 1.5308088464917644, "learning_rate": 4.8580346371626855e-06, "loss": 0.1807, "step": 2953 }, { "epoch": 1.0679681851048446, "grad_norm": 0.05377093849346154, "learning_rate": 4.855046721630145e-06, "loss": 0.0027, "step": 2954 }, { "epoch": 1.0683297180043383, "grad_norm": 1.3990769979980924, "learning_rate": 4.8520588579038755e-06, "loss": 0.0527, "step": 2955 }, { "epoch": 1.0686912509038322, "grad_norm": 0.7387491120459364, "learning_rate": 4.849071047051738e-06, "loss": 0.0977, "step": 2956 }, { "epoch": 1.0690527838033261, "grad_norm": 0.26694212880616686, "learning_rate": 4.846083290141578e-06, "loss": 0.0352, "step": 2957 }, { "epoch": 1.06941431670282, "grad_norm": 0.13949233312864318, "learning_rate": 4.843095588241216e-06, "loss": 0.0283, "step": 2958 }, { "epoch": 1.0697758496023138, "grad_norm": 2.4136129465557334, "learning_rate": 4.840107942418459e-06, "loss": 0.2129, "step": 2959 }, { "epoch": 1.0701373825018077, "grad_norm": 0.0012613864250800085, "learning_rate": 4.83712035374109e-06, "loss": 0.0001, "step": 2960 }, { "epoch": 1.0704989154013016, "grad_norm": 0.2297924300201102, "learning_rate": 4.83413282327687e-06, "loss": 0.0388, "step": 2961 }, { "epoch": 1.0708604483007953, "grad_norm": 0.15392966906551286, "learning_rate": 4.831145352093547e-06, "loss": 0.0283, "step": 2962 }, { "epoch": 1.0712219812002892, "grad_norm": 0.46765263503813065, "learning_rate": 4.828157941258837e-06, "loss": 0.1226, "step": 2963 }, { "epoch": 1.0715835140997831, "grad_norm": 0.06345238325594774, "learning_rate": 4.825170591840443e-06, "loss": 0.0027, "step": 2964 }, { "epoch": 1.0719450469992768, "grad_norm": 0.001977803542238724, "learning_rate": 4.822183304906043e-06, "loss": 0.0001, "step": 2965 }, { "epoch": 1.0723065798987708, "grad_norm": 0.2572678228432896, "learning_rate": 4.819196081523291e-06, "loss": 0.0315, "step": 2966 }, { "epoch": 1.0726681127982647, "grad_norm": 0.486435770784327, "learning_rate": 4.816208922759824e-06, "loss": 0.0527, "step": 2967 }, { "epoch": 1.0730296456977584, "grad_norm": 0.909637623425966, "learning_rate": 4.8132218296832475e-06, "loss": 0.0903, "step": 2968 }, { "epoch": 1.0733911785972523, "grad_norm": 0.10783993764137668, "learning_rate": 4.81023480336115e-06, "loss": 0.0203, "step": 2969 }, { "epoch": 1.0737527114967462, "grad_norm": 0.47126343960404493, "learning_rate": 4.8072478448610935e-06, "loss": 0.1602, "step": 2970 }, { "epoch": 1.0741142443962401, "grad_norm": 0.0013697634999671163, "learning_rate": 4.804260955250616e-06, "loss": 0.0001, "step": 2971 }, { "epoch": 1.0744757772957338, "grad_norm": 0.33213357474101307, "learning_rate": 4.8012741355972344e-06, "loss": 0.0476, "step": 2972 }, { "epoch": 1.0748373101952278, "grad_norm": 0.1327787362694284, "learning_rate": 4.7982873869684315e-06, "loss": 0.0254, "step": 2973 }, { "epoch": 1.0751988430947217, "grad_norm": 0.4191894200441296, "learning_rate": 4.795300710431676e-06, "loss": 0.0228, "step": 2974 }, { "epoch": 1.0755603759942154, "grad_norm": 0.5223014169909954, "learning_rate": 4.792314107054403e-06, "loss": 0.0162, "step": 2975 }, { "epoch": 1.0759219088937093, "grad_norm": 0.20181127158712045, "learning_rate": 4.789327577904023e-06, "loss": 0.0129, "step": 2976 }, { "epoch": 1.0762834417932032, "grad_norm": 0.002150861932387768, "learning_rate": 4.786341124047925e-06, "loss": 0.0001, "step": 2977 }, { "epoch": 1.076644974692697, "grad_norm": 0.15212051581384609, "learning_rate": 4.78335474655346e-06, "loss": 0.0315, "step": 2978 }, { "epoch": 1.0770065075921909, "grad_norm": 0.29889488395432545, "learning_rate": 4.7803684464879665e-06, "loss": 0.0391, "step": 2979 }, { "epoch": 1.0773680404916848, "grad_norm": 0.47126542701353885, "learning_rate": 4.777382224918742e-06, "loss": 0.1143, "step": 2980 }, { "epoch": 1.0777295733911787, "grad_norm": 0.15078487874547192, "learning_rate": 4.774396082913064e-06, "loss": 0.0254, "step": 2981 }, { "epoch": 1.0780911062906724, "grad_norm": 0.08798835055035774, "learning_rate": 4.77141002153818e-06, "loss": 0.0035, "step": 2982 }, { "epoch": 1.0784526391901663, "grad_norm": 0.14979298028412114, "learning_rate": 4.768424041861302e-06, "loss": 0.0283, "step": 2983 }, { "epoch": 1.0788141720896602, "grad_norm": 0.12076304210128685, "learning_rate": 4.765438144949626e-06, "loss": 0.0227, "step": 2984 }, { "epoch": 1.079175704989154, "grad_norm": 0.005600677802445194, "learning_rate": 4.762452331870306e-06, "loss": 0.0002, "step": 2985 }, { "epoch": 1.0795372378886479, "grad_norm": 0.6381799908572087, "learning_rate": 4.759466603690473e-06, "loss": 0.0527, "step": 2986 }, { "epoch": 1.0798987707881418, "grad_norm": 0.14824038582582882, "learning_rate": 4.756480961477226e-06, "loss": 0.0254, "step": 2987 }, { "epoch": 1.0802603036876355, "grad_norm": 0.25366878603629023, "learning_rate": 4.753495406297629e-06, "loss": 0.0352, "step": 2988 }, { "epoch": 1.0806218365871294, "grad_norm": 0.12821390338259409, "learning_rate": 4.750509939218725e-06, "loss": 0.0203, "step": 2989 }, { "epoch": 1.0809833694866233, "grad_norm": 0.3252805008972628, "learning_rate": 4.747524561307515e-06, "loss": 0.0203, "step": 2990 }, { "epoch": 1.081344902386117, "grad_norm": 0.20691306262012696, "learning_rate": 4.744539273630973e-06, "loss": 0.0115, "step": 2991 }, { "epoch": 1.081706435285611, "grad_norm": 0.1104530854897784, "learning_rate": 4.741554077256042e-06, "loss": 0.0203, "step": 2992 }, { "epoch": 1.0820679681851049, "grad_norm": 0.09597791800312247, "learning_rate": 4.738568973249626e-06, "loss": 0.0181, "step": 2993 }, { "epoch": 1.0824295010845988, "grad_norm": 0.14495820894563832, "learning_rate": 4.735583962678607e-06, "loss": 0.0227, "step": 2994 }, { "epoch": 1.0827910339840925, "grad_norm": 0.46356865480760473, "learning_rate": 4.7325990466098234e-06, "loss": 0.1504, "step": 2995 }, { "epoch": 1.0831525668835864, "grad_norm": 0.19793265684710185, "learning_rate": 4.729614226110084e-06, "loss": 0.0254, "step": 2996 }, { "epoch": 1.0835140997830803, "grad_norm": 0.4456968104700872, "learning_rate": 4.726629502246163e-06, "loss": 0.0145, "step": 2997 }, { "epoch": 1.083875632682574, "grad_norm": 1.1983514338976002, "learning_rate": 4.723644876084799e-06, "loss": 0.0635, "step": 2998 }, { "epoch": 1.084237165582068, "grad_norm": 0.09392267465987111, "learning_rate": 4.7206603486927e-06, "loss": 0.0161, "step": 2999 }, { "epoch": 1.0845986984815619, "grad_norm": 0.07888571258793822, "learning_rate": 4.7176759211365315e-06, "loss": 0.0143, "step": 3000 }, { "epoch": 1.0849602313810556, "grad_norm": 0.4846872942861042, "learning_rate": 4.7146915944829305e-06, "loss": 0.1699, "step": 3001 }, { "epoch": 1.0853217642805495, "grad_norm": 0.15061867314482558, "learning_rate": 4.711707369798495e-06, "loss": 0.0227, "step": 3002 }, { "epoch": 1.0856832971800434, "grad_norm": 0.0968704372267266, "learning_rate": 4.70872324814978e-06, "loss": 0.0182, "step": 3003 }, { "epoch": 1.0860448300795373, "grad_norm": 0.1844570140070956, "learning_rate": 4.705739230603319e-06, "loss": 0.0283, "step": 3004 }, { "epoch": 1.086406362979031, "grad_norm": 0.2758785720504394, "learning_rate": 4.702755318225592e-06, "loss": 0.0283, "step": 3005 }, { "epoch": 1.086767895878525, "grad_norm": 0.14087914141300098, "learning_rate": 4.699771512083053e-06, "loss": 0.0071, "step": 3006 }, { "epoch": 1.0871294287780189, "grad_norm": 0.5899471277644017, "learning_rate": 4.696787813242114e-06, "loss": 0.1807, "step": 3007 }, { "epoch": 1.0874909616775126, "grad_norm": 0.0026347897939791257, "learning_rate": 4.6938042227691425e-06, "loss": 0.0001, "step": 3008 }, { "epoch": 1.0878524945770065, "grad_norm": 0.1682257134988823, "learning_rate": 4.69082074173048e-06, "loss": 0.0254, "step": 3009 }, { "epoch": 1.0882140274765004, "grad_norm": 0.09080730045325187, "learning_rate": 4.6878373711924175e-06, "loss": 0.0161, "step": 3010 }, { "epoch": 1.0885755603759941, "grad_norm": 1.1942204637134834, "learning_rate": 4.684854112221214e-06, "loss": 0.1143, "step": 3011 }, { "epoch": 1.088937093275488, "grad_norm": 0.12186337235425596, "learning_rate": 4.681870965883085e-06, "loss": 0.0227, "step": 3012 }, { "epoch": 1.089298626174982, "grad_norm": 0.5775658876761278, "learning_rate": 4.6788879332442025e-06, "loss": 0.1309, "step": 3013 }, { "epoch": 1.0896601590744757, "grad_norm": 0.15713551161956293, "learning_rate": 4.675905015370708e-06, "loss": 0.0227, "step": 3014 }, { "epoch": 1.0900216919739696, "grad_norm": 0.15006234465314097, "learning_rate": 4.672922213328691e-06, "loss": 0.0254, "step": 3015 }, { "epoch": 1.0903832248734635, "grad_norm": 0.04968456317766555, "learning_rate": 4.669939528184206e-06, "loss": 0.0024, "step": 3016 }, { "epoch": 1.0907447577729574, "grad_norm": 0.8746441583347481, "learning_rate": 4.666956961003266e-06, "loss": 0.0635, "step": 3017 }, { "epoch": 1.0911062906724511, "grad_norm": 0.7415860551258708, "learning_rate": 4.663974512851834e-06, "loss": 0.1406, "step": 3018 }, { "epoch": 1.091467823571945, "grad_norm": 0.662817060734785, "learning_rate": 4.660992184795844e-06, "loss": 0.1309, "step": 3019 }, { "epoch": 1.091829356471439, "grad_norm": 0.12337604641134584, "learning_rate": 4.658009977901173e-06, "loss": 0.0227, "step": 3020 }, { "epoch": 1.0921908893709327, "grad_norm": 0.001233784025096671, "learning_rate": 4.655027893233665e-06, "loss": 0.0001, "step": 3021 }, { "epoch": 1.0925524222704266, "grad_norm": 0.25696382272860435, "learning_rate": 4.652045931859116e-06, "loss": 0.0391, "step": 3022 }, { "epoch": 1.0929139551699205, "grad_norm": 0.09799284868463466, "learning_rate": 4.649064094843274e-06, "loss": 0.0181, "step": 3023 }, { "epoch": 1.0932754880694142, "grad_norm": 0.8210365582552016, "learning_rate": 4.6460823832518555e-06, "loss": 0.0432, "step": 3024 }, { "epoch": 1.0936370209689081, "grad_norm": 0.7857800680545729, "learning_rate": 4.6431007981505146e-06, "loss": 0.1406, "step": 3025 }, { "epoch": 1.093998553868402, "grad_norm": 0.2695108741361507, "learning_rate": 4.640119340604875e-06, "loss": 0.0432, "step": 3026 }, { "epoch": 1.094360086767896, "grad_norm": 0.006333360097909301, "learning_rate": 4.637138011680508e-06, "loss": 0.0003, "step": 3027 }, { "epoch": 1.0947216196673897, "grad_norm": 0.8264365275303213, "learning_rate": 4.634156812442936e-06, "loss": 0.0977, "step": 3028 }, { "epoch": 1.0950831525668836, "grad_norm": 0.1562732628485004, "learning_rate": 4.631175743957644e-06, "loss": 0.0254, "step": 3029 }, { "epoch": 1.0954446854663775, "grad_norm": 0.07006875325896245, "learning_rate": 4.6281948072900625e-06, "loss": 0.0024, "step": 3030 }, { "epoch": 1.0958062183658712, "grad_norm": 0.29673206127943075, "learning_rate": 4.625214003505579e-06, "loss": 0.0388, "step": 3031 }, { "epoch": 1.0961677512653651, "grad_norm": 0.12845936385255716, "learning_rate": 4.622233333669531e-06, "loss": 0.0071, "step": 3032 }, { "epoch": 1.096529284164859, "grad_norm": 0.006134019327604949, "learning_rate": 4.61925279884721e-06, "loss": 0.0004, "step": 3033 }, { "epoch": 1.0968908170643528, "grad_norm": 0.5826147222058554, "learning_rate": 4.61627240010386e-06, "loss": 0.1699, "step": 3034 }, { "epoch": 1.0972523499638467, "grad_norm": 0.630638274899603, "learning_rate": 4.61329213850467e-06, "loss": 0.1406, "step": 3035 }, { "epoch": 1.0976138828633406, "grad_norm": 0.2401768530766888, "learning_rate": 4.6103120151147905e-06, "loss": 0.0317, "step": 3036 }, { "epoch": 1.0979754157628343, "grad_norm": 0.17068197232460627, "learning_rate": 4.6073320309993145e-06, "loss": 0.0254, "step": 3037 }, { "epoch": 1.0983369486623282, "grad_norm": 0.24703580186989343, "learning_rate": 4.604352187223286e-06, "loss": 0.0388, "step": 3038 }, { "epoch": 1.0986984815618221, "grad_norm": 0.1732507400417802, "learning_rate": 4.601372484851705e-06, "loss": 0.0283, "step": 3039 }, { "epoch": 1.099060014461316, "grad_norm": 0.0009421018425748855, "learning_rate": 4.5983929249495104e-06, "loss": 0.0, "step": 3040 }, { "epoch": 1.0994215473608098, "grad_norm": 0.25666915368725474, "learning_rate": 4.595413508581602e-06, "loss": 0.0115, "step": 3041 }, { "epoch": 1.0997830802603037, "grad_norm": 2.058492295629044, "learning_rate": 4.59243423681282e-06, "loss": 0.1406, "step": 3042 }, { "epoch": 1.1001446131597976, "grad_norm": 0.2381565350531547, "learning_rate": 4.589455110707955e-06, "loss": 0.0115, "step": 3043 }, { "epoch": 1.1005061460592913, "grad_norm": 0.024394920859969433, "learning_rate": 4.586476131331749e-06, "loss": 0.001, "step": 3044 }, { "epoch": 1.1008676789587852, "grad_norm": 0.4276214919152502, "learning_rate": 4.583497299748883e-06, "loss": 0.1406, "step": 3045 }, { "epoch": 1.1012292118582792, "grad_norm": 0.07885215668308307, "learning_rate": 4.580518617023996e-06, "loss": 0.0024, "step": 3046 }, { "epoch": 1.1015907447577729, "grad_norm": 0.47834628875058083, "learning_rate": 4.577540084221666e-06, "loss": 0.0227, "step": 3047 }, { "epoch": 1.1019522776572668, "grad_norm": 1.7624142013535582, "learning_rate": 4.574561702406421e-06, "loss": 0.3887, "step": 3048 }, { "epoch": 1.1023138105567607, "grad_norm": 0.5005781032253688, "learning_rate": 4.571583472642736e-06, "loss": 0.0476, "step": 3049 }, { "epoch": 1.1026753434562546, "grad_norm": 0.01280656735098546, "learning_rate": 4.568605395995025e-06, "loss": 0.0005, "step": 3050 }, { "epoch": 1.1030368763557483, "grad_norm": 0.3472762150495479, "learning_rate": 4.565627473527655e-06, "loss": 0.0476, "step": 3051 }, { "epoch": 1.1033984092552422, "grad_norm": 0.5671506809340181, "learning_rate": 4.562649706304933e-06, "loss": 0.0635, "step": 3052 }, { "epoch": 1.1037599421547362, "grad_norm": 0.1646008759303405, "learning_rate": 4.559672095391116e-06, "loss": 0.0317, "step": 3053 }, { "epoch": 1.1041214750542299, "grad_norm": 0.3415497516584924, "learning_rate": 4.5566946418503985e-06, "loss": 0.0432, "step": 3054 }, { "epoch": 1.1044830079537238, "grad_norm": 0.1831336581132581, "learning_rate": 4.553717346746922e-06, "loss": 0.0352, "step": 3055 }, { "epoch": 1.1048445408532177, "grad_norm": 0.030161474591121534, "learning_rate": 4.550740211144772e-06, "loss": 0.0017, "step": 3056 }, { "epoch": 1.1052060737527114, "grad_norm": 0.2631217391359639, "learning_rate": 4.547763236107975e-06, "loss": 0.0317, "step": 3057 }, { "epoch": 1.1055676066522053, "grad_norm": 0.4446442099218246, "learning_rate": 4.5447864227005015e-06, "loss": 0.0476, "step": 3058 }, { "epoch": 1.1059291395516992, "grad_norm": 0.24614556040962507, "learning_rate": 4.541809771986267e-06, "loss": 0.0352, "step": 3059 }, { "epoch": 1.106290672451193, "grad_norm": 0.856571873720659, "learning_rate": 4.538833285029121e-06, "loss": 0.1504, "step": 3060 }, { "epoch": 1.1066522053506869, "grad_norm": 0.6695044520621075, "learning_rate": 4.535856962892862e-06, "loss": 0.0693, "step": 3061 }, { "epoch": 1.1070137382501808, "grad_norm": 0.47764089510611063, "learning_rate": 4.532880806641226e-06, "loss": 0.1406, "step": 3062 }, { "epoch": 1.1073752711496747, "grad_norm": 0.15954553068739608, "learning_rate": 4.529904817337892e-06, "loss": 0.0254, "step": 3063 }, { "epoch": 1.1077368040491684, "grad_norm": 0.6283127827538828, "learning_rate": 4.526928996046479e-06, "loss": 0.0388, "step": 3064 }, { "epoch": 1.1080983369486623, "grad_norm": 0.5959160837675137, "learning_rate": 4.52395334383054e-06, "loss": 0.1309, "step": 3065 }, { "epoch": 1.1084598698481563, "grad_norm": 0.5872741021862792, "learning_rate": 4.520977861753576e-06, "loss": 0.1055, "step": 3066 }, { "epoch": 1.10882140274765, "grad_norm": 0.12403243302820388, "learning_rate": 4.518002550879023e-06, "loss": 0.0227, "step": 3067 }, { "epoch": 1.1091829356471439, "grad_norm": 1.1509363219874722, "learning_rate": 4.515027412270257e-06, "loss": 0.2236, "step": 3068 }, { "epoch": 1.1095444685466378, "grad_norm": 0.10336912677800826, "learning_rate": 4.512052446990593e-06, "loss": 0.0203, "step": 3069 }, { "epoch": 1.1099060014461315, "grad_norm": 0.47978869130853946, "learning_rate": 4.509077656103279e-06, "loss": 0.1602, "step": 3070 }, { "epoch": 1.1102675343456254, "grad_norm": 0.48365852398314807, "learning_rate": 4.506103040671508e-06, "loss": 0.1602, "step": 3071 }, { "epoch": 1.1106290672451193, "grad_norm": 0.5176302653687498, "learning_rate": 4.503128601758406e-06, "loss": 0.1226, "step": 3072 }, { "epoch": 1.1109906001446133, "grad_norm": 0.1567486484949904, "learning_rate": 4.500154340427037e-06, "loss": 0.0227, "step": 3073 }, { "epoch": 1.111352133044107, "grad_norm": 0.15211092266193144, "learning_rate": 4.497180257740403e-06, "loss": 0.0102, "step": 3074 }, { "epoch": 1.1117136659436009, "grad_norm": 0.13411727821630107, "learning_rate": 4.494206354761436e-06, "loss": 0.0283, "step": 3075 }, { "epoch": 1.1120751988430948, "grad_norm": 0.23157313649735692, "learning_rate": 4.491232632553013e-06, "loss": 0.0182, "step": 3076 }, { "epoch": 1.1124367317425885, "grad_norm": 0.05156143803841018, "learning_rate": 4.488259092177937e-06, "loss": 0.0031, "step": 3077 }, { "epoch": 1.1127982646420824, "grad_norm": 0.10335547198311269, "learning_rate": 4.4852857346989565e-06, "loss": 0.0203, "step": 3078 }, { "epoch": 1.1131597975415763, "grad_norm": 0.2651310306706737, "learning_rate": 4.4823125611787455e-06, "loss": 0.0227, "step": 3079 }, { "epoch": 1.11352133044107, "grad_norm": 0.18196599641908842, "learning_rate": 4.479339572679913e-06, "loss": 0.0352, "step": 3080 }, { "epoch": 1.113882863340564, "grad_norm": 0.19118267440009748, "learning_rate": 4.476366770265011e-06, "loss": 0.0352, "step": 3081 }, { "epoch": 1.1142443962400579, "grad_norm": 0.905213913541407, "learning_rate": 4.473394154996512e-06, "loss": 0.0903, "step": 3082 }, { "epoch": 1.1146059291395516, "grad_norm": 0.1984206809966975, "learning_rate": 4.470421727936832e-06, "loss": 0.0352, "step": 3083 }, { "epoch": 1.1149674620390455, "grad_norm": 0.6939007368523167, "learning_rate": 4.467449490148317e-06, "loss": 0.0903, "step": 3084 }, { "epoch": 1.1153289949385394, "grad_norm": 0.11894305950551598, "learning_rate": 4.464477442693237e-06, "loss": 0.0227, "step": 3085 }, { "epoch": 1.1156905278380334, "grad_norm": 0.6390581442299789, "learning_rate": 4.46150558663381e-06, "loss": 0.0903, "step": 3086 }, { "epoch": 1.116052060737527, "grad_norm": 0.11453893103676237, "learning_rate": 4.45853392303217e-06, "loss": 0.0227, "step": 3087 }, { "epoch": 1.116413593637021, "grad_norm": 0.30500739987389863, "learning_rate": 4.455562452950394e-06, "loss": 0.0145, "step": 3088 }, { "epoch": 1.116775126536515, "grad_norm": 1.7890103082240196, "learning_rate": 4.452591177450482e-06, "loss": 0.2012, "step": 3089 }, { "epoch": 1.1171366594360086, "grad_norm": 0.4564863252512345, "learning_rate": 4.449620097594365e-06, "loss": 0.0693, "step": 3090 }, { "epoch": 1.1174981923355025, "grad_norm": 0.004406841322477235, "learning_rate": 4.446649214443912e-06, "loss": 0.0002, "step": 3091 }, { "epoch": 1.1178597252349964, "grad_norm": 0.14325189119926204, "learning_rate": 4.443678529060912e-06, "loss": 0.0283, "step": 3092 }, { "epoch": 1.1182212581344901, "grad_norm": 0.041648956475091294, "learning_rate": 4.440708042507087e-06, "loss": 0.0024, "step": 3093 }, { "epoch": 1.118582791033984, "grad_norm": 0.6244278839076277, "learning_rate": 4.437737755844093e-06, "loss": 0.0635, "step": 3094 }, { "epoch": 1.118944323933478, "grad_norm": 0.822080338052822, "learning_rate": 4.434767670133502e-06, "loss": 0.083, "step": 3095 }, { "epoch": 1.119305856832972, "grad_norm": 0.1731869884735283, "learning_rate": 4.43179778643683e-06, "loss": 0.0349, "step": 3096 }, { "epoch": 1.1196673897324656, "grad_norm": 0.2069938165381886, "learning_rate": 4.428828105815507e-06, "loss": 0.0283, "step": 3097 }, { "epoch": 1.1200289226319595, "grad_norm": 0.1766632704511883, "learning_rate": 4.425858629330899e-06, "loss": 0.0349, "step": 3098 }, { "epoch": 1.1203904555314534, "grad_norm": 0.24333718714439723, "learning_rate": 4.4228893580442975e-06, "loss": 0.0254, "step": 3099 }, { "epoch": 1.1207519884309471, "grad_norm": 0.20688134253670526, "learning_rate": 4.419920293016914e-06, "loss": 0.0352, "step": 3100 }, { "epoch": 1.121113521330441, "grad_norm": 0.28992486468450357, "learning_rate": 4.416951435309899e-06, "loss": 0.0388, "step": 3101 }, { "epoch": 1.121475054229935, "grad_norm": 0.25531491794178396, "learning_rate": 4.413982785984315e-06, "loss": 0.0476, "step": 3102 }, { "epoch": 1.1218365871294287, "grad_norm": 0.1533609227257355, "learning_rate": 4.411014346101162e-06, "loss": 0.009, "step": 3103 }, { "epoch": 1.1221981200289226, "grad_norm": 0.17957933772344142, "learning_rate": 4.408046116721357e-06, "loss": 0.0317, "step": 3104 }, { "epoch": 1.1225596529284165, "grad_norm": 0.14344017626739436, "learning_rate": 4.405078098905743e-06, "loss": 0.0227, "step": 3105 }, { "epoch": 1.1229211858279102, "grad_norm": 0.18399976889423772, "learning_rate": 4.402110293715094e-06, "loss": 0.0315, "step": 3106 }, { "epoch": 1.1232827187274042, "grad_norm": 0.10953898416544162, "learning_rate": 4.399142702210097e-06, "loss": 0.0227, "step": 3107 }, { "epoch": 1.123644251626898, "grad_norm": 0.22828903257644487, "learning_rate": 4.3961753254513725e-06, "loss": 0.0388, "step": 3108 }, { "epoch": 1.124005784526392, "grad_norm": 0.19540288485574778, "learning_rate": 4.393208164499461e-06, "loss": 0.0317, "step": 3109 }, { "epoch": 1.1243673174258857, "grad_norm": 0.10075506728486443, "learning_rate": 4.390241220414819e-06, "loss": 0.005, "step": 3110 }, { "epoch": 1.1247288503253796, "grad_norm": 0.16952291272830863, "learning_rate": 4.3872744942578406e-06, "loss": 0.0283, "step": 3111 }, { "epoch": 1.1250903832248735, "grad_norm": 0.11063594202242147, "learning_rate": 4.384307987088826e-06, "loss": 0.0203, "step": 3112 }, { "epoch": 1.1254519161243672, "grad_norm": 0.1426301899583902, "learning_rate": 4.381341699968008e-06, "loss": 0.0254, "step": 3113 }, { "epoch": 1.1258134490238612, "grad_norm": 0.1354164922983209, "learning_rate": 4.378375633955537e-06, "loss": 0.0254, "step": 3114 }, { "epoch": 1.126174981923355, "grad_norm": 0.9047380109426466, "learning_rate": 4.37540979011148e-06, "loss": 0.1055, "step": 3115 }, { "epoch": 1.126536514822849, "grad_norm": 0.17000579527819343, "learning_rate": 4.372444169495836e-06, "loss": 0.0227, "step": 3116 }, { "epoch": 1.1268980477223427, "grad_norm": 0.02661583348078863, "learning_rate": 4.369478773168511e-06, "loss": 0.0015, "step": 3117 }, { "epoch": 1.1272595806218366, "grad_norm": 0.11727812743203216, "learning_rate": 4.3665136021893394e-06, "loss": 0.0181, "step": 3118 }, { "epoch": 1.1276211135213305, "grad_norm": 1.1748815024259402, "learning_rate": 4.363548657618073e-06, "loss": 0.083, "step": 3119 }, { "epoch": 1.1279826464208242, "grad_norm": 0.13996256754663952, "learning_rate": 4.360583940514382e-06, "loss": 0.0203, "step": 3120 }, { "epoch": 1.1283441793203182, "grad_norm": 0.5557130877427227, "learning_rate": 4.357619451937858e-06, "loss": 0.1602, "step": 3121 }, { "epoch": 1.128705712219812, "grad_norm": 0.2654216555438408, "learning_rate": 4.354655192948003e-06, "loss": 0.0182, "step": 3122 }, { "epoch": 1.1290672451193058, "grad_norm": 0.24186284045809067, "learning_rate": 4.351691164604247e-06, "loss": 0.0145, "step": 3123 }, { "epoch": 1.1294287780187997, "grad_norm": 0.08348656156276207, "learning_rate": 4.348727367965931e-06, "loss": 0.0161, "step": 3124 }, { "epoch": 1.1297903109182936, "grad_norm": 0.08351567512620364, "learning_rate": 4.345763804092318e-06, "loss": 0.0161, "step": 3125 }, { "epoch": 1.1301518438177873, "grad_norm": 0.0015713451690331768, "learning_rate": 4.342800474042584e-06, "loss": 0.0001, "step": 3126 }, { "epoch": 1.1305133767172812, "grad_norm": 0.009744252758568819, "learning_rate": 4.3398373788758196e-06, "loss": 0.0003, "step": 3127 }, { "epoch": 1.1308749096167752, "grad_norm": 0.07536860960779099, "learning_rate": 4.3368745196510375e-06, "loss": 0.0128, "step": 3128 }, { "epoch": 1.1312364425162689, "grad_norm": 0.09888616544991383, "learning_rate": 4.333911897427162e-06, "loss": 0.0161, "step": 3129 }, { "epoch": 1.1315979754157628, "grad_norm": 0.1962975819923064, "learning_rate": 4.330949513263034e-06, "loss": 0.0254, "step": 3130 }, { "epoch": 1.1319595083152567, "grad_norm": 0.8223195192605705, "learning_rate": 4.32798736821741e-06, "loss": 0.0229, "step": 3131 }, { "epoch": 1.1323210412147506, "grad_norm": 0.10718859702895749, "learning_rate": 4.325025463348957e-06, "loss": 0.0161, "step": 3132 }, { "epoch": 1.1326825741142443, "grad_norm": 0.7717698831258881, "learning_rate": 4.322063799716261e-06, "loss": 0.0903, "step": 3133 }, { "epoch": 1.1330441070137383, "grad_norm": 0.3818711499587651, "learning_rate": 4.3191023783778205e-06, "loss": 0.0182, "step": 3134 }, { "epoch": 1.1334056399132322, "grad_norm": 0.7453053503051501, "learning_rate": 4.316141200392046e-06, "loss": 0.1309, "step": 3135 }, { "epoch": 1.1337671728127259, "grad_norm": 0.18895348640145257, "learning_rate": 4.313180266817264e-06, "loss": 0.0254, "step": 3136 }, { "epoch": 1.1341287057122198, "grad_norm": 0.10398432422472613, "learning_rate": 4.310219578711707e-06, "loss": 0.0161, "step": 3137 }, { "epoch": 1.1344902386117137, "grad_norm": 0.08170831901277131, "learning_rate": 4.3072591371335285e-06, "loss": 0.0143, "step": 3138 }, { "epoch": 1.1348517715112076, "grad_norm": 0.07013058311855098, "learning_rate": 4.304298943140787e-06, "loss": 0.0128, "step": 3139 }, { "epoch": 1.1352133044107013, "grad_norm": 0.0013097358601307572, "learning_rate": 4.301338997791457e-06, "loss": 0.0001, "step": 3140 }, { "epoch": 1.1355748373101953, "grad_norm": 0.002729332249135536, "learning_rate": 4.298379302143424e-06, "loss": 0.0001, "step": 3141 }, { "epoch": 1.1359363702096892, "grad_norm": 0.5268342031610325, "learning_rate": 4.2954198572544766e-06, "loss": 0.0432, "step": 3142 }, { "epoch": 1.1362979031091829, "grad_norm": 0.06391050456862178, "learning_rate": 4.292460664182326e-06, "loss": 0.0039, "step": 3143 }, { "epoch": 1.1366594360086768, "grad_norm": 0.7591865543596286, "learning_rate": 4.289501723984582e-06, "loss": 0.083, "step": 3144 }, { "epoch": 1.1370209689081707, "grad_norm": 0.005692049535252044, "learning_rate": 4.286543037718774e-06, "loss": 0.0003, "step": 3145 }, { "epoch": 1.1373825018076644, "grad_norm": 0.38321077194879666, "learning_rate": 4.283584606442336e-06, "loss": 0.0527, "step": 3146 }, { "epoch": 1.1377440347071583, "grad_norm": 0.5738444126070852, "learning_rate": 4.280626431212604e-06, "loss": 0.1699, "step": 3147 }, { "epoch": 1.1381055676066523, "grad_norm": 0.15309717718653704, "learning_rate": 4.277668513086837e-06, "loss": 0.0203, "step": 3148 }, { "epoch": 1.138467100506146, "grad_norm": 0.07032982231871193, "learning_rate": 4.274710853122188e-06, "loss": 0.0128, "step": 3149 }, { "epoch": 1.13882863340564, "grad_norm": 0.23384798808306903, "learning_rate": 4.271753452375729e-06, "loss": 0.0227, "step": 3150 }, { "epoch": 1.1391901663051338, "grad_norm": 0.648443374614508, "learning_rate": 4.268796311904434e-06, "loss": 0.1504, "step": 3151 }, { "epoch": 1.1395516992046275, "grad_norm": 0.29326286137723906, "learning_rate": 4.26583943276518e-06, "loss": 0.0315, "step": 3152 }, { "epoch": 1.1399132321041214, "grad_norm": 0.14420591973248634, "learning_rate": 4.2628828160147585e-06, "loss": 0.0039, "step": 3153 }, { "epoch": 1.1402747650036154, "grad_norm": 0.10497113031438243, "learning_rate": 4.259926462709862e-06, "loss": 0.0161, "step": 3154 }, { "epoch": 1.1406362979031093, "grad_norm": 0.11295114928884728, "learning_rate": 4.256970373907094e-06, "loss": 0.0162, "step": 3155 }, { "epoch": 1.140997830802603, "grad_norm": 0.6711661980413446, "learning_rate": 4.254014550662957e-06, "loss": 0.0432, "step": 3156 }, { "epoch": 1.141359363702097, "grad_norm": 0.17846014082748174, "learning_rate": 4.251058994033861e-06, "loss": 0.0254, "step": 3157 }, { "epoch": 1.1417208966015908, "grad_norm": 0.10072810005006312, "learning_rate": 4.248103705076123e-06, "loss": 0.0071, "step": 3158 }, { "epoch": 1.1420824295010845, "grad_norm": 0.181629704993796, "learning_rate": 4.245148684845961e-06, "loss": 0.0254, "step": 3159 }, { "epoch": 1.1424439624005784, "grad_norm": 0.018927281554512884, "learning_rate": 4.2421939343995014e-06, "loss": 0.0008, "step": 3160 }, { "epoch": 1.1428054953000724, "grad_norm": 0.4662664299312786, "learning_rate": 4.2392394547927705e-06, "loss": 0.0388, "step": 3161 }, { "epoch": 1.1431670281995663, "grad_norm": 0.017876995694132584, "learning_rate": 4.2362852470816954e-06, "loss": 0.0009, "step": 3162 }, { "epoch": 1.14352856109906, "grad_norm": 0.13764037420503153, "learning_rate": 4.233331312322113e-06, "loss": 0.0203, "step": 3163 }, { "epoch": 1.143890093998554, "grad_norm": 0.13738008606979973, "learning_rate": 4.230377651569757e-06, "loss": 0.0182, "step": 3164 }, { "epoch": 1.1442516268980478, "grad_norm": 0.07795366391406444, "learning_rate": 4.227424265880267e-06, "loss": 0.0114, "step": 3165 }, { "epoch": 1.1446131597975415, "grad_norm": 0.23641773877683714, "learning_rate": 4.224471156309182e-06, "loss": 0.0283, "step": 3166 }, { "epoch": 1.1449746926970354, "grad_norm": 0.06972672701958096, "learning_rate": 4.221518323911941e-06, "loss": 0.0114, "step": 3167 }, { "epoch": 1.1453362255965294, "grad_norm": 1.0681781194430742, "learning_rate": 4.218565769743887e-06, "loss": 0.0903, "step": 3168 }, { "epoch": 1.145697758496023, "grad_norm": 0.8661752251893071, "learning_rate": 4.215613494860261e-06, "loss": 0.1226, "step": 3169 }, { "epoch": 1.146059291395517, "grad_norm": 0.12078946870258966, "learning_rate": 4.212661500316207e-06, "loss": 0.0143, "step": 3170 }, { "epoch": 1.146420824295011, "grad_norm": 0.5535736421960238, "learning_rate": 4.209709787166768e-06, "loss": 0.1699, "step": 3171 }, { "epoch": 1.1467823571945046, "grad_norm": 0.47427750179264483, "learning_rate": 4.206758356466882e-06, "loss": 0.0432, "step": 3172 }, { "epoch": 1.1471438900939985, "grad_norm": 0.8533040394940957, "learning_rate": 4.203807209271393e-06, "loss": 0.2012, "step": 3173 }, { "epoch": 1.1475054229934925, "grad_norm": 0.7918709832758501, "learning_rate": 4.20085634663504e-06, "loss": 0.1406, "step": 3174 }, { "epoch": 1.1478669558929862, "grad_norm": 0.1410766327882241, "learning_rate": 4.197905769612458e-06, "loss": 0.0203, "step": 3175 }, { "epoch": 1.14822848879248, "grad_norm": 0.009149901046465073, "learning_rate": 4.194955479258188e-06, "loss": 0.0005, "step": 3176 }, { "epoch": 1.148590021691974, "grad_norm": 1.0573854460274157, "learning_rate": 4.192005476626656e-06, "loss": 0.083, "step": 3177 }, { "epoch": 1.148951554591468, "grad_norm": 0.00257447727145841, "learning_rate": 4.1890557627722e-06, "loss": 0.0001, "step": 3178 }, { "epoch": 1.1493130874909616, "grad_norm": 0.7332764239839767, "learning_rate": 4.18610633874904e-06, "loss": 0.1226, "step": 3179 }, { "epoch": 1.1496746203904555, "grad_norm": 0.030748219678471866, "learning_rate": 4.183157205611304e-06, "loss": 0.0017, "step": 3180 }, { "epoch": 1.1500361532899495, "grad_norm": 0.5349457966765327, "learning_rate": 4.180208364413013e-06, "loss": 0.1914, "step": 3181 }, { "epoch": 1.1503976861894432, "grad_norm": 0.09368623824192937, "learning_rate": 4.177259816208075e-06, "loss": 0.0143, "step": 3182 }, { "epoch": 1.150759219088937, "grad_norm": 0.5201538628098537, "learning_rate": 4.174311562050308e-06, "loss": 0.1807, "step": 3183 }, { "epoch": 1.151120751988431, "grad_norm": 0.5595315466802312, "learning_rate": 4.171363602993412e-06, "loss": 0.0352, "step": 3184 }, { "epoch": 1.151482284887925, "grad_norm": 0.14303148983394995, "learning_rate": 4.168415940090992e-06, "loss": 0.0227, "step": 3185 }, { "epoch": 1.1518438177874186, "grad_norm": 0.5457931195425016, "learning_rate": 4.1654685743965375e-06, "loss": 0.1914, "step": 3186 }, { "epoch": 1.1522053506869125, "grad_norm": 0.3305814391068457, "learning_rate": 4.162521506963439e-06, "loss": 0.0388, "step": 3187 }, { "epoch": 1.1525668835864065, "grad_norm": 0.6230345976853113, "learning_rate": 4.159574738844979e-06, "loss": 0.0635, "step": 3188 }, { "epoch": 1.1529284164859002, "grad_norm": 0.662902309026667, "learning_rate": 4.156628271094328e-06, "loss": 0.1143, "step": 3189 }, { "epoch": 1.153289949385394, "grad_norm": 0.31796015473666933, "learning_rate": 4.153682104764556e-06, "loss": 0.0432, "step": 3190 }, { "epoch": 1.153651482284888, "grad_norm": 0.005314120375203251, "learning_rate": 4.150736240908619e-06, "loss": 0.0003, "step": 3191 }, { "epoch": 1.1540130151843817, "grad_norm": 0.12784724481579018, "learning_rate": 4.147790680579373e-06, "loss": 0.0227, "step": 3192 }, { "epoch": 1.1543745480838756, "grad_norm": 0.17944828588743866, "learning_rate": 4.144845424829558e-06, "loss": 0.0203, "step": 3193 }, { "epoch": 1.1547360809833696, "grad_norm": 0.15722082547349212, "learning_rate": 4.141900474711805e-06, "loss": 0.0283, "step": 3194 }, { "epoch": 1.1550976138828633, "grad_norm": 0.4887510209180828, "learning_rate": 4.138955831278643e-06, "loss": 0.1699, "step": 3195 }, { "epoch": 1.1554591467823572, "grad_norm": 0.6921544322426157, "learning_rate": 4.136011495582484e-06, "loss": 0.0693, "step": 3196 }, { "epoch": 1.155820679681851, "grad_norm": 0.5047419464224122, "learning_rate": 4.1330674686756354e-06, "loss": 0.1602, "step": 3197 }, { "epoch": 1.1561822125813448, "grad_norm": 0.16693511100501401, "learning_rate": 4.130123751610291e-06, "loss": 0.0254, "step": 3198 }, { "epoch": 1.1565437454808387, "grad_norm": 1.2647532647266546, "learning_rate": 4.127180345438533e-06, "loss": 0.2236, "step": 3199 }, { "epoch": 1.1569052783803326, "grad_norm": 0.12598050890931176, "learning_rate": 4.124237251212337e-06, "loss": 0.0227, "step": 3200 }, { "epoch": 1.1572668112798266, "grad_norm": 0.13943209509618518, "learning_rate": 4.121294469983559e-06, "loss": 0.0227, "step": 3201 }, { "epoch": 1.1576283441793203, "grad_norm": 0.18357926050525974, "learning_rate": 4.118352002803955e-06, "loss": 0.0315, "step": 3202 }, { "epoch": 1.1579898770788142, "grad_norm": 0.1508832984358912, "learning_rate": 4.11540985072516e-06, "loss": 0.0283, "step": 3203 }, { "epoch": 1.158351409978308, "grad_norm": 0.5044120607230927, "learning_rate": 4.112468014798695e-06, "loss": 0.0527, "step": 3204 }, { "epoch": 1.1587129428778018, "grad_norm": 0.1396884599916048, "learning_rate": 4.109526496075975e-06, "loss": 0.0063, "step": 3205 }, { "epoch": 1.1590744757772957, "grad_norm": 0.003404277300838203, "learning_rate": 4.106585295608297e-06, "loss": 0.0002, "step": 3206 }, { "epoch": 1.1594360086767896, "grad_norm": 0.5254591064051631, "learning_rate": 4.1036444144468475e-06, "loss": 0.0352, "step": 3207 }, { "epoch": 1.1597975415762836, "grad_norm": 0.2143971894616733, "learning_rate": 4.1007038536426954e-06, "loss": 0.0352, "step": 3208 }, { "epoch": 1.1601590744757773, "grad_norm": 0.11935451949656663, "learning_rate": 4.0977636142467935e-06, "loss": 0.0227, "step": 3209 }, { "epoch": 1.1605206073752712, "grad_norm": 6.863197391057164, "learning_rate": 4.094823697309988e-06, "loss": 0.3711, "step": 3210 }, { "epoch": 1.160882140274765, "grad_norm": 0.02611950400698164, "learning_rate": 4.091884103882999e-06, "loss": 0.0013, "step": 3211 }, { "epoch": 1.1612436731742588, "grad_norm": 0.2677505119119935, "learning_rate": 4.088944835016443e-06, "loss": 0.0162, "step": 3212 }, { "epoch": 1.1616052060737527, "grad_norm": 1.0035267844508895, "learning_rate": 4.08600589176081e-06, "loss": 0.0693, "step": 3213 }, { "epoch": 1.1619667389732466, "grad_norm": 0.0015284330273853015, "learning_rate": 4.083067275166477e-06, "loss": 0.0001, "step": 3214 }, { "epoch": 1.1623282718727403, "grad_norm": 0.15507558553265544, "learning_rate": 4.080128986283707e-06, "loss": 0.0254, "step": 3215 }, { "epoch": 1.1626898047722343, "grad_norm": 0.3785091103684162, "learning_rate": 4.077191026162642e-06, "loss": 0.0476, "step": 3216 }, { "epoch": 1.1630513376717282, "grad_norm": 0.23498824746677463, "learning_rate": 4.074253395853311e-06, "loss": 0.0352, "step": 3217 }, { "epoch": 1.163412870571222, "grad_norm": 0.4691641429300674, "learning_rate": 4.071316096405622e-06, "loss": 0.0527, "step": 3218 }, { "epoch": 1.1637744034707158, "grad_norm": 0.0022327099727336174, "learning_rate": 4.068379128869362e-06, "loss": 0.0001, "step": 3219 }, { "epoch": 1.1641359363702097, "grad_norm": 0.11624907752240515, "learning_rate": 4.065442494294205e-06, "loss": 0.0227, "step": 3220 }, { "epoch": 1.1644974692697034, "grad_norm": 1.578896653360541, "learning_rate": 4.062506193729704e-06, "loss": 0.3047, "step": 3221 }, { "epoch": 1.1648590021691974, "grad_norm": 0.20059513900723533, "learning_rate": 4.059570228225291e-06, "loss": 0.0283, "step": 3222 }, { "epoch": 1.1652205350686913, "grad_norm": 0.034918508367986996, "learning_rate": 4.056634598830282e-06, "loss": 0.0015, "step": 3223 }, { "epoch": 1.1655820679681852, "grad_norm": 0.4509999886253239, "learning_rate": 4.0536993065938655e-06, "loss": 0.0227, "step": 3224 }, { "epoch": 1.165943600867679, "grad_norm": 0.6020471249398324, "learning_rate": 4.050764352565119e-06, "loss": 0.0579, "step": 3225 }, { "epoch": 1.1663051337671728, "grad_norm": 0.18309715865143508, "learning_rate": 4.047829737792991e-06, "loss": 0.0283, "step": 3226 }, { "epoch": 1.1666666666666667, "grad_norm": 1.178225844833353, "learning_rate": 4.0448954633263145e-06, "loss": 0.1602, "step": 3227 }, { "epoch": 1.1670281995661604, "grad_norm": 0.6389881439129809, "learning_rate": 4.041961530213799e-06, "loss": 0.0903, "step": 3228 }, { "epoch": 1.1673897324656544, "grad_norm": 0.17423064639878694, "learning_rate": 4.039027939504028e-06, "loss": 0.0254, "step": 3229 }, { "epoch": 1.1677512653651483, "grad_norm": 0.42720521691820845, "learning_rate": 4.03609469224547e-06, "loss": 0.0579, "step": 3230 }, { "epoch": 1.1681127982646422, "grad_norm": 0.7074660069599128, "learning_rate": 4.033161789486465e-06, "loss": 0.0693, "step": 3231 }, { "epoch": 1.168474331164136, "grad_norm": 0.15908452125773528, "learning_rate": 4.030229232275233e-06, "loss": 0.0254, "step": 3232 }, { "epoch": 1.1688358640636298, "grad_norm": 0.8277813561738688, "learning_rate": 4.02729702165987e-06, "loss": 0.0903, "step": 3233 }, { "epoch": 1.1691973969631237, "grad_norm": 0.1425604919584725, "learning_rate": 4.024365158688344e-06, "loss": 0.0181, "step": 3234 }, { "epoch": 1.1695589298626174, "grad_norm": 0.13718653776429965, "learning_rate": 4.021433644408506e-06, "loss": 0.0203, "step": 3235 }, { "epoch": 1.1699204627621114, "grad_norm": 0.1405034360021732, "learning_rate": 4.018502479868075e-06, "loss": 0.005, "step": 3236 }, { "epoch": 1.1702819956616053, "grad_norm": 0.7969468530169626, "learning_rate": 4.0155716661146515e-06, "loss": 0.0102, "step": 3237 }, { "epoch": 1.170643528561099, "grad_norm": 0.14519942604106972, "learning_rate": 4.012641204195709e-06, "loss": 0.0254, "step": 3238 }, { "epoch": 1.171005061460593, "grad_norm": 0.49614993034392985, "learning_rate": 4.009711095158588e-06, "loss": 0.1699, "step": 3239 }, { "epoch": 1.1713665943600868, "grad_norm": 0.8009238021126094, "learning_rate": 4.006781340050515e-06, "loss": 0.1309, "step": 3240 }, { "epoch": 1.1717281272595805, "grad_norm": 0.4245241569075086, "learning_rate": 4.00385193991858e-06, "loss": 0.0432, "step": 3241 }, { "epoch": 1.1720896601590745, "grad_norm": 0.10792414713034854, "learning_rate": 4.000922895809752e-06, "loss": 0.0181, "step": 3242 }, { "epoch": 1.1724511930585684, "grad_norm": 0.10148360217638874, "learning_rate": 3.997994208770873e-06, "loss": 0.0182, "step": 3243 }, { "epoch": 1.172812725958062, "grad_norm": 0.07954992354097057, "learning_rate": 3.995065879848648e-06, "loss": 0.0056, "step": 3244 }, { "epoch": 1.173174258857556, "grad_norm": 0.08665876582248243, "learning_rate": 3.992137910089668e-06, "loss": 0.0181, "step": 3245 }, { "epoch": 1.17353579175705, "grad_norm": 0.2424600457348457, "learning_rate": 3.9892103005403845e-06, "loss": 0.0129, "step": 3246 }, { "epoch": 1.1738973246565438, "grad_norm": 0.10545426497253005, "learning_rate": 3.986283052247127e-06, "loss": 0.0162, "step": 3247 }, { "epoch": 1.1742588575560375, "grad_norm": 0.10002237573862277, "learning_rate": 3.983356166256094e-06, "loss": 0.0039, "step": 3248 }, { "epoch": 1.1746203904555315, "grad_norm": 0.08681147348277493, "learning_rate": 3.980429643613351e-06, "loss": 0.0161, "step": 3249 }, { "epoch": 1.1749819233550254, "grad_norm": 0.5107497194314379, "learning_rate": 3.9775034853648386e-06, "loss": 0.0203, "step": 3250 }, { "epoch": 1.175343456254519, "grad_norm": 0.5178010368422841, "learning_rate": 3.974577692556364e-06, "loss": 0.1699, "step": 3251 }, { "epoch": 1.175704989154013, "grad_norm": 0.2691497228731053, "learning_rate": 3.971652266233607e-06, "loss": 0.0145, "step": 3252 }, { "epoch": 1.176066522053507, "grad_norm": 0.1833343184885764, "learning_rate": 3.968727207442114e-06, "loss": 0.0254, "step": 3253 }, { "epoch": 1.1764280549530008, "grad_norm": 0.12326757829314786, "learning_rate": 3.965802517227297e-06, "loss": 0.0161, "step": 3254 }, { "epoch": 1.1767895878524945, "grad_norm": 0.8404891325595311, "learning_rate": 3.962878196634444e-06, "loss": 0.0283, "step": 3255 }, { "epoch": 1.1771511207519885, "grad_norm": 0.18684211142878407, "learning_rate": 3.959954246708703e-06, "loss": 0.0283, "step": 3256 }, { "epoch": 1.1775126536514824, "grad_norm": 0.8025579434091638, "learning_rate": 3.957030668495095e-06, "loss": 0.0579, "step": 3257 }, { "epoch": 1.177874186550976, "grad_norm": 0.636400247930005, "learning_rate": 3.954107463038506e-06, "loss": 0.0476, "step": 3258 }, { "epoch": 1.17823571945047, "grad_norm": 0.17378824227613848, "learning_rate": 3.95118463138369e-06, "loss": 0.0203, "step": 3259 }, { "epoch": 1.178597252349964, "grad_norm": 0.5719955553674732, "learning_rate": 3.948262174575266e-06, "loss": 0.1504, "step": 3260 }, { "epoch": 1.1789587852494576, "grad_norm": 0.2204450449789544, "learning_rate": 3.945340093657717e-06, "loss": 0.0203, "step": 3261 }, { "epoch": 1.1793203181489516, "grad_norm": 0.7133825595503666, "learning_rate": 3.942418389675396e-06, "loss": 0.0476, "step": 3262 }, { "epoch": 1.1796818510484455, "grad_norm": 0.5995648986277923, "learning_rate": 3.93949706367252e-06, "loss": 0.0476, "step": 3263 }, { "epoch": 1.1800433839479392, "grad_norm": 1.3250311484444972, "learning_rate": 3.93657611669317e-06, "loss": 0.0283, "step": 3264 }, { "epoch": 1.180404916847433, "grad_norm": 0.4577711584245705, "learning_rate": 3.933655549781292e-06, "loss": 0.1504, "step": 3265 }, { "epoch": 1.180766449746927, "grad_norm": 0.002583069338424223, "learning_rate": 3.930735363980693e-06, "loss": 0.0001, "step": 3266 }, { "epoch": 1.1811279826464207, "grad_norm": 0.2635819815150191, "learning_rate": 3.927815560335051e-06, "loss": 0.0283, "step": 3267 }, { "epoch": 1.1814895155459146, "grad_norm": 0.9241128623408195, "learning_rate": 3.9248961398879006e-06, "loss": 0.0693, "step": 3268 }, { "epoch": 1.1818510484454086, "grad_norm": 0.37942079795712746, "learning_rate": 3.921977103682645e-06, "loss": 0.0317, "step": 3269 }, { "epoch": 1.1822125813449025, "grad_norm": 0.5971825088454492, "learning_rate": 3.919058452762544e-06, "loss": 0.0977, "step": 3270 }, { "epoch": 1.1825741142443962, "grad_norm": 0.44498853530997823, "learning_rate": 3.916140188170723e-06, "loss": 0.0181, "step": 3271 }, { "epoch": 1.18293564714389, "grad_norm": 0.11716283038333278, "learning_rate": 3.91322231095017e-06, "loss": 0.0203, "step": 3272 }, { "epoch": 1.183297180043384, "grad_norm": 0.9751679303581554, "learning_rate": 3.910304822143734e-06, "loss": 0.0579, "step": 3273 }, { "epoch": 1.1836587129428777, "grad_norm": 3.050091230422678, "learning_rate": 3.907387722794125e-06, "loss": 0.0171, "step": 3274 }, { "epoch": 1.1840202458423716, "grad_norm": 0.07209809726602165, "learning_rate": 3.904471013943914e-06, "loss": 0.0019, "step": 3275 }, { "epoch": 1.1843817787418656, "grad_norm": 0.42561607135145696, "learning_rate": 3.90155469663553e-06, "loss": 0.0317, "step": 3276 }, { "epoch": 1.1847433116413595, "grad_norm": 0.6363190165066449, "learning_rate": 3.898638771911266e-06, "loss": 0.1055, "step": 3277 }, { "epoch": 1.1851048445408532, "grad_norm": 0.48546223889462087, "learning_rate": 3.895723240813272e-06, "loss": 0.0476, "step": 3278 }, { "epoch": 1.185466377440347, "grad_norm": 0.0011658910023214406, "learning_rate": 3.892808104383559e-06, "loss": 0.0001, "step": 3279 }, { "epoch": 1.185827910339841, "grad_norm": 0.7538970027316474, "learning_rate": 3.889893363663998e-06, "loss": 0.0635, "step": 3280 }, { "epoch": 1.1861894432393347, "grad_norm": 0.13602446587609837, "learning_rate": 3.886979019696312e-06, "loss": 0.0182, "step": 3281 }, { "epoch": 1.1865509761388287, "grad_norm": 1.48809046841384, "learning_rate": 3.88406507352209e-06, "loss": 0.1226, "step": 3282 }, { "epoch": 1.1869125090383226, "grad_norm": 0.0013586753048607807, "learning_rate": 3.881151526182774e-06, "loss": 0.0001, "step": 3283 }, { "epoch": 1.1872740419378163, "grad_norm": 0.6308276092285526, "learning_rate": 3.8782383787196685e-06, "loss": 0.0317, "step": 3284 }, { "epoch": 1.1876355748373102, "grad_norm": 0.05593666139714272, "learning_rate": 3.87532563217393e-06, "loss": 0.0027, "step": 3285 }, { "epoch": 1.1879971077368041, "grad_norm": 0.1961912465488071, "learning_rate": 3.872413287586572e-06, "loss": 0.0317, "step": 3286 }, { "epoch": 1.1883586406362978, "grad_norm": 0.7987396161433068, "learning_rate": 3.869501345998467e-06, "loss": 0.0903, "step": 3287 }, { "epoch": 1.1887201735357917, "grad_norm": 0.008906560126620762, "learning_rate": 3.866589808450342e-06, "loss": 0.0004, "step": 3288 }, { "epoch": 1.1890817064352857, "grad_norm": 0.5355135765808786, "learning_rate": 3.863678675982782e-06, "loss": 0.1914, "step": 3289 }, { "epoch": 1.1894432393347794, "grad_norm": 0.04013736246819589, "learning_rate": 3.860767949636223e-06, "loss": 0.0019, "step": 3290 }, { "epoch": 1.1898047722342733, "grad_norm": 0.2891307035796137, "learning_rate": 3.857857630450957e-06, "loss": 0.0102, "step": 3291 }, { "epoch": 1.1901663051337672, "grad_norm": 0.12951517939699547, "learning_rate": 3.854947719467134e-06, "loss": 0.0203, "step": 3292 }, { "epoch": 1.1905278380332611, "grad_norm": 0.7031092071210152, "learning_rate": 3.8520382177247525e-06, "loss": 0.0527, "step": 3293 }, { "epoch": 1.1908893709327548, "grad_norm": 0.6034565436645934, "learning_rate": 3.849129126263671e-06, "loss": 0.1055, "step": 3294 }, { "epoch": 1.1912509038322487, "grad_norm": 0.19000226390121297, "learning_rate": 3.846220446123599e-06, "loss": 0.0254, "step": 3295 }, { "epoch": 1.1916124367317427, "grad_norm": 0.09715399233137893, "learning_rate": 3.843312178344093e-06, "loss": 0.0161, "step": 3296 }, { "epoch": 1.1919739696312364, "grad_norm": 0.5356605716660225, "learning_rate": 3.840404323964572e-06, "loss": 0.1602, "step": 3297 }, { "epoch": 1.1923355025307303, "grad_norm": 0.6772677657350197, "learning_rate": 3.837496884024299e-06, "loss": 0.083, "step": 3298 }, { "epoch": 1.1926970354302242, "grad_norm": 0.12060212626227106, "learning_rate": 3.834589859562396e-06, "loss": 0.0203, "step": 3299 }, { "epoch": 1.1930585683297181, "grad_norm": 2.939237170506656, "learning_rate": 3.831683251617832e-06, "loss": 0.1504, "step": 3300 }, { "epoch": 1.1934201012292118, "grad_norm": 0.45958715235706415, "learning_rate": 3.828777061229426e-06, "loss": 0.1504, "step": 3301 }, { "epoch": 1.1937816341287057, "grad_norm": 0.5667699012381602, "learning_rate": 3.825871289435851e-06, "loss": 0.1406, "step": 3302 }, { "epoch": 1.1941431670281997, "grad_norm": 0.1328051146478648, "learning_rate": 3.822965937275629e-06, "loss": 0.0056, "step": 3303 }, { "epoch": 1.1945046999276934, "grad_norm": 0.1759009855135708, "learning_rate": 3.820061005787133e-06, "loss": 0.0283, "step": 3304 }, { "epoch": 1.1948662328271873, "grad_norm": 0.18232184718208896, "learning_rate": 3.817156496008587e-06, "loss": 0.0283, "step": 3305 }, { "epoch": 1.1952277657266812, "grad_norm": 0.12806456355823112, "learning_rate": 3.8142524089780564e-06, "loss": 0.0254, "step": 3306 }, { "epoch": 1.195589298626175, "grad_norm": 0.0013235597689984595, "learning_rate": 3.8113487457334657e-06, "loss": 0.0001, "step": 3307 }, { "epoch": 1.1959508315256688, "grad_norm": 0.14515678548811956, "learning_rate": 3.808445507312582e-06, "loss": 0.0254, "step": 3308 }, { "epoch": 1.1963123644251628, "grad_norm": 0.016693887424009086, "learning_rate": 3.805542694753023e-06, "loss": 0.0006, "step": 3309 }, { "epoch": 1.1966738973246565, "grad_norm": 0.08560094696119229, "learning_rate": 3.8026403090922544e-06, "loss": 0.0044, "step": 3310 }, { "epoch": 1.1970354302241504, "grad_norm": 0.19177932338396164, "learning_rate": 3.799738351367584e-06, "loss": 0.0254, "step": 3311 }, { "epoch": 1.1973969631236443, "grad_norm": 0.496161573050873, "learning_rate": 3.7968368226161743e-06, "loss": 0.0476, "step": 3312 }, { "epoch": 1.197758496023138, "grad_norm": 0.6378219912603675, "learning_rate": 3.7939357238750302e-06, "loss": 0.1226, "step": 3313 }, { "epoch": 1.198120028922632, "grad_norm": 0.3477108212753226, "learning_rate": 3.7910350561810045e-06, "loss": 0.0388, "step": 3314 }, { "epoch": 1.1984815618221258, "grad_norm": 0.09960174313860762, "learning_rate": 3.788134820570796e-06, "loss": 0.0056, "step": 3315 }, { "epoch": 1.1988430947216198, "grad_norm": 0.4825870962411264, "learning_rate": 3.7852350180809437e-06, "loss": 0.0432, "step": 3316 }, { "epoch": 1.1992046276211135, "grad_norm": 0.20907279885005595, "learning_rate": 3.7823356497478414e-06, "loss": 0.0227, "step": 3317 }, { "epoch": 1.1995661605206074, "grad_norm": 0.7654088167468931, "learning_rate": 3.7794367166077194e-06, "loss": 0.0527, "step": 3318 }, { "epoch": 1.1999276934201013, "grad_norm": 0.5455533185019569, "learning_rate": 3.7765382196966588e-06, "loss": 0.0476, "step": 3319 }, { "epoch": 1.200289226319595, "grad_norm": 0.2272815103493402, "learning_rate": 3.773640160050581e-06, "loss": 0.0203, "step": 3320 }, { "epoch": 1.200650759219089, "grad_norm": 0.3208373958905537, "learning_rate": 3.77074253870525e-06, "loss": 0.0317, "step": 3321 }, { "epoch": 1.2010122921185828, "grad_norm": 0.4205490491821074, "learning_rate": 3.7678453566962763e-06, "loss": 0.0388, "step": 3322 }, { "epoch": 1.2013738250180768, "grad_norm": 0.2325265899137673, "learning_rate": 3.7649486150591115e-06, "loss": 0.0315, "step": 3323 }, { "epoch": 1.2017353579175705, "grad_norm": 0.0027301881690192055, "learning_rate": 3.7620523148290517e-06, "loss": 0.0001, "step": 3324 }, { "epoch": 1.2020968908170644, "grad_norm": 0.32563883322652143, "learning_rate": 3.7591564570412343e-06, "loss": 0.0432, "step": 3325 }, { "epoch": 1.2024584237165583, "grad_norm": 0.08860267342887282, "learning_rate": 3.7562610427306357e-06, "loss": 0.0161, "step": 3326 }, { "epoch": 1.202819956616052, "grad_norm": 1.0837710915477354, "learning_rate": 3.7533660729320785e-06, "loss": 0.0903, "step": 3327 }, { "epoch": 1.203181489515546, "grad_norm": 0.14872960890840312, "learning_rate": 3.7504715486802234e-06, "loss": 0.008, "step": 3328 }, { "epoch": 1.2035430224150399, "grad_norm": 0.27566892880541194, "learning_rate": 3.7475774710095736e-06, "loss": 0.0283, "step": 3329 }, { "epoch": 1.2039045553145336, "grad_norm": 0.5330870912489559, "learning_rate": 3.7446838409544708e-06, "loss": 0.1309, "step": 3330 }, { "epoch": 1.2042660882140275, "grad_norm": 0.01767477268736957, "learning_rate": 3.7417906595490993e-06, "loss": 0.0006, "step": 3331 }, { "epoch": 1.2046276211135214, "grad_norm": 0.12834880274637817, "learning_rate": 3.7388979278274806e-06, "loss": 0.0063, "step": 3332 }, { "epoch": 1.204989154013015, "grad_norm": 0.7729325504052271, "learning_rate": 3.736005646823475e-06, "loss": 0.0635, "step": 3333 }, { "epoch": 1.205350686912509, "grad_norm": 1.1097597460223163, "learning_rate": 3.733113817570785e-06, "loss": 0.0579, "step": 3334 }, { "epoch": 1.205712219812003, "grad_norm": 0.931635891448395, "learning_rate": 3.7302224411029487e-06, "loss": 0.0527, "step": 3335 }, { "epoch": 1.2060737527114966, "grad_norm": 0.001077630533058801, "learning_rate": 3.7273315184533465e-06, "loss": 0.0, "step": 3336 }, { "epoch": 1.2064352856109906, "grad_norm": 2.6084150043182524, "learning_rate": 3.724441050655189e-06, "loss": 0.5508, "step": 3337 }, { "epoch": 1.2067968185104845, "grad_norm": 0.2238077629672442, "learning_rate": 3.7215510387415305e-06, "loss": 0.0254, "step": 3338 }, { "epoch": 1.2071583514099784, "grad_norm": 0.01195119703182089, "learning_rate": 3.7186614837452617e-06, "loss": 0.0004, "step": 3339 }, { "epoch": 1.207519884309472, "grad_norm": 0.0009292770150414313, "learning_rate": 3.7157723866991067e-06, "loss": 0.0, "step": 3340 }, { "epoch": 1.207881417208966, "grad_norm": 0.8155046696499398, "learning_rate": 3.712883748635633e-06, "loss": 0.1055, "step": 3341 }, { "epoch": 1.20824295010846, "grad_norm": 0.7362077741519121, "learning_rate": 3.709995570587234e-06, "loss": 0.0352, "step": 3342 }, { "epoch": 1.2086044830079536, "grad_norm": 0.0014599362082723351, "learning_rate": 3.7071078535861447e-06, "loss": 0.0001, "step": 3343 }, { "epoch": 1.2089660159074476, "grad_norm": 0.24451356378047523, "learning_rate": 3.704220598664437e-06, "loss": 0.0254, "step": 3344 }, { "epoch": 1.2093275488069415, "grad_norm": 0.01301864122023656, "learning_rate": 3.701333806854013e-06, "loss": 0.0005, "step": 3345 }, { "epoch": 1.2096890817064354, "grad_norm": 0.14635912141315455, "learning_rate": 3.6984474791866136e-06, "loss": 0.0227, "step": 3346 }, { "epoch": 1.210050614605929, "grad_norm": 0.0019040162533354485, "learning_rate": 3.69556161669381e-06, "loss": 0.0001, "step": 3347 }, { "epoch": 1.210412147505423, "grad_norm": 0.30891163902594304, "learning_rate": 3.6926762204070086e-06, "loss": 0.0315, "step": 3348 }, { "epoch": 1.210773680404917, "grad_norm": 0.5176090337974243, "learning_rate": 3.6897912913574505e-06, "loss": 0.0476, "step": 3349 }, { "epoch": 1.2111352133044107, "grad_norm": 0.00172804170656631, "learning_rate": 3.686906830576208e-06, "loss": 0.0001, "step": 3350 }, { "epoch": 1.2114967462039046, "grad_norm": 0.002936999396504436, "learning_rate": 3.684022839094189e-06, "loss": 0.0001, "step": 3351 }, { "epoch": 1.2118582791033985, "grad_norm": 0.8350420863457967, "learning_rate": 3.6811393179421285e-06, "loss": 0.1309, "step": 3352 }, { "epoch": 1.2122198120028922, "grad_norm": 0.04574998352279994, "learning_rate": 3.6782562681505963e-06, "loss": 0.0021, "step": 3353 }, { "epoch": 1.2125813449023861, "grad_norm": 0.130141600378419, "learning_rate": 3.675373690749996e-06, "loss": 0.0227, "step": 3354 }, { "epoch": 1.21294287780188, "grad_norm": 0.03608391124045306, "learning_rate": 3.672491586770558e-06, "loss": 0.0015, "step": 3355 }, { "epoch": 1.2133044107013737, "grad_norm": 0.4798674724714001, "learning_rate": 3.6696099572423484e-06, "loss": 0.0317, "step": 3356 }, { "epoch": 1.2136659436008677, "grad_norm": 0.2008854361575316, "learning_rate": 3.6667288031952584e-06, "loss": 0.0203, "step": 3357 }, { "epoch": 1.2140274765003616, "grad_norm": 0.22519285358748814, "learning_rate": 3.6638481256590123e-06, "loss": 0.0315, "step": 3358 }, { "epoch": 1.2143890093998553, "grad_norm": 1.0567409615272687, "learning_rate": 3.6609679256631647e-06, "loss": 0.1226, "step": 3359 }, { "epoch": 1.2147505422993492, "grad_norm": 0.17997886076119685, "learning_rate": 3.6580882042370974e-06, "loss": 0.0227, "step": 3360 }, { "epoch": 1.2151120751988431, "grad_norm": 0.08898316955334137, "learning_rate": 3.6552089624100244e-06, "loss": 0.0161, "step": 3361 }, { "epoch": 1.215473608098337, "grad_norm": 0.8824832409225254, "learning_rate": 3.6523302012109835e-06, "loss": 0.0527, "step": 3362 }, { "epoch": 1.2158351409978307, "grad_norm": 0.0034591188060307176, "learning_rate": 3.649451921668843e-06, "loss": 0.0001, "step": 3363 }, { "epoch": 1.2161966738973247, "grad_norm": 3.3010946987879355, "learning_rate": 3.646574124812302e-06, "loss": 0.2129, "step": 3364 }, { "epoch": 1.2165582067968186, "grad_norm": 0.6737829355344267, "learning_rate": 3.643696811669882e-06, "loss": 0.1699, "step": 3365 }, { "epoch": 1.2169197396963123, "grad_norm": 0.0965266025409756, "learning_rate": 3.6408199832699377e-06, "loss": 0.0161, "step": 3366 }, { "epoch": 1.2172812725958062, "grad_norm": 0.7586743770607414, "learning_rate": 3.6379436406406426e-06, "loss": 0.0693, "step": 3367 }, { "epoch": 1.2176428054953001, "grad_norm": 0.09911910048636548, "learning_rate": 3.6350677848100025e-06, "loss": 0.0071, "step": 3368 }, { "epoch": 1.218004338394794, "grad_norm": 0.7009500223319419, "learning_rate": 3.6321924168058487e-06, "loss": 0.1143, "step": 3369 }, { "epoch": 1.2183658712942878, "grad_norm": 0.11335970225261599, "learning_rate": 3.629317537655836e-06, "loss": 0.0181, "step": 3370 }, { "epoch": 1.2187274041937817, "grad_norm": 0.2527048948459969, "learning_rate": 3.626443148387447e-06, "loss": 0.0254, "step": 3371 }, { "epoch": 1.2190889370932756, "grad_norm": 0.3825486396784555, "learning_rate": 3.623569250027987e-06, "loss": 0.0181, "step": 3372 }, { "epoch": 1.2194504699927693, "grad_norm": 0.22188368939495748, "learning_rate": 3.6206958436045856e-06, "loss": 0.0101, "step": 3373 }, { "epoch": 1.2198120028922632, "grad_norm": 0.18883820892045844, "learning_rate": 3.617822930144199e-06, "loss": 0.0254, "step": 3374 }, { "epoch": 1.2201735357917571, "grad_norm": 1.9695737687948496, "learning_rate": 3.614950510673605e-06, "loss": 0.3164, "step": 3375 }, { "epoch": 1.2205350686912508, "grad_norm": 0.02115576182053836, "learning_rate": 3.6120785862194075e-06, "loss": 0.001, "step": 3376 }, { "epoch": 1.2208966015907448, "grad_norm": 0.0308629425794457, "learning_rate": 3.6092071578080306e-06, "loss": 0.0021, "step": 3377 }, { "epoch": 1.2212581344902387, "grad_norm": 0.2023422846619071, "learning_rate": 3.60633622646572e-06, "loss": 0.0283, "step": 3378 }, { "epoch": 1.2216196673897324, "grad_norm": 0.6461074123376266, "learning_rate": 3.603465793218549e-06, "loss": 0.1226, "step": 3379 }, { "epoch": 1.2219812002892263, "grad_norm": 0.004399176426089695, "learning_rate": 3.6005958590924085e-06, "loss": 0.0002, "step": 3380 }, { "epoch": 1.2223427331887202, "grad_norm": 0.002713797763170247, "learning_rate": 3.5977264251130127e-06, "loss": 0.0001, "step": 3381 }, { "epoch": 1.222704266088214, "grad_norm": 0.8982558205898862, "learning_rate": 3.5948574923058975e-06, "loss": 0.0476, "step": 3382 }, { "epoch": 1.2230657989877078, "grad_norm": 0.025620745938349017, "learning_rate": 3.591989061696417e-06, "loss": 0.001, "step": 3383 }, { "epoch": 1.2234273318872018, "grad_norm": 0.28514814343822736, "learning_rate": 3.5891211343097492e-06, "loss": 0.0352, "step": 3384 }, { "epoch": 1.2237888647866957, "grad_norm": 0.19341007642785069, "learning_rate": 3.5862537111708895e-06, "loss": 0.0102, "step": 3385 }, { "epoch": 1.2241503976861894, "grad_norm": 0.14522576948293495, "learning_rate": 3.583386793304655e-06, "loss": 0.0227, "step": 3386 }, { "epoch": 1.2245119305856833, "grad_norm": 0.22419184249908639, "learning_rate": 3.5805203817356837e-06, "loss": 0.0254, "step": 3387 }, { "epoch": 1.2248734634851772, "grad_norm": 0.5544231002829935, "learning_rate": 3.5776544774884263e-06, "loss": 0.0476, "step": 3388 }, { "epoch": 1.225234996384671, "grad_norm": 0.3218676233551886, "learning_rate": 3.5747890815871596e-06, "loss": 0.0352, "step": 3389 }, { "epoch": 1.2255965292841648, "grad_norm": 0.14921803894519708, "learning_rate": 3.5719241950559726e-06, "loss": 0.0081, "step": 3390 }, { "epoch": 1.2259580621836588, "grad_norm": 0.018740508398297034, "learning_rate": 3.5690598189187787e-06, "loss": 0.0009, "step": 3391 }, { "epoch": 1.2263195950831527, "grad_norm": 0.18722077963773223, "learning_rate": 3.566195954199304e-06, "loss": 0.0227, "step": 3392 }, { "epoch": 1.2266811279826464, "grad_norm": 0.559128133213107, "learning_rate": 3.5633326019210914e-06, "loss": 0.0388, "step": 3393 }, { "epoch": 1.2270426608821403, "grad_norm": 1.695846479662406, "learning_rate": 3.5604697631075035e-06, "loss": 0.2236, "step": 3394 }, { "epoch": 1.2274041937816342, "grad_norm": 0.7569969580993557, "learning_rate": 3.5576074387817184e-06, "loss": 0.1406, "step": 3395 }, { "epoch": 1.227765726681128, "grad_norm": 0.09531502846509433, "learning_rate": 3.554745629966731e-06, "loss": 0.0161, "step": 3396 }, { "epoch": 1.2281272595806219, "grad_norm": 2.036101724017435, "learning_rate": 3.5518843376853497e-06, "loss": 0.1055, "step": 3397 }, { "epoch": 1.2284887924801158, "grad_norm": 0.009881260866462925, "learning_rate": 3.549023562960202e-06, "loss": 0.0004, "step": 3398 }, { "epoch": 1.2288503253796095, "grad_norm": 0.1844551488738463, "learning_rate": 3.5461633068137256e-06, "loss": 0.0227, "step": 3399 }, { "epoch": 1.2292118582791034, "grad_norm": 0.19636774261352438, "learning_rate": 3.543303570268176e-06, "loss": 0.0129, "step": 3400 }, { "epoch": 1.2295733911785973, "grad_norm": 0.18363861188869712, "learning_rate": 3.540444354345624e-06, "loss": 0.0227, "step": 3401 }, { "epoch": 1.229934924078091, "grad_norm": 0.09671154854241475, "learning_rate": 3.53758566006795e-06, "loss": 0.0143, "step": 3402 }, { "epoch": 1.230296456977585, "grad_norm": 1.344973274218652, "learning_rate": 3.534727488456856e-06, "loss": 0.2129, "step": 3403 }, { "epoch": 1.2306579898770789, "grad_norm": 0.10490650383522049, "learning_rate": 3.5318698405338458e-06, "loss": 0.0181, "step": 3404 }, { "epoch": 1.2310195227765726, "grad_norm": 0.42553796154563645, "learning_rate": 3.529012717320245e-06, "loss": 0.0432, "step": 3405 }, { "epoch": 1.2313810556760665, "grad_norm": 0.7501343203075669, "learning_rate": 3.5261561198371887e-06, "loss": 0.1143, "step": 3406 }, { "epoch": 1.2317425885755604, "grad_norm": 0.002750960404575559, "learning_rate": 3.5233000491056236e-06, "loss": 0.0001, "step": 3407 }, { "epoch": 1.2321041214750543, "grad_norm": 0.8809967679620907, "learning_rate": 3.520444506146311e-06, "loss": 0.1143, "step": 3408 }, { "epoch": 1.232465654374548, "grad_norm": 0.49640708462356975, "learning_rate": 3.5175894919798186e-06, "loss": 0.0693, "step": 3409 }, { "epoch": 1.232827187274042, "grad_norm": 0.9546842670527637, "learning_rate": 3.5147350076265287e-06, "loss": 0.0476, "step": 3410 }, { "epoch": 1.2331887201735359, "grad_norm": 1.6072054222278966, "learning_rate": 3.511881054106634e-06, "loss": 0.2812, "step": 3411 }, { "epoch": 1.2335502530730296, "grad_norm": 0.5672184753850218, "learning_rate": 3.5090276324401353e-06, "loss": 0.1309, "step": 3412 }, { "epoch": 1.2339117859725235, "grad_norm": 0.5006666129048633, "learning_rate": 3.5061747436468485e-06, "loss": 0.0432, "step": 3413 }, { "epoch": 1.2342733188720174, "grad_norm": 0.49570223741199565, "learning_rate": 3.5033223887463918e-06, "loss": 0.0162, "step": 3414 }, { "epoch": 1.234634851771511, "grad_norm": 0.19290038477093652, "learning_rate": 3.5004705687581963e-06, "loss": 0.0254, "step": 3415 }, { "epoch": 1.234996384671005, "grad_norm": 0.24290138061976682, "learning_rate": 3.4976192847015045e-06, "loss": 0.0315, "step": 3416 }, { "epoch": 1.235357917570499, "grad_norm": 0.11131064612624482, "learning_rate": 3.494768537595362e-06, "loss": 0.0161, "step": 3417 }, { "epoch": 1.2357194504699929, "grad_norm": 1.0852167416802425, "learning_rate": 3.491918328458629e-06, "loss": 0.0476, "step": 3418 }, { "epoch": 1.2360809833694866, "grad_norm": 0.1266649705094523, "learning_rate": 3.489068658309965e-06, "loss": 0.0031, "step": 3419 }, { "epoch": 1.2364425162689805, "grad_norm": 0.004658374044650289, "learning_rate": 3.486219528167844e-06, "loss": 0.0002, "step": 3420 }, { "epoch": 1.2368040491684744, "grad_norm": 0.1918733035833656, "learning_rate": 3.4833709390505443e-06, "loss": 0.0227, "step": 3421 }, { "epoch": 1.2371655820679681, "grad_norm": 0.29177937472488585, "learning_rate": 3.48052289197615e-06, "loss": 0.0352, "step": 3422 }, { "epoch": 1.237527114967462, "grad_norm": 0.4487917334216807, "learning_rate": 3.4776753879625563e-06, "loss": 0.0317, "step": 3423 }, { "epoch": 1.237888647866956, "grad_norm": 0.20807846992656207, "learning_rate": 3.4748284280274557e-06, "loss": 0.0203, "step": 3424 }, { "epoch": 1.2382501807664497, "grad_norm": 1.0807607605686103, "learning_rate": 3.471982013188353e-06, "loss": 0.0693, "step": 3425 }, { "epoch": 1.2386117136659436, "grad_norm": 0.015798887085747594, "learning_rate": 3.4691361444625564e-06, "loss": 0.0008, "step": 3426 }, { "epoch": 1.2389732465654375, "grad_norm": 0.9021276202659368, "learning_rate": 3.4662908228671776e-06, "loss": 0.0352, "step": 3427 }, { "epoch": 1.2393347794649312, "grad_norm": 0.8771765627853131, "learning_rate": 3.463446049419138e-06, "loss": 0.1143, "step": 3428 }, { "epoch": 1.2396963123644251, "grad_norm": 0.7084228513350737, "learning_rate": 3.460601825135155e-06, "loss": 0.1699, "step": 3429 }, { "epoch": 1.240057845263919, "grad_norm": 0.2552549893154507, "learning_rate": 3.457758151031753e-06, "loss": 0.0227, "step": 3430 }, { "epoch": 1.240419378163413, "grad_norm": 0.003865876068045885, "learning_rate": 3.4549150281252635e-06, "loss": 0.0002, "step": 3431 }, { "epoch": 1.2407809110629067, "grad_norm": 0.6701861520800841, "learning_rate": 3.452072457431816e-06, "loss": 0.0317, "step": 3432 }, { "epoch": 1.2411424439624006, "grad_norm": 0.133789123225824, "learning_rate": 3.4492304399673476e-06, "loss": 0.0203, "step": 3433 }, { "epoch": 1.2415039768618945, "grad_norm": 0.6920677577113451, "learning_rate": 3.4463889767475917e-06, "loss": 0.1504, "step": 3434 }, { "epoch": 1.2418655097613882, "grad_norm": 0.1172167223993457, "learning_rate": 3.4435480687880867e-06, "loss": 0.0161, "step": 3435 }, { "epoch": 1.2422270426608821, "grad_norm": 0.7216851223592282, "learning_rate": 3.4407077171041748e-06, "loss": 0.1699, "step": 3436 }, { "epoch": 1.242588575560376, "grad_norm": 0.8243999165747702, "learning_rate": 3.4378679227109936e-06, "loss": 0.0693, "step": 3437 }, { "epoch": 1.2429501084598698, "grad_norm": 0.05419856475311039, "learning_rate": 3.43502868662349e-06, "loss": 0.0044, "step": 3438 }, { "epoch": 1.2433116413593637, "grad_norm": 0.625609381371557, "learning_rate": 3.4321900098564024e-06, "loss": 0.1807, "step": 3439 }, { "epoch": 1.2436731742588576, "grad_norm": 0.17756071590208916, "learning_rate": 3.429351893424273e-06, "loss": 0.0227, "step": 3440 }, { "epoch": 1.2440347071583515, "grad_norm": 0.1869568718322693, "learning_rate": 3.4265143383414473e-06, "loss": 0.0115, "step": 3441 }, { "epoch": 1.2443962400578452, "grad_norm": 0.32480330007320707, "learning_rate": 3.4236773456220633e-06, "loss": 0.0349, "step": 3442 }, { "epoch": 1.2447577729573391, "grad_norm": 0.16169693024161338, "learning_rate": 3.420840916280066e-06, "loss": 0.0254, "step": 3443 }, { "epoch": 1.245119305856833, "grad_norm": 0.03198140349487292, "learning_rate": 3.41800505132919e-06, "loss": 0.0012, "step": 3444 }, { "epoch": 1.2454808387563268, "grad_norm": 0.09459034581222014, "learning_rate": 3.415169751782974e-06, "loss": 0.0045, "step": 3445 }, { "epoch": 1.2458423716558207, "grad_norm": 0.2658516744883108, "learning_rate": 3.412335018654756e-06, "loss": 0.0283, "step": 3446 }, { "epoch": 1.2462039045553146, "grad_norm": 0.45794432290939335, "learning_rate": 3.4095008529576655e-06, "loss": 0.0432, "step": 3447 }, { "epoch": 1.2465654374548083, "grad_norm": 0.010880841837936809, "learning_rate": 3.4066672557046372e-06, "loss": 0.0003, "step": 3448 }, { "epoch": 1.2469269703543022, "grad_norm": 0.34396786452156064, "learning_rate": 3.4038342279083934e-06, "loss": 0.0204, "step": 3449 }, { "epoch": 1.2472885032537961, "grad_norm": 0.5360459216840692, "learning_rate": 3.40100177058146e-06, "loss": 0.1699, "step": 3450 }, { "epoch": 1.2476500361532898, "grad_norm": 0.8409219837363635, "learning_rate": 3.3981698847361567e-06, "loss": 0.1699, "step": 3451 }, { "epoch": 1.2480115690527838, "grad_norm": 0.6977362187930646, "learning_rate": 3.3953385713845976e-06, "loss": 0.0693, "step": 3452 }, { "epoch": 1.2483731019522777, "grad_norm": 0.8992999316997777, "learning_rate": 3.3925078315386963e-06, "loss": 0.0762, "step": 3453 }, { "epoch": 1.2487346348517716, "grad_norm": 0.47939628240443233, "learning_rate": 3.389677666210156e-06, "loss": 0.0476, "step": 3454 }, { "epoch": 1.2490961677512653, "grad_norm": 0.5124478174617338, "learning_rate": 3.3868480764104762e-06, "loss": 0.0579, "step": 3455 }, { "epoch": 1.2494577006507592, "grad_norm": 0.1402029920913206, "learning_rate": 3.384019063150955e-06, "loss": 0.0227, "step": 3456 }, { "epoch": 1.2498192335502532, "grad_norm": 0.56582096942904, "learning_rate": 3.381190627442679e-06, "loss": 0.1143, "step": 3457 }, { "epoch": 1.2501807664497468, "grad_norm": 0.17962670813083423, "learning_rate": 3.3783627702965326e-06, "loss": 0.0283, "step": 3458 }, { "epoch": 1.2505422993492408, "grad_norm": 0.6269416345325163, "learning_rate": 3.3755354927231892e-06, "loss": 0.083, "step": 3459 }, { "epoch": 1.2509038322487347, "grad_norm": 1.8828307483411115, "learning_rate": 3.372708795733116e-06, "loss": 0.0762, "step": 3460 }, { "epoch": 1.2512653651482286, "grad_norm": 0.3297960185182034, "learning_rate": 3.3698826803365783e-06, "loss": 0.0352, "step": 3461 }, { "epoch": 1.2516268980477223, "grad_norm": 0.20673938970166036, "learning_rate": 3.367057147543624e-06, "loss": 0.0182, "step": 3462 }, { "epoch": 1.2519884309472162, "grad_norm": 1.1954939057892555, "learning_rate": 3.3642321983641035e-06, "loss": 0.1143, "step": 3463 }, { "epoch": 1.2523499638467102, "grad_norm": 0.049609043836987915, "learning_rate": 3.3614078338076494e-06, "loss": 0.0021, "step": 3464 }, { "epoch": 1.2527114967462039, "grad_norm": 0.38384053542669194, "learning_rate": 3.3585840548836878e-06, "loss": 0.0145, "step": 3465 }, { "epoch": 1.2530730296456978, "grad_norm": 1.0162127005349233, "learning_rate": 3.3557608626014402e-06, "loss": 0.083, "step": 3466 }, { "epoch": 1.2534345625451917, "grad_norm": 0.14349185386850533, "learning_rate": 3.3529382579699123e-06, "loss": 0.0283, "step": 3467 }, { "epoch": 1.2537960954446854, "grad_norm": 0.1333031988339871, "learning_rate": 3.3501162419979038e-06, "loss": 0.0227, "step": 3468 }, { "epoch": 1.2541576283441793, "grad_norm": 0.563113667717659, "learning_rate": 3.347294815694002e-06, "loss": 0.1504, "step": 3469 }, { "epoch": 1.2545191612436732, "grad_norm": 0.001622090296025762, "learning_rate": 3.344473980066586e-06, "loss": 0.0001, "step": 3470 }, { "epoch": 1.254880694143167, "grad_norm": 0.5890488742877563, "learning_rate": 3.341653736123819e-06, "loss": 0.1406, "step": 3471 }, { "epoch": 1.2552422270426609, "grad_norm": 0.19577313052202758, "learning_rate": 3.3388340848736557e-06, "loss": 0.0254, "step": 3472 }, { "epoch": 1.2556037599421548, "grad_norm": 0.44946758378912327, "learning_rate": 3.3360150273238413e-06, "loss": 0.0182, "step": 3473 }, { "epoch": 1.2559652928416485, "grad_norm": 0.008198122980037041, "learning_rate": 3.3331965644819037e-06, "loss": 0.0004, "step": 3474 }, { "epoch": 1.2563268257411424, "grad_norm": 0.11410015999609102, "learning_rate": 3.3303786973551643e-06, "loss": 0.0203, "step": 3475 }, { "epoch": 1.2566883586406363, "grad_norm": 0.27749715212757453, "learning_rate": 3.327561426950725e-06, "loss": 0.0317, "step": 3476 }, { "epoch": 1.25704989154013, "grad_norm": 1.7753339638053236, "learning_rate": 3.324744754275477e-06, "loss": 0.2129, "step": 3477 }, { "epoch": 1.257411424439624, "grad_norm": 0.29866707105351475, "learning_rate": 3.321928680336103e-06, "loss": 0.0432, "step": 3478 }, { "epoch": 1.2577729573391179, "grad_norm": 0.7055936991834245, "learning_rate": 3.319113206139062e-06, "loss": 0.0635, "step": 3479 }, { "epoch": 1.2581344902386118, "grad_norm": 0.02455639346739285, "learning_rate": 3.316298332690609e-06, "loss": 0.0005, "step": 3480 }, { "epoch": 1.2584960231381055, "grad_norm": 0.510542483775039, "learning_rate": 3.3134840609967756e-06, "loss": 0.0527, "step": 3481 }, { "epoch": 1.2588575560375994, "grad_norm": 0.4850639828373699, "learning_rate": 3.3106703920633814e-06, "loss": 0.0352, "step": 3482 }, { "epoch": 1.2592190889370933, "grad_norm": 0.20007930471184454, "learning_rate": 3.307857326896034e-06, "loss": 0.0283, "step": 3483 }, { "epoch": 1.2595806218365873, "grad_norm": 0.4451572952349697, "learning_rate": 3.30504486650012e-06, "loss": 0.0579, "step": 3484 }, { "epoch": 1.259942154736081, "grad_norm": 0.5097405047883983, "learning_rate": 3.3022330118808142e-06, "loss": 0.0283, "step": 3485 }, { "epoch": 1.2603036876355749, "grad_norm": 0.25218440904038963, "learning_rate": 3.2994217640430715e-06, "loss": 0.0349, "step": 3486 }, { "epoch": 1.2606652205350688, "grad_norm": 0.17855204654080423, "learning_rate": 3.2966111239916305e-06, "loss": 0.0254, "step": 3487 }, { "epoch": 1.2610267534345625, "grad_norm": 0.01885523561129737, "learning_rate": 3.2938010927310147e-06, "loss": 0.0009, "step": 3488 }, { "epoch": 1.2613882863340564, "grad_norm": 0.4260792534583207, "learning_rate": 3.2909916712655278e-06, "loss": 0.1504, "step": 3489 }, { "epoch": 1.2617498192335503, "grad_norm": 0.6865631846794368, "learning_rate": 3.2881828605992587e-06, "loss": 0.0432, "step": 3490 }, { "epoch": 1.262111352133044, "grad_norm": 0.1611348663140872, "learning_rate": 3.2853746617360727e-06, "loss": 0.0227, "step": 3491 }, { "epoch": 1.262472885032538, "grad_norm": 0.5210150916730134, "learning_rate": 3.28256707567962e-06, "loss": 0.0317, "step": 3492 }, { "epoch": 1.2628344179320319, "grad_norm": 0.8434245188082746, "learning_rate": 3.2797601034333333e-06, "loss": 0.0283, "step": 3493 }, { "epoch": 1.2631959508315256, "grad_norm": 0.0834881029513935, "learning_rate": 3.276953746000421e-06, "loss": 0.0039, "step": 3494 }, { "epoch": 1.2635574837310195, "grad_norm": 0.3214538729993332, "learning_rate": 3.2741480043838793e-06, "loss": 0.0476, "step": 3495 }, { "epoch": 1.2639190166305134, "grad_norm": 0.2556462337850125, "learning_rate": 3.2713428795864743e-06, "loss": 0.008, "step": 3496 }, { "epoch": 1.2642805495300071, "grad_norm": 0.30566015155604587, "learning_rate": 3.268538372610759e-06, "loss": 0.0432, "step": 3497 }, { "epoch": 1.264642082429501, "grad_norm": 0.8125523719157267, "learning_rate": 3.2657344844590644e-06, "loss": 0.1055, "step": 3498 }, { "epoch": 1.265003615328995, "grad_norm": 0.5861650134898254, "learning_rate": 3.262931216133499e-06, "loss": 0.1309, "step": 3499 }, { "epoch": 1.2653651482284887, "grad_norm": 0.5827134037408606, "learning_rate": 3.2601285686359517e-06, "loss": 0.0118, "step": 3500 }, { "epoch": 1.2657266811279826, "grad_norm": 0.6206429136841474, "learning_rate": 3.2573265429680855e-06, "loss": 0.0388, "step": 3501 }, { "epoch": 1.2660882140274765, "grad_norm": 0.43776366176607695, "learning_rate": 3.254525140131345e-06, "loss": 0.0476, "step": 3502 }, { "epoch": 1.2664497469269704, "grad_norm": 0.022958311579478044, "learning_rate": 3.251724361126951e-06, "loss": 0.0013, "step": 3503 }, { "epoch": 1.2668112798264641, "grad_norm": 0.6446562462230239, "learning_rate": 3.248924206955901e-06, "loss": 0.0635, "step": 3504 }, { "epoch": 1.267172812725958, "grad_norm": 0.01645792327689497, "learning_rate": 3.246124678618972e-06, "loss": 0.0008, "step": 3505 }, { "epoch": 1.267534345625452, "grad_norm": 0.4264581729260234, "learning_rate": 3.2433257771167116e-06, "loss": 0.0527, "step": 3506 }, { "epoch": 1.267895878524946, "grad_norm": 0.3003548678766331, "learning_rate": 3.2405275034494468e-06, "loss": 0.0349, "step": 3507 }, { "epoch": 1.2682574114244396, "grad_norm": 0.4406227989755067, "learning_rate": 3.2377298586172816e-06, "loss": 0.0476, "step": 3508 }, { "epoch": 1.2686189443239335, "grad_norm": 0.6070486327608213, "learning_rate": 3.234932843620092e-06, "loss": 0.1226, "step": 3509 }, { "epoch": 1.2689804772234274, "grad_norm": 1.1059025827703362, "learning_rate": 3.2321364594575343e-06, "loss": 0.1226, "step": 3510 }, { "epoch": 1.2693420101229211, "grad_norm": 0.4720505036835229, "learning_rate": 3.229340707129031e-06, "loss": 0.0476, "step": 3511 }, { "epoch": 1.269703543022415, "grad_norm": 0.021662542435395664, "learning_rate": 3.2265455876337846e-06, "loss": 0.0009, "step": 3512 }, { "epoch": 1.270065075921909, "grad_norm": 0.07953036723276885, "learning_rate": 3.2237511019707725e-06, "loss": 0.0024, "step": 3513 }, { "epoch": 1.2704266088214027, "grad_norm": 0.8644961005784734, "learning_rate": 3.22095725113874e-06, "loss": 0.0283, "step": 3514 }, { "epoch": 1.2707881417208966, "grad_norm": 1.0663144763442913, "learning_rate": 3.218164036136213e-06, "loss": 0.1309, "step": 3515 }, { "epoch": 1.2711496746203905, "grad_norm": 0.1759491466309978, "learning_rate": 3.2153714579614818e-06, "loss": 0.0203, "step": 3516 }, { "epoch": 1.2715112075198842, "grad_norm": 0.020098629702821957, "learning_rate": 3.212579517612614e-06, "loss": 0.0009, "step": 3517 }, { "epoch": 1.2718727404193781, "grad_norm": 0.9106649029470553, "learning_rate": 3.2097882160874513e-06, "loss": 0.0693, "step": 3518 }, { "epoch": 1.272234273318872, "grad_norm": 0.002128438788340224, "learning_rate": 3.2069975543836007e-06, "loss": 0.0001, "step": 3519 }, { "epoch": 1.2725958062183658, "grad_norm": 0.13105921191728204, "learning_rate": 3.204207533498448e-06, "loss": 0.0254, "step": 3520 }, { "epoch": 1.2729573391178597, "grad_norm": 0.6992373779740884, "learning_rate": 3.2014181544291424e-06, "loss": 0.0762, "step": 3521 }, { "epoch": 1.2733188720173536, "grad_norm": 0.012159439015719906, "learning_rate": 3.1986294181726075e-06, "loss": 0.0006, "step": 3522 }, { "epoch": 1.2736804049168473, "grad_norm": 0.15504764461947934, "learning_rate": 3.1958413257255403e-06, "loss": 0.0203, "step": 3523 }, { "epoch": 1.2740419378163412, "grad_norm": 1.709605924669679, "learning_rate": 3.193053878084401e-06, "loss": 0.1504, "step": 3524 }, { "epoch": 1.2744034707158352, "grad_norm": 0.1595041993311184, "learning_rate": 3.1902670762454267e-06, "loss": 0.0315, "step": 3525 }, { "epoch": 1.274765003615329, "grad_norm": 0.2150137741986214, "learning_rate": 3.1874809212046166e-06, "loss": 0.0283, "step": 3526 }, { "epoch": 1.2751265365148228, "grad_norm": 0.6083684478311492, "learning_rate": 3.1846954139577414e-06, "loss": 0.0635, "step": 3527 }, { "epoch": 1.2754880694143167, "grad_norm": 0.09530963247494097, "learning_rate": 3.1819105555003426e-06, "loss": 0.0063, "step": 3528 }, { "epoch": 1.2758496023138106, "grad_norm": 0.014954171756452835, "learning_rate": 3.179126346827727e-06, "loss": 0.0009, "step": 3529 }, { "epoch": 1.2762111352133045, "grad_norm": 0.09717349705496091, "learning_rate": 3.176342788934973e-06, "loss": 0.0056, "step": 3530 }, { "epoch": 1.2765726681127982, "grad_norm": 0.795615390702428, "learning_rate": 3.17355988281692e-06, "loss": 0.0693, "step": 3531 }, { "epoch": 1.2769342010122922, "grad_norm": 1.1118438157645127, "learning_rate": 3.170777629468179e-06, "loss": 0.1055, "step": 3532 }, { "epoch": 1.277295733911786, "grad_norm": 0.23584557859084426, "learning_rate": 3.167996029883128e-06, "loss": 0.0317, "step": 3533 }, { "epoch": 1.2776572668112798, "grad_norm": 0.3241391929428164, "learning_rate": 3.165215085055908e-06, "loss": 0.0432, "step": 3534 }, { "epoch": 1.2780187997107737, "grad_norm": 0.8675360021696635, "learning_rate": 3.1624347959804314e-06, "loss": 0.083, "step": 3535 }, { "epoch": 1.2783803326102676, "grad_norm": 0.7031084832085599, "learning_rate": 3.15965516365037e-06, "loss": 0.1226, "step": 3536 }, { "epoch": 1.2787418655097613, "grad_norm": 0.016546838024904923, "learning_rate": 3.156876189059164e-06, "loss": 0.0006, "step": 3537 }, { "epoch": 1.2791033984092552, "grad_norm": 0.061434642184413246, "learning_rate": 3.15409787320002e-06, "loss": 0.0021, "step": 3538 }, { "epoch": 1.2794649313087492, "grad_norm": 0.01210699159578914, "learning_rate": 3.1513202170659053e-06, "loss": 0.0006, "step": 3539 }, { "epoch": 1.2798264642082429, "grad_norm": 0.2273098190403605, "learning_rate": 3.148543221649557e-06, "loss": 0.0283, "step": 3540 }, { "epoch": 1.2801879971077368, "grad_norm": 0.18009996300269138, "learning_rate": 3.145766887943468e-06, "loss": 0.0283, "step": 3541 }, { "epoch": 1.2805495300072307, "grad_norm": 0.5777664435823451, "learning_rate": 3.1429912169399047e-06, "loss": 0.0476, "step": 3542 }, { "epoch": 1.2809110629067244, "grad_norm": 0.6026223256936957, "learning_rate": 3.140216209630887e-06, "loss": 0.0693, "step": 3543 }, { "epoch": 1.2812725958062183, "grad_norm": 0.6340975397495083, "learning_rate": 3.137441867008203e-06, "loss": 0.0579, "step": 3544 }, { "epoch": 1.2816341287057122, "grad_norm": 19.572428460854, "learning_rate": 3.1346681900634045e-06, "loss": 6.8438, "step": 3545 }, { "epoch": 1.281995661605206, "grad_norm": 0.134527623422422, "learning_rate": 3.131895179787798e-06, "loss": 0.0227, "step": 3546 }, { "epoch": 1.2823571945046999, "grad_norm": 0.5224519234984297, "learning_rate": 3.129122837172463e-06, "loss": 0.1807, "step": 3547 }, { "epoch": 1.2827187274041938, "grad_norm": 0.29879786224897326, "learning_rate": 3.1263511632082306e-06, "loss": 0.0352, "step": 3548 }, { "epoch": 1.2830802603036877, "grad_norm": 0.19921147818349808, "learning_rate": 3.1235801588856956e-06, "loss": 0.0203, "step": 3549 }, { "epoch": 1.2834417932031814, "grad_norm": 0.6980543825812415, "learning_rate": 3.120809825195218e-06, "loss": 0.1055, "step": 3550 }, { "epoch": 1.2838033261026753, "grad_norm": 0.0019308595611562532, "learning_rate": 3.11804016312691e-06, "loss": 0.0001, "step": 3551 }, { "epoch": 1.2841648590021693, "grad_norm": 0.2666732478018059, "learning_rate": 3.115271173670652e-06, "loss": 0.0349, "step": 3552 }, { "epoch": 1.2845263919016632, "grad_norm": 0.1176389200335377, "learning_rate": 3.112502857816079e-06, "loss": 0.0203, "step": 3553 }, { "epoch": 1.2848879248011569, "grad_norm": 0.003235704651015278, "learning_rate": 3.109735216552585e-06, "loss": 0.0001, "step": 3554 }, { "epoch": 1.2852494577006508, "grad_norm": 0.6165272912569407, "learning_rate": 3.1069682508693276e-06, "loss": 0.0579, "step": 3555 }, { "epoch": 1.2856109906001447, "grad_norm": 0.72119658780679, "learning_rate": 3.1042019617552142e-06, "loss": 0.0693, "step": 3556 }, { "epoch": 1.2859725234996384, "grad_norm": 0.7886943848844881, "learning_rate": 3.101436350198924e-06, "loss": 0.1309, "step": 3557 }, { "epoch": 1.2863340563991323, "grad_norm": 0.1160975492108351, "learning_rate": 3.09867141718888e-06, "loss": 0.0181, "step": 3558 }, { "epoch": 1.2866955892986263, "grad_norm": 0.27542083151008995, "learning_rate": 3.0959071637132688e-06, "loss": 0.0129, "step": 3559 }, { "epoch": 1.28705712219812, "grad_norm": 0.09559187835465294, "learning_rate": 3.093143590760037e-06, "loss": 0.0056, "step": 3560 }, { "epoch": 1.2874186550976139, "grad_norm": 0.6475549997867561, "learning_rate": 3.090380699316882e-06, "loss": 0.0476, "step": 3561 }, { "epoch": 1.2877801879971078, "grad_norm": 0.6131494525277188, "learning_rate": 3.0876184903712637e-06, "loss": 0.1226, "step": 3562 }, { "epoch": 1.2881417208966015, "grad_norm": 0.07710056118970669, "learning_rate": 3.084856964910393e-06, "loss": 0.0128, "step": 3563 }, { "epoch": 1.2885032537960954, "grad_norm": 0.22055208954277053, "learning_rate": 3.082096123921238e-06, "loss": 0.0352, "step": 3564 }, { "epoch": 1.2888647866955893, "grad_norm": 0.8838730366528893, "learning_rate": 3.079335968390524e-06, "loss": 0.0977, "step": 3565 }, { "epoch": 1.289226319595083, "grad_norm": 1.3485814981395081, "learning_rate": 3.076576499304729e-06, "loss": 0.0579, "step": 3566 }, { "epoch": 1.289587852494577, "grad_norm": 0.15837510646354125, "learning_rate": 3.073817717650089e-06, "loss": 0.0227, "step": 3567 }, { "epoch": 1.289949385394071, "grad_norm": 0.2940370414066509, "learning_rate": 3.071059624412589e-06, "loss": 0.0203, "step": 3568 }, { "epoch": 1.2903109182935646, "grad_norm": 0.123960883286275, "learning_rate": 3.068302220577971e-06, "loss": 0.0203, "step": 3569 }, { "epoch": 1.2906724511930585, "grad_norm": 0.9245620041619823, "learning_rate": 3.0655455071317337e-06, "loss": 0.0762, "step": 3570 }, { "epoch": 1.2910339840925524, "grad_norm": 0.3607051954662658, "learning_rate": 3.062789485059122e-06, "loss": 0.0352, "step": 3571 }, { "epoch": 1.2913955169920464, "grad_norm": 0.10365740539083869, "learning_rate": 3.0600341553451416e-06, "loss": 0.0181, "step": 3572 }, { "epoch": 1.29175704989154, "grad_norm": 0.5172810519447898, "learning_rate": 3.057279518974544e-06, "loss": 0.0432, "step": 3573 }, { "epoch": 1.292118582791034, "grad_norm": 0.24017117831767584, "learning_rate": 3.0545255769318355e-06, "loss": 0.0227, "step": 3574 }, { "epoch": 1.292480115690528, "grad_norm": 1.2152633442644671, "learning_rate": 3.0517723302012757e-06, "loss": 0.0762, "step": 3575 }, { "epoch": 1.2928416485900218, "grad_norm": 0.10669201509242004, "learning_rate": 3.0490197797668738e-06, "loss": 0.0181, "step": 3576 }, { "epoch": 1.2932031814895155, "grad_norm": 0.3021145031137804, "learning_rate": 3.046267926612392e-06, "loss": 0.0352, "step": 3577 }, { "epoch": 1.2935647143890094, "grad_norm": 0.05968210807723714, "learning_rate": 3.0435167717213397e-06, "loss": 0.0024, "step": 3578 }, { "epoch": 1.2939262472885034, "grad_norm": 0.38574306307136424, "learning_rate": 3.040766316076981e-06, "loss": 0.0227, "step": 3579 }, { "epoch": 1.294287780187997, "grad_norm": 0.0012392406153431857, "learning_rate": 3.0380165606623267e-06, "loss": 0.0001, "step": 3580 }, { "epoch": 1.294649313087491, "grad_norm": 0.6382173580107429, "learning_rate": 3.035267506460139e-06, "loss": 0.1699, "step": 3581 }, { "epoch": 1.295010845986985, "grad_norm": 0.4318403047739237, "learning_rate": 3.032519154452932e-06, "loss": 0.0903, "step": 3582 }, { "epoch": 1.2953723788864786, "grad_norm": 0.34933739041495443, "learning_rate": 3.0297715056229627e-06, "loss": 0.0388, "step": 3583 }, { "epoch": 1.2957339117859725, "grad_norm": 0.5230094266704393, "learning_rate": 3.027024560952241e-06, "loss": 0.1914, "step": 3584 }, { "epoch": 1.2960954446854664, "grad_norm": 0.2956607330394127, "learning_rate": 3.024278321422526e-06, "loss": 0.0162, "step": 3585 }, { "epoch": 1.2964569775849601, "grad_norm": 0.12067415934192818, "learning_rate": 3.02153278801532e-06, "loss": 0.0203, "step": 3586 }, { "epoch": 1.296818510484454, "grad_norm": 0.15506378196713133, "learning_rate": 3.018787961711881e-06, "loss": 0.0071, "step": 3587 }, { "epoch": 1.297180043383948, "grad_norm": 0.11630742893171268, "learning_rate": 3.0160438434932048e-06, "loss": 0.0203, "step": 3588 }, { "epoch": 1.2975415762834417, "grad_norm": 0.5195997965031096, "learning_rate": 3.013300434340039e-06, "loss": 0.0432, "step": 3589 }, { "epoch": 1.2979031091829356, "grad_norm": 0.0914354787889161, "learning_rate": 3.0105577352328804e-06, "loss": 0.0181, "step": 3590 }, { "epoch": 1.2982646420824295, "grad_norm": 0.02627555794991311, "learning_rate": 3.007815747151966e-06, "loss": 0.0013, "step": 3591 }, { "epoch": 1.2986261749819232, "grad_norm": 0.14382054177439516, "learning_rate": 3.005074471077285e-06, "loss": 0.0227, "step": 3592 }, { "epoch": 1.2989877078814172, "grad_norm": 0.2771856101264469, "learning_rate": 3.002333907988566e-06, "loss": 0.0283, "step": 3593 }, { "epoch": 1.299349240780911, "grad_norm": 0.0011618532381866655, "learning_rate": 2.999594058865286e-06, "loss": 0.0001, "step": 3594 }, { "epoch": 1.299710773680405, "grad_norm": 0.003162826322834944, "learning_rate": 2.9968549246866685e-06, "loss": 0.0001, "step": 3595 }, { "epoch": 1.3000723065798987, "grad_norm": 0.33020544333349217, "learning_rate": 2.994116506431677e-06, "loss": 0.0162, "step": 3596 }, { "epoch": 1.3004338394793926, "grad_norm": 0.14600544687182018, "learning_rate": 2.9913788050790243e-06, "loss": 0.0056, "step": 3597 }, { "epoch": 1.3007953723788865, "grad_norm": 0.19060411777211592, "learning_rate": 2.988641821607162e-06, "loss": 0.0203, "step": 3598 }, { "epoch": 1.3011569052783805, "grad_norm": 0.0069645722590116485, "learning_rate": 2.9859055569942874e-06, "loss": 0.0003, "step": 3599 }, { "epoch": 1.3015184381778742, "grad_norm": 0.1017328803658782, "learning_rate": 2.983170012218343e-06, "loss": 0.0063, "step": 3600 }, { "epoch": 1.301879971077368, "grad_norm": 0.5055274104257675, "learning_rate": 2.980435188257008e-06, "loss": 0.1699, "step": 3601 }, { "epoch": 1.302241503976862, "grad_norm": 0.828971908921128, "learning_rate": 2.9777010860877143e-06, "loss": 0.1309, "step": 3602 }, { "epoch": 1.3026030368763557, "grad_norm": 0.7201651978648198, "learning_rate": 2.9749677066876237e-06, "loss": 0.0352, "step": 3603 }, { "epoch": 1.3029645697758496, "grad_norm": 0.7323393942278044, "learning_rate": 2.972235051033646e-06, "loss": 0.008, "step": 3604 }, { "epoch": 1.3033261026753435, "grad_norm": 0.09299905434340174, "learning_rate": 2.9695031201024355e-06, "loss": 0.0044, "step": 3605 }, { "epoch": 1.3036876355748372, "grad_norm": 0.11581645403168127, "learning_rate": 2.9667719148703794e-06, "loss": 0.0203, "step": 3606 }, { "epoch": 1.3040491684743312, "grad_norm": 2.5838515217969023, "learning_rate": 2.964041436313614e-06, "loss": 0.3789, "step": 3607 }, { "epoch": 1.304410701373825, "grad_norm": 0.024788783142140405, "learning_rate": 2.9613116854080076e-06, "loss": 0.0013, "step": 3608 }, { "epoch": 1.3047722342733188, "grad_norm": 0.0006365354752702375, "learning_rate": 2.9585826631291757e-06, "loss": 0.0, "step": 3609 }, { "epoch": 1.3051337671728127, "grad_norm": 0.27814590541805767, "learning_rate": 2.955854370452469e-06, "loss": 0.0052, "step": 3610 }, { "epoch": 1.3054953000723066, "grad_norm": 0.35843487454841805, "learning_rate": 2.9531268083529785e-06, "loss": 0.0432, "step": 3611 }, { "epoch": 1.3058568329718003, "grad_norm": 0.17919934116580702, "learning_rate": 2.950399977805536e-06, "loss": 0.0254, "step": 3612 }, { "epoch": 1.3062183658712943, "grad_norm": 0.46639188644918905, "learning_rate": 2.947673879784706e-06, "loss": 0.0227, "step": 3613 }, { "epoch": 1.3065798987707882, "grad_norm": 0.11005409038636323, "learning_rate": 2.9449485152648014e-06, "loss": 0.0049, "step": 3614 }, { "epoch": 1.3069414316702819, "grad_norm": 0.1340794136036676, "learning_rate": 2.9422238852198628e-06, "loss": 0.0227, "step": 3615 }, { "epoch": 1.3073029645697758, "grad_norm": 0.7275824746946177, "learning_rate": 2.939499990623672e-06, "loss": 0.0693, "step": 3616 }, { "epoch": 1.3076644974692697, "grad_norm": 0.01754380972986863, "learning_rate": 2.9367768324497527e-06, "loss": 0.0008, "step": 3617 }, { "epoch": 1.3080260303687636, "grad_norm": 0.23115921588259714, "learning_rate": 2.9340544116713536e-06, "loss": 0.0349, "step": 3618 }, { "epoch": 1.3083875632682573, "grad_norm": 0.08544989121436011, "learning_rate": 2.931332729261476e-06, "loss": 0.0039, "step": 3619 }, { "epoch": 1.3087490961677513, "grad_norm": 0.10454225055051167, "learning_rate": 2.9286117861928427e-06, "loss": 0.0181, "step": 3620 }, { "epoch": 1.3091106290672452, "grad_norm": 0.1719044771881624, "learning_rate": 2.925891583437919e-06, "loss": 0.0227, "step": 3621 }, { "epoch": 1.309472161966739, "grad_norm": 0.021269998892533885, "learning_rate": 2.923172121968908e-06, "loss": 0.0007, "step": 3622 }, { "epoch": 1.3098336948662328, "grad_norm": 0.6588965399052626, "learning_rate": 2.9204534027577387e-06, "loss": 0.1309, "step": 3623 }, { "epoch": 1.3101952277657267, "grad_norm": 0.30925946992402353, "learning_rate": 2.9177354267760876e-06, "loss": 0.0388, "step": 3624 }, { "epoch": 1.3105567606652206, "grad_norm": 2.5403179366558666, "learning_rate": 2.915018194995355e-06, "loss": 0.1699, "step": 3625 }, { "epoch": 1.3109182935647143, "grad_norm": 0.2055056884191248, "learning_rate": 2.912301708386679e-06, "loss": 0.0129, "step": 3626 }, { "epoch": 1.3112798264642083, "grad_norm": 0.013699061077206635, "learning_rate": 2.909585967920932e-06, "loss": 0.0007, "step": 3627 }, { "epoch": 1.3116413593637022, "grad_norm": 0.5139781041950272, "learning_rate": 2.906870974568717e-06, "loss": 0.0476, "step": 3628 }, { "epoch": 1.3120028922631959, "grad_norm": 0.07896308037452045, "learning_rate": 2.904156729300376e-06, "loss": 0.0049, "step": 3629 }, { "epoch": 1.3123644251626898, "grad_norm": 0.3583214858879045, "learning_rate": 2.9014432330859792e-06, "loss": 0.0349, "step": 3630 }, { "epoch": 1.3127259580621837, "grad_norm": 1.7773046552922431, "learning_rate": 2.898730486895324e-06, "loss": 0.1055, "step": 3631 }, { "epoch": 1.3130874909616774, "grad_norm": 0.8667719774806414, "learning_rate": 2.8960184916979515e-06, "loss": 0.083, "step": 3632 }, { "epoch": 1.3134490238611713, "grad_norm": 0.02661609289829665, "learning_rate": 2.893307248463126e-06, "loss": 0.0011, "step": 3633 }, { "epoch": 1.3138105567606653, "grad_norm": 0.3093893130234411, "learning_rate": 2.890596758159846e-06, "loss": 0.0254, "step": 3634 }, { "epoch": 1.314172089660159, "grad_norm": 0.02509163627022166, "learning_rate": 2.887887021756839e-06, "loss": 0.0011, "step": 3635 }, { "epoch": 1.314533622559653, "grad_norm": 0.07163489945751549, "learning_rate": 2.885178040222565e-06, "loss": 0.0035, "step": 3636 }, { "epoch": 1.3148951554591468, "grad_norm": 0.005825890485335806, "learning_rate": 2.882469814525213e-06, "loss": 0.0002, "step": 3637 }, { "epoch": 1.3152566883586405, "grad_norm": 1.28147381064494, "learning_rate": 2.8797623456326993e-06, "loss": 0.083, "step": 3638 }, { "epoch": 1.3156182212581344, "grad_norm": 0.3429125361044917, "learning_rate": 2.8770556345126787e-06, "loss": 0.0388, "step": 3639 }, { "epoch": 1.3159797541576284, "grad_norm": 0.002256942982662156, "learning_rate": 2.874349682132529e-06, "loss": 0.0001, "step": 3640 }, { "epoch": 1.3163412870571223, "grad_norm": 0.08371268359458997, "learning_rate": 2.8716444894593494e-06, "loss": 0.005, "step": 3641 }, { "epoch": 1.316702819956616, "grad_norm": 0.3475195477879671, "learning_rate": 2.868940057459982e-06, "loss": 0.0315, "step": 3642 }, { "epoch": 1.31706435285611, "grad_norm": 0.5104348622335239, "learning_rate": 2.8662363871009885e-06, "loss": 0.1504, "step": 3643 }, { "epoch": 1.3174258857556038, "grad_norm": 0.40644469196170613, "learning_rate": 2.86353347934866e-06, "loss": 0.0227, "step": 3644 }, { "epoch": 1.3177874186550977, "grad_norm": 2.0438889415511947, "learning_rate": 2.8608313351690153e-06, "loss": 0.2812, "step": 3645 }, { "epoch": 1.3181489515545914, "grad_norm": 0.005475533163650502, "learning_rate": 2.8581299555277995e-06, "loss": 0.0002, "step": 3646 }, { "epoch": 1.3185104844540854, "grad_norm": 0.2954946599683147, "learning_rate": 2.8554293413904867e-06, "loss": 0.0181, "step": 3647 }, { "epoch": 1.3188720173535793, "grad_norm": 0.19133418145487033, "learning_rate": 2.8527294937222717e-06, "loss": 0.0254, "step": 3648 }, { "epoch": 1.319233550253073, "grad_norm": 0.001442866256669655, "learning_rate": 2.850030413488084e-06, "loss": 0.0001, "step": 3649 }, { "epoch": 1.319595083152567, "grad_norm": 0.6598306962725716, "learning_rate": 2.8473321016525755e-06, "loss": 0.1699, "step": 3650 }, { "epoch": 1.3199566160520608, "grad_norm": 1.4159313750444695, "learning_rate": 2.844634559180116e-06, "loss": 0.1055, "step": 3651 }, { "epoch": 1.3203181489515545, "grad_norm": 0.4104154574535216, "learning_rate": 2.841937787034812e-06, "loss": 0.0352, "step": 3652 }, { "epoch": 1.3206796818510484, "grad_norm": 0.0024324863358879388, "learning_rate": 2.8392417861804867e-06, "loss": 0.0001, "step": 3653 }, { "epoch": 1.3210412147505424, "grad_norm": 0.016684138170629366, "learning_rate": 2.8365465575806916e-06, "loss": 0.001, "step": 3654 }, { "epoch": 1.321402747650036, "grad_norm": 0.02252868671880357, "learning_rate": 2.8338521021987e-06, "loss": 0.0009, "step": 3655 }, { "epoch": 1.32176428054953, "grad_norm": 0.07161987326018499, "learning_rate": 2.8311584209975105e-06, "loss": 0.0024, "step": 3656 }, { "epoch": 1.322125813449024, "grad_norm": 0.1536905366011509, "learning_rate": 2.8284655149398434e-06, "loss": 0.0039, "step": 3657 }, { "epoch": 1.3224873463485176, "grad_norm": 0.016670040619307807, "learning_rate": 2.8257733849881407e-06, "loss": 0.0009, "step": 3658 }, { "epoch": 1.3228488792480115, "grad_norm": 0.9415593594660934, "learning_rate": 2.823082032104573e-06, "loss": 0.0693, "step": 3659 }, { "epoch": 1.3232104121475055, "grad_norm": 0.0024424284529160735, "learning_rate": 2.8203914572510305e-06, "loss": 0.0001, "step": 3660 }, { "epoch": 1.3235719450469992, "grad_norm": 0.00568986103091318, "learning_rate": 2.817701661389116e-06, "loss": 0.0002, "step": 3661 }, { "epoch": 1.323933477946493, "grad_norm": 0.13460369821953877, "learning_rate": 2.815012645480171e-06, "loss": 0.0056, "step": 3662 }, { "epoch": 1.324295010845987, "grad_norm": 0.1610684813345715, "learning_rate": 2.8123244104852436e-06, "loss": 0.0227, "step": 3663 }, { "epoch": 1.324656543745481, "grad_norm": 0.324014657381733, "learning_rate": 2.8096369573651113e-06, "loss": 0.0388, "step": 3664 }, { "epoch": 1.3250180766449746, "grad_norm": 1.6651030491795944, "learning_rate": 2.806950287080268e-06, "loss": 0.1309, "step": 3665 }, { "epoch": 1.3253796095444685, "grad_norm": 0.8163461444628278, "learning_rate": 2.8042644005909287e-06, "loss": 0.0762, "step": 3666 }, { "epoch": 1.3257411424439625, "grad_norm": 0.2122114450834679, "learning_rate": 2.8015792988570297e-06, "loss": 0.0227, "step": 3667 }, { "epoch": 1.3261026753434564, "grad_norm": 0.664416404912397, "learning_rate": 2.7988949828382228e-06, "loss": 0.1143, "step": 3668 }, { "epoch": 1.32646420824295, "grad_norm": 0.5477438743754977, "learning_rate": 2.796211453493887e-06, "loss": 0.0349, "step": 3669 }, { "epoch": 1.326825741142444, "grad_norm": 0.6765934775881869, "learning_rate": 2.793528711783115e-06, "loss": 0.0227, "step": 3670 }, { "epoch": 1.327187274041938, "grad_norm": 0.6011003892276915, "learning_rate": 2.7908467586647112e-06, "loss": 0.1504, "step": 3671 }, { "epoch": 1.3275488069414316, "grad_norm": 0.539369892247282, "learning_rate": 2.788165595097212e-06, "loss": 0.1699, "step": 3672 }, { "epoch": 1.3279103398409255, "grad_norm": 1.1878949494510294, "learning_rate": 2.7854852220388617e-06, "loss": 0.1143, "step": 3673 }, { "epoch": 1.3282718727404195, "grad_norm": 0.7744629733700139, "learning_rate": 2.782805640447627e-06, "loss": 0.1226, "step": 3674 }, { "epoch": 1.3286334056399132, "grad_norm": 2.9796518544011925, "learning_rate": 2.7801268512811874e-06, "loss": 0.3047, "step": 3675 }, { "epoch": 1.328994938539407, "grad_norm": 0.47637450946308574, "learning_rate": 2.7774488554969425e-06, "loss": 0.0254, "step": 3676 }, { "epoch": 1.329356471438901, "grad_norm": 0.0018249753858851966, "learning_rate": 2.7747716540520082e-06, "loss": 0.0001, "step": 3677 }, { "epoch": 1.3297180043383947, "grad_norm": 0.28074950334989524, "learning_rate": 2.7720952479032127e-06, "loss": 0.0315, "step": 3678 }, { "epoch": 1.3300795372378886, "grad_norm": 0.5140855625155468, "learning_rate": 2.7694196380071074e-06, "loss": 0.1699, "step": 3679 }, { "epoch": 1.3304410701373826, "grad_norm": 0.23925100725929777, "learning_rate": 2.7667448253199536e-06, "loss": 0.0352, "step": 3680 }, { "epoch": 1.3308026030368763, "grad_norm": 0.3239952631011075, "learning_rate": 2.7640708107977264e-06, "loss": 0.0317, "step": 3681 }, { "epoch": 1.3311641359363702, "grad_norm": 0.19575139384909024, "learning_rate": 2.7613975953961207e-06, "loss": 0.0283, "step": 3682 }, { "epoch": 1.331525668835864, "grad_norm": 0.018082403702274746, "learning_rate": 2.7587251800705416e-06, "loss": 0.0009, "step": 3683 }, { "epoch": 1.3318872017353578, "grad_norm": 0.4084308147658668, "learning_rate": 2.7560535657761106e-06, "loss": 0.0527, "step": 3684 }, { "epoch": 1.3322487346348517, "grad_norm": 0.45343039345846325, "learning_rate": 2.7533827534676594e-06, "loss": 0.0635, "step": 3685 }, { "epoch": 1.3326102675343456, "grad_norm": 0.0025649053240047203, "learning_rate": 2.7507127440997392e-06, "loss": 0.0001, "step": 3686 }, { "epoch": 1.3329718004338396, "grad_norm": 0.5522567932791271, "learning_rate": 2.748043538626613e-06, "loss": 0.0527, "step": 3687 }, { "epoch": 1.3333333333333333, "grad_norm": 0.6822940373576638, "learning_rate": 2.7453751380022457e-06, "loss": 0.0762, "step": 3688 }, { "epoch": 1.3336948662328272, "grad_norm": 0.6127692060549302, "learning_rate": 2.7427075431803307e-06, "loss": 0.1143, "step": 3689 }, { "epoch": 1.334056399132321, "grad_norm": 0.5498190003859565, "learning_rate": 2.7400407551142636e-06, "loss": 0.1226, "step": 3690 }, { "epoch": 1.334417932031815, "grad_norm": 0.5092698893253582, "learning_rate": 2.7373747747571534e-06, "loss": 0.1406, "step": 3691 }, { "epoch": 1.3347794649313087, "grad_norm": 0.41437057590272053, "learning_rate": 2.7347096030618216e-06, "loss": 0.0522, "step": 3692 }, { "epoch": 1.3351409978308026, "grad_norm": 0.0771842738951024, "learning_rate": 2.7320452409807986e-06, "loss": 0.0027, "step": 3693 }, { "epoch": 1.3355025307302966, "grad_norm": 0.6244478629303194, "learning_rate": 2.7293816894663285e-06, "loss": 0.1143, "step": 3694 }, { "epoch": 1.3358640636297903, "grad_norm": 0.03101465356175352, "learning_rate": 2.7267189494703617e-06, "loss": 0.0019, "step": 3695 }, { "epoch": 1.3362255965292842, "grad_norm": 0.23015098585646931, "learning_rate": 2.724057021944564e-06, "loss": 0.0352, "step": 3696 }, { "epoch": 1.336587129428778, "grad_norm": 0.44664815747976633, "learning_rate": 2.7213959078403083e-06, "loss": 0.0476, "step": 3697 }, { "epoch": 1.3369486623282718, "grad_norm": 0.2315398011910768, "learning_rate": 2.7187356081086713e-06, "loss": 0.0432, "step": 3698 }, { "epoch": 1.3373101952277657, "grad_norm": 0.11280635229313332, "learning_rate": 2.7160761237004476e-06, "loss": 0.0063, "step": 3699 }, { "epoch": 1.3376717281272597, "grad_norm": 1.6219115873881196, "learning_rate": 2.7134174555661353e-06, "loss": 0.2471, "step": 3700 }, { "epoch": 1.3380332610267534, "grad_norm": 0.9039276184352096, "learning_rate": 2.7107596046559427e-06, "loss": 0.1309, "step": 3701 }, { "epoch": 1.3383947939262473, "grad_norm": 0.12219286099121497, "learning_rate": 2.708102571919783e-06, "loss": 0.0049, "step": 3702 }, { "epoch": 1.3387563268257412, "grad_norm": 0.6269635421518568, "learning_rate": 2.705446358307281e-06, "loss": 0.0476, "step": 3703 }, { "epoch": 1.339117859725235, "grad_norm": 0.2685755352981164, "learning_rate": 2.7027909647677664e-06, "loss": 0.0349, "step": 3704 }, { "epoch": 1.3394793926247288, "grad_norm": 0.0039802084868291645, "learning_rate": 2.700136392250274e-06, "loss": 0.0001, "step": 3705 }, { "epoch": 1.3398409255242227, "grad_norm": 1.0454197625860364, "learning_rate": 2.6974826417035515e-06, "loss": 0.083, "step": 3706 }, { "epoch": 1.3402024584237164, "grad_norm": 0.3336431229032295, "learning_rate": 2.694829714076049e-06, "loss": 0.0432, "step": 3707 }, { "epoch": 1.3405639913232104, "grad_norm": 0.00960149327065421, "learning_rate": 2.692177610315917e-06, "loss": 0.0004, "step": 3708 }, { "epoch": 1.3409255242227043, "grad_norm": 0.00392395390750645, "learning_rate": 2.6895263313710213e-06, "loss": 0.0001, "step": 3709 }, { "epoch": 1.3412870571221982, "grad_norm": 1.5308412090439658, "learning_rate": 2.686875878188927e-06, "loss": 0.1699, "step": 3710 }, { "epoch": 1.341648590021692, "grad_norm": 0.018774576605373915, "learning_rate": 2.684226251716906e-06, "loss": 0.0009, "step": 3711 }, { "epoch": 1.3420101229211858, "grad_norm": 0.019958342685013727, "learning_rate": 2.6815774529019345e-06, "loss": 0.001, "step": 3712 }, { "epoch": 1.3423716558206797, "grad_norm": 0.23983332970554608, "learning_rate": 2.678929482690691e-06, "loss": 0.0071, "step": 3713 }, { "epoch": 1.3427331887201737, "grad_norm": 0.03441012998402293, "learning_rate": 2.6762823420295612e-06, "loss": 0.0017, "step": 3714 }, { "epoch": 1.3430947216196674, "grad_norm": 0.154305204095759, "learning_rate": 2.6736360318646293e-06, "loss": 0.0227, "step": 3715 }, { "epoch": 1.3434562545191613, "grad_norm": 0.1602152777229973, "learning_rate": 2.670990553141691e-06, "loss": 0.0283, "step": 3716 }, { "epoch": 1.3438177874186552, "grad_norm": 0.2557577245788281, "learning_rate": 2.66834590680624e-06, "loss": 0.0432, "step": 3717 }, { "epoch": 1.344179320318149, "grad_norm": 0.04679060594069422, "learning_rate": 2.6657020938034654e-06, "loss": 0.003, "step": 3718 }, { "epoch": 1.3445408532176428, "grad_norm": 0.6238792060694209, "learning_rate": 2.6630591150782718e-06, "loss": 0.0391, "step": 3719 }, { "epoch": 1.3449023861171367, "grad_norm": 0.02871105931005034, "learning_rate": 2.6604169715752576e-06, "loss": 0.0012, "step": 3720 }, { "epoch": 1.3452639190166304, "grad_norm": 0.1684969348810191, "learning_rate": 2.6577756642387235e-06, "loss": 0.0283, "step": 3721 }, { "epoch": 1.3456254519161244, "grad_norm": 0.13679792025743773, "learning_rate": 2.6551351940126735e-06, "loss": 0.0129, "step": 3722 }, { "epoch": 1.3459869848156183, "grad_norm": 2.3342981037198673, "learning_rate": 2.6524955618408093e-06, "loss": 0.3535, "step": 3723 }, { "epoch": 1.346348517715112, "grad_norm": 0.9581974218673065, "learning_rate": 2.6498567686665367e-06, "loss": 0.1055, "step": 3724 }, { "epoch": 1.346710050614606, "grad_norm": 1.074469507325259, "learning_rate": 2.6472188154329567e-06, "loss": 0.0388, "step": 3725 }, { "epoch": 1.3470715835140998, "grad_norm": 0.022819150021556572, "learning_rate": 2.644581703082877e-06, "loss": 0.0013, "step": 3726 }, { "epoch": 1.3474331164135935, "grad_norm": 1.6558841926937142, "learning_rate": 2.641945432558802e-06, "loss": 0.2012, "step": 3727 }, { "epoch": 1.3477946493130875, "grad_norm": 0.42060468672222584, "learning_rate": 2.639310004802928e-06, "loss": 0.0476, "step": 3728 }, { "epoch": 1.3481561822125814, "grad_norm": 0.29237416427967317, "learning_rate": 2.636675420757162e-06, "loss": 0.0352, "step": 3729 }, { "epoch": 1.348517715112075, "grad_norm": 0.3835348182794586, "learning_rate": 2.634041681363102e-06, "loss": 0.0183, "step": 3730 }, { "epoch": 1.348879248011569, "grad_norm": 0.27213828227085035, "learning_rate": 2.6314087875620455e-06, "loss": 0.0317, "step": 3731 }, { "epoch": 1.349240780911063, "grad_norm": 0.00440143737232438, "learning_rate": 2.628776740294988e-06, "loss": 0.0002, "step": 3732 }, { "epoch": 1.3496023138105568, "grad_norm": 0.47085287093206835, "learning_rate": 2.6261455405026236e-06, "loss": 0.0527, "step": 3733 }, { "epoch": 1.3499638467100505, "grad_norm": 0.9082475815884374, "learning_rate": 2.6235151891253417e-06, "loss": 0.0693, "step": 3734 }, { "epoch": 1.3503253796095445, "grad_norm": 2.3507964066868494, "learning_rate": 2.6208856871032284e-06, "loss": 0.1807, "step": 3735 }, { "epoch": 1.3506869125090384, "grad_norm": 0.7011731641856646, "learning_rate": 2.61825703537607e-06, "loss": 0.0977, "step": 3736 }, { "epoch": 1.3510484454085323, "grad_norm": 0.23998446856449682, "learning_rate": 2.615629234883347e-06, "loss": 0.0349, "step": 3737 }, { "epoch": 1.351409978308026, "grad_norm": 0.05120974931846034, "learning_rate": 2.6130022865642275e-06, "loss": 0.0019, "step": 3738 }, { "epoch": 1.35177151120752, "grad_norm": 0.545924088566236, "learning_rate": 2.610376191357589e-06, "loss": 0.1504, "step": 3739 }, { "epoch": 1.3521330441070138, "grad_norm": 0.1442528779866297, "learning_rate": 2.607750950201996e-06, "loss": 0.0129, "step": 3740 }, { "epoch": 1.3524945770065075, "grad_norm": 0.18360622930688225, "learning_rate": 2.605126564035708e-06, "loss": 0.0283, "step": 3741 }, { "epoch": 1.3528561099060015, "grad_norm": 0.28195549752396093, "learning_rate": 2.602503033796681e-06, "loss": 0.0352, "step": 3742 }, { "epoch": 1.3532176428054954, "grad_norm": 1.3326127344618326, "learning_rate": 2.5998803604225638e-06, "loss": 0.1699, "step": 3743 }, { "epoch": 1.353579175704989, "grad_norm": 0.23851899424102493, "learning_rate": 2.5972585448506994e-06, "loss": 0.0349, "step": 3744 }, { "epoch": 1.353940708604483, "grad_norm": 0.2755385433004707, "learning_rate": 2.594637588018121e-06, "loss": 0.0145, "step": 3745 }, { "epoch": 1.354302241503977, "grad_norm": 0.42020829801361764, "learning_rate": 2.5920174908615643e-06, "loss": 0.0432, "step": 3746 }, { "epoch": 1.3546637744034706, "grad_norm": 0.761227373979397, "learning_rate": 2.589398254317447e-06, "loss": 0.0693, "step": 3747 }, { "epoch": 1.3550253073029646, "grad_norm": 0.5832080408277609, "learning_rate": 2.5867798793218856e-06, "loss": 0.0977, "step": 3748 }, { "epoch": 1.3553868402024585, "grad_norm": 0.6847768168408387, "learning_rate": 2.584162366810686e-06, "loss": 0.0527, "step": 3749 }, { "epoch": 1.3557483731019522, "grad_norm": 0.011217894902829414, "learning_rate": 2.581545717719347e-06, "loss": 0.0002, "step": 3750 }, { "epoch": 1.356109906001446, "grad_norm": 0.1564898673565563, "learning_rate": 2.5789299329830577e-06, "loss": 0.0056, "step": 3751 }, { "epoch": 1.35647143890094, "grad_norm": 0.022737068635494264, "learning_rate": 2.576315013536697e-06, "loss": 0.0013, "step": 3752 }, { "epoch": 1.3568329718004337, "grad_norm": 0.2948135967994176, "learning_rate": 2.573700960314843e-06, "loss": 0.0388, "step": 3753 }, { "epoch": 1.3571945046999276, "grad_norm": 0.19195359131010267, "learning_rate": 2.5710877742517528e-06, "loss": 0.0254, "step": 3754 }, { "epoch": 1.3575560375994216, "grad_norm": 0.2460067830819657, "learning_rate": 2.5684754562813763e-06, "loss": 0.0128, "step": 3755 }, { "epoch": 1.3579175704989155, "grad_norm": 0.050758440775070135, "learning_rate": 2.565864007337361e-06, "loss": 0.0031, "step": 3756 }, { "epoch": 1.3582791033984092, "grad_norm": 0.4269392952588903, "learning_rate": 2.5632534283530363e-06, "loss": 0.0388, "step": 3757 }, { "epoch": 1.358640636297903, "grad_norm": 0.17035594109715932, "learning_rate": 2.5606437202614213e-06, "loss": 0.0203, "step": 3758 }, { "epoch": 1.359002169197397, "grad_norm": 0.7164691120868184, "learning_rate": 2.5580348839952264e-06, "loss": 0.083, "step": 3759 }, { "epoch": 1.359363702096891, "grad_norm": 0.6331566365288989, "learning_rate": 2.5554269204868482e-06, "loss": 0.0977, "step": 3760 }, { "epoch": 1.3597252349963846, "grad_norm": 0.5489814078206265, "learning_rate": 2.5528198306683726e-06, "loss": 0.0635, "step": 3761 }, { "epoch": 1.3600867678958786, "grad_norm": 0.30856261911927907, "learning_rate": 2.5502136154715717e-06, "loss": 0.0283, "step": 3762 }, { "epoch": 1.3604483007953725, "grad_norm": 0.16595090787744077, "learning_rate": 2.547608275827911e-06, "loss": 0.0283, "step": 3763 }, { "epoch": 1.3608098336948662, "grad_norm": 0.1671757619378109, "learning_rate": 2.545003812668534e-06, "loss": 0.0227, "step": 3764 }, { "epoch": 1.36117136659436, "grad_norm": 0.19269955670796138, "learning_rate": 2.5424002269242732e-06, "loss": 0.0283, "step": 3765 }, { "epoch": 1.361532899493854, "grad_norm": 0.10636226870322785, "learning_rate": 2.5397975195256553e-06, "loss": 0.0203, "step": 3766 }, { "epoch": 1.3618944323933477, "grad_norm": 0.769505069760044, "learning_rate": 2.5371956914028838e-06, "loss": 0.1504, "step": 3767 }, { "epoch": 1.3622559652928417, "grad_norm": 0.3383783984873555, "learning_rate": 2.534594743485853e-06, "loss": 0.0349, "step": 3768 }, { "epoch": 1.3626174981923356, "grad_norm": 0.11694695803773818, "learning_rate": 2.531994676704141e-06, "loss": 0.0227, "step": 3769 }, { "epoch": 1.3629790310918293, "grad_norm": 0.10903577436484362, "learning_rate": 2.52939549198701e-06, "loss": 0.0203, "step": 3770 }, { "epoch": 1.3633405639913232, "grad_norm": 0.19048179525140266, "learning_rate": 2.5267971902634093e-06, "loss": 0.0227, "step": 3771 }, { "epoch": 1.3637020968908171, "grad_norm": 0.0017635592025277197, "learning_rate": 2.5241997724619683e-06, "loss": 0.0001, "step": 3772 }, { "epoch": 1.3640636297903108, "grad_norm": 0.1920597185090029, "learning_rate": 2.521603239511011e-06, "loss": 0.0254, "step": 3773 }, { "epoch": 1.3644251626898047, "grad_norm": 0.1448243080011492, "learning_rate": 2.51900759233853e-06, "loss": 0.0227, "step": 3774 }, { "epoch": 1.3647866955892987, "grad_norm": 0.0018122988082880224, "learning_rate": 2.5164128318722104e-06, "loss": 0.0001, "step": 3775 }, { "epoch": 1.3651482284887924, "grad_norm": 0.3717066829842044, "learning_rate": 2.5138189590394224e-06, "loss": 0.0101, "step": 3776 }, { "epoch": 1.3655097613882863, "grad_norm": 0.8615685660475104, "learning_rate": 2.5112259747672134e-06, "loss": 0.0903, "step": 3777 }, { "epoch": 1.3658712942877802, "grad_norm": 0.18630031782572337, "learning_rate": 2.508633879982316e-06, "loss": 0.0317, "step": 3778 }, { "epoch": 1.3662328271872741, "grad_norm": 0.9010552911499242, "learning_rate": 2.5060426756111446e-06, "loss": 0.0903, "step": 3779 }, { "epoch": 1.3665943600867678, "grad_norm": 0.10628210157937736, "learning_rate": 2.503452362579794e-06, "loss": 0.0063, "step": 3780 }, { "epoch": 1.3669558929862617, "grad_norm": 0.5269581108229773, "learning_rate": 2.500862941814043e-06, "loss": 0.1406, "step": 3781 }, { "epoch": 1.3673174258857557, "grad_norm": 0.17094485171896887, "learning_rate": 2.498274414239346e-06, "loss": 0.0254, "step": 3782 }, { "epoch": 1.3676789587852496, "grad_norm": 0.02841169536464937, "learning_rate": 2.4956867807808483e-06, "loss": 0.0019, "step": 3783 }, { "epoch": 1.3680404916847433, "grad_norm": 0.018467298519253265, "learning_rate": 2.4931000423633685e-06, "loss": 0.001, "step": 3784 }, { "epoch": 1.3684020245842372, "grad_norm": 0.19002830797428683, "learning_rate": 2.4905141999114003e-06, "loss": 0.0315, "step": 3785 }, { "epoch": 1.3687635574837311, "grad_norm": 0.13075779363150317, "learning_rate": 2.487929254349129e-06, "loss": 0.0203, "step": 3786 }, { "epoch": 1.3691250903832248, "grad_norm": 0.05090283896047524, "learning_rate": 2.4853452066004126e-06, "loss": 0.0035, "step": 3787 }, { "epoch": 1.3694866232827188, "grad_norm": 0.009926609203941764, "learning_rate": 2.4827620575887888e-06, "loss": 0.0003, "step": 3788 }, { "epoch": 1.3698481561822127, "grad_norm": 0.003687615187712021, "learning_rate": 2.4801798082374737e-06, "loss": 0.0002, "step": 3789 }, { "epoch": 1.3702096890817064, "grad_norm": 0.16018397407179108, "learning_rate": 2.477598459469364e-06, "loss": 0.0254, "step": 3790 }, { "epoch": 1.3705712219812003, "grad_norm": 2.0736780911688846, "learning_rate": 2.4750180122070328e-06, "loss": 0.1226, "step": 3791 }, { "epoch": 1.3709327548806942, "grad_norm": 0.33567864356112204, "learning_rate": 2.4724384673727285e-06, "loss": 0.0254, "step": 3792 }, { "epoch": 1.371294287780188, "grad_norm": 0.011446537641893746, "learning_rate": 2.469859825888385e-06, "loss": 0.0006, "step": 3793 }, { "epoch": 1.3716558206796818, "grad_norm": 0.2261448887955102, "learning_rate": 2.4672820886756074e-06, "loss": 0.0283, "step": 3794 }, { "epoch": 1.3720173535791758, "grad_norm": 0.003979851962218405, "learning_rate": 2.464705256655673e-06, "loss": 0.0002, "step": 3795 }, { "epoch": 1.3723788864786695, "grad_norm": 0.003549735169084915, "learning_rate": 2.462129330749547e-06, "loss": 0.0001, "step": 3796 }, { "epoch": 1.3727404193781634, "grad_norm": 0.13223083909521835, "learning_rate": 2.459554311877862e-06, "loss": 0.0254, "step": 3797 }, { "epoch": 1.3731019522776573, "grad_norm": 0.16778919241286533, "learning_rate": 2.4569802009609306e-06, "loss": 0.0254, "step": 3798 }, { "epoch": 1.373463485177151, "grad_norm": 1.12808634111963, "learning_rate": 2.454406998918738e-06, "loss": 0.0762, "step": 3799 }, { "epoch": 1.373825018076645, "grad_norm": 0.056567264938128685, "learning_rate": 2.451834706670947e-06, "loss": 0.0011, "step": 3800 }, { "epoch": 1.3741865509761388, "grad_norm": 0.003376636995929954, "learning_rate": 2.4492633251368943e-06, "loss": 0.0001, "step": 3801 }, { "epoch": 1.3745480838756328, "grad_norm": 0.3187164377319419, "learning_rate": 2.4466928552355885e-06, "loss": 0.0315, "step": 3802 }, { "epoch": 1.3749096167751265, "grad_norm": 0.004608599456278144, "learning_rate": 2.44412329788572e-06, "loss": 0.0002, "step": 3803 }, { "epoch": 1.3752711496746204, "grad_norm": 0.3409026681026073, "learning_rate": 2.441554654005647e-06, "loss": 0.0388, "step": 3804 }, { "epoch": 1.3756326825741143, "grad_norm": 0.1867073930227301, "learning_rate": 2.4389869245133967e-06, "loss": 0.0283, "step": 3805 }, { "epoch": 1.3759942154736082, "grad_norm": 0.8016221337380255, "learning_rate": 2.436420110326681e-06, "loss": 0.1406, "step": 3806 }, { "epoch": 1.376355748373102, "grad_norm": 0.3681188371182326, "learning_rate": 2.4338542123628774e-06, "loss": 0.0476, "step": 3807 }, { "epoch": 1.3767172812725958, "grad_norm": 0.1609978172705963, "learning_rate": 2.4312892315390364e-06, "loss": 0.0227, "step": 3808 }, { "epoch": 1.3770788141720898, "grad_norm": 0.2957513387913568, "learning_rate": 2.4287251687718816e-06, "loss": 0.0352, "step": 3809 }, { "epoch": 1.3774403470715835, "grad_norm": 0.32172470381297225, "learning_rate": 2.426162024977809e-06, "loss": 0.0317, "step": 3810 }, { "epoch": 1.3778018799710774, "grad_norm": 0.13085221801038985, "learning_rate": 2.4235998010728855e-06, "loss": 0.0254, "step": 3811 }, { "epoch": 1.3781634128705713, "grad_norm": 1.2832720632446828, "learning_rate": 2.421038497972848e-06, "loss": 0.1406, "step": 3812 }, { "epoch": 1.378524945770065, "grad_norm": 0.3070133360418799, "learning_rate": 2.4184781165931086e-06, "loss": 0.0315, "step": 3813 }, { "epoch": 1.378886478669559, "grad_norm": 0.2067564774606561, "learning_rate": 2.4159186578487476e-06, "loss": 0.0145, "step": 3814 }, { "epoch": 1.3792480115690529, "grad_norm": 0.32953165496527625, "learning_rate": 2.4133601226545087e-06, "loss": 0.0349, "step": 3815 }, { "epoch": 1.3796095444685466, "grad_norm": 0.11813445674358673, "learning_rate": 2.410802511924818e-06, "loss": 0.0181, "step": 3816 }, { "epoch": 1.3799710773680405, "grad_norm": 1.7625001232421043, "learning_rate": 2.4082458265737637e-06, "loss": 0.1226, "step": 3817 }, { "epoch": 1.3803326102675344, "grad_norm": 1.1322846852560084, "learning_rate": 2.4056900675151034e-06, "loss": 0.0762, "step": 3818 }, { "epoch": 1.380694143167028, "grad_norm": 1.1255954391804062, "learning_rate": 2.403135235662264e-06, "loss": 0.0977, "step": 3819 }, { "epoch": 1.381055676066522, "grad_norm": 0.6502081291401519, "learning_rate": 2.4005813319283473e-06, "loss": 0.0391, "step": 3820 }, { "epoch": 1.381417208966016, "grad_norm": 0.40107049087185004, "learning_rate": 2.3980283572261116e-06, "loss": 0.0129, "step": 3821 }, { "epoch": 1.3817787418655096, "grad_norm": 0.7382534528888142, "learning_rate": 2.3954763124679897e-06, "loss": 0.0693, "step": 3822 }, { "epoch": 1.3821402747650036, "grad_norm": 0.0019757770522840605, "learning_rate": 2.3929251985660866e-06, "loss": 0.0001, "step": 3823 }, { "epoch": 1.3825018076644975, "grad_norm": 0.29643452401208, "learning_rate": 2.3903750164321672e-06, "loss": 0.0283, "step": 3824 }, { "epoch": 1.3828633405639914, "grad_norm": 0.5490029421402852, "learning_rate": 2.387825766977666e-06, "loss": 0.1602, "step": 3825 }, { "epoch": 1.383224873463485, "grad_norm": 0.18919645189778442, "learning_rate": 2.385277451113685e-06, "loss": 0.0145, "step": 3826 }, { "epoch": 1.383586406362979, "grad_norm": 0.07818611749517537, "learning_rate": 2.38273006975099e-06, "loss": 0.0044, "step": 3827 }, { "epoch": 1.383947939262473, "grad_norm": 0.3029932266262429, "learning_rate": 2.380183623800017e-06, "loss": 0.0388, "step": 3828 }, { "epoch": 1.3843094721619669, "grad_norm": 0.04470132905291584, "learning_rate": 2.3776381141708617e-06, "loss": 0.0027, "step": 3829 }, { "epoch": 1.3846710050614606, "grad_norm": 0.17337990266114264, "learning_rate": 2.3750935417732946e-06, "loss": 0.0254, "step": 3830 }, { "epoch": 1.3850325379609545, "grad_norm": 0.11582962368150608, "learning_rate": 2.3725499075167397e-06, "loss": 0.0203, "step": 3831 }, { "epoch": 1.3853940708604484, "grad_norm": 0.10327692838772216, "learning_rate": 2.370007212310292e-06, "loss": 0.005, "step": 3832 }, { "epoch": 1.3857556037599421, "grad_norm": 0.13082154561588064, "learning_rate": 2.3674654570627128e-06, "loss": 0.0203, "step": 3833 }, { "epoch": 1.386117136659436, "grad_norm": 0.36085879734323084, "learning_rate": 2.364924642682424e-06, "loss": 0.0115, "step": 3834 }, { "epoch": 1.38647866955893, "grad_norm": 0.16500684931929502, "learning_rate": 2.3623847700775116e-06, "loss": 0.0034, "step": 3835 }, { "epoch": 1.3868402024584237, "grad_norm": 0.6055544085139798, "learning_rate": 2.3598458401557254e-06, "loss": 0.1226, "step": 3836 }, { "epoch": 1.3872017353579176, "grad_norm": 0.006933335072621979, "learning_rate": 2.3573078538244797e-06, "loss": 0.0003, "step": 3837 }, { "epoch": 1.3875632682574115, "grad_norm": 0.6251008961877447, "learning_rate": 2.3547708119908485e-06, "loss": 0.2012, "step": 3838 }, { "epoch": 1.3879248011569052, "grad_norm": 0.0037134099917765823, "learning_rate": 2.3522347155615692e-06, "loss": 0.0002, "step": 3839 }, { "epoch": 1.3882863340563991, "grad_norm": 0.714008129929877, "learning_rate": 2.3496995654430484e-06, "loss": 0.1055, "step": 3840 }, { "epoch": 1.388647866955893, "grad_norm": 0.24412632706892343, "learning_rate": 2.3471653625413405e-06, "loss": 0.0182, "step": 3841 }, { "epoch": 1.3890093998553867, "grad_norm": 0.579639472219382, "learning_rate": 2.344632107762171e-06, "loss": 0.0476, "step": 3842 }, { "epoch": 1.3893709327548807, "grad_norm": 1.8123854568309006, "learning_rate": 2.3420998020109275e-06, "loss": 0.1309, "step": 3843 }, { "epoch": 1.3897324656543746, "grad_norm": 0.0018659033940781442, "learning_rate": 2.339568446192654e-06, "loss": 0.0001, "step": 3844 }, { "epoch": 1.3900939985538683, "grad_norm": 0.24193864290093267, "learning_rate": 2.3370380412120556e-06, "loss": 0.0315, "step": 3845 }, { "epoch": 1.3904555314533622, "grad_norm": 0.10639403203974988, "learning_rate": 2.3345085879735002e-06, "loss": 0.0181, "step": 3846 }, { "epoch": 1.3908170643528561, "grad_norm": 0.1771962901842713, "learning_rate": 2.331980087381012e-06, "loss": 0.0254, "step": 3847 }, { "epoch": 1.39117859725235, "grad_norm": 0.19543477640159881, "learning_rate": 2.3294525403382784e-06, "loss": 0.0227, "step": 3848 }, { "epoch": 1.3915401301518437, "grad_norm": 0.0008045459733934177, "learning_rate": 2.326925947748641e-06, "loss": 0.0, "step": 3849 }, { "epoch": 1.3919016630513377, "grad_norm": 0.11315778801056742, "learning_rate": 2.3244003105151097e-06, "loss": 0.0182, "step": 3850 }, { "epoch": 1.3922631959508316, "grad_norm": 0.4550819108137046, "learning_rate": 2.321875629540341e-06, "loss": 0.0432, "step": 3851 }, { "epoch": 1.3926247288503255, "grad_norm": 0.5341831008107861, "learning_rate": 2.319351905726655e-06, "loss": 0.0579, "step": 3852 }, { "epoch": 1.3929862617498192, "grad_norm": 0.6771617710861877, "learning_rate": 2.316829139976034e-06, "loss": 0.0527, "step": 3853 }, { "epoch": 1.3933477946493131, "grad_norm": 0.3260832753717861, "learning_rate": 2.314307333190112e-06, "loss": 0.0162, "step": 3854 }, { "epoch": 1.393709327548807, "grad_norm": 0.7645241188059708, "learning_rate": 2.3117864862701827e-06, "loss": 0.0635, "step": 3855 }, { "epoch": 1.3940708604483008, "grad_norm": 0.10961386230865798, "learning_rate": 2.309266600117196e-06, "loss": 0.0181, "step": 3856 }, { "epoch": 1.3944323933477947, "grad_norm": 0.11922264428354597, "learning_rate": 2.306747675631758e-06, "loss": 0.0161, "step": 3857 }, { "epoch": 1.3947939262472886, "grad_norm": 0.004479791881274671, "learning_rate": 2.304229713714133e-06, "loss": 0.0002, "step": 3858 }, { "epoch": 1.3951554591467823, "grad_norm": 0.1438835151836072, "learning_rate": 2.3017127152642366e-06, "loss": 0.0227, "step": 3859 }, { "epoch": 1.3955169920462762, "grad_norm": 0.23846623316117446, "learning_rate": 2.2991966811816505e-06, "loss": 0.0283, "step": 3860 }, { "epoch": 1.3958785249457701, "grad_norm": 0.11583714846570123, "learning_rate": 2.2966816123655987e-06, "loss": 0.0161, "step": 3861 }, { "epoch": 1.3962400578452638, "grad_norm": 0.24450431091676106, "learning_rate": 2.2941675097149667e-06, "loss": 0.0203, "step": 3862 }, { "epoch": 1.3966015907447578, "grad_norm": 0.024427670519930646, "learning_rate": 2.291654374128297e-06, "loss": 0.001, "step": 3863 }, { "epoch": 1.3969631236442517, "grad_norm": 2.362666927590083, "learning_rate": 2.289142206503783e-06, "loss": 0.1504, "step": 3864 }, { "epoch": 1.3973246565437454, "grad_norm": 0.1577662344107944, "learning_rate": 2.286631007739272e-06, "loss": 0.0063, "step": 3865 }, { "epoch": 1.3976861894432393, "grad_norm": 0.2561837549248336, "learning_rate": 2.284120778732266e-06, "loss": 0.0315, "step": 3866 }, { "epoch": 1.3980477223427332, "grad_norm": 0.08367790784821892, "learning_rate": 2.2816115203799207e-06, "loss": 0.0143, "step": 3867 }, { "epoch": 1.398409255242227, "grad_norm": 0.2695366482042006, "learning_rate": 2.279103233579044e-06, "loss": 0.0182, "step": 3868 }, { "epoch": 1.3987707881417208, "grad_norm": 0.826157051211426, "learning_rate": 2.276595919226096e-06, "loss": 0.1309, "step": 3869 }, { "epoch": 1.3991323210412148, "grad_norm": 0.632489113283328, "learning_rate": 2.274089578217196e-06, "loss": 0.2012, "step": 3870 }, { "epoch": 1.3994938539407087, "grad_norm": 0.10125243248660595, "learning_rate": 2.2715842114481024e-06, "loss": 0.0161, "step": 3871 }, { "epoch": 1.3998553868402024, "grad_norm": 0.13954200452378435, "learning_rate": 2.2690798198142343e-06, "loss": 0.0203, "step": 3872 }, { "epoch": 1.4002169197396963, "grad_norm": 0.4458157998869284, "learning_rate": 2.2665764042106647e-06, "loss": 0.0182, "step": 3873 }, { "epoch": 1.4005784526391902, "grad_norm": 0.18783091432340285, "learning_rate": 2.2640739655321107e-06, "loss": 0.0254, "step": 3874 }, { "epoch": 1.4009399855386842, "grad_norm": 0.5193358001395021, "learning_rate": 2.2615725046729442e-06, "loss": 0.0527, "step": 3875 }, { "epoch": 1.4013015184381779, "grad_norm": 0.13037056580236084, "learning_rate": 2.2590720225271866e-06, "loss": 0.0227, "step": 3876 }, { "epoch": 1.4016630513376718, "grad_norm": 0.5539621917791561, "learning_rate": 2.2565725199885104e-06, "loss": 0.1807, "step": 3877 }, { "epoch": 1.4020245842371657, "grad_norm": 0.9256760687774024, "learning_rate": 2.2540739979502356e-06, "loss": 0.083, "step": 3878 }, { "epoch": 1.4023861171366594, "grad_norm": 0.11136526252910035, "learning_rate": 2.2515764573053336e-06, "loss": 0.0182, "step": 3879 }, { "epoch": 1.4027476500361533, "grad_norm": 0.08085447293753092, "learning_rate": 2.2490798989464262e-06, "loss": 0.0143, "step": 3880 }, { "epoch": 1.4031091829356472, "grad_norm": 0.1488774795010722, "learning_rate": 2.2465843237657853e-06, "loss": 0.0203, "step": 3881 }, { "epoch": 1.403470715835141, "grad_norm": 0.6672136054903458, "learning_rate": 2.2440897326553217e-06, "loss": 0.0635, "step": 3882 }, { "epoch": 1.4038322487346349, "grad_norm": 0.007703378312830896, "learning_rate": 2.2415961265066083e-06, "loss": 0.0001, "step": 3883 }, { "epoch": 1.4041937816341288, "grad_norm": 0.15679605486769543, "learning_rate": 2.2391035062108575e-06, "loss": 0.0254, "step": 3884 }, { "epoch": 1.4045553145336225, "grad_norm": 0.13366427565875794, "learning_rate": 2.2366118726589304e-06, "loss": 0.0227, "step": 3885 }, { "epoch": 1.4049168474331164, "grad_norm": 0.07731828040393428, "learning_rate": 2.2341212267413364e-06, "loss": 0.0143, "step": 3886 }, { "epoch": 1.4052783803326103, "grad_norm": 1.0697385849161019, "learning_rate": 2.231631569348233e-06, "loss": 0.0762, "step": 3887 }, { "epoch": 1.405639913232104, "grad_norm": 0.20229888311147107, "learning_rate": 2.229142901369422e-06, "loss": 0.0254, "step": 3888 }, { "epoch": 1.406001446131598, "grad_norm": 1.152527563347817, "learning_rate": 2.2266552236943515e-06, "loss": 0.1309, "step": 3889 }, { "epoch": 1.4063629790310919, "grad_norm": 0.4848566672678007, "learning_rate": 2.2241685372121215e-06, "loss": 0.0432, "step": 3890 }, { "epoch": 1.4067245119305856, "grad_norm": 2.5719633179472496, "learning_rate": 2.2216828428114695e-06, "loss": 0.1699, "step": 3891 }, { "epoch": 1.4070860448300795, "grad_norm": 0.09297259592152393, "learning_rate": 2.2191981413807834e-06, "loss": 0.0161, "step": 3892 }, { "epoch": 1.4074475777295734, "grad_norm": 0.13774558805682413, "learning_rate": 2.216714433808095e-06, "loss": 0.0063, "step": 3893 }, { "epoch": 1.407809110629067, "grad_norm": 0.09653814302488793, "learning_rate": 2.21423172098108e-06, "loss": 0.0181, "step": 3894 }, { "epoch": 1.408170643528561, "grad_norm": 0.42074712799262237, "learning_rate": 2.21175000378706e-06, "loss": 0.0388, "step": 3895 }, { "epoch": 1.408532176428055, "grad_norm": 0.06355846757286436, "learning_rate": 2.209269283112999e-06, "loss": 0.0027, "step": 3896 }, { "epoch": 1.4088937093275489, "grad_norm": 0.19255996833352593, "learning_rate": 2.206789559845511e-06, "loss": 0.0129, "step": 3897 }, { "epoch": 1.4092552422270428, "grad_norm": 0.673076122515389, "learning_rate": 2.2043108348708425e-06, "loss": 0.1055, "step": 3898 }, { "epoch": 1.4096167751265365, "grad_norm": 0.32664745700833725, "learning_rate": 2.2018331090748895e-06, "loss": 0.0388, "step": 3899 }, { "epoch": 1.4099783080260304, "grad_norm": 0.08906496632911871, "learning_rate": 2.1993563833431952e-06, "loss": 0.0143, "step": 3900 }, { "epoch": 1.4103398409255243, "grad_norm": 0.8399774120005797, "learning_rate": 2.1968806585609383e-06, "loss": 0.0388, "step": 3901 }, { "epoch": 1.410701373825018, "grad_norm": 0.11411614206517728, "learning_rate": 2.194405935612943e-06, "loss": 0.0063, "step": 3902 }, { "epoch": 1.411062906724512, "grad_norm": 0.12986587900866814, "learning_rate": 2.191932215383673e-06, "loss": 0.0161, "step": 3903 }, { "epoch": 1.4114244396240059, "grad_norm": 0.10606210233426132, "learning_rate": 2.1894594987572375e-06, "loss": 0.0182, "step": 3904 }, { "epoch": 1.4117859725234996, "grad_norm": 0.30251423168444624, "learning_rate": 2.186987786617384e-06, "loss": 0.0162, "step": 3905 }, { "epoch": 1.4121475054229935, "grad_norm": 0.11792840246776254, "learning_rate": 2.1845170798474995e-06, "loss": 0.0161, "step": 3906 }, { "epoch": 1.4125090383224874, "grad_norm": 0.9787051846970716, "learning_rate": 2.1820473793306207e-06, "loss": 0.0762, "step": 3907 }, { "epoch": 1.4128705712219811, "grad_norm": 0.0036037653726722746, "learning_rate": 2.1795786859494116e-06, "loss": 0.0002, "step": 3908 }, { "epoch": 1.413232104121475, "grad_norm": 0.13245333232684486, "learning_rate": 2.1771110005861836e-06, "loss": 0.008, "step": 3909 }, { "epoch": 1.413593637020969, "grad_norm": 0.7217583844895263, "learning_rate": 2.17464432412289e-06, "loss": 0.0903, "step": 3910 }, { "epoch": 1.4139551699204627, "grad_norm": 0.001849971268874843, "learning_rate": 2.172178657441118e-06, "loss": 0.0001, "step": 3911 }, { "epoch": 1.4143167028199566, "grad_norm": 0.22206407071316278, "learning_rate": 2.1697140014220973e-06, "loss": 0.0254, "step": 3912 }, { "epoch": 1.4146782357194505, "grad_norm": 0.08769235752784714, "learning_rate": 2.1672503569466956e-06, "loss": 0.0143, "step": 3913 }, { "epoch": 1.4150397686189442, "grad_norm": 0.1560380538808074, "learning_rate": 2.1647877248954184e-06, "loss": 0.0203, "step": 3914 }, { "epoch": 1.4154013015184381, "grad_norm": 0.692712701092771, "learning_rate": 2.1623261061484096e-06, "loss": 0.1699, "step": 3915 }, { "epoch": 1.415762834417932, "grad_norm": 0.512814000337275, "learning_rate": 2.15986550158545e-06, "loss": 0.2129, "step": 3916 }, { "epoch": 1.4161243673174257, "grad_norm": 0.12448452030011958, "learning_rate": 2.1574059120859647e-06, "loss": 0.0161, "step": 3917 }, { "epoch": 1.4164859002169197, "grad_norm": 0.06728911268603756, "learning_rate": 2.154947338529005e-06, "loss": 0.0101, "step": 3918 }, { "epoch": 1.4168474331164136, "grad_norm": 0.39269169219989886, "learning_rate": 2.152489781793263e-06, "loss": 0.0432, "step": 3919 }, { "epoch": 1.4172089660159075, "grad_norm": 2.805732095731558, "learning_rate": 2.1500332427570745e-06, "loss": 0.2812, "step": 3920 }, { "epoch": 1.4175704989154014, "grad_norm": 0.6422961692361282, "learning_rate": 2.147577722298404e-06, "loss": 0.1602, "step": 3921 }, { "epoch": 1.4179320318148951, "grad_norm": 0.0023416352906889606, "learning_rate": 2.1451232212948537e-06, "loss": 0.0001, "step": 3922 }, { "epoch": 1.418293564714389, "grad_norm": 0.10631039209419295, "learning_rate": 2.142669740623661e-06, "loss": 0.0182, "step": 3923 }, { "epoch": 1.418655097613883, "grad_norm": 1.7270291399037967, "learning_rate": 2.1402172811617e-06, "loss": 0.1602, "step": 3924 }, { "epoch": 1.4190166305133767, "grad_norm": 0.06597694054499324, "learning_rate": 2.1377658437854795e-06, "loss": 0.0031, "step": 3925 }, { "epoch": 1.4193781634128706, "grad_norm": 0.6673267221106229, "learning_rate": 2.1353154293711403e-06, "loss": 0.1699, "step": 3926 }, { "epoch": 1.4197396963123645, "grad_norm": 1.4216575306644705, "learning_rate": 2.1328660387944663e-06, "loss": 0.1309, "step": 3927 }, { "epoch": 1.4201012292118582, "grad_norm": 0.3869491730447887, "learning_rate": 2.1304176729308622e-06, "loss": 0.0182, "step": 3928 }, { "epoch": 1.4204627621113521, "grad_norm": 0.6043456264215117, "learning_rate": 2.1279703326553754e-06, "loss": 0.0476, "step": 3929 }, { "epoch": 1.420824295010846, "grad_norm": 0.14506546844036283, "learning_rate": 2.1255240188426863e-06, "loss": 0.0143, "step": 3930 }, { "epoch": 1.4211858279103398, "grad_norm": 0.0828569066438703, "learning_rate": 2.123078732367107e-06, "loss": 0.0039, "step": 3931 }, { "epoch": 1.4215473608098337, "grad_norm": 0.22323296400526477, "learning_rate": 2.120634474102581e-06, "loss": 0.0254, "step": 3932 }, { "epoch": 1.4219088937093276, "grad_norm": 0.19337655097556458, "learning_rate": 2.1181912449226873e-06, "loss": 0.0203, "step": 3933 }, { "epoch": 1.4222704266088213, "grad_norm": 0.2025693346392929, "learning_rate": 2.1157490457006337e-06, "loss": 0.0283, "step": 3934 }, { "epoch": 1.4226319595083152, "grad_norm": 0.09424626527714687, "learning_rate": 2.113307877309263e-06, "loss": 0.0143, "step": 3935 }, { "epoch": 1.4229934924078091, "grad_norm": 0.6717812276815655, "learning_rate": 2.1108677406210453e-06, "loss": 0.0527, "step": 3936 }, { "epoch": 1.4233550253073028, "grad_norm": 0.5171679388556047, "learning_rate": 2.1084286365080916e-06, "loss": 0.1914, "step": 3937 }, { "epoch": 1.4237165582067968, "grad_norm": 0.0017568099205510148, "learning_rate": 2.105990565842131e-06, "loss": 0.0001, "step": 3938 }, { "epoch": 1.4240780911062907, "grad_norm": 0.2488643239961615, "learning_rate": 2.103553529494529e-06, "loss": 0.0227, "step": 3939 }, { "epoch": 1.4244396240057844, "grad_norm": 1.1647268204160752, "learning_rate": 2.1011175283362866e-06, "loss": 0.0903, "step": 3940 }, { "epoch": 1.4248011569052783, "grad_norm": 0.15179188337489907, "learning_rate": 2.098682563238028e-06, "loss": 0.0203, "step": 3941 }, { "epoch": 1.4251626898047722, "grad_norm": 0.004614517727976265, "learning_rate": 2.096248635070009e-06, "loss": 0.0002, "step": 3942 }, { "epoch": 1.4255242227042662, "grad_norm": 0.10951910528114467, "learning_rate": 2.0938157447021146e-06, "loss": 0.0181, "step": 3943 }, { "epoch": 1.42588575560376, "grad_norm": 0.11984518990576754, "learning_rate": 2.09138389300386e-06, "loss": 0.0049, "step": 3944 }, { "epoch": 1.4262472885032538, "grad_norm": 0.5622675630929173, "learning_rate": 2.088953080844388e-06, "loss": 0.1406, "step": 3945 }, { "epoch": 1.4266088214027477, "grad_norm": 0.17188985135342535, "learning_rate": 2.0865233090924693e-06, "loss": 0.0254, "step": 3946 }, { "epoch": 1.4269703543022416, "grad_norm": 0.16328562454455595, "learning_rate": 2.084094578616508e-06, "loss": 0.009, "step": 3947 }, { "epoch": 1.4273318872017353, "grad_norm": 0.08372950648625639, "learning_rate": 2.0816668902845276e-06, "loss": 0.0143, "step": 3948 }, { "epoch": 1.4276934201012292, "grad_norm": 0.13191307927041684, "learning_rate": 2.0792402449641825e-06, "loss": 0.0203, "step": 3949 }, { "epoch": 1.4280549530007232, "grad_norm": 0.358692217296375, "learning_rate": 2.07681464352276e-06, "loss": 0.0315, "step": 3950 }, { "epoch": 1.4284164859002169, "grad_norm": 0.07906328031401384, "learning_rate": 2.074390086827166e-06, "loss": 0.0143, "step": 3951 }, { "epoch": 1.4287780187997108, "grad_norm": 0.20742913229716745, "learning_rate": 2.0719665757439382e-06, "loss": 0.0317, "step": 3952 }, { "epoch": 1.4291395516992047, "grad_norm": 0.6068865631585157, "learning_rate": 2.0695441111392385e-06, "loss": 0.1807, "step": 3953 }, { "epoch": 1.4295010845986984, "grad_norm": 0.10025459020800027, "learning_rate": 2.067122693878854e-06, "loss": 0.0161, "step": 3954 }, { "epoch": 1.4298626174981923, "grad_norm": 0.5261550273814877, "learning_rate": 2.0647023248282007e-06, "loss": 0.0204, "step": 3955 }, { "epoch": 1.4302241503976862, "grad_norm": 0.0035622916252827977, "learning_rate": 2.062283004852315e-06, "loss": 0.0002, "step": 3956 }, { "epoch": 1.43058568329718, "grad_norm": 0.5942698873796965, "learning_rate": 2.059864734815867e-06, "loss": 0.0635, "step": 3957 }, { "epoch": 1.4309472161966739, "grad_norm": 0.08442242257887826, "learning_rate": 2.0574475155831386e-06, "loss": 0.0143, "step": 3958 }, { "epoch": 1.4313087490961678, "grad_norm": 0.3138315549164327, "learning_rate": 2.055031348018049e-06, "loss": 0.0283, "step": 3959 }, { "epoch": 1.4316702819956615, "grad_norm": 0.2030582848783161, "learning_rate": 2.052616232984134e-06, "loss": 0.0182, "step": 3960 }, { "epoch": 1.4320318148951554, "grad_norm": 1.3764754606345528, "learning_rate": 2.050202171344556e-06, "loss": 0.1055, "step": 3961 }, { "epoch": 1.4323933477946493, "grad_norm": 1.2881161389852196, "learning_rate": 2.0477891639620984e-06, "loss": 0.1143, "step": 3962 }, { "epoch": 1.432754880694143, "grad_norm": 1.2110691602752464, "learning_rate": 2.0453772116991693e-06, "loss": 0.0635, "step": 3963 }, { "epoch": 1.433116413593637, "grad_norm": 0.09573078603581736, "learning_rate": 2.0429663154178046e-06, "loss": 0.0182, "step": 3964 }, { "epoch": 1.4334779464931309, "grad_norm": 0.0035454023204242167, "learning_rate": 2.040556475979653e-06, "loss": 0.0001, "step": 3965 }, { "epoch": 1.4338394793926248, "grad_norm": 0.8458806421923931, "learning_rate": 2.038147694245991e-06, "loss": 0.1406, "step": 3966 }, { "epoch": 1.4342010122921187, "grad_norm": 0.13517471233330045, "learning_rate": 2.035739971077721e-06, "loss": 0.0203, "step": 3967 }, { "epoch": 1.4345625451916124, "grad_norm": 0.44301437081201783, "learning_rate": 2.0333333073353563e-06, "loss": 0.0476, "step": 3968 }, { "epoch": 1.4349240780911063, "grad_norm": 0.07653044831584575, "learning_rate": 2.0309277038790433e-06, "loss": 0.0143, "step": 3969 }, { "epoch": 1.4352856109906003, "grad_norm": 0.7607743304167293, "learning_rate": 2.0285231615685423e-06, "loss": 0.1504, "step": 3970 }, { "epoch": 1.435647143890094, "grad_norm": 0.12600969484244198, "learning_rate": 2.0261196812632368e-06, "loss": 0.0203, "step": 3971 }, { "epoch": 1.4360086767895879, "grad_norm": 0.1450696343760392, "learning_rate": 2.0237172638221287e-06, "loss": 0.0203, "step": 3972 }, { "epoch": 1.4363702096890818, "grad_norm": 0.6380065228286635, "learning_rate": 2.021315910103841e-06, "loss": 0.1406, "step": 3973 }, { "epoch": 1.4367317425885755, "grad_norm": 0.730188519671193, "learning_rate": 2.0189156209666223e-06, "loss": 0.1504, "step": 3974 }, { "epoch": 1.4370932754880694, "grad_norm": 2.566241177985071, "learning_rate": 2.016516397268329e-06, "loss": 0.1406, "step": 3975 }, { "epoch": 1.4374548083875633, "grad_norm": 0.6855044912770024, "learning_rate": 2.0141182398664445e-06, "loss": 0.0388, "step": 3976 }, { "epoch": 1.437816341287057, "grad_norm": 0.11868679433711184, "learning_rate": 2.011721149618073e-06, "loss": 0.0182, "step": 3977 }, { "epoch": 1.438177874186551, "grad_norm": 0.8171144385163697, "learning_rate": 2.0093251273799313e-06, "loss": 0.0527, "step": 3978 }, { "epoch": 1.4385394070860449, "grad_norm": 0.41556907464949155, "learning_rate": 2.006930174008358e-06, "loss": 0.0254, "step": 3979 }, { "epoch": 1.4389009399855386, "grad_norm": 0.05983617803003663, "learning_rate": 2.004536290359309e-06, "loss": 0.0034, "step": 3980 }, { "epoch": 1.4392624728850325, "grad_norm": 0.09850784700128236, "learning_rate": 2.002143477288358e-06, "loss": 0.0161, "step": 3981 }, { "epoch": 1.4396240057845264, "grad_norm": 0.8022311462322206, "learning_rate": 1.999751735650695e-06, "loss": 0.1309, "step": 3982 }, { "epoch": 1.4399855386840201, "grad_norm": 0.2544162484012793, "learning_rate": 1.997361066301127e-06, "loss": 0.0203, "step": 3983 }, { "epoch": 1.440347071583514, "grad_norm": 0.1642020546050558, "learning_rate": 1.994971470094084e-06, "loss": 0.0254, "step": 3984 }, { "epoch": 1.440708604483008, "grad_norm": 0.09901544409821177, "learning_rate": 1.9925829478836013e-06, "loss": 0.0181, "step": 3985 }, { "epoch": 1.4410701373825017, "grad_norm": 0.06145909440746457, "learning_rate": 1.990195500523337e-06, "loss": 0.0031, "step": 3986 }, { "epoch": 1.4414316702819956, "grad_norm": 0.20677342277149074, "learning_rate": 1.9878091288665667e-06, "loss": 0.0254, "step": 3987 }, { "epoch": 1.4417932031814895, "grad_norm": 0.0021437985916020626, "learning_rate": 1.9854238337661786e-06, "loss": 0.0001, "step": 3988 }, { "epoch": 1.4421547360809834, "grad_norm": 0.26562921043198195, "learning_rate": 1.983039616074676e-06, "loss": 0.0352, "step": 3989 }, { "epoch": 1.4425162689804774, "grad_norm": 0.478790577554867, "learning_rate": 1.980656476644178e-06, "loss": 0.0693, "step": 3990 }, { "epoch": 1.442877801879971, "grad_norm": 0.22731097583252763, "learning_rate": 1.978274416326418e-06, "loss": 0.0315, "step": 3991 }, { "epoch": 1.443239334779465, "grad_norm": 0.9177226238114927, "learning_rate": 1.9758934359727432e-06, "loss": 0.1602, "step": 3992 }, { "epoch": 1.443600867678959, "grad_norm": 0.0017777149283565927, "learning_rate": 1.973513536434115e-06, "loss": 0.0001, "step": 3993 }, { "epoch": 1.4439624005784526, "grad_norm": 0.12045198721910821, "learning_rate": 1.971134718561114e-06, "loss": 0.0182, "step": 3994 }, { "epoch": 1.4443239334779465, "grad_norm": 0.9207055206647693, "learning_rate": 1.968756983203923e-06, "loss": 0.0903, "step": 3995 }, { "epoch": 1.4446854663774404, "grad_norm": 0.1728130222591075, "learning_rate": 1.9663803312123455e-06, "loss": 0.0114, "step": 3996 }, { "epoch": 1.4450469992769341, "grad_norm": 0.25258246295969866, "learning_rate": 1.964004763435799e-06, "loss": 0.0182, "step": 3997 }, { "epoch": 1.445408532176428, "grad_norm": 0.6575407875503484, "learning_rate": 1.961630280723309e-06, "loss": 0.1309, "step": 3998 }, { "epoch": 1.445770065075922, "grad_norm": 0.19182549297436752, "learning_rate": 1.9592568839235154e-06, "loss": 0.0114, "step": 3999 }, { "epoch": 1.4461315979754157, "grad_norm": 0.18245726236531276, "learning_rate": 1.9568845738846697e-06, "loss": 0.0227, "step": 4000 }, { "epoch": 1.4464931308749096, "grad_norm": 0.012037148818090622, "learning_rate": 1.9545133514546355e-06, "loss": 0.0005, "step": 4001 }, { "epoch": 1.4468546637744035, "grad_norm": 0.13282342396535002, "learning_rate": 1.9521432174808863e-06, "loss": 0.0227, "step": 4002 }, { "epoch": 1.4472161966738972, "grad_norm": 0.08405953143092054, "learning_rate": 1.949774172810507e-06, "loss": 0.0143, "step": 4003 }, { "epoch": 1.4475777295733911, "grad_norm": 0.06117087169860613, "learning_rate": 1.947406218290197e-06, "loss": 0.0019, "step": 4004 }, { "epoch": 1.447939262472885, "grad_norm": 0.33705589547967313, "learning_rate": 1.9450393547662593e-06, "loss": 0.0315, "step": 4005 }, { "epoch": 1.4483007953723788, "grad_norm": 0.13589189571184446, "learning_rate": 1.9426735830846093e-06, "loss": 0.009, "step": 4006 }, { "epoch": 1.4486623282718727, "grad_norm": 0.20134476799385914, "learning_rate": 1.940308904090778e-06, "loss": 0.0254, "step": 4007 }, { "epoch": 1.4490238611713666, "grad_norm": 0.6425275672713024, "learning_rate": 1.937945318629898e-06, "loss": 0.1699, "step": 4008 }, { "epoch": 1.4493853940708603, "grad_norm": 0.18677814921666344, "learning_rate": 1.9355828275467155e-06, "loss": 0.0254, "step": 4009 }, { "epoch": 1.4497469269703542, "grad_norm": 0.3200448471873007, "learning_rate": 1.933221431685583e-06, "loss": 0.0349, "step": 4010 }, { "epoch": 1.4501084598698482, "grad_norm": 0.14129179160017027, "learning_rate": 1.9308611318904643e-06, "loss": 0.0227, "step": 4011 }, { "epoch": 1.450469992769342, "grad_norm": 0.20887621785861588, "learning_rate": 1.928501929004929e-06, "loss": 0.0283, "step": 4012 }, { "epoch": 1.450831525668836, "grad_norm": 0.24341384490518486, "learning_rate": 1.926143823872154e-06, "loss": 0.008, "step": 4013 }, { "epoch": 1.4511930585683297, "grad_norm": 0.0992085129411519, "learning_rate": 1.9237868173349317e-06, "loss": 0.0182, "step": 4014 }, { "epoch": 1.4515545914678236, "grad_norm": 0.1642156183510608, "learning_rate": 1.9214309102356493e-06, "loss": 0.0283, "step": 4015 }, { "epoch": 1.4519161243673175, "grad_norm": 0.014684860348358931, "learning_rate": 1.9190761034163084e-06, "loss": 0.0006, "step": 4016 }, { "epoch": 1.4522776572668112, "grad_norm": 0.0897721189624904, "learning_rate": 1.91672239771852e-06, "loss": 0.0027, "step": 4017 }, { "epoch": 1.4526391901663052, "grad_norm": 0.37001943003745064, "learning_rate": 1.914369793983496e-06, "loss": 0.0227, "step": 4018 }, { "epoch": 1.453000723065799, "grad_norm": 0.010837130627410693, "learning_rate": 1.9120182930520564e-06, "loss": 0.0005, "step": 4019 }, { "epoch": 1.4533622559652928, "grad_norm": 0.0014123437465451152, "learning_rate": 1.909667895764627e-06, "loss": 0.0001, "step": 4020 }, { "epoch": 1.4537237888647867, "grad_norm": 0.0023305844144954125, "learning_rate": 1.9073186029612395e-06, "loss": 0.0001, "step": 4021 }, { "epoch": 1.4540853217642806, "grad_norm": 0.02486343042380763, "learning_rate": 1.90497041548153e-06, "loss": 0.0012, "step": 4022 }, { "epoch": 1.4544468546637743, "grad_norm": 0.0013003905829647093, "learning_rate": 1.9026233341647398e-06, "loss": 0.0001, "step": 4023 }, { "epoch": 1.4548083875632682, "grad_norm": 0.2125430302640459, "learning_rate": 1.90027735984972e-06, "loss": 0.0102, "step": 4024 }, { "epoch": 1.4551699204627622, "grad_norm": 0.17289898430711068, "learning_rate": 1.8979324933749155e-06, "loss": 0.0181, "step": 4025 }, { "epoch": 1.4555314533622559, "grad_norm": 0.0626345067862871, "learning_rate": 1.8955887355783814e-06, "loss": 0.0114, "step": 4026 }, { "epoch": 1.4558929862617498, "grad_norm": 1.0123473246204548, "learning_rate": 1.8932460872977803e-06, "loss": 0.0903, "step": 4027 }, { "epoch": 1.4562545191612437, "grad_norm": 0.16128028520589074, "learning_rate": 1.8909045493703716e-06, "loss": 0.0203, "step": 4028 }, { "epoch": 1.4566160520607374, "grad_norm": 0.28824244556154377, "learning_rate": 1.8885641226330204e-06, "loss": 0.0317, "step": 4029 }, { "epoch": 1.4569775849602313, "grad_norm": 0.7751073830998506, "learning_rate": 1.886224807922194e-06, "loss": 0.1143, "step": 4030 }, { "epoch": 1.4573391178597253, "grad_norm": 0.003220686845253955, "learning_rate": 1.8838866060739674e-06, "loss": 0.0001, "step": 4031 }, { "epoch": 1.457700650759219, "grad_norm": 0.6105382256299888, "learning_rate": 1.8815495179240084e-06, "loss": 0.0145, "step": 4032 }, { "epoch": 1.4580621836587129, "grad_norm": 0.20082334529397128, "learning_rate": 1.8792135443075915e-06, "loss": 0.0254, "step": 4033 }, { "epoch": 1.4584237165582068, "grad_norm": 0.021016828803617736, "learning_rate": 1.8768786860595995e-06, "loss": 0.001, "step": 4034 }, { "epoch": 1.4587852494577007, "grad_norm": 0.3249505934304125, "learning_rate": 1.8745449440145018e-06, "loss": 0.0388, "step": 4035 }, { "epoch": 1.4591467823571946, "grad_norm": 0.5933348878007474, "learning_rate": 1.8722123190063834e-06, "loss": 0.1699, "step": 4036 }, { "epoch": 1.4595083152566883, "grad_norm": 0.02295882989786434, "learning_rate": 1.8698808118689227e-06, "loss": 0.0012, "step": 4037 }, { "epoch": 1.4598698481561823, "grad_norm": 0.05980438757956209, "learning_rate": 1.8675504234353986e-06, "loss": 0.0024, "step": 4038 }, { "epoch": 1.4602313810556762, "grad_norm": 0.014331055646753441, "learning_rate": 1.8652211545386922e-06, "loss": 0.0007, "step": 4039 }, { "epoch": 1.4605929139551699, "grad_norm": 0.6257698722525443, "learning_rate": 1.862893006011282e-06, "loss": 0.1309, "step": 4040 }, { "epoch": 1.4609544468546638, "grad_norm": 0.1311482195304859, "learning_rate": 1.8605659786852531e-06, "loss": 0.0181, "step": 4041 }, { "epoch": 1.4613159797541577, "grad_norm": 0.9261331357354985, "learning_rate": 1.858240073392279e-06, "loss": 0.083, "step": 4042 }, { "epoch": 1.4616775126536514, "grad_norm": 0.047498733592708675, "learning_rate": 1.8559152909636375e-06, "loss": 0.0021, "step": 4043 }, { "epoch": 1.4620390455531453, "grad_norm": 0.18760932833573898, "learning_rate": 1.8535916322302111e-06, "loss": 0.0182, "step": 4044 }, { "epoch": 1.4624005784526393, "grad_norm": 0.634296852579604, "learning_rate": 1.8512690980224679e-06, "loss": 0.0432, "step": 4045 }, { "epoch": 1.462762111352133, "grad_norm": 0.7806503316061991, "learning_rate": 1.8489476891704866e-06, "loss": 0.1406, "step": 4046 }, { "epoch": 1.4631236442516269, "grad_norm": 0.09796630797217727, "learning_rate": 1.8466274065039358e-06, "loss": 0.0161, "step": 4047 }, { "epoch": 1.4634851771511208, "grad_norm": 0.0733258858903156, "learning_rate": 1.8443082508520848e-06, "loss": 0.0128, "step": 4048 }, { "epoch": 1.4638467100506145, "grad_norm": 0.00446379857081164, "learning_rate": 1.8419902230437985e-06, "loss": 0.0002, "step": 4049 }, { "epoch": 1.4642082429501084, "grad_norm": 0.015541810379461824, "learning_rate": 1.8396733239075387e-06, "loss": 0.0008, "step": 4050 }, { "epoch": 1.4645697758496024, "grad_norm": 0.3287513365105255, "learning_rate": 1.8373575542713696e-06, "loss": 0.0227, "step": 4051 }, { "epoch": 1.464931308749096, "grad_norm": 0.10052067216487004, "learning_rate": 1.8350429149629412e-06, "loss": 0.0143, "step": 4052 }, { "epoch": 1.46529284164859, "grad_norm": 0.10734898830205089, "learning_rate": 1.8327294068095054e-06, "loss": 0.0182, "step": 4053 }, { "epoch": 1.465654374548084, "grad_norm": 0.37838604647600843, "learning_rate": 1.8304170306379143e-06, "loss": 0.0315, "step": 4054 }, { "epoch": 1.4660159074475776, "grad_norm": 0.4312693367495238, "learning_rate": 1.8281057872746043e-06, "loss": 0.0432, "step": 4055 }, { "epoch": 1.4663774403470715, "grad_norm": 0.3796686267879344, "learning_rate": 1.8257956775456182e-06, "loss": 0.0315, "step": 4056 }, { "epoch": 1.4667389732465654, "grad_norm": 0.16560067733730577, "learning_rate": 1.8234867022765872e-06, "loss": 0.0227, "step": 4057 }, { "epoch": 1.4671005061460594, "grad_norm": 0.8778131865481502, "learning_rate": 1.8211788622927384e-06, "loss": 0.1406, "step": 4058 }, { "epoch": 1.4674620390455533, "grad_norm": 0.0017262317310648666, "learning_rate": 1.8188721584188934e-06, "loss": 0.0001, "step": 4059 }, { "epoch": 1.467823571945047, "grad_norm": 0.08425234245402162, "learning_rate": 1.8165665914794655e-06, "loss": 0.0143, "step": 4060 }, { "epoch": 1.468185104844541, "grad_norm": 0.10724966108830385, "learning_rate": 1.8142621622984702e-06, "loss": 0.0181, "step": 4061 }, { "epoch": 1.4685466377440348, "grad_norm": 0.12058405973666098, "learning_rate": 1.8119588716995035e-06, "loss": 0.0203, "step": 4062 }, { "epoch": 1.4689081706435285, "grad_norm": 0.6614324702845876, "learning_rate": 1.8096567205057614e-06, "loss": 0.2012, "step": 4063 }, { "epoch": 1.4692697035430224, "grad_norm": 1.9954455515138427, "learning_rate": 1.807355709540038e-06, "loss": 0.0203, "step": 4064 }, { "epoch": 1.4696312364425164, "grad_norm": 0.8629314354809875, "learning_rate": 1.8050558396247064e-06, "loss": 0.0635, "step": 4065 }, { "epoch": 1.46999276934201, "grad_norm": 0.732847263747133, "learning_rate": 1.8027571115817455e-06, "loss": 0.0693, "step": 4066 }, { "epoch": 1.470354302241504, "grad_norm": 0.0015444345595132275, "learning_rate": 1.800459526232718e-06, "loss": 0.0001, "step": 4067 }, { "epoch": 1.470715835140998, "grad_norm": 0.03244225138845818, "learning_rate": 1.7981630843987807e-06, "loss": 0.0021, "step": 4068 }, { "epoch": 1.4710773680404916, "grad_norm": 0.06482547984532751, "learning_rate": 1.7958677869006813e-06, "loss": 0.0031, "step": 4069 }, { "epoch": 1.4714389009399855, "grad_norm": 0.033207990032962695, "learning_rate": 1.7935736345587569e-06, "loss": 0.0013, "step": 4070 }, { "epoch": 1.4718004338394794, "grad_norm": 0.28060631410206455, "learning_rate": 1.7912806281929424e-06, "loss": 0.0254, "step": 4071 }, { "epoch": 1.4721619667389731, "grad_norm": 1.3162834799871934, "learning_rate": 1.788988768622752e-06, "loss": 0.1055, "step": 4072 }, { "epoch": 1.472523499638467, "grad_norm": 0.02532208438216343, "learning_rate": 1.786698056667297e-06, "loss": 0.001, "step": 4073 }, { "epoch": 1.472885032537961, "grad_norm": 0.15255855152197917, "learning_rate": 1.78440849314528e-06, "loss": 0.0013, "step": 4074 }, { "epoch": 1.4732465654374547, "grad_norm": 0.1955694210994345, "learning_rate": 1.7821200788749883e-06, "loss": 0.0254, "step": 4075 }, { "epoch": 1.4736080983369486, "grad_norm": 0.9890259328534345, "learning_rate": 1.7798328146743017e-06, "loss": 0.0635, "step": 4076 }, { "epoch": 1.4739696312364425, "grad_norm": 0.002597896058160173, "learning_rate": 1.7775467013606878e-06, "loss": 0.0001, "step": 4077 }, { "epoch": 1.4743311641359362, "grad_norm": 0.14296250855050557, "learning_rate": 1.7752617397512024e-06, "loss": 0.0203, "step": 4078 }, { "epoch": 1.4746926970354302, "grad_norm": 0.8142540872935137, "learning_rate": 1.7729779306624911e-06, "loss": 0.0762, "step": 4079 }, { "epoch": 1.475054229934924, "grad_norm": 0.2865508345762795, "learning_rate": 1.770695274910784e-06, "loss": 0.0182, "step": 4080 }, { "epoch": 1.475415762834418, "grad_norm": 0.744744395316121, "learning_rate": 1.7684137733119084e-06, "loss": 0.0579, "step": 4081 }, { "epoch": 1.475777295733912, "grad_norm": 0.004845168060442849, "learning_rate": 1.7661334266812657e-06, "loss": 0.0002, "step": 4082 }, { "epoch": 1.4761388286334056, "grad_norm": 0.12208942960400275, "learning_rate": 1.7638542358338528e-06, "loss": 0.0063, "step": 4083 }, { "epoch": 1.4765003615328995, "grad_norm": 0.18423843112476745, "learning_rate": 1.761576201584254e-06, "loss": 0.0203, "step": 4084 }, { "epoch": 1.4768618944323935, "grad_norm": 0.3623656255100465, "learning_rate": 1.7592993247466383e-06, "loss": 0.0317, "step": 4085 }, { "epoch": 1.4772234273318872, "grad_norm": 0.8219374013938068, "learning_rate": 1.7570236061347595e-06, "loss": 0.1309, "step": 4086 }, { "epoch": 1.477584960231381, "grad_norm": 0.8231925393883128, "learning_rate": 1.7547490465619593e-06, "loss": 0.0693, "step": 4087 }, { "epoch": 1.477946493130875, "grad_norm": 0.08988912790455694, "learning_rate": 1.7524756468411652e-06, "loss": 0.0143, "step": 4088 }, { "epoch": 1.4783080260303687, "grad_norm": 1.1397008777946849, "learning_rate": 1.7502034077848895e-06, "loss": 0.0903, "step": 4089 }, { "epoch": 1.4786695589298626, "grad_norm": 0.8564983983360513, "learning_rate": 1.747932330205228e-06, "loss": 0.1504, "step": 4090 }, { "epoch": 1.4790310918293565, "grad_norm": 0.004562384187803444, "learning_rate": 1.7456624149138697e-06, "loss": 0.0002, "step": 4091 }, { "epoch": 1.4793926247288502, "grad_norm": 0.11108353173683416, "learning_rate": 1.7433936627220748e-06, "loss": 0.0161, "step": 4092 }, { "epoch": 1.4797541576283442, "grad_norm": 0.6227197180780734, "learning_rate": 1.7411260744406965e-06, "loss": 0.2012, "step": 4093 }, { "epoch": 1.480115690527838, "grad_norm": 0.11314495689896019, "learning_rate": 1.738859650880173e-06, "loss": 0.0161, "step": 4094 }, { "epoch": 1.4804772234273318, "grad_norm": 0.1739543366653643, "learning_rate": 1.7365943928505219e-06, "loss": 0.0317, "step": 4095 }, { "epoch": 1.4808387563268257, "grad_norm": 1.007766384188841, "learning_rate": 1.734330301161346e-06, "loss": 0.0693, "step": 4096 }, { "epoch": 1.4812002892263196, "grad_norm": 0.5303328778771281, "learning_rate": 1.7320673766218316e-06, "loss": 0.1914, "step": 4097 }, { "epoch": 1.4815618221258133, "grad_norm": 0.4695760243940466, "learning_rate": 1.729805620040747e-06, "loss": 0.1699, "step": 4098 }, { "epoch": 1.4819233550253073, "grad_norm": 1.353933454533131, "learning_rate": 1.7275450322264437e-06, "loss": 0.1143, "step": 4099 }, { "epoch": 1.4822848879248012, "grad_norm": 0.0720791178848379, "learning_rate": 1.7252856139868536e-06, "loss": 0.0039, "step": 4100 }, { "epoch": 1.4826464208242949, "grad_norm": 0.1121691751626134, "learning_rate": 1.7230273661294972e-06, "loss": 0.0181, "step": 4101 }, { "epoch": 1.4830079537237888, "grad_norm": 0.10561525837395083, "learning_rate": 1.7207702894614653e-06, "loss": 0.0181, "step": 4102 }, { "epoch": 1.4833694866232827, "grad_norm": 0.8675457602285436, "learning_rate": 1.7185143847894415e-06, "loss": 0.0476, "step": 4103 }, { "epoch": 1.4837310195227766, "grad_norm": 0.00854392840206356, "learning_rate": 1.716259652919684e-06, "loss": 0.0001, "step": 4104 }, { "epoch": 1.4840925524222706, "grad_norm": 1.5978283206637791, "learning_rate": 1.7140060946580333e-06, "loss": 0.1309, "step": 4105 }, { "epoch": 1.4844540853217643, "grad_norm": 1.2009265528914166, "learning_rate": 1.7117537108099114e-06, "loss": 0.0635, "step": 4106 }, { "epoch": 1.4848156182212582, "grad_norm": 0.1252250262044585, "learning_rate": 1.7095025021803192e-06, "loss": 0.0181, "step": 4107 }, { "epoch": 1.485177151120752, "grad_norm": 0.16099243712129482, "learning_rate": 1.7072524695738386e-06, "loss": 0.0254, "step": 4108 }, { "epoch": 1.4855386840202458, "grad_norm": 0.43114059660885934, "learning_rate": 1.7050036137946296e-06, "loss": 0.0579, "step": 4109 }, { "epoch": 1.4859002169197397, "grad_norm": 0.0012532520647468375, "learning_rate": 1.7027559356464328e-06, "loss": 0.0, "step": 4110 }, { "epoch": 1.4862617498192336, "grad_norm": 0.17704077261838416, "learning_rate": 1.700509435932572e-06, "loss": 0.0203, "step": 4111 }, { "epoch": 1.4866232827187273, "grad_norm": 0.08715417889757622, "learning_rate": 1.6982641154559386e-06, "loss": 0.0161, "step": 4112 }, { "epoch": 1.4869848156182213, "grad_norm": 1.82923033616493, "learning_rate": 1.6960199750190153e-06, "loss": 0.0542, "step": 4113 }, { "epoch": 1.4873463485177152, "grad_norm": 0.09558267212799934, "learning_rate": 1.6937770154238559e-06, "loss": 0.0161, "step": 4114 }, { "epoch": 1.4877078814172089, "grad_norm": 0.016469594345759474, "learning_rate": 1.6915352374720939e-06, "loss": 0.0008, "step": 4115 }, { "epoch": 1.4880694143167028, "grad_norm": 0.25070349104069356, "learning_rate": 1.689294641964939e-06, "loss": 0.0145, "step": 4116 }, { "epoch": 1.4884309472161967, "grad_norm": 0.1638569952043222, "learning_rate": 1.6870552297031805e-06, "loss": 0.0227, "step": 4117 }, { "epoch": 1.4887924801156904, "grad_norm": 0.3668214866769323, "learning_rate": 1.6848170014871844e-06, "loss": 0.0352, "step": 4118 }, { "epoch": 1.4891540130151844, "grad_norm": 0.19237933788830666, "learning_rate": 1.6825799581168917e-06, "loss": 0.0254, "step": 4119 }, { "epoch": 1.4895155459146783, "grad_norm": 0.26398766532207163, "learning_rate": 1.6803441003918202e-06, "loss": 0.0145, "step": 4120 }, { "epoch": 1.489877078814172, "grad_norm": 0.14795267104301474, "learning_rate": 1.6781094291110705e-06, "loss": 0.0227, "step": 4121 }, { "epoch": 1.490238611713666, "grad_norm": 1.1962202940655795, "learning_rate": 1.6758759450733058e-06, "loss": 0.1699, "step": 4122 }, { "epoch": 1.4906001446131598, "grad_norm": 0.13165093381070672, "learning_rate": 1.6736436490767793e-06, "loss": 0.0161, "step": 4123 }, { "epoch": 1.4909616775126535, "grad_norm": 0.11985482448285609, "learning_rate": 1.67141254191931e-06, "loss": 0.0181, "step": 4124 }, { "epoch": 1.4913232104121474, "grad_norm": 0.1870646723342805, "learning_rate": 1.6691826243982955e-06, "loss": 0.0254, "step": 4125 }, { "epoch": 1.4916847433116414, "grad_norm": 0.018296887720791834, "learning_rate": 1.6669538973107087e-06, "loss": 0.0008, "step": 4126 }, { "epoch": 1.4920462762111353, "grad_norm": 0.4636991650542177, "learning_rate": 1.664726361453094e-06, "loss": 0.0432, "step": 4127 }, { "epoch": 1.4924078091106292, "grad_norm": 0.08210902486412923, "learning_rate": 1.6625000176215766e-06, "loss": 0.0161, "step": 4128 }, { "epoch": 1.492769342010123, "grad_norm": 1.3642324227764866, "learning_rate": 1.6602748666118474e-06, "loss": 0.0903, "step": 4129 }, { "epoch": 1.4931308749096168, "grad_norm": 0.570082422685175, "learning_rate": 1.6580509092191738e-06, "loss": 0.1699, "step": 4130 }, { "epoch": 1.4934924078091107, "grad_norm": 0.4802860005190491, "learning_rate": 1.6558281462384041e-06, "loss": 0.1914, "step": 4131 }, { "epoch": 1.4938539407086044, "grad_norm": 0.09526443010335374, "learning_rate": 1.6536065784639454e-06, "loss": 0.0143, "step": 4132 }, { "epoch": 1.4942154736080984, "grad_norm": 0.7816637511989153, "learning_rate": 1.6513862066897907e-06, "loss": 0.0579, "step": 4133 }, { "epoch": 1.4945770065075923, "grad_norm": 0.6233416671527832, "learning_rate": 1.6491670317094993e-06, "loss": 0.1914, "step": 4134 }, { "epoch": 1.494938539407086, "grad_norm": 0.4779380208954209, "learning_rate": 1.6469490543162037e-06, "loss": 0.1504, "step": 4135 }, { "epoch": 1.49530007230658, "grad_norm": 1.0988475908119455, "learning_rate": 1.6447322753026084e-06, "loss": 0.1143, "step": 4136 }, { "epoch": 1.4956616052060738, "grad_norm": 0.03745286507037307, "learning_rate": 1.6425166954609884e-06, "loss": 0.0015, "step": 4137 }, { "epoch": 1.4960231381055675, "grad_norm": 0.5416490217516224, "learning_rate": 1.640302315583196e-06, "loss": 0.1226, "step": 4138 }, { "epoch": 1.4963846710050615, "grad_norm": 0.122167623452118, "learning_rate": 1.6380891364606455e-06, "loss": 0.0161, "step": 4139 }, { "epoch": 1.4967462039045554, "grad_norm": 0.4316133798643259, "learning_rate": 1.635877158884326e-06, "loss": 0.0254, "step": 4140 }, { "epoch": 1.497107736804049, "grad_norm": 0.17570702589978898, "learning_rate": 1.6336663836448037e-06, "loss": 0.0254, "step": 4141 }, { "epoch": 1.497469269703543, "grad_norm": 0.25132941501699513, "learning_rate": 1.6314568115322015e-06, "loss": 0.0315, "step": 4142 }, { "epoch": 1.497830802603037, "grad_norm": 0.10671399967502655, "learning_rate": 1.6292484433362266e-06, "loss": 0.0203, "step": 4143 }, { "epoch": 1.4981923355025306, "grad_norm": 0.8525372477885538, "learning_rate": 1.627041279846146e-06, "loss": 0.1143, "step": 4144 }, { "epoch": 1.4985538684020245, "grad_norm": 0.099459995883467, "learning_rate": 1.6248353218508006e-06, "loss": 0.0056, "step": 4145 }, { "epoch": 1.4989154013015185, "grad_norm": 0.20363482475460792, "learning_rate": 1.6226305701385986e-06, "loss": 0.0283, "step": 4146 }, { "epoch": 1.4992769342010122, "grad_norm": 0.7043143419541986, "learning_rate": 1.6204270254975163e-06, "loss": 0.1226, "step": 4147 }, { "epoch": 1.499638467100506, "grad_norm": 0.008241337206326645, "learning_rate": 1.6182246887151055e-06, "loss": 0.0004, "step": 4148 }, { "epoch": 1.5, "grad_norm": 0.07197666305736768, "learning_rate": 1.6160235605784752e-06, "loss": 0.0044, "step": 4149 }, { "epoch": 1.5003615328994937, "grad_norm": 0.17545587854266242, "learning_rate": 1.6138236418743087e-06, "loss": 0.0254, "step": 4150 }, { "epoch": 1.5007230657989878, "grad_norm": 0.836595519972938, "learning_rate": 1.6116249333888617e-06, "loss": 0.0203, "step": 4151 }, { "epoch": 1.5010845986984815, "grad_norm": 0.522664938406787, "learning_rate": 1.6094274359079449e-06, "loss": 0.0162, "step": 4152 }, { "epoch": 1.5014461315979755, "grad_norm": 0.12124870295443536, "learning_rate": 1.607231150216948e-06, "loss": 0.008, "step": 4153 }, { "epoch": 1.5018076644974694, "grad_norm": 0.13288546486849553, "learning_rate": 1.6050360771008227e-06, "loss": 0.0227, "step": 4154 }, { "epoch": 1.502169197396963, "grad_norm": 0.09931515458886211, "learning_rate": 1.6028422173440867e-06, "loss": 0.0181, "step": 4155 }, { "epoch": 1.502530730296457, "grad_norm": 0.15360823880138397, "learning_rate": 1.6006495717308252e-06, "loss": 0.0203, "step": 4156 }, { "epoch": 1.502892263195951, "grad_norm": 0.7078703684567033, "learning_rate": 1.598458141044688e-06, "loss": 0.0317, "step": 4157 }, { "epoch": 1.5032537960954446, "grad_norm": 0.17010221088763716, "learning_rate": 1.5962679260688968e-06, "loss": 0.0254, "step": 4158 }, { "epoch": 1.5036153289949385, "grad_norm": 0.13813536171756327, "learning_rate": 1.5940789275862283e-06, "loss": 0.0161, "step": 4159 }, { "epoch": 1.5039768618944325, "grad_norm": 0.8452240128037756, "learning_rate": 1.591891146379031e-06, "loss": 0.0527, "step": 4160 }, { "epoch": 1.5043383947939262, "grad_norm": 0.18664573027144693, "learning_rate": 1.5897045832292217e-06, "loss": 0.0283, "step": 4161 }, { "epoch": 1.50469992769342, "grad_norm": 0.8805532655404427, "learning_rate": 1.587519238918272e-06, "loss": 0.1143, "step": 4162 }, { "epoch": 1.505061460592914, "grad_norm": 0.17172203501270958, "learning_rate": 1.5853351142272272e-06, "loss": 0.0227, "step": 4163 }, { "epoch": 1.5054229934924077, "grad_norm": 0.8720962515504794, "learning_rate": 1.583152209936692e-06, "loss": 0.0977, "step": 4164 }, { "epoch": 1.5057845263919016, "grad_norm": 0.2824727069555063, "learning_rate": 1.580970526826836e-06, "loss": 0.0283, "step": 4165 }, { "epoch": 1.5061460592913956, "grad_norm": 0.10574319583341713, "learning_rate": 1.5787900656773925e-06, "loss": 0.0203, "step": 4166 }, { "epoch": 1.5065075921908893, "grad_norm": 0.17809784969432055, "learning_rate": 1.5766108272676555e-06, "loss": 0.0203, "step": 4167 }, { "epoch": 1.5068691250903832, "grad_norm": 0.001338083096769304, "learning_rate": 1.5744328123764896e-06, "loss": 0.0001, "step": 4168 }, { "epoch": 1.507230657989877, "grad_norm": 0.008665488791189996, "learning_rate": 1.57225602178231e-06, "loss": 0.0004, "step": 4169 }, { "epoch": 1.5075921908893708, "grad_norm": 0.07192131246041653, "learning_rate": 1.5700804562631073e-06, "loss": 0.0044, "step": 4170 }, { "epoch": 1.507953723788865, "grad_norm": 0.13284890363116778, "learning_rate": 1.5679061165964253e-06, "loss": 0.0227, "step": 4171 }, { "epoch": 1.5083152566883586, "grad_norm": 0.7091610999434302, "learning_rate": 1.5657330035593726e-06, "loss": 0.0317, "step": 4172 }, { "epoch": 1.5086767895878523, "grad_norm": 0.623483191692433, "learning_rate": 1.5635611179286203e-06, "loss": 0.0432, "step": 4173 }, { "epoch": 1.5090383224873465, "grad_norm": 0.8103388252184915, "learning_rate": 1.5613904604803987e-06, "loss": 0.1699, "step": 4174 }, { "epoch": 1.5093998553868402, "grad_norm": 0.2534338958222556, "learning_rate": 1.559221031990501e-06, "loss": 0.0349, "step": 4175 }, { "epoch": 1.509761388286334, "grad_norm": 0.2826498158120306, "learning_rate": 1.5570528332342804e-06, "loss": 0.0254, "step": 4176 }, { "epoch": 1.510122921185828, "grad_norm": 0.26415505010693174, "learning_rate": 1.5548858649866489e-06, "loss": 0.0283, "step": 4177 }, { "epoch": 1.5104844540853217, "grad_norm": 1.9296435490058634, "learning_rate": 1.5527201280220855e-06, "loss": 0.1143, "step": 4178 }, { "epoch": 1.5108459869848156, "grad_norm": 0.5673952329443618, "learning_rate": 1.5505556231146178e-06, "loss": 0.1406, "step": 4179 }, { "epoch": 1.5112075198843096, "grad_norm": 0.2151627642761568, "learning_rate": 1.5483923510378441e-06, "loss": 0.0254, "step": 4180 }, { "epoch": 1.5115690527838033, "grad_norm": 0.011548454923884371, "learning_rate": 1.5462303125649152e-06, "loss": 0.0004, "step": 4181 }, { "epoch": 1.5119305856832972, "grad_norm": 0.1292850795292207, "learning_rate": 1.5440695084685436e-06, "loss": 0.0203, "step": 4182 }, { "epoch": 1.5122921185827911, "grad_norm": 0.005020003023126254, "learning_rate": 1.5419099395210008e-06, "loss": 0.0002, "step": 4183 }, { "epoch": 1.5126536514822848, "grad_norm": 0.07320339694488122, "learning_rate": 1.5397516064941159e-06, "loss": 0.0034, "step": 4184 }, { "epoch": 1.5130151843817787, "grad_norm": 0.19841829374382872, "learning_rate": 1.5375945101592766e-06, "loss": 0.0114, "step": 4185 }, { "epoch": 1.5133767172812727, "grad_norm": 0.15660286509933707, "learning_rate": 1.535438651287428e-06, "loss": 0.0203, "step": 4186 }, { "epoch": 1.5137382501807664, "grad_norm": 0.03578272427713787, "learning_rate": 1.5332840306490732e-06, "loss": 0.0013, "step": 4187 }, { "epoch": 1.5140997830802603, "grad_norm": 0.21351314879302, "learning_rate": 1.5311306490142775e-06, "loss": 0.0283, "step": 4188 }, { "epoch": 1.5144613159797542, "grad_norm": 0.1900876639471773, "learning_rate": 1.5289785071526524e-06, "loss": 0.0161, "step": 4189 }, { "epoch": 1.514822848879248, "grad_norm": 0.179766038345947, "learning_rate": 1.5268276058333782e-06, "loss": 0.0227, "step": 4190 }, { "epoch": 1.5151843817787418, "grad_norm": 0.7845410221291395, "learning_rate": 1.5246779458251843e-06, "loss": 0.1143, "step": 4191 }, { "epoch": 1.5155459146782357, "grad_norm": 0.0030649067560985558, "learning_rate": 1.5225295278963592e-06, "loss": 0.0001, "step": 4192 }, { "epoch": 1.5159074475777294, "grad_norm": 0.7461884833165866, "learning_rate": 1.520382352814747e-06, "loss": 0.1309, "step": 4193 }, { "epoch": 1.5162689804772236, "grad_norm": 0.019842950671216227, "learning_rate": 1.5182364213477475e-06, "loss": 0.0009, "step": 4194 }, { "epoch": 1.5166305133767173, "grad_norm": 0.006896142852446043, "learning_rate": 1.5160917342623166e-06, "loss": 0.0003, "step": 4195 }, { "epoch": 1.516992046276211, "grad_norm": 0.623579799377402, "learning_rate": 1.5139482923249643e-06, "loss": 0.0227, "step": 4196 }, { "epoch": 1.5173535791757051, "grad_norm": 0.7703262314331408, "learning_rate": 1.511806096301755e-06, "loss": 0.1143, "step": 4197 }, { "epoch": 1.5177151120751988, "grad_norm": 0.7714757041223238, "learning_rate": 1.509665146958314e-06, "loss": 0.0977, "step": 4198 }, { "epoch": 1.5180766449746927, "grad_norm": 0.7418933059269965, "learning_rate": 1.5075254450598099e-06, "loss": 0.0579, "step": 4199 }, { "epoch": 1.5184381778741867, "grad_norm": 0.5140939256672, "learning_rate": 1.5053869913709762e-06, "loss": 0.0527, "step": 4200 }, { "epoch": 1.5187997107736804, "grad_norm": 0.227537049973522, "learning_rate": 1.5032497866560942e-06, "loss": 0.0182, "step": 4201 }, { "epoch": 1.5191612436731743, "grad_norm": 0.2419539290046825, "learning_rate": 1.5011138316790002e-06, "loss": 0.0114, "step": 4202 }, { "epoch": 1.5195227765726682, "grad_norm": 0.4258001848285287, "learning_rate": 1.4989791272030846e-06, "loss": 0.0432, "step": 4203 }, { "epoch": 1.519884309472162, "grad_norm": 0.29864039656228836, "learning_rate": 1.49684567399129e-06, "loss": 0.0388, "step": 4204 }, { "epoch": 1.5202458423716558, "grad_norm": 0.10086181254613809, "learning_rate": 1.4947134728061124e-06, "loss": 0.0181, "step": 4205 }, { "epoch": 1.5206073752711498, "grad_norm": 0.036647181198517335, "learning_rate": 1.4925825244095998e-06, "loss": 0.0017, "step": 4206 }, { "epoch": 1.5209689081706435, "grad_norm": 0.349590173373748, "learning_rate": 1.4904528295633513e-06, "loss": 0.0114, "step": 4207 }, { "epoch": 1.5213304410701374, "grad_norm": 0.6572268298097119, "learning_rate": 1.4883243890285237e-06, "loss": 0.0579, "step": 4208 }, { "epoch": 1.5216919739696313, "grad_norm": 0.10361391801821848, "learning_rate": 1.4861972035658157e-06, "loss": 0.0203, "step": 4209 }, { "epoch": 1.522053506869125, "grad_norm": 0.039314715094689555, "learning_rate": 1.4840712739354867e-06, "loss": 0.0019, "step": 4210 }, { "epoch": 1.522415039768619, "grad_norm": 0.27842229093486426, "learning_rate": 1.4819466008973427e-06, "loss": 0.0315, "step": 4211 }, { "epoch": 1.5227765726681128, "grad_norm": 0.8507041103319, "learning_rate": 1.4798231852107409e-06, "loss": 0.0579, "step": 4212 }, { "epoch": 1.5231381055676065, "grad_norm": 0.0008202256518274586, "learning_rate": 1.47770102763459e-06, "loss": 0.0, "step": 4213 }, { "epoch": 1.5234996384671005, "grad_norm": 0.079453353884817, "learning_rate": 1.475580128927348e-06, "loss": 0.0019, "step": 4214 }, { "epoch": 1.5238611713665944, "grad_norm": 0.37479855418129643, "learning_rate": 1.4734604898470246e-06, "loss": 0.0388, "step": 4215 }, { "epoch": 1.524222704266088, "grad_norm": 0.26012884844834216, "learning_rate": 1.4713421111511778e-06, "loss": 0.0227, "step": 4216 }, { "epoch": 1.5245842371655822, "grad_norm": 1.045777124674899, "learning_rate": 1.4692249935969138e-06, "loss": 0.083, "step": 4217 }, { "epoch": 1.524945770065076, "grad_norm": 0.004978822311651192, "learning_rate": 1.4671091379408957e-06, "loss": 0.0002, "step": 4218 }, { "epoch": 1.5253073029645696, "grad_norm": 0.7139846661628642, "learning_rate": 1.4649945449393228e-06, "loss": 0.0476, "step": 4219 }, { "epoch": 1.5256688358640638, "grad_norm": 0.1829965988839813, "learning_rate": 1.462881215347955e-06, "loss": 0.0227, "step": 4220 }, { "epoch": 1.5260303687635575, "grad_norm": 0.3375597063992324, "learning_rate": 1.4607691499220943e-06, "loss": 0.0315, "step": 4221 }, { "epoch": 1.5263919016630514, "grad_norm": 0.4581553084921104, "learning_rate": 1.458658349416593e-06, "loss": 0.0432, "step": 4222 }, { "epoch": 1.5267534345625453, "grad_norm": 0.1774830038755547, "learning_rate": 1.4565488145858497e-06, "loss": 0.0254, "step": 4223 }, { "epoch": 1.527114967462039, "grad_norm": 0.17120520446502202, "learning_rate": 1.4544405461838107e-06, "loss": 0.0283, "step": 4224 }, { "epoch": 1.527476500361533, "grad_norm": 1.4289740332576975, "learning_rate": 1.4523335449639753e-06, "loss": 0.0143, "step": 4225 }, { "epoch": 1.5278380332610269, "grad_norm": 0.09493659692990336, "learning_rate": 1.450227811679381e-06, "loss": 0.0024, "step": 4226 }, { "epoch": 1.5281995661605206, "grad_norm": 0.2501367909860166, "learning_rate": 1.4481233470826162e-06, "loss": 0.0315, "step": 4227 }, { "epoch": 1.5285610990600145, "grad_norm": 0.14067981768275437, "learning_rate": 1.4460201519258205e-06, "loss": 0.0203, "step": 4228 }, { "epoch": 1.5289226319595084, "grad_norm": 0.4147033772980946, "learning_rate": 1.4439182269606695e-06, "loss": 0.0254, "step": 4229 }, { "epoch": 1.529284164859002, "grad_norm": 0.09659935836541067, "learning_rate": 1.4418175729383949e-06, "loss": 0.0181, "step": 4230 }, { "epoch": 1.529645697758496, "grad_norm": 0.07197034301480149, "learning_rate": 1.439718190609769e-06, "loss": 0.0027, "step": 4231 }, { "epoch": 1.53000723065799, "grad_norm": 0.09766367272818793, "learning_rate": 1.43762008072511e-06, "loss": 0.0161, "step": 4232 }, { "epoch": 1.5303687635574836, "grad_norm": 0.21739531518028968, "learning_rate": 1.4355232440342831e-06, "loss": 0.0129, "step": 4233 }, { "epoch": 1.5307302964569776, "grad_norm": 0.1867384090592487, "learning_rate": 1.4334276812866949e-06, "loss": 0.0101, "step": 4234 }, { "epoch": 1.5310918293564715, "grad_norm": 1.2156283655784776, "learning_rate": 1.4313333932313034e-06, "loss": 0.083, "step": 4235 }, { "epoch": 1.5314533622559652, "grad_norm": 0.15208810531856254, "learning_rate": 1.4292403806166027e-06, "loss": 0.0227, "step": 4236 }, { "epoch": 1.531814895155459, "grad_norm": 0.00670933505972446, "learning_rate": 1.4271486441906346e-06, "loss": 0.0003, "step": 4237 }, { "epoch": 1.532176428054953, "grad_norm": 0.48753263563578975, "learning_rate": 1.4250581847009903e-06, "loss": 0.0388, "step": 4238 }, { "epoch": 1.5325379609544467, "grad_norm": 0.5132889935691207, "learning_rate": 1.4229690028947935e-06, "loss": 0.0203, "step": 4239 }, { "epoch": 1.5328994938539409, "grad_norm": 0.193980899186702, "learning_rate": 1.4208810995187217e-06, "loss": 0.0283, "step": 4240 }, { "epoch": 1.5332610267534346, "grad_norm": 0.001453754717249526, "learning_rate": 1.4187944753189893e-06, "loss": 0.0001, "step": 4241 }, { "epoch": 1.5336225596529283, "grad_norm": 0.004500294921343772, "learning_rate": 1.4167091310413554e-06, "loss": 0.0002, "step": 4242 }, { "epoch": 1.5339840925524224, "grad_norm": 1.0165217810589342, "learning_rate": 1.4146250674311224e-06, "loss": 0.0579, "step": 4243 }, { "epoch": 1.534345625451916, "grad_norm": 0.20045468526661273, "learning_rate": 1.4125422852331323e-06, "loss": 0.0227, "step": 4244 }, { "epoch": 1.53470715835141, "grad_norm": 0.37934960711260773, "learning_rate": 1.4104607851917744e-06, "loss": 0.0432, "step": 4245 }, { "epoch": 1.535068691250904, "grad_norm": 0.103874842513773, "learning_rate": 1.408380568050972e-06, "loss": 0.0161, "step": 4246 }, { "epoch": 1.5354302241503976, "grad_norm": 0.11957725196850756, "learning_rate": 1.4063016345541986e-06, "loss": 0.0203, "step": 4247 }, { "epoch": 1.5357917570498916, "grad_norm": 0.27479676154615584, "learning_rate": 1.4042239854444633e-06, "loss": 0.0317, "step": 4248 }, { "epoch": 1.5361532899493855, "grad_norm": 0.002153215421512831, "learning_rate": 1.4021476214643138e-06, "loss": 0.0001, "step": 4249 }, { "epoch": 1.5365148228488792, "grad_norm": 0.08030460420255822, "learning_rate": 1.400072543355847e-06, "loss": 0.0128, "step": 4250 }, { "epoch": 1.5368763557483731, "grad_norm": 0.08930103392251577, "learning_rate": 1.397998751860693e-06, "loss": 0.0027, "step": 4251 }, { "epoch": 1.537237888647867, "grad_norm": 0.14306563153876936, "learning_rate": 1.3959262477200253e-06, "loss": 0.0227, "step": 4252 }, { "epoch": 1.5375994215473607, "grad_norm": 0.08889363887647383, "learning_rate": 1.3938550316745564e-06, "loss": 0.0035, "step": 4253 }, { "epoch": 1.5379609544468547, "grad_norm": 0.2494637059827425, "learning_rate": 1.3917851044645359e-06, "loss": 0.0315, "step": 4254 }, { "epoch": 1.5383224873463486, "grad_norm": 0.22679870474942482, "learning_rate": 1.3897164668297607e-06, "loss": 0.0349, "step": 4255 }, { "epoch": 1.5386840202458423, "grad_norm": 0.07887770589024966, "learning_rate": 1.387649119509556e-06, "loss": 0.0034, "step": 4256 }, { "epoch": 1.5390455531453362, "grad_norm": 0.5750951544423355, "learning_rate": 1.3855830632427941e-06, "loss": 0.0283, "step": 4257 }, { "epoch": 1.5394070860448301, "grad_norm": 0.7881081568376409, "learning_rate": 1.3835182987678852e-06, "loss": 0.1504, "step": 4258 }, { "epoch": 1.5397686189443238, "grad_norm": 0.0947704832180603, "learning_rate": 1.3814548268227695e-06, "loss": 0.0143, "step": 4259 }, { "epoch": 1.5401301518438177, "grad_norm": 0.08817824745982282, "learning_rate": 1.3793926481449365e-06, "loss": 0.0143, "step": 4260 }, { "epoch": 1.5404916847433117, "grad_norm": 0.23309903842222277, "learning_rate": 1.3773317634714073e-06, "loss": 0.0317, "step": 4261 }, { "epoch": 1.5408532176428054, "grad_norm": 0.018642703205590548, "learning_rate": 1.3752721735387409e-06, "loss": 0.0008, "step": 4262 }, { "epoch": 1.5412147505422995, "grad_norm": 0.0010710837944233242, "learning_rate": 1.3732138790830352e-06, "loss": 0.0, "step": 4263 }, { "epoch": 1.5415762834417932, "grad_norm": 0.001406076193970907, "learning_rate": 1.3711568808399216e-06, "loss": 0.0001, "step": 4264 }, { "epoch": 1.541937816341287, "grad_norm": 0.1825864816164767, "learning_rate": 1.3691011795445764e-06, "loss": 0.0181, "step": 4265 }, { "epoch": 1.542299349240781, "grad_norm": 0.16349636476113188, "learning_rate": 1.3670467759317002e-06, "loss": 0.0203, "step": 4266 }, { "epoch": 1.5426608821402747, "grad_norm": 0.0037987162199883914, "learning_rate": 1.364993670735541e-06, "loss": 0.0001, "step": 4267 }, { "epoch": 1.5430224150397687, "grad_norm": 0.14941032927997763, "learning_rate": 1.3629418646898767e-06, "loss": 0.0128, "step": 4268 }, { "epoch": 1.5433839479392626, "grad_norm": 0.10557373491837692, "learning_rate": 1.3608913585280231e-06, "loss": 0.0143, "step": 4269 }, { "epoch": 1.5437454808387563, "grad_norm": 0.019174251019036796, "learning_rate": 1.3588421529828293e-06, "loss": 0.0002, "step": 4270 }, { "epoch": 1.5441070137382502, "grad_norm": 0.04194579869600378, "learning_rate": 1.356794248786682e-06, "loss": 0.0013, "step": 4271 }, { "epoch": 1.5444685466377441, "grad_norm": 0.2887195872260428, "learning_rate": 1.354747646671501e-06, "loss": 0.0315, "step": 4272 }, { "epoch": 1.5448300795372378, "grad_norm": 2.466973254896221, "learning_rate": 1.3527023473687417e-06, "loss": 0.2129, "step": 4273 }, { "epoch": 1.5451916124367318, "grad_norm": 1.1329905245539171, "learning_rate": 1.3506583516093924e-06, "loss": 0.0388, "step": 4274 }, { "epoch": 1.5455531453362257, "grad_norm": 0.33970482500275934, "learning_rate": 1.3486156601239808e-06, "loss": 0.0129, "step": 4275 }, { "epoch": 1.5459146782357194, "grad_norm": 0.15520608802762764, "learning_rate": 1.346574273642559e-06, "loss": 0.0181, "step": 4276 }, { "epoch": 1.5462762111352133, "grad_norm": 0.003506373249354051, "learning_rate": 1.3445341928947225e-06, "loss": 0.0001, "step": 4277 }, { "epoch": 1.5466377440347072, "grad_norm": 0.6029426382357742, "learning_rate": 1.3424954186095935e-06, "loss": 0.1504, "step": 4278 }, { "epoch": 1.546999276934201, "grad_norm": 0.09139678777797867, "learning_rate": 1.3404579515158305e-06, "loss": 0.0143, "step": 4279 }, { "epoch": 1.5473608098336948, "grad_norm": 0.1124914126495076, "learning_rate": 1.3384217923416232e-06, "loss": 0.0161, "step": 4280 }, { "epoch": 1.5477223427331888, "grad_norm": 0.2063848638808947, "learning_rate": 1.336386941814694e-06, "loss": 0.0254, "step": 4281 }, { "epoch": 1.5480838756326825, "grad_norm": 0.5803616137098887, "learning_rate": 1.3343534006622993e-06, "loss": 0.1914, "step": 4282 }, { "epoch": 1.5484454085321764, "grad_norm": 0.05959222108243555, "learning_rate": 1.3323211696112253e-06, "loss": 0.0027, "step": 4283 }, { "epoch": 1.5488069414316703, "grad_norm": 0.5429056490305006, "learning_rate": 1.33029024938779e-06, "loss": 0.0476, "step": 4284 }, { "epoch": 1.549168474331164, "grad_norm": 0.8008702194265246, "learning_rate": 1.3282606407178477e-06, "loss": 0.0527, "step": 4285 }, { "epoch": 1.5495300072306581, "grad_norm": 0.8757280315768171, "learning_rate": 1.3262323443267748e-06, "loss": 0.0388, "step": 4286 }, { "epoch": 1.5498915401301518, "grad_norm": 0.0006092830647534157, "learning_rate": 1.3242053609394884e-06, "loss": 0.0, "step": 4287 }, { "epoch": 1.5502530730296455, "grad_norm": 0.7106668270920576, "learning_rate": 1.3221796912804303e-06, "loss": 0.1504, "step": 4288 }, { "epoch": 1.5506146059291397, "grad_norm": 0.6198160687443403, "learning_rate": 1.3201553360735742e-06, "loss": 0.1914, "step": 4289 }, { "epoch": 1.5509761388286334, "grad_norm": 0.0016477946303373183, "learning_rate": 1.3181322960424243e-06, "loss": 0.0001, "step": 4290 }, { "epoch": 1.5513376717281273, "grad_norm": 0.4695347618849243, "learning_rate": 1.3161105719100148e-06, "loss": 0.0388, "step": 4291 }, { "epoch": 1.5516992046276212, "grad_norm": 0.6355827593668237, "learning_rate": 1.3140901643989095e-06, "loss": 0.1602, "step": 4292 }, { "epoch": 1.552060737527115, "grad_norm": 1.0839773685149723, "learning_rate": 1.3120710742312015e-06, "loss": 0.1504, "step": 4293 }, { "epoch": 1.5524222704266089, "grad_norm": 0.09724022300641141, "learning_rate": 1.3100533021285116e-06, "loss": 0.0143, "step": 4294 }, { "epoch": 1.5527838033261028, "grad_norm": 0.13425239669522498, "learning_rate": 1.308036848811996e-06, "loss": 0.0203, "step": 4295 }, { "epoch": 1.5531453362255965, "grad_norm": 0.5423946720687274, "learning_rate": 1.3060217150023285e-06, "loss": 0.1699, "step": 4296 }, { "epoch": 1.5535068691250904, "grad_norm": 0.1935562695828615, "learning_rate": 1.304007901419722e-06, "loss": 0.0227, "step": 4297 }, { "epoch": 1.5538684020245843, "grad_norm": 0.190840330981391, "learning_rate": 1.3019954087839115e-06, "loss": 0.0227, "step": 4298 }, { "epoch": 1.554229934924078, "grad_norm": 0.07115388495702182, "learning_rate": 1.2999842378141619e-06, "loss": 0.003, "step": 4299 }, { "epoch": 1.554591467823572, "grad_norm": 0.12660043459691508, "learning_rate": 1.297974389229264e-06, "loss": 0.0203, "step": 4300 }, { "epoch": 1.5549530007230659, "grad_norm": 1.8865344556321244, "learning_rate": 1.2959658637475387e-06, "loss": 0.1602, "step": 4301 }, { "epoch": 1.5553145336225596, "grad_norm": 1.0263249265377068, "learning_rate": 1.293958662086832e-06, "loss": 0.0762, "step": 4302 }, { "epoch": 1.5556760665220535, "grad_norm": 0.000639337485394356, "learning_rate": 1.2919527849645174e-06, "loss": 0.0, "step": 4303 }, { "epoch": 1.5560375994215474, "grad_norm": 0.16120919400454792, "learning_rate": 1.2899482330974934e-06, "loss": 0.0203, "step": 4304 }, { "epoch": 1.556399132321041, "grad_norm": 0.11985043607645529, "learning_rate": 1.2879450072021905e-06, "loss": 0.0161, "step": 4305 }, { "epoch": 1.556760665220535, "grad_norm": 0.04138919679335346, "learning_rate": 1.2859431079945556e-06, "loss": 0.0012, "step": 4306 }, { "epoch": 1.557122198120029, "grad_norm": 0.23123014947661555, "learning_rate": 1.2839425361900721e-06, "loss": 0.0254, "step": 4307 }, { "epoch": 1.5574837310195226, "grad_norm": 1.812376753692816, "learning_rate": 1.2819432925037418e-06, "loss": 0.2695, "step": 4308 }, { "epoch": 1.5578452639190168, "grad_norm": 0.05642492342025191, "learning_rate": 1.2799453776500935e-06, "loss": 0.0021, "step": 4309 }, { "epoch": 1.5582067968185105, "grad_norm": 0.8142422323153733, "learning_rate": 1.2779487923431833e-06, "loss": 0.1309, "step": 4310 }, { "epoch": 1.5585683297180042, "grad_norm": 0.11467872186943645, "learning_rate": 1.275953537296588e-06, "loss": 0.0128, "step": 4311 }, { "epoch": 1.5589298626174983, "grad_norm": 0.2299041715960221, "learning_rate": 1.2739596132234133e-06, "loss": 0.0283, "step": 4312 }, { "epoch": 1.559291395516992, "grad_norm": 1.190769411111794, "learning_rate": 1.271967020836285e-06, "loss": 0.0977, "step": 4313 }, { "epoch": 1.559652928416486, "grad_norm": 1.047367858241319, "learning_rate": 1.2699757608473579e-06, "loss": 0.1226, "step": 4314 }, { "epoch": 1.5600144613159799, "grad_norm": 0.19994657464715349, "learning_rate": 1.2679858339683083e-06, "loss": 0.0254, "step": 4315 }, { "epoch": 1.5603759942154736, "grad_norm": 2.4105095310972344, "learning_rate": 1.265997240910331e-06, "loss": 0.1226, "step": 4316 }, { "epoch": 1.5607375271149675, "grad_norm": 0.7222032843982487, "learning_rate": 1.2640099823841535e-06, "loss": 0.1699, "step": 4317 }, { "epoch": 1.5610990600144614, "grad_norm": 0.42638686170918294, "learning_rate": 1.2620240591000194e-06, "loss": 0.0114, "step": 4318 }, { "epoch": 1.5614605929139551, "grad_norm": 0.8763242890224581, "learning_rate": 1.2600394717676983e-06, "loss": 0.0527, "step": 4319 }, { "epoch": 1.561822125813449, "grad_norm": 6.3928616419674915, "learning_rate": 1.2580562210964802e-06, "loss": 0.0903, "step": 4320 }, { "epoch": 1.562183658712943, "grad_norm": 2.5301021135327524, "learning_rate": 1.2560743077951776e-06, "loss": 0.1406, "step": 4321 }, { "epoch": 1.5625451916124367, "grad_norm": 1.0400527247997495, "learning_rate": 1.2540937325721302e-06, "loss": 0.1226, "step": 4322 }, { "epoch": 1.5629067245119306, "grad_norm": 0.7199127566855105, "learning_rate": 1.2521144961351893e-06, "loss": 0.1406, "step": 4323 }, { "epoch": 1.5632682574114245, "grad_norm": 0.18510323517264063, "learning_rate": 1.2501365991917386e-06, "loss": 0.0254, "step": 4324 }, { "epoch": 1.5636297903109182, "grad_norm": 0.0894865947334964, "learning_rate": 1.2481600424486768e-06, "loss": 0.0161, "step": 4325 }, { "epoch": 1.5639913232104121, "grad_norm": 0.0006337030616359267, "learning_rate": 1.2461848266124216e-06, "loss": 0.0, "step": 4326 }, { "epoch": 1.564352856109906, "grad_norm": 0.3507667611108386, "learning_rate": 1.244210952388918e-06, "loss": 0.0162, "step": 4327 }, { "epoch": 1.5647143890093997, "grad_norm": 0.09881000123135456, "learning_rate": 1.242238420483628e-06, "loss": 0.0143, "step": 4328 }, { "epoch": 1.5650759219088937, "grad_norm": 0.001102587056695518, "learning_rate": 1.240267231601533e-06, "loss": 0.0, "step": 4329 }, { "epoch": 1.5654374548083876, "grad_norm": 0.19040280406581322, "learning_rate": 1.2382973864471359e-06, "loss": 0.0315, "step": 4330 }, { "epoch": 1.5657989877078813, "grad_norm": 0.6821069384551516, "learning_rate": 1.2363288857244582e-06, "loss": 0.0635, "step": 4331 }, { "epoch": 1.5661605206073754, "grad_norm": 0.021087635514920204, "learning_rate": 1.2343617301370447e-06, "loss": 0.0008, "step": 4332 }, { "epoch": 1.5665220535068691, "grad_norm": 0.2661511175528891, "learning_rate": 1.2323959203879515e-06, "loss": 0.0315, "step": 4333 }, { "epoch": 1.5668835864063628, "grad_norm": 0.9436232160871432, "learning_rate": 1.2304314571797626e-06, "loss": 0.1406, "step": 4334 }, { "epoch": 1.567245119305857, "grad_norm": 0.0013231248415669376, "learning_rate": 1.228468341214577e-06, "loss": 0.0001, "step": 4335 }, { "epoch": 1.5676066522053507, "grad_norm": 0.7602078736396878, "learning_rate": 1.2265065731940074e-06, "loss": 0.0071, "step": 4336 }, { "epoch": 1.5679681851048446, "grad_norm": 1.2968048124466254, "learning_rate": 1.2245461538191938e-06, "loss": 0.0903, "step": 4337 }, { "epoch": 1.5683297180043385, "grad_norm": 0.5030545804849534, "learning_rate": 1.2225870837907876e-06, "loss": 0.1602, "step": 4338 }, { "epoch": 1.5686912509038322, "grad_norm": 0.09334773600057172, "learning_rate": 1.220629363808961e-06, "loss": 0.0161, "step": 4339 }, { "epoch": 1.5690527838033261, "grad_norm": 0.001600950409601046, "learning_rate": 1.2186729945734017e-06, "loss": 0.0001, "step": 4340 }, { "epoch": 1.56941431670282, "grad_norm": 0.038664167054424256, "learning_rate": 1.2167179767833148e-06, "loss": 0.0015, "step": 4341 }, { "epoch": 1.5697758496023138, "grad_norm": 0.28110018629362626, "learning_rate": 1.214764311137428e-06, "loss": 0.0254, "step": 4342 }, { "epoch": 1.5701373825018077, "grad_norm": 3.1902803155562336, "learning_rate": 1.2128119983339743e-06, "loss": 0.1406, "step": 4343 }, { "epoch": 1.5704989154013016, "grad_norm": 0.0005802068138673932, "learning_rate": 1.2108610390707143e-06, "loss": 0.0, "step": 4344 }, { "epoch": 1.5708604483007953, "grad_norm": 0.15738601737054272, "learning_rate": 1.2089114340449209e-06, "loss": 0.0254, "step": 4345 }, { "epoch": 1.5712219812002892, "grad_norm": 0.0936062349180792, "learning_rate": 1.206963183953378e-06, "loss": 0.0161, "step": 4346 }, { "epoch": 1.5715835140997831, "grad_norm": 0.5474101667608611, "learning_rate": 1.2050162894923945e-06, "loss": 0.0182, "step": 4347 }, { "epoch": 1.5719450469992768, "grad_norm": 0.001573638223017215, "learning_rate": 1.2030707513577876e-06, "loss": 0.0001, "step": 4348 }, { "epoch": 1.5723065798987708, "grad_norm": 0.04273699428784397, "learning_rate": 1.2011265702448932e-06, "loss": 0.0021, "step": 4349 }, { "epoch": 1.5726681127982647, "grad_norm": 0.37863564740230726, "learning_rate": 1.1991837468485606e-06, "loss": 0.0227, "step": 4350 }, { "epoch": 1.5730296456977584, "grad_norm": 0.12318302576240898, "learning_rate": 1.197242281863154e-06, "loss": 0.0181, "step": 4351 }, { "epoch": 1.5733911785972523, "grad_norm": 0.33069595228219184, "learning_rate": 1.1953021759825556e-06, "loss": 0.0283, "step": 4352 }, { "epoch": 1.5737527114967462, "grad_norm": 1.9336445543339016, "learning_rate": 1.193363429900154e-06, "loss": 0.0635, "step": 4353 }, { "epoch": 1.57411424439624, "grad_norm": 0.20356340124426753, "learning_rate": 1.1914260443088604e-06, "loss": 0.009, "step": 4354 }, { "epoch": 1.574475777295734, "grad_norm": 0.0010799432112870088, "learning_rate": 1.1894900199010962e-06, "loss": 0.0, "step": 4355 }, { "epoch": 1.5748373101952278, "grad_norm": 1.1881134629743222, "learning_rate": 1.187555357368792e-06, "loss": 0.1309, "step": 4356 }, { "epoch": 1.5751988430947215, "grad_norm": 0.045481991364301325, "learning_rate": 1.1856220574034e-06, "loss": 0.0005, "step": 4357 }, { "epoch": 1.5755603759942156, "grad_norm": 0.7807002487137785, "learning_rate": 1.1836901206958794e-06, "loss": 0.0579, "step": 4358 }, { "epoch": 1.5759219088937093, "grad_norm": 0.10490257986325462, "learning_rate": 1.181759547936705e-06, "loss": 0.0161, "step": 4359 }, { "epoch": 1.5762834417932032, "grad_norm": 1.517782245535088, "learning_rate": 1.1798303398158617e-06, "loss": 0.1055, "step": 4360 }, { "epoch": 1.5766449746926972, "grad_norm": 0.2140262725603072, "learning_rate": 1.1779024970228475e-06, "loss": 0.0227, "step": 4361 }, { "epoch": 1.5770065075921909, "grad_norm": 0.29571357270395904, "learning_rate": 1.1759760202466768e-06, "loss": 0.0317, "step": 4362 }, { "epoch": 1.5773680404916848, "grad_norm": 2.340774715859279, "learning_rate": 1.1740509101758675e-06, "loss": 0.1807, "step": 4363 }, { "epoch": 1.5777295733911787, "grad_norm": 0.13119451108210345, "learning_rate": 1.1721271674984557e-06, "loss": 0.005, "step": 4364 }, { "epoch": 1.5780911062906724, "grad_norm": 0.1499688795255958, "learning_rate": 1.170204792901989e-06, "loss": 0.0181, "step": 4365 }, { "epoch": 1.5784526391901663, "grad_norm": 0.23812713356646303, "learning_rate": 1.1682837870735174e-06, "loss": 0.0063, "step": 4366 }, { "epoch": 1.5788141720896602, "grad_norm": 0.9787056015222304, "learning_rate": 1.1663641506996132e-06, "loss": 0.0432, "step": 4367 }, { "epoch": 1.579175704989154, "grad_norm": 0.3328389503824865, "learning_rate": 1.1644458844663524e-06, "loss": 0.0115, "step": 4368 }, { "epoch": 1.5795372378886479, "grad_norm": 1.4679571031682495, "learning_rate": 1.162528989059324e-06, "loss": 0.0903, "step": 4369 }, { "epoch": 1.5798987707881418, "grad_norm": 0.6421564718933236, "learning_rate": 1.160613465163624e-06, "loss": 0.0476, "step": 4370 }, { "epoch": 1.5802603036876355, "grad_norm": 0.14803822665903965, "learning_rate": 1.1586993134638602e-06, "loss": 0.0254, "step": 4371 }, { "epoch": 1.5806218365871294, "grad_norm": 0.6450893836760758, "learning_rate": 1.1567865346441537e-06, "loss": 0.0527, "step": 4372 }, { "epoch": 1.5809833694866233, "grad_norm": 0.18228634589619364, "learning_rate": 1.154875129388126e-06, "loss": 0.0203, "step": 4373 }, { "epoch": 1.581344902386117, "grad_norm": 0.7937206928843864, "learning_rate": 1.152965098378917e-06, "loss": 0.0476, "step": 4374 }, { "epoch": 1.581706435285611, "grad_norm": 0.0009623259151896334, "learning_rate": 1.1510564422991704e-06, "loss": 0.0, "step": 4375 }, { "epoch": 1.5820679681851049, "grad_norm": 0.16568524349161992, "learning_rate": 1.1491491618310385e-06, "loss": 0.0181, "step": 4376 }, { "epoch": 1.5824295010845986, "grad_norm": 0.5597408525811002, "learning_rate": 1.1472432576561836e-06, "loss": 0.0527, "step": 4377 }, { "epoch": 1.5827910339840927, "grad_norm": 0.040879100744224024, "learning_rate": 1.145338730455775e-06, "loss": 0.0019, "step": 4378 }, { "epoch": 1.5831525668835864, "grad_norm": 0.1511022632307111, "learning_rate": 1.1434355809104914e-06, "loss": 0.0181, "step": 4379 }, { "epoch": 1.58351409978308, "grad_norm": 0.08289512070584153, "learning_rate": 1.1415338097005152e-06, "loss": 0.0039, "step": 4380 }, { "epoch": 1.5838756326825743, "grad_norm": 0.3919452260066501, "learning_rate": 1.139633417505543e-06, "loss": 0.0317, "step": 4381 }, { "epoch": 1.584237165582068, "grad_norm": 0.5375434278830804, "learning_rate": 1.137734405004774e-06, "loss": 0.1504, "step": 4382 }, { "epoch": 1.5845986984815619, "grad_norm": 0.14428232702780272, "learning_rate": 1.1358367728769103e-06, "loss": 0.0203, "step": 4383 }, { "epoch": 1.5849602313810558, "grad_norm": 1.0213907367560846, "learning_rate": 1.1339405218001704e-06, "loss": 0.0317, "step": 4384 }, { "epoch": 1.5853217642805495, "grad_norm": 0.2629255510545419, "learning_rate": 1.1320456524522721e-06, "loss": 0.0283, "step": 4385 }, { "epoch": 1.5856832971800434, "grad_norm": 0.4880593181083206, "learning_rate": 1.1301521655104413e-06, "loss": 0.1699, "step": 4386 }, { "epoch": 1.5860448300795373, "grad_norm": 0.24521959026023948, "learning_rate": 1.1282600616514093e-06, "loss": 0.0145, "step": 4387 }, { "epoch": 1.586406362979031, "grad_norm": 0.1780551548053547, "learning_rate": 1.1263693415514148e-06, "loss": 0.0227, "step": 4388 }, { "epoch": 1.586767895878525, "grad_norm": 0.19945572161107145, "learning_rate": 1.1244800058862004e-06, "loss": 0.0254, "step": 4389 }, { "epoch": 1.5871294287780189, "grad_norm": 0.7424890430806667, "learning_rate": 1.1225920553310121e-06, "loss": 0.0762, "step": 4390 }, { "epoch": 1.5874909616775126, "grad_norm": 0.6435596897305177, "learning_rate": 1.1207054905606064e-06, "loss": 0.1602, "step": 4391 }, { "epoch": 1.5878524945770065, "grad_norm": 0.10819248643061695, "learning_rate": 1.118820312249241e-06, "loss": 0.0181, "step": 4392 }, { "epoch": 1.5882140274765004, "grad_norm": 0.104579826915622, "learning_rate": 1.116936521070674e-06, "loss": 0.0181, "step": 4393 }, { "epoch": 1.5885755603759941, "grad_norm": 0.0005723224657033697, "learning_rate": 1.1150541176981767e-06, "loss": 0.0, "step": 4394 }, { "epoch": 1.588937093275488, "grad_norm": 0.13170515329770943, "learning_rate": 1.1131731028045179e-06, "loss": 0.0181, "step": 4395 }, { "epoch": 1.589298626174982, "grad_norm": 0.8429817693196018, "learning_rate": 1.1112934770619715e-06, "loss": 0.1309, "step": 4396 }, { "epoch": 1.5896601590744757, "grad_norm": 0.33938417228303036, "learning_rate": 1.109415241142316e-06, "loss": 0.0101, "step": 4397 }, { "epoch": 1.5900216919739696, "grad_norm": 0.20440514141771368, "learning_rate": 1.1075383957168317e-06, "loss": 0.0227, "step": 4398 }, { "epoch": 1.5903832248734635, "grad_norm": 0.0008927773099841068, "learning_rate": 1.105662941456304e-06, "loss": 0.0, "step": 4399 }, { "epoch": 1.5907447577729572, "grad_norm": 0.00047909158765793367, "learning_rate": 1.1037888790310174e-06, "loss": 0.0, "step": 4400 }, { "epoch": 1.5911062906724514, "grad_norm": 0.0017964898627449738, "learning_rate": 1.1019162091107648e-06, "loss": 0.0001, "step": 4401 }, { "epoch": 1.591467823571945, "grad_norm": 0.0008158377677631158, "learning_rate": 1.1000449323648376e-06, "loss": 0.0, "step": 4402 }, { "epoch": 1.5918293564714388, "grad_norm": 0.17079360836728993, "learning_rate": 1.0981750494620257e-06, "loss": 0.0203, "step": 4403 }, { "epoch": 1.592190889370933, "grad_norm": 0.20391135862743542, "learning_rate": 1.0963065610706292e-06, "loss": 0.0128, "step": 4404 }, { "epoch": 1.5925524222704266, "grad_norm": 0.277401911360893, "learning_rate": 1.0944394678584441e-06, "loss": 0.0315, "step": 4405 }, { "epoch": 1.5929139551699205, "grad_norm": 0.17131961990121689, "learning_rate": 1.0925737704927691e-06, "loss": 0.0181, "step": 4406 }, { "epoch": 1.5932754880694144, "grad_norm": 0.5341661688368766, "learning_rate": 1.0907094696404036e-06, "loss": 0.1914, "step": 4407 }, { "epoch": 1.5936370209689081, "grad_norm": 1.0036810981305821, "learning_rate": 1.0888465659676484e-06, "loss": 0.0635, "step": 4408 }, { "epoch": 1.593998553868402, "grad_norm": 0.2010737626449888, "learning_rate": 1.0869850601403054e-06, "loss": 0.0063, "step": 4409 }, { "epoch": 1.594360086767896, "grad_norm": 0.336629797718, "learning_rate": 1.0851249528236746e-06, "loss": 0.0227, "step": 4410 }, { "epoch": 1.5947216196673897, "grad_norm": 0.12896702056989465, "learning_rate": 1.083266244682561e-06, "loss": 0.0063, "step": 4411 }, { "epoch": 1.5950831525668836, "grad_norm": 1.1250765601442083, "learning_rate": 1.0814089363812664e-06, "loss": 0.0283, "step": 4412 }, { "epoch": 1.5954446854663775, "grad_norm": 0.03739159038684218, "learning_rate": 1.0795530285835882e-06, "loss": 0.0015, "step": 4413 }, { "epoch": 1.5958062183658712, "grad_norm": 0.03911593152927251, "learning_rate": 1.0776985219528313e-06, "loss": 0.0019, "step": 4414 }, { "epoch": 1.5961677512653651, "grad_norm": 0.08041294840123202, "learning_rate": 1.0758454171517951e-06, "loss": 0.0039, "step": 4415 }, { "epoch": 1.596529284164859, "grad_norm": 0.714749788176226, "learning_rate": 1.0739937148427788e-06, "loss": 0.1309, "step": 4416 }, { "epoch": 1.5968908170643528, "grad_norm": 0.004245914077195536, "learning_rate": 1.0721434156875798e-06, "loss": 0.0002, "step": 4417 }, { "epoch": 1.5972523499638467, "grad_norm": 0.0019422376181229164, "learning_rate": 1.070294520347494e-06, "loss": 0.0, "step": 4418 }, { "epoch": 1.5976138828633406, "grad_norm": 0.1796544734558825, "learning_rate": 1.06844702948332e-06, "loss": 0.0283, "step": 4419 }, { "epoch": 1.5979754157628343, "grad_norm": 0.20785876035644016, "learning_rate": 1.0666009437553455e-06, "loss": 0.0203, "step": 4420 }, { "epoch": 1.5983369486623282, "grad_norm": 0.17305695382733452, "learning_rate": 1.064756263823365e-06, "loss": 0.0227, "step": 4421 }, { "epoch": 1.5986984815618221, "grad_norm": 0.10129341523529056, "learning_rate": 1.0629129903466662e-06, "loss": 0.0044, "step": 4422 }, { "epoch": 1.5990600144613158, "grad_norm": 0.01920284539119667, "learning_rate": 1.061071123984031e-06, "loss": 0.0009, "step": 4423 }, { "epoch": 1.59942154736081, "grad_norm": 0.09853207870742553, "learning_rate": 1.0592306653937468e-06, "loss": 0.0143, "step": 4424 }, { "epoch": 1.5997830802603037, "grad_norm": 0.025967824236143237, "learning_rate": 1.0573916152335905e-06, "loss": 0.0011, "step": 4425 }, { "epoch": 1.6001446131597974, "grad_norm": 0.4260245288765066, "learning_rate": 1.0555539741608395e-06, "loss": 0.0283, "step": 4426 }, { "epoch": 1.6005061460592915, "grad_norm": 0.18128659714084935, "learning_rate": 1.0537177428322653e-06, "loss": 0.0081, "step": 4427 }, { "epoch": 1.6008676789587852, "grad_norm": 0.34581501805204495, "learning_rate": 1.0518829219041355e-06, "loss": 0.0352, "step": 4428 }, { "epoch": 1.6012292118582792, "grad_norm": 0.12930619525707904, "learning_rate": 1.050049512032219e-06, "loss": 0.0181, "step": 4429 }, { "epoch": 1.601590744757773, "grad_norm": 0.632627502109231, "learning_rate": 1.048217513871771e-06, "loss": 0.1807, "step": 4430 }, { "epoch": 1.6019522776572668, "grad_norm": 0.21535693139151826, "learning_rate": 1.0463869280775508e-06, "loss": 0.0254, "step": 4431 }, { "epoch": 1.6023138105567607, "grad_norm": 0.3930609704609464, "learning_rate": 1.0445577553038088e-06, "loss": 0.0388, "step": 4432 }, { "epoch": 1.6026753434562546, "grad_norm": 0.3705495995674346, "learning_rate": 1.0427299962042886e-06, "loss": 0.0114, "step": 4433 }, { "epoch": 1.6030368763557483, "grad_norm": 0.0995953750747575, "learning_rate": 1.0409036514322336e-06, "loss": 0.0161, "step": 4434 }, { "epoch": 1.6033984092552422, "grad_norm": 0.16730773096687118, "learning_rate": 1.0390787216403776e-06, "loss": 0.0181, "step": 4435 }, { "epoch": 1.6037599421547362, "grad_norm": 0.012955745561881536, "learning_rate": 1.037255207480951e-06, "loss": 0.0003, "step": 4436 }, { "epoch": 1.6041214750542299, "grad_norm": 0.5312341822337423, "learning_rate": 1.0354331096056768e-06, "loss": 0.0145, "step": 4437 }, { "epoch": 1.6044830079537238, "grad_norm": 0.8666330266536705, "learning_rate": 1.0336124286657712e-06, "loss": 0.0227, "step": 4438 }, { "epoch": 1.6048445408532177, "grad_norm": 0.6657680399254163, "learning_rate": 1.031793165311949e-06, "loss": 0.1504, "step": 4439 }, { "epoch": 1.6052060737527114, "grad_norm": 1.1017449450200423, "learning_rate": 1.0299753201944097e-06, "loss": 0.0903, "step": 4440 }, { "epoch": 1.6055676066522053, "grad_norm": 0.21308376530794734, "learning_rate": 1.0281588939628546e-06, "loss": 0.0315, "step": 4441 }, { "epoch": 1.6059291395516992, "grad_norm": 0.12397588605534816, "learning_rate": 1.026343887266474e-06, "loss": 0.0182, "step": 4442 }, { "epoch": 1.606290672451193, "grad_norm": 0.016032915221822076, "learning_rate": 1.0245303007539465e-06, "loss": 0.0007, "step": 4443 }, { "epoch": 1.6066522053506869, "grad_norm": 0.3157031436020191, "learning_rate": 1.0227181350734517e-06, "loss": 0.0352, "step": 4444 }, { "epoch": 1.6070137382501808, "grad_norm": 0.2764288575612955, "learning_rate": 1.0209073908726563e-06, "loss": 0.009, "step": 4445 }, { "epoch": 1.6073752711496745, "grad_norm": 0.9791645772098062, "learning_rate": 1.0190980687987196e-06, "loss": 0.1309, "step": 4446 }, { "epoch": 1.6077368040491686, "grad_norm": 0.9345269528211784, "learning_rate": 1.0172901694982917e-06, "loss": 0.0635, "step": 4447 }, { "epoch": 1.6080983369486623, "grad_norm": 0.27654540602781025, "learning_rate": 1.0154836936175182e-06, "loss": 0.0254, "step": 4448 }, { "epoch": 1.608459869848156, "grad_norm": 0.22091067170518336, "learning_rate": 1.013678641802033e-06, "loss": 0.0254, "step": 4449 }, { "epoch": 1.6088214027476502, "grad_norm": 0.021064508942706364, "learning_rate": 1.011875014696957e-06, "loss": 0.0008, "step": 4450 }, { "epoch": 1.6091829356471439, "grad_norm": 0.16446202201032403, "learning_rate": 1.0100728129469105e-06, "loss": 0.0254, "step": 4451 }, { "epoch": 1.6095444685466378, "grad_norm": 0.00047070708603158, "learning_rate": 1.0082720371959986e-06, "loss": 0.0, "step": 4452 }, { "epoch": 1.6099060014461317, "grad_norm": 0.14536351512103612, "learning_rate": 1.0064726880878183e-06, "loss": 0.0227, "step": 4453 }, { "epoch": 1.6102675343456254, "grad_norm": 0.0002797437282182093, "learning_rate": 1.0046747662654566e-06, "loss": 0.0, "step": 4454 }, { "epoch": 1.6106290672451193, "grad_norm": 0.0007728714414728953, "learning_rate": 1.0028782723714904e-06, "loss": 0.0, "step": 4455 }, { "epoch": 1.6109906001446133, "grad_norm": 0.14460858128039816, "learning_rate": 1.001083207047986e-06, "loss": 0.0203, "step": 4456 }, { "epoch": 1.611352133044107, "grad_norm": 0.11781187927116472, "learning_rate": 9.992895709364974e-07, "loss": 0.0027, "step": 4457 }, { "epoch": 1.6117136659436009, "grad_norm": 0.3953719808397564, "learning_rate": 9.97497364678074e-07, "loss": 0.0352, "step": 4458 }, { "epoch": 1.6120751988430948, "grad_norm": 0.4848954643169107, "learning_rate": 9.95706588913249e-07, "loss": 0.0182, "step": 4459 }, { "epoch": 1.6124367317425885, "grad_norm": 0.01256460919951821, "learning_rate": 9.93917244282041e-07, "loss": 0.0006, "step": 4460 }, { "epoch": 1.6127982646420824, "grad_norm": 0.3406819194456112, "learning_rate": 9.921293314239667e-07, "loss": 0.0315, "step": 4461 }, { "epoch": 1.6131597975415763, "grad_norm": 1.5774350854095809, "learning_rate": 9.903428509780226e-07, "loss": 0.1143, "step": 4462 }, { "epoch": 1.61352133044107, "grad_norm": 0.0027990443722605085, "learning_rate": 9.885578035826976e-07, "loss": 0.0001, "step": 4463 }, { "epoch": 1.613882863340564, "grad_norm": 0.48678110473255637, "learning_rate": 9.86774189875967e-07, "loss": 0.0128, "step": 4464 }, { "epoch": 1.6142443962400579, "grad_norm": 2.7726527212230114, "learning_rate": 9.849920104952942e-07, "loss": 0.1914, "step": 4465 }, { "epoch": 1.6146059291395516, "grad_norm": 0.21535623096383832, "learning_rate": 9.832112660776295e-07, "loss": 0.0283, "step": 4466 }, { "epoch": 1.6149674620390455, "grad_norm": 0.46930189139690376, "learning_rate": 9.81431957259409e-07, "loss": 0.0145, "step": 4467 }, { "epoch": 1.6153289949385394, "grad_norm": 0.334353625251463, "learning_rate": 9.796540846765606e-07, "loss": 0.0227, "step": 4468 }, { "epoch": 1.6156905278380331, "grad_norm": 0.6188554388493186, "learning_rate": 9.778776489644948e-07, "loss": 0.1406, "step": 4469 }, { "epoch": 1.6160520607375273, "grad_norm": 0.7242727411004378, "learning_rate": 9.76102650758106e-07, "loss": 0.0693, "step": 4470 }, { "epoch": 1.616413593637021, "grad_norm": 0.11123234916265753, "learning_rate": 9.743290906917818e-07, "loss": 0.0181, "step": 4471 }, { "epoch": 1.6167751265365147, "grad_norm": 0.16493739391679613, "learning_rate": 9.725569693993914e-07, "loss": 0.0227, "step": 4472 }, { "epoch": 1.6171366594360088, "grad_norm": 0.5221301362229307, "learning_rate": 9.707862875142898e-07, "loss": 0.0162, "step": 4473 }, { "epoch": 1.6174981923355025, "grad_norm": 0.23586956386310562, "learning_rate": 9.690170456693187e-07, "loss": 0.0283, "step": 4474 }, { "epoch": 1.6178597252349964, "grad_norm": 0.47796745354988646, "learning_rate": 9.67249244496805e-07, "loss": 0.0432, "step": 4475 }, { "epoch": 1.6182212581344904, "grad_norm": 1.33686436132168, "learning_rate": 9.654828846285601e-07, "loss": 0.0903, "step": 4476 }, { "epoch": 1.618582791033984, "grad_norm": 0.41883810939400207, "learning_rate": 9.6371796669588e-07, "loss": 0.0388, "step": 4477 }, { "epoch": 1.618944323933478, "grad_norm": 0.19442761881004467, "learning_rate": 9.619544913295475e-07, "loss": 0.0283, "step": 4478 }, { "epoch": 1.619305856832972, "grad_norm": 0.30831591659414315, "learning_rate": 9.601924591598294e-07, "loss": 0.0315, "step": 4479 }, { "epoch": 1.6196673897324656, "grad_norm": 1.0220443572513889, "learning_rate": 9.584318708164707e-07, "loss": 0.1055, "step": 4480 }, { "epoch": 1.6200289226319595, "grad_norm": 0.928088049838355, "learning_rate": 9.566727269287103e-07, "loss": 0.0227, "step": 4481 }, { "epoch": 1.6203904555314534, "grad_norm": 0.8973525404222168, "learning_rate": 9.549150281252633e-07, "loss": 0.1143, "step": 4482 }, { "epoch": 1.6207519884309471, "grad_norm": 0.07947651250814886, "learning_rate": 9.531587750343318e-07, "loss": 0.0039, "step": 4483 }, { "epoch": 1.621113521330441, "grad_norm": 0.002804108935145926, "learning_rate": 9.514039682836002e-07, "loss": 0.0001, "step": 4484 }, { "epoch": 1.621475054229935, "grad_norm": 0.11076577289862558, "learning_rate": 9.49650608500236e-07, "loss": 0.0181, "step": 4485 }, { "epoch": 1.6218365871294287, "grad_norm": 0.7644700619707288, "learning_rate": 9.478986963108894e-07, "loss": 0.1406, "step": 4486 }, { "epoch": 1.6221981200289226, "grad_norm": 0.20848142759929017, "learning_rate": 9.461482323416921e-07, "loss": 0.0056, "step": 4487 }, { "epoch": 1.6225596529284165, "grad_norm": 0.12956713895059033, "learning_rate": 9.443992172182625e-07, "loss": 0.0012, "step": 4488 }, { "epoch": 1.6229211858279102, "grad_norm": 0.49981351852752115, "learning_rate": 9.426516515656986e-07, "loss": 0.0352, "step": 4489 }, { "epoch": 1.6232827187274042, "grad_norm": 0.14758192576573353, "learning_rate": 9.409055360085751e-07, "loss": 0.0181, "step": 4490 }, { "epoch": 1.623644251626898, "grad_norm": 0.8108064131280429, "learning_rate": 9.391608711709577e-07, "loss": 0.1602, "step": 4491 }, { "epoch": 1.6240057845263918, "grad_norm": 0.09062608359000554, "learning_rate": 9.374176576763877e-07, "loss": 0.0143, "step": 4492 }, { "epoch": 1.624367317425886, "grad_norm": 0.0017817266320239338, "learning_rate": 9.356758961478902e-07, "loss": 0.0001, "step": 4493 }, { "epoch": 1.6247288503253796, "grad_norm": 0.00886325415206616, "learning_rate": 9.339355872079686e-07, "loss": 0.0001, "step": 4494 }, { "epoch": 1.6250903832248733, "grad_norm": 0.00030318732549033435, "learning_rate": 9.321967314786107e-07, "loss": 0.0, "step": 4495 }, { "epoch": 1.6254519161243675, "grad_norm": 0.23116883593608975, "learning_rate": 9.304593295812825e-07, "loss": 0.0315, "step": 4496 }, { "epoch": 1.6258134490238612, "grad_norm": 0.3308397432605821, "learning_rate": 9.287233821369302e-07, "loss": 0.0317, "step": 4497 }, { "epoch": 1.626174981923355, "grad_norm": 0.08662527668277342, "learning_rate": 9.269888897659824e-07, "loss": 0.0143, "step": 4498 }, { "epoch": 1.626536514822849, "grad_norm": 1.885231225414049, "learning_rate": 9.252558530883477e-07, "loss": 0.0693, "step": 4499 }, { "epoch": 1.6268980477223427, "grad_norm": 0.2886118159070132, "learning_rate": 9.235242727234084e-07, "loss": 0.0283, "step": 4500 }, { "epoch": 1.6272595806218366, "grad_norm": 0.4723941102845316, "learning_rate": 9.21794149290035e-07, "loss": 0.0145, "step": 4501 }, { "epoch": 1.6276211135213305, "grad_norm": 1.0083196352098824, "learning_rate": 9.200654834065719e-07, "loss": 0.0693, "step": 4502 }, { "epoch": 1.6279826464208242, "grad_norm": 0.12805566681872121, "learning_rate": 9.183382756908438e-07, "loss": 0.0181, "step": 4503 }, { "epoch": 1.6283441793203182, "grad_norm": 0.11758954210350687, "learning_rate": 9.16612526760155e-07, "loss": 0.0143, "step": 4504 }, { "epoch": 1.628705712219812, "grad_norm": 0.22903167728327975, "learning_rate": 9.148882372312872e-07, "loss": 0.0315, "step": 4505 }, { "epoch": 1.6290672451193058, "grad_norm": 0.02260480490078233, "learning_rate": 9.131654077205016e-07, "loss": 0.0012, "step": 4506 }, { "epoch": 1.6294287780187997, "grad_norm": 0.11620373791876674, "learning_rate": 9.114440388435353e-07, "loss": 0.0181, "step": 4507 }, { "epoch": 1.6297903109182936, "grad_norm": 0.1201804178999716, "learning_rate": 9.097241312156074e-07, "loss": 0.0203, "step": 4508 }, { "epoch": 1.6301518438177873, "grad_norm": 1.8732627248205944, "learning_rate": 9.080056854514135e-07, "loss": 0.2129, "step": 4509 }, { "epoch": 1.6305133767172812, "grad_norm": 0.4718936225668434, "learning_rate": 9.062887021651218e-07, "loss": 0.0315, "step": 4510 }, { "epoch": 1.6308749096167752, "grad_norm": 0.33141870936751316, "learning_rate": 9.045731819703851e-07, "loss": 0.0388, "step": 4511 }, { "epoch": 1.6312364425162689, "grad_norm": 0.1926541166367481, "learning_rate": 9.028591254803287e-07, "loss": 0.0283, "step": 4512 }, { "epoch": 1.6315979754157628, "grad_norm": 1.793133562616521, "learning_rate": 9.011465333075565e-07, "loss": 0.1699, "step": 4513 }, { "epoch": 1.6319595083152567, "grad_norm": 0.06052009504872099, "learning_rate": 8.99435406064148e-07, "loss": 0.0017, "step": 4514 }, { "epoch": 1.6323210412147504, "grad_norm": 1.7176202450591396, "learning_rate": 8.977257443616588e-07, "loss": 0.2012, "step": 4515 }, { "epoch": 1.6326825741142446, "grad_norm": 0.14421494884875566, "learning_rate": 8.960175488111255e-07, "loss": 0.0203, "step": 4516 }, { "epoch": 1.6330441070137383, "grad_norm": 0.3828952785407965, "learning_rate": 8.943108200230516e-07, "loss": 0.0227, "step": 4517 }, { "epoch": 1.633405639913232, "grad_norm": 0.22594814759427623, "learning_rate": 8.92605558607425e-07, "loss": 0.0283, "step": 4518 }, { "epoch": 1.633767172812726, "grad_norm": 0.006085323386826051, "learning_rate": 8.90901765173705e-07, "loss": 0.0003, "step": 4519 }, { "epoch": 1.6341287057122198, "grad_norm": 0.0003759756113932648, "learning_rate": 8.891994403308274e-07, "loss": 0.0, "step": 4520 }, { "epoch": 1.6344902386117137, "grad_norm": 0.0003306207366272599, "learning_rate": 8.87498584687202e-07, "loss": 0.0, "step": 4521 }, { "epoch": 1.6348517715112076, "grad_norm": 0.7383321813340681, "learning_rate": 8.857991988507148e-07, "loss": 0.0283, "step": 4522 }, { "epoch": 1.6352133044107013, "grad_norm": 0.0445989342769343, "learning_rate": 8.841012834287254e-07, "loss": 0.0015, "step": 4523 }, { "epoch": 1.6355748373101953, "grad_norm": 0.11181682167180852, "learning_rate": 8.824048390280676e-07, "loss": 0.0161, "step": 4524 }, { "epoch": 1.6359363702096892, "grad_norm": 0.0004152428756599964, "learning_rate": 8.807098662550523e-07, "loss": 0.0, "step": 4525 }, { "epoch": 1.6362979031091829, "grad_norm": 0.26960124556514087, "learning_rate": 8.790163657154633e-07, "loss": 0.0227, "step": 4526 }, { "epoch": 1.6366594360086768, "grad_norm": 0.17789786911854608, "learning_rate": 8.773243380145524e-07, "loss": 0.0227, "step": 4527 }, { "epoch": 1.6370209689081707, "grad_norm": 0.23363443582494134, "learning_rate": 8.756337837570544e-07, "loss": 0.0254, "step": 4528 }, { "epoch": 1.6373825018076644, "grad_norm": 0.0008389788389973434, "learning_rate": 8.739447035471721e-07, "loss": 0.0, "step": 4529 }, { "epoch": 1.6377440347071583, "grad_norm": 0.00022787155989045286, "learning_rate": 8.722570979885814e-07, "loss": 0.0, "step": 4530 }, { "epoch": 1.6381055676066523, "grad_norm": 0.0007431311498604104, "learning_rate": 8.705709676844331e-07, "loss": 0.0, "step": 4531 }, { "epoch": 1.638467100506146, "grad_norm": 0.08359826783586104, "learning_rate": 8.688863132373498e-07, "loss": 0.0143, "step": 4532 }, { "epoch": 1.63882863340564, "grad_norm": 0.750631061667475, "learning_rate": 8.672031352494259e-07, "loss": 0.0388, "step": 4533 }, { "epoch": 1.6391901663051338, "grad_norm": 0.28047470876296976, "learning_rate": 8.655214343222279e-07, "loss": 0.0315, "step": 4534 }, { "epoch": 1.6395516992046275, "grad_norm": 0.16254340942824352, "learning_rate": 8.638412110567984e-07, "loss": 0.0071, "step": 4535 }, { "epoch": 1.6399132321041214, "grad_norm": 1.0359373888026278, "learning_rate": 8.621624660536482e-07, "loss": 0.0977, "step": 4536 }, { "epoch": 1.6402747650036154, "grad_norm": 0.8994347204489422, "learning_rate": 8.604851999127567e-07, "loss": 0.1143, "step": 4537 }, { "epoch": 1.640636297903109, "grad_norm": 0.26799404885920847, "learning_rate": 8.588094132335828e-07, "loss": 0.0283, "step": 4538 }, { "epoch": 1.6409978308026032, "grad_norm": 0.12929744229896586, "learning_rate": 8.571351066150502e-07, "loss": 0.0161, "step": 4539 }, { "epoch": 1.641359363702097, "grad_norm": 0.2061385401189049, "learning_rate": 8.554622806555563e-07, "loss": 0.0071, "step": 4540 }, { "epoch": 1.6417208966015906, "grad_norm": 0.6139132006728267, "learning_rate": 8.53790935952969e-07, "loss": 0.1807, "step": 4541 }, { "epoch": 1.6420824295010847, "grad_norm": 0.3747828672431238, "learning_rate": 8.521210731046259e-07, "loss": 0.0182, "step": 4542 }, { "epoch": 1.6424439624005784, "grad_norm": 0.5510282469124328, "learning_rate": 8.504526927073359e-07, "loss": 0.1914, "step": 4543 }, { "epoch": 1.6428054953000724, "grad_norm": 0.13639443014684885, "learning_rate": 8.487857953573769e-07, "loss": 0.0181, "step": 4544 }, { "epoch": 1.6431670281995663, "grad_norm": 1.2326550493396922, "learning_rate": 8.471203816504991e-07, "loss": 0.1504, "step": 4545 }, { "epoch": 1.64352856109906, "grad_norm": 0.6654783955727265, "learning_rate": 8.454564521819226e-07, "loss": 0.0476, "step": 4546 }, { "epoch": 1.643890093998554, "grad_norm": 0.21685662534231176, "learning_rate": 8.437940075463303e-07, "loss": 0.0283, "step": 4547 }, { "epoch": 1.6442516268980478, "grad_norm": 0.2517430010282464, "learning_rate": 8.421330483378836e-07, "loss": 0.0254, "step": 4548 }, { "epoch": 1.6446131597975415, "grad_norm": 0.002488445989665015, "learning_rate": 8.404735751502086e-07, "loss": 0.0001, "step": 4549 }, { "epoch": 1.6449746926970354, "grad_norm": 0.19296127722942955, "learning_rate": 8.388155885763994e-07, "loss": 0.0161, "step": 4550 }, { "epoch": 1.6453362255965294, "grad_norm": 0.31432675102285207, "learning_rate": 8.371590892090209e-07, "loss": 0.0161, "step": 4551 }, { "epoch": 1.645697758496023, "grad_norm": 0.05668009743782658, "learning_rate": 8.355040776401058e-07, "loss": 0.0024, "step": 4552 }, { "epoch": 1.646059291395517, "grad_norm": 0.48665302404742966, "learning_rate": 8.338505544611536e-07, "loss": 0.0227, "step": 4553 }, { "epoch": 1.646420824295011, "grad_norm": 0.014199816225598495, "learning_rate": 8.321985202631333e-07, "loss": 0.0005, "step": 4554 }, { "epoch": 1.6467823571945046, "grad_norm": 0.14808046805290526, "learning_rate": 8.305479756364837e-07, "loss": 0.0227, "step": 4555 }, { "epoch": 1.6471438900939985, "grad_norm": 0.1881264488601294, "learning_rate": 8.28898921171109e-07, "loss": 0.0143, "step": 4556 }, { "epoch": 1.6475054229934925, "grad_norm": 1.0304619748886477, "learning_rate": 8.272513574563767e-07, "loss": 0.1143, "step": 4557 }, { "epoch": 1.6478669558929862, "grad_norm": 0.0002682672349507736, "learning_rate": 8.256052850811303e-07, "loss": 0.0, "step": 4558 }, { "epoch": 1.64822848879248, "grad_norm": 0.5166325743332373, "learning_rate": 8.239607046336734e-07, "loss": 0.0388, "step": 4559 }, { "epoch": 1.648590021691974, "grad_norm": 0.09142861550837007, "learning_rate": 8.223176167017799e-07, "loss": 0.0161, "step": 4560 }, { "epoch": 1.6489515545914677, "grad_norm": 0.2384104793259862, "learning_rate": 8.206760218726884e-07, "loss": 0.0227, "step": 4561 }, { "epoch": 1.6493130874909618, "grad_norm": 0.003612973992680509, "learning_rate": 8.190359207331045e-07, "loss": 0.0001, "step": 4562 }, { "epoch": 1.6496746203904555, "grad_norm": 0.900440379444634, "learning_rate": 8.173973138692004e-07, "loss": 0.1055, "step": 4563 }, { "epoch": 1.6500361532899492, "grad_norm": 0.2314611741087015, "learning_rate": 8.157602018666121e-07, "loss": 0.0227, "step": 4564 }, { "epoch": 1.6503976861894434, "grad_norm": 0.0019390483973704062, "learning_rate": 8.141245853104463e-07, "loss": 0.0001, "step": 4565 }, { "epoch": 1.650759219088937, "grad_norm": 1.1179060175183324, "learning_rate": 8.124904647852711e-07, "loss": 0.1143, "step": 4566 }, { "epoch": 1.651120751988431, "grad_norm": 0.14716860738630755, "learning_rate": 8.108578408751183e-07, "loss": 0.0161, "step": 4567 }, { "epoch": 1.651482284887925, "grad_norm": 0.0024151058414926072, "learning_rate": 8.092267141634897e-07, "loss": 0.0001, "step": 4568 }, { "epoch": 1.6518438177874186, "grad_norm": 0.0007166301721604379, "learning_rate": 8.075970852333492e-07, "loss": 0.0, "step": 4569 }, { "epoch": 1.6522053506869125, "grad_norm": 0.30619865863784534, "learning_rate": 8.059689546671256e-07, "loss": 0.0254, "step": 4570 }, { "epoch": 1.6525668835864065, "grad_norm": 0.16498284433268368, "learning_rate": 8.043423230467124e-07, "loss": 0.0161, "step": 4571 }, { "epoch": 1.6529284164859002, "grad_norm": 0.23479416421096644, "learning_rate": 8.027171909534676e-07, "loss": 0.0254, "step": 4572 }, { "epoch": 1.653289949385394, "grad_norm": 0.18207091267541642, "learning_rate": 8.010935589682134e-07, "loss": 0.0254, "step": 4573 }, { "epoch": 1.653651482284888, "grad_norm": 0.23774175739486214, "learning_rate": 7.994714276712334e-07, "loss": 0.0254, "step": 4574 }, { "epoch": 1.6540130151843817, "grad_norm": 2.071112171177674, "learning_rate": 7.9785079764228e-07, "loss": 0.1504, "step": 4575 }, { "epoch": 1.6543745480838756, "grad_norm": 0.13207615790082872, "learning_rate": 7.96231669460566e-07, "loss": 0.0056, "step": 4576 }, { "epoch": 1.6547360809833696, "grad_norm": 0.20577937181965544, "learning_rate": 7.946140437047634e-07, "loss": 0.0254, "step": 4577 }, { "epoch": 1.6550976138828633, "grad_norm": 0.1172420751834058, "learning_rate": 7.929979209530153e-07, "loss": 0.0181, "step": 4578 }, { "epoch": 1.6554591467823572, "grad_norm": 0.020042448732474384, "learning_rate": 7.913833017829231e-07, "loss": 0.0002, "step": 4579 }, { "epoch": 1.655820679681851, "grad_norm": 0.18382832359618007, "learning_rate": 7.897701867715501e-07, "loss": 0.0182, "step": 4580 }, { "epoch": 1.6561822125813448, "grad_norm": 0.7539141795919455, "learning_rate": 7.881585764954236e-07, "loss": 0.0388, "step": 4581 }, { "epoch": 1.6565437454808387, "grad_norm": 0.75242014604062, "learning_rate": 7.865484715305339e-07, "loss": 0.1807, "step": 4582 }, { "epoch": 1.6569052783803326, "grad_norm": 1.8053639167572906, "learning_rate": 7.849398724523305e-07, "loss": 0.0977, "step": 4583 }, { "epoch": 1.6572668112798263, "grad_norm": 0.5561201638606319, "learning_rate": 7.83332779835726e-07, "loss": 0.0254, "step": 4584 }, { "epoch": 1.6576283441793205, "grad_norm": 1.035074663918273, "learning_rate": 7.817271942550975e-07, "loss": 0.0635, "step": 4585 }, { "epoch": 1.6579898770788142, "grad_norm": 0.12301250922172276, "learning_rate": 7.801231162842804e-07, "loss": 0.0203, "step": 4586 }, { "epoch": 1.6583514099783079, "grad_norm": 0.6552470465800767, "learning_rate": 7.78520546496569e-07, "loss": 0.0579, "step": 4587 }, { "epoch": 1.658712942877802, "grad_norm": 0.2627986933732707, "learning_rate": 7.769194854647244e-07, "loss": 0.0315, "step": 4588 }, { "epoch": 1.6590744757772957, "grad_norm": 0.25343631715863213, "learning_rate": 7.753199337609646e-07, "loss": 0.0315, "step": 4589 }, { "epoch": 1.6594360086767896, "grad_norm": 0.021614945693003988, "learning_rate": 7.737218919569689e-07, "loss": 0.0008, "step": 4590 }, { "epoch": 1.6597975415762836, "grad_norm": 0.09855887005998763, "learning_rate": 7.721253606238759e-07, "loss": 0.0143, "step": 4591 }, { "epoch": 1.6601590744757773, "grad_norm": 0.20587819182301215, "learning_rate": 7.705303403322889e-07, "loss": 0.0203, "step": 4592 }, { "epoch": 1.6605206073752712, "grad_norm": 0.5632788255212933, "learning_rate": 7.689368316522644e-07, "loss": 0.0527, "step": 4593 }, { "epoch": 1.660882140274765, "grad_norm": 0.0362917848299007, "learning_rate": 7.673448351533224e-07, "loss": 0.0008, "step": 4594 }, { "epoch": 1.6612436731742588, "grad_norm": 0.2315874988983834, "learning_rate": 7.657543514044446e-07, "loss": 0.0254, "step": 4595 }, { "epoch": 1.6616052060737527, "grad_norm": 0.13861405203640031, "learning_rate": 7.641653809740679e-07, "loss": 0.0035, "step": 4596 }, { "epoch": 1.6619667389732466, "grad_norm": 0.1588310953601343, "learning_rate": 7.625779244300896e-07, "loss": 0.0227, "step": 4597 }, { "epoch": 1.6623282718727403, "grad_norm": 0.0007358336426408298, "learning_rate": 7.60991982339867e-07, "loss": 0.0, "step": 4598 }, { "epoch": 1.6626898047722343, "grad_norm": 0.1966896868647173, "learning_rate": 7.594075552702157e-07, "loss": 0.0203, "step": 4599 }, { "epoch": 1.6630513376717282, "grad_norm": 0.7998553772799422, "learning_rate": 7.578246437874087e-07, "loss": 0.1406, "step": 4600 }, { "epoch": 1.663412870571222, "grad_norm": 0.2271540679980039, "learning_rate": 7.56243248457178e-07, "loss": 0.0044, "step": 4601 }, { "epoch": 1.6637744034707158, "grad_norm": 0.5731327368210644, "learning_rate": 7.54663369844717e-07, "loss": 0.2012, "step": 4602 }, { "epoch": 1.6641359363702097, "grad_norm": 0.0172961851152889, "learning_rate": 7.530850085146701e-07, "loss": 0.0005, "step": 4603 }, { "epoch": 1.6644974692697034, "grad_norm": 1.420773049417447, "learning_rate": 7.51508165031144e-07, "loss": 0.1406, "step": 4604 }, { "epoch": 1.6648590021691974, "grad_norm": 0.9202681145979127, "learning_rate": 7.499328399577044e-07, "loss": 0.0635, "step": 4605 }, { "epoch": 1.6652205350686913, "grad_norm": 0.16296797052430956, "learning_rate": 7.483590338573709e-07, "loss": 0.0254, "step": 4606 }, { "epoch": 1.665582067968185, "grad_norm": 0.6450873931599623, "learning_rate": 7.467867472926211e-07, "loss": 0.0349, "step": 4607 }, { "epoch": 1.6659436008676791, "grad_norm": 0.12905960142165698, "learning_rate": 7.452159808253905e-07, "loss": 0.0161, "step": 4608 }, { "epoch": 1.6663051337671728, "grad_norm": 0.4589338623773756, "learning_rate": 7.4364673501707e-07, "loss": 0.0349, "step": 4609 }, { "epoch": 1.6666666666666665, "grad_norm": 0.19755737165037596, "learning_rate": 7.420790104285086e-07, "loss": 0.008, "step": 4610 }, { "epoch": 1.6670281995661607, "grad_norm": 0.3920299139530422, "learning_rate": 7.405128076200091e-07, "loss": 0.0349, "step": 4611 }, { "epoch": 1.6673897324656544, "grad_norm": 0.11909228510524915, "learning_rate": 7.389481271513343e-07, "loss": 0.0143, "step": 4612 }, { "epoch": 1.6677512653651483, "grad_norm": 0.18193346012733677, "learning_rate": 7.37384969581701e-07, "loss": 0.0254, "step": 4613 }, { "epoch": 1.6681127982646422, "grad_norm": 0.00016825521980682825, "learning_rate": 7.358233354697775e-07, "loss": 0.0, "step": 4614 }, { "epoch": 1.668474331164136, "grad_norm": 0.011882387383102287, "learning_rate": 7.342632253736947e-07, "loss": 0.0004, "step": 4615 }, { "epoch": 1.6688358640636298, "grad_norm": 0.8045537321918941, "learning_rate": 7.327046398510357e-07, "loss": 0.0203, "step": 4616 }, { "epoch": 1.6691973969631237, "grad_norm": 0.01268607133438399, "learning_rate": 7.311475794588374e-07, "loss": 0.0005, "step": 4617 }, { "epoch": 1.6695589298626174, "grad_norm": 0.6060017506913556, "learning_rate": 7.295920447535932e-07, "loss": 0.0527, "step": 4618 }, { "epoch": 1.6699204627621114, "grad_norm": 0.8369790822357085, "learning_rate": 7.280380362912515e-07, "loss": 0.1807, "step": 4619 }, { "epoch": 1.6702819956616053, "grad_norm": 0.6671237405827405, "learning_rate": 7.264855546272137e-07, "loss": 0.1699, "step": 4620 }, { "epoch": 1.670643528561099, "grad_norm": 0.14042054114140673, "learning_rate": 7.249346003163354e-07, "loss": 0.0181, "step": 4621 }, { "epoch": 1.671005061460593, "grad_norm": 0.11228528517586833, "learning_rate": 7.233851739129306e-07, "loss": 0.0161, "step": 4622 }, { "epoch": 1.6713665943600868, "grad_norm": 0.9385631383938827, "learning_rate": 7.218372759707626e-07, "loss": 0.083, "step": 4623 }, { "epoch": 1.6717281272595805, "grad_norm": 0.017516935510197063, "learning_rate": 7.202909070430469e-07, "loss": 0.0005, "step": 4624 }, { "epoch": 1.6720896601590745, "grad_norm": 0.26730716828243656, "learning_rate": 7.187460676824592e-07, "loss": 0.0114, "step": 4625 }, { "epoch": 1.6724511930585684, "grad_norm": 0.8001555283682246, "learning_rate": 7.17202758441124e-07, "loss": 0.1914, "step": 4626 }, { "epoch": 1.672812725958062, "grad_norm": 0.09411295646381673, "learning_rate": 7.156609798706183e-07, "loss": 0.0027, "step": 4627 }, { "epoch": 1.673174258857556, "grad_norm": 0.9623400352256244, "learning_rate": 7.141207325219745e-07, "loss": 0.0579, "step": 4628 }, { "epoch": 1.67353579175705, "grad_norm": 0.00041414399618388604, "learning_rate": 7.125820169456766e-07, "loss": 0.0, "step": 4629 }, { "epoch": 1.6738973246565436, "grad_norm": 0.12596360312055607, "learning_rate": 7.110448336916609e-07, "loss": 0.0037, "step": 4630 }, { "epoch": 1.6742588575560378, "grad_norm": 0.014503995656939795, "learning_rate": 7.095091833093154e-07, "loss": 0.0004, "step": 4631 }, { "epoch": 1.6746203904555315, "grad_norm": 0.29896908041568715, "learning_rate": 7.079750663474832e-07, "loss": 0.008, "step": 4632 }, { "epoch": 1.6749819233550252, "grad_norm": 1.054652750362952, "learning_rate": 7.064424833544581e-07, "loss": 0.0693, "step": 4633 }, { "epoch": 1.6753434562545193, "grad_norm": 0.6616620110395597, "learning_rate": 7.049114348779807e-07, "loss": 0.0432, "step": 4634 }, { "epoch": 1.675704989154013, "grad_norm": 0.11093220103497257, "learning_rate": 7.033819214652509e-07, "loss": 0.0143, "step": 4635 }, { "epoch": 1.676066522053507, "grad_norm": 0.12911398687684303, "learning_rate": 7.018539436629163e-07, "loss": 0.0203, "step": 4636 }, { "epoch": 1.6764280549530008, "grad_norm": 0.00032589131036207873, "learning_rate": 7.003275020170747e-07, "loss": 0.0, "step": 4637 }, { "epoch": 1.6767895878524945, "grad_norm": 2.046435355822985, "learning_rate": 6.988025970732765e-07, "loss": 0.293, "step": 4638 }, { "epoch": 1.6771511207519885, "grad_norm": 0.023860292827064592, "learning_rate": 6.972792293765229e-07, "loss": 0.0008, "step": 4639 }, { "epoch": 1.6775126536514824, "grad_norm": 0.2007417033717869, "learning_rate": 6.957573994712641e-07, "loss": 0.0203, "step": 4640 }, { "epoch": 1.677874186550976, "grad_norm": 0.1157861379571065, "learning_rate": 6.942371079014015e-07, "loss": 0.0181, "step": 4641 }, { "epoch": 1.67823571945047, "grad_norm": 0.11348368261141085, "learning_rate": 6.92718355210289e-07, "loss": 0.0181, "step": 4642 }, { "epoch": 1.678597252349964, "grad_norm": 0.2179241174183853, "learning_rate": 6.912011419407283e-07, "loss": 0.0315, "step": 4643 }, { "epoch": 1.6789587852494576, "grad_norm": 0.8002748201790836, "learning_rate": 6.89685468634968e-07, "loss": 0.0283, "step": 4644 }, { "epoch": 1.6793203181489516, "grad_norm": 0.7174875908454177, "learning_rate": 6.881713358347126e-07, "loss": 0.1807, "step": 4645 }, { "epoch": 1.6796818510484455, "grad_norm": 0.04642261884706546, "learning_rate": 6.866587440811117e-07, "loss": 0.0019, "step": 4646 }, { "epoch": 1.6800433839479392, "grad_norm": 0.14409711394682417, "learning_rate": 6.851476939147655e-07, "loss": 0.0203, "step": 4647 }, { "epoch": 1.680404916847433, "grad_norm": 1.0755974341499552, "learning_rate": 6.836381858757229e-07, "loss": 0.083, "step": 4648 }, { "epoch": 1.680766449746927, "grad_norm": 0.21613463461216043, "learning_rate": 6.821302205034819e-07, "loss": 0.0254, "step": 4649 }, { "epoch": 1.6811279826464207, "grad_norm": 0.9105926233672637, "learning_rate": 6.806237983369885e-07, "loss": 0.0432, "step": 4650 }, { "epoch": 1.6814895155459146, "grad_norm": 0.10825434665216441, "learning_rate": 6.79118919914637e-07, "loss": 0.0181, "step": 4651 }, { "epoch": 1.6818510484454086, "grad_norm": 0.0004632725791906342, "learning_rate": 6.776155857742733e-07, "loss": 0.0, "step": 4652 }, { "epoch": 1.6822125813449023, "grad_norm": 0.21669466713839625, "learning_rate": 6.761137964531878e-07, "loss": 0.0254, "step": 4653 }, { "epoch": 1.6825741142443964, "grad_norm": 0.18940041630109244, "learning_rate": 6.746135524881164e-07, "loss": 0.0181, "step": 4654 }, { "epoch": 1.68293564714389, "grad_norm": 0.4725877048952268, "learning_rate": 6.731148544152499e-07, "loss": 0.0129, "step": 4655 }, { "epoch": 1.6832971800433838, "grad_norm": 0.00029792585816636056, "learning_rate": 6.716177027702215e-07, "loss": 0.0, "step": 4656 }, { "epoch": 1.683658712942878, "grad_norm": 1.367793307524584, "learning_rate": 6.701220980881123e-07, "loss": 0.0522, "step": 4657 }, { "epoch": 1.6840202458423716, "grad_norm": 0.0017717835423807745, "learning_rate": 6.686280409034501e-07, "loss": 0.0001, "step": 4658 }, { "epoch": 1.6843817787418656, "grad_norm": 0.2016598759812576, "learning_rate": 6.671355317502148e-07, "loss": 0.0227, "step": 4659 }, { "epoch": 1.6847433116413595, "grad_norm": 1.9534235615370137, "learning_rate": 6.656445711618248e-07, "loss": 0.0129, "step": 4660 }, { "epoch": 1.6851048445408532, "grad_norm": 0.14800803033349175, "learning_rate": 6.641551596711493e-07, "loss": 0.0227, "step": 4661 }, { "epoch": 1.685466377440347, "grad_norm": 0.2910960947100605, "learning_rate": 6.626672978105053e-07, "loss": 0.0315, "step": 4662 }, { "epoch": 1.685827910339841, "grad_norm": 0.0018227957883913946, "learning_rate": 6.611809861116542e-07, "loss": 0.0001, "step": 4663 }, { "epoch": 1.6861894432393347, "grad_norm": 0.07068438626605784, "learning_rate": 6.596962251058031e-07, "loss": 0.0019, "step": 4664 }, { "epoch": 1.6865509761388287, "grad_norm": 0.0004686558146831168, "learning_rate": 6.582130153236049e-07, "loss": 0.0, "step": 4665 }, { "epoch": 1.6869125090383226, "grad_norm": 0.6149765977596082, "learning_rate": 6.567313572951589e-07, "loss": 0.0635, "step": 4666 }, { "epoch": 1.6872740419378163, "grad_norm": 0.8640418326838921, "learning_rate": 6.552512515500093e-07, "loss": 0.0579, "step": 4667 }, { "epoch": 1.6876355748373102, "grad_norm": 0.00019854494535880178, "learning_rate": 6.537726986171439e-07, "loss": 0.0, "step": 4668 }, { "epoch": 1.6879971077368041, "grad_norm": 0.03196440101736428, "learning_rate": 6.522956990250012e-07, "loss": 0.0013, "step": 4669 }, { "epoch": 1.6883586406362978, "grad_norm": 0.00656292362727253, "learning_rate": 6.508202533014574e-07, "loss": 0.0002, "step": 4670 }, { "epoch": 1.6887201735357917, "grad_norm": 0.18718229607426923, "learning_rate": 6.493463619738355e-07, "loss": 0.0254, "step": 4671 }, { "epoch": 1.6890817064352857, "grad_norm": 0.11301562820280191, "learning_rate": 6.47874025568907e-07, "loss": 0.0181, "step": 4672 }, { "epoch": 1.6894432393347794, "grad_norm": 0.2275907550211229, "learning_rate": 6.464032446128837e-07, "loss": 0.0181, "step": 4673 }, { "epoch": 1.6898047722342733, "grad_norm": 1.0401380609227364, "learning_rate": 6.449340196314214e-07, "loss": 0.0903, "step": 4674 }, { "epoch": 1.6901663051337672, "grad_norm": 0.00157637678007997, "learning_rate": 6.43466351149622e-07, "loss": 0.0001, "step": 4675 }, { "epoch": 1.690527838033261, "grad_norm": 0.1519568288719225, "learning_rate": 6.420002396920289e-07, "loss": 0.0227, "step": 4676 }, { "epoch": 1.690889370932755, "grad_norm": 0.017349846366507268, "learning_rate": 6.405356857826306e-07, "loss": 0.0007, "step": 4677 }, { "epoch": 1.6912509038322487, "grad_norm": 0.00024993767283089873, "learning_rate": 6.390726899448569e-07, "loss": 0.0, "step": 4678 }, { "epoch": 1.6916124367317424, "grad_norm": 0.521150454349694, "learning_rate": 6.37611252701586e-07, "loss": 0.0432, "step": 4679 }, { "epoch": 1.6919739696312366, "grad_norm": 0.06064305999232256, "learning_rate": 6.361513745751313e-07, "loss": 0.0017, "step": 4680 }, { "epoch": 1.6923355025307303, "grad_norm": 0.0045115157056043135, "learning_rate": 6.346930560872533e-07, "loss": 0.0002, "step": 4681 }, { "epoch": 1.692697035430224, "grad_norm": 0.009288982233118102, "learning_rate": 6.332362977591572e-07, "loss": 0.0001, "step": 4682 }, { "epoch": 1.6930585683297181, "grad_norm": 0.0003961691800680487, "learning_rate": 6.317811001114866e-07, "loss": 0.0, "step": 4683 }, { "epoch": 1.6934201012292118, "grad_norm": 0.00353123886067803, "learning_rate": 6.303274636643286e-07, "loss": 0.0001, "step": 4684 }, { "epoch": 1.6937816341287057, "grad_norm": 0.2063686136691257, "learning_rate": 6.288753889372128e-07, "loss": 0.0254, "step": 4685 }, { "epoch": 1.6941431670281997, "grad_norm": 0.006848392670429646, "learning_rate": 6.274248764491114e-07, "loss": 0.0002, "step": 4686 }, { "epoch": 1.6945046999276934, "grad_norm": 0.10311157657174101, "learning_rate": 6.259759267184357e-07, "loss": 0.0143, "step": 4687 }, { "epoch": 1.6948662328271873, "grad_norm": 0.0010671873558334133, "learning_rate": 6.245285402630396e-07, "loss": 0.0, "step": 4688 }, { "epoch": 1.6952277657266812, "grad_norm": 0.5520637087439146, "learning_rate": 6.230827176002225e-07, "loss": 0.1807, "step": 4689 }, { "epoch": 1.695589298626175, "grad_norm": 0.41011914695972035, "learning_rate": 6.216384592467167e-07, "loss": 0.0203, "step": 4690 }, { "epoch": 1.6959508315256688, "grad_norm": 0.13998388562036618, "learning_rate": 6.201957657187008e-07, "loss": 0.0203, "step": 4691 }, { "epoch": 1.6963123644251628, "grad_norm": 0.23763394223613873, "learning_rate": 6.187546375317954e-07, "loss": 0.0254, "step": 4692 }, { "epoch": 1.6966738973246565, "grad_norm": 0.3661160381823922, "learning_rate": 6.173150752010571e-07, "loss": 0.0317, "step": 4693 }, { "epoch": 1.6970354302241504, "grad_norm": 0.8700595647740883, "learning_rate": 6.158770792409869e-07, "loss": 0.0977, "step": 4694 }, { "epoch": 1.6973969631236443, "grad_norm": 0.00031246499500069076, "learning_rate": 6.144406501655226e-07, "loss": 0.0, "step": 4695 }, { "epoch": 1.697758496023138, "grad_norm": 1.1118522311879944, "learning_rate": 6.130057884880447e-07, "loss": 0.1226, "step": 4696 }, { "epoch": 1.698120028922632, "grad_norm": 0.04636542769432533, "learning_rate": 6.11572494721372e-07, "loss": 0.0019, "step": 4697 }, { "epoch": 1.6984815618221258, "grad_norm": 2.8030025448126037, "learning_rate": 6.101407693777628e-07, "loss": 0.0693, "step": 4698 }, { "epoch": 1.6988430947216195, "grad_norm": 0.004942021020338567, "learning_rate": 6.08710612968918e-07, "loss": 0.0001, "step": 4699 }, { "epoch": 1.6992046276211137, "grad_norm": 0.6523635308799999, "learning_rate": 6.072820260059725e-07, "loss": 0.0388, "step": 4700 }, { "epoch": 1.6995661605206074, "grad_norm": 0.0010864581918175204, "learning_rate": 6.05855008999503e-07, "loss": 0.0, "step": 4701 }, { "epoch": 1.699927693420101, "grad_norm": 0.3370241202593477, "learning_rate": 6.04429562459527e-07, "loss": 0.0145, "step": 4702 }, { "epoch": 1.7002892263195952, "grad_norm": 0.0705774545314724, "learning_rate": 6.030056868954975e-07, "loss": 0.0027, "step": 4703 }, { "epoch": 1.700650759219089, "grad_norm": 0.102466284556992, "learning_rate": 6.015833828163075e-07, "loss": 0.0161, "step": 4704 }, { "epoch": 1.7010122921185826, "grad_norm": 0.023622942666771184, "learning_rate": 6.001626507302882e-07, "loss": 0.0009, "step": 4705 }, { "epoch": 1.7013738250180768, "grad_norm": 0.9476074488783784, "learning_rate": 5.987434911452095e-07, "loss": 0.0977, "step": 4706 }, { "epoch": 1.7017353579175705, "grad_norm": 0.5477600190368705, "learning_rate": 5.973259045682777e-07, "loss": 0.0227, "step": 4707 }, { "epoch": 1.7020968908170644, "grad_norm": 0.2944642792694706, "learning_rate": 5.959098915061373e-07, "loss": 0.008, "step": 4708 }, { "epoch": 1.7024584237165583, "grad_norm": 0.007098353733579652, "learning_rate": 5.944954524648738e-07, "loss": 0.0003, "step": 4709 }, { "epoch": 1.702819956616052, "grad_norm": 0.042414776755863304, "learning_rate": 5.930825879500069e-07, "loss": 0.0015, "step": 4710 }, { "epoch": 1.703181489515546, "grad_norm": 0.05333403959048125, "learning_rate": 5.916712984664902e-07, "loss": 0.0024, "step": 4711 }, { "epoch": 1.7035430224150399, "grad_norm": 0.0006844669532216486, "learning_rate": 5.902615845187226e-07, "loss": 0.0, "step": 4712 }, { "epoch": 1.7039045553145336, "grad_norm": 0.6743245293736396, "learning_rate": 5.888534466105345e-07, "loss": 0.0476, "step": 4713 }, { "epoch": 1.7042660882140275, "grad_norm": 0.021421432392530148, "learning_rate": 5.874468852451931e-07, "loss": 0.0012, "step": 4714 }, { "epoch": 1.7046276211135214, "grad_norm": 0.14649318987779836, "learning_rate": 5.860419009254036e-07, "loss": 0.0181, "step": 4715 }, { "epoch": 1.704989154013015, "grad_norm": 0.11391742419512228, "learning_rate": 5.846384941533078e-07, "loss": 0.0143, "step": 4716 }, { "epoch": 1.705350686912509, "grad_norm": 0.00033805047216653667, "learning_rate": 5.832366654304822e-07, "loss": 0.0, "step": 4717 }, { "epoch": 1.705712219812003, "grad_norm": 0.03701997437986328, "learning_rate": 5.818364152579386e-07, "loss": 0.0013, "step": 4718 }, { "epoch": 1.7060737527114966, "grad_norm": 0.38075400926588493, "learning_rate": 5.804377441361297e-07, "loss": 0.0129, "step": 4719 }, { "epoch": 1.7064352856109906, "grad_norm": 0.118497875766744, "learning_rate": 5.79040652564939e-07, "loss": 0.0161, "step": 4720 }, { "epoch": 1.7067968185104845, "grad_norm": 0.7053854878593969, "learning_rate": 5.77645141043684e-07, "loss": 0.0476, "step": 4721 }, { "epoch": 1.7071583514099782, "grad_norm": 1.3949110706666425, "learning_rate": 5.762512100711232e-07, "loss": 0.1699, "step": 4722 }, { "epoch": 1.7075198843094723, "grad_norm": 0.10793027181281853, "learning_rate": 5.748588601454463e-07, "loss": 0.0027, "step": 4723 }, { "epoch": 1.707881417208966, "grad_norm": 0.15311670832222612, "learning_rate": 5.734680917642788e-07, "loss": 0.0227, "step": 4724 }, { "epoch": 1.7082429501084597, "grad_norm": 0.11996417887050688, "learning_rate": 5.720789054246817e-07, "loss": 0.0161, "step": 4725 }, { "epoch": 1.7086044830079539, "grad_norm": 0.01763357362293054, "learning_rate": 5.706913016231486e-07, "loss": 0.0006, "step": 4726 }, { "epoch": 1.7089660159074476, "grad_norm": 0.00045309076672603695, "learning_rate": 5.693052808556099e-07, "loss": 0.0, "step": 4727 }, { "epoch": 1.7093275488069413, "grad_norm": 0.0005913045644384376, "learning_rate": 5.679208436174283e-07, "loss": 0.0, "step": 4728 }, { "epoch": 1.7096890817064354, "grad_norm": 0.33257433416452775, "learning_rate": 5.665379904034018e-07, "loss": 0.0254, "step": 4729 }, { "epoch": 1.710050614605929, "grad_norm": 0.00026956091245837493, "learning_rate": 5.651567217077624e-07, "loss": 0.0, "step": 4730 }, { "epoch": 1.710412147505423, "grad_norm": 0.7202402734003733, "learning_rate": 5.637770380241753e-07, "loss": 0.1699, "step": 4731 }, { "epoch": 1.710773680404917, "grad_norm": 0.2858633449691998, "learning_rate": 5.62398939845738e-07, "loss": 0.0254, "step": 4732 }, { "epoch": 1.7111352133044107, "grad_norm": 0.4403032731365512, "learning_rate": 5.610224276649828e-07, "loss": 0.0388, "step": 4733 }, { "epoch": 1.7114967462039046, "grad_norm": 0.16094476526764254, "learning_rate": 5.596475019738757e-07, "loss": 0.0203, "step": 4734 }, { "epoch": 1.7118582791033985, "grad_norm": 0.26540396200409394, "learning_rate": 5.582741632638134e-07, "loss": 0.0315, "step": 4735 }, { "epoch": 1.7122198120028922, "grad_norm": 0.5503184404760711, "learning_rate": 5.569024120256294e-07, "loss": 0.043, "step": 4736 }, { "epoch": 1.7125813449023861, "grad_norm": 0.029962539296641222, "learning_rate": 5.555322487495846e-07, "loss": 0.001, "step": 4737 }, { "epoch": 1.71294287780188, "grad_norm": 0.23804138944564898, "learning_rate": 5.541636739253753e-07, "loss": 0.0283, "step": 4738 }, { "epoch": 1.7133044107013737, "grad_norm": 0.7245399644701411, "learning_rate": 5.527966880421315e-07, "loss": 0.1602, "step": 4739 }, { "epoch": 1.7136659436008677, "grad_norm": 0.3052099988945735, "learning_rate": 5.514312915884135e-07, "loss": 0.0315, "step": 4740 }, { "epoch": 1.7140274765003616, "grad_norm": 0.14002449707495698, "learning_rate": 5.500674850522125e-07, "loss": 0.0181, "step": 4741 }, { "epoch": 1.7143890093998553, "grad_norm": 1.138777617982391, "learning_rate": 5.487052689209532e-07, "loss": 0.0635, "step": 4742 }, { "epoch": 1.7147505422993492, "grad_norm": 0.3869408813688376, "learning_rate": 5.473446436814917e-07, "loss": 0.0315, "step": 4743 }, { "epoch": 1.7151120751988431, "grad_norm": 0.14826591074853215, "learning_rate": 5.459856098201144e-07, "loss": 0.0203, "step": 4744 }, { "epoch": 1.7154736080983368, "grad_norm": 0.5371434450803664, "learning_rate": 5.446281678225396e-07, "loss": 0.0203, "step": 4745 }, { "epoch": 1.715835140997831, "grad_norm": 0.0013753356857329655, "learning_rate": 5.432723181739192e-07, "loss": 0.0001, "step": 4746 }, { "epoch": 1.7161966738973247, "grad_norm": 0.5699533333627332, "learning_rate": 5.419180613588309e-07, "loss": 0.1406, "step": 4747 }, { "epoch": 1.7165582067968184, "grad_norm": 0.20687353807834977, "learning_rate": 5.405653978612857e-07, "loss": 0.0203, "step": 4748 }, { "epoch": 1.7169197396963125, "grad_norm": 0.15271378134967753, "learning_rate": 5.392143281647267e-07, "loss": 0.0181, "step": 4749 }, { "epoch": 1.7172812725958062, "grad_norm": 5.173019557692322, "learning_rate": 5.378648527520259e-07, "loss": 0.0635, "step": 4750 }, { "epoch": 1.7176428054953, "grad_norm": 0.5373715490563863, "learning_rate": 5.365169721054847e-07, "loss": 0.0432, "step": 4751 }, { "epoch": 1.718004338394794, "grad_norm": 0.1778905645879472, "learning_rate": 5.351706867068357e-07, "loss": 0.0161, "step": 4752 }, { "epoch": 1.7183658712942878, "grad_norm": 1.1265343770405478, "learning_rate": 5.338259970372411e-07, "loss": 0.0317, "step": 4753 }, { "epoch": 1.7187274041937817, "grad_norm": 0.0004709687441594774, "learning_rate": 5.324829035772927e-07, "loss": 0.0, "step": 4754 }, { "epoch": 1.7190889370932756, "grad_norm": 0.12636360168634758, "learning_rate": 5.311414068070109e-07, "loss": 0.0181, "step": 4755 }, { "epoch": 1.7194504699927693, "grad_norm": 0.7377245296409163, "learning_rate": 5.298015072058493e-07, "loss": 0.0432, "step": 4756 }, { "epoch": 1.7198120028922632, "grad_norm": 0.000638211769953464, "learning_rate": 5.284632052526839e-07, "loss": 0.0, "step": 4757 }, { "epoch": 1.7201735357917571, "grad_norm": 0.15295343748810314, "learning_rate": 5.271265014258247e-07, "loss": 0.0143, "step": 4758 }, { "epoch": 1.7205350686912508, "grad_norm": 0.01926759324415325, "learning_rate": 5.257913962030115e-07, "loss": 0.0009, "step": 4759 }, { "epoch": 1.7208966015907448, "grad_norm": 0.20526603975386842, "learning_rate": 5.244578900614083e-07, "loss": 0.0254, "step": 4760 }, { "epoch": 1.7212581344902387, "grad_norm": 0.29812881422694426, "learning_rate": 5.231259834776109e-07, "loss": 0.0101, "step": 4761 }, { "epoch": 1.7216196673897324, "grad_norm": 0.10223987533382423, "learning_rate": 5.217956769276422e-07, "loss": 0.0161, "step": 4762 }, { "epoch": 1.7219812002892263, "grad_norm": 0.18157934268949028, "learning_rate": 5.204669708869537e-07, "loss": 0.0044, "step": 4763 }, { "epoch": 1.7223427331887202, "grad_norm": 0.0018853538844636891, "learning_rate": 5.191398658304242e-07, "loss": 0.0001, "step": 4764 }, { "epoch": 1.722704266088214, "grad_norm": 0.01846182370163252, "learning_rate": 5.1781436223236e-07, "loss": 0.0007, "step": 4765 }, { "epoch": 1.7230657989877078, "grad_norm": 0.7250963696035109, "learning_rate": 5.164904605664989e-07, "loss": 0.1602, "step": 4766 }, { "epoch": 1.7234273318872018, "grad_norm": 0.9902691222620261, "learning_rate": 5.15168161306e-07, "loss": 0.1055, "step": 4767 }, { "epoch": 1.7237888647866955, "grad_norm": 0.05441237171958295, "learning_rate": 5.138474649234527e-07, "loss": 0.0013, "step": 4768 }, { "epoch": 1.7241503976861896, "grad_norm": 1.9501614802315013, "learning_rate": 5.125283718908758e-07, "loss": 0.1226, "step": 4769 }, { "epoch": 1.7245119305856833, "grad_norm": 0.16663968688408137, "learning_rate": 5.112108826797118e-07, "loss": 0.0227, "step": 4770 }, { "epoch": 1.724873463485177, "grad_norm": 0.3605133662356331, "learning_rate": 5.098949977608309e-07, "loss": 0.0315, "step": 4771 }, { "epoch": 1.7252349963846711, "grad_norm": 0.0037219585486764455, "learning_rate": 5.085807176045299e-07, "loss": 0.0001, "step": 4772 }, { "epoch": 1.7255965292841648, "grad_norm": 1.0062898248889485, "learning_rate": 5.072680426805332e-07, "loss": 0.0283, "step": 4773 }, { "epoch": 1.7259580621836585, "grad_norm": 0.00216383769017251, "learning_rate": 5.059569734579894e-07, "loss": 0.0001, "step": 4774 }, { "epoch": 1.7263195950831527, "grad_norm": 0.00022229413466504235, "learning_rate": 5.046475104054738e-07, "loss": 0.0, "step": 4775 }, { "epoch": 1.7266811279826464, "grad_norm": 0.1282456104438317, "learning_rate": 5.033396539909919e-07, "loss": 0.0181, "step": 4776 }, { "epoch": 1.7270426608821403, "grad_norm": 0.39738745399932623, "learning_rate": 5.020334046819669e-07, "loss": 0.0432, "step": 4777 }, { "epoch": 1.7274041937816342, "grad_norm": 0.12448576823572441, "learning_rate": 5.007287629452534e-07, "loss": 0.0161, "step": 4778 }, { "epoch": 1.727765726681128, "grad_norm": 0.6144458082667931, "learning_rate": 4.994257292471316e-07, "loss": 0.0476, "step": 4779 }, { "epoch": 1.7281272595806219, "grad_norm": 0.31544055981233926, "learning_rate": 4.98124304053304e-07, "loss": 0.0283, "step": 4780 }, { "epoch": 1.7284887924801158, "grad_norm": 0.18189279660619295, "learning_rate": 4.968244878289002e-07, "loss": 0.0227, "step": 4781 }, { "epoch": 1.7288503253796095, "grad_norm": 0.6874775639945684, "learning_rate": 4.955262810384742e-07, "loss": 0.0476, "step": 4782 }, { "epoch": 1.7292118582791034, "grad_norm": 0.2178684891847086, "learning_rate": 4.942296841460043e-07, "loss": 0.0254, "step": 4783 }, { "epoch": 1.7295733911785973, "grad_norm": 1.100024829825646, "learning_rate": 4.929346976148946e-07, "loss": 0.1055, "step": 4784 }, { "epoch": 1.729934924078091, "grad_norm": 0.26620323301165727, "learning_rate": 4.916413219079725e-07, "loss": 0.0227, "step": 4785 }, { "epoch": 1.730296456977585, "grad_norm": 0.38031237041811444, "learning_rate": 4.903495574874917e-07, "loss": 0.0283, "step": 4786 }, { "epoch": 1.7306579898770789, "grad_norm": 0.002070498585060531, "learning_rate": 4.890594048151265e-07, "loss": 0.0001, "step": 4787 }, { "epoch": 1.7310195227765726, "grad_norm": 0.0036878147750839057, "learning_rate": 4.877708643519768e-07, "loss": 0.0001, "step": 4788 }, { "epoch": 1.7313810556760665, "grad_norm": 0.21991764155881438, "learning_rate": 4.864839365585694e-07, "loss": 0.009, "step": 4789 }, { "epoch": 1.7317425885755604, "grad_norm": 0.9915854065249847, "learning_rate": 4.851986218948501e-07, "loss": 0.0693, "step": 4790 }, { "epoch": 1.732104121475054, "grad_norm": 0.23755516982983163, "learning_rate": 4.83914920820191e-07, "loss": 0.0203, "step": 4791 }, { "epoch": 1.7324656543745482, "grad_norm": 0.595261732000991, "learning_rate": 4.826328337933861e-07, "loss": 0.0432, "step": 4792 }, { "epoch": 1.732827187274042, "grad_norm": 0.4408713072349342, "learning_rate": 4.813523612726529e-07, "loss": 0.0388, "step": 4793 }, { "epoch": 1.7331887201735356, "grad_norm": 0.8180476509161744, "learning_rate": 4.800735037156318e-07, "loss": 0.0432, "step": 4794 }, { "epoch": 1.7335502530730298, "grad_norm": 0.8582124884844976, "learning_rate": 4.787962615793862e-07, "loss": 0.1143, "step": 4795 }, { "epoch": 1.7339117859725235, "grad_norm": 0.3805671171262701, "learning_rate": 4.775206353204043e-07, "loss": 0.0315, "step": 4796 }, { "epoch": 1.7342733188720172, "grad_norm": 0.13258650667623817, "learning_rate": 4.7624662539459187e-07, "loss": 0.0128, "step": 4797 }, { "epoch": 1.7346348517715113, "grad_norm": 0.16791664016928884, "learning_rate": 4.7497423225727977e-07, "loss": 0.0063, "step": 4798 }, { "epoch": 1.734996384671005, "grad_norm": 0.1187733238356468, "learning_rate": 4.7370345636322314e-07, "loss": 0.0056, "step": 4799 }, { "epoch": 1.735357917570499, "grad_norm": 0.03250418390745437, "learning_rate": 4.724342981665958e-07, "loss": 0.0009, "step": 4800 }, { "epoch": 1.7357194504699929, "grad_norm": 0.7256351917383497, "learning_rate": 4.71166758120995e-07, "loss": 0.0388, "step": 4801 }, { "epoch": 1.7360809833694866, "grad_norm": 0.20152119957660197, "learning_rate": 4.6990083667943833e-07, "loss": 0.0203, "step": 4802 }, { "epoch": 1.7364425162689805, "grad_norm": 0.3073346570302057, "learning_rate": 4.6863653429436827e-07, "loss": 0.0349, "step": 4803 }, { "epoch": 1.7368040491684744, "grad_norm": 0.18490895785451844, "learning_rate": 4.67373851417644e-07, "loss": 0.0203, "step": 4804 }, { "epoch": 1.7371655820679681, "grad_norm": 0.2508186930394989, "learning_rate": 4.6611278850054733e-07, "loss": 0.0283, "step": 4805 }, { "epoch": 1.737527114967462, "grad_norm": 1.701475520468137, "learning_rate": 4.648533459937843e-07, "loss": 0.083, "step": 4806 }, { "epoch": 1.737888647866956, "grad_norm": 0.002013143698053956, "learning_rate": 4.6359552434747823e-07, "loss": 0.0001, "step": 4807 }, { "epoch": 1.7382501807664497, "grad_norm": 1.2215543858729276, "learning_rate": 4.62339324011174e-07, "loss": 0.1226, "step": 4808 }, { "epoch": 1.7386117136659436, "grad_norm": 0.2123001642527912, "learning_rate": 4.6108474543383853e-07, "loss": 0.0227, "step": 4809 }, { "epoch": 1.7389732465654375, "grad_norm": 0.3564575493514232, "learning_rate": 4.5983178906385653e-07, "loss": 0.0162, "step": 4810 }, { "epoch": 1.7393347794649312, "grad_norm": 0.00026406277305784795, "learning_rate": 4.5858045534903526e-07, "loss": 0.0, "step": 4811 }, { "epoch": 1.7396963123644251, "grad_norm": 0.9549017467957739, "learning_rate": 4.573307447365999e-07, "loss": 0.0693, "step": 4812 }, { "epoch": 1.740057845263919, "grad_norm": 0.45767376150810146, "learning_rate": 4.560826576731997e-07, "loss": 0.0349, "step": 4813 }, { "epoch": 1.7404193781634127, "grad_norm": 0.042626096350793945, "learning_rate": 4.548361946048974e-07, "loss": 0.0012, "step": 4814 }, { "epoch": 1.7407809110629069, "grad_norm": 0.00047350483282293656, "learning_rate": 4.535913559771793e-07, "loss": 0.0, "step": 4815 }, { "epoch": 1.7411424439624006, "grad_norm": 0.47804574728268934, "learning_rate": 4.5234814223495183e-07, "loss": 0.0476, "step": 4816 }, { "epoch": 1.7415039768618943, "grad_norm": 0.2388888875671315, "learning_rate": 4.511065538225384e-07, "loss": 0.0227, "step": 4817 }, { "epoch": 1.7418655097613884, "grad_norm": 0.19559630339303286, "learning_rate": 4.49866591183683e-07, "loss": 0.0254, "step": 4818 }, { "epoch": 1.7422270426608821, "grad_norm": 0.30011716358965035, "learning_rate": 4.486282547615478e-07, "loss": 0.0203, "step": 4819 }, { "epoch": 1.7425885755603758, "grad_norm": 0.409517920485271, "learning_rate": 4.4739154499871376e-07, "loss": 0.0388, "step": 4820 }, { "epoch": 1.74295010845987, "grad_norm": 0.1678425545422849, "learning_rate": 4.4615646233718057e-07, "loss": 0.0227, "step": 4821 }, { "epoch": 1.7433116413593637, "grad_norm": 0.00034488983602245344, "learning_rate": 4.449230072183658e-07, "loss": 0.0, "step": 4822 }, { "epoch": 1.7436731742588576, "grad_norm": 0.1337866198882219, "learning_rate": 4.436911800831084e-07, "loss": 0.0143, "step": 4823 }, { "epoch": 1.7440347071583515, "grad_norm": 1.0671381922147372, "learning_rate": 4.424609813716613e-07, "loss": 0.0977, "step": 4824 }, { "epoch": 1.7443962400578452, "grad_norm": 0.21919327081942183, "learning_rate": 4.4123241152369654e-07, "loss": 0.0254, "step": 4825 }, { "epoch": 1.7447577729573391, "grad_norm": 0.5511889637915671, "learning_rate": 4.400054709783069e-07, "loss": 0.1807, "step": 4826 }, { "epoch": 1.745119305856833, "grad_norm": 0.08827092279286045, "learning_rate": 4.387801601739994e-07, "loss": 0.0128, "step": 4827 }, { "epoch": 1.7454808387563268, "grad_norm": 0.13333411755271082, "learning_rate": 4.3755647954870026e-07, "loss": 0.0039, "step": 4828 }, { "epoch": 1.7458423716558207, "grad_norm": 0.18835586867057322, "learning_rate": 4.36334429539752e-07, "loss": 0.0203, "step": 4829 }, { "epoch": 1.7462039045553146, "grad_norm": 0.6571171394027955, "learning_rate": 4.351140105839158e-07, "loss": 0.1699, "step": 4830 }, { "epoch": 1.7465654374548083, "grad_norm": 0.004724873032154782, "learning_rate": 4.338952231173693e-07, "loss": 0.0, "step": 4831 }, { "epoch": 1.7469269703543022, "grad_norm": 0.34148095376506005, "learning_rate": 4.3267806757570475e-07, "loss": 0.0129, "step": 4832 }, { "epoch": 1.7472885032537961, "grad_norm": 0.2565525296036126, "learning_rate": 4.3146254439393744e-07, "loss": 0.0283, "step": 4833 }, { "epoch": 1.7476500361532898, "grad_norm": 0.10743323227838367, "learning_rate": 4.3024865400649094e-07, "loss": 0.0161, "step": 4834 }, { "epoch": 1.7480115690527838, "grad_norm": 0.7678080372759774, "learning_rate": 4.2903639684721113e-07, "loss": 0.1504, "step": 4835 }, { "epoch": 1.7483731019522777, "grad_norm": 1.7810381976276564, "learning_rate": 4.278257733493585e-07, "loss": 0.0693, "step": 4836 }, { "epoch": 1.7487346348517714, "grad_norm": 0.3250128230029461, "learning_rate": 4.2661678394561035e-07, "loss": 0.0283, "step": 4837 }, { "epoch": 1.7490961677512655, "grad_norm": 0.0016246651737684811, "learning_rate": 4.254094290680577e-07, "loss": 0.0, "step": 4838 }, { "epoch": 1.7494577006507592, "grad_norm": 0.13974821621688585, "learning_rate": 4.242037091482104e-07, "loss": 0.0031, "step": 4839 }, { "epoch": 1.749819233550253, "grad_norm": 0.4075612981563391, "learning_rate": 4.2299962461699095e-07, "loss": 0.0162, "step": 4840 }, { "epoch": 1.750180766449747, "grad_norm": 0.1115199497180114, "learning_rate": 4.2179717590474014e-07, "loss": 0.0044, "step": 4841 }, { "epoch": 1.7505422993492408, "grad_norm": 0.1185905492385119, "learning_rate": 4.2059636344121193e-07, "loss": 0.0143, "step": 4842 }, { "epoch": 1.7509038322487345, "grad_norm": 3.095201880709069, "learning_rate": 4.193971876555786e-07, "loss": 0.0613, "step": 4843 }, { "epoch": 1.7512653651482286, "grad_norm": 0.8751903691006778, "learning_rate": 4.181996489764223e-07, "loss": 0.1309, "step": 4844 }, { "epoch": 1.7516268980477223, "grad_norm": 1.4005859522378779, "learning_rate": 4.1700374783174403e-07, "loss": 0.0388, "step": 4845 }, { "epoch": 1.7519884309472162, "grad_norm": 0.40658619537045065, "learning_rate": 4.158094846489602e-07, "loss": 0.0182, "step": 4846 }, { "epoch": 1.7523499638467102, "grad_norm": 0.10067714843213045, "learning_rate": 4.1461685985489884e-07, "loss": 0.0143, "step": 4847 }, { "epoch": 1.7527114967462039, "grad_norm": 0.32390523308977887, "learning_rate": 4.1342587387580404e-07, "loss": 0.0315, "step": 4848 }, { "epoch": 1.7530730296456978, "grad_norm": 0.06911444804056303, "learning_rate": 4.122365271373341e-07, "loss": 0.0027, "step": 4849 }, { "epoch": 1.7534345625451917, "grad_norm": 0.07974815948216403, "learning_rate": 4.1104882006456125e-07, "loss": 0.0002, "step": 4850 }, { "epoch": 1.7537960954446854, "grad_norm": 0.18260271120410812, "learning_rate": 4.0986275308197153e-07, "loss": 0.0203, "step": 4851 }, { "epoch": 1.7541576283441793, "grad_norm": 0.0003223350839785429, "learning_rate": 4.086783266134642e-07, "loss": 0.0, "step": 4852 }, { "epoch": 1.7545191612436732, "grad_norm": 0.2873774018217996, "learning_rate": 4.0749554108235557e-07, "loss": 0.0283, "step": 4853 }, { "epoch": 1.754880694143167, "grad_norm": 0.34476425322026755, "learning_rate": 4.0631439691137087e-07, "loss": 0.0315, "step": 4854 }, { "epoch": 1.7552422270426609, "grad_norm": 0.15419608251217382, "learning_rate": 4.051348945226502e-07, "loss": 0.0161, "step": 4855 }, { "epoch": 1.7556037599421548, "grad_norm": 0.2208771700776716, "learning_rate": 4.039570343377497e-07, "loss": 0.0254, "step": 4856 }, { "epoch": 1.7559652928416485, "grad_norm": 0.0010474687396540287, "learning_rate": 4.027808167776348e-07, "loss": 0.0, "step": 4857 }, { "epoch": 1.7563268257411424, "grad_norm": 3.523808514860856, "learning_rate": 4.0160624226268597e-07, "loss": 0.3047, "step": 4858 }, { "epoch": 1.7566883586406363, "grad_norm": 0.26688863945589303, "learning_rate": 4.004333112126957e-07, "loss": 0.0227, "step": 4859 }, { "epoch": 1.75704989154013, "grad_norm": 0.7117515437965557, "learning_rate": 3.992620240468692e-07, "loss": 0.1914, "step": 4860 }, { "epoch": 1.7574114244396242, "grad_norm": 0.9093481225047271, "learning_rate": 3.980923811838244e-07, "loss": 0.0349, "step": 4861 }, { "epoch": 1.7577729573391179, "grad_norm": 1.1235512565681585, "learning_rate": 3.969243830415903e-07, "loss": 0.0527, "step": 4862 }, { "epoch": 1.7581344902386116, "grad_norm": 0.13136427144388235, "learning_rate": 3.957580300376124e-07, "loss": 0.0203, "step": 4863 }, { "epoch": 1.7584960231381057, "grad_norm": 0.8400709255915219, "learning_rate": 3.945933225887422e-07, "loss": 0.0227, "step": 4864 }, { "epoch": 1.7588575560375994, "grad_norm": 0.00033817106574864997, "learning_rate": 3.9343026111124506e-07, "loss": 0.0, "step": 4865 }, { "epoch": 1.7592190889370931, "grad_norm": 0.20832568437078489, "learning_rate": 3.9226884602080186e-07, "loss": 0.0227, "step": 4866 }, { "epoch": 1.7595806218365873, "grad_norm": 0.6152970300998895, "learning_rate": 3.911090777325005e-07, "loss": 0.0388, "step": 4867 }, { "epoch": 1.759942154736081, "grad_norm": 0.2697187084174008, "learning_rate": 3.899509566608417e-07, "loss": 0.0315, "step": 4868 }, { "epoch": 1.7603036876355749, "grad_norm": 0.3215191192354956, "learning_rate": 3.887944832197377e-07, "loss": 0.0315, "step": 4869 }, { "epoch": 1.7606652205350688, "grad_norm": 0.12028299485240207, "learning_rate": 3.876396578225139e-07, "loss": 0.0181, "step": 4870 }, { "epoch": 1.7610267534345625, "grad_norm": 0.15423444285190813, "learning_rate": 3.8648648088190186e-07, "loss": 0.0203, "step": 4871 }, { "epoch": 1.7613882863340564, "grad_norm": 1.3389278069289958, "learning_rate": 3.853349528100475e-07, "loss": 0.1504, "step": 4872 }, { "epoch": 1.7617498192335503, "grad_norm": 0.5534891442048946, "learning_rate": 3.841850740185088e-07, "loss": 0.0388, "step": 4873 }, { "epoch": 1.762111352133044, "grad_norm": 0.30453682101311724, "learning_rate": 3.830368449182487e-07, "loss": 0.0283, "step": 4874 }, { "epoch": 1.762472885032538, "grad_norm": 0.9022488379408872, "learning_rate": 3.8189026591964664e-07, "loss": 0.1406, "step": 4875 }, { "epoch": 1.7628344179320319, "grad_norm": 0.33831310288326305, "learning_rate": 3.8074533743248877e-07, "loss": 0.0315, "step": 4876 }, { "epoch": 1.7631959508315256, "grad_norm": 0.072421876533313, "learning_rate": 3.7960205986597275e-07, "loss": 0.0015, "step": 4877 }, { "epoch": 1.7635574837310195, "grad_norm": 0.060381623141366865, "learning_rate": 3.784604336287051e-07, "loss": 0.001, "step": 4878 }, { "epoch": 1.7639190166305134, "grad_norm": 0.2358090128987776, "learning_rate": 3.7732045912870276e-07, "loss": 0.0181, "step": 4879 }, { "epoch": 1.7642805495300071, "grad_norm": 0.1032838524112769, "learning_rate": 3.761821367733942e-07, "loss": 0.0161, "step": 4880 }, { "epoch": 1.764642082429501, "grad_norm": 0.001053173363598287, "learning_rate": 3.750454669696135e-07, "loss": 0.0, "step": 4881 }, { "epoch": 1.765003615328995, "grad_norm": 0.11766839088177325, "learning_rate": 3.739104501236057e-07, "loss": 0.0034, "step": 4882 }, { "epoch": 1.7653651482284887, "grad_norm": 0.5725010380465791, "learning_rate": 3.727770866410285e-07, "loss": 0.0432, "step": 4883 }, { "epoch": 1.7657266811279828, "grad_norm": 0.5930609799489748, "learning_rate": 3.716453769269424e-07, "loss": 0.0203, "step": 4884 }, { "epoch": 1.7660882140274765, "grad_norm": 0.10058763590801752, "learning_rate": 3.705153213858226e-07, "loss": 0.0143, "step": 4885 }, { "epoch": 1.7664497469269702, "grad_norm": 0.011974813359299983, "learning_rate": 3.6938692042155e-07, "loss": 0.0006, "step": 4886 }, { "epoch": 1.7668112798264644, "grad_norm": 0.9432058413872734, "learning_rate": 3.682601744374148e-07, "loss": 0.0527, "step": 4887 }, { "epoch": 1.767172812725958, "grad_norm": 0.1660879969643803, "learning_rate": 3.6713508383611596e-07, "loss": 0.0203, "step": 4888 }, { "epoch": 1.7675343456254518, "grad_norm": 0.43563796894594803, "learning_rate": 3.660116490197596e-07, "loss": 0.0315, "step": 4889 }, { "epoch": 1.767895878524946, "grad_norm": 0.00012309840374785606, "learning_rate": 3.648898703898629e-07, "loss": 0.0, "step": 4890 }, { "epoch": 1.7682574114244396, "grad_norm": 0.20457544309697231, "learning_rate": 3.637697483473485e-07, "loss": 0.0227, "step": 4891 }, { "epoch": 1.7686189443239335, "grad_norm": 0.18599225858204002, "learning_rate": 3.6265128329254605e-07, "loss": 0.0227, "step": 4892 }, { "epoch": 1.7689804772234274, "grad_norm": 0.23255799130340052, "learning_rate": 3.6153447562519816e-07, "loss": 0.0227, "step": 4893 }, { "epoch": 1.7693420101229211, "grad_norm": 0.14280239224736133, "learning_rate": 3.604193257444483e-07, "loss": 0.0181, "step": 4894 }, { "epoch": 1.769703543022415, "grad_norm": 0.012467792680898492, "learning_rate": 3.593058340488531e-07, "loss": 0.0005, "step": 4895 }, { "epoch": 1.770065075921909, "grad_norm": 0.9783708149949378, "learning_rate": 3.5819400093637323e-07, "loss": 0.0579, "step": 4896 }, { "epoch": 1.7704266088214027, "grad_norm": 0.09527600417527296, "learning_rate": 3.5708382680437804e-07, "loss": 0.0034, "step": 4897 }, { "epoch": 1.7707881417208966, "grad_norm": 0.2888020413735372, "learning_rate": 3.559753120496429e-07, "loss": 0.0283, "step": 4898 }, { "epoch": 1.7711496746203905, "grad_norm": 0.9995958715330407, "learning_rate": 3.548684570683508e-07, "loss": 0.1055, "step": 4899 }, { "epoch": 1.7715112075198842, "grad_norm": 0.28452765191260676, "learning_rate": 3.5376326225609325e-07, "loss": 0.0254, "step": 4900 }, { "epoch": 1.7718727404193781, "grad_norm": 0.1569826309816423, "learning_rate": 3.5265972800786417e-07, "loss": 0.0143, "step": 4901 }, { "epoch": 1.772234273318872, "grad_norm": 0.6834835774728225, "learning_rate": 3.5155785471806705e-07, "loss": 0.0527, "step": 4902 }, { "epoch": 1.7725958062183658, "grad_norm": 0.18155973648448837, "learning_rate": 3.504576427805123e-07, "loss": 0.0161, "step": 4903 }, { "epoch": 1.7729573391178597, "grad_norm": 0.7422588870192008, "learning_rate": 3.4935909258841493e-07, "loss": 0.1914, "step": 4904 }, { "epoch": 1.7733188720173536, "grad_norm": 0.08298609326535152, "learning_rate": 3.482622045343964e-07, "loss": 0.0128, "step": 4905 }, { "epoch": 1.7736804049168473, "grad_norm": 0.10945811577773368, "learning_rate": 3.4716697901048426e-07, "loss": 0.0143, "step": 4906 }, { "epoch": 1.7740419378163415, "grad_norm": 1.2589622948889092, "learning_rate": 3.46073416408112e-07, "loss": 0.1143, "step": 4907 }, { "epoch": 1.7744034707158352, "grad_norm": 0.16567554824604752, "learning_rate": 3.449815171181181e-07, "loss": 0.0227, "step": 4908 }, { "epoch": 1.7747650036153289, "grad_norm": 0.07962414692003568, "learning_rate": 3.438912815307471e-07, "loss": 0.0021, "step": 4909 }, { "epoch": 1.775126536514823, "grad_norm": 0.31752267196065576, "learning_rate": 3.4280271003565126e-07, "loss": 0.0315, "step": 4910 }, { "epoch": 1.7754880694143167, "grad_norm": 0.03773888805650883, "learning_rate": 3.4171580302188314e-07, "loss": 0.0016, "step": 4911 }, { "epoch": 1.7758496023138104, "grad_norm": 0.05510796621291383, "learning_rate": 3.4063056087790314e-07, "loss": 0.0017, "step": 4912 }, { "epoch": 1.7762111352133045, "grad_norm": 1.7684565638404315, "learning_rate": 3.395469839915777e-07, "loss": 0.0579, "step": 4913 }, { "epoch": 1.7765726681127982, "grad_norm": 0.17831190525982726, "learning_rate": 3.3846507275017706e-07, "loss": 0.0203, "step": 4914 }, { "epoch": 1.7769342010122922, "grad_norm": 0.5119773380210759, "learning_rate": 3.3738482754037574e-07, "loss": 0.0432, "step": 4915 }, { "epoch": 1.777295733911786, "grad_norm": 0.01095481761208674, "learning_rate": 3.3630624874825267e-07, "loss": 0.0003, "step": 4916 }, { "epoch": 1.7776572668112798, "grad_norm": 0.6931005954182804, "learning_rate": 3.3522933675929234e-07, "loss": 0.0898, "step": 4917 }, { "epoch": 1.7780187997107737, "grad_norm": 0.006712001452617703, "learning_rate": 3.3415409195838244e-07, "loss": 0.0002, "step": 4918 }, { "epoch": 1.7783803326102676, "grad_norm": 0.00448177578270357, "learning_rate": 3.3308051472981494e-07, "loss": 0.0001, "step": 4919 }, { "epoch": 1.7787418655097613, "grad_norm": 0.3662100734449562, "learning_rate": 3.32008605457288e-07, "loss": 0.0283, "step": 4920 }, { "epoch": 1.7791033984092552, "grad_norm": 0.7295816205085801, "learning_rate": 3.309383645238995e-07, "loss": 0.1699, "step": 4921 }, { "epoch": 1.7794649313087492, "grad_norm": 0.19186638942254283, "learning_rate": 3.29869792312153e-07, "loss": 0.0203, "step": 4922 }, { "epoch": 1.7798264642082429, "grad_norm": 0.16698056697654753, "learning_rate": 3.288028892039585e-07, "loss": 0.0203, "step": 4923 }, { "epoch": 1.7801879971077368, "grad_norm": 0.00764713933360847, "learning_rate": 3.2773765558062553e-07, "loss": 0.0003, "step": 4924 }, { "epoch": 1.7805495300072307, "grad_norm": 1.498436137611441, "learning_rate": 3.266740918228678e-07, "loss": 0.1226, "step": 4925 }, { "epoch": 1.7809110629067244, "grad_norm": 0.09950052326712984, "learning_rate": 3.25612198310804e-07, "loss": 0.0143, "step": 4926 }, { "epoch": 1.7812725958062183, "grad_norm": 0.10998296431916979, "learning_rate": 3.24551975423954e-07, "loss": 0.0161, "step": 4927 }, { "epoch": 1.7816341287057122, "grad_norm": 0.011283919804379377, "learning_rate": 3.2349342354124126e-07, "loss": 0.0004, "step": 4928 }, { "epoch": 1.781995661605206, "grad_norm": 0.016284507441295237, "learning_rate": 3.224365430409909e-07, "loss": 0.0005, "step": 4929 }, { "epoch": 1.7823571945047, "grad_norm": 0.2387670386404284, "learning_rate": 3.2138133430093477e-07, "loss": 0.0161, "step": 4930 }, { "epoch": 1.7827187274041938, "grad_norm": 0.43461828570427075, "learning_rate": 3.2032779769820165e-07, "loss": 0.0349, "step": 4931 }, { "epoch": 1.7830802603036875, "grad_norm": 0.000917572736934667, "learning_rate": 3.192759336093254e-07, "loss": 0.0, "step": 4932 }, { "epoch": 1.7834417932031816, "grad_norm": 0.23736799138921022, "learning_rate": 3.1822574241024307e-07, "loss": 0.0254, "step": 4933 }, { "epoch": 1.7838033261026753, "grad_norm": 0.28064063632642594, "learning_rate": 3.1717722447629274e-07, "loss": 0.0071, "step": 4934 }, { "epoch": 1.784164859002169, "grad_norm": 0.0002299178816886415, "learning_rate": 3.1613038018221353e-07, "loss": 0.0, "step": 4935 }, { "epoch": 1.7845263919016632, "grad_norm": 0.002265392865205847, "learning_rate": 3.150852099021484e-07, "loss": 0.0001, "step": 4936 }, { "epoch": 1.7848879248011569, "grad_norm": 0.36746605787622455, "learning_rate": 3.1404171400964024e-07, "loss": 0.0162, "step": 4937 }, { "epoch": 1.7852494577006508, "grad_norm": 1.3039646591357017, "learning_rate": 3.129998928776351e-07, "loss": 0.1699, "step": 4938 }, { "epoch": 1.7856109906001447, "grad_norm": 0.9175759962550614, "learning_rate": 3.1195974687847796e-07, "loss": 0.1226, "step": 4939 }, { "epoch": 1.7859725234996384, "grad_norm": 1.0404919699494335, "learning_rate": 3.109212763839198e-07, "loss": 0.0579, "step": 4940 }, { "epoch": 1.7863340563991323, "grad_norm": 0.0003451263970553426, "learning_rate": 3.098844817651059e-07, "loss": 0.0, "step": 4941 }, { "epoch": 1.7866955892986263, "grad_norm": 1.3524413127370623, "learning_rate": 3.088493633925893e-07, "loss": 0.0579, "step": 4942 }, { "epoch": 1.78705712219812, "grad_norm": 1.1822210834601268, "learning_rate": 3.0781592163632025e-07, "loss": 0.0977, "step": 4943 }, { "epoch": 1.7874186550976139, "grad_norm": 0.3036307741610575, "learning_rate": 3.0678415686565045e-07, "loss": 0.0283, "step": 4944 }, { "epoch": 1.7877801879971078, "grad_norm": 3.082828415525656, "learning_rate": 3.057540694493327e-07, "loss": 0.4629, "step": 4945 }, { "epoch": 1.7881417208966015, "grad_norm": 0.22464533438314782, "learning_rate": 3.047256597555198e-07, "loss": 0.0254, "step": 4946 }, { "epoch": 1.7885032537960954, "grad_norm": 0.0025015563129331887, "learning_rate": 3.036989281517655e-07, "loss": 0.0001, "step": 4947 }, { "epoch": 1.7888647866955893, "grad_norm": 0.24621772531025465, "learning_rate": 3.026738750050229e-07, "loss": 0.0254, "step": 4948 }, { "epoch": 1.789226319595083, "grad_norm": 0.005133244063168426, "learning_rate": 3.01650500681645e-07, "loss": 0.0002, "step": 4949 }, { "epoch": 1.789587852494577, "grad_norm": 0.002491915526010838, "learning_rate": 3.006288055473888e-07, "loss": 0.0001, "step": 4950 }, { "epoch": 1.789949385394071, "grad_norm": 1.256412984945478, "learning_rate": 2.996087899674038e-07, "loss": 0.0352, "step": 4951 }, { "epoch": 1.7903109182935646, "grad_norm": 0.06056698039422621, "learning_rate": 2.985904543062462e-07, "loss": 0.0024, "step": 4952 }, { "epoch": 1.7906724511930587, "grad_norm": 0.35354518737688473, "learning_rate": 2.975737989278682e-07, "loss": 0.0254, "step": 4953 }, { "epoch": 1.7910339840925524, "grad_norm": 0.3116308358958884, "learning_rate": 2.9655882419562186e-07, "loss": 0.0283, "step": 4954 }, { "epoch": 1.7913955169920461, "grad_norm": 0.2029091180386179, "learning_rate": 2.955455304722588e-07, "loss": 0.0254, "step": 4955 }, { "epoch": 1.7917570498915403, "grad_norm": 0.07462410395822236, "learning_rate": 2.945339181199297e-07, "loss": 0.0114, "step": 4956 }, { "epoch": 1.792118582791034, "grad_norm": 0.6146326673449177, "learning_rate": 2.935239875001872e-07, "loss": 0.0527, "step": 4957 }, { "epoch": 1.7924801156905277, "grad_norm": 0.0003411434871708671, "learning_rate": 2.9251573897397743e-07, "loss": 0.0, "step": 4958 }, { "epoch": 1.7928416485900218, "grad_norm": 0.335867615300863, "learning_rate": 2.9150917290164784e-07, "loss": 0.0315, "step": 4959 }, { "epoch": 1.7932031814895155, "grad_norm": 0.015635653830868384, "learning_rate": 2.905042896429483e-07, "loss": 0.0005, "step": 4960 }, { "epoch": 1.7935647143890094, "grad_norm": 0.18338613635859613, "learning_rate": 2.8950108955701995e-07, "loss": 0.0063, "step": 4961 }, { "epoch": 1.7939262472885034, "grad_norm": 0.9027283235064918, "learning_rate": 2.884995730024087e-07, "loss": 0.1309, "step": 4962 }, { "epoch": 1.794287780187997, "grad_norm": 0.11804217688403587, "learning_rate": 2.8749974033705607e-07, "loss": 0.0143, "step": 4963 }, { "epoch": 1.794649313087491, "grad_norm": 0.1264014000874338, "learning_rate": 2.8650159191830175e-07, "loss": 0.0181, "step": 4964 }, { "epoch": 1.795010845986985, "grad_norm": 0.22338540317511027, "learning_rate": 2.8550512810288367e-07, "loss": 0.0203, "step": 4965 }, { "epoch": 1.7953723788864786, "grad_norm": 0.1368270668624331, "learning_rate": 2.84510349246937e-07, "loss": 0.0161, "step": 4966 }, { "epoch": 1.7957339117859725, "grad_norm": 0.24699031897256676, "learning_rate": 2.8351725570599787e-07, "loss": 0.0143, "step": 4967 }, { "epoch": 1.7960954446854664, "grad_norm": 1.8692303994259214, "learning_rate": 2.825258478349951e-07, "loss": 0.1055, "step": 4968 }, { "epoch": 1.7964569775849601, "grad_norm": 0.00030953412699949467, "learning_rate": 2.815361259882582e-07, "loss": 0.0, "step": 4969 }, { "epoch": 1.796818510484454, "grad_norm": 0.17819553929088558, "learning_rate": 2.805480905195157e-07, "loss": 0.0227, "step": 4970 }, { "epoch": 1.797180043383948, "grad_norm": 0.6328804783804713, "learning_rate": 2.7956174178188856e-07, "loss": 0.0283, "step": 4971 }, { "epoch": 1.7975415762834417, "grad_norm": 2.351819067948468, "learning_rate": 2.785770801278992e-07, "loss": 0.3047, "step": 4972 }, { "epoch": 1.7979031091829356, "grad_norm": 0.07139396799302897, "learning_rate": 2.7759410590946446e-07, "loss": 0.0017, "step": 4973 }, { "epoch": 1.7982646420824295, "grad_norm": 0.6884258321021425, "learning_rate": 2.7661281947789995e-07, "loss": 0.0579, "step": 4974 }, { "epoch": 1.7986261749819232, "grad_norm": 3.8572909366243384, "learning_rate": 2.7563322118391745e-07, "loss": 0.2578, "step": 4975 }, { "epoch": 1.7989877078814174, "grad_norm": 1.751133730999122, "learning_rate": 2.7465531137762347e-07, "loss": 0.0579, "step": 4976 }, { "epoch": 1.799349240780911, "grad_norm": 0.009105396065451235, "learning_rate": 2.7367909040852514e-07, "loss": 0.0003, "step": 4977 }, { "epoch": 1.7997107736804048, "grad_norm": 1.0260618607458323, "learning_rate": 2.7270455862552225e-07, "loss": 0.1226, "step": 4978 }, { "epoch": 1.800072306579899, "grad_norm": 0.1612901256191531, "learning_rate": 2.7173171637691176e-07, "loss": 0.0203, "step": 4979 }, { "epoch": 1.8004338394793926, "grad_norm": 1.3516486117746502, "learning_rate": 2.707605640103894e-07, "loss": 0.0579, "step": 4980 }, { "epoch": 1.8007953723788863, "grad_norm": 0.11566396875252505, "learning_rate": 2.6979110187304147e-07, "loss": 0.0161, "step": 4981 }, { "epoch": 1.8011569052783805, "grad_norm": 0.1353996926840675, "learning_rate": 2.6882333031135634e-07, "loss": 0.0181, "step": 4982 }, { "epoch": 1.8015184381778742, "grad_norm": 0.17702591700297535, "learning_rate": 2.6785724967121505e-07, "loss": 0.0203, "step": 4983 }, { "epoch": 1.801879971077368, "grad_norm": 0.7922857578888313, "learning_rate": 2.6689286029789373e-07, "loss": 0.0635, "step": 4984 }, { "epoch": 1.802241503976862, "grad_norm": 0.2439430074497473, "learning_rate": 2.65930162536065e-07, "loss": 0.0227, "step": 4985 }, { "epoch": 1.8026030368763557, "grad_norm": 0.736443177451701, "learning_rate": 2.6496915672979684e-07, "loss": 0.0432, "step": 4986 }, { "epoch": 1.8029645697758496, "grad_norm": 0.19571009006247878, "learning_rate": 2.6400984322255405e-07, "loss": 0.0227, "step": 4987 }, { "epoch": 1.8033261026753435, "grad_norm": 3.1698263260037156, "learning_rate": 2.63052222357193e-07, "loss": 0.3789, "step": 4988 }, { "epoch": 1.8036876355748372, "grad_norm": 0.037565918459661174, "learning_rate": 2.62096294475967e-07, "loss": 0.0009, "step": 4989 }, { "epoch": 1.8040491684743312, "grad_norm": 1.036260633004016, "learning_rate": 2.611420599205272e-07, "loss": 0.1143, "step": 4990 }, { "epoch": 1.804410701373825, "grad_norm": 0.19910478261260534, "learning_rate": 2.601895190319137e-07, "loss": 0.0254, "step": 4991 }, { "epoch": 1.8047722342733188, "grad_norm": 0.15799079247287437, "learning_rate": 2.5923867215056687e-07, "loss": 0.0203, "step": 4992 }, { "epoch": 1.8051337671728127, "grad_norm": 0.011016925168476428, "learning_rate": 2.5828951961631766e-07, "loss": 0.0004, "step": 4993 }, { "epoch": 1.8054953000723066, "grad_norm": 0.33516247825159035, "learning_rate": 2.573420617683936e-07, "loss": 0.0081, "step": 4994 }, { "epoch": 1.8058568329718003, "grad_norm": 0.17544993775398893, "learning_rate": 2.563962989454155e-07, "loss": 0.0203, "step": 4995 }, { "epoch": 1.8062183658712943, "grad_norm": 0.3371992306878531, "learning_rate": 2.5545223148539855e-07, "loss": 0.0161, "step": 4996 }, { "epoch": 1.8065798987707882, "grad_norm": 1.0687385789086454, "learning_rate": 2.545098597257545e-07, "loss": 0.0432, "step": 4997 }, { "epoch": 1.8069414316702819, "grad_norm": 0.47081695075200014, "learning_rate": 2.535691840032839e-07, "loss": 0.0181, "step": 4998 }, { "epoch": 1.807302964569776, "grad_norm": 0.004271910031722519, "learning_rate": 2.526302046541851e-07, "loss": 0.0002, "step": 4999 }, { "epoch": 1.8076644974692697, "grad_norm": 1.9490070930314174, "learning_rate": 2.516929220140496e-07, "loss": 0.0977, "step": 5000 }, { "epoch": 1.8080260303687634, "grad_norm": 0.22388967279139244, "learning_rate": 2.507573364178617e-07, "loss": 0.0283, "step": 5001 }, { "epoch": 1.8083875632682576, "grad_norm": 0.35420862090748684, "learning_rate": 2.4982344819999947e-07, "loss": 0.0315, "step": 5002 }, { "epoch": 1.8087490961677513, "grad_norm": 0.8945239099565205, "learning_rate": 2.488912576942343e-07, "loss": 0.1602, "step": 5003 }, { "epoch": 1.809110629067245, "grad_norm": 0.33887756446277156, "learning_rate": 2.4796076523373125e-07, "loss": 0.0349, "step": 5004 }, { "epoch": 1.809472161966739, "grad_norm": 0.008418264397989308, "learning_rate": 2.470319711510477e-07, "loss": 0.0002, "step": 5005 }, { "epoch": 1.8098336948662328, "grad_norm": 1.737933819971719, "learning_rate": 2.461048757781337e-07, "loss": 0.1055, "step": 5006 }, { "epoch": 1.8101952277657267, "grad_norm": 2.358417999420021, "learning_rate": 2.4517947944633527e-07, "loss": 0.2812, "step": 5007 }, { "epoch": 1.8105567606652206, "grad_norm": 0.08437291050309079, "learning_rate": 2.4425578248638616e-07, "loss": 0.0143, "step": 5008 }, { "epoch": 1.8109182935647143, "grad_norm": 0.00506715109730599, "learning_rate": 2.433337852284179e-07, "loss": 0.0002, "step": 5009 }, { "epoch": 1.8112798264642083, "grad_norm": 0.05857081203043882, "learning_rate": 2.424134880019502e-07, "loss": 0.0021, "step": 5010 }, { "epoch": 1.8116413593637022, "grad_norm": 0.23165343735000943, "learning_rate": 2.414948911358983e-07, "loss": 0.0227, "step": 5011 }, { "epoch": 1.8120028922631959, "grad_norm": 0.9469794775836532, "learning_rate": 2.4057799495856795e-07, "loss": 0.1226, "step": 5012 }, { "epoch": 1.8123644251626898, "grad_norm": 0.2974967216582441, "learning_rate": 2.396627997976575e-07, "loss": 0.0315, "step": 5013 }, { "epoch": 1.8127259580621837, "grad_norm": 1.1540228527772909, "learning_rate": 2.3874930598025816e-07, "loss": 0.0977, "step": 5014 }, { "epoch": 1.8130874909616774, "grad_norm": 0.00042996711716513974, "learning_rate": 2.3783751383285202e-07, "loss": 0.0, "step": 5015 }, { "epoch": 1.8134490238611713, "grad_norm": 0.20329784332004874, "learning_rate": 2.3692742368131295e-07, "loss": 0.0143, "step": 5016 }, { "epoch": 1.8138105567606653, "grad_norm": 0.17936623365005147, "learning_rate": 2.3601903585090845e-07, "loss": 0.0181, "step": 5017 }, { "epoch": 1.814172089660159, "grad_norm": 0.14235944343339815, "learning_rate": 2.351123506662939e-07, "loss": 0.0161, "step": 5018 }, { "epoch": 1.814533622559653, "grad_norm": 1.7483187915619853, "learning_rate": 2.3420736845152003e-07, "loss": 0.0476, "step": 5019 }, { "epoch": 1.8148951554591468, "grad_norm": 0.1241510312160668, "learning_rate": 2.333040895300276e-07, "loss": 0.0049, "step": 5020 }, { "epoch": 1.8152566883586405, "grad_norm": 1.1357721399552096, "learning_rate": 2.3240251422464667e-07, "loss": 0.0903, "step": 5021 }, { "epoch": 1.8156182212581344, "grad_norm": 0.2817605412960243, "learning_rate": 2.315026428576017e-07, "loss": 0.0254, "step": 5022 }, { "epoch": 1.8159797541576284, "grad_norm": 0.18534465612804588, "learning_rate": 2.306044757505055e-07, "loss": 0.0143, "step": 5023 }, { "epoch": 1.816341287057122, "grad_norm": 1.0177406296341527, "learning_rate": 2.297080132243634e-07, "loss": 0.083, "step": 5024 }, { "epoch": 1.8167028199566162, "grad_norm": 1.0769149178373865, "learning_rate": 2.288132555995709e-07, "loss": 0.0579, "step": 5025 }, { "epoch": 1.81706435285611, "grad_norm": 1.1121969420800388, "learning_rate": 2.2792020319591267e-07, "loss": 0.0977, "step": 5026 }, { "epoch": 1.8174258857556036, "grad_norm": 0.20962088937249876, "learning_rate": 2.2702885633256845e-07, "loss": 0.0181, "step": 5027 }, { "epoch": 1.8177874186550977, "grad_norm": 0.19755653430122286, "learning_rate": 2.2613921532810223e-07, "loss": 0.0227, "step": 5028 }, { "epoch": 1.8181489515545914, "grad_norm": 0.1006542787924651, "learning_rate": 2.2525128050047362e-07, "loss": 0.0161, "step": 5029 }, { "epoch": 1.8185104844540854, "grad_norm": 0.0004778930705552247, "learning_rate": 2.243650521670293e-07, "loss": 0.0, "step": 5030 }, { "epoch": 1.8188720173535793, "grad_norm": 1.0095601365419842, "learning_rate": 2.2348053064450814e-07, "loss": 0.0522, "step": 5031 }, { "epoch": 1.819233550253073, "grad_norm": 0.39253043924763004, "learning_rate": 2.2259771624903726e-07, "loss": 0.0315, "step": 5032 }, { "epoch": 1.819595083152567, "grad_norm": 0.1863936570239847, "learning_rate": 2.2171660929613424e-07, "loss": 0.0254, "step": 5033 }, { "epoch": 1.8199566160520608, "grad_norm": 1.237482228783106, "learning_rate": 2.2083721010070613e-07, "loss": 0.1406, "step": 5034 }, { "epoch": 1.8203181489515545, "grad_norm": 0.049318391257346696, "learning_rate": 2.1995951897705093e-07, "loss": 0.0021, "step": 5035 }, { "epoch": 1.8206796818510484, "grad_norm": 1.2776848519583208, "learning_rate": 2.1908353623885436e-07, "loss": 0.1055, "step": 5036 }, { "epoch": 1.8210412147505424, "grad_norm": 0.14411518592055583, "learning_rate": 2.1820926219919437e-07, "loss": 0.0161, "step": 5037 }, { "epoch": 1.821402747650036, "grad_norm": 0.875613481255118, "learning_rate": 2.173366971705332e-07, "loss": 0.1143, "step": 5038 }, { "epoch": 1.82176428054953, "grad_norm": 0.0010049506470556162, "learning_rate": 2.1646584146472805e-07, "loss": 0.0001, "step": 5039 }, { "epoch": 1.822125813449024, "grad_norm": 1.528983112014803, "learning_rate": 2.1559669539302165e-07, "loss": 0.083, "step": 5040 }, { "epoch": 1.8224873463485176, "grad_norm": 0.15815723101317145, "learning_rate": 2.147292592660466e-07, "loss": 0.0203, "step": 5041 }, { "epoch": 1.8228488792480115, "grad_norm": 0.7836182509190448, "learning_rate": 2.1386353339382438e-07, "loss": 0.0349, "step": 5042 }, { "epoch": 1.8232104121475055, "grad_norm": 0.4097308523901164, "learning_rate": 2.1299951808576525e-07, "loss": 0.0388, "step": 5043 }, { "epoch": 1.8235719450469992, "grad_norm": 0.000355834771251171, "learning_rate": 2.121372136506683e-07, "loss": 0.0, "step": 5044 }, { "epoch": 1.823933477946493, "grad_norm": 0.011453662620909848, "learning_rate": 2.1127662039672037e-07, "loss": 0.0004, "step": 5045 }, { "epoch": 1.824295010845987, "grad_norm": 0.5639063536433546, "learning_rate": 2.104177386314976e-07, "loss": 0.0162, "step": 5046 }, { "epoch": 1.8246565437454807, "grad_norm": 0.8170467409598573, "learning_rate": 2.0956056866196506e-07, "loss": 0.1807, "step": 5047 }, { "epoch": 1.8250180766449748, "grad_norm": 0.10521548719283676, "learning_rate": 2.087051107944732e-07, "loss": 0.0039, "step": 5048 }, { "epoch": 1.8253796095444685, "grad_norm": 0.0002726942373749819, "learning_rate": 2.0785136533476412e-07, "loss": 0.0, "step": 5049 }, { "epoch": 1.8257411424439622, "grad_norm": 0.21457816098352314, "learning_rate": 2.0699933258796601e-07, "loss": 0.0071, "step": 5050 }, { "epoch": 1.8261026753434564, "grad_norm": 0.1923796789101854, "learning_rate": 2.0614901285859467e-07, "loss": 0.0181, "step": 5051 }, { "epoch": 1.82646420824295, "grad_norm": 0.08429886833245123, "learning_rate": 2.0530040645055428e-07, "loss": 0.0128, "step": 5052 }, { "epoch": 1.826825741142444, "grad_norm": 0.004955457505876239, "learning_rate": 2.0445351366713661e-07, "loss": 0.0002, "step": 5053 }, { "epoch": 1.827187274041938, "grad_norm": 0.5590119777176072, "learning_rate": 2.0360833481102182e-07, "loss": 0.0476, "step": 5054 }, { "epoch": 1.8275488069414316, "grad_norm": 0.00029031253300217666, "learning_rate": 2.0276487018427605e-07, "loss": 0.0, "step": 5055 }, { "epoch": 1.8279103398409255, "grad_norm": 0.023452688837576004, "learning_rate": 2.01923120088352e-07, "loss": 0.0004, "step": 5056 }, { "epoch": 1.8282718727404195, "grad_norm": 0.18593980909873872, "learning_rate": 2.0108308482409356e-07, "loss": 0.0227, "step": 5057 }, { "epoch": 1.8286334056399132, "grad_norm": 0.01754451122105793, "learning_rate": 2.002447646917266e-07, "loss": 0.0007, "step": 5058 }, { "epoch": 1.828994938539407, "grad_norm": 0.007425290727937827, "learning_rate": 1.994081599908687e-07, "loss": 0.0003, "step": 5059 }, { "epoch": 1.829356471438901, "grad_norm": 0.006529864962045999, "learning_rate": 1.9857327102052127e-07, "loss": 0.0002, "step": 5060 }, { "epoch": 1.8297180043383947, "grad_norm": 0.7734250149623227, "learning_rate": 1.9774009807907334e-07, "loss": 0.0522, "step": 5061 }, { "epoch": 1.8300795372378886, "grad_norm": 0.013913118783388956, "learning_rate": 1.9690864146430122e-07, "loss": 0.0005, "step": 5062 }, { "epoch": 1.8304410701373826, "grad_norm": 0.9963715912179123, "learning_rate": 1.9607890147336606e-07, "loss": 0.0635, "step": 5063 }, { "epoch": 1.8308026030368763, "grad_norm": 0.8833791884800248, "learning_rate": 1.9525087840281897e-07, "loss": 0.0432, "step": 5064 }, { "epoch": 1.8311641359363702, "grad_norm": 0.2035457644436658, "learning_rate": 1.9442457254859325e-07, "loss": 0.0203, "step": 5065 }, { "epoch": 1.831525668835864, "grad_norm": 0.18255057886348408, "learning_rate": 1.9359998420601044e-07, "loss": 0.0181, "step": 5066 }, { "epoch": 1.8318872017353578, "grad_norm": 0.4534264545433761, "learning_rate": 1.9277711366978036e-07, "loss": 0.0349, "step": 5067 }, { "epoch": 1.8322487346348517, "grad_norm": 0.4713740806997103, "learning_rate": 1.9195596123399385e-07, "loss": 0.0114, "step": 5068 }, { "epoch": 1.8326102675343456, "grad_norm": 0.6779558764619359, "learning_rate": 1.9113652719213283e-07, "loss": 0.2012, "step": 5069 }, { "epoch": 1.8329718004338393, "grad_norm": 1.2702526611904728, "learning_rate": 1.9031881183706247e-07, "loss": 0.083, "step": 5070 }, { "epoch": 1.8333333333333335, "grad_norm": 0.49626511867393563, "learning_rate": 1.8950281546103343e-07, "loss": 0.0349, "step": 5071 }, { "epoch": 1.8336948662328272, "grad_norm": 0.21856376364588934, "learning_rate": 1.8868853835568303e-07, "loss": 0.0254, "step": 5072 }, { "epoch": 1.8340563991323209, "grad_norm": 1.8981727315418109, "learning_rate": 1.8787598081203285e-07, "loss": 0.1504, "step": 5073 }, { "epoch": 1.834417932031815, "grad_norm": 0.10092985427624876, "learning_rate": 1.870651431204934e-07, "loss": 0.0143, "step": 5074 }, { "epoch": 1.8347794649313087, "grad_norm": 0.34713011836956176, "learning_rate": 1.8625602557085453e-07, "loss": 0.0315, "step": 5075 }, { "epoch": 1.8351409978308026, "grad_norm": 0.48257766920816475, "learning_rate": 1.854486284522966e-07, "loss": 0.0388, "step": 5076 }, { "epoch": 1.8355025307302966, "grad_norm": 0.03622906713122463, "learning_rate": 1.8464295205338322e-07, "loss": 0.0015, "step": 5077 }, { "epoch": 1.8358640636297903, "grad_norm": 0.1596489120434206, "learning_rate": 1.8383899666206183e-07, "loss": 0.0161, "step": 5078 }, { "epoch": 1.8362255965292842, "grad_norm": 0.1634971640920262, "learning_rate": 1.8303676256566704e-07, "loss": 0.0181, "step": 5079 }, { "epoch": 1.836587129428778, "grad_norm": 0.004459501467728156, "learning_rate": 1.822362500509167e-07, "loss": 0.0002, "step": 5080 }, { "epoch": 1.8369486623282718, "grad_norm": 0.019576310434717928, "learning_rate": 1.8143745940391365e-07, "loss": 0.0006, "step": 5081 }, { "epoch": 1.8373101952277657, "grad_norm": 0.0002714925364959055, "learning_rate": 1.8064039091014619e-07, "loss": 0.0, "step": 5082 }, { "epoch": 1.8376717281272597, "grad_norm": 0.658955637967841, "learning_rate": 1.7984504485448528e-07, "loss": 0.1914, "step": 5083 }, { "epoch": 1.8380332610267534, "grad_norm": 0.0010461468299221006, "learning_rate": 1.7905142152118914e-07, "loss": 0.0, "step": 5084 }, { "epoch": 1.8383947939262473, "grad_norm": 0.07714316246191134, "learning_rate": 1.7825952119389645e-07, "loss": 0.0114, "step": 5085 }, { "epoch": 1.8387563268257412, "grad_norm": 0.09959730279968981, "learning_rate": 1.7746934415563356e-07, "loss": 0.0027, "step": 5086 }, { "epoch": 1.839117859725235, "grad_norm": 0.9919377214581339, "learning_rate": 1.7668089068881012e-07, "loss": 0.0762, "step": 5087 }, { "epoch": 1.8394793926247288, "grad_norm": 2.3110565811728367, "learning_rate": 1.758941610752174e-07, "loss": 0.2812, "step": 5088 }, { "epoch": 1.8398409255242227, "grad_norm": 1.3850182165156781, "learning_rate": 1.751091555960338e-07, "loss": 0.0762, "step": 5089 }, { "epoch": 1.8402024584237164, "grad_norm": 0.5120140733957639, "learning_rate": 1.7432587453181992e-07, "loss": 0.0388, "step": 5090 }, { "epoch": 1.8405639913232104, "grad_norm": 1.7014226020210288, "learning_rate": 1.7354431816252016e-07, "loss": 0.0903, "step": 5091 }, { "epoch": 1.8409255242227043, "grad_norm": 0.09306037117598762, "learning_rate": 1.7276448676746216e-07, "loss": 0.0128, "step": 5092 }, { "epoch": 1.841287057122198, "grad_norm": 0.3957137098571125, "learning_rate": 1.7198638062535743e-07, "loss": 0.0315, "step": 5093 }, { "epoch": 1.8416485900216921, "grad_norm": 0.03832541163568551, "learning_rate": 1.7121000001430243e-07, "loss": 0.001, "step": 5094 }, { "epoch": 1.8420101229211858, "grad_norm": 0.07324135430061902, "learning_rate": 1.7043534521177352e-07, "loss": 0.0101, "step": 5095 }, { "epoch": 1.8423716558206795, "grad_norm": 0.0003221966659097876, "learning_rate": 1.696624164946331e-07, "loss": 0.0, "step": 5096 }, { "epoch": 1.8427331887201737, "grad_norm": 0.6929218088371849, "learning_rate": 1.6889121413912633e-07, "loss": 0.1602, "step": 5097 }, { "epoch": 1.8430947216196674, "grad_norm": 2.710928417362207, "learning_rate": 1.6812173842087943e-07, "loss": 0.1699, "step": 5098 }, { "epoch": 1.8434562545191613, "grad_norm": 0.18301159116541146, "learning_rate": 1.67353989614904e-07, "loss": 0.0049, "step": 5099 }, { "epoch": 1.8438177874186552, "grad_norm": 0.0024479052494948564, "learning_rate": 1.665879679955923e-07, "loss": 0.0001, "step": 5100 }, { "epoch": 1.844179320318149, "grad_norm": 0.10019556289927319, "learning_rate": 1.6582367383672137e-07, "loss": 0.0161, "step": 5101 }, { "epoch": 1.8445408532176428, "grad_norm": 0.0002228822582886819, "learning_rate": 1.6506110741144886e-07, "loss": 0.0, "step": 5102 }, { "epoch": 1.8449023861171367, "grad_norm": 0.18446150004512213, "learning_rate": 1.6430026899231567e-07, "loss": 0.0143, "step": 5103 }, { "epoch": 1.8452639190166304, "grad_norm": 0.6515282525193478, "learning_rate": 1.6354115885124645e-07, "loss": 0.0352, "step": 5104 }, { "epoch": 1.8456254519161244, "grad_norm": 1.6927530685185952, "learning_rate": 1.6278377725954587e-07, "loss": 0.1699, "step": 5105 }, { "epoch": 1.8459869848156183, "grad_norm": 0.1251845528077707, "learning_rate": 1.6202812448790294e-07, "loss": 0.0161, "step": 5106 }, { "epoch": 1.846348517715112, "grad_norm": 0.7661515784040152, "learning_rate": 1.6127420080638723e-07, "loss": 0.0527, "step": 5107 }, { "epoch": 1.846710050614606, "grad_norm": 0.17024962330630788, "learning_rate": 1.6052200648445092e-07, "loss": 0.0161, "step": 5108 }, { "epoch": 1.8470715835140998, "grad_norm": 0.1450779731205042, "learning_rate": 1.5977154179092846e-07, "loss": 0.0161, "step": 5109 }, { "epoch": 1.8474331164135935, "grad_norm": 0.007093473838274545, "learning_rate": 1.5902280699403527e-07, "loss": 0.0002, "step": 5110 }, { "epoch": 1.8477946493130875, "grad_norm": 0.3706550975226956, "learning_rate": 1.5827580236136898e-07, "loss": 0.0254, "step": 5111 }, { "epoch": 1.8481561822125814, "grad_norm": 0.5601798197468173, "learning_rate": 1.5753052815990933e-07, "loss": 0.0203, "step": 5112 }, { "epoch": 1.848517715112075, "grad_norm": 2.4138710087361073, "learning_rate": 1.567869846560166e-07, "loss": 0.0476, "step": 5113 }, { "epoch": 1.848879248011569, "grad_norm": 0.14594821143369882, "learning_rate": 1.5604517211543435e-07, "loss": 0.0203, "step": 5114 }, { "epoch": 1.849240780911063, "grad_norm": 0.17527302030984884, "learning_rate": 1.5530509080328437e-07, "loss": 0.008, "step": 5115 }, { "epoch": 1.8496023138105566, "grad_norm": 0.4029410332703956, "learning_rate": 1.5456674098407287e-07, "loss": 0.0283, "step": 5116 }, { "epoch": 1.8499638467100508, "grad_norm": 0.7936365431763819, "learning_rate": 1.5383012292168597e-07, "loss": 0.1602, "step": 5117 }, { "epoch": 1.8503253796095445, "grad_norm": 1.0566542684037827, "learning_rate": 1.530952368793903e-07, "loss": 0.1406, "step": 5118 }, { "epoch": 1.8506869125090382, "grad_norm": 1.065350458142005, "learning_rate": 1.5236208311983357e-07, "loss": 0.1504, "step": 5119 }, { "epoch": 1.8510484454085323, "grad_norm": 1.2020286430095979, "learning_rate": 1.516306619050456e-07, "loss": 0.1226, "step": 5120 }, { "epoch": 1.851409978308026, "grad_norm": 0.0007329872013380135, "learning_rate": 1.5090097349643618e-07, "loss": 0.0, "step": 5121 }, { "epoch": 1.85177151120752, "grad_norm": 1.697809125392883, "learning_rate": 1.5017301815479502e-07, "loss": 0.0898, "step": 5122 }, { "epoch": 1.8521330441070138, "grad_norm": 0.5237719094529085, "learning_rate": 1.4944679614029346e-07, "loss": 0.0203, "step": 5123 }, { "epoch": 1.8524945770065075, "grad_norm": 0.3262514438282779, "learning_rate": 1.4872230771248386e-07, "loss": 0.0101, "step": 5124 }, { "epoch": 1.8528561099060015, "grad_norm": 0.23940912532004818, "learning_rate": 1.479995531302969e-07, "loss": 0.0227, "step": 5125 }, { "epoch": 1.8532176428054954, "grad_norm": 0.5524444539516916, "learning_rate": 1.472785326520465e-07, "loss": 0.0476, "step": 5126 }, { "epoch": 1.853579175704989, "grad_norm": 0.0975778214794262, "learning_rate": 1.465592465354243e-07, "loss": 0.0143, "step": 5127 }, { "epoch": 1.853940708604483, "grad_norm": 0.3899913889011248, "learning_rate": 1.4584169503750301e-07, "loss": 0.0315, "step": 5128 }, { "epoch": 1.854302241503977, "grad_norm": 0.08542061074138695, "learning_rate": 1.4512587841473524e-07, "loss": 0.0128, "step": 5129 }, { "epoch": 1.8546637744034706, "grad_norm": 0.8000328574322049, "learning_rate": 1.4441179692295416e-07, "loss": 0.1602, "step": 5130 }, { "epoch": 1.8550253073029646, "grad_norm": 1.051549255157186, "learning_rate": 1.4369945081737225e-07, "loss": 0.1226, "step": 5131 }, { "epoch": 1.8553868402024585, "grad_norm": 0.160555987271993, "learning_rate": 1.4298884035258198e-07, "loss": 0.0039, "step": 5132 }, { "epoch": 1.8557483731019522, "grad_norm": 0.8804565225602848, "learning_rate": 1.422799657825541e-07, "loss": 0.1602, "step": 5133 }, { "epoch": 1.856109906001446, "grad_norm": 0.16530750304692007, "learning_rate": 1.415728273606426e-07, "loss": 0.0049, "step": 5134 }, { "epoch": 1.85647143890094, "grad_norm": 0.00021185682185095783, "learning_rate": 1.4086742533957587e-07, "loss": 0.0, "step": 5135 }, { "epoch": 1.8568329718004337, "grad_norm": 0.20630746733069888, "learning_rate": 1.401637599714667e-07, "loss": 0.0181, "step": 5136 }, { "epoch": 1.8571945046999276, "grad_norm": 1.8495213759922335, "learning_rate": 1.3946183150780446e-07, "loss": 0.0898, "step": 5137 }, { "epoch": 1.8575560375994216, "grad_norm": 0.0009738648776981803, "learning_rate": 1.3876164019945736e-07, "loss": 0.0, "step": 5138 }, { "epoch": 1.8579175704989153, "grad_norm": 0.9369419612135085, "learning_rate": 1.3806318629667413e-07, "loss": 0.1406, "step": 5139 }, { "epoch": 1.8582791033984094, "grad_norm": 0.21858125787994198, "learning_rate": 1.3736647004908233e-07, "loss": 0.0283, "step": 5140 }, { "epoch": 1.858640636297903, "grad_norm": 0.5513897129802139, "learning_rate": 1.3667149170568771e-07, "loss": 0.0129, "step": 5141 }, { "epoch": 1.8590021691973968, "grad_norm": 0.08683547623755344, "learning_rate": 1.3597825151487553e-07, "loss": 0.0128, "step": 5142 }, { "epoch": 1.859363702096891, "grad_norm": 0.8874199345772176, "learning_rate": 1.3528674972440982e-07, "loss": 0.0476, "step": 5143 }, { "epoch": 1.8597252349963846, "grad_norm": 1.1540282695197153, "learning_rate": 1.3459698658143338e-07, "loss": 0.1226, "step": 5144 }, { "epoch": 1.8600867678958786, "grad_norm": 0.13153392337394942, "learning_rate": 1.3390896233246687e-07, "loss": 0.0063, "step": 5145 }, { "epoch": 1.8604483007953725, "grad_norm": 0.9087430179954747, "learning_rate": 1.332226772234102e-07, "loss": 0.1602, "step": 5146 }, { "epoch": 1.8608098336948662, "grad_norm": 0.6936484202421255, "learning_rate": 1.3253813149954164e-07, "loss": 0.043, "step": 5147 }, { "epoch": 1.86117136659436, "grad_norm": 0.00017666835280395234, "learning_rate": 1.318553254055177e-07, "loss": 0.0, "step": 5148 }, { "epoch": 1.861532899493854, "grad_norm": 0.3221991083542457, "learning_rate": 1.3117425918537318e-07, "loss": 0.0227, "step": 5149 }, { "epoch": 1.8618944323933477, "grad_norm": 0.3215152932643398, "learning_rate": 1.304949330825206e-07, "loss": 0.0145, "step": 5150 }, { "epoch": 1.8622559652928417, "grad_norm": 0.02305132952557259, "learning_rate": 1.2981734733975237e-07, "loss": 0.0008, "step": 5151 }, { "epoch": 1.8626174981923356, "grad_norm": 1.2448487582696834, "learning_rate": 1.2914150219923483e-07, "loss": 0.0898, "step": 5152 }, { "epoch": 1.8629790310918293, "grad_norm": 0.005214456012764186, "learning_rate": 1.284673979025175e-07, "loss": 0.0001, "step": 5153 }, { "epoch": 1.8633405639913232, "grad_norm": 0.044955214779740245, "learning_rate": 1.2779503469052434e-07, "loss": 0.0015, "step": 5154 }, { "epoch": 1.8637020968908171, "grad_norm": 0.09791679618442849, "learning_rate": 1.2712441280355647e-07, "loss": 0.0128, "step": 5155 }, { "epoch": 1.8640636297903108, "grad_norm": 0.12654893279168536, "learning_rate": 1.264555324812955e-07, "loss": 0.0161, "step": 5156 }, { "epoch": 1.8644251626898047, "grad_norm": 0.10240305646743295, "learning_rate": 1.2578839396279907e-07, "loss": 0.0128, "step": 5157 }, { "epoch": 1.8647866955892987, "grad_norm": 0.4067694529435368, "learning_rate": 1.2512299748650202e-07, "loss": 0.0227, "step": 5158 }, { "epoch": 1.8651482284887924, "grad_norm": 0.11339703048953499, "learning_rate": 1.2445934329021637e-07, "loss": 0.0181, "step": 5159 }, { "epoch": 1.8655097613882863, "grad_norm": 1.2013937423886045, "learning_rate": 1.2379743161113235e-07, "loss": 0.0527, "step": 5160 }, { "epoch": 1.8658712942877802, "grad_norm": 0.0076943477499528, "learning_rate": 1.231372626858185e-07, "loss": 0.0003, "step": 5161 }, { "epoch": 1.866232827187274, "grad_norm": 0.423162686809153, "learning_rate": 1.224788367502161e-07, "loss": 0.0388, "step": 5162 }, { "epoch": 1.866594360086768, "grad_norm": 0.18180257304869527, "learning_rate": 1.2182215403964916e-07, "loss": 0.0181, "step": 5163 }, { "epoch": 1.8669558929862617, "grad_norm": 0.1831411859614502, "learning_rate": 1.211672147888149e-07, "loss": 0.009, "step": 5164 }, { "epoch": 1.8673174258857554, "grad_norm": 0.00010881148835117849, "learning_rate": 1.2051401923178774e-07, "loss": 0.0, "step": 5165 }, { "epoch": 1.8676789587852496, "grad_norm": 3.0195357539459313, "learning_rate": 1.1986256760202097e-07, "loss": 0.2812, "step": 5166 }, { "epoch": 1.8680404916847433, "grad_norm": 1.279953027621004, "learning_rate": 1.1921286013234225e-07, "loss": 0.0693, "step": 5167 }, { "epoch": 1.8684020245842372, "grad_norm": 0.08648166514653391, "learning_rate": 1.1856489705495744e-07, "loss": 0.0128, "step": 5168 }, { "epoch": 1.8687635574837311, "grad_norm": 0.6975309714275851, "learning_rate": 1.1791867860144801e-07, "loss": 0.1699, "step": 5169 }, { "epoch": 1.8691250903832248, "grad_norm": 0.10943068603901468, "learning_rate": 1.1727420500277254e-07, "loss": 0.0181, "step": 5170 }, { "epoch": 1.8694866232827188, "grad_norm": 0.7915918285746805, "learning_rate": 1.1663147648926676e-07, "loss": 0.1602, "step": 5171 }, { "epoch": 1.8698481561822127, "grad_norm": 0.1332369820154938, "learning_rate": 1.1599049329063916e-07, "loss": 0.0181, "step": 5172 }, { "epoch": 1.8702096890817064, "grad_norm": 0.985392120216551, "learning_rate": 1.1535125563597927e-07, "loss": 0.1406, "step": 5173 }, { "epoch": 1.8705712219812003, "grad_norm": 0.0002334722476648782, "learning_rate": 1.1471376375375043e-07, "loss": 0.0, "step": 5174 }, { "epoch": 1.8709327548806942, "grad_norm": 0.11830463653629948, "learning_rate": 1.140780178717904e-07, "loss": 0.0161, "step": 5175 }, { "epoch": 1.871294287780188, "grad_norm": 0.010703153050903195, "learning_rate": 1.1344401821731633e-07, "loss": 0.0003, "step": 5176 }, { "epoch": 1.8716558206796818, "grad_norm": 0.5951803899426551, "learning_rate": 1.128117650169186e-07, "loss": 0.0388, "step": 5177 }, { "epoch": 1.8720173535791758, "grad_norm": 0.14859624875158575, "learning_rate": 1.1218125849656425e-07, "loss": 0.0161, "step": 5178 }, { "epoch": 1.8723788864786695, "grad_norm": 1.5653204859221543, "learning_rate": 1.1155249888159748e-07, "loss": 0.0977, "step": 5179 }, { "epoch": 1.8727404193781634, "grad_norm": 0.6981964884426086, "learning_rate": 1.1092548639673517e-07, "loss": 0.1699, "step": 5180 }, { "epoch": 1.8731019522776573, "grad_norm": 0.2676168238190625, "learning_rate": 1.1030022126607254e-07, "loss": 0.0254, "step": 5181 }, { "epoch": 1.873463485177151, "grad_norm": 0.1294126219132581, "learning_rate": 1.096767037130786e-07, "loss": 0.0161, "step": 5182 }, { "epoch": 1.873825018076645, "grad_norm": 0.0596571336480471, "learning_rate": 1.0905493396059896e-07, "loss": 0.0019, "step": 5183 }, { "epoch": 1.8741865509761388, "grad_norm": 0.9986317833373741, "learning_rate": 1.0843491223085478e-07, "loss": 0.1406, "step": 5184 }, { "epoch": 1.8745480838756325, "grad_norm": 0.38325060303858277, "learning_rate": 1.078166387454388e-07, "loss": 0.0315, "step": 5185 }, { "epoch": 1.8749096167751267, "grad_norm": 0.23320063506668218, "learning_rate": 1.0720011372532479e-07, "loss": 0.0283, "step": 5186 }, { "epoch": 1.8752711496746204, "grad_norm": 0.33412407396685895, "learning_rate": 1.0658533739085764e-07, "loss": 0.0102, "step": 5187 }, { "epoch": 1.875632682574114, "grad_norm": 0.11969697528476243, "learning_rate": 1.0597230996175767e-07, "loss": 0.0203, "step": 5188 }, { "epoch": 1.8759942154736082, "grad_norm": 0.409527004578534, "learning_rate": 1.0536103165712185e-07, "loss": 0.0315, "step": 5189 }, { "epoch": 1.876355748373102, "grad_norm": 0.00012524101520499322, "learning_rate": 1.0475150269541933e-07, "loss": 0.0, "step": 5190 }, { "epoch": 1.8767172812725958, "grad_norm": 0.005382554524339595, "learning_rate": 1.0414372329449807e-07, "loss": 0.0002, "step": 5191 }, { "epoch": 1.8770788141720898, "grad_norm": 0.5386210971632046, "learning_rate": 1.035376936715754e-07, "loss": 0.0254, "step": 5192 }, { "epoch": 1.8774403470715835, "grad_norm": 0.4991783645035857, "learning_rate": 1.0293341404324863e-07, "loss": 0.0432, "step": 5193 }, { "epoch": 1.8778018799710774, "grad_norm": 0.45248413878223903, "learning_rate": 1.0233088462548613e-07, "loss": 0.0203, "step": 5194 }, { "epoch": 1.8781634128705713, "grad_norm": 0.6783247684658564, "learning_rate": 1.0173010563363173e-07, "loss": 0.0203, "step": 5195 }, { "epoch": 1.878524945770065, "grad_norm": 0.6290946542284519, "learning_rate": 1.0113107728240368e-07, "loss": 0.0388, "step": 5196 }, { "epoch": 1.878886478669559, "grad_norm": 0.06309127724343117, "learning_rate": 1.0053379978589517e-07, "loss": 0.0024, "step": 5197 }, { "epoch": 1.8792480115690529, "grad_norm": 0.4826169908552445, "learning_rate": 9.993827335757267e-08, "loss": 0.0203, "step": 5198 }, { "epoch": 1.8796095444685466, "grad_norm": 0.6679871482499316, "learning_rate": 9.934449821027647e-08, "loss": 0.0254, "step": 5199 }, { "epoch": 1.8799710773680405, "grad_norm": 0.00019440936844837605, "learning_rate": 9.875247455622239e-08, "loss": 0.0, "step": 5200 }, { "epoch": 1.8803326102675344, "grad_norm": 0.001258919648476272, "learning_rate": 9.81622026070006e-08, "loss": 0.0, "step": 5201 }, { "epoch": 1.880694143167028, "grad_norm": 0.046253368647975926, "learning_rate": 9.757368257357237e-08, "loss": 0.0011, "step": 5202 }, { "epoch": 1.881055676066522, "grad_norm": 0.13430283825703754, "learning_rate": 9.698691466627552e-08, "loss": 0.0143, "step": 5203 }, { "epoch": 1.881417208966016, "grad_norm": 0.35171814249750327, "learning_rate": 9.640189909482068e-08, "loss": 0.0101, "step": 5204 }, { "epoch": 1.8817787418655096, "grad_norm": 0.2555029554812324, "learning_rate": 9.581863606829223e-08, "loss": 0.0254, "step": 5205 }, { "epoch": 1.8821402747650036, "grad_norm": 0.7726407249265141, "learning_rate": 9.523712579514788e-08, "loss": 0.0352, "step": 5206 }, { "epoch": 1.8825018076644975, "grad_norm": 1.1475276250124427, "learning_rate": 9.465736848322026e-08, "loss": 0.0527, "step": 5207 }, { "epoch": 1.8828633405639912, "grad_norm": 0.14830605640351482, "learning_rate": 9.407936433971366e-08, "loss": 0.0203, "step": 5208 }, { "epoch": 1.8832248734634853, "grad_norm": 0.22365905815341863, "learning_rate": 9.350311357120723e-08, "loss": 0.0203, "step": 5209 }, { "epoch": 1.883586406362979, "grad_norm": 0.16558907911231902, "learning_rate": 9.29286163836518e-08, "loss": 0.0042, "step": 5210 }, { "epoch": 1.8839479392624727, "grad_norm": 0.2527264556756529, "learning_rate": 9.23558729823737e-08, "loss": 0.0283, "step": 5211 }, { "epoch": 1.8843094721619669, "grad_norm": 0.7721910557065517, "learning_rate": 9.178488357207083e-08, "loss": 0.2129, "step": 5212 }, { "epoch": 1.8846710050614606, "grad_norm": 0.7348006799475298, "learning_rate": 9.121564835681495e-08, "loss": 0.0388, "step": 5213 }, { "epoch": 1.8850325379609545, "grad_norm": 2.027974721925593, "learning_rate": 9.064816754005001e-08, "loss": 0.1602, "step": 5214 }, { "epoch": 1.8853940708604484, "grad_norm": 0.09902255752721457, "learning_rate": 9.008244132459431e-08, "loss": 0.0016, "step": 5215 }, { "epoch": 1.8857556037599421, "grad_norm": 0.7882002862544496, "learning_rate": 8.95184699126378e-08, "loss": 0.1504, "step": 5216 }, { "epoch": 1.886117136659436, "grad_norm": 0.11355918335595858, "learning_rate": 8.895625350574422e-08, "loss": 0.0143, "step": 5217 }, { "epoch": 1.88647866955893, "grad_norm": 1.2442190605849732, "learning_rate": 8.839579230484841e-08, "loss": 0.1055, "step": 5218 }, { "epoch": 1.8868402024584237, "grad_norm": 0.44755135400218543, "learning_rate": 8.783708651026013e-08, "loss": 0.0476, "step": 5219 }, { "epoch": 1.8872017353579176, "grad_norm": 1.2116152664564315, "learning_rate": 8.728013632166076e-08, "loss": 0.0762, "step": 5220 }, { "epoch": 1.8875632682574115, "grad_norm": 0.0010710025275845506, "learning_rate": 8.672494193810388e-08, "loss": 0.0, "step": 5221 }, { "epoch": 1.8879248011569052, "grad_norm": 0.02892190043336691, "learning_rate": 8.617150355801573e-08, "loss": 0.0013, "step": 5222 }, { "epoch": 1.8882863340563991, "grad_norm": 0.8907073540807469, "learning_rate": 8.561982137919478e-08, "loss": 0.0315, "step": 5223 }, { "epoch": 1.888647866955893, "grad_norm": 0.22053962272215283, "learning_rate": 8.506989559881329e-08, "loss": 0.0181, "step": 5224 }, { "epoch": 1.8890093998553867, "grad_norm": 0.11481938725905523, "learning_rate": 8.45217264134135e-08, "loss": 0.0181, "step": 5225 }, { "epoch": 1.8893709327548807, "grad_norm": 0.00015764681936338796, "learning_rate": 8.397531401891146e-08, "loss": 0.0, "step": 5226 }, { "epoch": 1.8897324656543746, "grad_norm": 0.18943572164000835, "learning_rate": 8.343065861059485e-08, "loss": 0.0056, "step": 5227 }, { "epoch": 1.8900939985538683, "grad_norm": 0.12245878467064958, "learning_rate": 8.288776038312297e-08, "loss": 0.0161, "step": 5228 }, { "epoch": 1.8904555314533622, "grad_norm": 0.3929787831097791, "learning_rate": 8.234661953052725e-08, "loss": 0.0349, "step": 5229 }, { "epoch": 1.8908170643528561, "grad_norm": 1.0872207879562539, "learning_rate": 8.180723624621245e-08, "loss": 0.1699, "step": 5230 }, { "epoch": 1.8911785972523498, "grad_norm": 0.11169061414521633, "learning_rate": 8.126961072295436e-08, "loss": 0.0161, "step": 5231 }, { "epoch": 1.891540130151844, "grad_norm": 0.10682636622766942, "learning_rate": 8.073374315289816e-08, "loss": 0.0044, "step": 5232 }, { "epoch": 1.8919016630513377, "grad_norm": 0.2242899335027498, "learning_rate": 8.019963372756456e-08, "loss": 0.0203, "step": 5233 }, { "epoch": 1.8922631959508314, "grad_norm": 1.0488664567662533, "learning_rate": 7.966728263784307e-08, "loss": 0.083, "step": 5234 }, { "epoch": 1.8926247288503255, "grad_norm": 0.0002469467874620169, "learning_rate": 7.913669007399705e-08, "loss": 0.0, "step": 5235 }, { "epoch": 1.8929862617498192, "grad_norm": 1.8711234428816428, "learning_rate": 7.860785622565926e-08, "loss": 0.083, "step": 5236 }, { "epoch": 1.8933477946493131, "grad_norm": 0.7270398223576383, "learning_rate": 7.808078128183516e-08, "loss": 0.1914, "step": 5237 }, { "epoch": 1.893709327548807, "grad_norm": 0.2554731709611894, "learning_rate": 7.755546543090076e-08, "loss": 0.0254, "step": 5238 }, { "epoch": 1.8940708604483008, "grad_norm": 0.1319920608555448, "learning_rate": 7.703190886060418e-08, "loss": 0.0181, "step": 5239 }, { "epoch": 1.8944323933477947, "grad_norm": 0.4133146072278172, "learning_rate": 7.651011175806411e-08, "loss": 0.0349, "step": 5240 }, { "epoch": 1.8947939262472886, "grad_norm": 0.36157435753069433, "learning_rate": 7.599007430977245e-08, "loss": 0.0315, "step": 5241 }, { "epoch": 1.8951554591467823, "grad_norm": 0.1782735607053291, "learning_rate": 7.547179670158777e-08, "loss": 0.0203, "step": 5242 }, { "epoch": 1.8955169920462762, "grad_norm": 1.2548391354011508, "learning_rate": 7.495527911874411e-08, "loss": 0.1406, "step": 5243 }, { "epoch": 1.8958785249457701, "grad_norm": 1.383426190713084, "learning_rate": 7.444052174584437e-08, "loss": 0.0635, "step": 5244 }, { "epoch": 1.8962400578452638, "grad_norm": 2.886474510926369, "learning_rate": 7.392752476686249e-08, "loss": 0.1055, "step": 5245 }, { "epoch": 1.8966015907447578, "grad_norm": 0.001972035829314649, "learning_rate": 7.341628836514348e-08, "loss": 0.0001, "step": 5246 }, { "epoch": 1.8969631236442517, "grad_norm": 1.3408615600963893, "learning_rate": 7.290681272340339e-08, "loss": 0.1226, "step": 5247 }, { "epoch": 1.8973246565437454, "grad_norm": 0.23159807354156306, "learning_rate": 7.239909802372935e-08, "loss": 0.0181, "step": 5248 }, { "epoch": 1.8976861894432393, "grad_norm": 0.18717914351384105, "learning_rate": 7.189314444757678e-08, "loss": 0.0161, "step": 5249 }, { "epoch": 1.8980477223427332, "grad_norm": 0.41001789135623895, "learning_rate": 7.13889521757749e-08, "loss": 0.0129, "step": 5250 }, { "epoch": 1.898409255242227, "grad_norm": 0.0883639459071222, "learning_rate": 7.088652138852181e-08, "loss": 0.0128, "step": 5251 }, { "epoch": 1.8987707881417208, "grad_norm": 0.01543529966691766, "learning_rate": 7.038585226538441e-08, "loss": 0.0004, "step": 5252 }, { "epoch": 1.8991323210412148, "grad_norm": 0.9617228524994159, "learning_rate": 6.988694498530401e-08, "loss": 0.0432, "step": 5253 }, { "epoch": 1.8994938539407085, "grad_norm": 0.019809787906396602, "learning_rate": 6.93897997265891e-08, "loss": 0.0004, "step": 5254 }, { "epoch": 1.8998553868402026, "grad_norm": 1.1958245062528081, "learning_rate": 6.889441666691865e-08, "loss": 0.0977, "step": 5255 }, { "epoch": 1.9002169197396963, "grad_norm": 0.37586297964213955, "learning_rate": 6.840079598334382e-08, "loss": 0.0315, "step": 5256 }, { "epoch": 1.90057845263919, "grad_norm": 0.0002502652430934489, "learning_rate": 6.790893785228292e-08, "loss": 0.0, "step": 5257 }, { "epoch": 1.9009399855386842, "grad_norm": 0.6490055592113665, "learning_rate": 6.741884244952757e-08, "loss": 0.0432, "step": 5258 }, { "epoch": 1.9013015184381779, "grad_norm": 1.1048011158974143, "learning_rate": 6.693050995023598e-08, "loss": 0.0757, "step": 5259 }, { "epoch": 1.9016630513376718, "grad_norm": 0.005324516772624901, "learning_rate": 6.644394052893965e-08, "loss": 0.0002, "step": 5260 }, { "epoch": 1.9020245842371657, "grad_norm": 0.7334441366455597, "learning_rate": 6.595913435953838e-08, "loss": 0.0128, "step": 5261 }, { "epoch": 1.9023861171366594, "grad_norm": 0.8521733952421549, "learning_rate": 6.547609161530021e-08, "loss": 0.0977, "step": 5262 }, { "epoch": 1.9027476500361533, "grad_norm": 0.6035134345890436, "learning_rate": 6.499481246886597e-08, "loss": 0.043, "step": 5263 }, { "epoch": 1.9031091829356472, "grad_norm": 0.10686301590220249, "learning_rate": 6.451529709224414e-08, "loss": 0.0143, "step": 5264 }, { "epoch": 1.903470715835141, "grad_norm": 0.014200665942316901, "learning_rate": 6.403754565681375e-08, "loss": 0.0005, "step": 5265 }, { "epoch": 1.9038322487346349, "grad_norm": 0.2410308864647396, "learning_rate": 6.356155833332267e-08, "loss": 0.0049, "step": 5266 }, { "epoch": 1.9041937816341288, "grad_norm": 0.10107685270207663, "learning_rate": 6.308733529188815e-08, "loss": 0.0161, "step": 5267 }, { "epoch": 1.9045553145336225, "grad_norm": 0.010238531118848926, "learning_rate": 6.261487670199962e-08, "loss": 0.0004, "step": 5268 }, { "epoch": 1.9049168474331164, "grad_norm": 0.3095917293865779, "learning_rate": 6.214418273251088e-08, "loss": 0.0315, "step": 5269 }, { "epoch": 1.9052783803326103, "grad_norm": 0.5644254937077092, "learning_rate": 6.16752535516496e-08, "loss": 0.0315, "step": 5270 }, { "epoch": 1.905639913232104, "grad_norm": 0.26253889076188, "learning_rate": 6.120808932701117e-08, "loss": 0.0254, "step": 5271 }, { "epoch": 1.906001446131598, "grad_norm": 1.5578539659113013, "learning_rate": 6.074269022555867e-08, "loss": 0.083, "step": 5272 }, { "epoch": 1.9063629790310919, "grad_norm": 0.3070171414661627, "learning_rate": 6.02790564136263e-08, "loss": 0.0283, "step": 5273 }, { "epoch": 1.9067245119305856, "grad_norm": 0.3517136072188574, "learning_rate": 5.98171880569165e-08, "loss": 0.0315, "step": 5274 }, { "epoch": 1.9070860448300795, "grad_norm": 0.0002806798603125065, "learning_rate": 5.935708532050166e-08, "loss": 0.0, "step": 5275 }, { "epoch": 1.9074475777295734, "grad_norm": 0.4595553335421589, "learning_rate": 5.889874836882137e-08, "loss": 0.0349, "step": 5276 }, { "epoch": 1.907809110629067, "grad_norm": 0.22775882864709135, "learning_rate": 5.844217736568569e-08, "loss": 0.0227, "step": 5277 }, { "epoch": 1.9081706435285612, "grad_norm": 0.11912090688973741, "learning_rate": 5.7987372474273554e-08, "loss": 0.0161, "step": 5278 }, { "epoch": 1.908532176428055, "grad_norm": 0.016524957968880122, "learning_rate": 5.753433385713103e-08, "loss": 0.0006, "step": 5279 }, { "epoch": 1.9088937093275486, "grad_norm": 0.01470833076513571, "learning_rate": 5.708306167617472e-08, "loss": 0.0004, "step": 5280 }, { "epoch": 1.9092552422270428, "grad_norm": 1.0379873571581777, "learning_rate": 5.663355609268894e-08, "loss": 0.1055, "step": 5281 }, { "epoch": 1.9096167751265365, "grad_norm": 0.38940367927860553, "learning_rate": 5.6185817267326856e-08, "loss": 0.0349, "step": 5282 }, { "epoch": 1.9099783080260304, "grad_norm": 0.2742620608340407, "learning_rate": 5.5739845360110454e-08, "loss": 0.0254, "step": 5283 }, { "epoch": 1.9103398409255243, "grad_norm": 0.0013917192849985964, "learning_rate": 5.5295640530430016e-08, "loss": 0.0, "step": 5284 }, { "epoch": 1.910701373825018, "grad_norm": 3.551630515555177, "learning_rate": 5.48532029370441e-08, "loss": 0.1602, "step": 5285 }, { "epoch": 1.911062906724512, "grad_norm": 0.4996449164814753, "learning_rate": 5.441253273807956e-08, "loss": 0.0254, "step": 5286 }, { "epoch": 1.9114244396240059, "grad_norm": 0.3456289382036746, "learning_rate": 5.397363009103207e-08, "loss": 0.0315, "step": 5287 }, { "epoch": 1.9117859725234996, "grad_norm": 1.3416148122522165, "learning_rate": 5.35364951527656e-08, "loss": 0.1226, "step": 5288 }, { "epoch": 1.9121475054229935, "grad_norm": 5.320985926985352, "learning_rate": 5.3101128079511844e-08, "loss": 0.2129, "step": 5289 }, { "epoch": 1.9125090383224874, "grad_norm": 1.529581867161848, "learning_rate": 5.266752902687078e-08, "loss": 0.0762, "step": 5290 }, { "epoch": 1.9128705712219811, "grad_norm": 2.6695635795764865, "learning_rate": 5.223569814981066e-08, "loss": 0.0476, "step": 5291 }, { "epoch": 1.913232104121475, "grad_norm": 0.010019503951760817, "learning_rate": 5.18056356026686e-08, "loss": 0.0003, "step": 5292 }, { "epoch": 1.913593637020969, "grad_norm": 0.023764544693704315, "learning_rate": 5.1377341539147195e-08, "loss": 0.0007, "step": 5293 }, { "epoch": 1.9139551699204627, "grad_norm": 0.17778904755886898, "learning_rate": 5.0950816112320114e-08, "loss": 0.0161, "step": 5294 }, { "epoch": 1.9143167028199566, "grad_norm": 0.02080591610849619, "learning_rate": 5.052605947462708e-08, "loss": 0.0007, "step": 5295 }, { "epoch": 1.9146782357194505, "grad_norm": 0.6968817683051256, "learning_rate": 5.010307177787499e-08, "loss": 0.0476, "step": 5296 }, { "epoch": 1.9150397686189442, "grad_norm": 0.1659549819552528, "learning_rate": 4.968185317324126e-08, "loss": 0.0203, "step": 5297 }, { "epoch": 1.9154013015184381, "grad_norm": 0.010717403527075845, "learning_rate": 4.926240381126823e-08, "loss": 0.0005, "step": 5298 }, { "epoch": 1.915762834417932, "grad_norm": 0.4512791167373778, "learning_rate": 4.884472384186712e-08, "loss": 0.0349, "step": 5299 }, { "epoch": 1.9161243673174257, "grad_norm": 0.18166940372352589, "learning_rate": 4.842881341431682e-08, "loss": 0.0181, "step": 5300 }, { "epoch": 1.91648590021692, "grad_norm": 1.0330598078199333, "learning_rate": 4.8014672677264005e-08, "loss": 0.1504, "step": 5301 }, { "epoch": 1.9168474331164136, "grad_norm": 0.7379725276167292, "learning_rate": 4.7602301778721935e-08, "loss": 0.1807, "step": 5302 }, { "epoch": 1.9172089660159073, "grad_norm": 0.001793944873135086, "learning_rate": 4.719170086607161e-08, "loss": 0.0001, "step": 5303 }, { "epoch": 1.9175704989154014, "grad_norm": 1.3842265365712063, "learning_rate": 4.6782870086062856e-08, "loss": 0.0476, "step": 5304 }, { "epoch": 1.9179320318148951, "grad_norm": 0.27994342372271563, "learning_rate": 4.637580958481047e-08, "loss": 0.0283, "step": 5305 }, { "epoch": 1.918293564714389, "grad_norm": 0.7583441578785795, "learning_rate": 4.5970519507798076e-08, "loss": 0.0579, "step": 5306 }, { "epoch": 1.918655097613883, "grad_norm": 0.020603170487931272, "learning_rate": 4.5566999999877015e-08, "loss": 0.0008, "step": 5307 }, { "epoch": 1.9190166305133767, "grad_norm": 0.7760471860475596, "learning_rate": 4.516525120526527e-08, "loss": 0.1309, "step": 5308 }, { "epoch": 1.9193781634128706, "grad_norm": 0.00016950307135700103, "learning_rate": 4.476527326754576e-08, "loss": 0.0, "step": 5309 }, { "epoch": 1.9197396963123645, "grad_norm": 1.246720271623761, "learning_rate": 4.436706632967247e-08, "loss": 0.1143, "step": 5310 }, { "epoch": 1.9201012292118582, "grad_norm": 0.015863789727503994, "learning_rate": 4.397063053396378e-08, "loss": 0.0001, "step": 5311 }, { "epoch": 1.9204627621113521, "grad_norm": 0.21257927739319496, "learning_rate": 4.357596602210579e-08, "loss": 0.0203, "step": 5312 }, { "epoch": 1.920824295010846, "grad_norm": 0.16942288919749166, "learning_rate": 4.318307293515178e-08, "loss": 0.0203, "step": 5313 }, { "epoch": 1.9211858279103398, "grad_norm": 0.8364916855145754, "learning_rate": 4.2791951413521106e-08, "loss": 0.1807, "step": 5314 }, { "epoch": 1.9215473608098337, "grad_norm": 0.004277405981713332, "learning_rate": 4.240260159700138e-08, "loss": 0.0001, "step": 5315 }, { "epoch": 1.9219088937093276, "grad_norm": 0.11307208306142137, "learning_rate": 4.2015023624744635e-08, "loss": 0.0161, "step": 5316 }, { "epoch": 1.9222704266088213, "grad_norm": 0.1784117045574033, "learning_rate": 4.162921763527283e-08, "loss": 0.0203, "step": 5317 }, { "epoch": 1.9226319595083152, "grad_norm": 0.05633572344375067, "learning_rate": 4.124518376647235e-08, "loss": 0.0003, "step": 5318 }, { "epoch": 1.9229934924078091, "grad_norm": 1.211861353556137, "learning_rate": 4.0862922155596175e-08, "loss": 0.0388, "step": 5319 }, { "epoch": 1.9233550253073028, "grad_norm": 1.1083008889538495, "learning_rate": 4.048243293926501e-08, "loss": 0.1309, "step": 5320 }, { "epoch": 1.9237165582067968, "grad_norm": 1.47952181904649, "learning_rate": 4.010371625346621e-08, "loss": 0.0432, "step": 5321 }, { "epoch": 1.9240780911062907, "grad_norm": 0.014559712318625266, "learning_rate": 3.972677223355203e-08, "loss": 0.0004, "step": 5322 }, { "epoch": 1.9244396240057844, "grad_norm": 0.0005925947127066831, "learning_rate": 3.935160101424251e-08, "loss": 0.0, "step": 5323 }, { "epoch": 1.9248011569052785, "grad_norm": 0.007290902522448779, "learning_rate": 3.897820272962427e-08, "loss": 0.0003, "step": 5324 }, { "epoch": 1.9251626898047722, "grad_norm": 0.15153460377136752, "learning_rate": 3.8606577513149444e-08, "loss": 0.0128, "step": 5325 }, { "epoch": 1.925524222704266, "grad_norm": 0.9942987936664481, "learning_rate": 3.823672549763624e-08, "loss": 0.1406, "step": 5326 }, { "epoch": 1.92588575560376, "grad_norm": 0.22400201040398726, "learning_rate": 3.786864681527114e-08, "loss": 0.0254, "step": 5327 }, { "epoch": 1.9262472885032538, "grad_norm": 0.0016752509097442139, "learning_rate": 3.7502341597603906e-08, "loss": 0.0001, "step": 5328 }, { "epoch": 1.9266088214027477, "grad_norm": 0.010210649956956733, "learning_rate": 3.713780997555205e-08, "loss": 0.0003, "step": 5329 }, { "epoch": 1.9269703543022416, "grad_norm": 0.7850845709207689, "learning_rate": 3.677505207940024e-08, "loss": 0.0579, "step": 5330 }, { "epoch": 1.9273318872017353, "grad_norm": 1.2774106429490408, "learning_rate": 3.641406803879754e-08, "loss": 0.0977, "step": 5331 }, { "epoch": 1.9276934201012292, "grad_norm": 0.21429683925059587, "learning_rate": 3.605485798275854e-08, "loss": 0.0021, "step": 5332 }, { "epoch": 1.9280549530007232, "grad_norm": 0.2523702907318975, "learning_rate": 3.569742203966664e-08, "loss": 0.0227, "step": 5333 }, { "epoch": 1.9284164859002169, "grad_norm": 0.0002922136035640457, "learning_rate": 3.5341760337267995e-08, "loss": 0.0, "step": 5334 }, { "epoch": 1.9287780187997108, "grad_norm": 0.040684992081537454, "learning_rate": 3.4987873002676476e-08, "loss": 0.0016, "step": 5335 }, { "epoch": 1.9291395516992047, "grad_norm": 0.24171998953660886, "learning_rate": 3.463576016237091e-08, "loss": 0.0254, "step": 5336 }, { "epoch": 1.9295010845986984, "grad_norm": 0.001532507719808761, "learning_rate": 3.428542194219731e-08, "loss": 0.0001, "step": 5337 }, { "epoch": 1.9298626174981923, "grad_norm": 0.29153535112789514, "learning_rate": 3.3936858467366055e-08, "loss": 0.0283, "step": 5338 }, { "epoch": 1.9302241503976862, "grad_norm": 2.0396861819049303, "learning_rate": 3.3590069862453054e-08, "loss": 0.0693, "step": 5339 }, { "epoch": 1.93058568329718, "grad_norm": 0.00017212975777071483, "learning_rate": 3.32450562514014e-08, "loss": 0.0, "step": 5340 }, { "epoch": 1.9309472161966739, "grad_norm": 0.00031917274713574444, "learning_rate": 3.290181775751855e-08, "loss": 0.0, "step": 5341 }, { "epoch": 1.9313087490961678, "grad_norm": 1.4297492837005619, "learning_rate": 3.25603545034775e-08, "loss": 0.0347, "step": 5342 }, { "epoch": 1.9316702819956615, "grad_norm": 0.19657502509957506, "learning_rate": 3.2220666611317284e-08, "loss": 0.0203, "step": 5343 }, { "epoch": 1.9320318148951554, "grad_norm": 3.8565203095839964, "learning_rate": 3.1882754202442465e-08, "loss": 0.416, "step": 5344 }, { "epoch": 1.9323933477946493, "grad_norm": 0.6098855030778256, "learning_rate": 3.1546617397623105e-08, "loss": 0.0283, "step": 5345 }, { "epoch": 1.932754880694143, "grad_norm": 0.08126296244229883, "learning_rate": 3.121225631699365e-08, "loss": 0.0128, "step": 5346 }, { "epoch": 1.9331164135936372, "grad_norm": 0.36367464610783146, "learning_rate": 3.087967108005574e-08, "loss": 0.0283, "step": 5347 }, { "epoch": 1.9334779464931309, "grad_norm": 0.10234555394506495, "learning_rate": 3.0548861805674846e-08, "loss": 0.0161, "step": 5348 }, { "epoch": 1.9338394793926246, "grad_norm": 0.18438343022023965, "learning_rate": 3.021982861208139e-08, "loss": 0.0044, "step": 5349 }, { "epoch": 1.9342010122921187, "grad_norm": 0.9503930670806954, "learning_rate": 2.9892571616872424e-08, "loss": 0.1807, "step": 5350 }, { "epoch": 1.9345625451916124, "grad_norm": 0.008945896003385235, "learning_rate": 2.9567090937009934e-08, "loss": 0.0003, "step": 5351 }, { "epoch": 1.9349240780911063, "grad_norm": 0.01000384542428427, "learning_rate": 2.9243386688819762e-08, "loss": 0.0005, "step": 5352 }, { "epoch": 1.9352856109906003, "grad_norm": 0.2497161575068837, "learning_rate": 2.8921458987994366e-08, "loss": 0.0254, "step": 5353 }, { "epoch": 1.935647143890094, "grad_norm": 0.8274235492146294, "learning_rate": 2.860130794959004e-08, "loss": 0.0476, "step": 5354 }, { "epoch": 1.9360086767895879, "grad_norm": 0.15734942674518626, "learning_rate": 2.82829336880297e-08, "loss": 0.0227, "step": 5355 }, { "epoch": 1.9363702096890818, "grad_norm": 0.9537224562647797, "learning_rate": 2.796633631709955e-08, "loss": 0.0476, "step": 5356 }, { "epoch": 1.9367317425885755, "grad_norm": 0.12668422666307894, "learning_rate": 2.76515159499513e-08, "loss": 0.0181, "step": 5357 }, { "epoch": 1.9370932754880694, "grad_norm": 0.052356359372569626, "learning_rate": 2.7338472699101616e-08, "loss": 0.0024, "step": 5358 }, { "epoch": 1.9374548083875633, "grad_norm": 0.006198340632086754, "learning_rate": 2.702720667643266e-08, "loss": 0.0002, "step": 5359 }, { "epoch": 1.937816341287057, "grad_norm": 0.14565507627810043, "learning_rate": 2.6717717993191007e-08, "loss": 0.0181, "step": 5360 }, { "epoch": 1.938177874186551, "grad_norm": 0.2138118701485895, "learning_rate": 2.6410006759986506e-08, "loss": 0.0227, "step": 5361 }, { "epoch": 1.9385394070860449, "grad_norm": 0.09014200439392064, "learning_rate": 2.6104073086796743e-08, "loss": 0.0143, "step": 5362 }, { "epoch": 1.9389009399855386, "grad_norm": 0.0002507950472928596, "learning_rate": 2.5799917082960924e-08, "loss": 0.0, "step": 5363 }, { "epoch": 1.9392624728850325, "grad_norm": 0.0045412838567748266, "learning_rate": 2.549753885718542e-08, "loss": 0.0001, "step": 5364 }, { "epoch": 1.9396240057845264, "grad_norm": 0.00027543610691906306, "learning_rate": 2.5196938517539348e-08, "loss": 0.0, "step": 5365 }, { "epoch": 1.9399855386840201, "grad_norm": 0.2814550108577519, "learning_rate": 2.4898116171457875e-08, "loss": 0.0283, "step": 5366 }, { "epoch": 1.940347071583514, "grad_norm": 0.02636165082047011, "learning_rate": 2.4601071925739463e-08, "loss": 0.0009, "step": 5367 }, { "epoch": 1.940708604483008, "grad_norm": 0.00019620640044123164, "learning_rate": 2.4305805886548073e-08, "loss": 0.0, "step": 5368 }, { "epoch": 1.9410701373825017, "grad_norm": 1.0744041041057586, "learning_rate": 2.4012318159412073e-08, "loss": 0.0579, "step": 5369 }, { "epoch": 1.9414316702819958, "grad_norm": 0.3759903157539298, "learning_rate": 2.3720608849223115e-08, "loss": 0.0315, "step": 5370 }, { "epoch": 1.9417932031814895, "grad_norm": 0.24679591861112332, "learning_rate": 2.34306780602378e-08, "loss": 0.0071, "step": 5371 }, { "epoch": 1.9421547360809832, "grad_norm": 0.6216064942033954, "learning_rate": 2.3142525896078794e-08, "loss": 0.0114, "step": 5372 }, { "epoch": 1.9425162689804774, "grad_norm": 0.04720299936257582, "learning_rate": 2.285615245972983e-08, "loss": 0.0013, "step": 5373 }, { "epoch": 1.942877801879971, "grad_norm": 0.754707203017471, "learning_rate": 2.257155785354237e-08, "loss": 0.1143, "step": 5374 }, { "epoch": 1.943239334779465, "grad_norm": 0.5730058829230266, "learning_rate": 2.2288742179229494e-08, "loss": 0.0388, "step": 5375 }, { "epoch": 1.943600867678959, "grad_norm": 0.042522884856350765, "learning_rate": 2.200770553786924e-08, "loss": 0.0003, "step": 5376 }, { "epoch": 1.9439624005784526, "grad_norm": 1.1531017340191168, "learning_rate": 2.1728448029904592e-08, "loss": 0.0527, "step": 5377 }, { "epoch": 1.9443239334779465, "grad_norm": 0.2681317184727472, "learning_rate": 2.145096975514238e-08, "loss": 0.0283, "step": 5378 }, { "epoch": 1.9446854663774404, "grad_norm": 0.16454344006808563, "learning_rate": 2.1175270812752723e-08, "loss": 0.0181, "step": 5379 }, { "epoch": 1.9450469992769341, "grad_norm": 0.18074021067062257, "learning_rate": 2.090135130127069e-08, "loss": 0.0227, "step": 5380 }, { "epoch": 1.945408532176428, "grad_norm": 1.4728135506882907, "learning_rate": 2.062921131859463e-08, "loss": 0.0903, "step": 5381 }, { "epoch": 1.945770065075922, "grad_norm": 0.575791304028908, "learning_rate": 2.0358850961987864e-08, "loss": 0.0432, "step": 5382 }, { "epoch": 1.9461315979754157, "grad_norm": 0.9370892749215188, "learning_rate": 2.009027032807642e-08, "loss": 0.0527, "step": 5383 }, { "epoch": 1.9464931308749096, "grad_norm": 0.13475729572706316, "learning_rate": 1.9823469512851856e-08, "loss": 0.0203, "step": 5384 }, { "epoch": 1.9468546637744035, "grad_norm": 1.088935161670154, "learning_rate": 1.9558448611668444e-08, "loss": 0.0432, "step": 5385 }, { "epoch": 1.9472161966738972, "grad_norm": 0.15808938262613886, "learning_rate": 1.9295207719243758e-08, "loss": 0.0049, "step": 5386 }, { "epoch": 1.9475777295733911, "grad_norm": 0.00015636796631805417, "learning_rate": 1.9033746929661424e-08, "loss": 0.0, "step": 5387 }, { "epoch": 1.947939262472885, "grad_norm": 0.2482048615694267, "learning_rate": 1.8774066336366138e-08, "loss": 0.0254, "step": 5388 }, { "epoch": 1.9483007953723788, "grad_norm": 0.10202143271948066, "learning_rate": 1.851616603216866e-08, "loss": 0.0044, "step": 5389 }, { "epoch": 1.9486623282718727, "grad_norm": 0.14064317504633647, "learning_rate": 1.8260046109241926e-08, "loss": 0.0143, "step": 5390 }, { "epoch": 1.9490238611713666, "grad_norm": 0.23503232448313516, "learning_rate": 1.800570665912382e-08, "loss": 0.0227, "step": 5391 }, { "epoch": 1.9493853940708603, "grad_norm": 1.351385575370565, "learning_rate": 1.775314777271442e-08, "loss": 0.0476, "step": 5392 }, { "epoch": 1.9497469269703545, "grad_norm": 0.0010640538573562944, "learning_rate": 1.7502369540278174e-08, "loss": 0.0, "step": 5393 }, { "epoch": 1.9501084598698482, "grad_norm": 0.06574104635385178, "learning_rate": 1.7253372051443952e-08, "loss": 0.0031, "step": 5394 }, { "epoch": 1.9504699927693419, "grad_norm": 0.10744561650166885, "learning_rate": 1.700615539520334e-08, "loss": 0.0143, "step": 5395 }, { "epoch": 1.950831525668836, "grad_norm": 0.004269382305453448, "learning_rate": 1.6760719659910664e-08, "loss": 0.0002, "step": 5396 }, { "epoch": 1.9511930585683297, "grad_norm": 2.211183865528397, "learning_rate": 1.6517064933285752e-08, "loss": 0.1226, "step": 5397 }, { "epoch": 1.9515545914678236, "grad_norm": 0.0005466330072579568, "learning_rate": 1.627519130240951e-08, "loss": 0.0, "step": 5398 }, { "epoch": 1.9519161243673175, "grad_norm": 0.2073324865304438, "learning_rate": 1.6035098853728338e-08, "loss": 0.0227, "step": 5399 }, { "epoch": 1.9522776572668112, "grad_norm": 0.2247583705043979, "learning_rate": 1.579678767305082e-08, "loss": 0.0227, "step": 5400 }, { "epoch": 1.9526391901663052, "grad_norm": 0.20093322193076257, "learning_rate": 1.5560257845549932e-08, "loss": 0.0181, "step": 5401 }, { "epoch": 1.953000723065799, "grad_norm": 0.21451781310026982, "learning_rate": 1.5325509455760834e-08, "loss": 0.008, "step": 5402 }, { "epoch": 1.9533622559652928, "grad_norm": 0.002169589879563959, "learning_rate": 1.5092542587582525e-08, "loss": 0.0001, "step": 5403 }, { "epoch": 1.9537237888647867, "grad_norm": 1.685355295990856, "learning_rate": 1.4861357324277292e-08, "loss": 0.1602, "step": 5404 }, { "epoch": 1.9540853217642806, "grad_norm": 0.42746908798457617, "learning_rate": 1.4631953748471262e-08, "loss": 0.0254, "step": 5405 }, { "epoch": 1.9544468546637743, "grad_norm": 0.6547701994372614, "learning_rate": 1.4404331942152738e-08, "loss": 0.1807, "step": 5406 }, { "epoch": 1.9548083875632682, "grad_norm": 0.7957095717980562, "learning_rate": 1.417849198667387e-08, "loss": 0.0388, "step": 5407 }, { "epoch": 1.9551699204627622, "grad_norm": 0.9069417025846368, "learning_rate": 1.3954433962749536e-08, "loss": 0.1309, "step": 5408 }, { "epoch": 1.9555314533622559, "grad_norm": 0.0036857164736490066, "learning_rate": 1.3732157950458458e-08, "loss": 0.0001, "step": 5409 }, { "epoch": 1.9558929862617498, "grad_norm": 1.6526333948082428, "learning_rate": 1.3511664029242088e-08, "loss": 0.083, "step": 5410 }, { "epoch": 1.9562545191612437, "grad_norm": 0.4754674703539436, "learning_rate": 1.3292952277904059e-08, "loss": 0.0388, "step": 5411 }, { "epoch": 1.9566160520607374, "grad_norm": 0.5944074927325989, "learning_rate": 1.3076022774612952e-08, "loss": 0.0432, "step": 5412 }, { "epoch": 1.9569775849602313, "grad_norm": 0.04697490453449403, "learning_rate": 1.2860875596898414e-08, "loss": 0.0015, "step": 5413 }, { "epoch": 1.9573391178597253, "grad_norm": 0.366095137160925, "learning_rate": 1.264751082165505e-08, "loss": 0.0349, "step": 5414 }, { "epoch": 1.957700650759219, "grad_norm": 0.0008510127578331359, "learning_rate": 1.243592852513853e-08, "loss": 0.0, "step": 5415 }, { "epoch": 1.958062183658713, "grad_norm": 0.27049909757141993, "learning_rate": 1.222612878296836e-08, "loss": 0.0283, "step": 5416 }, { "epoch": 1.9584237165582068, "grad_norm": 0.06852102199506115, "learning_rate": 1.2018111670127341e-08, "loss": 0.0031, "step": 5417 }, { "epoch": 1.9587852494577005, "grad_norm": 0.751750651588073, "learning_rate": 1.1811877260961002e-08, "loss": 0.0388, "step": 5418 }, { "epoch": 1.9591467823571946, "grad_norm": 0.11899403054669348, "learning_rate": 1.1607425629176494e-08, "loss": 0.0181, "step": 5419 }, { "epoch": 1.9595083152566883, "grad_norm": 0.20435860175570172, "learning_rate": 1.1404756847845366e-08, "loss": 0.0254, "step": 5420 }, { "epoch": 1.9598698481561823, "grad_norm": 0.21645495023352648, "learning_rate": 1.1203870989401344e-08, "loss": 0.0227, "step": 5421 }, { "epoch": 1.9602313810556762, "grad_norm": 0.1852885227864628, "learning_rate": 1.1004768125640886e-08, "loss": 0.0024, "step": 5422 }, { "epoch": 1.9605929139551699, "grad_norm": 0.28734548787242614, "learning_rate": 1.0807448327723735e-08, "loss": 0.0227, "step": 5423 }, { "epoch": 1.9609544468546638, "grad_norm": 0.29595546017000174, "learning_rate": 1.061191166617126e-08, "loss": 0.0283, "step": 5424 }, { "epoch": 1.9613159797541577, "grad_norm": 0.04211171803661508, "learning_rate": 1.0418158210869223e-08, "loss": 0.0016, "step": 5425 }, { "epoch": 1.9616775126536514, "grad_norm": 0.0003273679756650616, "learning_rate": 1.02261880310639e-08, "loss": 0.0, "step": 5426 }, { "epoch": 1.9620390455531453, "grad_norm": 0.09988285559780215, "learning_rate": 1.0036001195365964e-08, "loss": 0.0128, "step": 5427 }, { "epoch": 1.9624005784526393, "grad_norm": 0.5200450485730926, "learning_rate": 9.847597771748262e-09, "loss": 0.0388, "step": 5428 }, { "epoch": 1.962762111352133, "grad_norm": 0.0017540071096767635, "learning_rate": 9.660977827545825e-09, "loss": 0.0001, "step": 5429 }, { "epoch": 1.9631236442516269, "grad_norm": 0.13821802903183242, "learning_rate": 9.476141429456964e-09, "loss": 0.0044, "step": 5430 }, { "epoch": 1.9634851771511208, "grad_norm": 0.20945436046057936, "learning_rate": 9.293088643541614e-09, "loss": 0.0254, "step": 5431 }, { "epoch": 1.9638467100506145, "grad_norm": 0.00021069034713493593, "learning_rate": 9.111819535223554e-09, "loss": 0.0, "step": 5432 }, { "epoch": 1.9642082429501084, "grad_norm": 0.43154410784387215, "learning_rate": 8.932334169287627e-09, "loss": 0.0227, "step": 5433 }, { "epoch": 1.9645697758496024, "grad_norm": 0.18212350473514177, "learning_rate": 8.754632609882519e-09, "loss": 0.0203, "step": 5434 }, { "epoch": 1.964931308749096, "grad_norm": 0.8418351073020106, "learning_rate": 8.578714920517984e-09, "loss": 0.1699, "step": 5435 }, { "epoch": 1.96529284164859, "grad_norm": 0.4533366657454846, "learning_rate": 8.404581164067615e-09, "loss": 0.0432, "step": 5436 }, { "epoch": 1.965654374548084, "grad_norm": 0.13527112206437852, "learning_rate": 8.232231402766633e-09, "loss": 0.0181, "step": 5437 }, { "epoch": 1.9660159074475776, "grad_norm": 0.14551451651515634, "learning_rate": 8.061665698212429e-09, "loss": 0.0203, "step": 5438 }, { "epoch": 1.9663774403470717, "grad_norm": 0.7663362861140982, "learning_rate": 7.892884111365128e-09, "loss": 0.1807, "step": 5439 }, { "epoch": 1.9667389732465654, "grad_norm": 0.15078455909665142, "learning_rate": 7.725886702548146e-09, "loss": 0.008, "step": 5440 }, { "epoch": 1.9671005061460591, "grad_norm": 1.001622268652585, "learning_rate": 7.560673531445406e-09, "loss": 0.0898, "step": 5441 }, { "epoch": 1.9674620390455533, "grad_norm": 0.005541869461939314, "learning_rate": 7.397244657104119e-09, "loss": 0.0002, "step": 5442 }, { "epoch": 1.967823571945047, "grad_norm": 1.1486480566977866, "learning_rate": 7.235600137934229e-09, "loss": 0.1406, "step": 5443 }, { "epoch": 1.968185104844541, "grad_norm": 0.0078513245702513, "learning_rate": 7.075740031707301e-09, "loss": 0.0002, "step": 5444 }, { "epoch": 1.9685466377440348, "grad_norm": 0.002322908581700603, "learning_rate": 6.917664395556522e-09, "loss": 0.0001, "step": 5445 }, { "epoch": 1.9689081706435285, "grad_norm": 3.0540945600497134, "learning_rate": 6.761373285979478e-09, "loss": 0.1504, "step": 5446 }, { "epoch": 1.9692697035430224, "grad_norm": 0.2873434052505256, "learning_rate": 6.606866758833708e-09, "loss": 0.0254, "step": 5447 }, { "epoch": 1.9696312364425164, "grad_norm": 0.019846683474902004, "learning_rate": 6.4541448693394845e-09, "loss": 0.0006, "step": 5448 }, { "epoch": 1.96999276934201, "grad_norm": 0.022646306793382868, "learning_rate": 6.303207672080924e-09, "loss": 0.0008, "step": 5449 }, { "epoch": 1.970354302241504, "grad_norm": 0.14095081542443808, "learning_rate": 6.154055221000988e-09, "loss": 0.0181, "step": 5450 }, { "epoch": 1.970715835140998, "grad_norm": 0.1931923951908448, "learning_rate": 6.006687569408698e-09, "loss": 0.0227, "step": 5451 }, { "epoch": 1.9710773680404916, "grad_norm": 0.699324336400578, "learning_rate": 5.861104769971926e-09, "loss": 0.0476, "step": 5452 }, { "epoch": 1.9714389009399855, "grad_norm": 0.6360746902631306, "learning_rate": 5.717306874722384e-09, "loss": 0.0349, "step": 5453 }, { "epoch": 1.9718004338394794, "grad_norm": 0.6568970288959003, "learning_rate": 5.575293935053405e-09, "loss": 0.1699, "step": 5454 }, { "epoch": 1.9721619667389731, "grad_norm": 0.23866930922147866, "learning_rate": 5.435066001720502e-09, "loss": 0.0063, "step": 5455 }, { "epoch": 1.972523499638467, "grad_norm": 0.15407713918733387, "learning_rate": 5.296623124840805e-09, "loss": 0.0203, "step": 5456 }, { "epoch": 1.972885032537961, "grad_norm": 0.147708377236167, "learning_rate": 5.159965353894181e-09, "loss": 0.0161, "step": 5457 }, { "epoch": 1.9732465654374547, "grad_norm": 0.18848800134887334, "learning_rate": 5.025092737721559e-09, "loss": 0.0227, "step": 5458 }, { "epoch": 1.9736080983369486, "grad_norm": 1.3502562956490465, "learning_rate": 4.892005324526605e-09, "loss": 0.0898, "step": 5459 }, { "epoch": 1.9739696312364425, "grad_norm": 1.0129615995915269, "learning_rate": 4.760703161874602e-09, "loss": 0.0693, "step": 5460 }, { "epoch": 1.9743311641359362, "grad_norm": 0.0016209350524224763, "learning_rate": 4.631186296693568e-09, "loss": 0.0001, "step": 5461 }, { "epoch": 1.9746926970354304, "grad_norm": 0.019649161252477544, "learning_rate": 4.503454775272031e-09, "loss": 0.0008, "step": 5462 }, { "epoch": 1.975054229934924, "grad_norm": 0.13958361969563404, "learning_rate": 4.3775086432618075e-09, "loss": 0.0039, "step": 5463 }, { "epoch": 1.9754157628344178, "grad_norm": 0.045719328941321295, "learning_rate": 4.253347945675779e-09, "loss": 0.0013, "step": 5464 }, { "epoch": 1.975777295733912, "grad_norm": 0.7849604790285173, "learning_rate": 4.130972726888449e-09, "loss": 0.1143, "step": 5465 }, { "epoch": 1.9761388286334056, "grad_norm": 0.39201762748919167, "learning_rate": 4.010383030637055e-09, "loss": 0.0317, "step": 5466 }, { "epoch": 1.9765003615328995, "grad_norm": 0.40320588621862735, "learning_rate": 3.8915789000210095e-09, "loss": 0.0352, "step": 5467 }, { "epoch": 1.9768618944323935, "grad_norm": 1.419376654169083, "learning_rate": 3.774560377500236e-09, "loss": 0.083, "step": 5468 }, { "epoch": 1.9772234273318872, "grad_norm": 0.18588770250155348, "learning_rate": 3.659327504896837e-09, "loss": 0.0227, "step": 5469 }, { "epoch": 1.977584960231381, "grad_norm": 0.4578631358298903, "learning_rate": 3.54588032339509e-09, "loss": 0.0349, "step": 5470 }, { "epoch": 1.977946493130875, "grad_norm": 0.00024211278845189964, "learning_rate": 3.4342188735420055e-09, "loss": 0.0, "step": 5471 }, { "epoch": 1.9783080260303687, "grad_norm": 0.33197789307417525, "learning_rate": 3.324343195243995e-09, "loss": 0.0283, "step": 5472 }, { "epoch": 1.9786695589298626, "grad_norm": 0.0004212099983745789, "learning_rate": 3.2162533277713125e-09, "loss": 0.0, "step": 5473 }, { "epoch": 1.9790310918293565, "grad_norm": 0.00011343320242239064, "learning_rate": 3.1099493097558333e-09, "loss": 0.0, "step": 5474 }, { "epoch": 1.9793926247288502, "grad_norm": 2.7953628552798437, "learning_rate": 3.0054311791893886e-09, "loss": 0.3047, "step": 5475 }, { "epoch": 1.9797541576283442, "grad_norm": 1.1823551968505375, "learning_rate": 2.9026989734270983e-09, "loss": 0.0432, "step": 5476 }, { "epoch": 1.980115690527838, "grad_norm": 0.04342320017036718, "learning_rate": 2.801752729185703e-09, "loss": 0.0016, "step": 5477 }, { "epoch": 1.9804772234273318, "grad_norm": 0.45360312679065307, "learning_rate": 2.7025924825435644e-09, "loss": 0.0388, "step": 5478 }, { "epoch": 1.9808387563268257, "grad_norm": 0.5023995620391468, "learning_rate": 2.605218268940113e-09, "loss": 0.0143, "step": 5479 }, { "epoch": 1.9812002892263196, "grad_norm": 0.2672213603551281, "learning_rate": 2.5096301231769537e-09, "loss": 0.0254, "step": 5480 }, { "epoch": 1.9815618221258133, "grad_norm": 0.17663641079965278, "learning_rate": 2.415828079417315e-09, "loss": 0.0161, "step": 5481 }, { "epoch": 1.9819233550253073, "grad_norm": 1.099275959955394, "learning_rate": 2.3238121711860463e-09, "loss": 0.0635, "step": 5482 }, { "epoch": 1.9822848879248012, "grad_norm": 0.13828622931566292, "learning_rate": 2.2335824313696187e-09, "loss": 0.0031, "step": 5483 }, { "epoch": 1.9826464208242949, "grad_norm": 0.02734935011001408, "learning_rate": 2.1451388922161253e-09, "loss": 0.0009, "step": 5484 }, { "epoch": 1.983007953723789, "grad_norm": 0.06906387624543636, "learning_rate": 2.058481585335281e-09, "loss": 0.0024, "step": 5485 }, { "epoch": 1.9833694866232827, "grad_norm": 0.15420399936223506, "learning_rate": 1.973610541698423e-09, "loss": 0.0227, "step": 5486 }, { "epoch": 1.9837310195227764, "grad_norm": 0.01174584026076598, "learning_rate": 1.890525791637954e-09, "loss": 0.0006, "step": 5487 }, { "epoch": 1.9840925524222706, "grad_norm": 1.0291759961097158, "learning_rate": 1.8092273648495639e-09, "loss": 0.1699, "step": 5488 }, { "epoch": 1.9844540853217643, "grad_norm": 0.0017498934101418092, "learning_rate": 1.7297152903877901e-09, "loss": 0.0, "step": 5489 }, { "epoch": 1.9848156182212582, "grad_norm": 0.0800653108041366, "learning_rate": 1.6519895966710109e-09, "loss": 0.0031, "step": 5490 }, { "epoch": 1.985177151120752, "grad_norm": 2.2991717424709646, "learning_rate": 1.5760503114786719e-09, "loss": 0.1504, "step": 5491 }, { "epoch": 1.9855386840202458, "grad_norm": 1.2759711674034713, "learning_rate": 1.50189746195073e-09, "loss": 0.083, "step": 5492 }, { "epoch": 1.9859002169197397, "grad_norm": 0.2863163117063323, "learning_rate": 1.4295310745898738e-09, "loss": 0.0181, "step": 5493 }, { "epoch": 1.9862617498192336, "grad_norm": 1.547717914888112, "learning_rate": 1.3589511752598595e-09, "loss": 0.0903, "step": 5494 }, { "epoch": 1.9866232827187273, "grad_norm": 0.2723288989741633, "learning_rate": 1.2901577891855089e-09, "loss": 0.0283, "step": 5495 }, { "epoch": 1.9869848156182213, "grad_norm": 0.24979154332078946, "learning_rate": 1.2231509409543763e-09, "loss": 0.0227, "step": 5496 }, { "epoch": 1.9873463485177152, "grad_norm": 0.7193333159433795, "learning_rate": 1.1579306545134173e-09, "loss": 0.2129, "step": 5497 }, { "epoch": 1.9877078814172089, "grad_norm": 0.48172772063730707, "learning_rate": 1.0944969531734296e-09, "loss": 0.0227, "step": 5498 }, { "epoch": 1.9880694143167028, "grad_norm": 0.003471801843863059, "learning_rate": 1.0328498596051673e-09, "loss": 0.0001, "step": 5499 }, { "epoch": 1.9884309472161967, "grad_norm": 0.22276396479676672, "learning_rate": 9.729893958415614e-10, "loss": 0.0227, "step": 5500 }, { "epoch": 1.9887924801156904, "grad_norm": 0.321389530360716, "learning_rate": 9.149155832766099e-10, "loss": 0.0203, "step": 5501 }, { "epoch": 1.9891540130151844, "grad_norm": 0.1327303297162203, "learning_rate": 8.586284426659319e-10, "loss": 0.0181, "step": 5502 }, { "epoch": 1.9895155459146783, "grad_norm": 0.18794186189693732, "learning_rate": 8.041279941262137e-10, "loss": 0.0161, "step": 5503 }, { "epoch": 1.989877078814172, "grad_norm": 0.0365548017801654, "learning_rate": 7.51414257135763e-10, "loss": 0.0015, "step": 5504 }, { "epoch": 1.990238611713666, "grad_norm": 0.0019801854227431506, "learning_rate": 7.004872505350647e-10, "loss": 0.0, "step": 5505 }, { "epoch": 1.9906001446131598, "grad_norm": 2.2144185482696743, "learning_rate": 6.513469925256699e-10, "loss": 0.1055, "step": 5506 }, { "epoch": 1.9909616775126535, "grad_norm": 0.6310992196137056, "learning_rate": 6.039935006690867e-10, "loss": 0.1406, "step": 5507 }, { "epoch": 1.9913232104121477, "grad_norm": 3.781528421947134, "learning_rate": 5.58426791890665e-10, "loss": 0.1504, "step": 5508 }, { "epoch": 1.9916847433116414, "grad_norm": 0.04893003315620177, "learning_rate": 5.14646882475156e-10, "loss": 0.0024, "step": 5509 }, { "epoch": 1.992046276211135, "grad_norm": 0.5687252958653529, "learning_rate": 4.726537880700432e-10, "loss": 0.2012, "step": 5510 }, { "epoch": 1.9924078091106292, "grad_norm": 0.21395255251307732, "learning_rate": 4.3244752368276634e-10, "loss": 0.0143, "step": 5511 }, { "epoch": 1.992769342010123, "grad_norm": 0.6483078624854585, "learning_rate": 3.940281036840521e-10, "loss": 0.0579, "step": 5512 }, { "epoch": 1.9931308749096168, "grad_norm": 0.13482281220270012, "learning_rate": 3.573955418045838e-10, "loss": 0.0161, "step": 5513 }, { "epoch": 1.9934924078091107, "grad_norm": 0.4890220258440897, "learning_rate": 3.225498511372216e-10, "loss": 0.0227, "step": 5514 }, { "epoch": 1.9938539407086044, "grad_norm": 0.22994030733342016, "learning_rate": 2.89491044134782e-10, "loss": 0.0227, "step": 5515 }, { "epoch": 1.9942154736080984, "grad_norm": 0.35697522291205547, "learning_rate": 2.5821913261336875e-10, "loss": 0.043, "step": 5516 }, { "epoch": 1.9945770065075923, "grad_norm": 0.00043879196648282433, "learning_rate": 2.287341277490418e-10, "loss": 0.0, "step": 5517 }, { "epoch": 1.994938539407086, "grad_norm": 0.0026616116138983054, "learning_rate": 2.010360400805933e-10, "loss": 0.0001, "step": 5518 }, { "epoch": 1.99530007230658, "grad_norm": 0.577109860363808, "learning_rate": 1.7512487950621659e-10, "loss": 0.2012, "step": 5519 }, { "epoch": 1.9956616052060738, "grad_norm": 0.14013250926849175, "learning_rate": 1.5100065528683704e-10, "loss": 0.0027, "step": 5520 }, { "epoch": 1.9960231381055675, "grad_norm": 1.248222607913373, "learning_rate": 1.286633760450018e-10, "loss": 0.1226, "step": 5521 }, { "epoch": 1.9963846710050615, "grad_norm": 0.0003669126409337979, "learning_rate": 1.0811304976376946e-10, "loss": 0.0, "step": 5522 }, { "epoch": 1.9967462039045554, "grad_norm": 0.18998607587166966, "learning_rate": 8.93496837878205e-11, "loss": 0.0254, "step": 5523 }, { "epoch": 1.997107736804049, "grad_norm": 0.027238043152741725, "learning_rate": 7.237328482290196e-11, "loss": 0.0011, "step": 5524 }, { "epoch": 1.997469269703543, "grad_norm": 0.2478449758076414, "learning_rate": 5.718385893693779e-11, "loss": 0.0181, "step": 5525 }, { "epoch": 1.997830802603037, "grad_norm": 0.32774081169935576, "learning_rate": 4.378141155780835e-11, "loss": 0.0283, "step": 5526 }, { "epoch": 1.9981923355025306, "grad_norm": 0.35646301247178175, "learning_rate": 3.2165947476126004e-11, "loss": 0.0283, "step": 5527 }, { "epoch": 1.9985538684020245, "grad_norm": 0.16614438716300325, "learning_rate": 2.233747084301463e-11, "loss": 0.0203, "step": 5528 }, { "epoch": 1.9989154013015185, "grad_norm": 0.4733684548011406, "learning_rate": 1.4295985171774996e-11, "loss": 0.0388, "step": 5529 }, { "epoch": 1.9992769342010122, "grad_norm": 0.22463672043397395, "learning_rate": 8.041493335664286e-12, "loss": 0.0227, "step": 5530 }, { "epoch": 1.9996384671005063, "grad_norm": 0.675430268582995, "learning_rate": 3.573997570116561e-12, "loss": 0.1807, "step": 5531 }, { "epoch": 2.0, "grad_norm": 0.03643560063594245, "learning_rate": 8.934994721876422e-13, "loss": 0.0012, "step": 5532 }, { "epoch": 2.0, "step": 5532, "total_flos": 2085928185102336.0, "train_loss": 0.060792557523462624, "train_runtime": 49353.6459, "train_samples_per_second": 0.897, "train_steps_per_second": 0.112 } ], "logging_steps": 1.0, "max_steps": 5532, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2085928185102336.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }